前言

记录一下如何使用 Python 爬取网页上的数据，并将其写入 Excel 文件。

一、使用python爬取网上数据并写入到excel中

要爬取数据的网站：

例子一：

https://ip.cn/yinhang.html （该网站共有 5 个结构相同的页面需要爬取：yinhang、gonggong、jiudian、kuaidi、waimai）
在这里插入图片描述
更多工具中查看页面源代码：

找到想要数据的位置

代码中：

例子二：

要爬取数据的网站：
https://www.00cha.com/tel.htm
在这里插入图片描述
页面源代码：

代码中：

二、工具类

import datetime
import openpyxl
import requests
from lxml import etree
def get_url_html(url):
    """Download a page and parse it into an lxml HTML tree.

    Args:
        url: Address of the page to request.

    Returns:
        The value produced by ``etree.HTML`` for the response text. If any
        step raises, the exception is printed and the empty string ``""``
        is returned instead.
    """
    try:
        response = requests.get(url, timeout=200)
        # Decode with the encoding detected from the body to avoid garbled text.
        response.encoding = response.apparent_encoding
        return etree.HTML(response.text)
    except Exception as err:
        print(err)
        return ""
# Extract the table rows from an ip.cn page.
def get_ip_cn_data(html_str):
    """Collect ``[header_text, cell_text]`` pairs from an ip.cn table.

    Args:
        html_str: Parsed HTML tree exposing an ``xpath`` method, as returned
            by ``get_url_html``. A value without ``xpath`` (for example the
            ``""`` that ``get_url_html`` returns on failure) yields ``[]``.

    Returns:
        A list with one two-item list per table row: the stripped first text
        node of the row's ``<th>`` followed by the stripped first text node
        of its first ``<td>``. A missing text node becomes ``""``.
    """
    data_list = []
    # The fetch failure has already been reported; nothing to parse here.
    if not hasattr(html_str, "xpath"):
        return data_list

    def first_text(node, path):
        """Return the stripped first XPath match under *node*, or ""."""
        matches = node.xpath(path)
        return str(matches[0]).strip() if matches else ""

    try:
        rows = html_str.xpath(
            '//div[@class="layui-card ip_card"]'
            '//div[@class="layui-card-body"]/table/tbody[1]//tr'
        )
        for row in rows:
            data_list.append(
                [first_text(row, "./th/text()"), first_text(row, "./td[1]/text()")]
            )
    except Exception as e:
        # Best effort: report the problem and return what was collected so far.
        print(e)
    return data_list
# Extract the table rows from the 00cha.com page.
def get_00cha_data(html_str):
    """Collect non-empty ``[link_text, cell_text]`` pairs from the 00cha table.

    Each table row is read as two pairs: ``td[1]/a`` with ``td[2]``, and
    ``td[3]/a`` with ``td[4]``. Pairs in which either value is empty are
    dropped from the result.

    Args:
        html_str: Parsed HTML tree exposing an ``xpath`` method, as returned
            by ``get_url_html``. A value without ``xpath`` (for example the
            ``""`` that ``get_url_html`` returns on failure) yields ``[]``.

    Returns:
        A list of two-item lists of stripped strings, in document order.
    """
    # The fetch failure has already been reported; nothing to parse here.
    if not hasattr(html_str, "xpath"):
        return []

    def first_text(node, path):
        """Return the stripped first XPath match under *node*, or ""."""
        matches = node.xpath(path)
        return str(matches[0]).strip() if matches else ""

    # Every pair found, including those with blank values.
    data_list = []
    try:
        # The table sits inside the div with class "searchnr".
        for row in html_str.xpath('//div[@class="searchnr"]/table//tr'):
            data_list.append(
                [first_text(row, "./td[1]/a/text()"), first_text(row, "./td[2]/text()")]
            )
            data_list.append(
                [first_text(row, "./td[3]/a/text()"), first_text(row, "./td[4]/text()")]
            )
    except Exception as e:
        # Best effort: report the problem and return what was collected so far.
        print(e)

    # Keep only pairs where both values are present.
    return [pair for pair in data_list if pair[0] != "" and pair[1] != ""]
def write_excel(file_name, write_list):
    """Write rows to the active sheet of a new workbook and save it.

    Args:
        file_name: Path the workbook is saved to.
        write_list: Sequence of rows; each row is appended to the sheet in order.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row in write_list:
        sheet.append(row)
    workbook.save(file_name)
# Script entry point.
def _drop_known_numbers(known_rows, candidate_rows):
    """Return the candidate rows whose first column is not in *known_rows*."""
    known_numbers = {row[0] for row in known_rows}
    return [row for row in candidate_rows if row[0] not in known_numbers]


def main():
    """Scrape both sites, merge the rows and save them to a workbook."""
    start_time = datetime.datetime.now()
    now_date = start_time.strftime("%Y-%m-%d-%H-%M-%S")

    # A tuple keeps the fetch order, and therefore the row order, deterministic.
    ip_cn_urls = (
        "https://ip.cn/yinhang.html",
        "https://ip.cn/gonggong.html",
        "https://ip.cn/jiudian.html",
        "https://ip.cn/kuaidi.html",
        "https://ip.cn/waimai.html",
    )

    # The first row is the header; the scraped rows follow.
    every_page_result_list = [["pub_phone_number", "pub_phone_desc"]]
    for url in ip_cn_urls:
        every_page_result_list.extend(get_ip_cn_data(get_url_html(url)))

    compare_result_list = get_00cha_data(
        get_url_html("https://www.00cha.com/tel.htm")
    )
    # Skip 00cha rows whose first column already appears in the ip.cn rows.
    # A new list is built instead of popping from the list being iterated,
    # which skipped elements and could remove unrelated rows.
    every_page_result_list.extend(
        _drop_known_numbers(every_page_result_list, compare_result_list)
    )

    # openpyxl writes the xlsx format, so the file must not be named ".csv".
    write_excel(r"D:\PubTel" + now_date + ".xlsx", every_page_result_list)

    elapsed = datetime.datetime.now() - start_time
    print(f"耗时总共{int(elapsed.total_seconds())}秒")


if __name__ == '__main__':
    main()

使用python爬取网站数据并写入到excel中

前言

一、使用python爬取网上数据并写入到excel中

例子一：

例子二：

二、工具类

相关推荐

评论抢沙发

热门文章

热门专题

随机阅读

最新评论

热门标签

网站统计

切换注册登录

切换登录注册

前言

一、使用python爬取网上数据并写入到excel中

例子一：

例子二：

二、工具类

相关推荐

评论 抢沙发

热门文章

热门专题

随机阅读

最新评论

热门标签

网站统计

切换注册登录

切换登录注册

评论抢沙发