解析网站:https://www.qiushibaike.com/text/
解析目标:每条段子的作者(用户)和内容
Python 代码:
import urllib.request
import urllib.parse
from lxml import etree
import time
import json
# Module-level accumulator: content_parse() appends one dict per parsed post
# (keys '作者' and '内容'), and main() serialises the whole list to JSON.
item_list=[]
def handle_request(url, page):
    """Build the HTTP request object for one listing page.

    Args:
        url: URL template containing a single ``{}`` placeholder that is
            filled with the page number via ``str.format``.
        page: Page number substituted into the template.

    Returns:
        A ``urllib.request.Request`` for the formatted URL that carries a
        desktop Chrome ``User-Agent`` header.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    return urllib.request.Request(url=url.format(page), headers=headers)
def content_parse(content):
    """Extract the author and text of every post found in one HTML page.

    Every ``div`` whose ``id`` starts with ``qiushi_tag`` is treated as one
    post. For each post a dict with the keys '作者' (author) and '内容'
    (content) is appended to the module-level ``item_list``.

    Args:
        content: HTML source of one listing page.

    Returns:
        None. Results are accumulated in ``item_list``.
    """
    # Parse the page and select the per-post container elements with XPath.
    tree = etree.HTML(content)
    posts = tree.xpath('//div[starts-with(@id,"qiushi_tag")]')
    for post in posts:
        authors = post.xpath('.//div[@class="author clearfix"]//h2/text()')
        # A post without an author heading must not abort the whole crawl
        # with an IndexError; keep the post and record an empty author.
        author = authors[0] if authors else ''
        # The content stays a list of text fragments, exactly as XPath
        # returns them, so the JSON written by main() keeps its shape.
        body = post.xpath('.//div[@class="content"]/span/text()')
        item_list.append({
            '作者': author,
            '内容': body,
        })
def main():
    """Download a user-chosen page range and dump the parsed posts to a file.

    Prompts on stdin for the first and last page number (inclusive), fetches
    each page, passes its HTML to ``content_parse`` and finally writes the
    accumulated ``item_list`` as one JSON document to ``baikeduanzi.txt`` in
    the current working directory.

    Raises:
        ValueError: If either page number entered is not an integer.
        urllib.error.URLError: If a page cannot be fetched.
    """
    url = 'https://www.qiushibaike.com/text/page/{}/'
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    for page in range(start_page, end_page + 1):
        # Build the request object for this page.
        request = handle_request(url, page)
        print('开始下载%s页' % page)
        # The context manager closes the HTTP response (and its socket)
        # as soon as the body has been read, instead of leaking it.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        # Parse the page; results accumulate in item_list.
        content_parse(content)
        print('结束下载%s页' % page)
        # Pause between pages to throttle the request rate.
        time.sleep(2)
    # Write everything collected across all pages in a single pass.
    string = json.dumps(item_list, ensure_ascii=False)
    with open('baikeduanzi.txt', 'w', encoding='utf8') as fp:
        fp.write(string)
# Script entry point: run the crawl and report completion only when the file
# is executed directly, so importing it neither runs nor prints anything.
if __name__ == '__main__':
    main()
    print("下载完成...")
    print("请查看当前路径下的baikeduanzi.txt文件")
解析后: