主流程
step1 获得html response
url = "https://www.douban.com/personage/30091826/creations?sortby=time&type=writer"
html = get_page(url)
step2 把 response 存入文件(可注释掉)
with open('sss.txt','a',encoding='utf-8')as f:
f.write(html)
step3 parse_page(html) 正则表达式处理 html 数据,写入文件
for item in parse_page(html):
print(item)
write_to_file(item, 'testrecord.txt')
语法
yield
https://blog.csdn.net/mieleizhi0522/article/details/82142856
代码
# -*- coding: utf-8 -*-
import json
import re

import requests
from requests import RequestException
def get_page(url, timeout=10):
    """Fetch *url* with a desktop-browser User-Agent and return the page text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        ``(response.text, response.status_code)`` when the status code is 200.
        For any other status code the bare integer status code is returned
        (and printed). If the request raises ``RequestException`` the
        exception *class* itself is returned. Callers must therefore check
        that they received a tuple before unpacking the result.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # It is kept to preserve existing behaviour; confirm it is really
        # needed before relying on this outside of local experiments.
        response = requests.get(
            url, headers=headers, verify=False, timeout=timeout
        )
    except RequestException:
        # Timeouts are RequestException subclasses, so they land here too.
        print("we did not get response")
        return RequestException

    if response.status_code == 200:
        return response.text, response.status_code

    print(response.status_code)
    return response.status_code
# Captures (title, score) pairs. re.S lets ``.`` span newlines because the
# anchor text, the star icon and the score sit on separate lines of markup.
# Only entries rendered with the ``allstar40`` star class are matched.
_ITEM_PATTERN = re.compile(
    r'target="_blank">(.*?)</a>.*?'
    r'<span class="rating-star allstar40"></span>.*?'
    r'<span>(.*?)</span>',
    re.S,
)


def parse_page(html):
    """Yield one ``{'title': ..., 'score': ...}`` dict per match in *html*.

    Args:
        html: Page markup as a string.

    Yields:
        A dict whose ``'title'`` is the link text and whose ``'score'`` is
        the text of the ``<span>`` following the star icon, both as strings.
        Nothing is yielded when the pattern does not match.
    """
    items = _ITEM_PATTERN.findall(html)
    # Debug output of the raw (title, score) tuples; runs on first iteration
    # because this function is a generator.
    print(items)
    for item in items:
        yield {
            'title': item[0],
            'score': item[1],
        }
def write_to_file(content, file):
    """Append *content* to *file* as a single line of JSON.

    Each call writes one JSON document followed by a newline, so the file
    can be read back line by line. Without the separator, successive
    records would be fused into one unparseable string.

    Args:
        content: Any JSON-serialisable object.
        file: Path of the file to append to; it is opened as UTF-8.
    """
    with open(file, 'a', encoding='utf-8') as f:
        # ensure_ascii=False stores non-ASCII text as-is instead of
        # \uXXXX escapes, keeping the output human-readable.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Download one Douban creations page, archive it, and record its items.

    The raw HTML is appended to ``sss.txt``; every item produced by
    ``parse_page`` is printed and appended to ``testrecord.txt``.

    Raises:
        RuntimeError: If ``get_page`` did not return a
            ``(text, status_code)`` pair, i.e. the download failed.
    """
    # Alternative page:
    # "https://www.douban.com/personage/30091826/creations?sortby=time&type=writer"
    url = "https://www.douban.com/personage/30091820/creations?sortby=time&type=writer"
    content = get_page(url)

    # get_page only returns a 2-tuple on success; on failure it returns an
    # int status code or an exception class, neither of which supports len().
    # An explicit check also survives ``python -O``, unlike ``assert``.
    if not (isinstance(content, tuple) and len(content) == 2):
        raise RuntimeError(f"we did not get a usable response: {content!r}")

    html = content[0]
    with open('sss.txt', 'a', encoding='utf-8') as f:
        f.write(html)

    record_path = 'testrecord.txt'
    for item in parse_page(html):
        print(item)
        write_to_file(item, record_path)
if __name__ == '__main__':
    # Run the scrape only when executed as a script, not when imported.
    main()