python 爬虫

主流程

step1 获得html response

url = "https://www.douban.com/personage/30091826/creations?sortby=time&type=writer"

html = get_page(url) 

step2 把 response 存入文件,可 comment

with open('sss.txt','a',encoding='utf-8')as f:

f.write(html) 

step3 parse_page(html) 正则表达式处理 html 数据,写入文件

for item in parse_page(html):

print(item)

write_to_file(item)

语法

yield

https://blog.csdn.net/mieleizhi0522/article/details/82142856


代码

# -*- coding: utf-8 -*-

import json
import re

import requests
from requests import RequestException

def get_page(url, timeout=10):
    """Download *url* and report the body together with the HTTP status.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        ``(text, status_code)`` when the server answers with status 200.
        The bare integer status code for any other answer.
        The ``RequestException`` class itself when the request fails
        (connection error, timeout, ...).
        Callers can tell success from failure by checking for a tuple.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # It is kept to preserve the existing behavior; drop it if the
        # target site's certificate validates in your environment.
        response = requests.get(
            url, headers=headers, verify=False, timeout=timeout
        )
    except RequestException:
        print("we did not get response")
        return RequestException
    else:
        if response.status_code == 200:
            return response.text, response.status_code
        # Non-200 answer: show the status and hand it back without a body.
        print(response.status_code)
        return response.status_code

def parse_page(html):
    """Yield ``{'title': ..., 'score': ...}`` dicts extracted from *html*.

    A record is produced for every place where a ``target="_blank"`` link
    is followed (possibly much later, since ``.`` also matches newlines) by
    a ``rating-star allstar40`` span and then a plain ``<span>`` holding
    the score text. Both values are returned as the raw captured strings.

    The full list of matches is printed once, before the first record is
    yielded.

    NOTE(review): only the ``allstar40`` rating class is matched. Because
    the gaps are bridged with ``.*?`` under ``re.S``, a link belonging to an
    entry with a different rating class could be paired with the score of a
    later ``allstar40`` entry - confirm against the real page markup.
    """
    pattern = re.compile(
        r'target="_blank">(.*?)</a>.*?<span class="rating-star allstar40"></span>.*?<span>(.*?)</span>',
        re.S,
    )
    items = pattern.findall(html)
    print(items)
    for title, score in items:
        yield {
            'title': title,
            'score': score,
        }

def write_to_file(content, file):
    """Append *content* to *file* as a single line of JSON.

    Args:
        content: JSON-serializable object to store.
        file: Path of the UTF-8 text file to append to; it is created if it
            does not exist.

    Each call writes exactly one line, so the file can be read back record
    by record with ``json.loads`` on every line. Non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(file, 'a', encoding='utf-8') as f:
        # The trailing newline keeps successive records separable; without
        # it the appended objects run together into unparseable text.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    """Fetch one Douban creations page, archive it, and store parsed records.

    Steps:
      1. Download the page with ``get_page``.
      2. Append the raw HTML to ``sss.txt``.
      3. Print every record produced by ``parse_page`` and append it to
         ``testrecord.txt`` via ``write_to_file``.

    Raises:
        SystemExit: if the page could not be downloaded.
    """
    # Alternative page:
    # https://www.douban.com/personage/30091826/creations?sortby=time&type=writer
    url = "https://www.douban.com/personage/30091820/creations?sortby=time&type=writer"

    content = get_page(url)
    # get_page returns a (text, status) pair only on success; on failure it
    # returns an int or an exception class, neither of which supports len().
    # An explicit check also keeps working under `python -O`, where assert
    # statements are removed.
    if not (isinstance(content, tuple) and len(content) == 2):
        raise SystemExit("failed to fetch {}: {!r}".format(url, content))
    html = content[0]

    # Keep a raw copy of the page for offline inspection.
    with open('sss.txt', 'a', encoding='utf-8') as f:
        f.write(html)

    file = 'testrecord.txt'
    for item in parse_page(html):
        print(item)
        write_to_file(item, file)

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容