1. Using the json module
The response object has a json() method that parses the body straight into a Python dict, but the conversion fails as soon as the data returned by the server contains special characters. A safer approach is to receive the body as a string with text first; if there are special characters, replace them with the string replace method, and then use the json module to convert the cleaned string into a Python dict.
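A minimal sketch of that fallback (the URL and the character being stripped are placeholders for illustration, not taken from any real page):
import json
import requests

res = requests.get('https://example.com/api')  # placeholder URL
raw = res.text                                 # take the body as a plain string first
cleaned = raw.replace('\ufeff', '')            # e.g. strip a stray BOM character before parsing
data = json.loads(cleaned)                     # now convert the cleaned JSON string into a dict
print(type(data))                              # <class 'dict'>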
2. A single page of data
'''
Scrape 20 pages of data
1 - Find the target URL (check the address-bar URL first and see whether the data is present in its response; if not, look at the XHR requests)
https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1697544602839&pageIndex=1&pageSize=10
https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1697544618262&pageIndex=3&pageSize=10
2 - Send the request and get the response
3 - Parse the response and extract the fields we want (job title, job city, job requirements)
20 pages: send 20 requests, each with a different pageIndex value
'''
- Code implementation
import requests
url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1697544602839&pageIndex=1&pageSize=10'
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
res = requests.get(url,headers=headers)
# res.text ---> JSON string ----> dict
# print(res.text)
#
# import json
# data = json.loads(res.text)
# print(type(data))
# print(res.json())  # fine in some cases, but it raises an exception if the returned data contains special characters
# print(res.text)  # res.text.replace('?','')
import json
data = json.loads(res.text)
# print(data)
# print(data['Data']['Posts'][0]['RecruitPostName'])
# print(data['Data']['Posts'][1]['RecruitPostName'])
# The lines above pick postings out one at a time; if the page held 100 postings you would have to write print(data['Data']['Posts'][n]['RecruitPostName']) a hundred times
# Use a loop instead
for i in data['Data']['Posts']:
    # i is one posting (one element of the list)
    print(i['RecruitPostName'],i['LocationName'],i['LastUpdateTime'])
3. Multiple pages of data
- Code implementation
import requests
import json
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
# Loop 20 times: pageIndex runs from 1 to 20
for j in range(1,21):
    url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1697544602839&pageIndex={j}&pageSize=10'
    res = requests.get(url,headers=headers)
    data = json.loads(res.text)
    # Walk through this page's postings and pull out the details we want
    for i in data['Data']['Posts']:
        # i is one posting (one element of the list)
        print(i['RecruitPostName'],i['LocationName'],i['LastUpdateTime'])
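If you want to keep the postings rather than just print them, one small variation (a sketch: the postings list and the one-second pause are my own additions, reusing the headers and imports above) collects every row and waits briefly between requests:
import time

postings = []
for j in range(1, 21):
    url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1697544602839&pageIndex={j}&pageSize=10'
    res = requests.get(url, headers=headers)
    data = json.loads(res.text)
    for i in data['Data']['Posts']:
        # keep only the fields we care about
        postings.append((i['RecruitPostName'], i['LocationName'], i['LastUpdateTime']))
    time.sleep(1)  # be polite: pause a second between page requests
print(len(postings))  # roughly 20 pages x 10 postings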
4. jsonpath
- Code implementation
'''
jsonpath: a library for extracting information from JSON data
1 - Install
pip install jsonpath -i https://pypi.tuna.tsinghua.edu.cn/simple
2 - Usage
jsonpath.jsonpath(data, path_expression)
'''
import jsonpath
books = {
    "store": {  # the store
        "author": "coco",
        "book": [
            {
                "category": "武侠小说",
                "author": "奇儒",
                "title": "凝风天下",
                "price": 99.9
            },
            {
                "category": "文学作品",
                "author": "余华",
                "title": "活着",
                "isbn": "0-395-19395-8",  # international standard book number
                "price": 22.99
            }
        ],
        "car": {
            "color": "白色",
            "price": 199999
        }
    }
}
# Plain Python indexing
print(books['store']['book'][0]['price'])
# $ is the root node
# .. searches recursively: it selects every matching value no matter where it sits in the structure
# Get the color under car
# If anything matches, a list of matches is returned; if nothing matches, the call returns False
print(jsonpath.jsonpath(books,'$..color')) # ['白色']
print(jsonpath.jsonpath(books,'$..author')) # ['coco', '奇儒', '余华']
print(jsonpath.jsonpath(books,'$.store.author'))
# Get the title of books['store']['book'][0]
print(jsonpath.jsonpath(books,'$.store.book[0].title'))
# * means every element
print(jsonpath.jsonpath(books,'$.store.book[*]'))
# Get every title under book
print(jsonpath.jsonpath(books,'$.store.book[*].title'))
# Get the title of the last book
print(jsonpath.jsonpath(books,'$.store.book[-1:].title'))
# Get everything under book
print(jsonpath.jsonpath(books,'$.store.book'))
# Filtering: for every node that has a price, keep only those whose price is greater than 50
print(jsonpath.jsonpath(books,'$..[?(@.price>50)].price'))
# a = 'abc'
# print(a[start:stop:step])
# print(a[1:2:2])
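jsonpath also pairs nicely with the Tencent response from the earlier sections: instead of walking data['Data']['Posts'] by hand, a recursive-descent path pulls the fields out directly. A sketch, reusing the same url and headers as before:
import requests, json, jsonpath

url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1697544602839&pageIndex=1&pageSize=10'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
data = json.loads(requests.get(url, headers=headers).text)
# $..RecruitPostName finds every job title wherever it sits in the response
titles = jsonpath.jsonpath(data, '$..RecruitPostName')
cities = jsonpath.jsonpath(data, '$..LocationName')
print(list(zip(titles, cities)))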
5. Regular expressions
Regular expressions are the hardest of these tools to use because the syntax has many modes, so you rarely reach for them unless there is no other way. The pattern used most often in scraping is
(.*?)
'''
.  matches any single character
*  matches 0 or more of the preceding token
?  matches 0 or 1; here it also switches the match to non-greedy mode
(.*?)
'''
# src = '<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" title="正则1" title="正则2">'
import re
# # findall matches every occurrence of the pattern (argument 1) inside the string (argument 2)
# res = re.findall('title="(.*?)"',src)
# res = re.findall('<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" title="(.*?)" title="正则2">',src)
# print(res)
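To see why the non-greedy form matters, here is a small comparison (it reuses the src string from the commented example above, uncommented for the demo): (.*) swallows everything up to the last closing quote, while (.*?) stops at the first one.
import re

src = '<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" title="正则1" title="正则2">'
# Greedy: .* keeps going until the last closing quote on the line
print(re.findall('title="(.*)"', src))    # ['正则1" title="正则2']
# Non-greedy: .*? stops at the first closing quote, so each title is captured separately
print(re.findall('title="(.*?)"', src))   # ['正则1', '正则2']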
html_str = '''
<html>
<body>
<p>python1</p>
<p>python2</p>
<p>python3</p>
<div>
这是一个div标签
</div>
</body>
</html>
'''
# print(html_str)
import re
# Match the contents of the p tags
# res = re.findall('<p>(.*?)</p>',html_str)
# # re.S treats the string as one block, so . can also match across newlines
# res = re.findall('<div>(.*?)</div>',html_str,re.S)[0].strip()
# print(res)
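The re.S flag is what makes the div match work: without it, . stops at newlines, so a pattern that has to cross line breaks never matches. A quick check using the html_str defined above:
import re

# Without re.S, . does not match newline characters, so the multi-line div body finds nothing
print(re.findall('<div>(.*?)</div>', html_str))        # []
# With re.S, . also matches \n, so the whole multi-line body of the div is captured
print(re.findall('<div>(.*?)</div>', html_str, re.S))
print(re.findall('<div>(.*?)</div>', html_str, re.S)[0].strip())  # 这是一个div标签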
# A single image
# url = 'https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fb-ssl.duitang.com%2Fuploads%2Fitem%2F201903%2F14%2F20190314184700_aYnJL.gif&refer=http%3A%2F%2Fb-ssl.duitang.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=auto?sec=1700142783&t=96d23fcf29b7aee330ae7bdaa446cc72'
# import requests
# res = requests.get(url)
# with open('1.gif','wb') as f:
#     f.write(res.content)
url = 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1697551157030_R&pv=&ic=&nc=1&z=&hd=&latest=©right=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MTEsMCwzLDEsNiw0LDUsMiw4LDcsOQ%3D%3D&ie=utf-8&sid=&word=%E8%A1%A8%E6%83%85%E5%8C%85'
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Cookie':'BDqhfp=%E8%A1%A8%E6%83%85%E5%8C%85%26%26NaN-1undefined%26%260%26%261; winWH=%5E6_1920x955; BDIMGISLOGIN=0; BIDUPSID=F8F101FACAC168981180241787208F13; PSTM=1691297885; BAIDUID=6AB9F35C494C731DA8867F8B086C6A49:SL=0:NR=10:FG=1; H_WISE_SIDS=131861_213345_214804_110085_244725_261723_236312_265881_266354_267074_264354_268031_269904_269051_271172_270102_234295_234207_272282_263618_272466_272472_260335_273165_273149_273233_273389_274140_273788_274422_274571_263750_275097_275235_269286_270538_275011_275941_275854_276121_276196_276311_276586_276590_276965_276767_253022_277076_277268_276830_277354_277383_277236_251972_276454_271253_273981_277628_277636_275189_275258_270292_277784_275732_272318_276925_276665_277884_277951_278017_277997_259642_278058_278166_278163_278248_277320_278335_278396_278414_274784_275167_278277_278451_278533_278572_278576_278636_274576_278703_278514_278803_277541_278791_278388_256739_278955_278962_278923_274283_279020_279044_277523_276423_279135_278365_279266_279278_8000054_8000133_8000143_8000149_8000157_8000159_8000164_8000172_8000178_8000185_8000203_8000208; H_WISE_SIDS_BFESS=131861_213345_214804_110085_244725_261723_236312_265881_266354_267074_264354_268031_269904_269051_271172_270102_234295_234207_272282_263618_272466_272472_260335_273165_273149_273233_273389_274140_273788_274422_274571_263750_275097_275235_269286_270538_275011_275941_275854_276121_276196_276311_276586_276590_276965_276767_253022_277076_277268_276830_277354_277383_277236_251972_276454_271253_273981_277628_277636_275189_275258_270292_277784_275732_272318_276925_276665_277884_277951_278017_277997_259642_278058_278166_278163_278248_277320_278335_278396_278414_274784_275167_278277_278451_278533_278572_278576_278636_274576_278703_278514_278803_277541_278791_278388_256739_278955_278962_278923_274283_279020_279044_277523_276423_279135_278365_279266_279278_8000054_8000133_8000143_8000149_8000157_8000159_8000164_8000172_8000178_8000185_8000203_8000208; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=00ak042l05ah0421840kah8g1iiss2a1p; BAIDUID_BFESS=6AB9F35C494C731DA8867F8B086C6A49:SL=0:NR=10:FG=1; ZFY=YLPvl91jw6HJIzGcSO0JR2h3wLMgXmBaPJZ1IscIOCU:C; PSINO=6; H_PS_PSSID=39398_39396_39418_39414_39436_39497_39463_39233_39403_39467_26350_39421; delPer=0; ZD_ENTRY=baidu; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=ala; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[A24tJn4Wkd_]=mk3SLVN4HKm; indexPageSugList=%5B%22%E8%A1%A8%E6%83%85%E5%8C%85%22%2C%22%E6%98%8E%E5%A4%A9%E5%91%A8%E6%97%A5%22%2C%22%E5%91%A8%E6%97%A5%E8%A1%A8%E6%83%85%E5%8C%85%22%2C%22%E5%91%A8%E6%9C%AB%E8%A1%A8%E6%83%85%E5%8C%85%22%5D; cleanHistoryStatus=0; ab_sr=1.0.1_YWI1NGFmMTJmZDI0YTQ0OTA1YmVlNTVlNmVmMTllNDRiMDNhNTg3NDlhOGU1ODYxZTA0MTk0ZjY4YmYyZWNmYmRmMTEzN2NmNTdmN2JmYjk3Y2FiOGIxZGNlMGZhMWUxMjQxMDY0MWVhNGE1YTExODZmOGIzZjk5M2E5MzY2MTU0ZTI5NzE4YjExODdkYTgxYTk1Y2ZjN2Q5ZjMzMWMzOQ==',
'Host':'image.baidu.com',
'Referer':'https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%B1%ED%C7%E9%B0%FC&fr=ala&ala=1&alatpl=normal&pos=0&dyTabStr=MTEsMCwzLDEsNiw0LDUsMiw4LDcsOQ%3D%3D',
'Sec-Ch-Ua':'"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Windows"',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
import requests
res = requests.get(url,headers=headers)
res.encoding='utf-8'
# print(res.text)
data = res.text
imgs = re.findall('"thumbURL":"(.*?)"',data)  # the data in the page source is not the same as the tags you see rendered in the browser
# counter used as the image file name
import os
os.makedirs('百度图片', exist_ok=True)  # make sure the target folder exists before writing
count = 1
for i in imgs:  # imgs is a list of image URLs
    with open(f'百度图片/{count}.png','wb') as f:
        # requests.get(i).content requests the URL and returns the binary data
        # write those bytes to the file
        f.write(requests.get(i).content)
    count += 1
    if count == 20:
        break
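One caveat: every file is saved as .png here even though Baidu's thumbnails are usually JPEGs. If the extension matters, a small tweak (a sketch; it assumes the URL path ends in a recognizable extension and falls back to .jpg otherwise) derives it from the URL:
from urllib.parse import urlparse
import os

for count, i in enumerate(imgs[:19], start=1):
    ext = os.path.splitext(urlparse(i).path)[1] or '.jpg'  # use the URL's own extension if it has one
    with open(f'百度图片/{count}{ext}', 'wb') as f:
        f.write(requests.get(i).content)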