(一) A Brief Introduction to Crawlers
Some of the basics are covered in an earlier note of mine:
【1】https://www.jianshu.com/p/cd200fc878b9
Definition of a crawler: a program that crawls targeted content from the Internet (mostly web pages) and processes the resulting data automatically. A crawler consists of a URL library, a collector, and a parser.
How a crawler works:
- The collector automatically fetches the URLs in the URL library and hands the results to the parser.
- The parser extracts the target content and writes it to a file. This is where regular expressions are used to match the target content (the link at the top of this post introduces them). A minimal sketch of this pipeline follows below.
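Before diving into the pieces, here is a minimal sketch of that collector-to-parser pipeline. The function names (fetch_page, parse_links) and the sample URL are illustrative only, not a standard API:

import re
import requests

# Collector: download the raw HTML of one URL
def fetch_page(url):
    return requests.get(url, timeout=10).text

# Parser: pull every hyperlink target out of the HTML with a regex
def parse_links(html):
    return re.findall(r'href="(https?://[^"]+)"', html)

url_library = ["https://www.example.com"]  # a stand-in for a real URL library
for url in url_library:
    for link in parse_links(fetch_page(url)):
        print(link)  # a real crawler would write this to a file instead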
A few concepts:
- HTTP (HyperText Transfer Protocol): the protocol servers and browsers use to communicate.
- request: generally initiated by the browser toward the server side, asking for a web page or data resource on the server.
- response: the server's reply to the client, generally sending back the web page or data resource the client asked for.
- URL (Uniform Resource Locator): identifies a file or address on a server (the short example below breaks a sample URL into its parts).
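As a quick illustration of what a URL is made of, the standard library can take one apart (urllib.parse ships with Python, no install needed):

from urllib.parse import urlparse

parts = urlparse("https://www.thepaper.cn/newsDetail_forward_20167363")
print(parts.scheme)  # 'https' -> the protocol (HTTP over TLS)
print(parts.netloc)  # 'www.thepaper.cn' -> the server
print(parts.path)    # '/newsDetail_forward_20167363' -> the resource on the server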
(1) Implementing the Collector
First we need to put together the information an HTTP request carries. For example, when sending a request to a server we need a request header, called the header, which contains the browser, operating system, and similar information. Without this header, the WAF and other protection devices (anti-crawler mechanisms) on most sites will block our request. The header can be copied straight out of the browser.
Then we use the get method of the requests library to request the resource at the specified URL. If we get a reply and its status code is 200, the request succeeded. At that point we have the page resource and can hand it to the parser for extraction. (A more defensive variant is sketched after the code below.)
import requests
# Collector
def get_page(url):
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0",
        "Accept": "image/webp,*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "image",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "same-origin",
    }
    response = requests.get(url, headers=header)
    if response.status_code == 200:
        return response
    return None  # returning a string here would break callers that use .text/.content
print(get_page("https://www.thepaper.cn/newsDetail_forward_20167363"))
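In practice requests.get can also fail before any status code exists (DNS errors, timeouts), so it is worth knowing the defensive pattern too. A minimal sketch, assuming the same kind of header dict as above (get_page_safe is my name for it, not part of requests):

# Hedged variant of the collector: catches network errors and bad statuses
def get_page_safe(url, header):
    try:
        response = requests.get(url, headers=header, timeout=10)
        response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
        return response
    except requests.RequestException as e:  # base class of all requests errors
        print("request failed:", e)
        return None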
(2) The Parser: Function and Implementation
The parser's job is to filter and sift the HTML the collector returns and extract the content we need.
Regular expressions are what we need here: they find specific substrings inside a string (a small warm-up example follows the links below).
- Online testing tool:
【1】https://c.runoob.com/front-end/854/
- Matching rules:
【1】https://www.runoob.com/regexp/regexp-rule.html
【2】https://www.w3cschool.cn/zhengzebiaodashi/regexp-syntax.html
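As a warm-up, here is what re.findall does on a made-up HTML snippet (the string and URLs are invented for illustration):

import re

html = '<img src="https://example.com/a.jpg"><img src="https://example.com/b.jpg">'
print(re.findall(r'src="(https://[^"]+\.jpg)"', html))
# ['https://example.com/a.jpg', 'https://example.com/b.jpg']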
import re
import os
import xlwt
# Parser 1: save the matched images into a local folder
def get_img(url):
    text = get_page(url).text
    data = re.findall(r'src="(https://[a-zA-Z0-9./]*\.jpg)', text)
    print(data)
    if not os.path.exists("./img/"):  # create the folder if needed
        os.mkdir("./img/")
    index = 0
    for img in data:
        content = get_page(img).content  # fetch the image bytes
        with open("./img/" + str(index) + ".jpg", 'wb') as f:
            f.write(content)  # the with block closes the file automatically
        index += 1
# get_img("https://www.thepaper.cn/newsDetail_forward_20167363")
# Parser 2: crawl the image URLs and save them into an Excel file
def save_info(url):
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)  # create a Workbook object
    sheet = book.add_sheet("新闻图片路径", cell_overwrite_ok=True)  # create a worksheet
    sheet.col(1).width = 256 * 50
    col = ["index", "url"]
    text = get_page(url).text
    data = re.findall(r'src="(https://[a-zA-Z0-9./]*\.jpg)', text)
    # write the header row
    style = xlwt.easyxf('pattern: pattern solid, fore_colour ice_blue; font: bold on')
    for i in range(len(col)):
        sheet.write(0, i, col[i], style)
    # write the data rows
    for i, img in enumerate(data):
        sheet.write(i + 1, 0, i)
        sheet.write(i + 1, 1, img)
    if not os.path.exists("./test/"):  # the save below fails if the folder is missing
        os.mkdir("./test/")
    book.save("./test/新闻图片信息.xls")
save_info("https://www.thepaper.cn/newsDetail_forward_20167363")
Here is a small example: crawling images from Baidu.
import os
import re
import requests
from urllib import parse
class BaiduImageSpider:
    def __init__(self, url) -> None:
        self.url = url
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0",
            "Accept": "image/webp,*/*",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "image",
            "Sec-Fetch-Mode": "no-cors",
            "Sec-Fetch-Site": "same-origin",
        }
    # fetch the result page and extract the image links
    def get_img(self, word):
        res = requests.get(self.url, headers=self.header)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            html = res.text
            pattern = re.compile('"hoverURL":"(.*?)"', re.S)
            img_link_list = pattern.findall(html)
            self.save_img(img_link_list, word)
        else:
            print("fetch failed!")
    # download and save the images
    def save_img(self, img_list, word):
        if not os.path.exists("./img/" + word):
            os.makedirs("./img/" + word)  # makedirs also creates ./img/ if it is missing
        path = "./img/" + word + '/'
        index = 0
        for img in img_list:
            res = requests.get(img, headers=self.header)
            if res.status_code == 200:
                with open(path + str(index) + ".jpg", 'wb') as f:
                    f.write(res.content)  # save the image bytes
            index += 1
words = "猫猫狗狗"
word = parse.quote(words)
url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word={}".format(word)
spider = BaiduImageSpider(url)
spider.get_img(words)
Batch processing, with Chinese keyword support
# -*- coding: UTF-8 -*-
import requests
import tqdm
from urllib import parse
import re
def configs(search, page, number):
    """
    :param search: keyword to search for
    :param page: result page index
    :param number: how many results to request per page
    :return: the request URL and its query parameters
    """
    url = 'https://image.baidu.com/search/acjson'
    params = {
        'tn': 'resultjson_com',
        # 'logid': '7603311155072595725',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': search,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': search,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': '',  # this parameter is undocumented, but it must be present
        'pn': str(60 * page),  # result offset: 30-60-90...
        'rn': number,  # results per page (30 here)
        'gsm': '1e',
        '1618827096642': ''
    }
    return url, params
def loadpic(number, page):
    """
    :param number: total number of images to download
    :param page: result page index to start from
    :return: None
    """
    while True:
        if number == 0:
            break
        url, params = configs(search, page, number)
        result = requests.get(url, headers=header, params=params)
        result.encoding = 'utf-8'
        # extract the image links with a regex
        html = result.text
        url_list = re.findall('"thumbURL":"(.*?)",', html, re.S)
        for i in range(len(url_list)):
            getImg(url_list[i], 60 * page + i, path)
            bar.update(1)
            number -= 1
            if number == 0:
                break
        page += 1
    print("\nfinish!")
def getImg(url, idx, path):
    """
    :param url: image URL to download
    :param idx: index used in the saved filename
    :param path: folder to save into
    :return: None
    """
    img = requests.get(url, headers=header)
    with open(path + 'maintenanceWorker_' + str(idx + 1) + '.jpg', 'wb') as file:
        file.write(img.content)
if __name__ == '__main__':
    # search = parse.quote("党参")
    search = "龙石药材"
    number = 400
    path = 'E:/中药材/code/buchongtupian/img/'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    bar = tqdm.tqdm(total=number)  # progress bar over the target image count
    page = 0
    loadpic(number, page)
See also:
【Baidu image crawling】https://blog.csdn.net/Wenweno0o/article/details/121424776
【Google image crawling】https://blog.csdn.net/Wenweno0o/article/details/121487706
Friends, please use these crawler techniques for work and study only!