1. The Basic Design of a Multi-threaded Crawler
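The design is a classic producer-consumer pipeline: a queue of page numbers feeds the crawl threads (producers), each of which downloads a page and pushes the raw HTML into a data queue; the parse threads (consumers) pull pages from that queue, extract titles and image URLs with XPath, and append the results as JSON lines to a shared file guarded by a lock.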
The example crawls image posts from fanjian.net; the full source code is as follows:
import threading
import json
from queue import Queue, Empty

import requests
from lxml import etree

# Holds the crawl (producer) threads
g_crawl_list = []
# Holds the parse (consumer) threads
g_parse_list = []

class CrawlThread(threading.Thread):
    def __init__(self, name, page_queue, data_queue):
        super(CrawlThread, self).__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }

    def run(self):
        print('%s started' % self.name)
        while True:
            # Take a page number from the queue; an empty queue means every
            # page has been claimed, so the crawl thread can exit.
            # get_nowait() avoids the race between a separate empty() check
            # and get() when several crawl threads share the queue.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break
            # Build the page URL and send the request
            url = self.url.format(page)
            r = requests.get(url, headers=self.headers)
            # Hand the response body to the parse threads
            self.data_queue.put(r.text)
        print('%s finished' % self.name)

class ParserThread(threading.Thread):
    def __init__(self, name, data_queue, fp, lock):
        super(ParserThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print('%s started' % self.name)
        while True:
            try:
                # Wait up to 10 seconds for a page of HTML; if nothing
                # arrives, assume the crawl threads are done and exit
                data = self.data_queue.get(timeout=10)
            except Empty:
                break
            # Parse the page
            self.parse_content(data)
        print('%s finished' % self.name)

    def parse_content(self, data):
        tree = etree.HTML(data)
        # Every post on the page is an <li> under ul.cont-list
        li_list = tree.xpath("//ul[@class='cont-list']/li")
        items = []
        for li in li_list:
            # Post title
            title = li.xpath(".//h2/a/text()")[0]
            # Image URLs: the page lazy-loads its images, so the real
            # address is in the data-src attribute rather than src
            image_url = li.xpath(".//div[contains(@class, 'cont-list-main')]/p/img/@data-src")
            item = {
                'title': title,
                'link': image_url
            }
            items.append(item)
        # The output file is shared by all parse threads, so writes
        # must be serialized through the lock
        with self.lock:
            self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')

# Create the page-number queue and the data queue
def create_queue():
    # Page-number queue: pages 1-49
    page_queue = Queue()
    for page in range(1, 50):
        page_queue.put(page)
    # Queue for the downloaded HTML
    data_queue = Queue()
    return page_queue, data_queue

# Create the crawl threads
def create_crawl_thread(page_queue, data_queue):
    crawl_names = ['Crawler-1', 'Crawler-2', 'Crawler-3']
    for name in crawl_names:
        tcrawl = CrawlThread(name, page_queue, data_queue)
        g_crawl_list.append(tcrawl)

# Create the parse threads
def create_parse_thread(data_queue, fp, lock):
    parse_names = ['Parser-1', 'Parser-2', 'Parser-3']
    for name in parse_names:
        tparse = ParserThread(name, data_queue, fp, lock)
        g_parse_list.append(tparse)

def main():
    # Create the queues
    page_queue, data_queue = create_queue()
    # Open the output file shared by all parse threads
    fp = open('jian.json', 'a', encoding='utf-8')
    # Lock that serializes writes to the shared file
    lock = threading.Lock()
    # Create the crawl threads and the parse threads
    create_crawl_thread(page_queue, data_queue)
    create_parse_thread(data_queue, fp, lock)
    # Start all crawl threads
    for tcrawl in g_crawl_list:
        tcrawl.start()
    # Start all parse threads
    for tparse in g_parse_list:
        tparse.start()
    # The main thread waits for every worker thread to finish
    for tcrawl in g_crawl_list:
        tcrawl.join()
    for tparse in g_parse_list:
        tparse.join()
    # Close the output file
    fp.close()
    print('Main thread: all worker threads finished')

if __name__ == '__main__':
    main()
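One caveat in the code above: the parse threads decide they are finished purely by a 10-second timeout on the data queue, which can exit too early on a slow network or idle needlessly on a fast one. A more deterministic shutdown is the standard sentinel pattern: once all producers have finished, put one None marker per consumer into the queue, and let each consumer exit when it reads one. Below is a minimal, self-contained sketch of that pattern (generic names, independent of the crawler above):

import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()
        if item is None:            # sentinel: no more work is coming
            break
        print('processed', item)    # stand-in for real parsing

q = Queue()
consumers = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in consumers:
    t.start()
for page in range(10):             # stand-in for downloaded pages
    q.put(page)
for _ in consumers:                # one sentinel per consumer thread
    q.put(None)
for t in consumers:
    t.join()

Applied to the crawler, the main thread would put the sentinels into data_queue after joining the crawl threads, and ParserThread.run would break on None instead of relying on the timeout.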
I'm still a beginner at this; if you spot any mistakes or omissions, corrections from readers are very welcome.