Analysis:
Two kinds of threads: downloaders (3) and parsers (3).
Content queue: downloader threads put data into the queue; parser threads get data from it.
URL queue: downloader threads get page numbers from the URL queue.
Writing data: protected by a lock.
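A stripped-down sketch of this layout before the full example (the names downloader, parser, page_queue and data_queue here are illustrative, not taken from the code below):

import threading
from queue import Queue

page_queue = Queue()      # url/page-number queue: downloaders only get from it
data_queue = Queue()      # content queue: downloaders put, parsers get
lock = threading.Lock()   # protects writes to the shared output file

def downloader():
    while not page_queue.empty():
        page = page_queue.get()
        data_queue.put('html of page %s' % page)   # stands in for the real HTTP request

def parser(fp):
    while True:
        data = data_queue.get(True, 10)   # raises queue.Empty after 10 s of silence, see the error further down
        with lock:                        # only one thread writes to the file at a time
            fp.write(data + '\n')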
Example code:
import threading
import time
from queue import Queue
from lxml import etree
import json
import requests
# list that holds the crawl (download) threads
g_crawl_list = []
# list that holds the parse threads
g_parse_list = []
class CrawlThread(threading.Thread):
    """Crawl thread: downloads the pages listed in page_queue."""
    def __init__(self, name, page_queue, data_queue):
        super(CrawlThread, self).__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }

    def run(self):
        print('%s---------线程启动' % self.name)
        while 1:
            # decide when the crawl thread should exit
            if self.page_queue.empty():
                break
            # take a page number from the queue
            page = self.page_queue.get()
            # build the url and send the request
            url = self.url.format(page)
            r = requests.get(url, headers=self.headers)
            # put the response text into data_queue
            self.data_queue.put(r.text)
        print('%s---------线程结束' % self.name)
class ParseThread(threading.Thread):
    """Parse thread: takes raw HTML from data_queue and extracts items."""
    def __init__(self, name, data_queue, fp, lock):
        super(ParseThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print('%s---------线程启动' % self.name)
        while 1:
            # if self.data_queue.empty():
            #     break
            # take one page of data from data_queue (block for up to 10 seconds)
            data = self.data_queue.get(True, 10)
            # parse the content
            self.page_content(data)
        print('%s---------线程结束' % self.name)

    def page_content(self, data):
        tree = etree.HTML(data)
        # find all the li elements first, then read each one's title and url
        li_list = tree.xpath('//ul[@class="cont-list"]/li')
        items = []
        for oli in li_list:
            # get the title
            title = oli.xpath('.//h2/a/text()')[0]
            # get the image url
            url_page = oli.xpath('.//div[@class="cont-list-main"]//img/@data-src')
            item = {
                '标题': title,
                '链接': url_page
            }
            items.append(item)
        # write to the file under the lock
        self.lock.acquire()
        self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')
        self.lock.release()
def create_queue():
    # create the page-number queue
    page_queue = Queue()
    for page in range(1, 6):
        page_queue.put(page)
    # create the content queue
    data_queue = Queue()
    return page_queue, data_queue

# create the crawl threads
def create_crawl_thread(page_queue, data_queue):
    crawl_name = ['采集线程1号', '采集线程2号', '采集线程3号']
    for name in crawl_name:
        # create one crawl thread
        tcrawl = CrawlThread(name, page_queue, data_queue)
        # save it to the list
        g_crawl_list.append(tcrawl)

# create the parse threads
def create_parse_thread(data_queue, fp, lock):
    parse_name = ['解析线程1号', '解析线程2号', '解析线程3号']
    for name in parse_name:
        # create one parse thread
        tparse = ParseThread(name, data_queue, fp, lock)
        # save it to the list
        g_parse_list.append(tparse)
def main():
    # create the queues
    page_queue, data_queue = create_queue()
    # open the output file
    fp = open('jian.json', 'a', encoding='utf8')
    # create the lock
    lock = threading.Lock()
    # create the crawl threads
    create_crawl_thread(page_queue, data_queue)
    time.sleep(3)
    # create the parse threads
    create_parse_thread(data_queue, fp, lock)
    # start all crawl threads
    for tcrawl in g_crawl_list:
        tcrawl.start()
    # start all parse threads
    for tparse in g_parse_list:
        tparse.start()
    # the main thread waits for the child threads to finish
    for tcrawl in g_crawl_list:
        tcrawl.join()
    for tparse in g_parse_list:
        tparse.join()
    # close the file here
    fp.close()
    print('主线程子线程全部结束')

if __name__ == '__main__':
    main()
Output:
采集线程1号---------线程启动
采集线程2号---------线程启动
采集线程3号---------线程启动
解析线程1号---------线程启动
解析线程2号---------线程启动
解析线程3号---------线程启动
采集线程2号---------线程结束
采集线程1号---------线程结束
采集线程3号---------线程结束
Exception in thread 解析线程3号:
Traceback (most recent call last):
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\threading.py", line 917, in _bootstrap_inner
self.run()
File "E:\Sublime Text 3\day1\2_xincheng.py", line 56, in run
data=self.data_queue.get(True,10)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\queue.py", line 178, in get
raise Empty
_queue.Empty
Exception in thread 解析线程1号:
Traceback (most recent call last):
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\threading.py", line 917, in _bootstrap_inner
self.run()
File "E:\Sublime Text 3\day1\2_xincheng.py", line 56, in run
data=self.data_queue.get(True,10)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\queue.py", line 178, in get
raise Empty
_queue.Empty
Exception in thread 解析线程2号:
Traceback (most recent call last):
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\threading.py", line 917, in _bootstrap_inner
self.run()
File "E:\Sublime Text 3\day1\2_xincheng.py", line 56, in run
data=self.data_queue.get(True,10)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\queue.py", line 178, in get
raise Empty
_queue.Empty
主线程子线程全部结束
[Finished in 15.1s]
The code that raises the error still needs to be optimized: each parse thread currently exits only by letting its 10-second blocking get() time out and crash with queue.Empty...
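One common way to avoid the queue.Empty traceback is to shut the parse threads down with a sentinel ("poison pill") instead of relying on a timed get(). A minimal, self-contained sketch of that pattern (SENTINEL, consumer, and the page loop here are illustrative, not part of the crawler code above):

import threading
from queue import Queue

SENTINEL = None  # a value that means "no more work"

def consumer(name, data_queue):
    while True:
        data = data_queue.get()          # block without a timeout
        if data is SENTINEL:
            break                        # clean exit, no traceback
        print(name, 'parsed', len(data), 'characters')

def main():
    data_queue = Queue()
    consumers = [threading.Thread(target=consumer, args=('parser-%d' % i, data_queue))
                 for i in range(3)]
    for t in consumers:
        t.start()
    for page in range(1, 6):             # stands in for the downloaders' output
        data_queue.put('<html>page %d</html>' % page)
    for _ in consumers:
        data_queue.put(SENTINEL)         # one sentinel per consumer
    for t in consumers:
        t.join()

if __name__ == '__main__':
    main()

An even smaller change would be to wrap data_queue.get(True, 10) in ParseThread.run() in try/except queue.Empty and break out of the loop instead of letting the exception escape.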
JSON data: (screenshot of the resulting jian.json omitted)