任务:
1、从网站:http://sh.ganji.com/wu/ 中获取类型链接
2、从类型地址中获取商品链接(设置末尾页判断,以防爬取错误信息)
3、从商品地址中获取商品信息:商品标题—goods_title、价格—price、交易地点—area
成果:
商品链接
商品详情
多进程
代码:
第一部分(get_channel.py,提供频道列表 channels):
from bs4 import BeautifulSoup
import requests
# How the channel list below was collected (kept for reference only; this
# snippet is not executed):
#
#     start_url = 'http://sh.ganji.com/wu/'
#     url = 'http://sh.ganji.com'
#     wb_data = requests.get(start_url)
#     soup = BeautifulSoup(wb_data.text,'lxml')
#     channel_links = soup.select('#wrapper > div.content > div > div > dl > dt > a')
#     for channel_link in channel_links:
#         link = url+channel_link.get('href')
#         print(link)

# Category ("channel") index pages, one URL per line. The value is a single
# whitespace-separated string; the crawler entry point calls channels.split()
# to obtain the individual URLs.
channels = '''
http://sh.ganji.com/jiaju/
http://sh.ganji.com/rirongbaihuo/
http://sh.ganji.com/shouji/
http://sh.ganji.com/bangong/
http://sh.ganji.com/nongyongpin/
http://sh.ganji.com/jiadian/
http://sh.ganji.com/ershoubijibendiannao/
http://sh.ganji.com/ruanjiantushu/
http://sh.ganji.com/yingyouyunfu/
http://sh.ganji.com/diannao/
http://sh.ganji.com/xianzhilipin/
http://sh.ganji.com/fushixiaobaxuemao/
http://sh.ganji.com/meironghuazhuang/
http://sh.ganji.com/shuma/
http://sh.ganji.com/laonianyongpin/
http://sh.ganji.com/xuniwupin/
http://sh.ganji.com/qitawupin/
http://sh.ganji.com/ershoufree/
http://sh.ganji.com/wupinjiaohuan/
'''
第二部分(get_urls.py,提供 get_url 与 get_info):
from bs4 import BeautifulSoup
import requests
import pymongo
import time
# MongoDB handles shared by get_url / get_info: database "ganji" on the local
# server, with one collection for goods links and one for goods details.
client = pymongo.MongoClient(host='localhost', port=27017)
ganji = client.ganji
sheet_urls = ganji.sheet_urls
sheet_info = ganji.sheet_info

# Sample inputs: six listing pages (o57..o62) of the "jiaju" channel and one
# goods detail page. NOTE(review): nothing visible in this module reads them;
# presumably left over from manual testing -- confirm before removing.
urls = list(map('http://sh.ganji.com/jiaju/o{}'.format, range(57, 63)))
url = 'http://zhuanzhuan.ganji.com/detail/788991434206199812z.shtml'
def _external_goods_link(href):
    """Return *href* without its query string if it points off sh.ganji.com, else None."""
    from urllib.parse import urlsplit

    if not href:
        return None
    host = urlsplit(href).netloc
    # Relative links (empty host) resolve to sh.ganji.com itself, so they are
    # rejected together with explicit sh.ganji.com links.
    if not host or host == 'sh.ganji.com':
        return None
    return href.split('?')[0]


def get_url(url):
    """Store the goods links found on one listing page in ``sheet_urls``.

    Args:
        url: Address of a channel listing page (e.g. ``.../jiaju/o3``).

    Returns:
        None. Each accepted link is inserted as ``{'url': link}`` and printed.
        'End' is printed when the page contains a ``noinfo`` block, which marks
        a page past the end of the listing.

    Only links whose host is not ``sh.ganji.com`` are kept, with the query
    string removed. Each link is stored at most once per page.
    """
    try:
        # A timeout keeps a stalled connection from blocking a pool worker forever.
        wb_data = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print('Request failed: {} ({})'.format(url, exc))
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)  # throttle between requests

    div_classes = [div.get('class') for div in soup.select('#infolist > div')]
    if any(cls == ['noinfo'] for cls in div_classes):
        print('End')
    if not any(cls != ['noinfo'] for cls in div_classes):
        # No listing block on this page: nothing to collect.
        return

    # Walk the links once for the whole page rather than once per listing
    # block, so the same link is not inserted repeatedly.
    seen = set()
    for anchor in soup.select('#infolist > div > table > tbody > tr > td.t > a'):
        link = _external_goods_link(anchor.get('href'))
        if link is None or link in seen:
            continue
        seen.add(link)
        sheet_urls.insert_one({'url': link})
        print(link)
def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or None if absent."""
    matches = soup.select(selector)
    return matches[0].text if matches else None


def get_info(url):
    """Scrape one goods detail page and store its fields in ``sheet_info``.

    Args:
        url: Address of a goods detail page.

    Returns:
        None. On success a document with the keys ``goods_title``, ``price``
        and ``area`` is inserted and the three values are printed.

    Pages whose <title> equals the empty-listing placeholder are skipped
    silently. Pages where any of the three fields cannot be located are
    skipped with a message instead of raising, because an exception raised in
    a pool worker would otherwise go unnoticed.
    """
    try:
        # A timeout keeps a stalled connection from blocking a pool worker forever.
        wb_data = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print('Request failed: {} ({})'.format(url, exc))
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)  # throttle between requests

    # A title consisting only of the site boilerplate means the listing has no content.
    if soup.title is not None and soup.title.text.split() == ['【图】_的闲置物品-转转,赶集二手']:
        return

    goods_title = _first_text(soup, 'body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
    price = _first_text(soup, 'body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
    area = _first_text(soup, 'body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
    if goods_title is None or price is None or area is None:
        print('Skipped (unexpected page layout): {}'.format(url))
        return

    print(goods_title, price, area)
    sheet_info.insert_one({'goods_title': goods_title, 'price': price, 'area': area})
第三部分(主程序,导入 get_channel 与 get_urls 并以多进程运行):
from multiprocessing import Pool
from get_urls import get_url,get_info
import pymongo
from get_channel import channels
# MongoDB handles for the driver script: database "ganji" on the local server.
# sheet_urls is read below to obtain the goods links to scrape.
client = pymongo.MongoClient(host='localhost', port=27017)
ganji = client.ganji
sheet_urls = ganji.sheet_urls
sheet_info = ganji.sheet_info
def get_all_links(channel, first_page=1, last_page=10):
    """Crawl a range of listing pages of one channel via ``get_url``.

    Args:
        channel: Channel base URL ending in '/', e.g. 'http://sh.ganji.com/jiaju/'.
        first_page: Number of the first listing page to fetch (default 1).
        last_page: Number of the last listing page to fetch, inclusive
            (default 10, the range originally hard-coded here).

    Returns:
        None. Page N of a channel is addressed as ``<channel>oN``.
    """
    for n in range(first_page, last_page + 1):
        get_url('{}o{}'.format(channel, n))
if __name__ == '__main__':
    # The guard is required by multiprocessing: with the "spawn" start method
    # every worker re-imports this module, and an unguarded Pool(...) at
    # import time would make each worker try to start its own pool.
    pool = Pool(4)

    # Spider 1: collect all goods links, one channel per task.
    pool.map(get_all_links, channels.split())

    # Spider 2: scrape the details of every stored goods link.
    pending = []
    for i in sheet_urls.find():
        url = i['url']
        pending.append((url, pool.apply_async(get_info, args=(url,))))
    pool.close()
    pool.join()

    # apply_async keeps a worker's exception inside its result object; fetch
    # every result so failures are reported instead of silently dropped.
    for url, result in pending:
        try:
            result.get()
        except Exception as exc:
            print('get_info failed for {}: {}'.format(url, exc))