# Good_Href_2.py
# Purpose: collect the goods (item) links for every category.
from bs4 import BeautifulSoup
import requests
import pymongo
from Judge_end2 import judge_end
import time
import random
from multiprocessing import Pool
# MongoDB connection shared by this module's scrapers (local default port).
client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
# Category links harvested by Item_Href.py; read here to find listing pages.
ItemHref=Ganji['ItemHref']
# Per-goods links written by get_Item_Href below.
GoodsHref=Ganji['GoodsHref']
#获取物品类别的页数
def get_Item_page(data):
    """Return the paginated listing URLs for one category.

    data: an ItemHref document; data["href"] is the category base URL
          ending in '/'.
    Probes <base>o1, <base>o2, ... and stops at the first page for which
    judge_end reports no "next page" link (hard-capped at 299 pages).
    Prints each collected URL as progress output.
    """
    half_url = data["href"]
    urls = []
    # The original also defined `flag` and `j`, which were never used.
    for page in range(1, 300):
        url = f"{half_url}o{page}"
        if not judge_end(url):
            break
        urls.append(url)
        print(url)
        # Polite random delay between page probes.
        time.sleep(round(random.uniform(1, 3), 2))
    return urls
def get_Item_Href(url, type):
    """Scrape one listing page and store each goods title/link in GoodsHref.

    url:  a category listing page (e.g. http://wh.ganji.com/guizi/o2/).
    type: the category name stored with each record.  NOTE: the name
          shadows the builtin `type`; kept unchanged for caller
          compatibility.
    Each stored record starts with data_state == "none" (unprocessed).
    """
    html = requests.get(url)
    html.encoding = 'utf-8'
    bs_html = BeautifulSoup(html.text, 'lxml')
    # Polite random delay between page fetches.
    time.sleep(round(random.uniform(3, 5), 2))
    # Anchors of the individual goods entries on the listing page.
    goods = bs_html.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a')
    docs = [
        {
            "title": a.get_text().strip(),
            "href": a.get('href'),
            "data_state": "none",
            "goods_type": type,
        }
        for a in goods
    ]
    # One batched write instead of one round trip per document;
    # insert_many([]) raises, so guard the empty-page case.
    if docs:
        GoodsHref.insert_many(docs)
'''
http://wh.ganji.com/guizi/o2/
'''
# if __name__ == '__main__':
# if GoodsHref.count()!=0:
# GoodsHref.remove()
# for data in ItemHref.find():
# print(data["Item"])
# pool=Pool()
# urls=get_Item_page(data)
# pool.map(get_Item_Href,urls)
# if __name__=='__main__':
#
# if GoodsHref.count()!=0:
# GoodsHref.remove()
#
# for data in ItemHref.find():
# urls=get_Item_page(data)
# print(urls)
# if len(urls)==0:
# for url in urls:
# get_Item_Href(url,data["Item"])
'''
"href" : "http://www.wh.ganji.com/motorola/"
"href" : "http://www.wh.ganji.com/shouji/_%E5%B0%8F%E7%B1%B3/"
'''
if __name__ == '__main__':
    # Bug fix: the original referenced ItemInformation and get_goods_info,
    # neither of which exists in this module (NameError) — that code belongs
    # to Good_Info.py.  Restored the intended flow (see the commented-out
    # drafts above): reset GoodsHref, then walk every category's pages.
    if GoodsHref.count() != 0:
        GoodsHref.remove()
    for item in ItemHref.find():
        for page_url in get_Item_page(item):
            get_Item_Href(page_url, item["Item"])
# Good_Info.py
# Purpose: collect the detailed information of each goods item.
from bs4 import BeautifulSoup
import requests
import pymongo
import time
import random
from multiprocessing import Pool
# MongoDB connection shared by this module (local default port).
client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
# Goods links produced by Good_Href_2.py; consumed by get_goods_info below.
GoodsHref=Ganji['GoodsHref']
# NOTE(review): the collection name 'ItemInforation' is misspelled, but the
# same spelling is used by every module — do not "fix" it without migrating data.
ItemInformation=Ganji['ItemInforation']
def get_goods_info(data):
    """Fetch one goods detail page and store the parsed fields.

    data: a GoodsHref document with 'href', 'data_state', 'goods_type'
          (and Mongo's '_id').  Records whose data_state is not "none"
          are skipped.
    Writes one document (title/price/type/add/goods_type) to
    ItemInformation; missing fields are stored as None.
    """
    if data["data_state"] != "none":
        return
    url = data['href']
    # Bug fix: the original wrote `x["data_state"]=="done"` — a no-op
    # comparison — and never persisted the state, so every record was
    # re-scraped on each run.  Mark the record done in the database.
    GoodsHref.update_one({'_id': data['_id']},
                         {'$set': {'data_state': 'done'}})
    # Polite random delay between detail-page fetches.
    time.sleep(round(random.uniform(1, 2), 2))
    html = requests.get(url)
    html.encoding = 'utf-8'
    bshtml = BeautifulSoup(html.text, 'lxml')
    # Title of the goods item.
    title = bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1')
    title_data = title[0].get_text() if title else None
    # Asking price.
    price = bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type')
    price_data = price[0].get_text() if price else None
    # Category shown on the page.
    goods_type = bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul[class="det-infor"] > li > span > a')
    type_data = goods_type[0].get_text() if goods_type else None
    # Location anchors; the first and last entries are not part of the
    # address, so keep only the middle slice (matches the original's
    # range(1, len-1) loop).
    add = bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > a')
    if len(add) > 2:
        mid_add = [a.get_text() for a in add[1:-1]]
        print(mid_add)
    else:
        mid_add = None
    ItemInformation.insert_one({
        'title': title_data,
        'price': price_data,
        'type': type_data,
        'add': mid_add,
        'goods_type': data['goods_type'],
    })
if __name__ == '__main__':
    # Start from a clean details collection.
    if ItemInformation.count() != 0:
        ItemInformation.remove()
    # Bug fix: the original looped forever (`while True`), creating — and
    # never closing — a brand-new Pool on every iteration and re-processing
    # the same links endlessly.  One managed pass does the intended job;
    # get_goods_info itself skips records already marked done.
    with Pool() as pool:
        pool.map(get_goods_info, GoodsHref.find())
# if ItemInformation.count()!=0:
# ItemInformation.remove()
# for data in GoodsHref.find():
# get_goods_info(data)
# Item_Href.py
# Purpose: collect the category (item-class) links.
from bs4 import BeautifulSoup
import requests
import pymongo
# MongoDB connection for the category-link collection (local default port).
client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
ItemHref=Ganji['ItemHref']
# Site root; category anchors on the index page carry relative hrefs.
url_site='http://www.wh.ganji.com'
def get_item_href(url):
    """Collect every category link from the Ganji catalogue page.

    url: the full-catalogue page (e.g. http://wh.ganji.com/wu/).
    Stores one document per category into ItemHref with the category
    name, its absolute link, and read_state == "none".
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # Category anchors listed on the catalogue page.
    anchors = soup.select('#wrapper > div.content > div > div > dl > dd > a')
    for anchor in anchors:
        ItemHref.insert_one({
            "Item": anchor.get_text(),
            "href": url_site + anchor.get('href'),
            "read_state": "none",
        })
if __name__ == '__main__':
    # Bug fix: this call was unguarded, so merely importing this module
    # (as Main.py does) kicked off a crawl and duplicated the database rows.
    get_item_href('http://wh.ganji.com/wu/')
# Judge_end2.py
# Purpose: decide whether a listing page offers a next page.
import requests
from bs4 import BeautifulSoup
def judge_end(url):
    """Return True when the listing page at *url* still has a next page."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # The pager renders a "下一页" (next page) span only while more pages exist.
    spans = soup.select('#wrapper > div.leftBox > div.pageBox > ul > li > a > span')
    return any(span.get_text() == "下一页" for span in spans)
if __name__ == '__main__':
    # Bug fix: this smoke test was unguarded, so every importer of this
    # module (e.g. Good_Href_2.py) fired a live HTTP request at import time.
    x = judge_end('http://wh.ganji.com/chuangdian/o20/')
    print(x)
'''
#wrapper > div.leftBox > div.pageBox > ul > li:nth-child(6) > a > span
'''
# Main.py
# Purpose: run the full pipeline to collect all goods information.
from Item_Href import get_item_href
# Bug fix: there is no 'Goods_href' module — the goods-link scraper lives in
# Good_Href_2.py (see that file's header), so the original import failed.
from Good_Href_2 import get_Item_Href
from Good_Info import get_goods_info
import pymongo
import time
client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
ItemHref=Ganji['ItemHref']
# NOTE(review): 'ItemInforation' is misspelled but used consistently by
# every module; renaming it would orphan the existing data.
ItemInformation=Ganji['ItemInforation']
def main():
    """Run the crawl pipeline: category links, goods links, then details.

    Bug fix: the original called every step with no arguments even though
    all three require them, so main() raised TypeError immediately.
    """
    url = "http://wh.ganji.com/wu/"
    # Step 1: harvest category links into ItemHref.
    get_item_href(url)
    # Step 2: harvest goods links for each category.
    # TODO(review): pagination (Good_Href_2.get_Item_page) is not wired in
    # here, so only each category's first listing page is scraped — confirm
    # whether full pagination is wanted at this level.
    for item in ItemHref.find():
        get_Item_Href(item["href"], item["Item"])
    # Step 3: fetch details for every stored goods link.
    for goods in Ganji['GoodsHref'].find():
        get_goods_info(goods)
# Monitor.py
# Purpose: periodically print how many records each collection holds.
import pymongo
import time
# MongoDB connection used only for read-side monitoring (local default port).
client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
ItemHref=Ganji['ItemHref']
GoodsHref=Ganji['GoodsHref']
# NOTE(review): 'ItemInforation' is misspelled but matches the name the
# other modules write to.
ItemInformation=Ganji['ItemInforation']
def monitor():
    """Print the size of each scraper collection every five seconds, forever."""
    labelled = (
        ("类别链接数——", ItemHref),
        ("物品链接数——", GoodsHref),
        ("物品详细条目数——", ItemInformation),
    )
    while True:
        for label, collection in labelled:
            print(label, collection.count())
        time.sleep(5)
monitor()