Week2_Practice_Final

Good_Href_2.py

Function: collect the links to individual goods

from bs4 import BeautifulSoup
import requests
import pymongo
from Judge_end2 import judge_end
import time
import random
from multiprocessing import Pool

client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
ItemHref=Ganji['ItemHref']
GoodsHref=Ganji['GoodsHref']

# Collect the listing-page URLs of one item category.
# Ganji paginates with an "o{n}" suffix, e.g. http://wh.ganji.com/guizi/o2/ is page 2.
def get_Item_page(data):

    half_url=data["href"]
    urls=[]
    for i in range(1,300):
        url=half_url+"o{}".format(i)
        urls.append(url)
        print(url)
        t_sleep=random.uniform(1,3)   # random pause between page checks
        time.sleep(round(t_sleep,2))
        if not judge_end(url):
            # No "next page" link: this was the last page of the category.
            break

    return urls

# Scrape one listing page and store every goods link it contains.
def get_Item_Href(url,goods_type):
    html=requests.get(url)
    html.encoding='utf-8'
    bs_html=BeautifulSoup(html.text,'lxml')
    t_sleep=random.uniform(3,5)
    time.sleep(round(t_sleep,2))
    # Extract the title and link of each goods entry
    goods=bs_html.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a')
    for i in goods:
        data={
            "title":i.get_text().strip(),
            "href":i.get('href'),
            "data_state":"none",      # not yet processed by Good_Info.py
            "goods_type":goods_type
        }
        GoodsHref.insert_one(data)

'''
Example page URL: http://wh.ganji.com/guizi/o2/
Example category hrefs stored by Item_Href.py:
"href" : "http://www.wh.ganji.com/motorola/"
"href" : "http://www.wh.ganji.com/shouji/_%E5%B0%8F%E7%B1%B3/"
'''
if __name__ == '__main__':
    # Start from a clean GoodsHref collection, then walk every category.
    if GoodsHref.count()!=0:
        GoodsHref.remove()
    for data in ItemHref.find():
        print(data["Item"])
        urls=get_Item_page(data)
        for url in urls:
            get_Item_Href(url,data["Item"])
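If the per-category crawl above becomes too slow, the pages of one category could be fetched in parallel. A plain pool.map(get_Item_Href, urls) cannot work because get_Item_Href also needs the category name; a minimal sketch, assuming the functions and collections defined above, binds the second argument with functools.partial:

from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    for data in ItemHref.find():
        urls = get_Item_page(data)
        with Pool() as pool:
            # partial fixes goods_type, so map only has to supply each URL
            pool.map(partial(get_Item_Href, goods_type=data["Item"]), urls)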

Good_Info.py

Function: fetch the detail information of each goods item

from bs4 import BeautifulSoup
import requests
import pymongo
import time
import random
from multiprocessing import Pool


client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
GoodsHref=Ganji['GoodsHref']
ItemInformation=Ganji['ItemInformation']



def get_goods_info(data):

    # Only process records that have not been scraped yet.
    if data["data_state"]=="none":
        url=data['href']
        t_sleep=random.uniform(1,2)
        time.sleep(round(t_sleep,2))
        html=requests.get(url)
        html.encoding='utf-8'
        bshtml=BeautifulSoup(html.text,'lxml')

        # Title
        title=bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1')
        title_data=title[0].get_text() if title!=[] else None

        # Price
        price=bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type')
        price_data=price[0].get_text() if price!=[] else None

        # Category
        goods_type=bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul[class="det-infor"] > li > span > a')
        type_data=goods_type[0].get_text() if goods_type!=[] else None

        # Location: keep only the middle links, dropping the first and last entries
        add=bshtml.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > a')
        if len(add)>2:
            add_data=[i.get_text() for i in add[1:-1]]
            print(add_data)
        else:
            add_data=None

        info={
            'title':title_data,
            'price':price_data,
            'type':type_data,
            'add':add_data,
            'goods_type':data['goods_type']
        }
        ItemInformation.insert_one(info)
        # Mark this href as processed so it is skipped on the next pass.
        GoodsHref.update_one({'_id':data['_id']},{'$set':{'data_state':'done'}})
if __name__ == '__main__':
    if ItemInformation.count()!=0:
        ItemInformation.remove()
    pool=Pool()
    pool.map(get_goods_info,GoodsHref.find())
    pool.close()
    pool.join()
    # Single-process alternative, useful for debugging:
    # if ItemInformation.count()!=0:
    #     ItemInformation.remove()
    # for data in GoodsHref.find():
    #     get_goods_info(data)
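Because each processed record is marked "done", the cursor handed to the pool can also filter on that flag directly, so finished records are skipped at the database instead of inside get_goods_info. A small sketch against the same GoodsHref collection:

# Let MongoDB filter out finished records before they reach the workers.
unprocessed=GoodsHref.find({"data_state":"none"})
pool=Pool()
pool.map(get_goods_info,unprocessed)
pool.close()
pool.join()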

Item_Href.py

Function: collect the item category links

from bs4 import BeautifulSoup
import requests
import pymongo

client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
ItemHref=Ganji['ItemHref']
url_site='http://www.wh.ganji.com'

def get_item_href(url):
    html=requests.get(url)
    html.encoding='utf-8'
    bs_html=BeautifulSoup(html.text,'lxml')

    # Collect the link of every category listed on the index page
    itemHref=bs_html.select('#wrapper > div.content > div > div > dl > dd > a')
    for i in itemHref:
        href=url_site+i.get('href')
        data={
            "Item":i.get_text(),
            "href":href,
            "read_state":"none"
        }
        ItemHref.insert_one(data)

if __name__ == '__main__':
    # Test run; the guard keeps the crawl from firing when Main.py imports this module.
    get_item_href('http://wh.ganji.com/wu/')
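Every request in this project goes out with the default requests User-Agent and no timeout, which classified-ad sites often block or leave hanging. A hardening sketch; the header value, the fetch helper, and the 10-second timeout are illustrative assumptions, not part of the original code:

# Hypothetical helper: a shared session with a browser-like User-Agent.
session=requests.Session()
session.headers.update({'User-Agent':'Mozilla/5.0'})  # illustrative value

def fetch(url):
    # Fail fast instead of hanging on a stalled connection.
    resp=session.get(url,timeout=10)
    resp.encoding='utf-8'
    return resp.text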

Judge_end2.py

Function: check whether a listing page has a next page

import requests
from bs4 import BeautifulSoup

def judge_end(url):
    html=requests.get(url)
    html.encoding='utf-8'
    bsHtml=BeautifulSoup(html.text,'lxml')

    # Inspect the pager entries: if none of them reads "下一页" ("next page"),
    # the current page is the last one.
    nextNum=bsHtml.select('#wrapper > div.leftBox > div.pageBox > ul > li > a > span')
    for i in nextNum:
        if i.get_text()=="下一页":
            return True
    return False

if __name__ == '__main__':
    # Quick manual test; guarded so importing judge_end does not trigger a request.
    x=judge_end('http://wh.ganji.com/chuangdian/o20/')
    print(x)
# Reference selector for a single pager entry:
# #wrapper > div.leftBox > div.pageBox > ul > li:nth-child(6) > a > span

Main.py

Function: run the whole pipeline to collect all goods information

from Item_Href import get_item_href
from Good_Href_2 import get_Item_page, get_Item_Href
from Good_Info import get_goods_info
import pymongo

client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
ItemHref=Ganji['ItemHref']
GoodsHref=Ganji['GoodsHref']
ItemInformation=Ganji['ItemInformation']


def main():
    # Stage 1: collect the category links from the index page.
    url="http://wh.ganji.com/wu/"
    get_item_href(url)
    # Stage 2: walk each category's pages and store every goods link.
    for data in ItemHref.find():
        for page_url in get_Item_page(data):
            get_Item_Href(page_url,data["Item"])
    # Stage 3: fetch the detail record for every stored goods link.
    for data in GoodsHref.find():
        get_goods_info(data)

if __name__ == '__main__':
    main()

Monitor.py

Function: show how many records the database holds for each collection

import pymongo
import time
client=pymongo.MongoClient('localhost',27017)
Ganji=client['Ganji']
ItemHref=Ganji['ItemHref']
GoodsHref=Ganji['GoodsHref']
ItemInformation=Ganji['ItemInformation']


def monitor():
    while True:
        print("category links:",ItemHref.count())
        print("goods links:",GoodsHref.count())
        print("goods detail records:",ItemInformation.count())
        time.sleep(5)

if __name__ == '__main__':
    monitor()
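A side note on the pymongo calls used throughout this project: Collection.count() and Collection.remove() are the old API; pymongo 3.7 deprecated count() in favor of count_documents(), and remove() has been superseded by delete_many(). Equivalent calls for newer pymongo versions:

print("category links:",ItemHref.count_documents({}))   # replaces ItemHref.count()
GoodsHref.delete_many({})                               # replaces GoodsHref.remove()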
