代码
# pre.py
from bs4 import BeautifulSoup
import requests
#
# Discover every channel entry URL from the "all categories" page.
#
def parse_list():
    """Fetch the Ganji "all goods" page and print each channel entry URL.

    Side effects: one HTTP GET, then prints one absolute channel URL per
    line. Returns None; the printed list is what `category_list` below
    was built from.
    """
    weburl = 'http://bj.ganji.com/wu/'
    # A timeout keeps the script from hanging forever on a dead server.
    web_data = requests.get(weburl, timeout=30)
    # requests already decodes the body into str; `from_encoding` only
    # applies to bytes input and was silently ignored (with a warning)
    # in the original call, so it is dropped here.
    soup = BeautifulSoup(web_data.text, 'lxml')
    suburllist = soup.select('#wrapper > div.content > div > div > dl > dt > a')
    for suburl in suburllist:
        print('http://bj.ganji.com' + suburl.get('href'))
# Channel entry URLs found by parse_list(), pasted in verbatim.
# Consumers call category_list.split() to get them as a list, so the
# surrounding blank lines of the triple-quoted string are harmless.
category_list = '''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
# Script entry point: print the discovered channel entry URLs.
if __name__ == '__main__':
    parse_list()
# splider1.py
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
import time
import pymongo
import pre
# Module-level MongoDB handles shared by every function below:
# database 'ganji', collection 't_urllist' stores the scraped item links.
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
t_urllist = ganji['t_urllist']
#
# Parse one listing page and store every item link into MongoDB.
#
def parse_list(url):
    """Scrape the listing page at *url* and insert its item links.

    Two page flavours exist: the "Zhuanzhuan" platform layout and
    Ganji's native layout. Each uses different selectors and is tagged
    with a distinct ``type`` ('zz' vs 'nm') so the detail spider can
    dispatch later. Inserted documents look like
    {'title', 'url', 'type', 'flag': False}; ``flag`` marks "not yet
    processed". Prints a notice when the page has no more data.
    """
    # A timeout keeps a worker from hanging forever on a dead page.
    web_data = requests.get(url, timeout=30)
    soup = BeautifulSoup(web_data.text, 'lxml')
    if soup.find('table', 'tbimg'):
        # "Zhuanzhuan" listing page that still has data.
        titles = soup.select('#infolist > div.infocon > table > tbody > tr.zzinfo > td.t > a')
        for title in titles:
            t_urllist.insert_one({'title': title.get_text(), 'url': title.get('href'), 'type': 'zz', 'flag': False})
    elif soup.find('div', 'layoutlist') and soup.find('ul', 'pageLink clearfix'):
        # Ganji-native listing page that still has data (a pager is present).
        titles = soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > a')
        for title in titles:
            t_urllist.insert_one({'title': title.get('title'), 'url': title.get('href'), 'type': 'nm', 'flag': False})
    else:
        # Neither layout matched: we are past the channel's last page.
        print('后面没有啦 : ' + url)
#
# Walk one channel page by page, persisting every page's listings.
#
def process(channel):
    """Parse listing pages 1..99 of *channel* into the database."""
    # Page 1 is special: appending 'o1' to the channel URL would open
    # page 2 instead of page 1, so the bare channel URL is used first.
    parse_list(channel)
    for page in range(2, 100):
        parse_list('{}o{}/'.format(channel, page))
    # time.sleep(2)
#
# Program entry: fan the channel list out to a pool of worker
# processes (one channel per task) to fill t_urllist.
#
if __name__ == '__main__':
    # process('http://bj.ganji.com/bangong/')
    pool = Pool()
    pool.map(process, pre.category_list.split())
# splider2.py
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
import time
import pymongo
# Module-level MongoDB handles: 't_urllist' is the url queue written by
# splider1; 't_detail' receives the scraped item details.
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
t_urllist = ganji['t_urllist']
t_detail = ganji['t_detail']
#
# Parse an item detail page hosted on the "Zhuanzhuan" platform.
#
def parse_zz_detail(url):
    """Scrape title/price/area/category from the Zhuanzhuan detail page
    at *url* and insert one record into ``t_detail``.

    Pages whose item was taken down (they show a 'soldout_btn' badge)
    are skipped with a log line, as are pages whose layout no longer
    matches the selectors.
    """
    web_data = requests.get(url, timeout=30)
    soup = BeautifulSoup(web_data.text, 'lxml')
    if soup.find('span', 'soldout_btn'):
        # Item removed from sale: nothing to record.
        print('商品下架啦!' + url)
        return
    titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
    prices = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
    areas = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
    categories = soup.select('#nav > div')
    # Guard against layout changes / anti-scraping pages that would
    # otherwise raise IndexError on the [0] lookups below.
    if not (titles and prices and areas and categories):
        print('页面解析失败 : ' + url)
        return
    data = {
        'url': url,
        'title': titles[0].get_text().strip(),
        'price': prices[0].get_text().strip(),
        'area': areas[0].get_text().strip(),
        # The last stripped string of the breadcrumb is the leaf category.
        'category': list(categories[0].stripped_strings)[-1],
    }
    # print(data)
    t_detail.insert_one(data)
#
# Parse an item detail page hosted on Ganji's own platform.
#
def parse_nm_detail(url):
    """Scrape title/price/area/category from a Ganji-native detail page
    at *url* and insert one record into ``t_detail``.

    A 404 response means the item was taken down; it is skipped with a
    log line, as are pages whose layout no longer matches the selectors.
    """
    web_data = requests.get(url, timeout=30)
    if web_data.status_code == 404:
        # Item removed from sale: nothing to record.
        print('商品下架啦!' + url)
        return
    soup = BeautifulSoup(web_data.text, 'lxml')
    titles = soup.select(
        '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1')
    prices = soup.select(
        '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type')
    areas = soup.select(
        '#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(2) > div > ul > li:nth-of-type(3) > a')
    categories = soup.select(
        '#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(2) > div > ul > li:nth-of-type(1) > span > a')
    # Guard against layout changes / anti-scraping pages that would
    # otherwise raise IndexError on the [0] lookups below. `areas` may
    # legitimately be empty, so it is not part of the guard.
    if not (titles and prices and categories):
        print('页面解析失败 : ' + url)
        return
    data = {
        'url': url,
        'title': titles[0].get_text().strip(),
        'price': prices[0].get_text().strip(),
        # The area breadcrumb can match several <a> nodes; keep them all.
        'area': [a.text for a in areas],
        'category': list(categories[0].stripped_strings)[-1],
    }
    # print(data)
    t_detail.insert_one(data)
#
# Generic parsing entry: dispatch a stored url row to its platform parser.
#
def parse_detail(row):
    """Scrape the detail page referenced by *row* (a t_urllist document)
    and mark the row as processed.

    ``row['type'] == 'zz'`` selects the Zhuanzhuan parser; any other
    value falls through to the Ganji-native parser.
    """
    print(row)
    if row['type'] == 'zz':
        parse_zz_detail(row['url'])
    else:
        parse_nm_detail(row['url'])
    # Mark the record as handled. Collection.update() is deprecated and
    # was removed in PyMongo 4; update_one is the supported equivalent
    # for a single-document $set.
    t_urllist.update_one({'_id': row['_id']}, {'$set': {'flag': True}})
#
# Program entry: read unprocessed urls from the database and scrape
# their detail pages with a pool of worker processes.
#
if __name__ == '__main__':
    # parse_detail('http://zhuanzhuan.ganji.com/detail/797106589634494469z.shtml?from=pc&source=ganji&cate=%E5%8C%97%E4%BA%AC%E8%B5%B6%E9%9B%86%7C%E5%8C%97%E4%BA%AC%E4%BA%8C%E6%89%8B%7C%E5%8C%97%E4%BA%AC%E4%BA%8C%E6%89%8B%E6%89%8B%E6%9C%BA&cateurl=bj|wu|shouji', 'zz')
    # parse_detail('http://bj.ganji.com/bangong/2413656831x.htm', 'nm')
    # flag == False selects rows not yet handled by parse_detail.
    rows = t_urllist.find({'flag': False})
    pool = Pool()
    pool.map(parse_detail, rows)
总结
- 赶集网的分页,第一页与第二页的规则不同,第一页不能直接拼接“o1/”作为分页标识。
- 赶集的列表及商品页面有两种:基于“转转”平台的 和 基于赶集自身平台的。在列表识别以及详情页面爬取时需要予以区分处理。
- 基于转转的列表页面中,个人信息与商家信息的区分要根据<tr>标签的css样式差异。