2week-homework

总结

Result.png

代码

# Spider 1
def get_url_list_from(channel, page, who_sells=''):
    """Crawl one listing page of a channel and process every item link on it.

    The page URL is built as ``{channel}/{who_sells}o{page}``. Every
    ``dl.list-bigpic > dt > a`` link whose href contains ``ganji.com`` is
    stored in ``url_list`` and immediately scraped with ``get_item_info``.

    Args:
        channel: Base URL of the channel; the page path is appended to it.
        page: Page number placed after the ``o`` in the URL.
        who_sells: Optional path segment inserted before ``o{page}``.

    Returns:
        None. Links are written to ``url_list`` and progress is printed.
    """
    list_view = '{}/{}o{}'.format(channel, who_sells, page)
    wb_data = requests.get(list_view, headers=headers, proxies=proxies, timeout=6)
    if wb_data.status_code != 200:
        print('status_code is ' + str(wb_data.status_code))
        return

    soup = BeautifulSoup(wb_data.text, 'lxml')
    # A page without a `pageBox` div is reported as being past the last page.
    if not soup.find('div', 'pageBox'):
        print('last page: ' + str(page))
        return

    use_urls = []
    for link in soup.select('dl.list-bigpic > dt > a'):
        href = link.get('href')
        # An <a> without an href yields None: skip it instead of crashing,
        # and keep only links that contain ganji.com.
        if not href or 'ganji.com' not in href:
            continue
        use_urls.append(href)
        print('item' + href)
        url_list.insert_one({'url': href})
        get_item_info(href)
    print(use_urls)



# Spider 2
def get_item_info(url):
    """Scrape one item detail page and store the extracted fields in ``item_info``.

    Args:
        url: Address of the item detail page.

    Returns:
        None. The record is printed and inserted into ``item_info``; a 404
        response or a parsing/printing error is reported on stdout instead.
    """
    # Same 6-second timeout as get_url_list_from, so a server that stops
    # responding cannot stall the crawl indefinitely.
    wb_data = requests.get(url, headers=headers, timeout=6)
    if wb_data.status_code == 404:
        print('404 not found')
        return

    try:
        soup = BeautifulSoup(wb_data.text, 'lxml')

        prices = soup.select('.f22.fc-orange.f-type')
        pub_dates = soup.select('.pr-5')
        areas = soup.select('ul.det-infor > li:nth-of-type(3) > a')
        cates = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')

        data = {
            'title': soup.title.text.strip(),
            # A missing price falls back to 0, a missing date to "".
            'price': prices[0].text.strip() if prices else 0,
            # Non-breaking spaces are replaced with plain spaces.
            'pub_date': pub_dates[0].text.strip().replace(u'\xa0', u' ') if pub_dates else "",
            'area': [area.text.strip() for area in areas],
            'cate': [cate.text.strip() for cate in cates],
            'url': url
        }

        print(data)
        item_info.insert_one(data)
    except AttributeError:
        # Raised e.g. when the page has no <title>, so soup.title is None.
        print('shits happened!')
    except UnicodeEncodeError:
        print('shits happened again!')

总结

1. 断点续传

# Every URL stored in url_list.
db_urls = [item['url'] for item in url_list.find()]
# Every URL that already has a record in item_info.
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
# Set difference: URLs that were collected but have no item record yet.
rest_of_urls = x-y

# Only the remaining URLs are scraped, so an interrupted run can be resumed.
pool.map(get_item_info, rest_of_urls)

思路主要是逻辑上的，并不是真正网络意义上的断点续传：在 Table-1（url_list）和 Table-2（item_info）中都保存同一个 key——url。
利用 url 相同这一点，在 Table-1 已先行获取完全部 url、数量不再变化的前提下，请求 Table-2 的数据时，即使发生中断、冗余等问题，也可以根据两张表中 url 集合的差集（Table-1 中有而 Table-2 中还没有的 url），
来继续 Table-2 的数据爬取。

2. Set 函数

a = ['1','1','2','3','5']
b = ['1','1','2','5']

# set() removes duplicate elements; the order of a set is not guaranteed.
x = set(a)      # {'1', '2', '3', '5'}
y = set(b)      # {'1', '2', '5'}

# Note: x and y are sets, not lists; call list(x) when a list is needed.

# Sets support subtraction: elements that also occur in y are removed from x.
print( x - y)   # {'3'}

3. Pool进程池的使用

# Create a pool with as many worker processes as there are CPUs.
pool = multiprocessing.Pool(multiprocessing.cpu_count())

close()           关闭 pool，使其不再接受新的任务。

terminate()       立即结束工作进程，不再处理未完成的任务。

join()            主进程阻塞，等待子进程退出；join 方法要在 close 或 terminate 之后调用。

具体用法如下：

# Start one worker process per CPU.
worker_count = multiprocessing.cpu_count()
pool = multiprocessing.Pool(worker_count)

# channelist is split on whitespace; each piece is handed to a worker.
# map() blocks until every piece has been processed.
channel_entries = channelist.split()
pool.map(get_all_links_from, channel_entries)

# Disabled alternative: scrape only the URLs that are still missing.
# pool.map(get_item_info, rest_of_urls)

# Stop accepting new tasks, then wait for the workers to exit.
pool.close()
pool.join()

参考文章：进程池

4. Headers

headers = {
    # The HTTP header name is "User-Agent" (hyphenated). A key spelled
    # 'UserAgent' would be sent as a separate, unrelated header.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.21 Safari/537.36',
    'Connection': 'keep-alive'
}

# Request headers for the jst1.58.com host.
# The item id is spelled `info_id` in both places it is used, so the literal
# depends on a single name.
# NOTE(review): the cookie values are hard-coded — presumably copied from a
# browser session and likely to expire; confirm before reuse.
headers = {
        'User-Agent':r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        # str.format fills the single `final_history={}` placeholder with the item id.
        'Cookie':r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(info_id)),
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host':'jst1.58.com',
        # The Referer is built from the same item id.
        'Referer':r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id)
}

5. lambda 匿名函数

# map() applies the lambda to every element; list() collects the squares.
list(map(lambda x: x * x, [1, 2, 3, 4, 5, 6, 7, 8, 9]))

# Value of the expression above:
[1, 4, 9, 16, 25, 36, 49, 64, 81]


# Select every <a> that matches the listing selector.
links = soup.select('dl.list-bigpic > dt > a')

data = {
       # The lambda reads the href attribute of each selected element.
       'link': list(map(lambda x: x.get('href'), links))
}


等价于：

 # List-comprehension form of list(map(lambda x: x.get('href'), links)).
 [link.get('href') for link in links]

比较常用的用法是在 map 中使用 lambda，对一个 list 中的每个元素进行操作，可以大幅缩减代码量。

6. try except 函数

      try:
          soup = BeautifulSoup(wb_data.text, 'lxml')

          prices = soup.select('.f22.fc-orange.f-type')
          pub_dates = soup.select('.pr-5')
          areas = soup.select('ul.det-infor > li:nth-of-type(3) > a')
          cates = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')

          data = {
              'title': soup.title.text.strip(),
              # A missing price falls back to 0, a missing date to "".
              'price': prices[0].text.strip() if len(prices) > 0 else 0,
              # Only the part before the first space is kept.
              'pub_date': pub_dates[0].text.strip().split(' ')[0] if len(pub_dates) > 0 else "",
              'area': list(map(lambda x: x.text.strip(), areas)),
              'cate': list(map(lambda x: x.text.strip(), cates)),
              'url': url
          }

          print(data)
          item_info.insert_one(data)

      # The except clauses sit at the same indentation as `try`.
      except AttributeError:
          print('shits happened!')

      except UnicodeEncodeError:
          print('shits happened again!')

def try_to_make(a_mess):
    """Print ``1/a_mess``, or ``'ok~'`` when the division cannot be performed.

    Args:
        a_mess: Value that 1 is divided by.

    Returns:
        None. The result is only printed.
    """
    try:
        print(1/a_mess)
    except (ZeroDivisionError, TypeError):  # several exception types can share one handler
        print('ok~')


# '0' is a string, so the division raises TypeError and 'ok~' is printed.
try_to_make('0')

因为在爬取大量数据时，可能会碰到各种问题，而检查、修改起来其实并不容易。能看懂并解决问题所在最好；但如果出现一些匪夷所思、或者不影响抓取信息的错误，可以采取类似 except UnicodeEncodeError 的手法来规避错误。

7. 标点符号 punctuation

from string import punctuation

if i['area']:
    # Keep the entries that do not occur in string.punctuation. The loop
    # variable has its own name so it does not shadow the record `i`.
    area = [part for part in i['area'] if part not in punctuation]

最后编辑于：2017.12.04 01:53:17

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 215,133评论 6赞 497
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 91,682评论 3赞 390
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 160,784评论 0赞 350
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 57,508评论 1赞 288
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 66,603评论 6赞 386
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 50,607评论 1赞 293
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 39,604评论 3赞 415
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 38,359评论 0赞 270
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 44,805评论 1赞 307
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 37,121评论 2赞 330
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 39,280评论 1赞 344
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 34,959评论 5赞 339
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 40,588评论 3赞 322
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 31,206评论 0赞 21
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,442评论 1赞 268
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 47,193评论 2赞 367
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 44,144评论 2赞 352

2week-homework

总结

代码

总结

1. 断点续传

2. Set 函数

3. Pool进程池的使用

具体用法如下：

4. Headers

5. lambda 匿名函数

6. try except 函数

7. 标点符号 punctuation

推荐阅读更多精彩内容