今天是4.13号。
昨天把会议论文算是完成任务地写完然后提交了,而实习还没有找到,所以最近一段时间应该都会整天在实验室学习python吧,加上最近一个多星期全部都是大雨哪也去不了(说的好像不下雨就会出去转悠一样)。本来还想问一下宋教授现在有什么项目可以跟过去做,但又怕把python的学习落下,所以还是最近半个月先把这个课程全部学完吧。另外电脑运行pycharm真心带不动,所以也在等家里的那台笔记本寄过来,同时不得不提的是也在等投稿的论文消息,wish there is a good result。
照样在贴上代码之前,总结在实际中新学的知识与所遇到的问题。
(1).快捷键ctrl+/可以多行注释,全部选定后tab可以多行缩进,shift+tab则可以向左缩进。
(2).注意select('')和split('')得到的结果都是列表,所以都要在后面加下标[number]。
(3).X.stripped_strings 用于取出标签X内的所有文本,并去掉每段文本首尾的空白、跳过空行。它返回的是生成器,所以要用list()把它括起来才能得到列表。
(4).对于多种分类情况时,最好用if语句来进行判断。判断某特定字符串s1是否包含在另一字符串s2中,可用if s1 in s2(s1、s2是字符串变量;直接写字面量时形如 if '区域' in text)。
(5).要关注抓取的数据是网页自带的,还是通过request返回的json数据,一般json都是字典数据。对于浏览量等JS数据,首先在审查元素的network-JS中找到相关网页,然后进行解析。
解析过程包括:将查询网页的id导出,然后用format()直接替换到相应的JS动态网页构造成新的网页;接着跟一般网页解析一样用requests.get()去请求;最后由于JS网页的回应内容都是字符串,所以直接用js.text然后再用相应的split或其他方法截取自己想要的内容。
还有一个问题要注意:请求JS数据时,记得加上headers,其中要包括 'Referer' 和 'User-Agent'。
第一段
__author__ = 'guohuaiqi'
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
from bs4 import BeautifulSoup
import requests
import string
# Page whose side menu is scraped for category links by get_cate_link.
url='http://bj.58.com/sale.shtml'
# Prefix prepended to each href taken from that side menu.
host='http://bj.58.com'
def get_cate_link(url):
    """Collect the category links found in the side menu of the page at *url*.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    ``href`` of every anchor matched by the side-menu selector is appended to
    the module-level ``host`` to form the link.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of category links in document order. Anchors that carry no
        ``href`` attribute are skipped.
    """
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    cate_links = []
    for anchor in soup.select('#ymenu-side > ul > li > ul > li > b > a'):
        href = anchor.get('href')
        if href is None:
            # Tag.get returns None for a missing attribute; concatenating it
            # to ``host`` would raise TypeError.
            continue
        cate_links.append(host + href)
    # The links used to be built and then dropped; return them to the caller.
    return cate_links
# get_cate_link(url)
# Category URLs, one per line inside a single string; consumers obtain the
# individual URLs with cate_list.split().
# NOTE(review): presumably pasted from the printed output of get_cate_link —
# confirm before regenerating.
cate_list="""
http://bj.58.com/shouji/
http://bj.58.com/tongxunyw/
http://bj.58.com/danche/
http://bj.58.com/fzixingche/
http://bj.58.com/diandongche/
http://bj.58.com/sanlunche/
http://bj.58.com/peijianzhuangbei/
http://bj.58.com/diannao/
http://bj.58.com/bijiben/
http://bj.58.com/pbdn/
http://bj.58.com/diannaopeijian/
http://bj.58.com/zhoubianshebei/
http://bj.58.com/shuma/
http://bj.58.com/shumaxiangji/
http://bj.58.com/mpsanmpsi/
http://bj.58.com/youxiji/
http://bj.58.com/jiadian/
http://bj.58.com/dianshiji/
http://bj.58.com/ershoukongtiao/
http://bj.58.com/xiyiji/
http://bj.58.com/bingxiang/
http://bj.58.com/binggui/
http://bj.58.com/chuang/
http://bj.58.com/ershoujiaju/
http://bj.58.com/bangongshebei/
http://bj.58.com/diannaohaocai/
http://bj.58.com/bangongjiaju/
http://bj.58.com/ershoushebei/
http://bj.58.com/yingyou/
http://bj.58.com/yingeryongpin/
http://bj.58.com/muyingweiyang/
http://bj.58.com/muyingtongchuang/
http://bj.58.com/yunfuyongpin/
http://bj.58.com/fushi/
http://bj.58.com/nanzhuang/
http://bj.58.com/fsxiemao/
http://bj.58.com/xiangbao/
http://bj.58.com/meirong/
http://bj.58.com/yishu/
http://bj.58.com/shufahuihua/
http://bj.58.com/zhubaoshipin/
http://bj.58.com/yuqi/
http://bj.58.com/tushu/
http://bj.58.com/tushubook/
http://bj.58.com/wenti/
http://bj.58.com/yundongfushi/
http://bj.58.com/jianshenqixie/
http://bj.58.com/huju/
http://bj.58.com/qiulei/
http://bj.58.com/yueqi/
http://bj.58.com/tiaozao/
"""
第二段
__author__ = 'guohuaiqi'
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
from bs4 import BeautifulSoup
import requests
import time
import pymongo
import sys
# Client for the MongoDB server at localhost:27017.
client=pymongo.MongoClient('localhost',27017)
# Database that holds the two collections below.
tongcheng=client['tongcheng']
# Receives one {'url': ...} document per listing link (written by get_content_links).
urllist=tongcheng['urllist']
# Receives the parsed details of each listing (written by get_item_content).
content=tongcheng['content']
def get_content_links(cate_url, page):
    """Record and scrape every bj.58.com listing linked from one category page.

    A bare category URL only addresses its first page, so the page number is
    appended as ``pn<page>/`` (for example http://bj.58.com/danche/pn2/).
    Every listing link on that page has its query string removed. Links that
    contain ``bj.58.com`` are inserted into ``urllist`` and handed to
    ``get_item_content``; all other links are ignored.

    Args:
        cate_url: Category URL taken from ``cate_list``. The page suffix is
            appended directly, so it should end with ``/``.
        page: Page number to fetch.
    """
    page_url = '{}pn{}/'.format(cate_url, page)
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    time.sleep(1)  # pause after each page request

    # Nothing to do when the page has no <td class="t"> cell.
    if not soup.find('td', 't'):
        return

    for anchor in soup.select('td.t a.t'):
        href = anchor.get('href')
        if href is None:
            # Tag.get returns None for a missing attribute; skip the anchor
            # instead of failing on None.split().
            continue
        content_link = href.split('?')[0]
        if 'bj.58.com' not in content_link:
            continue
        urllist.insert_one({'url': content_link})
        get_item_content(content_link)
# cate_url='http://bj.58.com/youxiji/'
# get_content_links(cate_url,20)
# Selector prefix for the n-th row of a listing's summary list.
_SUMMARY_ROW = ('#content > div.person_add_top.no_ident_top > div.per_ad_left'
                ' > div.col_sub.sumary > ul > li:nth-of-type({})')


def _extract_district(soup):
    """Return the district strings of a listing page, or None when absent.

    The district row is looked for in the second and then the third row of
    the summary list; it is the row whose title cell contains '区域'.
    """
    for position in (2, 3):
        row = _SUMMARY_ROW.format(position)
        title_cells = soup.select(row + ' > div.su_tit')
        if not title_cells or '区域' not in title_cells[0].get_text():
            continue
        # Prefer the <span> inside the value cell and fall back to the whole
        # cell. This must be select(): find_all() treats its argument as a tag
        # name, so a CSS selector passed to it never matches.
        value_cells = (soup.select(row + ' > div.su_con > span')
                       or soup.select(row + ' > div.su_con'))
        if not value_cells:
            return None
        return list(value_cells[0].stripped_strings)
    return None


def get_item_content(content_link):
    """Scrape one listing page and store its details in ``content``.

    The stored document has the keys ``url``, ``goods_cate``, ``title``,
    ``date`` (dots replaced by dashes), ``price`` (None when the price text
    contains '面议') and ``district``. The URL is stored so that scraped
    listings can be matched against the entries of ``urllist``.

    A page is skipped when the ``src`` of its first ``text/javascript``
    script has a '404' path segment. A failed connection is reported on
    stdout and the listing is skipped.

    Args:
        content_link: URL of the listing page. Links from other sites are
            filtered out by get_content_links before this is called.

    Raises:
        IndexError: If the category, title, date or price element is missing
            from the page.
    """
    try:
        web_data1 = requests.get(content_link)
    except requests.ConnectionError as e:
        # e.response is None for connection failures, so report the error itself.
        print('request failed for {}: {}'.format(content_link, e))
        return

    soup = BeautifulSoup(web_data1.text, 'lxml')

    # Tolerate a missing script tag or src attribute instead of raising
    # AttributeError on None.
    script = soup.find('script', type='text/javascript')
    src = script.get('src') if script is not None else None
    if src is not None and '404' in src.split('/'):
        return

    price_text = soup.select('span.price.c_f50')[0].text
    data = {
        'url': content_link,
        'goods_cate': soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')[0].text.strip(),
        'title': soup.select('#content h1')[0].text.strip(),
        'date': soup.select('#content li.time')[0].text.replace('.', '-'),
        'price': None if '面议' in price_text else price_text.replace('元', '').strip(),
        'district': _extract_district(soup),
    }
    content.insert_one(data)
#
# b=['http://bj.58.com/shuma/23190415633187x.shtml','http://bj.58.com/yishu/25471342844357x.shtml','http://bj.58.com/shouji/25683386143296x.shtml','http://bj.58.com/shuma/23425779899550x.shtml']
# get_item_content(b)
# get_content_links('http://bj.58.com/shouji/',20)
第三段
# _*_ coding: utf-8 _*_
#!/usr/bin/env python
__author__ = 'guohuaiqi'
from multiprocessing import Pool
from get_cate_link import cate_list
from get_all_contents import get_content_links,urllist,content
# Resume support: after an interruption, rest_list holds the URLs recorded in
# ``urllist`` that have no matching 'url' in ``content``. The original note
# suggests passing rest_list to pool.map() instead of the category links.
# NOTE(review): rest_list contains listing URLs, not category URLs, so it
# would need a listing-level worker rather than get_all_links — verify.
db_urllist = [item['url'] for item in urllist.find()]
# Collection.find() — the original called a non-existent ``fina``. Documents
# stored without a 'url' field are skipped instead of raising KeyError.
content_urllist = [item['url'] for item in content.find() if 'url' in item]
x = set(db_urllist)
y = set(content_urllist)
rest_list = x - y
def get_all_links(cate_url):
    """Run get_content_links for pages 1 through 100 of one category."""
    first_page, last_page = 1, 100
    for page_number in range(first_page, last_page + 1):
        get_content_links(cate_url, page_number)
if __name__ == '__main__':
    # One task per category URL. The context manager shuts the worker
    # processes down once map() has returned, instead of leaving the pool open.
    with Pool() as pool:
        pool.map(get_all_links, cate_list.split())
第四段
最后再加上一个count函数来对数据库中的item计数
__author__ = 'guohuaiqi'
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
import time
from get_all_contents1 import content
# Print the number of documents in ``content`` every three seconds until the
# script is interrupted.
while True:
    # Cursor.count() was removed in PyMongo 4; count_documents({}) counts
    # every document in the collection.
    print(content.count_documents({}))
    time.sleep(3)
再要注意的就是,一定一定在写代码前在最前面加上:
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
在爬取了10745条数据后自己手动停止了程序,一共花了差不多12分钟。