I wrote two scrapers.
Essentially they work like a single scraper split into two stages.
Scraper 1: collect the links to every phone-number listing and store them in the database.
There are 116 listing pages in total.
import pymongo
import requests
from bs4 import BeautifulSoup

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
phone_num_link = walden['phone_num_link']

def get_info_frompage(startpage, endpage):
    infos = []
    for i in range(startpage, endpage + 1):
        # listing pages look like http://bj.58.com/shoujihao/pn1/
        oriurl = "http://bj.58.com/shoujihao/pn" + str(i) + "/"
        wb_data_ori = requests.get(oriurl)
        soup_ori = BeautifulSoup(wb_data_ori.text, 'lxml')
        titles = soup_ori.select("strong.number")
        links = soup_ori.select("#infolist > div > ul > div.boxlist > ul > li > a.t")
        j = 0
        for title, link in zip(titles, links):
            # find() returns -1 when "jump.zhineng" is absent, i.e. this is
            # a real listing rather than a promoted redirect entry
            if link.get('href').find("jump.zhineng") < 0:
                data = {
                    "title": title.text,
                    "link": link.get('href'),
                }
                phone_num_link.insert_one(data)
                infos.append(data)
                j += 1
        print(j)  # number of real listings kept on this page
    return infos

get_info_frompage(1, 116)
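As a quick sanity check after this first pass, the stored links can be counted and spot-checked. This is a minimal sketch of my own, not part of the original run; count_documents needs pymongo 3.7 or later:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
phone_num_link = client['walden']['phone_num_link']

# total number of link documents collected by scraper 1
print(phone_num_link.count_documents({}))

# spot-check a few of the stored documents
for doc in phone_num_link.find().limit(3):
    print(doc['title'], doc['link'])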
Scraper 2: read the links back out of the database and scrape each phone number's detail page one by one, storing the results in the database.
I added a 404-page check, although I never actually seem to have hit a 404 (a more defensive variant is sketched after the script below).
3,480 records in total.
import pymongo
import requests
from bs4 import BeautifulSoup

client = pymongo.MongoClient('localhost', 27017)
walden = client['walden']
phone_num_link = walden['phone_num_link']
num_info = walden['num_info']

def get_one_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # 58.com serves its 404 page with the normal status code, but that page
    # pulls in topbar404.js from its CDN; if the first <head> script points
    # there, treat the page as a 404 (find() >= 0 means the marker was found)
    src = soup.select("head > script")[0].get('src') or ''
    if src.find("//j1.58cdn.com.cn/js/404/topbar404.js") >= 0:
        return None
    title = soup.select("#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1")[0].text
    price = soup.select("#main > div.col.detailPrimary.mb15 > div.col_sub.sumary > ul > li:nth-of-type(1) > div.su_con > span")[0].text
    host = soup.select("#divContacter1 > ul > ul > li > a")[0].text
    # number_address = soup.select("span.c_999.f12")[0].text if soup.find_all('span', 'c_999') else None
    data = {
        'title': title.strip()[0:11],  # keep the leading 11 characters (the phone number)
        'price': price.strip(),
        'host': host.strip(),
        # the three replaces strip spaces, tabs and newlines; the slices then
        # drop the last 10 characters and the first 7
        # 'number_address': number_address.replace(" ", "").replace("\t", "").replace("\n", "")[:-10][7:],
    }
    # print(data)
    return data

# get_one_info("http://bj.58.com/shoujihao/26120773122616x.shtml")

for item in phone_num_link.find():
    info = get_one_info(item['link'])
    # print(info)
    if info is not None:  # skip 404 pages instead of inserting a bare 0
        num_info.insert_one(info)
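Since 58.com may serve that soft 404 with a normal HTTP 200, a belt-and-braces variant could also check the status code. A hedged sketch; the is_404 helper is hypothetical and not part of the scripts above:

import requests
from bs4 import BeautifulSoup

def is_404(url):
    # hypothetical helper: combines a hard HTTP 404 check with the soft-404
    # marker script that 58.com injects into its error page
    resp = requests.get(url)
    if resp.status_code == 404:
        return True
    soup = BeautifulSoup(resp.text, 'lxml')
    first_script = soup.select_one('head > script')
    src = first_script.get('src') if first_script is not None else ''
    return '404/topbar404.js' in (src or '')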
The results are as follows.
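The stored records can be pulled straight back out of MongoDB for inspection, a minimal sketch reusing the num_info collection defined above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
num_info = client['walden']['num_info']

# should come out at 3480 if every detail page parsed cleanly
print(num_info.count_documents({}))

for doc in num_info.find().limit(5):
    print(doc['title'], doc['price'], doc['host'])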