week2-1作业
学习Python的第二周,5.24号完成练习week2-1:爬取58同城手机号
Level1代码
from bs4 import BeautifulSoup
import requests
import time
import pymongo
# MongoDB connection for Level 1: database '58_message',
# collection 'phones_message' stores each listing's title and link.
client = pymongo.MongoClient('localhost',27017)
walden = client['58_message']
phones_message = walden['phones_message']
def get_phones_links(url):
    """Scrape one 58.com phone-number listing page and store every
    listing's title and detail-page link in the 'phones_message'
    MongoDB collection.

    Args:
        url: listing-page URL, e.g. 'http://bj.58.com/shoujihao/pn1/'.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
        'Cookie':'ipcity=sh%7C%u4E0A%u6D77; userid360_xml=BE5F745CB21889493F55E2AB183C6316; time_create=1466562303322; f=n; f=n; ipcity=sh%7C%u4E0A%u6D77; userid360_xml=BE5F745CB21889493F55E2AB183C6316; time_create=1466562500982; bj58_id58s="ZGFzSVk5dkFyUDZQODU4OA=="; id58=c5/njVc0K1eW7SWOA1IbAg==; als=0; myfeet_tooltip=end; bdshare_firstime=1463552754347; 58home=sh; final_history=25359014999104%2C25390255065933%2C26044978248654%2C26044268155977%2C26013739776688; sessionid=83130bc4-3b8b-4861-b967-70d61d824c8e; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=5; 58tj_uuid=0f55bf86-b266-4866-aea3-3fed2ffe6ccf; new_session=0; new_uv=4; utm_source=; spm=; init_refer='
    }
    print(url)
    # BUG FIX: headers were built but never sent — pass them so the
    # User-Agent and Cookie actually reach the server (anti-scraping).
    wb_date = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_date.text, 'lxml')
    titles = soup.select('strong.number')  # phone-number titles
    links = soup.select('a.t')             # detail-page links
    # An empty/last page yields no titles; zip over empty lists is a no-op,
    # so the old "if len(titles)>0 ... else: pass" guard is unnecessary.
    for title, link in zip(titles, links):
        data = {
            '标题': title.get_text(),  # title
            '连接': link.get('href')   # link
        }
        phones_message.insert_one(data)
        print(data)
# Level-1 driver: crawl listing pages pn0 through pn199.
full_url = ['http://bj.58.com/shoujihao/pn{}/'.format(page) for page in range(200)]
for page_link in full_url:
    time.sleep(3)  # pause 3 s between requests so the site does not block us
    get_phones_links(page_link)
Level2代码
# Level 2 writes detail-page results into a second collection.
# NOTE(review): this re-creates the client/db handles already built in
# Level 1 — redundant but harmless.
client = pymongo.MongoClient('localhost',27017)
walden = client['58_message']
# Loop over the database to get all stored links.
phones_message_look = walden['phones_message_look']
def get_Db_Message():
    """Walk every stored listing record and scrape its detail page.

    Records whose link does not point at 58.com's shoujihao section
    (promoted third-party entries) are skipped.
    """
    for record in phones_message.find():
        link = record['连接']
        # Skip promoted entries from other sites.
        if 'http://bj.58.com/shoujihao/' not in link:
            continue
        print(link)
        get_phone_message(link)
        time.sleep(2)  # throttle between detail-page requests
#获取详细页面内容
def get_phone_message(url):
    """Scrape one phone-number detail page and store its title, price,
    seller name, contact phone and area in 'phones_message_look'.

    Args:
        url: detail-page URL taken from the Level-1 collection.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    titles = soup.select('#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1')  # title
    prices = soup.select('#main > div.col.detailPrimary.mb15 > div.col_sub.sumary > ul > li:nth-of-type(1) > div.su_con > span')  # price
    sellers = soup.select('#divContacter1 > ul > ul > li > a')  # seller name
    tels = soup.select('#t_phone')  # contact phone
    areas = soup.select('div.su_con > a')  # area
    # Area: two matches are joined with '-', one match is taken as-is,
    # anything else (0 or >2 matches) is recorded as the string 'None'.
    if len(areas) == 2:
        area = '-'.join(part.get_text() for part in areas)
    elif len(areas) == 1:
        area = areas[0].get_text()
    else:
        area = 'None'
    for title, price, seller, tel in zip(titles, prices, sellers, tels):
        # Strip layout noise (tabs, newlines, spaces) out of the title.
        clean_title = title.get_text().replace('\t', '').replace('\n', '').replace(' ', '').strip()
        data = {
            '标题': clean_title,
            '价格': price.get_text().strip(),
            '卖家姓名': seller.get_text().strip(),
            '联系电话': tel.get_text().strip(),
            '区域': area
        }
        print(data)
        phones_message_look.insert_one(data)
# Level-2 entry point: runs immediately when the script executes.
get_Db_Message()
MongoDb数据库
总结:
-.通过本次学习练习,从58同城爬取手机号链接信息以及卖家信息3000余条
-.难点:在本次爬取卖家信息时,QQ信息为图片,根据观察应该与一个32位的加密字符串有关,但由于技术有限,没能爬取到QQ信息