实战计划第五天,抓了58同城。
最终成果是这样的:
我的代码:
#!/usr/bin/env python #告诉计算机执行程序在系统环境变量中的名字,详细位置在环境变量中设置好了
#-*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import time
import requests
def get_info(link):
    """Scrape one 58.com listing detail page and print its fields.

    Args:
        link: URL of the listing detail page.

    For every record matched on the page, a dict with the keys ``type``,
    ``title``, ``data``, ``price``, ``conditions``, ``area`` and ``view``
    is printed. Nothing is returned.
    """
    wb_detail = requests.get(link)
    soup = BeautifulSoup(wb_detail.text, 'lxml')

    # Using nth-of-type(3) instead of nth-child(3) makes the selectors match
    # only this page's own information. The text scraped for conditions and
    # areas contains stray whitespace, hence stripped_strings further down.
    summary_item = ('#content > div.person_add_top.no_ident_top > div.per_ad_left'
                    ' > div.col_sub.sumary > ul > li:nth-of-type({}) > div.su_con > span')

    types = soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
    titles = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')
    dates = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
    prices = soup.select(summary_item.format(1))
    conditions = soup.select(summary_item.format(2))
    areas = soup.select(summary_item.format(3))

    # The area is optional. It is deliberately kept out of the zip() below:
    # zipping an empty `areas` list would yield no records at all and drop
    # the whole listing instead of reporting its area as None.
    has_area = bool(areas) and bool(soup.find_all('span', 'c_25d'))

    for category, title, date, price, condition in zip(types, titles, dates, prices, conditions):
        data = {
            'type': category.get_text(),
            'title': title.get_text(),
            # NOTE(review): this key holds the posting date; 'data' looks like
            # a typo for 'date' but is kept so the printed output is unchanged.
            'data': date.get_text(),
            'price': price.get_text(),
            # list() materialises the stripped_strings generator.
            'conditions': list(condition.stripped_strings),
            'area': list(areas[0].stripped_strings) if has_area else None,
            'view': get_view(link),
        }
        print(data)
def get_view(url):  # fetch the view count
    """Return the view count of a listing as reported by the counter API.

    Args:
        url: URL of the listing detail page; its last path segment is
            expected to look like ``<infoid>x.shtml``.

    Returns:
        The text after the last ``=`` in the counter API response, as a
        string.
    """
    page_name = url.split('?')[0].split('/')[-1]
    suffix = 'x.shtml'
    # Remove the literal suffix. str.strip('x.shtml') would instead strip any
    # of the characters x . s h t m l from BOTH ends of the segment, which is
    # only correct by accident while the id consists purely of digits.
    infoid = page_name[:-len(suffix)] if page_name.endswith(suffix) else page_name

    api = 'http://jst1.58.com/counter?infoid={}'.format(infoid)
    # Header information has to be sent with this request.
    # The '{}' inside the Cookie value (final_history={}) is filled with the
    # listing id by .format().
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        'Cookie': r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(infoid),
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'jst1.58.com',
        'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(infoid),
    }
    js = requests.get(api, headers=headers)
    # The count is whatever follows the last '=' in the response body.
    view = js.text.split('=')[-1]
    return view
def get_links_info(page):
    """Walk the listing index pages and scrape every listing they link to.

    Args:
        page: Exclusive upper bound of the index page numbers; index pages
            1 through ``page - 1`` are fetched.

    Each index page's matched anchors are printed, then every anchor whose
    href starts with ``http://bj.58.com/`` is handed to ``get_info``.
    """
    for number in range(1, page):
        url = 'http://bj.58.com/pbdn/1/pn{}'.format(number)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Space-separated (descendant) selectors: no '>' child combinators
        # are needed for the anchors to be matched.
        schemes = soup.select('#infolist tr td.t a')
        print(schemes)
        time.sleep(2)  # pause after each index page request
        for scheme in schemes:
            link = scheme.get('href')
            # Keep only valid listing links. An anchor without an href gives
            # None, which must be skipped rather than sliced (TypeError).
            if link and link.startswith('http://bj.58.com/'):
                get_info(link)
# Start the crawl; 20 is passed as the `page` bound of the index page range.
get_links_info(page=20)
总结和问题
- list()用法
- CSS 选择器路径可以不写 `>`:用空格分隔的后代选择器(如 `#infolist tr td.t a`)同样能匹配到目标元素
- 字典写文件语句
- open路径前面加r
- 确定浏览量代码