总结:几乎花了一个晚上才搞定,速度很慢,但有收获,小猪的数据终于抓到了。另外,这次终于是在 Ubuntu 下搞定的。
成果
我的代码如下
#!/usr/bin/python3
#-*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
def get_lorder_sex(nannv):
    """Translate the class attribute of the sex icon into a text label.

    Args:
        nannv: The value of the icon element's ``class`` attribute, as
            obtained from ``tag.get("class")``. This may be a list of class
            names, a whitespace-separated string, or ``None`` when the
            attribute is absent.

    Returns:
        ``'nv'`` when the class ``member_ico1`` is present, otherwise
        ``'yemen'``.
        NOTE(review): presumably ``member_ico1`` is the female icon and the
        labels are pinyin for female / male -- confirm against the site.
    """
    if isinstance(nannv, str):
        # A parser configured without multi-valued attributes hands back the
        # raw attribute string; split it so matching is per class name.
        nannv = nannv.split()
    # Membership instead of equality with ['member_ico1'], so an element that
    # carries additional classes is still recognised.
    if nannv and 'member_ico1' in nannv:
        return 'nv'
    return 'yemen'
def get_links(url):
    """Download a page and run ``get_detail`` on every listing link in it.

    Every ``a.resule_img_a`` anchor found in the page is followed: its
    ``href`` is handed to ``get_detail``.

    Args:
        url: Address of the page to download.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # Without a timeout requests waits indefinitely on a stalled connection,
    # which would hang the whole crawl.
    web_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')
    for link in soup.select('a.resule_img_a'):
        href = link.get("href")
        if not href:
            # An anchor without href has nothing to fetch; skip it instead of
            # passing None on to requests.
            continue
        get_detail(href)
def get_detail(url):
    """Download one detail page and print the fields scraped from it.

    The page is parsed with the ``lxml`` parser and seven CSS selectors pull
    out the title, address, price, main image, avatar, name and sex icon.
    The raw title matches are printed first, then one dict per set of
    matches.

    Args:
        url: Address of the detail page to download.

    Returns:
        None. Results are written to standard output.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    web_data2 = requests.get(url, timeout=10)
    soup = BeautifulSoup(web_data2.text, 'lxml')

    # Selectors for the listing itself.
    titles = soup.select("body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em")
    addrs = soup.select("body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span")
    prices = soup.select("#pricePart > div.day_l > span")
    images = soup.select("#curBigImage")

    # Selectors inside the #floatRightBox panel: avatar, name and sex icon.
    avartars = soup.select("#floatRightBox > div.js_box.clearfix > div.member_pic > a > img")
    names = soup.select("#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a")
    sexs = soup.select("#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span")

    print(titles)

    # zip stops at the shortest list, so a page where any selector matches
    # nothing produces no record at all.
    for title, addr, price, image, avartar, name, sex in zip(
            titles, addrs, prices, images, avartars, names, sexs):
        data = {
            "title": title.get_text(),
            "addr": addr.get_text(),
            "price": price.get_text(),
            "image": image.get("src"),
            "avartar": avartar.get("src"),
            "name": name.get_text(),
            # The icon's class attribute is mapped to a text label.
            "sex": get_lorder_sex(sex.get("class")),
        }
        print(data)
# Result pages p1 through p6 (range(1, 7) stops before 7); each one is
# handed to get_links in turn.
urls = ["http://yantai.xiaozhu.com/penglai-duanzufang-p{}-8/".format(i) for i in range(1, 7)]
for one_url in urls:
    get_links(one_url)
# The triple-quoted string below is a bare expression statement: its contents
# are never executed. It preserves the first single-page experiment that
# printed the href of every a.resule_img_a anchor.
'''
#the first test coding
url='http://yantai.xiaozhu.com/penglai-duanzufang-8/'
web_data= requests.get(url)
soup=BeautifulSoup(web_data.text,'lxml')
titles = soup.select('a.resule_img_a')
#print(titles)
for title in titles:
link=title.get("href")
print(link)
'''