# coder: samko  date: 5.20 10:05
# 一个详情页爬取 (scraping a single detail page)
# Fetch one listing detail page and collect the tags that the per-listing
# loop further down reads.
import requests
from bs4 import BeautifulSoup

# Gender labels, indexed as [female, male].
c = ['female', 'male']
url = 'http://bj.xiaozhu.com/fangzi/1779571235.html'
# A timeout keeps the script from hanging forever on an unresponsive server.
page = requests.get(url, timeout=10)
soup = BeautifulSoup(page.text, 'lxml')

# Every select() call returns a list of matching tags (empty when nothing
# matches). The selectors are tied to the markup of this particular site.
title = soup.select('h4 > em')
address = soup.select('span.pr5')
img = soup.select('img[id="curBigImage"]')
dailyrent = soup.select('div.day_l > span')
landlordimg = soup.select('div.member_pic > a > img')
landlordname = soup.select('h6 > a[class="lorder_name"]')
landlordgender = soup.select('div.w_240 > h6 > span')

# Debug output: show what the main-image selector matched.
print(img)
def gender(tag):
    """Return the gender label from ``c`` for a landlord gender icon tag.

    Args:
        tag: The tag selected for the landlord gender icon.

    Returns:
        ``c[0]`` when the tag carries the ``member_girl_ico`` class,
        otherwise ``c[1]``.
    """
    # BeautifulSoup exposes ``class`` as a list of class names. Writing
    # ``'member_girl_ico' in tag`` would search the tag's children instead,
    # so it could never match the class and every landlord came out as c[1].
    if 'member_girl_ico' in tag.get('class', []):
        return c[0]
    return c[1]


# Walk the selected tag lists in lockstep; zip() stops at the shortest list,
# so nothing is printed if any selector matched no tags.
for (title_tag, address_tag, img_tag, rent_tag,
     gender_tag, lord_img_tag, lord_name_tag) in zip(
        title, address, img, dailyrent,
        landlordgender, landlordimg, landlordname):
    data = {
        'title': title_tag.get_text(),
        'address': address_tag.get_text(),
        'img': img_tag.get('src'),
        'rent': rent_tag.get_text() + '元',
        'lordimg': lord_img_tag.get('src'),
        'lordname': lord_name_tag.get_text(),
        'gender': gender(gender_tag),
    }
    print(data)


# Scraping many detail pages: how to collect the links in bulk.
import re
import urllib.request

import requests
from bs4 import BeautifulSoup

# Detail-page URLs gathered by get_page().
links = []
# url = 'http://bj.xiaozhu.com'


def get_page(PageNumbers):
    """Collect detail-page links from the search result pages into ``links``.

    Requests result pages 2 through ``PageNumbers - 1`` (``range`` excludes
    its upper bound) and appends the ``href`` of every ``a.resule_img_a``
    anchor to the module-level ``links`` list. Nothing is returned.

    Args:
        PageNumbers: Exclusive upper bound of the page numbers to fetch.
    """
    # NOTE(review): page 1 is skipped and the bound is exclusive, so
    # get_page(3) fetches only page 2 - confirm this is intended.
    # The original author notes that each result page carries 24 links.
    for page in range(2, PageNumbers):
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page)
        # The listing page already contains every detail link, so there is
        # no need to open each detail page just to discover its URL.
        # A timeout keeps one stalled request from blocking the whole run.
        wb_data = requests.get(full_url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # The anchors with class "resule_img_a" hold the detail links.
        for link in soup.select('a.resule_img_a'):
            links.append(link['href'])


# The collected links are the input for any per-detail-page analysis.
if __name__ == '__main__':
    get_page(3)
    print(links)

# Alternative approach (sketch only, never executed): fetch a page with
# urllib.request and pull every detail-page URL out of it with a regex; the
# remaining steps are the same as for the single detail page.
#
# def get_pages():
#     r = r'^http://bj.xiaozhu.com/fangzi/\d{9,10}\.html$'
#     lalala = re.compile(r)
#     lalala.findall(page)  # page would be fetched with urllib.request
# 简书的markdown真的不好用，不如jupyter