第一周作业

from bs4 import BeautifulSoup
import lxml
import time
import requests

# URL prefix of the listing pages; the page number is appended to it to
# build the address of each page to crawl.
base_url = 'http://bj.58.com/pbdn/0/pn'

def get_website(url):
    """Crawl one listing page and scrape every item it links to.

    Downloads ``url``, collects the item links found in the listing table
    and finally calls ``get_details`` on each of them.

    Args:
        url: Address of the listing page to crawl.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    # Bounded wait so an unresponsive server cannot hang the crawl forever.
    response = requests.get(url, timeout=10)
    time.sleep(2)  # wait two seconds before going on to the item pages
    soup = BeautifulSoup(response.text, 'lxml')

    # Apart from promoted items, everything left on the page is a Zhuanzhuan
    # item; none of the "normal" items the assignment asks for were found,
    # so the Zhuanzhuan rows (tr.zzinfo) are scraped here. An earlier
    # version of this selector targeted tr.jztr rows instead.
    wb_sites = soup.select(
        '#infolist > div.infocon > table > tbody > tr.zzinfo > td.img > a'
    )

    for wb_site in wb_sites:
        href = wb_site.get('href')
        if not href:
            # An anchor without an href gives us nothing to fetch.
            continue
        get_details(href)

def get_details(href):
    """Fetch one item page and print the details found on it.

    Each printed record is a dict with the keys ``cate``, ``item``,
    ``price``, ``area`` and ``view``. The five selector results are walked
    in lockstep with ``zip``, so nothing is printed when any one of the
    selectors matches no element.

    Args:
        href: Address of the item page.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    time.sleep(2)  # wait two seconds before each item request
    # Bounded wait so an unresponsive server cannot hang the crawl forever.
    response = requests.get(href, timeout=10)

    soup = BeautifulSoup(response.text, 'lxml')

    # Shared prefix of the four selectors below; concatenation yields
    # exactly the same selector strings as spelling each one out in full.
    info = ('body > div.content > div > div.box_left'
            ' > div.info_lubotu.clearfix')

    cates = soup.select('#nav > div.breadCrumb.f12')
    items = soup.select(info + ' > div.box_left_top > h1')
    prices = soup.select(
        info + ' > div.info_massege.left > div.price_li > span > i'
    )
    # NOTE: 'palce_li' is kept exactly as in the original selector.
    areas = soup.select(
        info + ' > div.info_massege.left > div.palce_li > span > i'
    )
    views = soup.select(info + ' > div.box_left_top > p > span.look_time')

    for cate, item, price, area, view in zip(cates, items, prices, areas, views):
        data = {
            'cate': list(cate.stripped_strings),
            'item': item.get_text(),
            'price': price.get_text(),
            'area': area.get_text(),
            'view': view.get_text(),
        }
        print(data)

# Crawl the first five listing pages. The upper bound of range() is
# exclusive, so 6 is needed to include page 5.
for page_number in range(1, 6):
    url = base_url + str(page_number)
    get_website(url)

第一周作业

推荐阅读更多精彩内容