主要工作
准备两个爬虫程序channel_extract.py,page_parsing.py
- channel_extract.py的功能是爬取所有二级分类的URL
- page_parsing.py的功能是爬取某一分类列表页中所有商品的详情页链接,并抓取商品详情页中的主要信息入库
channel_extract.py代码
from bs4 import BeautifulSoup
import requests
# Page that get_channel_urls() is run against to discover the channel links.
start_url = 'http://bj.58.com/sale.shtml'
# Prefix prepended to each href scraped from the page to build a full URL.
url_host = 'http://bj.58.com'
def get_channel_urls(url):
    """Print the full URL of every channel link found on the page at *url*.

    The page is fetched, parsed with lxml, and each anchor matched by the
    sub-menu selector has its href appended to ``url_host`` and printed.
    Nothing is returned.
    """
    response = requests.get(url)
    document = BeautifulSoup(response.text, 'lxml')
    # Anchors are handled one at a time so output appears in page order.
    for anchor in document.select('ul.ym-submnu > li > b > a'):
        print(url_host + anchor.get('href'))
# Run the extraction against the start page and print the resulting URLs.
get_channel_urls(start_url)
page_parsing.py 代码
from bs4 import BeautifulSoup
import requests
import time
import pymongo
# Connect to the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient('localhost',27017)
# Database used by this script.
test_58 = client['test_58']
# Collection that get_links_from() writes item URLs into.
url_list = test_58['url_list3']
# Collection that get_item_info() writes parsed item records into.
item_info = test_58['item_info']
#spider 1
def get_links_from(channel, pages, who_sells=0):
    """Store the item links found on one listing page, skipping known ones.

    The listing URL is built as ``<channel><who_sells>/pn<pages>``, so
    *channel* is used as a plain prefix and should end with ``/``.

    Args:
        channel: Base URL of the channel, e.g. ``'http://bj.58.com/shuma/'``.
        pages: Page number placed after ``pn`` in the listing URL.
        who_sells: Path segment placed before the page number (default 0).
            NOTE(review): presumably selects the seller type - confirm.

    Each link has its query string stripped. A link already present in
    ``url_list`` is not inserted again, so an interrupted crawl can be
    restarted without creating duplicate rows. Nothing is returned.
    """
    list_view = '{}{}/pn{}'.format(channel, who_sells, pages)
    web_data = requests.get(list_view)
    time.sleep(1)  # pause between consecutive requests
    soup = BeautifulSoup(web_data.text, 'lxml')

    # No <td class="t"> cell means the page carries no item rows.
    if not soup.find('td', 't'):
        return

    for link in soup.select('td.t a.t'):
        href = link.get('href')
        if not href:
            # Anchor without an href: nothing to store.
            continue
        item_link = href.split('?')[0]
        if url_list.find_one({'url': item_link}):
            print('url exists')
            continue
        url_list.insert_one({'url': item_link})
        print(item_link)
def get_item_info(url):
    """Parse one item detail page and store its main fields in ``item_info``.

    Args:
        url: URL of the item detail page.

    Pages containing a ``span.soldout_btn`` element are skipped (the item
    is presumably no longer for sale). Otherwise a record with ``title``,
    ``price``, ``viewed`` and ``area`` is inserted and printed. Any field
    whose element is missing from the page is stored as ``None`` instead
    of raising. Nothing is returned.
    """
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')

    if soup.find('span', 'soldout_btn'):
        return

    title = soup.title.text.strip() if soup.title else None

    price_tags = soup.select('span.price_now i')
    price = price_tags[0].text if price_tags else None

    viewed_tags = soup.select('span.look_time')
    # Keep only the part before the '次' suffix of the view counter text.
    viewed = viewed_tags[0].text.split('次')[0] if viewed_tags else None

    # Test the same selector that is indexed, so a div.palce_li without the
    # nested span > i yields None rather than an IndexError.
    area_tags = soup.select('div.palce_li > span > i')
    area = list(area_tags[0].stripped_strings) if area_tags else None

    record = {'title': title, 'price': price, 'viewed': viewed, 'area': area}
    # Insert a copy: insert_one() adds an '_id' key to the dict it is given,
    # and the printed record should contain only the scraped fields.
    item_info.insert_one(dict(record))
    print(record)
# Example run: parse and store a single item detail page.
get_item_info('http://zhuanzhuan.58.com/detail/756768837813551105z.shtml')
# Example run: collect the item links from page 2 of the 'shuma' channel.
get_links_from('http://bj.58.com/shuma/',2)
# Sample detail-page URL: http://zhuanzhuan.58.com/detail/756768837813551105z.shtml
补充作业
程序中断后重新运行时,为确保数据不重复,需要在get_links_from()中增加一个判断:
判断抓取的url是否已经在数据库中,如果已存在,就跳过,不再重复插入。
涉及的代码如下:
# Loop-body fragment: strip the query string, then insert the URL only when
# it is not already stored in url_list.
item_link = link.get('href').split('?')[0]
if not url_list.find_one({'url': item_link}):
    url_list.insert_one({'url': item_link})
    print(item_link)
else:
    print('url exists')