GetItemHref.py
from bs4 import BeautifulSoup
import requests


def get_Item_Href(url):
    """Return detail-page links for the normal (non-promoted) listings on a 58.com list page."""
    html = requests.get(url)
    bs_html = BeautifulSoup(html.text, 'lxml')
    N_href = []
    hrefs = bs_html.select('td > a[class="t"]')
    for i in hrefs:
        # Skip promoted entries, which are marked with data-addtype="level2"
        if i.get('data-addtype') != 'level2':
            N_href.append(i.get('href'))
    return N_href


if __name__ == '__main__':
    # Quick manual test; guarded so it does not run when week1_final.py imports this module
    print(get_Item_Href('http://bj.58.com/pbdn/0/'))
GetItemInfo.py
import time

import requests
from bs4 import BeautifulSoup


def get_Item_Info(url):
    """Scrape one 58.com detail page and return its fields as a dict, or None for invalid pages."""
    html = requests.get(url)
    bs_html = BeautifulSoup(html.text, 'lxml')
    # The breadcrumb is missing on redirected or expired pages; use it as a validity check
    breadcrumb = bs_html.select('#header > div.breadCrumb.f12 > span > a')
    if not breadcrumb:
        return None
    Item = breadcrumb[-1]
    Title = bs_html.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')
    Time = bs_html.select('#index_show > ul.mtit_con_left.fl > li.time')
    # Price, quality and address share one selector; query it once and index the result
    summary = bs_html.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span')
    Price, Quality, Add = summary[0], summary[1], summary[2]
    data = {
        'Item': Item.get_text(),
        'Title': Title[0].get_text(),
        'Time': Time[0].get_text(),
        'Price': Price.get_text(),
        'Quality': Quality.get_text().strip(),
        # stripped_strings is a generator; join it so the value is a plain string
        'Add': ' '.join(Add.stripped_strings),
    }
    time.sleep(1)  # pause between requests to avoid hammering the server
    return data
week1_final.py
from GetItemHref import get_Item_Href
from GetItemInfo import get_Item_Info


def main():
    url = 'http://bj.58.com/pbdn/0/'
    urls = get_Item_Href(url)
    item_data = []
    for url in urls:
        info = get_Item_Info(url)
        # get_Item_Info returns None for pages that no longer exist; skip those
        if info is not None:
            item_data.append(info)
    print(item_data)


if __name__ == '__main__':
    main()