Week 1 Hands-On Assignment

Results

(Screenshot of the scraper's output: 屏幕快照 2017-03-25 下午7.19.13.png)
from bs4 import BeautifulSoup
import requests
import time


def get_links(who_sells):
    # who_sells: 0 for personal sellers, 1 for merchants
    url = 'http://bj.58.com/pbdn/{}/pn2'.format(who_sells)
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    link_tags = soup.select('td.t a.t')
    urls_datas = []
    for link_tag in link_tags:
        href = link_tag.get('href')
        if 'jump' not in href:
            # Zhuanzhuan listing: keep the URL, drop the query string
            urls_datas.append(href.split('?')[0])
        else:
            # promoted 58 listing: recover the info id from the jump URL's
            # entinfo parameter (assumes hrefs like ...entinfo=<id>_0&...)
            # and rebuild the plain detail-page URL
            info_id = href.split('entinfo=')[1].split('&')[0].split('_')[0]
            urls_datas.append('http://bj.58.com/pingbandiannao/' + info_id + 'x.shtml')
    get_infos(urls_datas, who_sells)

def getViews(url):
    # Fetch the view count from 58's counter API. This always returned 0
    # before because `headers` was passed positionally to requests.get,
    # whose second positional parameter is `params`; it must be passed
    # as the keyword argument headers=.
    info_id = url.split('/')[-1].split('x.shtml')[0]
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    headers = {
        'Cookie': r'bj58_id58s="eG44SE0raFpjSmpwMjI4NQ=="; id58=c5/ns1jBVI5v3RDiA5T7Ag==; als=0; myfeet_tooltip=end; bangbigtip2=1; city=bj; ipcity=gltokyo%7C%u4E1C%u4EAC; sessionid=e3b672f8-f7ca-4c60-8eb3-eba3cfb9a905; 58tj_uuid=a1cc2a2a-8536-417e-8fdf-b86563c43986; new_session=0; new_uv=9; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=10'.format(str(info_id)),
        'User-Agent': r'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'jst1.58.com',
        'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id)
    }
    r = requests.get(api, headers=headers)
    return r.text.split('total=')[1]

def get_infos(urls, who_sells=0):
    for url in urls:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        time.sleep(1)  # throttle between page fetches

        if "zhuanzhuan" not in url:
            # 58.com detail page
            data = {
                'title': soup.title.text.strip(),
                'price': soup.select('.price.c_f50')[0].text,
                'date': soup.select('li.time')[0].text,
                'area': list(soup.select('.c_25d')[0].stripped_strings) if soup.select('.c_25d') else None,
                'cate': '个人' if who_sells == 0 else '商家',  # personal vs merchant
                'views': getViews(url)
            }
        else:
            # Zhuanzhuan detail page
            data = {
                'title': soup.title.text.strip(),
                'price': soup.select('span.price_now i')[0].text,
                'date': None,
                # 'palce_li' is the class name exactly as it appears in the page source
                'area': list(soup.select('div.palce_li i')[0].stripped_strings) if soup.select('div.palce_li') else None,
                'cate': '个人',  # Zhuanzhuan listings are treated as personal sellers
                'views': soup.select('span.look_time')[0].get_text().strip('次浏览')  # strip the "次浏览" suffix
            }
        print(data)

        # save the results to a local file (left disabled; see the sketch below)
        # with open('/Users/lht/Downloads/imgs/text', 'a') as fs:
        #     fs.write(str(data) + '\n')


get_links(0)
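
As a follow-up to the disabled save step, here is a minimal sketch of one way to persist each listing, assuming each data dict is appended as one JSON line; the results.jsonl path and the save_data helper name are illustrative, not part of the original script:

import json

def save_data(data, path='results.jsonl'):
    # append one listing per line; ensure_ascii=False keeps the
    # Chinese field values human-readable in the output file
    with open(path, 'a', encoding='utf-8') as fs:
        fs.write(json.dumps(data, ensure_ascii=False) + '\n')

Calling save_data(data) right after each print(data) in get_infos would wire it in.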


Personal Summary

- The view count is fetched via a JS request to the counter API and always came back 0. One definite bug: `headers` was passed positionally to `requests.get`, whose second positional parameter is `params`, so the Cookie and Referer headers were never actually sent; the code above now passes it as a keyword argument (a standalone check is sketched below).
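
A minimal standalone check of the counter endpoint with the headers passed correctly; the info id here is a made-up placeholder, and only Referer and User-Agent are sent (the endpoint may additionally require the Cookie used in getViews):

import requests

def fetch_view_count(info_id):
    # query 58's counter API directly; the headers dict must go in as a
    # keyword argument, otherwise requests treats it as query params
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    headers = {
        'Referer': 'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id),
        'User-Agent': 'Mozilla/5.0',
    }
    r = requests.get(api, headers=headers)
    # on success the response body contains a 'total=<n>' fragment
    return r.text.split('total=')[1] if 'total=' in r.text else None

print(fetch_view_count('123456789'))  # hypothetical info id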
