import sys
from urllib.parse import urljoin

import requests
from lxml import etree
# Listing page that the crawl starts from (presumably the used-car listings
# for the city "hengyang" — inferred from the URL path; verify).
url = 'https://www.guazi.com/hengyang/buy/'
# Request headers sent with every HTTP request made by this script.
# NOTE(review): the cookie below looks like a captured browser session and
# contains what appear to be session/auth values (sessionid, GZ_TOKEN, CHDSSO,
# userid). It will presumably expire, and it should not live in source
# control — consider loading it from an environment variable or local config.
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
'cookie':'uuid=fe72b945-5ed5-433e-8a18-23ce0b28f289; clueSourceCode=%2A%2300; ganji_uuid=7966218787922810973167; sessionid=83d28f6d-4b95-4ca7-9e52-a4474ed1ce86; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22seo_google%22%2C%22ca_n%22%3A%22default%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22fe72b945-5ed5-433e-8a18-23ce0b28f289%22%2C%22ca_city%22%3A%22hengyang%22%2C%22sessionid%22%3A%2283d28f6d-4b95-4ca7-9e52-a4474ed1ce86%22%7D; close_finance_popup=2020-11-23; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; Hm_lvt_bf3ee5b290ce731c7a4ce7a617256354=1606096713,1606105565; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A45317976764%7D; GZ_TOKEN=d7dbWG09RMsVtCS8ShIHxxLvey8aEHZ%2FrvmSYJl5OF%2BPgmp4BUL%2F%2FM0EUSIWLQEcN8SAFj4UqseKGv6rGVlWo0Hai9uuX7wWg%2BAuG4lbNwrS8MBjWQBY5NAyKuobiBXOlJGfjkyVDmmW%2BUxH8w; guaZiUserInfo=5MnskHXiLwWKMXRlmDC%2Bb; userid=750434663; CHDSSO=d7dbWG09RMsVtCS8ShIHxxLvey8aEHZ%2FrvmSYJl5OF%2BPgmp4BUL%2F%2FM0EUSIWLQEcN8SAFj4UqseKGv6rGVlWo0Hai9uuX7wWg%2BAuG4lbNwrS8MBjWQBY5NAyKuobiBXOlJGfjkyVDmmW%2BUxH8w; lng_lat=112.6876_26.88286; gps_type=1; antipas=I6818V2183RF840Gv9TtPTi83; cityDomain=hengyang; user_city_id=207; preTime=%7B%22last%22%3A1606129007%2C%22this%22%3A1606096710%2C%22pre%22%3A1606096710%7D; Hm_lpvt_bf3ee5b290ce731c7a4ce7a617256354=1606129009'
}
def get_detail_urls(url, timeout=10):
    """Collect the detail-page URLs linked from a listing page.

    Fetches ``url`` using the module-level ``headers``, parses the response
    as HTML and gathers the ``href`` of every ``<a>`` that is a direct child
    of an ``<li>`` carrying a ``data-scroll-track`` attribute. Each href is
    resolved against ``url`` so the result contains absolute URLs.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs, in the order the XPath query yields them.
        The list is empty when no element matches.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout, requests can block forever on an unresponsive server.
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status rather than scraping an error page.
    resp.raise_for_status()
    tree = etree.HTML(resp.content.decode('utf-8'))
    hrefs = tree.xpath('//li[@data-scroll-track]/a/@href')
    # urljoin copes with root-relative, relative and already-absolute hrefs,
    # which plain string concatenation with the host does not.
    return [urljoin(url, href) for href in hrefs]
# Crawl the listing page, then print the title of every detail page it links to.
detail_urls = get_detail_urls(url)
for detail_url in detail_urls:
    # A timeout keeps one unresponsive page from stalling the whole crawl.
    resp = requests.get(detail_url, headers=headers, timeout=10)
    page = etree.HTML(resp.text)
    titles = page.xpath("//h1[@class='titlebox']/text()")
    if not titles:
        # No <h1 class='titlebox'> text on this page: report it on stderr and
        # move on instead of crashing with IndexError on titles[0].
        print('no title found:', detail_url, file=sys.stderr)
        continue
    print(titles[0].strip())
# After running the code above, you get: