import re

import requests  # third-party HTTP client used to download the page
from bs4 import BeautifulSoup
# Scrape the link list selected by NEWS_SELECTOR from the cntour.cn home page
# and print one record (title, link, numeric IDs) per link.

# Request headers: identify the client with a desktop Chrome User-Agent string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

# China Tourism Network home page.
url = 'http://www.cntour.cn/'

# CSS selector for the anchor elements to extract.
NEWS_SELECTOR = '#main > div > div.mtop.firstMod.clearfix > div.centerBox > ul.newsList > li > a'

# Seconds to wait for the server; requests applies no timeout unless one is
# given, so without this a stalled connection would hang the script.
REQUEST_TIMEOUT = 10

# Compiled once; raw string so that \d is a regex escape, not a string escape.
_DIGITS = re.compile(r'\d+')


def build_record(title, href):
    """Build the result dict for a single link.

    Args:
        title: Text content of the anchor.
        href: Value of the anchor's ``href`` attribute, or ``None`` when the
            attribute is absent.

    Returns:
        A dict with keys ``'title'``, ``'link'`` and ``'ID'``; ``'ID'`` is the
        list of every run of digits found in ``href`` (empty when ``href`` is
        missing or contains no digits).
    """
    return {
        'title': title,
        'link': href,
        # Guard against anchors without an href: re cannot search None.
        'ID': _DIGITS.findall(href) if href else [],
    }


def fetch_news(page_url=url, timeout=REQUEST_TIMEOUT):
    """Download ``page_url`` and return one record per matched anchor.

    Args:
        page_url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts as produced by ``build_record``, in document order.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(page_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    # Parse with the lxml parser.
    soup = BeautifulSoup(response.text, 'lxml')
    return [
        build_record(anchor.get_text(), anchor.get('href'))
        for anchor in soup.select(NEWS_SELECTOR)
    ]


def main():
    """Fetch the page and print each extracted record on its own line."""
    for result in fetch_news():
        print(result)


if __name__ == '__main__':
    main()