import re
import requests
from fake_useragent import UserAgent
# Fetch the Baidu home page and extract its top navigation links as
# (url, label) tuples, then print them.

# Send a randomly chosen browser User-Agent string with the request.
ua = UserAgent()
headers = {'User-Agent': ua.random}

# The timeout keeps the script from hanging forever on a stalled connection.
response = requests.get('http://www.baidu.com/', headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently scraping an error page.
response.raise_for_status()
# Decode the body as UTF-8 explicitly rather than relying on the encoding
# requests infers for the response.
response.encoding = 'utf-8'
html = response.text

# Group 1: the link target, an http:// URL ending in ".com".
# Group 2: the link label, exactly two word characters.
# The dot before "com" is escaped so it only matches a literal ".", and the
# attribute wildcards use [^"] so a match cannot run past the closing quote
# of the attribute into neighbouring markup.
nav_link_pattern = (
    r'<a href="(http://[^"]*?\.com)" name="tj_tr[^"]*" class="mnav">'
    r'(\w{2})</a>'
)
# NOTE: the name `title` is kept from the original script; it holds a list of
# (url, label) tuples, one per matched navigation link.
title = re.findall(nav_link_pattern, html)
print(title)
# Output:
# [('http://news.baidu.com', '新闻'), ('http://map.baidu.com', '地图'), ('http://v.baidu.com', '视频'), ('http://tieba.baidu.com', '贴吧'), ('http://xueshu.baidu.com', '学术')]