This example (written for Python 2) calls Sogou's WeChat official-account search interface: given a search keyword, it returns the name, WeChat ID, and description of each matching official account.
# coding:utf-8
import requests
import urllib
from bs4 import BeautifulSoup
# Example of the URL being crawled: http://weixin.sogou.com/weixin?type=1&s_from=input&query=%E4%BA%A7%E5%93%81&ie=utf8&_sug_=n&_sug_type_=
# Build the list of search-result URLs, one per page
def get_url(keyword, page):
    urlList = []
    firstUrl = 'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='
    lastUrl = '&page='
    kw = urllib.quote(keyword)  # URL-encode the keyword
    for p in range(1, page + 1):  # avoid shadowing the 'page' argument
        url = firstUrl + kw + lastUrl + str(p)
        urlList.append(url)
    return urlList
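# For example, get_url('产品', 2) yields two URLs whose query parameter is the
# encoded keyword shown in the sample URL above (%E4%BA%A7%E5%93%81), ending in
# &page=1 and &page=2 respectively.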
# Fetch each result page and extract the account info
def get_info(keyword, page):
    urlList = get_url(keyword, page)
    resList = []
    for url in urlList:
        response = requests.get(url)
        res = response.content
        soup = BeautifulSoup(res, 'html.parser')
        nameList = soup.findAll('p', attrs={'class': 'tit'})               # account names
        enameList = soup.findAll('label', attrs={'name': 'em_weixinhao'})  # WeChat IDs
        summaryList = soup.select('.gzh-box2 + dl > dd')                   # account descriptions
        # The number of results per page is inconsistent (Sogou's anti-scraping
        # measure), so 7 is used as the threshold: if a page returns at least 7
        # results, go on to the next page; otherwise treat it as the last page
        # and stop. There is probably a better approach -- worth thinking about.
        # zip() guards against the three lists differing in length.
        for name, ename, summary in zip(nameList, enameList, summaryList):
            resList.append({
                'name': name.text.strip('\n'),
                'ename': ename.text,
                'summary': summary.text
            })
        if len(nameList) < 7:
            break
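        # A possibly better stop condition (a sketch -- '#sogou_next' is an
        # assumption about the id of Sogou's "next page" link and should be
        # verified against the actual page source):
        #   if not soup.select('#sogou_next'):
        #       break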
    for weixin in resList:
        print 'Name: %s' % weixin['name'].encode('utf-8')
        print 'WeChat ID: %s' % weixin['ename'].encode('utf-8')
        print 'Description: %s' % weixin['summary'].encode('utf-8')
        print '\n'
if __name__ == '__main__':
    keyword = raw_input('Enter a keyword: ')
    # int(raw_input()) is safer than input(), which eval()s its input in Python 2
    page = int(raw_input('Number of result pages to fetch: '))
    get_info(keyword, page)
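# Example run (assuming the script is saved as sogou_weixin.py -- the filename
# is arbitrary -- and executed with Python 2, which this code targets):
#   $ python sogou_weixin.py
#   Enter a keyword: 产品
#   Number of result pages to fetch: 2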