"""Baidu Tieba post crawler.

1. Crawl any Baidu Tieba post thread.
2. Optionally restrict crawling to the original poster's (楼主) content.
3. Parse the crawled content and save it to a text file.
"""
import re
import bs4
from bs4 import BeautifulSoup
import requests
class TiebaSpider(object):
    """Crawler for a Baidu Tieba post thread.

    Fetches every page of a thread, extracts the post bodies, and appends
    them to a text file named after the thread title.
    """

    def __init__(self, see_lz):
        # see_lz: 1 -> crawl only the original poster's posts, 0 -> all posts.
        self.see_lz = see_lz

    def getHTMLText(self, url, pageNumber):
        """Return the HTML of page *pageNumber* of *url*, or 'ERROR' on failure.

        Only network/HTTP failures are caught; anything else is a real bug
        and should surface (the original bare ``except:`` hid those too).
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
        try:
            r = requests.get(url + str(pageNumber), timeout=30, headers=headers)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            # Keep the legacy sentinel so downstream parsers degrade gracefully.
            return 'ERROR'

    def getTitle(self, html):
        """Extract the thread title; fall back to '百度贴吧' when absent.

        Explicit None check instead of the original bare ``except:`` that
        relied on ``None.group`` raising AttributeError.
        """
        title = re.search(r'<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>', html)
        if title is None:
            return '百度贴吧'
        return title.group(1).strip()

    def getContent(self, html):
        """Return a list with the text of every post body on the page."""
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE(review): the trailing space in the class string matches the
        # real page markup and is significant for this exact-match lookup.
        contents = soup.find_all('div', attrs={'class': 'd_post_content j_d_post_content '})
        return [content.get_text() for content in contents]

    def getPageNumber(self, html):
        """Return the thread's total page count; 1 when it cannot be parsed.

        The original crashed with AttributeError when the reply-count
        element was missing (e.g. on an 'ERROR' page).
        """
        soup = BeautifulSoup(html, 'html.parser')
        li = soup.find('li', attrs={'class': 'l_reply_num'})
        try:
            return int(li.find_all('span')[1].string)
        except (AttributeError, IndexError, TypeError, ValueError):
            return 1

    def writeFile(self, title, contents):
        """Append *contents* to '<title>.txt', one numbered floor per post.

        Returns 'ERROR' when either argument is None.
        NOTE(review): floor numbering restarts at 1 on every call (i.e. every
        page) — that matches the original behavior; thread-wide numbering
        would require the caller to pass an offset.
        """
        if title is None or contents is None:
            return 'ERROR'
        with open(title + '.txt', 'a', encoding='utf-8') as f:
            for floor, item in enumerate(contents, 1):
                floorline = str(floor) + u'楼------------------------------------------------------------------------------' + '\n'
                f.write(floorline + item + '\n' + '\n')

    def start(self, start_url='https://tieba.baidu.com/p/3138733512?'):
        """Crawl every page of the thread at *start_url* and save the posts.

        The default URL preserves the original hard-coded thread; pass any
        other 'https://tieba.baidu.com/p/<id>?' URL to crawl a different one.
        """
        url = start_url + 'see_lz=' + str(self.see_lz) + '&pn='
        html = self.getHTMLText(url, 1)
        title = self.getTitle(html)
        pageNumber = self.getPageNumber(html)
        self.writeFile(title, self.getContent(html))
        for i in range(2, pageNumber + 1):
            # Use i (current page) so the bar can actually reach 100%; the
            # original counted processed pages and topped out at (N-1)/N.
            print('\r当前进度{:.2f}%'.format(i * 100 / pageNumber), end='')
            html = self.getHTMLText(url, i)
            self.writeFile(title, self.getContent(html))
if __name__ == '__main__':
    # Guard the entry point so importing this module no longer triggers a
    # full network crawl as a side effect. see_lz=1: original poster only.
    spider = TiebaSpider(1)
    spider.start()