分享一个百度贴吧爬虫

最近因公司业务需要，要收集一批疑似涉及诈骗的数据，百度贴吧是一个不错的数据来源。下面第一个爬虫抓取帖子列表页中的标题、摘要和链接。

# -*- coding: utf-8 -*-
import requests
import time
from bs4 import BeautifulSoup
 
import io
import sys
#sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gbk') #改变标准输出的默认编码
#生活大爆炸吧
'''
 # 标题&帖子链接：
    <a rel="noreferrer" href="/p/4788526595" title="我的人物设计和制作" target="_blank" class="j_th_tit ">我的人物设计和制作</a>
    
#发帖人：
    <span class="tb_icon_author " title="主题作者: 新日落" data-field="{"user_id":2137596235}"><i class="icon_author"></i><span class="frs-author-name-wrap"><a rel="noreferrer" data-field="{"un":"\u65b0\u65e5\u843d"}" class="frs-author-name j_user_card " href="/home/main/?un=%E6%96%B0%E6%97%A5%E8%90%BD&ie=utf-8&fr=frs" target="_blank">新日落</a></span><span class="icon_wrap  icon_wrap_theme1 frs_bright_icons "></span>    </span>
#发帖日期：
  <span class="pull-right is_show_create_time" title="创建时间">2016-09</span>
  
  
#回复数量：
    <div class="col2_left j_threadlist_li_left">
<span class="threadlist_rep_num center_text" title="回复">73</span>
    </div>
'''



#抓取网页的通用框架,获取页面的内容
def getHtml(url):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or the sentinel string ``"wrong!"`` when the
        request fails (connection problem, timeout, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into an exception so they hit the handler below.
        r.raise_for_status()
        # Force UTF-8 instead of relying on the encoding requests guesses.
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        # Only request failures are mapped to the sentinel; Ctrl-C and
        # programming errors are no longer silently swallowed.
        return "wrong!"
 
 
 
#分析网页的html文件，整理信息，保存到列表中
def get_content(url):
    """Scrape one thread-list page into a list of per-thread dicts.

    Args:
        url: Address of the thread-list page; it is fetched with ``getHtml``.

    Returns:
        A list with one dict per thread that could be parsed completely.
        Each dict has the keys ``title``, ``link``, ``content``, ``author``,
        ``responseNum`` and ``creatTime``. A thread that lacks any of these
        elements is reported with a short message and left out.
    """
    from urllib.parse import urljoin

    def text_of(node, tag, css_class):
        # Stripped text of the first matching descendant; raises
        # AttributeError when nothing matches (find() returned None).
        return node.find(tag, attrs={'class': css_class}).text.strip()

    contents = []

    html = getHtml(url)
    soup = BeautifulSoup(html, 'lxml')

    # NOTE(review): the class strings used in this function are matched
    # verbatim, including their leading/trailing spaces. Whether that matches
    # depends on how the installed bs4 normalises the class attribute —
    # confirm against live markup.
    liTags = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
    print(len(liTags))

    for li in liTags:
        content = {}

        # One malformed entry must not stop the rest of the page.
        try:
            # Look the title anchor up once; it supplies both title and link.
            anchor = li.find('a', attrs={"class": "j_th_tit"})
            content['title'] = anchor.text.strip()
            print(content['title'])

            # hrefs look like "/p/<id>"; urljoin avoids the double slash that
            # plain concatenation with the trailing-slash base produced.
            content['link'] = urljoin("http://tieba.baidu.com/", anchor["href"])
            print(content['link'])

            # Abstract of the first post (title + abstract gives richer text).
            content['content'] = text_of(
                li, 'div', 'threadlist_abs threadlist_abs_onlyline ')
            print(content['content'])

            content['author'] = text_of(li, 'span', 'tb_icon_author ')
            print(content['author'])

            content['responseNum'] = text_of(
                li, 'span', 'threadlist_rep_num center_text')
            print(content['responseNum'])

            content['creatTime'] = text_of(
                li, 'span', 'pull-right is_show_create_time')
            print(content['creatTime'])

            contents.append(content)
        except (AttributeError, KeyError, TypeError):
            # A required element or attribute is missing for this thread.
            print('出问题')

    return contents
 
 
def writeTxt(object_dir, content):
    """Append scraped threads to a UTF-8 text file.

    Args:
        object_dir: Path of the output file; created if it does not exist.
        content: Iterable of dicts, each with the keys ``title``,
            ``content`` and ``link``.

    Each thread is written as two lines: ``<title>。<content>`` followed by
    ``链接：<link>``.
    """
    # The encoding is set explicitly: relying on the platform default can
    # produce garbled output. UTF-8 matches what getHtml decodes with.
    # The with-block guarantees the file is flushed and closed, also when a
    # record is missing a key.
    with open(object_dir, 'a+', encoding='utf-8') as f:
        for c in content:
            f.write('{}。{}\n链接：{}\n'.format(
                c['title'], c['content'], c['link']))


def main(url, page, object_dir='data.txt'):
    """Crawl ``page`` consecutive list pages and append the results to a file.

    Args:
        url: Base list URL; ``&pn=<offset>`` is appended for every page.
        page: Number of list pages to crawl.
        object_dir: Path of the output text file.
    """
    # Offsets advance by 50 per page; all page URLs are built up front.
    offsets = (index * 50 for index in range(0, page))
    targets = [url + '&pn=' + str(offset) for offset in offsets]

    for target in targets:
        writeTxt(object_dir, get_content(target))
 
if __name__=="__main__":
    # Alternative crawl targets, kept for reference. Enable one by swapping
    # it with the active url/page/object_dir assignment further down.
    # url = "https://tieba.baidu.com/f?ie=utf-8&kw" \
    #       "=%E6%B8%B8%E6%88%8F%E5%8F%B7&fr=search"
    # page = 20
    # object_dir = '游戏账号.txt'

    # url = "https://tieba.baidu.com/f?ie=utf-8&kw=%E7%82%" \
    #       "AB%E8%88%9E%E8%B4%A6%E5%8F%B7%E4%BA%A4%E6%98%93"
    # page = 20
    # object_dir = '炫舞账号交易.txt'

    # url = "https://tieba.baidu.com/f?ie=utf-8&kw=" \
    #       "lol%E8%B4%A6%E5%8F%B7%E4%BA%A4%E6%98%93"
    # page = 20
    # object_dir = '英雄联盟账号交易.txt'

    # The base URL must not carry its own "pn" parameter: main() appends
    # "&pn=<offset>" for every page, so a trailing "&pn=0" here would put
    # two pn values into every request.
    url = "http://tieba.baidu.com/f?" \
          "kw=%E8%99%9A%E6%8B%9F%E5%B8%81&ie=utf-8"
    page = 200
    object_dir = '虚拟币.txt'

    main(url,page, object_dir)
    # get_content("https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B8%B8%E6%88%8F%E5%8F%B7&fr=search")

另一个版本（基于 urllib 和 lxml，进入每个帖子抓取回复内容，并按帖子标题保存为文件）：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: ShidongDu time2019/9/6

# -*- coding:utf-8 -*-
import os
import codecs
import json
import urllib
import urllib.request
from lxml import etree

class Spider:
    """Crawl a Tieba forum listing and save the text of every post to disk.

    For each listing page the spider follows every thread link, extracts the
    thread title and the text of each post, and appends the post text to
    ``./<dir>/<title>.txt``.

    Args:
        pages: Upper bound (exclusive) of the thread offset to crawl. Listing
            pages are requested at offsets 0, 50, 100, ... below this value.
        url: Listing URL that ends with the offset parameter name, e.g.
            ``http://tieba.baidu.com/f?kw=...&ie=utf-8&pn=``; the numeric
            offset is appended directly to it.
        dir: Name of the output directory, relative to the working directory.
    """

    # Characters replaced in file names: path separators and characters
    # that are not allowed in file names on common file systems.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'
    # Seconds to wait for a response before giving up on a request.
    _TIMEOUT = 30

    def __init__(self, pages, url, dir):
        self.pages = pages
        self.dir = dir
        self.url = url
        self.ua_header = {"User-Agent" : "Mozilla/5.0(compatible;MSIE 9.0;Windows NT 6.1; Trident/5.0;"}

    def tiebaSpider(self):
        """Visit every listing page and process the threads it links to."""
        # Start at offset 0 so the first listing page is included.
        for page in range(0, self.pages, 50):
            self.loadPage(self.url + str(page))

    def _fetch(self, url):
        """Return the parsed HTML tree for ``url``, or None if it is unavailable."""
        req = urllib.request.Request(url, headers=self.ua_header)
        try:
            with urllib.request.urlopen(req, timeout=self._TIMEOUT) as resp:
                html = resp.read().decode('utf8', errors='replace')
        except OSError as err:
            # URLError, HTTPError and socket timeouts are all OSError
            # subclasses; one unreachable page must not end the whole crawl.
            print("skip {}: {}".format(url, err))
            return None
        if not html.strip():
            print("skip {}: empty response".format(url))
            return None
        return etree.HTML(html)

    def loadPage(self, url):
        """Process every thread linked from one listing page.

        Args:
            url: Address of the listing page.

        Returns:
            The list of absolute thread URLs found on the page (empty when
            the page could not be fetched).
        """
        selector = self._fetch(url)
        if selector is None:
            return []

        hrefs = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        links = ["http://tieba.baidu.com" + href for href in hrefs]

        for link in links:
            self.loadImages(link)
        return links

    def loadImages(self, link):
        """Extract the title and all post texts of one thread and store them.

        Despite its name this method handles post text, not images.

        Args:
            link: Absolute URL of the thread.
        """
        selector = self._fetch(link)
        if selector is None:
            return

        titles = selector.xpath('//div[@class="left_section"]//div/h1/text()')
        if not titles:
            print("skip {}: no thread title found".format(link))
            return

        reply = {'reply_title': titles[0]}

        # XPath of the text container inside one post block. Whitespace
        # between XPath tokens is insignificant.
        post_body_xpath = (
            'div[@class="d_post_content_main"]/div/cc/'
            'div[starts-with(@id, "post_content") '
            'and contains(@class,"d_post_content j_d_post_content  clearfix")]'
        )

        for each_content in selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]'):
            try:
                # The data-field attribute holds the post metadata as JSON.
                reply_info = json.loads(each_content.xpath('@data-field')[0])
                author = reply_info['author']['user_name']
                reply_time = reply_info['content']['date']
                body = each_content.xpath(post_body_xpath)[0]
            except (IndexError, KeyError, TypeError, ValueError):
                # Post block without the expected metadata or text container.
                continue

            reply['reply_author'] = author
            reply['reply_content_time'] = reply_time
            reply['reply_content'] = body.xpath('string(.)').replace(' ', '')
            self.writeImages(self.dir, reply)

    def _safe_filename(self, title):
        """Return ``title`` with characters unusable in a file name replaced by '_'."""
        unsafe = self._UNSAFE_FILENAME_CHARS
        cleaned = ''.join(
            '_' if (ch in unsafe or ord(ch) < 32) else ch
            for ch in str(title).strip()
        )
        return cleaned or 'untitled'

    def writeImages(self, dir, reply):
        """Append one post's text to the file named after its thread title.

        Args:
            dir: Output directory name, relative to the working directory;
                created if missing.
            reply: Dict with at least ``reply_title`` and ``reply_content``.
        """
        s_path = './' + dir + '/'
        os.makedirs(s_path, exist_ok=True)

        # Titles come from the web page, so path separators and other
        # problematic characters are neutralised before use as a file name.
        file_name = self._safe_filename(reply['reply_title']) + '.txt'
        try:
            with codecs.open(s_path + file_name, 'a', encoding='utf-8') as file:
                file.write(reply['reply_content'] + '\n')
        except (OSError, UnicodeError):
            print("oops~")
# `pages` bounds the thread offset, not the page count: the spider steps
# through listing pages 50 threads at a time, so 5000 covers 100 pages.
pages = 5000
# Listing URL to crawl. It must end with "&pn=" because the spider appends
# the numeric offset directly to this string.
url = "http://tieba.baidu.com/f?kw=%E7%88%B1%E5%A5%87%E8%89%BAvip%E4%BC%9A%E5%91%98&ie=utf-8&pn="
dir = "爱奇艺"
# Bind the instance to its own name so the Spider class stays usable.
spider = Spider(pages, url, dir)
spider.tiebaSpider()
print("OK!")

分享一个百度贴吧爬虫

分享一个百度贴吧爬虫

另一个

相关阅读更多精彩内容

友情链接更多精彩内容