# 最近因公司业务需要,要收集一批疑似涉及诈骗的数据,百度贴吧是一个不错的数据来源。
# -*- coding: utf-8 -*-
import requests
import time
from bs4 import BeautifulSoup
import io
import sys
#sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gbk') #改变标准输出的默认编码
#生活大爆炸吧
'''
# 标题&帖子链接:
<a rel="noreferrer" href="/p/4788526595" title="我的人物设计和制作" target="_blank" class="j_th_tit ">我的人物设计和制作</a>
#发帖人:
<span class="tb_icon_author " title="主题作者: 新日落" data-field="{"user_id":2137596235}"><i class="icon_author"></i><span class="frs-author-name-wrap"><a rel="noreferrer" data-field="{"un":"\u65b0\u65e5\u843d"}" class="frs-author-name j_user_card " href="/home/main/?un=%E6%96%B0%E6%97%A5%E8%90%BD&ie=utf-8&fr=frs" target="_blank">新日落</a></span><span class="icon_wrap icon_wrap_theme1 frs_bright_icons "></span> </span>
#发帖日期:
<span class="pull-right is_show_create_time" title="创建时间">2016-09</span>
#回复数量:
<div class="col2_left j_threadlist_li_left">
<span class="threadlist_rep_num center_text" title="回复">73</span>
</div>
'''
def getHtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The page text on success, or the sentinel string ``"wrong!"`` when
        the request fails (connection problem, timeout, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn HTTP error statuses into an exception handled below.
        r.raise_for_status()
        # Force UTF-8 instead of relying on the guessed apparent_encoding.
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        # Best-effort crawl: keep reporting failure through the sentinel,
        # but no longer swallow KeyboardInterrupt or programming errors the
        # way the previous bare ``except`` did.
        return "wrong!"
def _first_text(node, tag, css_class):
    """Return the stripped text of the first ``tag`` under ``node`` with ``css_class``.

    Raises:
        AttributeError: if no such element exists (``find`` returned None).
    """
    return node.find(tag, attrs={"class": css_class}).text.strip()


def get_content(url):
    """Download one thread-list page and extract one record per thread.

    Args:
        url: Address of the list page; it is fetched through ``getHtml``.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``content``,
        ``author``, ``responseNum`` and ``creatTime``. A thread for which
        any field cannot be located is reported with a printed message and
        left out of the list.
    """
    from urllib.parse import urljoin

    # Collected records, one dict per successfully parsed thread.
    contents = []
    html = getHtml(url)
    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): several class strings below carry leading/trailing spaces;
    # they are kept exactly as they were - confirm they still match with the
    # installed BeautifulSoup version and the current page markup.
    liTags = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
    print(len(liTags))
    for li in liTags:
        content = {}
        # A missing element makes ``find`` return None (AttributeError on
        # ``.text``); a missing href raises KeyError. Either way the thread
        # is skipped so one odd entry does not stop the crawl.
        try:
            title_link = li.find('a', attrs={"class": "j_th_tit"})
            content['title'] = title_link.text.strip()
            print(content['title'])
            # href is site-relative (e.g. "/p/4788526595"); urljoin avoids the
            # double slash that plain string concatenation produced.
            content['link'] = urljoin("http://tieba.baidu.com/", title_link["href"])
            print(content['link'])
            # Abstract of the first post; combined with the title it gives
            # richer text than the title alone.
            content['content'] = _first_text(
                li, 'div', 'threadlist_abs threadlist_abs_onlyline ')
            print(content['content'])
            content['author'] = _first_text(li, 'span', 'tb_icon_author ')
            print(content['author'])
            content['responseNum'] = _first_text(
                li, 'span', 'threadlist_rep_num center_text')
            print(content['responseNum'])
            content['creatTime'] = _first_text(
                li, 'span', 'pull-right is_show_create_time')
            print(content['creatTime'])
            contents.append(content)
        except (AttributeError, KeyError):
            print('出问题')
    return contents
def writeTxt(object_dir, content):
    """Append crawled records to a UTF-8 text file.

    Each record is written as ``<title>。<content>`` on one line followed by
    a line holding its link.

    Args:
        object_dir: Path of the output file; it is opened in append mode, so
            repeated calls accumulate records.
        content: Iterable of dicts providing the keys ``title``, ``content``
            and ``link``.

    Raises:
        KeyError: if a record lacks one of the required keys.
    """
    # The encoding is given explicitly (matching the UTF-8 decoding used when
    # fetching pages) so the output is not garbled by the platform default.
    # The ``with`` block guarantees the file is flushed and closed.
    with open(object_dir, 'a+', encoding='utf-8') as f:
        for c in content:
            f.write('{}。{}\n链接:{}\n'.format(
                c['title'], c['content'], c['link']))
def main(url, page, object_dir='data.txt'):
    """Crawl ``page`` list pages starting from ``url`` and save their records.

    Args:
        url: Base list-page URL. ``&pn=<offset>`` is appended for every
            request, so it must already contain a query string.
        page: Number of list pages to fetch.
        object_dir: Path of the text file the records are appended to.
    """
    for i in range(page):
        # The offset advances in steps of 50 per list page.
        records = get_content(url + '&pn=' + str(i * 50))
        writeTxt(object_dir, records)
if __name__ == "__main__":
    # Alternative targets used earlier; uncomment one group to crawl it.
    # url = "https://tieba.baidu.com/f?ie=utf-8&kw" \
    #       "=%E6%B8%B8%E6%88%8F%E5%8F%B7&fr=search"
    # page = 20
    # object_dir = '游戏账号.txt'
    # url = "https://tieba.baidu.com/f?ie=utf-8&kw=%E7%82%" \
    #       "AB%E8%88%9E%E8%B4%A6%E5%8F%B7%E4%BA%A4%E6%98%93"
    # page = 20
    # object_dir = '炫舞账号交易.txt'
    # url = "https://tieba.baidu.com/f?ie=utf-8&kw=" \
    #       "lol%E8%B4%A6%E5%8F%B7%E4%BA%A4%E6%98%93"
    # page = 20
    # object_dir = '英雄联盟账号交易.txt'

    # main() appends "&pn=<offset>" to every request, so the base URL must not
    # carry its own pn parameter (it used to end in "&pn=0", which produced a
    # duplicated pn in each request).
    url = "http://tieba.baidu.com/f?" \
          "kw=%E8%99%9A%E6%8B%9F%E5%B8%81&ie=utf-8"
    page = 200
    object_dir = '虚拟币.txt'
    main(url, page, object_dir)
    # get_content("https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B8%B8%E6%88%8F%E5%8F%B7&fr=search")
# 另一个爬虫脚本(逐帖抓取回复内容,使用 urllib 和 lxml):
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: ShidongDu time2019/9/6
# -*- coding:utf-8 -*-
import os
import codecs
import json
import urllib
import urllib.request
from lxml import etree
class Spider:
    """Crawl Tieba list pages and save the reply text of every thread.

    Attributes:
        pages: Upper bound (exclusive) of the ``pn`` offsets to request;
            offsets advance in steps of 50.
        url: List-page URL. It may already end in ``pn=``; otherwise the
            ``pn`` parameter is added when page URLs are built.
        dir: Name of the output directory, created under the current
            working directory.
        ua_header: Request headers sent with every HTTP request.
    """

    def __init__(self, pages, url, dir):
        self.pages = pages
        self.dir = dir
        self.url = url
        self.ua_header = {"User-Agent" : "Mozilla/5.0(compatible;MSIE 9.0;Windows NT 6.1; Trident/5.0;"}

    def _page_url(self, offset):
        """Return the list-page URL for the given ``pn`` offset."""
        if self.url.endswith('pn='):
            return self.url + str(offset)
        # Without an explicit pn parameter, appending the bare number would
        # corrupt the last query value (e.g. "...ie=utf-850").
        joiner = '&' if '?' in self.url else '?'
        return self.url + joiner + 'pn=' + str(offset)

    def tiebaSpider(self):
        """Visit every list page and process all threads linked from it."""
        # Start at offset 0 so the first list page is included.
        for page in range(0, self.pages, 50):
            self.loadPage(self._page_url(page))

    def loadPage(self, url):
        """Download one list page and process each thread it links to."""
        req = urllib.request.Request(url, headers=self.ua_header)
        with urllib.request.urlopen(req) as resp:
            html = resp.read().decode('utf8', errors='replace')
        selector = etree.HTML(html)
        links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        for link in links:
            # hrefs are site-relative, so prefix the host.
            self.loadImages("http://tieba.baidu.com" + link)

    def loadImages(self, link):
        """Download one thread and store the text of each of its replies.

        Threads without a recognisable title and replies without a content
        node are skipped instead of aborting the crawl.
        """
        req = urllib.request.Request(link, headers=self.ua_header)
        with urllib.request.urlopen(req) as resp:
            html = resp.read().decode('utf8', errors='replace')
        selector = etree.HTML(html)
        titles = selector.xpath('//div[@class="left_section"]//div/h1/text()')
        if not titles:
            return
        title = titles[0]
        # One element per reply block in the thread.
        content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
        for each_content in content_field:
            # assumes the data-field JSON carries author.user_name and
            # content.date - TODO confirm against the current page markup
            reply_info = json.loads(each_content.xpath('@data-field')[0])
            content = each_content.xpath(
                'div[@class="d_post_content_main"]/div/cc/div[starts-with(@id, "post_content") '
                'and contains(@class,"d_post_content j_d_post_content clearfix")]')
            if not content:
                continue
            reply = {
                'reply_title': title,
                'reply_author': reply_info['author']['user_name'],
                'reply_content_time': reply_info['content']['date'],
                'reply_content': content[0].xpath('string(.)').replace(' ', ''),
            }
            self.writeImages(self.dir, reply)

    def writeImages(self, dir, reply):
        """Append one reply's text to ``./<dir>/<thread title>.txt``.

        Args:
            dir: Output directory name, relative to the current directory.
            reply: Dict providing ``reply_title`` and ``reply_content``.
        """
        s_path = './' + dir + '/'
        os.makedirs(s_path, exist_ok=True)
        try:
            file_name = str(reply['reply_title'])
            # A path separator inside the title would point into a
            # non-existent sub-directory, so replace it.
            for sep in (os.sep, os.altsep):
                if sep:
                    file_name = file_name.replace(sep, '_')
            with codecs.open(s_path + file_name + '.txt', 'a', encoding='utf-8') as file:
                file.write(reply['reply_content'] + '\n')
        except (OSError, KeyError):
            # Best effort: a reply that cannot be written is reported and skipped.
            print("oops~")
# 5000 entries, i.e. 100 list pages
pages = 5000
# List-page URL to crawl. The spider appends the numeric page offset to this
# URL, so it has to end with the "pn=" parameter.
url = "http://tieba.baidu.com/f?kw=%E7%88%B1%E5%A5%87%E8%89%BAvip%E4%BC%9A%E5%91%98&ie=utf-8&pn="
dir = "爱奇艺"
# Bind the instance to its own name so the Spider class is not overwritten.
spider = Spider(pages, url, dir)
spider.tiebaSpider()
print("OK!")