首先通过脚本遍历微博关注列表。起始ID由 self._startid = 1652595727 指定,随便换成任意一位黑客大佬的微博ID就行。
#coding:utf-8
import requests
import json
import re
from bs4 import BeautifulSoup
import pymongo
import Queue
import time
class sec_weibo(object):
    """Breadth-first crawler of weibo.cn follow lists, stored in MongoDB.

    Starting from ``_startid``, every account found on a user's follow pages
    is saved as a ``{"name", "uid", "follow"}`` document in the ``users``
    collection of the ``weibo`` database, and its uid is queued so that its
    own follow list is crawled later.
    """

    # Paste the Cookie header copied from the Chrome DevTools (F12) request
    # view here; the JS console is not enough because of HttpOnly cookies.
    _COOKIE_STR = "自己通过chrome F12获取cookie,控制台的不行,因为有HTTP ONLY"

    # Seconds before an HTTP request is abandoned, so that a stalled
    # connection cannot hang the crawler forever.
    _REQUEST_TIMEOUT = 30

    # Patterns applied to the serialized markup of one <td> cell.
    _RE_NAME = re.compile(r'">(.+?)</a>')
    _RE_UID = re.compile(r"uid=(.+?)&")

    def __init__(self):
        """Connect to MongoDB and create an empty crawl queue."""
        self._startid = 1652595727
        self.id = 0
        self.client = pymongo.MongoClient('mongodb://172.17.0.2/')
        self.db = self.client['weibo']
        self.posts = self.db['users']
        self.tagets = Queue.Queue()
        # uids (as strings) that were already queued; prevents crawling the
        # same account more than once and storing its follow edges twice.
        self._seen = set()

    @staticmethod
    def _parse_cookies(cookies_str):
        """Turn a ``"k1=v1; k2=v2"`` Cookie header into a dict.

        Only the first ``=`` of each segment separates key from value, so
        values that contain ``=`` survive intact.  Surrounding whitespace is
        stripped and segments without ``=`` (or with an empty key) are
        ignored instead of raising.
        """
        cookies = {}
        for segment in cookies_str.split(";"):
            key, sep, value = segment.partition("=")
            key = key.strip()
            if not sep or not key:
                continue
            cookies[key] = value.strip()
        return cookies

    def get_follow(self):
        """Crawl every follow page of ``self.id`` and store the result.

        Each account found is printed, queued in ``self.tagets`` (once per
        uid) and saved to ``self.posts``.  A page that fails to download or
        parse is skipped after a 10 second pause.

        Raises:
            SystemExit: when the total page count cannot be read from the
                first follow page (after a 60 second pause).
        """
        cookies = self._parse_cookies(self._COOKIE_STR)

        # Read the total number of follow pages from the first page.
        url = "https://weibo.cn/%d/follow" % self.id
        try:
            res = requests.get(url, cookies=cookies,
                               timeout=self._REQUEST_TIMEOUT)
            soup = BeautifulSoup(res.content, "lxml")
            # The page count is taken from the fifth <input> of the page.
            # A page with fewer inputs raises here and is handled below.
            total_page = int(soup.find_all("input")[4]['value'])
        except Exception:
            print("启动失败,请重新启动引擎。可能是被微博封IP了。")
            time.sleep(60)
            # Same effect as exit(0), without relying on the site module.
            raise SystemExit(0)

        n = 0
        follows = []
        # Collect the accounts listed on every page.
        for page in range(1, total_page + 1):
            time.sleep(0.5)
            try:
                url = "https://weibo.cn/%d/follow?page=%d" % (self.id, page)
                res = requests.get(url, cookies=cookies,
                                   timeout=self._REQUEST_TIMEOUT)
                soup = BeautifulSoup(res.content, "lxml")
                # NOTE(review): this assigns valign on the first <td>, it
                # does not filter.  Kept because the name regex below works
                # on the serialized markup, and because a page without any
                # <td> raises here and is reported as a failed page.
                soup.td['valign'] = "top"
                for user in soup.find_all("td"):
                    markup = str(user)
                    # Cells carrying this style are skipped.
                    if "style=\"width: 52px\"" in markup:
                        continue
                    name = self._RE_NAME.findall(markup)[0].split(">")[1]
                    uid = self._RE_UID.findall(markup)[0]
                    n += 1
                    print(name)
                    print(uid)
                    print("%s %d" % ('-' * 50, n))
                    if uid not in self._seen:
                        self._seen.add(uid)
                        self.tagets.put(uid)
                    follows.append({
                        "name": name,
                        "uid": uid,
                        "follow": str(self.id),
                    })
            except Exception:
                print("当前页面数据抓取失败,跳过。")
                time.sleep(10)

        # insert_many() rejects an empty document list.
        if follows:
            self.posts.insert_many(follows)
        print("休息10秒!")
        time.sleep(10)

    def run(self):
        """Crawl the start account, then every queued account in turn."""
        self.id = self._startid
        self._seen.add(str(self.id))
        self.get_follow()
        while not self.tagets.empty():
            print("qsize %d" % self.tagets.qsize())
            try:
                self.id = int(self.tagets.get())
                self.get_follow()
            # get_follow() raises SystemExit when it cannot read the page
            # count; for queued users that has always meant "skip this
            # user", which is kept.  KeyboardInterrupt is deliberately not
            # caught so that Ctrl-C stops the crawler.
            except (Exception, SystemExit):
                print("此用户数据抓取失败,跳过。")
if __name__ == "__main__":
    # Script entry point: build the crawler and start crawling.
    crawler = sec_weibo()
    crawler.run()
爬到数据之后,进行统计和排序。
import Queue
import pymongo
class data_analy(object):
    """Count how often each account name occurs in the crawled follow data."""

    def __init__(self):
        """Connect to the ``users`` collection of the ``weibo_vps`` database."""
        self.client = pymongo.MongoClient('mongodb://127.0.0.1/')
        self.db = self.client['weibo_vps']
        self.posts = self.db['users']
        # Not used by this class; kept so the attribute stays available.
        self.tagets = Queue.Queue()

    def run(self, min_total=100):
        """Print every name stored more than ``min_total`` times.

        Documents are grouped by ``name``.  Output lines have the form
        ``"<total><====><name>"`` and are ordered by ascending total, then
        by name.  Ordering is numeric, so a total of 1000 comes after 101
        instead of before it.

        Args:
            min_total: exclusive lower bound on the number of occurrences.

        Returns:
            The list of printed lines, in printed order.
        """
        res = self.posts.aggregate([
            {"$group": {"_id": "$name", "total": {"$sum": 1}}}
        ])
        # Sort on (total, name) before formatting; sorting the formatted
        # strings would compare the totals character by character.
        ranked = sorted(
            (row["total"], row["_id"])
            for row in res
            if row["total"] > min_total
        )
        result = [str(total) + "<====>" + name for total, name in ranked]
        for line in result:
            print(line)
        return result
if __name__ == "__main__":
    # Script entry point: build the analyser and print the ranking.
    analyser = data_analy()
    analyser.run()
结果:
101<====>Fooying
101<====>Seay_法师
101<====>安全北北
101<====>李劼杰
102<====>月亮山大王
102<====>粉丝服务平台
104<====>KeenTeam
106<====>alert7
106<====>瘦肉丁
107<====>唐门三少_tang3
107<====>栋栋的栋
107<====>爱吃猪肉的ztz
108<====>微博客服
108<====>赵彦_ayazero
110<====>xfkxfk
111<====>Seebug漏洞平台
111<====>互联网的那点事
111<====>吃瓜群众-Fr1day
112<====>秒拍
112<====>腾讯科恩实验室
112<====>阿里云安全
113<====>pynerd
113<====>左耳朵耗子
113<====>瘦古龙
113<====>阿里安全应急响应中心
113<====>黄源小童鞋
114<====>aullik5
116<====>微博数据助手
117<====>PanguTeam
117<====>xisigr
117<====>知乎
118<====>局座召忠
118<====>薛之谦
119<====>佳佳是个软妹纸
120<====>D3AdCa7
121<====>evi1m0
121<====>微博打赏
124<====>RicterZ
124<====>nearg1e
124<====>粉丝群
126<====>RevengeRangers
126<====>rock509
126<====>央视新闻
126<====>微博运动
127<====>拍客小助手
127<====>网易云音乐
128<====>人民日报
130<====>一直播
134<====>矮穷龊-陆羽
134<====>陈良-KeenLab
135<====>Val0z0
136<====>evil_xi4oyu
136<====>exp-sky
140<====>RAyH4c
140<====>国际版小助理
140<====>定时微博小助手
140<====>沈沉舟
142<====>宫一鸣cn
144<====>王思聪
146<====>微博故事
147<====>白帽汇赵武
148<====>yuange1975
148<====>古河120
148<====>长亭科技
150<====>real-肉肉
151<====>Orange_tw
151<====>riusksk
152<====>EvilMoon
156<====>杨卿-Ir0nSmith
159<====>博物杂志
159<====>碳基体
160<====>sunwear
163<====>redrain_QAQ
163<====>乌云知识库
165<====>GeekPwn
167<====>SecWiki
167<====>papi酱
167<====>乌云-漏洞报告平台
168<====>廖新喜已被注销
174<====>安全客官方微博
175<====>龚广OldFresher
177<====>我叫0day谁找我
183<====>phithon别跟路人甲BB
184<====>phunter_lau
188<====>FreeBuf
189<====>安全云舒
192<====>微博红包
194<====>微博问答
200<====>微博安全中心
201<====>呆子不开口
203<====>粉丝红包
208<====>余弦
209<====>tombkeeper
212<====>微博抽奖平台
212<====>来去之间
216<====>超级话题
220<====>Flanker_017
229<====>蒸米spark
261<====>腾讯玄武实验室
嘿嘿,这些就是微博里安全圈的大佬了。
也可以像先知里那样,整理一个关注关系网的可视化效果。