1. Crawling approach
Start from one user's following list and crawl everyone they follow (Weibo only exposes up to 200). Then use each of those 200 users as a new starting point and crawl the people they follow, 200 × 200, and so on. Since Weibo loads its content dynamically, m.weibo.cn is used as the entry point and the data is obtained by parsing the Weibo API responses. The scraped data is stored in MySQL; as this is just for learning, 100,000 records are enough, and only three fields are needed: username, Weibo uid, and avatar.
The profile page can be opened directly from the uid, so it is not stored separately; the avatar URL can likewise be reassembled, so only the last segment of its address is saved.
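A minimal sketch of how the stored fields map back to full URLs. The avatar prefix below is only an illustrative assumption; the real prefix depends on which sinaimg.cn host and size the original avatar_hd URL used:

AVATAR_PREFIX = "https://tva1.sinaimg.cn/crop.0.0.996.996.180/"  # hypothetical prefix, for illustration only

def profile_url(weibo_id):
    # the mobile profile page is reachable directly from the uid
    return "https://m.weibo.cn/u/%s" % weibo_id

def avatar_url(avatar_filename):
    # only the last path segment was stored; prepend the (assumed) prefix
    return AVATAR_PREFIX + avatar_filename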
2. Tools
For the crawler, it is our old friend Scrapy.
For the database, MySQL, accessed from Python through SQLAlchemy.
Create the database:
create database weibo default charset utf8 COLLATE utf8_general_ci;
use weibo;
create table if not exists t_weibo_user(
    id bigint auto_increment primary key,
    weibo_id bigint unique not null,
    username varchar(30) not null,
    avatar text not null
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Use Chrome's DevTools to inspect the parameters of the API request.
For the entry page, just pick a big-V account's following list; I used Yao Chen's following page.
In the DevTools Network panel you can see the request that loads the following list and the parameters it takes, and from there you can copy the request as cURL.
Then parse the request headers out of that cURL command. These headers matter a lot, especially the Cookie that marks your logged-in session, because a guest cannot pull this data at all.
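Before wiring anything into Scrapy, a quick sanity check of the copied headers with the requests library can save time. This is just a sketch: the Cookie value is a placeholder, and the headers dict is assumed to come from the cURL-parsing snippet at the end of this post:

import requests

# headers parsed from the copied cURL command; the Cookie value is a placeholder
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': '<your logged-in cookies here>',
}

url = ('https://m.weibo.cn/api/container/getIndex?'
       'containerid=231051_-_followers_-_1266321801&luicode=10000011'
       '&lfid=1076031266321801&featurecode=20000320'
       '&type=uid&value=1266321801&page=1')

resp = requests.get(url, headers=headers)
# with a logged-in Cookie the JSON body should contain a "cards" list
print(resp.json().get('cards', [])[:1])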
3. The Scrapy code
We now have the request headers and have worked out the API that returns the following list:
https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%s&luicode=10000011&lfid=107603%s&featurecode=20000320&type=uid&value=%s&page=1
The %s are placeholders to be filled in with a Weibo user uid; the uids themselves are also collected by the crawl.
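A quick sketch of filling in the template and bumping the page number (the uid is the one used for the entry page below):

FOLLOW_API = ('https://m.weibo.cn/api/container/getIndex?'
              'containerid=231051_-_followers_-_%s&luicode=10000011'
              '&lfid=107603%s&featurecode=20000320&type=uid&value=%s&page=1')

uid = 1266321801                      # Yao Chen's uid, used as the crawl entry point
first_page = FOLLOW_API % (uid, uid, uid)
# later pages of the same list are reached by incrementing the page parameter
second_page = first_page.replace('page=1', 'page=2')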
Next, the Python code.
Database access lives in sqlinstance.py:
from sqlalchemy import create_engine, Column, BigInteger, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class WeiboUserBean(Base):
    # ORM mapping for the t_weibo_user table created above
    __tablename__ = "t_weibo_user"
    id = Column(BigInteger, primary_key=True, autoincrement=True)
    weibo_id = Column(BigInteger, unique=True, nullable=False)
    username = Column(String(30), nullable=False)
    avatar = Column(Text, nullable=False)


engine = create_engine('mysql+pymysql://pig:123456@localhost:3306/weibo?charset=utf8',
                       encoding='utf-8', echo=False, pool_size=50, pool_recycle=3600)
DBSession = sessionmaker(bind=engine)


class SqlInstance:
    def tryadd(self):
        # quick smoke test: insert one dummy row
        session = DBSession()
        session.add(WeiboUserBean(weibo_id=1111, username='123', avatar='1234'))
        session.commit()
        session.close()

    def add_weibo_user(self, weibouser):
        if isinstance(weibouser, WeiboUserBean):
            session = DBSession()
            session.add(weibouser)
            session.commit()  # raises on a duplicate weibo_id; the spider catches it
            session.close()


sqlInstance = SqlInstance()

if __name__ == '__main__':
    sqlInstance.tryadd()
The main spider, weibospider.py:
# -*- coding: utf-8 -*-
import json
import re
from queue import Queue

import scrapy
from scrapy import Request

from weibo_scrapy import sqlinstance

# template for a user's following list; filled with the same uid three times
FOLLOW_API = ('https://m.weibo.cn/api/container/getIndex?'
              'containerid=231051_-_followers_-_%s&luicode=10000011'
              '&lfid=107603%s&featurecode=20000320&type=uid&value=%s&page=1')


class WeibospiderSpider(scrapy.Spider):
    name = 'weibospider'
    allowed_domains = ['m.weibo.cn']
    # entry point: page 1 of Yao Chen's following list
    start_urls = ['https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_1266321801&luicode=10000011&lfid=1076031266321801&featurecode=20000320&type=uid&value=1266321801&page=1']
    # uids whose own following lists still need to be crawled
    queue = Queue()

    def parse(self, response):
        pagenow = re.findall(r"page=(.*?)$", response.url)[0]
        mjsonobj = json.loads(response.text)
        if mjsonobj.get("cards"):
            try:
                usercards = mjsonobj["cards"][-1]["card_group"]
            except Exception:
                # no card_group on this page: move on to the next queued uid
                uid = self.queue.get()
                yield Request(FOLLOW_API % (uid, uid, uid))
                return
            for usercard in usercards:
                user = usercard["user"]
                imagepath = ""
                try:
                    # keep only the last path segment of the HD avatar URL
                    imagepath = user["avatar_hd"].split("/")[-1]
                except Exception:
                    pass
                try:
                    sqlinstance.sqlInstance.add_weibo_user(
                        sqlinstance.WeiboUserBean(weibo_id=user['id'],
                                                  username=user['screen_name'],
                                                  avatar=imagepath))
                except Exception as e:
                    # most likely a duplicate weibo_id; just skip it
                    print(e)
                self.queue.put(user['id'])
                print("queue add")
            # same user, next page
            targeturl = response.url.replace("page=" + pagenow, "page=" + str(int(pagenow) + 1))
            yield Request(targeturl)
        else:
            # this user's list is exhausted: start on the next queued uid
            uid = self.queue.get()
            yield Request(FOLLOW_API % (uid, uid, uid))
In settings.py, set Scrapy's request headers and the download delay, for example:
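A minimal sketch, assuming the headers are the ones parsed from the cURL command; the Cookie value is a placeholder you have to fill in yourself:

# settings.py (only the relevant parts; header values are placeholders)
ROBOTSTXT_OBEY = False      # we are hitting API endpoints directly
DOWNLOAD_DELAY = 1          # be gentle: roughly one request per second
COOKIES_ENABLED = False     # send the raw Cookie header below as-is instead of Scrapy's cookie jar

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': '<your logged-in cookies here>',
}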
That is basically it.
Finally, here is a way to extract the headers from a copied cURL command:
import re

curl_cmd = ''' curl 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_1266321801&luicode=10000011&lfid=1076031266321801&featurecode=20000320&type=uid&value=1266321801&page=2' -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36' -H 'Accept: application/json, text/plain, */*' -H 'Referer: https://m.weibo.cn/p/index?containerid=231051_-_followers_-_1266321801_-_1042015%253AtagCategory_050&luicode=10000011&lfid=1076031266321801&featurecode=20000320' -H 'X-Requested-With: XMLHttpRequest' -H 'Connection: keep-alive' --compressed '''

# pull out every -H '...' option and turn it into a dict entry
searches = re.findall(r"-H '.*?'", curl_cmd)
headers = {}
for item in searches:
    item = item.replace("-H", "").replace("'", "").strip()
    key, value = item.split(": ", 1)   # split only on the first ": "
    headers[key.strip()] = value.strip()
print(headers)
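The resulting dict can be pasted straight into DEFAULT_REQUEST_HEADERS in settings.py; just remember to add your own Cookie entry, which the copied command above happens not to include.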