Scraping a Sina Weibo blogger's HD videos with Python

#! -*- coding:utf-8 -*-
import urllib2, urllib, json, datetime, time, requests, re

# Shared request headers: a mobile User-Agent so m.weibo.cn answers with JSON
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'X-DevTools-Emulate-Network-Conditions-Client-Id': '81ED134A01BAFB218319255E8C4C958E',
    'Referer': 'https://m.weibo.cn/u/1912713353?uid=1912713353&luicode=10000011&lfid=100103type%3D1%26q%3D6%E5%B2%9B%E5%B2%9B'
}
headers1 = headers  # assumption: headers1/headers2 are used below but never defined, so reuse the same mobile headers
headers2 = headers

# Resolve the uid for a screen name, then page through that user's video list
def getUrl(name):
    # Search API for the given name; the first result card carries the user's uid and profile_url
    Surl = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D' + urllib.quote(name) + '&page_type=searchall'
    print 'search url:', Surl
    req = urllib2.Request(Surl, None, headers)
    res = json.loads(urllib2.urlopen(req).read())
    lfid = res['data']['scheme'].split('lfid=')[1].split('_-_')[0]
    # print res['data']['scheme']
    uid = res['data']['cards'][0]['card_group'][0]['user']['id']
    profile_url = res['data']['cards'][0]['card_group'][0]['user']['profile_url']
    back = profile_url.split('&luicode=')[1]
    print 'got uid', uid, profile_url, back
    url2 = 'https://m.weibo.cn/api/container/getIndex?uid={0}&luicode={1}&type=uid&value={2}'.format(uid, back, uid)
    print url2
    containerid = getContained(url2)
    print '*' * 100
    print containerid
    for i in range(100):
        videoList = 'https://m.weibo.cn/api/container/getIndex?uid={0}&luicode={1}&type=uid&value={2}&containerid={3}_-_mcn_time'.format(uid, back, uid, containerid)
        print '-' * 100
        print videoList + '&page=' + str(i)
        done = getInfo(videoList + '&page=' + str(i))
        if done:  # an empty page means there are no more videos
            break
# Fetch the profile JSON and return the containerid used for the video list
def getContained(url):
    req = urllib2.Request(url, None, headers)
    res = json.loads(urllib2.urlopen(req).read())
    container = res['data']['tabsInfo']['tabs'][2]['containerid']
    return container
# Fetch one page of the video list, resolve the HD address of every video and
# collect its metadata into an item dict; returns True when the page is empty
def getInfo(url):
    req = urllib2.Request(url, None, headers)
    res = urllib2.urlopen(req).read()
    if len(res) < 100:
        # a near-empty body means this page has no cards, so stop paging
        return True
    mediaList = json.loads(res)['data']['cards']
    # print mediaList
    for m in mediaList:
        item = dict()
        create_time = datetime.datetime.now()
        source = m['mblog']['page_info']['page_url']
        object_id = m['mblog']['page_info']['object_id']
        title = m['mblog']['page_info']['content2']
        # videosRoute = m['mblog']['page_info']['media_info']['stream_url']
        img = m['mblog']['page_info']['page_pic']['url']
        author = m['mblog']['user']['screen_name']
        support = m['mblog']['attitudes_count']
        info = m['mblog']['page_info']['content2']
        # Resolve the HD video address: the video page redirects to the real
        # player page, whose video-sources attribute holds the encoded stream URL
        try:
            print source, object_id
            url_source = 'http://video.weibo.com/show?fid=' + object_id
            html = requests.get(url_source, headers=headers1, allow_redirects=False)
            url2 = html.headers['Location']
            reqw = urllib2.Request(url2, None, headers2)
            resw = urllib2.urlopen(reqw).read()
            vs = re.compile(r'video-sources="(.*?)"', re.S)
            v = re.findall(vs, resw)[0]
            str1 = urllib.unquote(v)
            str2 = str1.split('http://')[-1]
            videosRoute = 'http://' + str2
            print '^' * 100
            print videosRoute
        except (IndexError, KeyError):
            # could not resolve the HD address, fall back to the list API's stream_url
            print 'failed to resolve HD url, using stream_url instead'
            videosRoute = m['mblog']['page_info']['media_info']['stream_url']
        item['title'] = title
        item['author'] = author
        item['video_url'] = videosRoute
        item['img_route'] = img
        item['support'] = support
        item['description'] = info
        item['create_time'] = create_time
        # Derive local file names from the remote video and cover URLs
        try:
            item['video_name'] = videosRoute.split('cn/')[1].split('?')[0]
            item['img_name'] = img.split('cn/')[1].split('?')[0]
        except IndexError:
            item['video_name'] = videosRoute.split('stream/')[1].split('?')[0]
            try:
                item['img_name'] = img.split('images/')[1]
            except IndexError:
                item['img_name'] = img.split('stream/')[1]
                print img
        item['video_route'] = 'http://video.xxxx.com/' + item['video_name']
        item['video_img'] = 'http://video.xxxx.com/' + item['img_name']
        item['tag0'] = ""
        print 'title', title
        print 'video url', videosRoute
        print 'video img', img
        print 'author', author
        print 'likes', support
        print 'description', info
        print '-' * 100
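# Entry point: crawl all videos posted by this Weibo user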
name = '李子柒'
getUrl(name)
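The script above only prints each video's metadata and fills the item dict; nothing is written to disk. Below is a minimal download sketch, assuming the resolved video_url can be fetched directly with the same mobile headers (the helper name download_video is not part of the original script):

import requests

def download_video(video_url, filename, headers=None):
    # Stream the response to disk so a large video is never held in memory at once
    resp = requests.get(video_url, headers=headers, stream=True)
    resp.raise_for_status()
    with open(filename, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=64 * 1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)

# e.g. inside the loop in getInfo():
#     download_video(item['video_url'], item['video_name'], headers=headers)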