Fetch information about the accounts a Weibo user follows and download the original-size photos from their posts
import re, os, time, urllib.request, requests
from bs4 import BeautifulSoup

# Follow list of the target user (replace xxxxxxxxx with the numeric user ID).
url = "https://weibo.cn/xxxxxxxxx/follow?page="
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
cookies = {"cookie": "Your cookie"}  # paste your logged-in weibo.cn cookie here
dir = "C:\\Users\\xxxxx\\Desktop\\Python\\Weibo"  # base directory for the downloads
For posts that contain a single image
def single(html, imgdir):
    reg_ori = r'src="(.+?)wap180/.+?\.jpg"'   # prefix of the image URL (everything before wap180/)
    reg_end = r'src=".+?/wap180/(.+?\.jpg)"'  # file name at the end of the image URL
    reg_orire = re.compile(reg_ori)
    reg_endre = re.compile(reg_end)
    url_ori = reg_orire.findall(html)[0]
    url_end = reg_endre.findall(html)[0]
    url = url_ori + "large" + '/' + url_end   # swapping wap180 for large yields the original-size photo URL
    print(url)
    x = url_end.split('.')[0]                 # use the file name (without extension) as the saved name
    curdir = imgdir + '\\'
    urllib.request.urlretrieve(url, '{}{}.jpg'.format(curdir, x))
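To make the URL rewrite concrete, here is a standalone illustration of the two regexes above; the host and file name are made up for illustration, not real Weibo data:
sample = 'src="https://wx1.sinaimg.cn/wap180/abc123.jpg"'           # hypothetical thumbnail markup
head = re.findall(r'src="(.+?)wap180/.+?\.jpg"', sample)[0]          # "https://wx1.sinaimg.cn/"
tail = re.findall(r'src=".+?/wap180/(.+?\.jpg)"', sample)[0]         # "abc123.jpg"
print(head + "large" + '/' + tail)                                   # https://wx1.sinaimg.cn/large/abc123.jpg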
For posts that contain multiple images (a "组图" photo set)
def group(html, imgdir):
    reg = r'<(a href=".+?">.+?)</a>'
    regre = re.compile(reg)
    lists = regre.findall(html)                # all links inside this post
    for i in lists:
        if u'组图' in i:                       # the 组图 (photo set) link leads to the photo pages
            ureg = r'a href="(https.+?)">'
            uregre = re.compile(ureg)
            gro_url = uregre.findall(i)[0]
            html = requests.get(gro_url, cookies=cookies, headers=headers).text
            img = r'img src="(http.+?\.jpg)"'  # thumbnail URLs on the photo-set page
            imgre = re.compile(img)
            imgurl = imgre.findall(html)
            for u in imgurl:
                s = r'(.+?)thumb180/.+?\.jpg'
                e = r'.+?/thumb180/(.+?\.jpg)'
                ss = re.compile(s).findall(u)[0]
                ee = re.compile(e).findall(u)[0]
                uu = ss + "large" + '/' + ee   # swapping thumb180 for large yields the original-size photo URL
                print(uu)
                curdir = imgdir + '\\'
                x = ee.split('.')[0]           # use the trailing file name as the saved photo name
                urllib.request.urlretrieve(uu, '{}{}.jpg'.format(curdir, x))
                time.sleep(2)
            time.sleep(2)
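The regex pair above can also be collapsed into a single string replacement, assuming the thumbnail path segment appears exactly once in the URL; the URL below is a made-up example:
u = "https://wx2.sinaimg.cn/thumb180/def456.jpg"   # hypothetical thumbnail URL
uu = u.replace("/thumb180/", "/large/")            # same result as the regex pair in group()
print(uu)                                          # https://wx2.sinaimg.cn/large/def456.jpg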
Get the total number of pages of a user's posts
def Page(url):
    response = requests.get(url, cookies=cookies, headers=headers)
    reg = r'value="(\d+)"'   # the first value="N" attribute on the page holds the total page count
    page = int(re.compile(reg).findall(response.text)[0])
    return page
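Page() assumes the first value="..." attribute in the response belongs to the page-jump form and carries the total page count. A standalone illustration on a made-up HTML fragment:
sample = '<input name="mp" type="hidden" value="42"/>'   # hypothetical page-jump input
print(int(re.findall(r'value="(\d+)"', sample)[0]))      # 42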
Walk through every page of a user's posts
def GetFollow(url, dir):
    ori_url = url + str(1)
    num = Page(ori_url)   # total number of pages for this account
    for i in range(1, num + 1):
        print("Page " + str(i))
        curr_url = url + str(i)
        try:
            response = requests.get(curr_url, cookies=cookies, headers=headers)
            while response.status_code != 200:   # retry until the page comes back
                response = requests.get(curr_url, cookies=cookies, headers=headers)
            soup = BeautifulSoup(response.text, 'lxml')
            weibo = soup.find_all('div', class_="c")   # each post sits in a <div class="c">
            for w in weibo:
                i = str(w)
                if u'原图' in i:          # the post links to an original-size photo
                    if u'组图' in i:      # photo set
                        print("Photo set")
                        group(i, dir)
                        time.sleep(2)
                    else:
                        print("Single photo")
                        single(i, dir)
                        time.sleep(2)
        except:
            time.sleep(2)
            continue
        time.sleep(2)
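GetFollow() can also be called on its own to scrape a single account; the user ID and target directory below are placeholders, left commented out so the script's behavior is unchanged:
# GetFollow("https://weibo.cn/xxxxxxxxx?page=", dir + "\\some_account")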
Collect all followed accounts, grab each one's follower count and profile URL, create a directory named after the Weibo ID, and write the basic info to a txt file
def GetAll(url):
    for p in range(1, 56):                     # the follow list spans 55 pages here (hard-coded)
        curr_url = url + str(p)
        html = requests.get(curr_url, cookies=cookies, headers=headers).text
        # print(html)
        soup = BeautifulSoup(html, 'lxml')
        follow = soup.find_all("td")
        list = []
        for i in range(1, 20, 2):              # every second <td> is an account entry (10 per page)
            list.append(follow[i])
        for i in list:
            con = str(i)
            # print(con)
            reg_url = r'td valign="top"><a href="(https://weibo.cn.+?)">.+?</a>'   # profile URL
            reg_name = r'td valign="top"><a href="https://weibo.cn.+?">(.+?)</a>'  # Weibo ID (display name)
            reg_fans = r'<br/>(.+?)<br/>'                                          # follower count
            urlre = re.compile(reg_url)
            namere = re.compile(reg_name)
            fansre = re.compile(reg_fans)
            fourl = urlre.findall(con)[0]
            foname = namere.findall(con)[0]
            fofans = fansre.findall(con)[0]
            print(fourl, foname, fofans)
            people_dir = dir + '\\' + foname   # one directory per followed account, named after the Weibo ID
            if not os.path.isdir(people_dir):
                os.mkdir(people_dir)
            os.chdir(people_dir)
            file = people_dir + '\\' + foname + ".txt"
            ff = open(file, 'at', encoding='utf-8')
            out = foname + ' ' + fofans + ' ' + fourl + '\n'
            ff.write(out)
            ff.close()
            full_url = fourl + "?page="
            GetFollow(full_url, people_dir)    # then download all of this account's photos
            os.chdir(dir)
            time.sleep(5)

GetAll(url)