功能效果:根据关键词爬取微博搜索结果,记录相关博文及其发布用户的数据
代码核心部分:
```
from config import g_none_word, g_weibo_host, g_weibo_headers, WeiboData
import requests
from bs4 import BeautifulSoup
import csv
import re
import json
import os
import dateutil.parser

# Seconds to wait for any single HTTP request before giving up, so that a
# stalled connection cannot hang the crawler forever.
_REQUEST_TIMEOUT = 10

# Header row written once at the top of a new CSV file.  The column order
# mirrors the order in which save_to_file assigns the WeiboData attributes.
_CSV_HEADER = [
    "关键词", "帖子内容", "帖子链接", "帖子点赞数", "帖子转发数", "帖子评论数",
    "图片视频链接", "发布时间", "发布者的id", "发布者的姓名", "发布人的账号类型",
    "发布人的粉丝数", "作者简介", "ip归属地", "性别", "全部微博数量",
]

_BASE62_ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


def base62_decode(string):
    """Decode a base-62 string (digits, lowercase, uppercase) into an int.

    Raises:
        ValueError: if a character is not part of the base-62 alphabet.
    """
    num = 0
    for char in str(string):
        # Horner's scheme: equivalent to summing digit * 62**position.
        num = num * len(_BASE62_ALPHABET) + _BASE62_ALPHABET.index(char)
    return num


def reverse_cut_to_length(content, code_func, cut_num=4, fill_num=7):
    """Convert a string chunk by chunk, cutting from the right-hand end.

    The string is split into chunks of ``cut_num`` characters starting from
    the end (so only the first chunk may be shorter).  Each chunk is passed
    through ``code_func``; every chunk except the first is left-padded with
    zeros to ``fill_num`` characters, and the pieces are concatenated.

    Args:
        content: value to convert; it is stringified first.
        code_func: callable applied to every chunk.
        cut_num: chunk width in characters.
        fill_num: minimum width of every converted chunk but the first.

    Returns:
        str: the concatenated converted chunks.
    """
    content = str(content)
    chunks = [
        content[max(end - cut_num, 0):end]
        for end in range(len(content), 0, -cut_num)
    ]
    chunks.reverse()

    result = []
    for index, chunk in enumerate(chunks):
        converted = str(code_func(chunk))
        if index > 0:
            # Inner chunks keep their leading zeros; the first one does not.
            converted = converted.rjust(fill_num, '0')
        result.append(converted)
    return ''.join(result)


def url_to_mid(url: str):
    """Convert the short id found in a post URL into the numeric mid.

    >>> url_to_mid('z0JH2lOMb')
    3501756485200075
    """
    return int(reverse_cut_to_length(url, base62_decode))


def parse_time(s):
    """Reformat a timestamp string as ``YYYY-MM-DD HH:MM:SS``.

    Example: ``Wed Oct 19 23:44:36 +0800 2022`` => ``2022-10-19 23:44:36``.
    """
    return dateutil.parser.parse(s).strftime('%Y-%m-%d %H:%M:%S')


def parse_user_info(data):
    """Extract the user fields of interest from a user dict.

    Args:
        data: user dict; must contain ``id``, ``avatar_hd``, ``screen_name``
            and ``verified``.  The remaining fields are copied only when
            present.

    Returns:
        dict: the normalized user record.

    Raises:
        KeyError: if one of the required fields is missing.
    """
    # Basic information
    user = {
        "_id": str(data['id']),
        "avatar_hd": data['avatar_hd'],
        "nick_name": data['screen_name'],
        "verified": data['verified'],
    }
    # Additional information
    optional_keys = [
        'description', 'followers_count', 'friends_count', 'statuses_count',
        'gender', 'location', 'mbrank', 'mbtype', 'credit_score',
    ]
    for key in optional_keys:
        if key in data:
            user[key] = data[key]
    if 'created_at' in data:
        user['created_at'] = parse_time(data['created_at'])
    if user['verified']:
        user['verified_type'] = data['verified_type']
        if 'verified_reason' in data:
            user['verified_reason'] = data['verified_reason']
    return user


class WeiboCrawler(object):
    """Main entry point of the crawler.

    Typical use: build it with a search configuration, call
    :meth:`start_search`, then :meth:`save_to_file`.

    Args:
        search_config (dict): search options.  Keys read by this class:
            ``keyword``, ``begin_time``, ``end_time`` and ``page``.
    """

    def __init__(self, search_config: dict):
        self.__search_config = search_config
        self.__search_result = False
        # Initialised here so the attribute always exists, even before the
        # first successful search.
        self.__result_text = ""
        self.__key_word = search_config.get("keyword", g_none_word)

    def start_search(self, is_need_multithreading: bool = False):
        """Fetch the search result page for the configured keyword.

        When both ``begin_time`` and ``end_time`` are configured the search
        is restricted to that time range; otherwise a plain keyword search
        is performed.  The response text is kept for :meth:`save_to_file`.

        Args:
            is_need_multithreading (bool, optional): accepted for interface
                compatibility; not used by this method. Defaults to False.

        Returns:
            bool: whether the search succeeded.
        """
        # A failed search must not leave a previous result marked as valid.
        self.__search_result = False

        keyword = self.__search_config.get("keyword", g_none_word)
        if keyword == g_none_word:
            print("..........未提供关键词,搜索失败............")
            return False
        print(f"搜索开始,关键词为:{keyword}")

        begin_time = self.__search_config.get("begin_time", g_none_word)
        end_time = self.__search_config.get("end_time", g_none_word)
        if begin_time == g_none_word or end_time == g_none_word:
            print("开始时间或者结束时间设置为空")
            req_url = f"https://s.weibo.com/weibo?q={keyword}&Refer=index"
        else:
            page = self.__search_config.get("page", g_none_word)
            if page == g_none_word:
                # Never send the "missing value" sentinel as a page number.
                page = 1
            time_scope = f"custom%3A{begin_time}%3A{end_time}"
            print(f"构建搜索时间范围成功:字段参数为:{time_scope}")
            req_url = (
                f"{g_weibo_host}q={keyword}&typeall=1&suball=1"
                f"&timescope={time_scope}&Refer=g&page={page}"
            )
        print(f"需要搜索的url地址构建成功,地址为: {req_url}")

        try:
            resp = requests.get(
                req_url, headers=g_weibo_headers, timeout=_REQUEST_TIMEOUT
            )
        except requests.RequestException as err:
            print(f"搜索结果异常: {err}")
            return False
        if resp.status_code != 200:
            print(f".....{req_url} 网址响应异常......")
            return False

        resp.encoding = "utf-8"
        self.__result_text = resp.text  # kept for save_to_file
        self.__search_result = True
        return True

    def parse_blog_info(self, data):
        """Extract the post fields of interest from a post dict.

        Args:
            data: post dict as decoded from the ``statuses/show`` response.

        Returns:
            dict: the normalized post record, including a nested ``user``
            record and the public ``url`` of the post.

        Raises:
            KeyError: if a required field is missing from ``data``.
        """
        tweet = {
            "_id": str(data['mid']),
            "mblogid": data['mblogid'],
            "created_at": parse_time(data['created_at']),  # publication time
            "geo": data['geo'],
            "ip_location": data.get('region_name', None),
            "reposts_count": data['reposts_count'],
            "comments_count": data['comments_count'],
            "attitudes_count": data['attitudes_count'],
            "source": data['source'],
            "content": data['text_raw'].replace('\u200b', ''),
            "pic_urls": [
                "https://wx1.sinaimg.cn/orj960/" + pic_id
                for pic_id in data.get('pic_ids', [])
            ],
            "pic_num": data['pic_num'],
            'isLongText': False,
            "user": parse_user_info(data['user']),
        }
        page_info = data.get('page_info') or {}
        if page_info.get('object_type', '') == 'video':
            # Tolerate video entries that lack the 720p rendition instead of
            # failing the whole post.
            video_url = (page_info.get('media_info') or {}).get('mp4_720p_mp4')
            if video_url is not None:
                tweet['video'] = video_url
        # Public address of the post
        tweet['url'] = f"https://weibo.com/{tweet['user']['_id']}/{tweet['mblogid']}"
        if 'continue_tag' in data and data.get('isLongText'):
            tweet['isLongText'] = True
        return tweet

    def save_wb_data(self, file_name, wb_data: WeiboData):
        """Append one record to the CSV file, writing the header if needed.

        Args:
            file_name: path of the CSV file; created when it does not exist.
            wb_data (WeiboData): record to write.  Its instance attributes
                are written in attribute order as one row.
        """
        is_first = (
            not os.path.exists(file_name) or os.path.getsize(file_name) == 0
        )
        # utf-8-sig: explicit encoding independent of the platform default;
        # the BOM lets spreadsheet software detect UTF-8.
        with open(file_name, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            if is_first:
                writer.writerow(_CSV_HEADER)
            writer.writerow(list(wb_data.__dict__.values()))

    def _get_json(self, url: str):
        """GET ``url`` and return the decoded JSON body.

        Raises:
            requests.RequestException: on network errors, timeouts or a
                non-success HTTP status.
            ValueError: if the body is not valid JSON.
        """
        resp = requests.get(url, headers=g_weibo_headers, timeout=_REQUEST_TIMEOUT)
        resp.raise_for_status()
        resp.encoding = "utf-8"
        return json.loads(resp.text)

    def _build_wb_data(self, tweet_id):
        """Fetch one post and its author, and return the filled WeiboData."""
        wb_data = WeiboData()
        # NOTE: the assignment order below matches the CSV column order;
        # keep it in sync with _CSV_HEADER.
        wb_data.keyword = self.__key_word

        blog_json = self._get_json(
            f"https://weibo.com/ajax/statuses/show?id={tweet_id}"
        )
        item_blog = self.parse_blog_info(blog_json)
        wb_data.post_content = item_blog.get("content", g_none_word)
        wb_data.post_url = item_blog.get("url", g_none_word)
        wb_data.post_liked = item_blog.get("attitudes_count", "0")
        wb_data.post_transpond = item_blog.get("reposts_count", "0")
        wb_data.post_comment = item_blog.get("comments_count", "0")
        wb_data.post_image_videos_link = (
            str(item_blog.get("video", g_none_word))
            + str(item_blog.get("pic_urls", g_none_word))
        )
        wb_data.post_release_time = item_blog.get("created_at", g_none_word)
        wb_data.post_user_id = item_blog["user"]["_id"]
        wb_data.post_user_name = item_blog["user"]["nick_name"]

        # Profile of the author
        user_id = item_blog["user"]["_id"]
        data_user = self._get_json(
            f'https://weibo.com/ajax/profile/info?uid={user_id}'
        )
        item_user = parse_user_info(data_user["data"]["user"])

        # Extended profile details
        data_user_info = self._get_json(
            f"https://weibo.com/ajax/profile/detail?uid={item_user['_id']}"
        )['data']
        item_user['birthday'] = data_user_info.get('birthday', g_none_word)
        if 'created_at' not in item_user:
            item_user['created_at'] = data_user_info.get('created_at', g_none_word)
        item_user['desc_text'] = data_user_info.get('desc_text', g_none_word)
        item_user['ip_location'] = data_user_info.get('ip_location', g_none_word)
        item_user['sunshine_credit'] = data_user_info.get(
            'sunshine_credit', {}
        ).get('level', g_none_word)
        item_user['label_desc'] = [
            label['name'] for label in data_user_info.get('label_desc', [])
        ]
        if 'company' in data_user_info:
            item_user['company'] = data_user_info['company']
        if 'education' in data_user_info:
            item_user['education'] = data_user_info['education']

        wb_data.post_account_type = item_user.get("verified", g_none_word)
        # The column is the fan count, i.e. followers_count; friends_count
        # is the number of accounts this user follows.
        wb_data.post_fans_num = item_user.get("followers_count", g_none_word)
        wb_data.post_author_brief = item_user.get("description", g_none_word)
        wb_data.post_ip_pos = item_user.get("ip_location", g_none_word)
        # "m" means male; any other value is recorded as female.
        wb_data.post_gender = "男" if item_user.get("gender", "m") == "m" else "女"
        wb_data.post_all_weibo_nums = item_user.get("statuses_count", g_none_word)
        return wb_data

    def save_to_file(self, file_name: str, is_appended: bool = True):
        """Save every post found by the last search to a CSV file.

        Post ids are extracted from the stored search result page; each post
        and its author are then fetched and written as one CSV row.  A post
        that cannot be fetched or parsed is reported and skipped so that a
        single bad post does not abort the whole export.

        Args:
            file_name (str): path of the CSV file.
            is_appended (bool, optional): accepted for interface
                compatibility; rows are always appended regardless of its
                value. Defaults to True.

        Returns:
            bool: whether data was saved.
            str: a message describing the outcome.
        """
        if not self.__search_result:
            return False, "未搜索到数据,无法保存"

        tweet_ids = re.findall(
            r'\d+/(.*?)\?refer_flag=1001030103_\'\)">复制微博地址</a>',
            self.__result_text,
        )
        saved = 0
        for tweet_id in tweet_ids:
            try:
                wb_data = self._build_wb_data(tweet_id)
            except (requests.RequestException, ValueError, KeyError, TypeError) as err:
                print(f"微博 {tweet_id} 数据获取或解析失败,已跳过: {err!r}")
                continue
            self.save_wb_data(file_name, wb_data)
            saved += 1
        return True, f"保存完成,共写入 {saved} 条数据"
```
源码地址:https://github.com/huifeng-kooboo/weibo_keyword_crawl 。有任何问题欢迎联系,邮箱:942840260@qq.com,微信:ytouching。本项目仅供学习使用,请勿用于其他用途。