Crawling every page of a site with Python and auto-submitting the URLs to Baidu

I recently built a site with an online website builder, only to find that its "auto-push to Baidu" feature is a paid add-on. Paying for the convenience of the builder is fine, but charging extra just to push links automatically was too much, so I rolled my own Baidu auto-push in Python.

The code is below:

import requests
import re
import time
from bs4 import BeautifulSoup as Bs4

# Token for Baidu's URL-submission API
baidu_token = 'your Baidu push token'

# Site root
head_url = "http://www.your-site.com"

# Regex that identifies URLs belonging to this site
self_url_reg = r'http.*your-site\.com'

# Pretend to be a normal browser; some sites reject the default requests User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

def baidu_push(url):
    # Extract the bare hostname for the 'site' query parameter
    hostname = re.findall(r'://.*\.com', url)[0]
    hostname = hostname[3:]
    print(hostname)
    # POST the URL to Baidu's link-submission endpoint
    res = requests.post('http://data.zz.baidu.com/urls?site=' + hostname + '&token=' + baidu_token, data=url)
    return res.text
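
# Per Baidu's documentation, a successful push returns JSON such as
# {"remain": 4999, "success": 1}, while an invalid token returns an
# error pair like {"error": 401, "message": "token is not valid"}.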

def get_url(urls, all_url=None):
    # Guard against the mutable-default-argument pitfall
    if all_url is None:
        all_url = []
    new_href = []
    for url in urls:

        # Skip URLs that have already been crawled
        if url in all_url:
            continue
        # Skip URLs that belong to other sites
        if not re.match(self_url_reg, url):
            continue

        response = requests.get(url, headers=headers)

        # Push the page to Baidu as soon as it is fetched
        print('---->' + url, baidu_push(url))
        all_url.append(url)

        soup = Bs4(response.text, "lxml")
        links = soup.select("a")
        for link in links:
            url_href = link.get("href")
            if url_href is None:
                continue
            # Skip javascript:, mailto: and tel: pseudo-links
            if re.match(r'.*javascript:.*(\)|;)', url_href) or re.match(r'(mailto|tel):', url_href):
                continue

            # Drop a trailing slash so the same page isn't collected twice
            url_href = re.sub(r'/$', '', url_href)
            
            # Protocol-relative link: prepend a scheme
            if url_href[:2] == '//':
                url_href = 'http:' + url_href

            # Relative link: prepend the site root
            if url_href[:4] != 'http':
                if url_href[:1] == '/':
                    url_href = head_url + url_href
                else:
                    url_href = head_url + '/' + url_href
            
            # Skip URLs that have already been collected
            if url_href in all_url or url_href in new_href:
                continue

            # Skip URLs that belong to other sites
            if not re.match(self_url_reg, url_href):
                continue

            # time.sleep(2)  # uncomment to throttle the crawl
            new_href.append(url_href)

        new_href = list(set(new_href))
        
    if len(new_href) > 0:
        # Recurse on the newly discovered URLs; the result propagates back up
        return get_url(new_href, all_url)

    # Nothing new was found, so the crawl is complete; return the full list
    return all_url



if __name__ == "__main__":
    # Each URL is pushed to Baidu as it is discovered inside get_url();
    # here the full list is joined and printed for reference.
    urllist = get_url([head_url])
    postData = '\n'.join(urllist)
    print(postData)
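
As written, each URL is submitted in its own request from inside get_url. Baidu's endpoint also accepts many URLs in a single POST, newline-separated in the body, so the collected list could be pushed in one shot instead. Below is a minimal sketch of such a batch helper; baidu_push_batch is a hypothetical name, and it reuses baidu_token and head_url from above:

def baidu_push_batch(urls):
    # Hypothetical batch helper: Baidu's push endpoint accepts
    # newline-separated URLs in a text/plain POST body.
    hostname = re.sub(r'^https?://', '', head_url)
    body = '\n'.join(urls)
    res = requests.post(
        'http://data.zz.baidu.com/urls?site=' + hostname + '&token=' + baidu_token,
        data=body.encode('utf-8'),
        headers={'Content-Type': 'text/plain'},
    )
    return res.text

# Usage: print(baidu_push_batch(urllist))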