# youtube项目第三版 (YouTube project, version 3)

from pytube import YouTube
import requests
from fake_useragent import UserAgent
import re
from selenium import webdriver
import ssl
from lxml import etree
import time
from selenium.webdriver.common.keys import Keys
from Google import GoogleTranslate
import redis
import os
import smtplib
import time
from email.mime.text import MIMEText
from email.header import Header
import cv2
# Module-level browser / storage setup shared by every QIE method below.
# Chrome is started headless so the script can run without a display.
opt = webdriver.ChromeOptions()
opt.add_argument('--headless')
# Fallback login cookies; set_cookie() attaches them (scoped to ".qq.com")
# when no saved cookie file can be loaded.
# NOTE(review): these look like real session credentials committed to source —
# consider loading them from a private file or environment instead.
cookie_list = [{'name': 'pt2gguin', 'value': 'o0654921690'}, {'name': 'RK', 'value': 'mSglPLOFQ8'}, {'name': 'ptcz', 'value': '5eb1aeb628b6ab67aa306285a78434959b385f8e39415bda4ff4db3ea7763d75'}, {'name': 'pgv_pvid', 'value': '116706325'}, {'name': 'pgv_pvi', 'value': '1972684800'}, {'name': 'pgv_si', 'value': 's3116448768'}, {'name': 'pgv_info', 'value': 'ssid'}, {'name': 'uin', 'value': 'o0654921690'}, {'name': 'skey', 'value': '@7pMFazBu1'}, {'name': 'ptisp', 'value': 'ctc'}, {'name': 'ts_refer', 'value': 'www.baidu.com/link'}, {'name': 'ts_uid', 'value': '3500005680'}, {'name': 'userid', 'value': '5661917'}, {'name': 'fname', 'value': '%E5%BC%BA%E8%BF%AB%E7%97%87%E7%9A%84%E5%B0%8F%E4%B9%90%E8%B6%A3'}, {'name': 'fimgurl', 'value': 'http%3A%2F%2Finews.gtimg.com%2Fnewsapp_ls%2F0%2F1807099981_200200%2F0'}, {'name': 'omtoken', 'value': '3799d6a2c9'}, {'name': 'omtoken_expire', 'value': '1531158354'}, {'name': 'alertclicked', 'value': '%7C%7C'}, {'name': 'rmod', 'value': '1'}, {'name': 'ts_last', 'value': 'om.qq.com/'}, {'name': 'TSID', 'value': 'jemm4veh9kh0ttq4jfv8s17ft1'}, {'name': '9e67236d07bdc7152e6e2b42b7f00f43', 'value': 
'f603772395ba8a0bff88b2715af27d5cb5bd4ffaa%253A4%253A%257Bi%253A0%253Bs%253A7%253A%25225661917%2522%253Bi%253A1%253Bs%253A16%253A%2522654921690%2540qq.com%2522%253Bi%253A2%253Bi%253A43200%253Bi%253A3%253Ba%253A15%253A%257Bs%253A6%253A%2522status%2522%253Bi%253A2%253Bs%253A5%253A%2522email%2522%253Bs%253A16%253A%2522654921690%2540qq.com%2522%253Bs%253A9%253A%2522logintype%2522%253Bi%253A1%253Bs%253A3%253A%2522uin%2522%253BN%253Bs%253A5%253A%2522phone%2522%253BN%253Bs%253A4%253A%2522wxid%2522%253BN%253Bs%253A6%253A%2522imgurl%2522%253Bs%253A55%253A%2522http%253A%252F%252Finews.gtimg.com%252Fnewsapp_ls%252F0%252F1807099981_200200%252F0%2522%253Bs%253A4%253A%2522name%2522%253Bs%253A21%253A%2522%25E5%25BC%25BA%25E8%25BF%25AB%25E7%2597%2587%25E7%259A%2584%25E5%25B0%258F%25E4%25B9%2590%25E8%25B6%25A3%2522%253Bs%253A10%253A%2522isVerified%2522%253Bb%253A1%253Bs%253A10%253A%2522isRejected%2522%253Bb%253A0%253Bs%253A9%253A%2522agreeAcpt%2522%253Bb%253A0%253Bs%253A6%253A%2522pwdChg%2522%253Bb%253A0%253Bs%253A9%253A%2522avatarChg%2522%253Bb%253A0%253Bs%253A2%253A%2522lk%2522%253Bs%253A24%253A%2522lXI_TXsMWwxKS18anK76DQ00%2522%253Bs%253A2%253A%2522id%2522%253Bs%253A7%253A%25225661917%2522%253B%257D%257D'}]

# Fixed desktop-Chrome user agent passed to the headless browser.
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
opt.add_argument('user-agent="%s"' % ua)
# opt.add_argument('cookie=%s'% cookie)
# Single shared WebDriver instance; every QIE method drives this browser.
driver = webdriver.Chrome(chrome_options=opt)
driver.maximize_window()
# Local redis instance; download_vides() stores each handled video URL here
# so the same video is not downloaded twice.
redis_cli = redis.Redis(host='127.0.0.1',port=6379)
class QIE:
    """Download new videos from a YouTube channel and publish them on om.qq.com.

    The class relies on module-level objects created elsewhere in this file:
    ``driver`` (the shared Chrome WebDriver), ``redis_cli`` (remembers which
    video URLs were already handled) and ``cookie_list`` (fallback login
    cookies).

    Attributes:
        ua: ``fake_useragent.UserAgent`` used to pick a random User-Agent.
        upload_error_handling_num: number of failed upload attempts for the
            video currently being uploaded.
        path: directory prefix prepended to file names handed to the browser.
        names: file names produced by the latest ``download_vides`` call.
    """

    # Publishing page of the om.qq.com account.
    UPLOAD_URL = 'https://om.qq.com/article/articlePublish#/!/view:article?typeName=multivideos'
    # Local proxy used for every request that has to reach YouTube.
    PROXIES = {'https': '127.0.0.1:1087'}
    # Seconds to wait for the channel page before giving up on the request.
    REQUEST_TIMEOUT = 30
    # main() does not start working before this Unix timestamp.
    START_TIMESTAMP = 1532037966
    # Phrases stripped from translated titles. Longer phrases MUST come before
    # the shorter ones they contain ('令人震惊' before '震惊'), otherwise the
    # short replacement fires first and leaves fragments such as '令人' behind.
    CLICKBAIT_PHRASES = ('令人震惊', '令人吃惊', '让人震惊', '令人惊讶', '震惊', '最')
    # XPath of the n-th recommended tag link (1-based position).
    TAG_XPATH = '//li[contains(./text(),"推荐标签")]/following::li[{}]/a'
    # XPath of the free-text tag input.
    TAG_INPUT_XPATH = '//div/input[contains(./@id,"videoTags")]'
    # (position, message printed when the tag is missing, tag typed instead)
    RECOMMENDED_TAGS = (
        (1, '没有推荐的标签', '英语'),
        (2, '没有第二个推荐的标签', '奇趣'),
        (3, '没有第三个推荐的biaoqian ', None),
        (4, '没有第四个推荐的标签', None),
        (5, '没有第五个推荐的标签', None),
        (6, '没有第六个推荐的标签', None),
        (7, '没有第七个推荐的标签', None),
        (8, '没有第八个推荐的标签', None),
        (9, '没有第九个推荐的标签', None),
    )

    def __init__(self):
        self.ua = UserAgent()
        # Upload failure counter; past the threshold the current video is dropped.
        self.upload_error_handling_num = 0
        self.path = '/Users/admin/Documents/qiehao/'
        # Filled by download_vides(); initialised here so the attribute always exists.
        self.names = []

    @staticmethod
    def _extract_watch_urls(html):
        """Return the distinct ``/watch?v=...`` paths found in *html*, in page order."""
        ordered = []
        seen = set()
        for match in re.findall(r'"(/watch\?v=.+?)"', html):
            if match not in seen:
                seen.add(match)
                ordered.append(match)
        return ordered

    @classmethod
    def _clean_title(cls, title):
        """Return *title* with every phrase in ``CLICKBAIT_PHRASES`` removed."""
        for phrase in cls.CLICKBAIT_PHRASES:
            title = title.replace(phrase, '')
        return title

    def get_video_urls(self, index_page, max_retries=None):
        """Fetch a channel page and return up to three distinct video paths.

        Args:
            index_page: URL of the channel's video listing.
            max_retries: maximum number of fetch attempts that may come back
                without any video link. ``None`` (default) retries forever.

        Returns:
            A list of at most three ``/watch?v=...`` paths in page order, or
            the string ``'no'`` when *max_retries* attempts found nothing.

        Raises:
            requests.RequestException: network errors (including the request
                timeout) are not handled here.
        """
        attempts = 0
        while True:
            headers = {'User-Agent': self.ua.random}
            response = requests.get(
                index_page,
                headers=headers,
                proxies=self.PROXIES,
                timeout=self.REQUEST_TIMEOUT,
            ).text
            list_urls = self._extract_watch_urls(response)
            if list_urls:
                print(list_urls)
                # Keep only the first three videos, preserving page order.
                return list_urls[:3]

            print('匹配失败')
            print(response)
            attempts += 1
            if max_retries is not None and attempts >= max_retries:
                return 'no'
            time.sleep(2)

    def download_vides(self):
        """Download the first not-yet-handled video of the channel.

        Every candidate URL is looked up in redis; the first one that is
        missing is recorded there and downloaded. The translated, cleaned
        file name is stored in ``self.names``.

        Returns:
            ``'ok'`` when a video was downloaded, otherwise ``'no'``.
        """
        index_page1 = 'https://www.youtube.com/channel/UCmLQqA5g62GIfQa9oME05wQ/videos?sort=dd&view=0&flow=grid'
        result = self.get_video_urls(index_page1)
        if result == 'no':
            return 'no'

        self.names = []
        # Number of videos downloaded in this call (at most one).
        num = 0
        for url in result:
            if num >= 1:
                break
            if redis_cli.get(url) is not None:
                continue

            # The URL is recorded before the download starts, so a video that
            # makes the download code raise is not attempted again next run.
            redis_cli.set(url, 1)
            print('{}已经写入redis'.format(url))
            print('一共{}个视频'.format(len(result)))
            # SECURITY NOTE: this disables HTTPS certificate verification for
            # the whole process, not only for this download.
            ssl._create_default_https_context = ssl._create_unverified_context
            url = 'https://www.youtube.com' + url
            yt = YouTube(url=url, proxies=self.PROXIES)
            data = yt.streams.first()
            file_name = data.default_filename
            print('视频开始下载......')
            # Translate the title with Google Translate, then strip clickbait words.
            go = GoogleTranslate(file_name)
            file_name = self._clean_title(go.get_translated())
            print(file_name)
            # Retries without limit; only a successful download leaves the loop.
            while True:
                try:
                    data.download(filename=file_name)
                    break
                except Exception:
                    print('ssl.错误,重试')
                    time.sleep(5)
            num += 1
            print('第%d个视频下载完成,文件名是:%s' % (num, file_name))
            if '.mp4' not in file_name:
                file_name += '.mp4'
            self.names.append(file_name)
            print(self.names)

        return 'ok' if num > 0 else 'no'

    def context(self):
        """Download new videos and upload each of them (at most three)."""
        if self.download_vides() != 'ok':
            return

        self.names = self.names[:3]
        for file_name in self.names:
            # Append a few bytes so the file's checksum differs from the source.
            with open(file_name, 'ab') as f:
                f.write(b'xxoo')
            print('md5修改完成')
            self.set_cookie()
            self.upload(file_name)
            time.sleep(5)

    def set_cookie(self):
        """Open the publishing page, attach login cookies and reload it.

        Cookies saved in ``origins.txt`` are preferred; if the file cannot be
        read, parsed or applied, the module-level ``cookie_list`` is used.
        """
        # Local import keeps this block self-contained.
        import ast

        driver.get(self.UPLOAD_URL)
        driver.implicitly_wait(18)

        try:
            with open('origins.txt') as f:
                print('从文件导入cookie')
                # The file holds str(list-of-dicts); literal_eval parses it
                # without executing arbitrary code the way eval() would.
                saved_cookies = ast.literal_eval(f.read())
            for c in saved_cookies:
                driver.add_cookie(c)
        except Exception:
            for c in cookie_list:
                new = dict(c, **{
                    "domain": ".qq.com",
                    "expires": "",
                    'path': '/',
                    'httpOnly': False,
                    'HostOnly': False,
                    'Secure': False,
                })
                driver.add_cookie(new)

        # Reload so the page is rendered with the cookies in place.
        driver.get(self.UPLOAD_URL)
        driver.implicitly_wait(18)

    def _login_and_save_cookies(self):
        """Log in through the form and store the resulting cookies in origins.txt."""
        print('进入失败,可能cookie已经过期,重新模拟登陆')
        driver.delete_all_cookies()
        driver.get(self.UPLOAD_URL)
        driver.implicitly_wait(18)
        driver.find_element_by_xpath('//input[@name="email"]').send_keys('xx')
        driver.find_element_by_xpath('//input[@name="password"]').send_keys('oo')
        driver.find_element_by_xpath('//button[@class="btnLogin btn btn-primary"]').click()
        time.sleep(5)
        driver.get(self.UPLOAD_URL)
        cookies = driver.get_cookies()
        with open('origins.txt', 'w') as f:
            f.write(str(cookies))
        driver.implicitly_wait(18)

    def _wait_for_upload(self):
        """Poll the page once a minute until it reports the upload as finished."""
        num = 0
        while True:
            time.sleep(60)
            num += 1
            status = driver.find_element_by_xpath('//span[@class="text-title"]').get_attribute('textContent').strip()
            path = '载入进度.png'
            driver.save_screenshot(path)
            print('已经上传了第%d分钟' % num)
            if status == '共1个视频,已上传1个':
                print('ok')
                break

    def upload(self, file_name):
        """Upload *file_name* through the publishing page and publish it.

        Args:
            file_name: name of the video file; ``self.path`` is prepended
                before it is handed to the page's file input.

        If the upload status element is missing the method logs in again and
        retries; if polling the status fails it re-applies the cookies and
        retries. Both retries are recursive and unbounded.
        """
        file_name2 = self.path + file_name
        print(file_name2)
        driver.find_element_by_xpath('//input').send_keys(file_name2)
        path = '进入视频上传主页.png'
        driver.save_screenshot(path)
        print('进入视频上传主页截图保存成功')

        # Cookie login is tried first; fall back to the login form on failure.
        try:
            status = driver.find_element_by_xpath('//span[@class="text-title"]').get_attribute('textContent').strip()
            print(status)
        except Exception:
            self._login_and_save_cookies()
            self.upload(file_name)
            return

        # A re-login dialog may pop up while waiting for the upload to finish.
        try:
            self._wait_for_upload()
        except Exception:
            self.set_cookie()
            self.upload(file_name)
            return

        path = 'success.png'
        driver.save_screenshot(path)
        print('文件导入成功')
        # Fill in category, tags and cover, then publish (with retry handling).
        self.upload_error_handling(file_name)
        print('当前上传结束的文件名是{}'.format(file_name))

    def _publish(self):
        """Click the publish button, retrying a few times while it is unavailable."""
        num = 0
        while True:
            try:
                driver.find_element_by_xpath('//button[@action="publish"]').click()
                path = '上床成功.png'
                driver.save_screenshot(path)
                print('上传成功')
                break
            except Exception:
                time.sleep(5)
                num += 1
                if num > 3:
                    print('上传按钮等待超时,放弃上传')
                    break

    def upload_error_handling(self, file_name):
        """Complete the publishing form for an uploaded video, retrying on failure.

        When the category selector is missing the upload is retried up to
        three times; after that the video is abandoned. On success the
        category, tags and cover are set, the video is published and the
        local file is removed.

        Args:
            file_name: name of the video file being published.
        """
        try:
            command = driver.find_element_by_xpath('//span[contains(./text(),"请选择分类")]')
        except Exception:
            # Retry while below the threshold.
            self.upload_error_handling_num += 1
            if self.upload_error_handling_num < 4:
                print('文件上传出现异常,准备重试,重试第{}次'.format(self.upload_error_handling_num))
                driver.refresh()
                self.upload(file_name)
            else:
                # Threshold exceeded: reset the counter and give up on this video.
                self.upload_error_handling_num = 0
                print('上传异常次数过多,可能本条视频已经传过了,放弃。准备上传下一条......')
            return

        # The form loaded, so earlier failures must not count against the next video.
        self.upload_error_handling_num = 0

        driver.execute_script("arguments[0].scrollIntoView();", command)
        command.click()
        time.sleep(1)
        # Scroll to the category search box, type the category and confirm.
        command = driver.find_element_by_xpath('//div[@class="chosen-search"]/input')
        driver.execute_script("arguments[0].scrollIntoView();", command)
        command.send_keys('奇闻趣事')
        command.send_keys(Keys.ENTER)
        # Scroll to the bottom of the page.
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        time.sleep(10)
        # Pick tags (up to nine recommended ones).
        self.get_tag()
        # Open the cover picker.
        driver.find_element_by_xpath('//button[contains(./text(),"设置封面")]').click()
        time.sleep(10)
        if self.cover_upload(file_name) == 'ok':
            time.sleep(3)
            self._publish()
        else:
            print('封面问题,放弃上传')
        os.remove(file_name)

    def get_tag(self):
        """Select the recommended tags, or type default tags when none are offered."""
        first_tag_xpath = self.TAG_XPATH.format(1)
        single = driver.find_element_by_xpath(first_tag_xpath).get_attribute('textContent').strip()
        print(single)

        if single == '暂无':
            # No recommendations: scroll to the input and type the default tags.
            command = driver.find_element_by_xpath(self.TAG_INPUT_XPATH)
            driver.execute_script("arguments[0].scrollIntoView();", command)
            command.send_keys('英语')
            command.send_keys(Keys.ENTER)
            command.send_keys('奇趣')
            command.send_keys(Keys.ENTER)
            return

        # Scroll the tag area into view before clicking.
        command = driver.find_element_by_xpath(first_tag_xpath)
        driver.execute_script("arguments[0].scrollIntoView();", command)
        for position, missing_message, fallback_tag in self.RECOMMENDED_TAGS:
            try:
                driver.find_element_by_xpath(self.TAG_XPATH.format(position)).click()
            except Exception:
                print(missing_message)
                if fallback_tag is not None:
                    tag_input = driver.find_element_by_xpath(self.TAG_INPUT_XPATH)
                    tag_input.send_keys(fallback_tag)
                    tag_input.send_keys(Keys.ENTER)

    def emai(self, text, project_name):
        """Send an alert e-mail through the 163.com SMTP server.

        Args:
            text: failure reason placed in the message body.
            project_name: project label used in the ``From`` header.

        Delivery problems are printed, never raised, so the caller's own
        error handling is not interrupted.
        """
        new_time = time.strftime("%Y-%m-%d-%H:%M:%S")
        sender = 'shixiaolongfw@163.com'  # sending account
        receivers = '654921690@qq.com'  # recipient address

        message = MIMEText("""你好当前时间{},异常原因{}""".format(new_time, text))
        message['From'] = "{}项目".format(project_name)
        message['To'] = "<654921690@qq.com>"
        subject = '爬虫出现异常'
        message['Subject'] = Header(subject, 'utf-8')

        smtpObj = smtplib.SMTP()
        try:
            smtpObj.connect("smtp.163.com", 25)  # 25 is the SMTP port
            # SECURITY NOTE: the account password is hard-coded in source.
            smtpObj.login(sender, "shixiaolong22")
            smtpObj.sendmail(sender, receivers, message.as_string())
            print("邮件发送成功")
        except (smtplib.SMTPException, OSError) as e:
            # OSError covers socket-level failures raised by connect().
            print("Error: 无法发送邮件", e)
        finally:
            try:
                smtpObj.quit()
            except (smtplib.SMTPException, OSError):
                # Never connected or already dropped; just release the socket.
                smtpObj.close()

    def main(self):
        """Run one download-and-upload cycle every 24 hours.

        Any exception ends the loop and is reported by e-mail via ``emai``.
        """
        try:
            num = 0
            while True:
                # Hold off until the configured start time has passed.
                while int(time.time()) <= self.START_TIMESTAMP:
                    time.sleep(20)
                num += 1
                self.context()
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                print('第{}次更新完,当前时间是{}'.format(num, now))
                time.sleep(24*60*60)
        except Exception as e:
            self.emai(e, 'origins')

    def _save_cover_frame(self, file_name, image_path):
        """Write frame number 25*60 of the video to *image_path*.

        Returns:
            True when the frame was written, False when the video could not
            be opened or ended before that frame.
        """
        video = cv2.VideoCapture()
        try:
            if not video.open(file_name):
                print("can not open the video")
                return False
            count = 1
            while True:
                _, frame = video.read()
                if frame is None:
                    print('frame是None,读取视频失败')
                    return False
                if count == 25*60:
                    cv2.imwrite(image_path, frame)  # save the frame as an image
                    print('封面保存成功')
                    return True
                count += 1
        finally:
            video.release()

    def cover_upload(self, file_name):
        """Set the cover image for the video being published.

        A frame grabbed from the video with OpenCV is uploaded when possible;
        otherwise the method waits for a cover generated by the site and
        selects it. A video that cannot be opened takes the same fallback
        instead of terminating the process.

        Args:
            file_name: name of the local video file.

        Returns:
            ``'ok'`` when a cover was set, ``'no'`` when the generated cover
            did not become available in time.
        """
        path = 'cover_image_food.jpg'
        frame_saved = self._save_cover_frame(file_name, path)

        # NOTE(review): the image is written relative to the working directory
        # but uploaded from self.path — presumably both are the same directory.
        path = self.path + path
        print(path)

        if frame_saved:
            driver.find_element_by_xpath('//li[@data-id="upload"]/span').click()
            driver.find_element_by_xpath('//input[@class="webuploader-element-invisible"]').send_keys(path)
            time.sleep(5)
            driver.find_element_by_xpath('//button[contains(./text(),"确定")]').click()
            time.sleep(2)
            return 'ok'

        num = 0
        while True:
            try:
                command = driver.find_element_by_xpath('//div[@class="upload-cover-block"]/span/img[contains(./@src,"1.jpg")]')
                print('图片加载好了')
                driver.execute_script("arguments[0].scrollIntoView();", command)
                command.click()
                driver.find_element_by_xpath('//button[contains(./text(),"确定")]').click()
                return 'ok'
            except Exception:
                num += 1
                print('系统自动生成的封面图加载中.....')
                time.sleep(10)
                if num > 60:
                    print('图片加载超时,放弃上传')
                    return 'no'

if __name__ == '__main__':
    # Script entry point; uploads are currently capped at three videos.
    QIE().main()
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 214,444评论 6 496
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 91,421评论 3 389
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 160,036评论 0 349
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 57,363评论 1 288
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 66,460评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 50,502评论 1 292
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,511评论 3 412
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,280评论 0 270
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,736评论 1 307
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,014评论 2 328
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,190评论 1 342
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,848评论 5 338
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,531评论 3 322
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,159评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,411评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,067评论 2 365
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,078评论 2 352

推荐阅读更多精彩内容