近期接到一个任务:每天上网搜索热门新闻标题,发送给运营同事。按照之前负责此项工作的同事的交接说明,流程是每天上网搜索、复制、粘贴到本地,然后发邮件给同事。
内容不复杂,但是很无聊,所以就想到了偷懒
毕竟人类社会的进步就是偷懒来促进的。。
首先确定爬取目标:腾讯新闻首页 http://news.qq.com/ 。只需要两个频道的新闻标题:热门频道和娱乐频道(好像大家跟我一样也很八卦)。
选择爬虫库的时候先试了试urllib.request,发现这个库爬下来的内容是一堆乱码,看起来像被加密了(更可能是响应被压缩或者编码不对),还要自己再处理,奈何我怕麻烦,水平又不够,就转用requests + 美丽汤(BeautifulSoup)了。
首先,导入需要的库
import datetime as dt
import os
import smtplib  # sending mail
from datetime import datetime
from email.header import Header
from email.mime.application import MIMEApplication  # binary attachments
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText  # the email.* modules are all used to build the message

import pandas as pd  # convenient for exporting to Excel
import requests
from bs4 import BeautifulSoup
然后是爬虫的类
# Browser-like User-Agent sent with every page request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}


class Crawlnewsname(object):
    """Download the Tencent News front page and extract headline links.

    The CSS selectors used here depend on the page's current markup and
    may need updating if the site layout changes.
    """

    def __init__(self, url, timeout=10):
        """Fetch and parse ``url``.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.

        A failed download is printed rather than raised. In that case
        ``self.res`` and ``self.soup`` stay ``None`` and the extraction
        methods return empty lists.
        """
        self.url = url
        self.res = None
        self.soup = None
        try:
            self.res = requests.get(self.url, headers=headers, timeout=timeout)
            # utf-8 did not decode this page correctly; gbk did.
            self.res.encoding = 'gbk'
            self.soup = BeautifulSoup(self.res.text, 'html.parser')
        except requests.RequestException as e:
            print(e)

    def _section_news(self, section_css):
        """Return ``{'name', 'link'}`` dicts for the links of one section.

        Only the first element matching ``section_css`` is inspected.
        Anchors whose text is empty after stripping are skipped. An empty
        list is returned when the page was not loaded or the section is
        absent.
        """
        if self.soup is None:
            return []
        sections = self.soup.select(section_css)
        if not sections:
            return []
        result = []
        for anchor in sections[0].select('a[class="linkto"]'):
            title = anchor.text.strip()
            if title:
                result.append({'name': title, 'link': anchor.get('href')})
        return result

    def major_news(self):
        """Return the headline links of the "major" (top stories) section."""
        return self._section_news('div[class="item major"]')

    def ent_news(self):
        """Return the headline links of the "ent" (entertainment) section."""
        return self._section_news('div[class="item ent"]')
接下来是发送邮件
class Sendmail(object):
    """Send a plain-text email with one Excel attachment over SMTP/SSL."""

    def __init__(self, sender, recevers, server, port=465, password='*******'):
        """Store the connection settings.

        Args:
            sender: From address; also used as the SMTP login name.
            recevers: List of recipient addresses.
            server: SMTP host name.
            port: SMTP-over-SSL port, 465 by default.
            password: Mailbox authorization code used for login. The
                default is only a placeholder and must be replaced with
                a real value.
        """
        self.sender = sender
        self.recevers = recevers
        self.server = server
        self.port = port
        self.password = password

    def _build_message(self, title, text, path, name):
        """Assemble the multipart message: text body plus the attachment."""
        message = MIMEMultipart()
        message['From'] = self.sender
        # Header values must be a single string; addresses are comma-separated.
        message['To'] = ', '.join(self.recevers)
        message['Subject'] = Header(title, 'utf-8')
        message.attach(MIMEText(text, 'plain', 'utf-8'))

        with open(os.path.join(path, name), 'rb') as attachment_file:
            payload = attachment_file.read()
        # MIMEApplication base64-encodes the bytes and sets one Content-Type.
        att = MIMEApplication(
            payload,
            _subtype='vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        )
        # add_header quotes the file name and encodes non-ASCII names.
        att.add_header('Content-Disposition', 'attachment', filename=name)
        message.attach(att)
        return message

    def sendmail(self, title, text, path, name):
        """Send the message and report the outcome on stdout.

        Args:
            title: Subject line.
            text: Plain-text body.
            path: Directory containing the attachment.
            name: File name of the attachment inside ``path``.

        Returns:
            ``True`` if the message was handed to the server, ``False``
            if connecting, logging in, or sending failed.

        Raises:
            OSError: If the attachment file cannot be read.
        """
        message = self._build_message(title, text, path, name)
        try:
            # The context manager closes the connection even on failure.
            with smtplib.SMTP_SSL(self.server, self.port) as server:
                server.login(self.sender, self.password)
                server.sendmail(self.sender, self.recevers, message.as_string())
        except (smtplib.SMTPException, OSError) as e:
            print('邮件发送失败,错误消息如下:')
            print(e)
            return False
        print("邮件发送成功,请注意查收!")
        return True
再加上一个保存到excel文件的函数
def to_excel(df, path, result_time):
    """Write ``df`` to an Excel file named after ``result_time``.

    A ``date`` column holding ``result_time`` is added, every ``name``
    value is prefixed with a fixed tag, and the columns are ordered as
    ``date, catagory, name, link`` before the sheet is written without
    the index. The caller's DataFrame is left untouched.

    Args:
        df: DataFrame with at least a ``name`` column.
        path: Directory the workbook is written to.
        result_time: Date string used both as the ``date`` value and as
            the file name (``<result_time>.xlsx``).
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    sheet = df.copy()
    sheet['date'] = result_time
    sheet['name'] = '[阅读+0.01]' + sheet['name']
    sheet = sheet.reindex(columns=['date', 'catagory', 'name', 'link'])
    sheet.to_excel(os.path.join(path, result_time + '.xlsx'), index=False)
接下来
if __name__ == '__main__':
    # The output is labelled with the date three days from now (yyyy-mm-dd).
    target_day = datetime.now() + dt.timedelta(days=3)
    result_time = target_day.strftime('%Y-%m-%d')

    # Step 1: scrape both sections of the front page.
    print('正在爬取新闻标题...')
    crawler = Crawlnewsname('http://news.qq.com/')
    major_rows = crawler.major_news()
    ent_rows = crawler.ent_news()

    # Step 2: build one table, tagging each row with its section.
    print('正在整理爬取的数据...')
    major_df = pd.DataFrame(major_rows)
    major_df['catagory'] = 'major'
    ent_df = pd.DataFrame(ent_rows)
    ent_df['catagory'] = 'ent'
    # Fixed first row placed ahead of the scraped headlines.
    lead_df = pd.DataFrame(
        {'catagory': None, 'name': '您昨天的收益已到账,请点击查看', 'link': None},
        index=[0],
    )
    combine_df = pd.concat([lead_df, major_df, ent_df])

    # Step 3: save the table as an Excel workbook on the local disk.
    print('正在将爬取的数据输出到本地...')
    path = r'D:\日常数据\日常push'
    to_excel(combine_df, path, result_time)

    # Step 4: mail the workbook. Fill in real addresses before running.
    print('正在发送邮件')
    sender = '***@**.com'
    receivers = ['***@**.com', '***@**.com']
    server = 'smtp.exmail.qq.com'
    attachment_name = result_time + '.xlsx'
    title = '每日推送新闻标题%s' % result_time
    text = '你好,附件为%s日的新闻标题,请查收' % result_time
    mail = Sendmail(sender, receivers, server)
    mail.sendmail(title, text, path, attachment_name)
收到的邮件如下:
本地文件如下:
酱紫我就可以更懒了,哈哈哈