基于Vue+CI+py selenium爬虫框架前后端博客系统

Vue前端界面


image.png

image.png

image.png

尝鲜地址:http://blog.runsss.com
https://github.com/wangjiulian/vue-blog.git

CI框架后台界面


image.png

image.png

尝鲜地址:http://admin.blog.runsss.com/
测试账号 test 222222
apidoc地址: http://api.blog.runsss.com/apidoc/
git@github.com:wangjiulian/vue-blog-admin.git

selenium爬虫程序抓取头条数据
crawl.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from pyquery import PyQuery as pq

from bs4 import BeautifulSoup
import multiprocessing
from multiprocessing import Pool
import time
import requests
import json
import pymysql
import re
import random
from random import choice

# Module-level MySQL connection and the default HTTP headers shared by every helper.
# PyMySQL >= 1.0 removed positional connect() arguments, so keywords are required.
db = pymysql.connect(host='localhost', user='root', password='', database='blog')

# Desktop Chrome user agent so Toutiao's feed API serves the normal JSON payload.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

开始爬取数据(entry point: fetch one feed page and crawl the new articles)

def start(url):
    """Fetch one Toutiao feed page and crawl every article not yet stored.

    :param url: feed API URL (one of the category endpoints in main()).
    :return: number of rows added to the blog table by this run.
    """
    before = getCount()
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        feed = json.loads(res.text)
        if feed and 'data' in feed and len(feed['data']) > 0:
            pending = []
            for item in feed['data']:
                # Use a distinct name: the original rebound the `url` parameter here.
                detail_url = 'https://www.toutiao.com/group/' + item['group_id']
                if isExist(item['title']):
                    continue  # already crawled; skip duplicates
                pending.append(detail_url)
            prase_detail(pending)
    after = getCount()
    return after - before

爬取详情数据(crawl article detail pages with a process pool)

def prase_detail(urls):
    """Crawl article detail pages concurrently with a pool of 3 worker processes.

    Name is kept as-is (a typo for "parse_detail") so existing callers keep working.
    The original also bound an unused local `start = time.time()` that shadowed the
    module-level start() function; it has been removed.

    :param urls: list of article detail-page URLs.
    """
    pool = multiprocessing.Pool(processes=3)
    for url in urls:
        pool.apply_async(parase_html, (url,))
    pool.close()  # must be called before join(): no new tasks may be submitted
    pool.join()   # block until every worker process has finished

解析网页(render and parse one detail page with headless Chrome)

def parase_html(url):
    """Render one article page in headless Chrome, extract title/content/images,
    and persist the result via commit().

    Failures are deliberately swallowed: this is a best-effort crawl and one bad
    page must not kill the worker process.

    :param url: article detail-page URL.
    """
    browser = None
    try:
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')  # avoids the DevToolsActivePort error in containers
        chrome_options.add_argument('window-size=1920x3000')  # large viewport so every element renders/clicks
        chrome_options.add_argument('--disable-gpu')  # documented headless-mode workaround
        chrome_options.add_argument('--headless')
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(url)
        doc = BeautifulSoup(browser.page_source, 'lxml')
        title = doc.select('.article-title')[0].text
        content = doc.select('.article-content')[0]
        # Non-greedy wildcards: the original pattern used '.?' (at most one char)
        # so it could never capture a full image URL.
        pattern = re.compile(r'<img.*?src="(.*?)"', re.S)
        result = re.findall(pattern, str(content))
        imgs = ','.join(result) if result else ''
        commit(title, imgs, content)
    except Exception:
        pass  # best-effort: skip pages that fail to load or parse
    finally:
        # Guarded: if webdriver.Chrome() itself raised, `browser` would be unbound
        # and the original finally-block crashed with NameError.
        if browser is not None:
            browser.quit()  # quit() also terminates chromedriver; close() leaves it running

提交数据库(insert the crawled article into MySQL)

def commit(title, imgs, content):
    """Insert one crawled article into the blog table.

    user_id and blog_type are randomized to spread posts across the seeded
    demo accounts and categories.

    :return: True on success, False after rollback.
    """
    try:
        user_id = random.randint(1, 1700)
        blog_type = random.randint(1, 7)
        cursor = db.cursor()
        # Parameterized query: the original %-formatted SQL broke on any quote
        # character in title/content and was SQL-injectable.
        sql = ("INSERT INTO blog (user_id,blog_type,hot,type,title,imgs,content,create_time) "
               "VALUES (%s,%s,1,2,%s,%s,%s,%s)")
        cursor.execute(sql, (user_id, blog_type, title, imgs, str(content), int(time.time())))
        db.commit()
        return True
    except Exception:
        db.rollback()
        return False

查询当前爬取数量(count the rows crawled so far)

def getCount():
    """Return the current number of rows in the blog table."""
    cursor = db.cursor()
    cursor.execute(" SELECT COUNT(*) as num from blog ")
    (num,) = cursor.fetchone()
    return num

判断是否重复(check whether the title was already crawled)

def isExist(title):
    """Return True when an article with this exact title is already stored.

    :param title: article title to look up.
    """
    cursor = db.cursor()
    # Parameterized lookup: the original %-formatted SQL failed on titles
    # containing quotes and was SQL-injectable.
    row_count = cursor.execute("SELECT title FROM blog WHERE title=%s", (title,))
    return row_count > 0

def main():
    """Pick one Toutiao feed category at random, crawl it once, print new-row count."""
    categories = [
        'https://www.toutiao.com/api/pc/feed/?category=news_hot',
        'https://www.toutiao.com/api/pc/feed/?category=news_tech',
        'https://www.toutiao.com/api/pc/feed/?category=news_car',
        'https://www.toutiao.com/api/pc/feed/?category=news_finance',
    ]
    picked = choice(categories)
    print(start(picked))

# Script entry point. The published article had `if name == 'main':` because the
# blog's markdown renderer stripped the double underscores — that raises NameError.
if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Crude one-shot retry after a cooldown; a second failure propagates.
        time.sleep(30)
        print('重新启动')
        main()

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容

  • 渐变的面目拼图要我怎么拼? 我是疲乏了还是投降了? 不是不允许自己坠落, 我没有滴水不进的保护膜。 就是害怕变得面...
    闷热当乘凉阅读 4,356评论 0 13
  • 感觉自己有点神经衰弱,总是觉得手机响了;屋外有人走过;每次妈妈不声不响的进房间突然跟我说话,我都会被吓得半死!一整...
    章鱼的拥抱阅读 2,226评论 4 5
  • 夜莺2517阅读 127,762评论 1 9
  • 版本:ios 1.2.1 亮点: 1.app角标可以实时更新天气温度或选择空气质量,建议处女座就不要选了,不然老想...
    我就是沉沉阅读 6,976评论 1 6