pyspider is a refreshingly simple framework: crawl results go straight into its resultdb, which you can browse from the web UI. Super convenient and practical.
Enough talk, let's get hands-on.
Goal of this project:
Use pyspider to crawl novels from the Dingdian novel site (顶点小说网) and store them in a local MySQL database.
Approach:
The logic is straightforward: first crawl the category URLs, follow each category to collect the book titles under it, then crawl each book's chapter list, and finally fetch each chapter's content, saving all the fields we need into the database.
Steps:
1. Start pyspider with the command pyspider all, then open the web UI (http://localhost:5000 by default)
2. Create a new project
3. Enter the Handler code
The key points here are response.save, which carries data from one callback to the next, and overriding on_result so that results get written to the local database:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-07-20 16:56:22
# Project: dingdian

import re

from pyspider.libs.base_handler import *
from pyspider.database.mysql.mysqldb import SQL


class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        # the site has 10 categories: class/1_1.html ... class/10_1.html
        baseurl = 'http://www.x23us.com/class/'
        suffix = '_1.html'
        for i in range(1, 11):
            url = baseurl + str(i) + suffix
            self.crawl(url, callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # the first/last page numbers sit in the .first / .last
        # elements of the pagination bar
        total_page_num = int(response.doc('.last').text())
        first = int(response.doc('.first').text())
        # derive the pagination prefix from the current URL so each
        # category pages through its own listing
        # (e.g. .../class/3_1.html -> .../class/3_)
        baseurl = re.sub(r'\d+\.html$', '', response.url)
        suffix = '.html'
        for index in range(first, total_page_num + 1):
            url = baseurl + str(index) + suffix
            self.crawl(url, callback=self.list_books, validate_cert=False)

    def list_books(self, response):
        # each book is one <tr>; skip header and blank rows
        for item in response.doc('tr').items():
            booktitle = item.find('.L').find('a').eq(1).text()
            if not booktitle:
                continue
            author = item.find('.C').eq(0).text()
            updatetime = item.find('.C').eq(1).text()
            status = item.find('.C').eq(2).text()
            latestchapter = item.find('.L').eq(1).text()  # parsed but not stored
            bookurl = item.find('.L').find('a').eq(1).attr('href')
            # pass the book metadata along to the next callback via `save`
            savedata = {'booktitle': booktitle, 'author': author,
                        'updatetime': updatetime, 'status': status}
            self.crawl(bookurl, callback=self.list_chapter,
                       save=savedata, validate_cert=False)

    def list_chapter(self, response):
        booktitle = response.save['booktitle']
        author = response.save['author']
        updatetime = response.save['updatetime']
        status = response.save['status']
        for item in response.doc('.L').items():
            chaptertitle = item.find('a').text()
            chapterurl = item.find('a').attr('href')
            savedata = {'booktitle': booktitle, 'author': author,
                        'updatetime': updatetime, 'status': status,
                        'chaptertitle': chaptertitle}
            self.crawl(chapterurl, callback=self.list_content,
                       save=savedata, validate_cert=False)

    @config(priority=2)
    def list_content(self, response):
        # the breadcrumb links (dt > a) carry the category name
        navlist = [item.text() for item in response.doc('dt > a').items()]
        category = navlist[1] if len(navlist) > 1 else ''
        booktitle = response.save['booktitle']
        author = response.save['author']
        updatetime = response.save['updatetime']
        status = response.save['status']
        chaptertitle = response.save['chaptertitle']
        content = ''
        for item in response.doc('#contents').items():
            content = item.text()
        return {
            "booktitle": booktitle,
            "author": author,
            "updatetime": updatetime,
            "status": status,
            "category": category,
            "chaptertitle": chaptertitle,
            "content": content,
        }

    def on_result(self, result):
        # called once per dict returned above; write it to MySQL
        if not result or not result.get('booktitle'):
            return
        sql = SQL()
        sql.replace('novel', **result)
The rest of the code is simple; the part worth spelling out is writing to the local database.
First create a mysqldb.py module under C:\Python3.5\Lib\site-packages\pyspider\database\mysql with the following content:
from six import itervalues
import pymysql


class SQL:
    username = 'root'
    password = ''
    database = 'dingdian'
    host = 'localhost'
    connection = None
    charset = 'utf8'
    placeholder = '%s'

    def __init__(self):
        self.connect()

    def escape(self, string):
        # back-quote identifiers (table / column names)
        return '`%s`' % string

    def connect(self):
        config = {'user': SQL.username, 'password': SQL.password,
                  'host': SQL.host, 'charset': SQL.charset}
        if SQL.database is not None:
            config['database'] = SQL.database
        try:
            # mysql.connector.connect(**config) would work here too
            SQL.connection = pymysql.connect(**config)
            return True
        except Exception as err:
            print('Something went wrong:', err)
            return False

    def replace(self, tablename=None, **values):
        if SQL.connection is None:
            print('Please connect first')
            return False
        tablename = self.escape(tablename)
        if values:
            _keys = ','.join(self.escape(k) for k in values)
            _values = ','.join([self.placeholder] * len(values))
            sql_query = 'REPLACE INTO %s (%s) VALUES (%s)' % (tablename, _keys, _values)
        else:
            # MySQL has no DEFAULT VALUES clause; use empty lists instead
            sql_query = 'REPLACE INTO %s () VALUES ()' % tablename
        cur = SQL.connection.cursor()
        try:
            if values:
                cur.execute(sql_query, list(itervalues(values)))
            else:
                cur.execute(sql_query)
            SQL.connection.commit()
            return True
        except Exception as err:
            print('An error occurred: {}'.format(err))
            return False
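
You can sanity-check the helper from a Python shell. This is a minimal sketch; it assumes MySQL is running locally, that the dingdian database and novel table described below already exist, and the field values are made up:

from pyspider.database.mysql.mysqldb import SQL

sql = SQL()  # connects on construction
# issues REPLACE INTO `novel` (`booktitle`, ...) VALUES (%s, ...)
ok = sql.replace('novel',
                 booktitle='Test Book', author='Test Author',
                 updatetime='2017-07-20', status='连载中',
                 category='玄幻', chaptertitle='Chapter 1',
                 content='...')
print(ok)  # True on success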
Then use phpMyAdmin (bundled with WAMP) to create a dingdian database with a novel table,
whose columns are id, booktitle, chaptertitle, category, author, status, content and updatetime.
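If you would rather create the schema from code than click through phpMyAdmin, here is a minimal sketch; the column types and the unique key are my own assumptions, so adjust them as you see fit:

import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS novel (
    id INT AUTO_INCREMENT PRIMARY KEY,
    booktitle VARCHAR(255),
    chaptertitle VARCHAR(255),
    category VARCHAR(64),
    author VARCHAR(128),
    status VARCHAR(32),
    content MEDIUMTEXT,
    updatetime VARCHAR(32),
    -- assumed unique key so REPLACE INTO updates a re-crawled
    -- chapter instead of inserting a duplicate row
    UNIQUE KEY uk_book_chapter (booktitle, chaptertitle)
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(user='root', password='', host='localhost', charset='utf8')
with conn.cursor() as cur:
    cur.execute('CREATE DATABASE IF NOT EXISTS dingdian DEFAULT CHARSET utf8')
    cur.execute('USE dingdian')
    cur.execute(DDL)
conn.commit()
conn.close()

Note that without a unique key, REPLACE INTO behaves like a plain INSERT, so re-crawled chapters would pile up as duplicate rows; that is why the sketch adds one.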
And with that you have a working crawler for the Dingdian novel site. Full code:
https://github.com/chenxiang2017/spidersamples/tree/master/dingdian/dingdianpyspider
Note that I connect to MySQL with pymysql here; if you don't have it yet, install it with pip install pymysql.
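To verify the install and the connection in one go, a small sketch using the same credentials as the SQL class above:

import pymysql

print(pymysql.VERSION)  # confirms the package is importable
conn = pymysql.connect(user='root', password='', host='localhost')
print('MySQL server:', conn.get_server_info())
conn.close()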