# 使用XPath简单爬取起点网并将数据存入MySQL数据库
import requests#pip3 install requests
from lxml import etree#pip3 install lxml
import time
import re
import pymysql#pip3 install PyMySQL
def db(host='localhost', port=3306, user='root', database='数据库名',
       password='数据库密码', charset='utf8'):
    """Open and return a new PyMySQL connection.

    Every argument defaults to the value that used to be hard-coded here, so
    a plain ``db()`` call behaves exactly as before; callers can now override
    any setting instead of editing this function.

    Args:
        host: MySQL server host name.
        port: MySQL server TCP port.
        user: Account name used to log in.
        database: Name of the schema to select.
        password: Password for ``user``.
        charset: Connection character set.

    Returns:
        The connection object returned by ``pymysql.connect``. The caller
        owns it and is responsible for calling ``close()`` on it.

    Raises:
        Any exception raised by ``pymysql.connect`` propagates unchanged.
    """
    conn = pymysql.connect(
        host=host,
        port=port,
        user=user,
        database=database,
        password=password,
        charset=charset,
    )
    print('连接上数据库了')
    return conn
def _insert_row(conn, table, row):
    """Insert one row into ``table`` and report whether it succeeded.

    The keys of ``row`` are used as column names and its values as the
    column values. On success the transaction is committed; on failure the
    error is printed and the transaction is rolled back (best effort, the
    exception is not re-raised).

    Args:
        conn: An open DB-API style connection (``cursor``, ``commit`` and
            ``rollback`` methods are used).
        table: Name of the target table.
        row: Mapping of column name to value.

    Returns:
        ``True`` if the row was inserted and committed, ``False`` otherwise.
    """
    columns = list(row)
    # Identifiers cannot be bound as query parameters, so table and column
    # names are interpolated into the statement text. They must therefore
    # come from trusted code only; the values go through driver placeholders.
    sql = 'INSERT INTO {}({}) VALUE({})'.format(
        table,
        ','.join(columns),
        ','.join(['%s'] * len(columns)),
    )
    cursor = conn.cursor()
    try:
        cursor.execute(sql, [row[name] for name in columns])
        conn.commit()
    except Exception as exc:
        # Deliberately best-effort: one failed insert must not stop the
        # crawl. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        print('失败', exc)
        conn.rollback()
        return False
    finally:
        # Release the cursor whether or not the insert succeeded.
        cursor.close()
    return True


def insertdb0(conn, data0):
    """Insert ``data0`` (column name -> value) into the ``qidian1`` table.

    Returns ``True`` on success and ``False`` if the insert failed and was
    rolled back.
    """
    return _insert_row(conn, 'qidian1', data0)


def insertdb1(conn, data1):
    """Insert ``data1`` (column name -> value) into the ``info_parse`` table.

    Returns ``True`` on success and ``False`` if the insert failed and was
    rolled back.
    """
    return _insert_row(conn, 'info_parse', data1)


def insertdb2(conn, data2):
    """Insert ``data2`` (column name -> value) into the ``content_parse`` table.

    Returns ``True`` on success and ``False`` if the insert failed and was
    rolled back.
    """
    return _insert_row(conn, 'content_parse', data2)
def content_url(ctx1):
    """Build chapter-page URLs from ``ctx1`` and process each one.

    Every value in ``ctx1`` is prefixed with ``https:`` and passed to
    ``content_parse``; the keys are ignored.

    NOTE(review): the values are presumably protocol-relative links such as
    ``//host/path`` -- confirm against the caller.
    """
    for link in ctx1.values():
        content_parse('https:{}'.format(link))
# Captures the inner HTML of the chapter-text container. Compiled once at
# import time instead of on every call.
_READ_CONTENT_RE = re.compile(
    r'<div.*?class="read-content j_readContent">(.*?)</div>', re.S)


def content_parse(src, timeout=10):
    """Download a chapter page, extract its text and store it in MySQL.

    The text inside the ``read-content j_readContent`` container is pulled
    out with a regular expression, stripped of HTML tags, spaces and
    newlines, and inserted into the ``content_parse`` table via
    ``insertdb2``.

    Args:
        src: Absolute URL of the chapter page.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would block the crawl
            indefinitely.

    Returns:
        None. If the page does not contain the expected container, a message
        is printed and nothing is stored.

    Raises:
        Exceptions from ``requests.get`` and ``db`` propagate unchanged.
    """
    req_headers = {
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Host': 'read.qidian.com',
    }
    response = requests.get(src, headers=req_headers, timeout=timeout)

    match = _READ_CONTENT_RE.search(response.text)
    if match is None:
        # Layout changed or the request was blocked: skip this page rather
        # than abort the whole crawl with an IndexError.
        print('未找到正文内容,跳过: {}'.format(src))
        return

    # Drop HTML tags, spaces and newlines from the captured fragment.
    content2 = re.sub(r"<[^>]*>| |\n", "", match.group(1))
    data2 = {
        'content2': content2,
    }

    conn = db()
    try:
        insertdb2(conn, data2)
    finally:
        # A fresh connection is opened per page; close it so connections do
        # not pile up over a long crawl.
        conn.close()
def info_url(ctx):
    """Build book-info page URLs from ``ctx`` and process each one.

    Every value in ``ctx`` is prefixed with ``https:`` and passed to
    ``info_parse``; the keys are ignored.

    NOTE(review): the values are presumably protocol-relative links such as
    ``//host/path`` -- confirm against the caller.
    """
    for link in ctx.values():
        info_parse('https:{}'.format(link))
def info_parse(src, timeout=10):
    """Download a book-info page and crawl every chapter link listed on it.

    Chapter ``<li>`` items are selected with XPath from the first
    ``volume-wrap`` block. For each one, the link is stored in the
    ``info_parse`` table via ``insertdb1``, the crawler sleeps for two
    seconds, and the link is handed to ``content_url``.

    Args:
        src: Absolute URL of the book-info page.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would block the crawl
            indefinitely.

    Returns:
        None.

    Raises:
        Exceptions from ``requests.get``, ``etree.HTML`` and ``db``
        propagate unchanged.
    """
    req_headers = {
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Host': 'book.qidian.com',
    }
    response = requests.get(src, headers=req_headers, timeout=timeout)
    page = etree.HTML(response.text)
    chapters = page.xpath('//div[@class="volume-wrap"][1]/div[@class="volume"]/ul[@class="cf"]/li')

    for item in chapters:
        hrefs = item.xpath('./a/@href')
        if not hrefs:
            # An <li> without a link has nothing to crawl; skip it instead
            # of failing with an IndexError.
            continue
        href = hrefs[0]

        conn = db()
        try:
            insertdb1(conn, {'href': href})
        finally:
            # One connection per chapter is opened; close it so connections
            # do not accumulate while the chapter list is walked.
            conn.close()

        # Pause between chapters to throttle requests to the site.
        time.sleep(2)
        content_url({'href': href})
def qidian_index(timeout=10):
    """Crawl the Qidian "all books" listing page and every book on it.

    For each ``book-mid-info`` block the title, author, two category labels
    (joined with ``.``), the first paragraph text and the book link are
    extracted with XPath. The summary is stored in the ``qidian1`` table via
    ``insertdb0`` and the book link is then handed to ``info_url``.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would block the crawl
            indefinitely.

    Returns:
        None. Entries missing any expected field are reported and skipped.

    Raises:
        Exceptions from ``requests.get``, ``etree.HTML`` and ``db``
        propagate unchanged.
    """
    url = 'https://www.qidian.com/all'
    req_headers = {
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    response = requests.get(url, headers=req_headers, timeout=timeout)
    print(response.url)
    page = etree.HTML(response.text)

    for book in page.xpath('//div[@class="book-mid-info"]'):
        try:
            # Title
            title = book.xpath('./h4/a/text()')[0]
            # Author
            zuozhe = book.xpath('./p/a[1]/text()')[0]
            # Category and sub-category
            fenlei1 = book.xpath('./p/a[2]/text()')[0]
            fenlei2 = book.xpath('./p/a[3]/text()')[0]
            # Summary text
            content = book.xpath('./p/text()')[0].strip()
            # Link to the book-info page
            src_cover = book.xpath('./h4/a/@href')[0]
        except IndexError:
            # One malformed entry should not abort the whole listing.
            print('图书信息不完整,跳过')
            continue

        data0 = {
            'title': title,
            'zuozhe': zuozhe,
            'fenlei': '{}.{}'.format(fenlei1, fenlei2),
            'content': content,
        }

        conn = db()
        try:
            insertdb0(conn, data0)
        finally:
            # One connection per book is opened; close it so connections do
            # not accumulate during the crawl.
            conn.close()

        info_url({'src_cover': src_cover})
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    qidian_index()