'''
import requests
import os
from lxml import etree
class Spider(object):
    """Crawl the Qidian "all books" listing and save chapters as text files.

    For every book linked from the listing page a directory named after the
    book is created in the current working directory, and the text of every
    chapter linked from the book page is appended to
    ``<book>/<chapter>.txt``.
    """

    # Listing page the crawl starts from.
    START_URL = "https://www.qidian.com/all"
    # Seconds to wait for the server; without a timeout requests.get() can
    # block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10
    # Characters that cannot appear in file names on Windows (a superset of
    # what POSIX forbids); each one is replaced by an underscore.
    _INVALID_NAME_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|', "_"))

    @staticmethod
    def _safe_name(title):
        """Return *title* converted to a usable file or directory name.

        Forbidden characters become underscores, runs of whitespace (including
        newlines picked up from the HTML) collapse to a single space, and an
        empty result falls back to ``"untitled"``.
        """
        cleaned = " ".join(title.translate(Spider._INVALID_NAME_CHARS).split())
        return cleaned or "untitled"

    @staticmethod
    def _absolute_url(href, base=START_URL):
        """Resolve *href* against *base* and return an absolute URL.

        Protocol-relative links ("//host/path") inherit the scheme of *base*,
        already-absolute links are returned unchanged, and relative paths are
        resolved against *base*.
        """
        # Local import so the class does not rely on a new file-level import.
        from urllib.parse import urljoin

        return urljoin(base, href)

    def _fetch_tree(self, url):
        """Download *url* and return the parsed lxml HTML tree."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # decode() with no arguments assumes the body is UTF-8.
        return etree.HTML(response.content.decode())

    def start_request(self):
        """Fetch the listing page and process every book found on it."""
        html = self._fetch_tree(self.START_URL)
        # Text and href of the <a> inside <h4> under <div class="book-mid-info">.
        book_titles = html.xpath('//div[@class="book-mid-info"]/h4/a/text()')
        book_links = html.xpath('//div[@class="book-mid-info"]/h4/a/@href')
        # zip pairs each title with the link found at the same position.
        for bigtit, bigsrc in zip(book_titles, book_links):
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(self._safe_name(bigtit), exist_ok=True)
            self.file_data(bigtit, bigsrc)

    def file_data(self, bigtit, bigsrc):
        """Fetch one book page and save every chapter linked from it.

        Args:
            bigtit: Book title; used as the output directory name.
            bigsrc: Link to the book page as found in the listing
                (protocol-relative or absolute).
        """
        book_url = self._absolute_url(bigsrc)
        html = self._fetch_tree(book_url)
        # Text and href of the <a> inside <li> under <ul class="cf">.
        chapter_titles = html.xpath('//ul[@class="cf"]/li/a/text()')
        chapter_links = html.xpath('//ul[@class="cf"]/li/a/@href')
        for littit, litsrc in zip(chapter_titles, chapter_links):
            # Resolve against the book page so relative links also work.
            self.finally_file(littit, self._absolute_url(litsrc, book_url), bigtit)

    def finally_file(self, littit, litsrc, bigtit):
        """Download one chapter and append its text to ``<book>/<chapter>.txt``.

        Args:
            littit: Chapter title; used as the file name (plus ".txt").
            litsrc: Link to the chapter page (protocol-relative or absolute).
            bigtit: Book title; names the directory the file is written to.
        """
        html = self._fetch_tree(self._absolute_url(litsrc))
        paragraphs = html.xpath('//div[@class="read-content j_readContent"]/p/text()')
        content = "\n".join(paragraphs)
        book_dir = self._safe_name(bigtit)
        # os.path.join picks the right separator for the current platform.
        file_name = os.path.join(book_dir, self._safe_name(littit) + ".txt")
        # Make the method usable on its own, not only via start_request().
        os.makedirs(book_dir, exist_ok=True)
        print("正在抓取文章" + file_name)
        # Text is appended in UTF-8; binary payloads would need mode "wb".
        with open(file_name, "a", encoding="utf-8") as f:
            f.write(content)
# Script driver: instantiate the crawler, then invoke its start_request
# method to begin the crawl.
spider = Spider()
spider.start_request()
'''