# ===== File: movie.py =====
# -*- coding: utf-8 -*-
import scrapy
from six_video.items import SixVideoItem
class MovieSpider(scrapy.Spider):
    """Crawl the dytt8.net movie listing and emit one item per movie.

    ``parse`` handles the listing page: it fills the fields available there
    and requests each movie's detail page. ``parse_1`` handles the detail
    page and completes the item before yielding it to the pipelines.
    """

    name = 'movie'
    allowed_domains = ['dytt8.net']
    start_urls = ['http://www.dytt8.net/html/gndy/dyzz/index.html']

    def parse(self, response):
        """Parse the listing page and schedule one detail request per movie.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: A request for the movie's detail page. The
            partially filled item travels with it in
            ``meta["movie_item"]`` so that ``parse_1`` can complete it.
        """
        self.logger.debug("Parsing listing page %s", response.url)

        for movie in response.xpath("//table[@class='tbspan']"):
            item = SixVideoItem()
            # Only the name and the info text are available on the listing page.
            item["movie_name"] = movie.xpath(".//a[@class='ulink']/text()").extract_first()
            item["movie_info"] = movie.xpath(".//tr[last()]/td/text()").extract_first()

            href = movie.xpath(".//a[@class='ulink']/@href").extract_first()
            if not href:
                # extract_first() returns None when nothing matches. Building
                # the URL from None would raise TypeError and abort the whole
                # listing, so skip just this entry instead.
                self.logger.warning(
                    "Skipping listing entry without a detail link: %r",
                    item.get("movie_name"),
                )
                continue

            # The remaining fields live on the detail page. urljoin resolves
            # the (usually site-relative) href against the current page URL.
            yield scrapy.Request(
                url=response.urljoin(href),
                meta={"movie_item": item},
                callback=self.parse_1,
            )

    def parse_1(self, response):
        """Parse a movie detail page and finish the item started in ``parse``.

        Args:
            response: The downloaded detail page; ``response.meta`` holds the
                partially filled item under the key ``"movie_item"``.

        Yields:
            SixVideoItem: The item with ``image_url``, ``story_info`` and
            ``download_url`` filled in (each is None when nothing matched).
        """
        # Retrieve the item handed over by the listing-page callback.
        item = response.meta["movie_item"]
        item["image_url"] = response.xpath("//div[@id='Zoom']//img[1]/@src").extract_first()
        # string(.) flattens the paragraph to plain text, dropping nested tags.
        item["story_info"] = response.xpath("//div[@id='Zoom']//p").xpath("string(.)").extract_first()
        item["download_url"] = response.xpath("//td[@bgcolor='#fdfddf']/a/text()").extract_first()
        yield item
# ===== File: items.py =====
import scrapy
class SixVideoItem(scrapy.Item):
    """Container for the data scraped about a single movie."""

    # Fields filled from the listing page.
    movie_name = scrapy.Field()
    movie_info = scrapy.Field()

    # Fields filled from the movie's detail page.
    image_url = scrapy.Field()
    story_info = scrapy.Field()
    download_url = scrapy.Field()
# ===== File: pipelines.py =====
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import csv
import pymysql
class SixVideoPipeline(object):
    """Collect every scraped item and dump them to local JSON and CSV files.

    Items are buffered in memory while the spider runs and written in one go
    to ``movies.json`` and ``movies.csv`` (in the current working directory)
    when the spider closes.
    """

    # Column order shared by the CSV header and every CSV row.
    CSV_FIELDS = ("movie_name", "movie_info", "image_url", "story_info", "download_url")

    def open_spider(self, spider):
        """Open both output files and reset the in-memory buffers.

        Args:
            spider: The spider being opened (unused).

        Raises:
            OSError: If either output file cannot be opened.
        """
        self.movie_file = open("movies.json", "w", encoding="utf8")
        # Accumulates every item as a plain dict for the JSON dump.
        self.items = []
        try:
            # newline="" is required by the csv module; without it rows are
            # separated by blank lines on platforms that translate "\n".
            self.csv_file = open("movies.csv", "w", encoding="utf-8", newline="")
        except OSError:
            # Do not leak the JSON handle if the second open fails.
            self.movie_file.close()
            raise
        # Accumulates one list of column values per item for the CSV dump.
        self.csv_items = []
        print("开始处理数据")

    def process_item(self, item, spider):
        """Buffer one item for the JSON and CSV output.

        Args:
            item: The scraped item (any mapping accepted by ``dict()``).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged, for the next pipeline.
        """
        record = dict(item)
        self.items.append(record)
        # A field that was never set becomes None, i.e. an empty CSV cell,
        # instead of raising KeyError for the CSV output only.
        self.csv_items.append([record.get(field) for field in self.CSV_FIELDS])
        return item

    def close_spider(self, spider):
        """Write the buffered items to disk and close both files.

        Args:
            spider: The spider being closed (unused).
        """
        try:
            # ensure_ascii=False keeps non-ASCII text readable; the file is
            # opened as UTF-8, so it can be stored as-is.
            self.movie_file.write(json.dumps(self.items, ensure_ascii=False))

            writer = csv.writer(self.csv_file)
            writer.writerow(self.CSV_FIELDS)
            writer.writerows(self.csv_items)
        finally:
            # Close both handles even if writing fails, so buffered data is
            # flushed and no file descriptor is leaked.
            self.movie_file.close()
            self.csv_file.close()
        print("处理数据完毕")
class MysqlPipeline(object):
    """Item pipeline that inserts every scraped movie into a MySQL table.

    Connection parameters default to the values that used to be hard-coded.
    They can be overridden through the constructor or, when Scrapy builds the
    pipeline, through the ``MYSQL_HOST``, ``MYSQL_PORT``, ``MYSQL_USER``,
    ``MYSQL_PASSWORD`` and ``MYSQL_DBNAME`` project settings.

    NOTE(review): the default credentials are still embedded in source;
    prefer supplying real credentials through settings or the environment.
    """

    # Parameterized statement: the driver escapes each value, so quotes in
    # scraped text can neither break the statement nor inject SQL.
    # Presumably ``movies_table`` has an auto-increment id followed by five
    # text columns in this order -- verify against the schema.
    INSERT_SQL = "INSERT INTO movies_table VALUES (NULL,%s,%s,%s,%s,%s)"
    ITEM_FIELDS = ("movie_name", "movie_info", "image_url", "story_info", "download_url")

    def __init__(self, host='127.0.0.1', port=3306, user='root',
                 password='9998', dbname='dytt'):
        """Store the connection parameters; no connection is opened yet.

        Args:
            host: MySQL server host.
            port: MySQL server port.
            user: User name to log in as.
            password: Password for ``user``.
            dbname: Name of the database to use.
        """
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.dbname = dbname

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Every setting is optional; a missing one falls back to the same
        default used by ``__init__``.

        Args:
            crawler: The Scrapy crawler that is instantiating the pipeline.

        Returns:
            MysqlPipeline: A configured, not yet connected pipeline.
        """
        settings = crawler.settings
        return cls(
            host=settings.get("MYSQL_HOST", '127.0.0.1'),
            port=settings.getint("MYSQL_PORT", 3306),
            user=settings.get("MYSQL_USER", 'root'),
            password=settings.get("MYSQL_PASSWORD", '9998'),
            dbname=settings.get("MYSQL_DBNAME", 'dytt'),
        )

    def open_spider(self, spider):
        """Connect to the database and create the cursor used for inserts.

        Args:
            spider: The spider being opened (unused).
        """
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.dbname,
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into ``movies_table`` and commit.

        Fields that are unset or None are passed to the driver as None, so
        they are stored as SQL NULL rather than as the literal text 'None'.

        Args:
            item: The scraped item.
            spider: The spider that produced the item; its logger is used
                for a debug message.

        Returns:
            The same ``item``, unchanged, for the next pipeline.

        Raises:
            pymysql.MySQLError: If the insert or the commit fails; the
                transaction is rolled back before the error propagates.
        """
        params = tuple(item.get(field) for field in self.ITEM_FIELDS)
        spider.logger.debug("Inserting movie %r into MySQL", params[0])
        try:
            self.cursor.execute(self.INSERT_SQL, params)
            self.conn.commit()
        except pymysql.MySQLError:
            # Leave the connection in a clean state for the next item.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection.

        Args:
            spider: The spider being closed (unused).
        """
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor fails.
            self.conn.close()
# ===== File: settings.py =====
# Enabled item pipelines, listed in execution order. Scrapy runs pipelines in
# ascending order of the integer value, so the MySQL pipeline (299) receives
# each item before the JSON/CSV pipeline (300).
ITEM_PIPELINES = {
    'six_video.pipelines.MysqlPipeline': 299,
    'six_video.pipelines.SixVideoPipeline': 300,
}