一直想用 Python 做爬虫爬些东西,看到一家网站上的书觉得不错,就开始动手写爬虫来爬了!
爬虫工作的第一步,是使用 `scrapy startproject BookSpider` 创建项目文件。
创建完成并修改后的文件目录:
`dmoz_spider.py` 中定义了用来爬取网站的 Spider:
class DmozSpider(scrapy.Spider):
    """Crawl the salttiger.com e-book category listing pages.

    Every listing page produces a single item whose ``title``, ``url`` and
    ``image`` fields are lists holding all matches found on that page, after
    which the pagination links under ``#nav-below`` are followed.
    """

    name = "dmoz"
    allowed_domains = ["salttiger.com"]
    start_urls = [
        "http://www.salttiger.com/category/ebooks/"
    ]

    def parse(self, response):
        """Extract the books on one listing page and queue the next pages.

        Yields one ``BookspiderItem`` for the page, followed by one
        ``Request`` per pagination link, each handled by this same callback.
        """
        item = BookspiderItem()

        title = response.xpath(".//*/header/h1/a/text()").extract()
        item['title'] = [t.encode('utf-8') for t in title]

        url = response.xpath(".//*/header/h1/a/@href").extract()
        item['url'] = [t.encode('utf-8') for t in url]

        # NOTE(review): the three XPath queries run independently, so the
        # lists are only index-aligned if every book node has all three
        # parts -- confirm against the site markup.
        image = response.xpath(".//*/div/p[1]/img/@src").extract()
        item['image'] = [t.encode('utf-8') for t in image]

        for t in title:
            print(t.encode('utf-8'))
        yield item

        urls = response.xpath(
            ".//*[@id='nav-below']/div/a[@class='page larger']/@href"
        ).extract()
        for ul in urls:
            # Log the link actually being followed rather than dumping the
            # whole list once per iteration.
            print(ul)
            # Resolve against the current page so a relative href still
            # produces a valid request; absolute hrefs pass through as-is.
            yield Request(response.urljoin(ul), callback=self.parse)
- 自定义 pipeline 文件,将数据写入 MongoDB 和本地:
# Name of the folder, created under the current working directory by
# get_abs_path, that holds one sub-folder per scraped title.
FILE_NAME = 'meizi_images'
class MongoPipline(object):
    """Item pipeline: download each item's images, then store it in MongoDB.

    The connection parameters (server, port, database and collection names)
    are read from the ``settings`` object under the ``MONGODB_*`` keys.
    """

    collection_name = 'scrapy_items'

    def __init__(self):
        """Open the MongoDB connection and keep a handle on the collection."""
        server = settings['MONGODB_SERVER']
        port = settings['MONGODB_PORT']
        client = pymongo.MongoClient(server, port)
        database = client[settings['MONGODB_DB']]
        self.collection = database[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """Save the item's images locally, insert the item, and return it.

        ``spider`` is accepted as part of the pipeline signature but is not
        used here.
        """
        # get_abs_path is a generator; save_to_folder pulls one target
        # folder from it per image.
        folders = get_abs_path(item)
        save_to_folder(item, folders)

        # NOTE(review): Collection.insert is the legacy PyMongo write call;
        # newer PyMongo releases expect insert_one -- confirm which version
        # this project pins before upgrading.
        document = dict(item)
        self.collection.insert(document)
        return item
def get_abs_path(item):
    """Lazily create and yield one download folder per entry in item['title'].

    The folders live under ``<current working directory>/FILE_NAME``. This
    is a generator: nothing is created until the first value is requested,
    and each title's folder is created right before it is yielded.

    Args:
        item: mapping whose ``'title'`` entry is an iterable of folder names.

    Yields:
        The absolute path of the folder for each title, in order.
    """
    base_dir = os.path.join(os.getcwd(), FILE_NAME)
    # Create the base folder up front so it exists even when there are no
    # titles to iterate over.
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    for title in item['title']:
        folder = os.path.join(base_dir, title)
        if not os.path.exists(folder):
            # makedirs (not mkdir): a title containing a path separator
            # maps to nested folders, and mkdir would fail because the
            # intermediate directory does not exist yet.
            os.makedirs(folder)
        yield folder
def save_to_folder(item, abs_path):
    """Download every image of *item* into the folders supplied by *abs_path*.

    The n-th URL in ``item['image']`` is saved as ``1.jpg`` inside the n-th
    folder produced by *abs_path*. The list of written file paths is stored
    in ``item['localImage']`` (an empty list when nothing was downloaded).

    Args:
        item: mapping with an ``'image'`` iterable of URLs; it is mutated to
            receive the ``'localImage'`` list.
        abs_path: an iterator (e.g. the generator returned by
            ``get_abs_path``) yielding one destination folder per image.
    """
    # Function-scope import so the block works on both Python 3
    # (urllib.request) and Python 2 (urllib) without touching file imports.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    img_name = '1.jpg'
    saved = []
    for url in item['image']:
        # The next() builtin works on Python 2 and 3, unlike the .next()
        # method. A default avoids an unhandled StopIteration when there
        # are more images than folders.
        folder = next(abs_path, None)
        if folder is None:
            break
        img_abs_path = os.path.join(folder, img_name)
        urlretrieve(url, img_abs_path)
        saved.append(img_abs_path)

    # Assigned once, after the loop, so the field is present even when the
    # item has no images.
    item['localImage'] = saved
完整的项目文件:github