scrapy startproject example
tree
├── example
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── book_spider.py
│       ├── book_spider.pyc
│       ├── __init__.py
│       └── __init__.pyc
cd example
cd spiders
vim book_spider.py
#-*- coding: utf-8 -*-
import scrapy
class BooksSpider(scrapy.Spider):
    """Spider that scrapes book titles and prices from books.toscrape.com.

    Each listing page yields one ``{'name': ..., 'price': ...}`` dict per
    book, then the spider follows the pager's "next" link (if present)
    and parses that page with the same callback.
    """

    # Every spider has its own identifier; it is the name passed to
    # `scrapy crawl <name>`.
    name = "book"
    # Starting point(s) of the crawl; more than one URL may be listed.
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Extract the books on one listing page and follow pagination.

        Args:
            response: The downloaded listing page.

        Yields:
            A dict with keys ``'name'`` and ``'price'`` for each
            ``article.product_pod`` element on the page, followed by a
            ``scrapy.Request`` for the next page when a next link exists.
        """
        for book in response.css('article.product_pod'):
            # The title is read from the link's `title` attribute rather
            # than from its text.
            title = book.xpath('./h3/a/@title').extract_first()
            price = book.css('p.price_color::text').extract_first()
            yield {
                'name': title,
                'price': price,
            }

        # Pagination is handled once per page, after all items are emitted.
        next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
        if next_url:
            # The extracted href is passed through urljoin to build the
            # URL that is actually requested.
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
http://books.toscrape.com/ 这个网站可以用来练习爬虫
scrapy crawl book -o book.csv