# -*- coding: utf-8 -*-
# NOTE(review): the original first line was the bare statement "encoding=utf8",
# which raises NameError at import time ("utf8" is undefined); a PEP 263
# source-encoding comment was clearly intended, as written above.
import scrapy
import time
from scrapyLuntan.items import ScrapyluntanItem
import sys
# Python 2 only: force UTF-8 as the default codec for implicit str<->unicode
# conversions.  HACK: setdefaultencoding() is removed from sys by site.py,
# hence the reload() to get it back.  Not portable to Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")
# NOTE(review): this module-level list is never used in this file — possibly
# imported from elsewhere; confirm before removing.
lis = []
class LunTan(scrapy.Spider):
    """Spider for mop.com forum lists: follows each post link and yields an
    item carrying the post's publication time as a Unix timestamp."""

    # Unique identifier for this spider; every spider must use a distinct name.
    name = 'luntan'
    # Domain whitelist: requests outside these domains are dropped by the
    # offsite middleware.  FIX: the attribute was misspelled "allowd_domains"
    # (so Scrapy silently ignored it) and held a full URL instead of a bare
    # domain.  'mop.com' covers both www.mop.com and dzh2.mop.com.
    allowed_domains = ['mop.com']
    # Seed URLs: the first downloads start here; child requests are generated
    # from the responses.
    start_urls = ('http://www.mop.com/',)

    def parse(self, response):
        """Parse the list page: extract each post's link and schedule a
        detail-page request handled by ``res_detail``.

        :param response: Scrapy response for one of ``start_urls``.
        """
        li_list = response.xpath("//li[@class='mop-item-a']")
        print(len(li_list))
        for li in li_list:
            a_href = li.xpath("./a/@href")[0].extract()
            # Strip the JS-router prefix to recover the real detail-page URL.
            a_href = a_href.replace('http://dzh2.mop.com/dzh_index.html#rlink=', '')
            print(a_href)
            # Enqueue the detail page; carry the URL in meta so the callback
            # can pick the right layout to parse.
            yield scrapy.Request(a_href, meta={'a_href': a_href}, callback=self.res_detail)

    def res_detail(self, response):
        """Parse a post detail page and yield a ``ScrapyluntanItem`` whose
        ``time`` field is the post date as an int Unix timestamp.  Also
        appends the timestamp to ./time.txt.

        :param response: Scrapy response for one detail page.
        """
        item = ScrapyluntanItem()
        detail_url = response.meta['a_href']
        print(detail_url)
        # The two hosts use different page layouts for the post-date element.
        if 'http://dzh2.mop.com/' in detail_url:
            text = response.xpath("//div[@class='post-date fl mr15']/span/text()")[0].extract()
        else:
            text = response.xpath("//div[@class='mr20 inlineBlock']/span/text()")[0].extract()
        text = text.strip()
        print(text)
        # Normalize Chinese date markers (年/月/日) to "YYYY-MM-DD HH:MM:SS"
        # so strptime can parse it.
        if '年' in text:
            text = text.replace('年', '-').replace('月', '-').replace('日', '')
        time_array = time.strptime(text, "%Y-%m-%d %H:%M:%S")
        item['time'] = int(time.mktime(time_array))
        # FIX: use a context manager so the file handle is always closed
        # (the original opened a new handle per item and never closed it).
        with open('./time.txt', 'a+') as f:
            f.write(str(item['time']) + "\r\n")
        yield item