本文参考:Python爬虫(3):爬取豆瓣电影TOP250
9.23更新:修改了代码,将抓取的数据存入MongoDB数据库,再从数据库导出,结果同存入CSV文件。主要目的是练习将数据存入数据库,为以后抓取大量的数据做准备。
注意:要先启动MongoDB数据库的服务,再运行代码.
存在问题:使用MongoVUE查询数据有一些字段查了没显示出来
#coding=utf-8
import requests
from lxml import html
from pymongo import MongoClient
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" ) # Python 2 hack: force the process-wide default str<->unicode encoding to utf-8
client = MongoClient() ## connect to the local MongoDB server (default host/port); the mongod service must already be running
dbName = 'doban' ## database name -- NOTE(review): looks like a typo for 'douban' (the comment on the line below spells it 'douban'); confirm which database the data should land in
dbTable = 'top250' ## collection name (the SQL "table" equivalent)
tab = client[dbName][dbTable] # i.e. tab=client['douban']['top250'] -- MongoDB creates the database and collection lazily on first insert
def gethtml(url):
    """Scrape all 10 pages of the Douban movie Top 250 list.

    Parameters
    ----------
    url : str
        Base URL of the list (e.g. 'https://movie.douban.com/top250'),
        without a query string.

    Returns
    -------
    list
        One 7-element list per movie, in page order:
        [title, credits, date, country, genre, rate, critic_count].
    """
    info_list = []
    for page in range(10):
        # Pagination: each page shows 25 movies, selected via ?start=N.
        link = url + '%s%s%s' % ('?start=', page * 25, '&filter=')
        tree = html.fromstring(requests.get(link).content)
        for item in tree.xpath('//ol[@class="grid_view"]/li'):
            title = item.xpath('div[@class="item"]//span[@class="title"]/text()')[0]
            info = item.xpath('div[@class="item"]//div[@class="bd"]/p[1]/text()')
            # First text node: director/actor credits.  Strip spaces and
            # newlines (the raw text contains &nbsp;, which cannot be
            # encoded as gbk when printing on a Windows console).
            credits = info[0].replace(" ", "").replace("\n", "")
            # Second text node: "date / country / genre".  Clean and split
            # once instead of recomputing the same expression three times;
            # also avoid naming a variable `type` (shadows the builtin).
            meta = info[1].replace(" ", "").replace("\n", "").split('/')
            date, country, genre = meta[0], meta[1], meta[2]
            rate = item.xpath('div[@class="item"]//div[@class="star"]/span[@class="rating_num"]/text()')[0]
            critic = item.xpath('div[@class="item"]//div[@class="star"]/span[last()]/text()')[0]
            info_list.append([title, credits, date, country, genre, rate, critic])
    return info_list
def getData(data_list):
    """Store every scraped movie record as a document in MongoDB.

    Each element of data_list is a 7-element list; it is mapped onto
    named fields and inserted into the module-level `tab` collection.
    """
    fields = ('title', 'director', 'date', 'country', 'type',
              'rate', 'comment number')
    for row in data_list:
        tab.insert(dict(zip(fields, row)))
    print('爬取成功!')

URL = 'https://movie.douban.com/top250'
getData(gethtml(URL))
原文:
注意的地方
1,编码问题,修改系统编码为utf-8。
2,翻页问题,for循环构造链接实现翻页
3,xpath语法问题,跳级选取节点时要用//,否则选不到
4,replace函数和split函数的运用,达到选取目标文本的目的。
5,cmd默认编码为gbk,遇到html中的&nbsp;(不间断空格)会出错,打印时要注意 # print info_1.encode("gbk",'ignore'),
6,多目标信息以列表形式添加进列表,再存入csv文件,注意csv模块的使用方法
7,代码改进之处:相比原作者存入txt文件,我选择利用csv模块,将爬取的数据存入csv文件,用excel打开,以表格形式显示,看起来更加清晰明了。
#coding=utf-8
import requests
from lxml import html
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" ) # Python 2 hack: force the process-wide default str<->unicode encoding to utf-8
def gethtml(url):
    """Scrape all 10 pages of the Douban movie Top 250 list.

    Parameters
    ----------
    url : str
        Base URL of the list (e.g. 'https://movie.douban.com/top250'),
        without a query string.

    Returns
    -------
    list
        One 7-element list per movie, in page order:
        [title, credits, date, country, genre, rate, critic_count].
    """
    info_list = []
    for page in range(10):
        # Pagination: each page shows 25 movies, selected via ?start=N.
        link = url + '%s%s%s' % ('?start=', page * 25, '&filter=')
        tree = html.fromstring(requests.get(link).content)
        for item in tree.xpath('//ol[@class="grid_view"]/li'):
            title = item.xpath('div[@class="item"]//span[@class="title"]/text()')[0]
            info = item.xpath('div[@class="item"]//div[@class="bd"]/p[1]/text()')
            # First text node: director/actor credits.  Strip spaces and
            # newlines (the raw text contains &nbsp;, which cannot be
            # encoded as gbk when printing on a Windows console).
            credits = info[0].replace(" ", "").replace("\n", "")
            # Second text node: "date / country / genre".  Clean and split
            # once instead of recomputing the same expression three times;
            # also avoid naming a variable `type` (shadows the builtin).
            meta = info[1].replace(" ", "").replace("\n", "").split('/')
            date, country, genre = meta[0], meta[1], meta[2]
            rate = item.xpath('div[@class="item"]//div[@class="star"]/span[@class="rating_num"]/text()')[0]
            critic = item.xpath('div[@class="item"]//div[@class="star"]/span[last()]/text()')[0]
            info_list.append([title, credits, date, country, genre, rate, critic])
    return info_list
import csv
import codecs
def getCSV(data_list, file_name='test.csv'):
    """Write the scraped movie rows to a CSV file.

    Parameters
    ----------
    data_list : iterable of sequences
        Rows to write, one movie per row.
    file_name : str, optional
        Output path.  Defaults to 'test.csv' so existing callers are
        unaffected; the hard-coded name is now a parameter.
    """
    # 'wb' because Python 2's csv module expects a binary file object;
    # the `with` block guarantees the file is closed.
    with codecs.open(file_name, 'wb') as f:
        # writerows replaces the manual per-row loop.
        csv.writer(f).writerows(data_list)

URL = 'https://movie.douban.com/top250'
getCSV(gethtml(URL))
原作代码:
# coding:utf-8
import requests from lxml import html
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" ) # 设置系统默认编码
k = 1
for i in range(10):
url = 'https://movie.douban.com/top250?start={}&filter='.format(i*25)
con = requests.get(url).content
sel = html.fromstring(con)
# 所有的信息都在class属性为info的div标签里,可以先把这个节点取出来
for i in sel.xpath('//div[@class="info"]'):
# 影片名称
title = i.xpath('div[@class="hd"]/a/span[@class="title"]/text()')[0]
info = i.xpath('div[@class="bd"]/p[1]/text()')
# 导演演员信息
info_1 = info[0].replace(" ", "").replace("\n", "")
# 上映日期
date = info[1].replace(" ", "").replace("\n", "").split("/")[0]
# 制片国家
country = info[1].replace(" ", "").replace("\n", "").split("/")[1]
# 影片类型
geners = info[1].replace(" ", "").replace("\n", "").split("/")[2]
# 评分
rate = i.xpath('//span[@class="rating_num"]/text()')[0]
# 评论人数
comCount = i.xpath('//div[@class="star"]/span[4]/text()')[0]
# 打印结果看看
print "TOP%s" % str(k)
print title, info_1, rate, date, country, geners, comCount
# 写入文件
with open("top250.txt", "a") as f:
f.write("TOP%s\n影片名称:%s\n评分:%s %s\n上映日期:%s\n上映国家:%s\n%s\n" % (k, title, rate, comCount, date, country, info_1))
f.write("==========================\n")
k += 1