# Crawler that scrapes the Douban Top 250 movie list and saves the results to a MongoDB database.
import requests
from lxml import etree
import pymongo
import time
class DouBan:
    """Scraper that walks the Douban Top 250 listing and stores each movie in MongoDB."""

    # Listing URL used when the caller passes an empty ``url``.
    DEFAULT_URL = 'https://movie.douban.com/top250'
    # The listing is paginated through the ``start`` query parameter.
    PAGE_SIZE = 25
    PAGE_COUNT = 10
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10
    # Pause between page requests, in seconds.
    PAGE_DELAY = 0.1
    # NOTE(review): Douban is reported to reject the default python-requests
    # User-Agent, so a browser-like one is sent instead -- confirm against
    # the live site.
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0 Safari/537.36'
        ),
    }

    def getUrl(self, url, collection=None):
        """Fetch every page of the listing and insert one document per movie.

        Args:
            url: Base URL of the listing (without paging parameters). Any
                query string is dropped before the paging parameters are
                appended. An empty value falls back to ``DEFAULT_URL``.
            collection: Object with an ``insert_one`` method (for example a
                pymongo collection) that receives the documents. When
                omitted, the module-level ``data1`` collection is used.

        Returns:
            None. Each stored document has the keys ``'影名'`` (title) and
            ``'链接'`` (link).

        Any exception is printed and stops the crawl; documents inserted
        before the failure are kept.
        """
        try:
            # Resolved inside the ``try`` so a missing global ``data1`` is
            # reported like any other failure instead of propagating.
            target = data1 if collection is None else collection
            base_url = (url or self.DEFAULT_URL).split('?', 1)[0]

            for page in range(self.PAGE_COUNT):
                page_url = '{}?start={}&filter='.format(
                    base_url, page * self.PAGE_SIZE)
                response = requests.get(
                    page_url,
                    headers=self.HEADERS,
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                response.encoding = response.apparent_encoding
                tree = etree.HTML(response.text)

                # Read title and link from the same <a> element so the two
                # values can never be paired with the wrong movie.
                for anchor in tree.xpath('//div[@class="hd"]/a'):
                    title_parts = anchor.xpath(
                        './span[@class="title"][1]//text()')
                    link = anchor.get('href')
                    if not title_parts or link is None:
                        continue
                    target.insert_one(
                        {'影名': ''.join(title_parts), '链接': link})

                time.sleep(self.PAGE_DELAY)
        except Exception as e:
            print(e)
if __name__ == '__main__':
    # Connect to the local MongoDB server. The ``with`` block closes the
    # client once the crawl finishes, even if it raises.
    with pymongo.MongoClient('localhost', 27017) as client:
        data = client['douban']
        # ``data1`` must stay a module-level name: DouBan.getUrl reads it
        # as a global when inserting documents.
        data1 = data['db']
        url = 'https://movie.douban.com/top250'
        douban = DouBan()
        douban.getUrl(url)