#! /usr/bin/python3
# -*- coding:UTF-8
import requests
from lxml import etree
# Base listing URL; the paging query string is appended per request.
url = 'https://movie.douban.com/top250?'
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}

OUTPUT_PATH = 'xiaoshuo.txt'
PAGE_COUNT = 10       # number of listing pages to fetch
PAGE_SIZE = 25        # step of the `start` query parameter between pages
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# One <li> per movie. Every field is then looked up *relative to its own
# <li>*, so an entry that lacks a field cannot shift the data of the entries
# that follow it (which is what happens with page-wide flat lists).
ITEM_XPATH = '//*[@id="content"]/div/div[1]/ol/li'
TITLE_XPATH = './div/div[2]/div[1]/a/span[1]/text()'
INFO_XPATH = './div/div[2]/div[2]/p[1]/text()'
SCORE_XPATH = './div/div[2]/div[2]/div/span[2]/text()'
QUOTE_XPATH = './div/div[2]/div[2]/p[2]/span/text()'


def first_text(node, path):
    """Return the first result of XPath *path* under *node*, or '' if none."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


def format_entry(title, info_parts, score, quote):
    """Build the output line for one movie.

    Args:
        title: Movie title text.
        info_parts: Text fragments of the info paragraph; each fragment is
            stripped and the results are concatenated without a separator.
        score: Rating text.
        quote: One-line quote text; may be '' when the entry has none.

    Returns:
        The formatted line, terminated by a newline.
    """
    info = ''.join(part.strip() for part in info_parts)
    return title + ' ' + info + ' 评分为:' + score + ' "' + quote + '"\n'


def main():
    """Fetch every listing page and write one line per movie to OUTPUT_PATH.

    Each line is also echoed to stdout, followed by a completion message.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    # 'w' creates the file when missing and truncates old content; the
    # previous 'r+' mode failed on a missing file and left stale trailing
    # bytes behind. The context manager closes the file even on error.
    with open(OUTPUT_PATH, 'w', encoding="utf-8") as out:
        for page in range(PAGE_COUNT):
            page_url = url + 'start=' + str(page * PAGE_SIZE) + '&filter='
            response = requests.get(page_url, headers=headers,
                                    timeout=REQUEST_TIMEOUT)
            # Fail loudly on an error response instead of parsing an error page.
            response.raise_for_status()
            html = etree.HTML(response.text)
            # Iterate over the entries actually present rather than assuming
            # a fixed count per page.
            for item in html.xpath(ITEM_XPATH):
                line = format_entry(
                    first_text(item, TITLE_XPATH),
                    item.xpath(INFO_XPATH),
                    first_text(item, SCORE_XPATH),
                    first_text(item, QUOTE_XPATH),
                )
                out.write(line)
                print(line)
    print('电影爬取完毕')


if __name__ == "__main__":
    main()