A small scraping example using requests and BeautifulSoup. Straight to the code:
# coding:utf-8
import codecs

import requests
from bs4 import BeautifulSoup

URL = "https://movie.douban.com/top250"
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'}


def download_page(url):
    # Fetch the raw HTML, sending a browser-like User-Agent so the
    # request is not rejected as an obvious bot.
    data = requests.get(url, headers=HEADERS).content
    return data
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # While debugging, print(soup.prettify()) to inspect the fetched page.
    # Locate the movie list by its CSS class.
    movie_list_ol = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    # Walk through every movie entry on the page.
    for movie_li in movie_list_ol.find_all('li'):
        # Block holding the movie's headline details.
        detail = movie_li.find('div', attrs={'class': 'hd'})
        # The movie title.
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        movie_name_list.append(movie_name)
    # Look for the link to the next page; on the last page the 'next'
    # span has no <a>, so find('a') returns None.
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        # Join the relative href onto the base URL and keep crawling.
        return movie_name_list, URL + next_page['href']
    return movie_name_list, None
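
# Side note (not in the original script): BeautifulSoup also accepts CSS
# selectors, so the lookups above could equally be written as
# soup.select_one('ol.grid_view') and li.select_one('div.hd span.title').
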
def main():
    url = URL
    with codecs.open('movies.txt', 'w', encoding='utf-8') as fp:
        movies_all = []
        # Follow the "next page" link until parse_html returns None.
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            movies_all.extend(movies)
        # Write the collected titles to the file, numbered from 1.
        for index, movie in enumerate(movies_all, 1):
            fp.write('{index}.{movie}\n'.format(index=index, movie=movie))


if __name__ == '__main__':
    main()
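
The script above makes bare requests.get calls. As a rough sketch of possible hardening (my own suggestion, not part of the original), download_page could add a timeout, surface HTTP errors, and pause between pages so the crawl stays polite. It reuses HEADERS from the listing above; the 10-second timeout and 1-second delay are arbitrary choices:

import time

import requests


def download_page(url):
    # timeout keeps a stalled connection from blocking forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    # raise_for_status() turns HTTP error codes into exceptions
    # instead of letting an error page reach the parser.
    response.raise_for_status()
    # Sleep briefly between fetches to avoid hammering the server.
    time.sleep(1)
    return response.content

Dropping this in place of the original download_page leaves the rest of the script unchanged.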