没什么特别要说明的，主要是 pyquery 这个库比较好用，可以像操作 DOM 一样解析网页。
主要功能:
- 将爬取的网页先保存到本地,然后解析,避免重复请求。
- 将解析的结果保存到 MongoDB。
import os

import pymongo
import requests
from pyquery import PyQuery as pq
class Model(object):
    """
    Base class that gives its subclasses a readable ``repr``.

    The representation starts with the class name and then lists every
    instance attribute as ``name=(value)``, one attribute per line.
    """

    def __repr__(self):
        # Render each instance attribute as "key=(value)".
        fields = []
        for key, value in self.__dict__.items():
            fields.append('{}=({})'.format(key, value))
        body = '\n '.join(fields)
        return '\n<{} \n {}>'.format(type(self).__name__, body)
class Movie(Model):
    """
    Holds the information collected for a single movie.
    """

    def __init__(self):
        # Defaults are assigned in a fixed order; Model.__repr__ walks
        # __dict__, so this is also the order the attributes are printed in.
        defaults = (
            ('name', ''),
            ('score', 0),
            ('quote', ''),
            ('cover_url', ''),
            ('ranking', 0),
        )
        for attribute, value in defaults:
            setattr(self, attribute, value)
def cached_url(url):
    """
    Return the body of ``url`` as bytes, using an on-disk cache.

    The cache file is ``cached/<key>.html`` where ``<key>`` is everything
    after the first ``=`` in ``url`` (the whole URL when it contains no
    ``=``). On a cache hit the file content is returned and no request is
    made. Otherwise the page is downloaded, written to the cache file and
    returned.

    Raises:
        requests.HTTPError: the server answered with an error status. The
            response is not cached in that case, so a later call retries.
    """
    folder = 'cached'
    filename = url.split('=', 1)[-1] + '.html'
    path = os.path.join(folder, filename)
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return f.read()

    # Create the cache folder on first use.
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Each header gets its own single-line value; a header value must not
    # contain a newline.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
        ),
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8'
        ),
    }
    # The headers must be passed by keyword: the second positional argument
    # of requests.get is ``params`` (query string), not ``headers``.
    r = requests.get(url, headers=headers, timeout=30)
    # Do not store an error page in the cache.
    r.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r.content)
    return r.content
def movie_from_div(div):
    """
    Build a Movie from the element that holds one movie entry.
    """
    node = pq(div)

    # Pull every field out of the element first, then fill in the Movie.
    title = node('.title').text()
    rating = node('.rating_num').text()
    tagline = node('.inq').text()
    poster = node('img').attr('src')
    rank = node('.pic').find('em').text()

    movie = Movie()
    movie.name = title
    movie.score = rating
    movie.quote = tagline
    movie.cover_url = poster
    movie.ranking = rank
    return movie
def movies_from_url(url):
    """
    Load the page at ``url`` (through the cache) and return a list with one
    Movie for every ``.item`` element found on it.
    """
    document = pq(cached_url(url))
    movies = []
    # Each ``.item`` element is the parent node of one movie entry.
    for entry in document('.item'):
        movies.append(movie_from_div(entry))
    return movies
def download_image(url, file):
    """
    Download the image at ``url`` to ``img/<name>.jpg`` unless it is already
    there.

    ``<name>`` is the part of ``file`` before the first ``/`` with the
    surrounding whitespace removed, so a title such as ``"abc / ABC"`` is
    stored as ``abc.jpg``. Nothing is done when ``url`` is empty or ``None``.

    Raises:
        requests.HTTPError: the server answered with an error status; no
            file is written in that case.
    """
    if not url:
        # No cover URL was found for this movie; nothing to download.
        return

    folder = "img"
    # strip() removes the whitespace that precedes the "/" separator.
    name = file.split("/")[0].strip() + '.jpg'
    path = os.path.join(folder, name)
    if not os.path.exists(folder):
        os.makedirs(folder)
    if os.path.exists(path):
        return

    # Each header gets its own single-line value; a header value must not
    # contain a newline.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
        ),
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8'
        ),
    }
    # The headers must be passed by keyword: the second positional argument
    # of requests.get is ``params`` (query string), not ``headers``.
    r = requests.get(url, headers=headers, timeout=30)
    # Do not save an error page as an image.
    r.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r.content)
def savemovies(movies):
    """
    Insert the given movies into the ``movies`` collection of the
    ``DoubanMovies_db`` MongoDB database.

    ``movies`` is any iterable of objects with ``name``, ``score``,
    ``quote``, ``ranking`` and ``cover_url`` attributes. When it yields
    nothing, the function returns without connecting to MongoDB.
    """
    documents = [
        {
            'name': m.name,
            'score': m.score,
            'quote': m.quote,
            'ranking': m.ranking,
            'cover_url': m.cover_url,
        }
        for m in movies
    ]
    if not documents:
        # insert_many rejects an empty list, and there is nothing to store.
        return

    # The context manager closes the client once the insert has finished.
    with pymongo.MongoClient() as connection:
        connection.DoubanMovies_db.movies.insert_many(documents)
def main():
    """
    Walk the ten pages of the Douban top-250 list (start offsets 0 to 225 in
    steps of 25). For every page: parse its movies, save them to MongoDB,
    print them, and download each movie's cover image.
    """
    page_url = 'https://movie.douban.com/top250?start={}'
    for offset in range(0, 250, 25):
        movies = movies_from_url(page_url.format(offset))
        savemovies(movies)
        print('top250 movies', movies)
        for movie in movies:
            download_image(movie.cover_url, str(movie.name))


if __name__ == '__main__':
    main()