简单地爬取一个网页的信息
1.需要设置好请求头(User-Agent),伪装成浏览器访问,防止被识别为爬虫
2.找好想要爬取的网站
3.找准想要爬取信息的位置
import requests
导入etree模块
from lxml import html
etree = html.etree
# Module-level accumulator: spider_guaishou_top12() appends one dict per
# scraped game (name, type, size, download link, cover-image URL).
game_list_info = []
def spider_guaishou_top12():
    """Scrape the game list from the apk8 topic page and save the cover images.

    For every ``div.sli_item`` entry found under ``div#zl`` the game's name,
    type, size, download link and cover-image URL are printed and appended as
    a dict to the module-level ``game_list_info`` list. Afterwards the cover
    image of every entry in ``game_list_info`` is downloaded and written to
    ``./imgs/<game_name>.jpg`` (only when the server answers with HTTP 200).

    Raises:
        IndexError: if an entry is missing one of the nodes the XPath
            expressions expect.
        requests.RequestException: on connection problems or timeouts.
    """
    import os  # function-scope import keeps this block self-contained

    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot hang the script indefinitely.
    request_timeout = 10

    # Browser-like User-Agent so the site does not reject the request as a bot.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36"}
    url = 'https://www.apk8.com/zt/zt_1604.html'

    # Fetch the raw bytes of the response and parse them into an element tree.
    # The tree is named `page` so it does not shadow the imported `html` module.
    data = requests.get(url, headers=headers, timeout=request_timeout).content
    page = etree.HTML(data)
    yemian_list = page.xpath('//div[@id="zl"]/div[1]/div[@class="sli_item"]')

    for game in yemian_list:
        # Game name
        game_name = game.xpath('./a[2]/text()')[0]
        print(game_name)

        # Game type
        game_type = game.xpath('./p[1]/a/text()')[0]
        print(game_type)

        # Game size: second text node of the first <p>, minus the '/ ' separator
        game_size = game.xpath('./p[1]/text()')[1].strip()
        game_size = game_size.replace('/ ', '')
        print(game_size)

        # Download link
        game_interlinkage = game.xpath('./p[@class="sli_btn"]/a/@href')[0]
        print('游戏下载链接:{}'.format(game_interlinkage))

        # Cover-image URL
        game_picture = game.xpath('./a[1]/img/@src')[0]
        print('游戏图片地址:{}'.format(game_picture))

        # Collect everything scraped for this game into one record.
        game_list_info.append({
            'game_name': game_name,
            'game_type': game_type,
            'game_size': game_size,
            'game_interlinkage': game_interlinkage,
            'game_picture': game_picture,
        })

    # Download the cover images into the local ./imgs folder.
    # Create the folder first; otherwise open() fails on a fresh checkout.
    os.makedirs('./imgs', exist_ok=True)
    for game in game_list_info:
        picture_url = game['game_picture']
        # Reuse the browser-like headers so image requests look like the page request.
        resp = requests.get(picture_url, headers=headers, timeout=request_timeout)
        if resp.status_code == 200:
            img_name = '{}.jpg'.format(game['game_name'])
            with open('./imgs/{}'.format(img_name), 'wb') as f:
                f.write(resp.content)
# Run the scraper at module level (executes whenever this file is run or imported).
spider_guaishou_top12()
这是爬取到的游戏的封面