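"""Workflow for scraping images from a web page.

- Choose the URL to scrape, analyse the page structure, and locate the images.
- Fetch the page with requests, then feed the response text into
  BeautifulSoup to parse out the image addresses.
- Download the images.
"""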
import os

import requests
from bs4 import BeautifulSoup
# Article page whose images are scraped.
url = "http://www.dili360.com/cng/article/p5350c3d7d8c9236.htm"

# Proxy mapping handed to every requests call.
# NOTE(review): "ip:port" looks like a placeholder -- replace it with a real
# proxy address before running.
proxy = {
    "http": "http://ip:port",
    "https": "http://ip:port",
}

# Fetch the page and parse the returned HTML.
html = requests.get(url, proxies=proxy).text
soup = BeautifulSoup(html, features='html.parser')

# Collect every <div class="img"> element; the <img> tags are looked up inside them.
imgs = soup.find_all("div", {"class": "img"})

# open() does not create missing directories, so make sure the target exists.
os.makedirs('./image', exist_ok=True)

for container in imgs:
    # A regex over the container's markup would also work, e.g.
    # re.findall(r'http://.*?rw9', str(container))[0]
    for tag in container.find_all('img'):
        src = tag.get('src')
        if not src:
            # <img> without a src attribute: nothing to download.
            continue

        # File name is the last path segment with any "@..." suffix removed.
        img_name = src.split('/')[-1].split('@')[0]

        # Stream the body to disk in chunks instead of loading it into memory;
        # the context manager releases the connection when done.
        # (urllib.request.urlretrieve is an alternative way to download.)
        with requests.get(src, stream=True, proxies=proxy) as r:
            with open(f'./image/{img_name}', 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Saved {img_name}")