# - Using a proxy with requests: free proxies can be scraped from xicidaili.com
# - Saving downloaded files to disk
# - Using str.format for string formatting
# Bug fix: the original read "import requestsfrom bs4" / "import BeautifulSoup"
# (two statements fused into one line) — a syntax error and a wrong import form.
import requests
from bs4 import BeautifulSoup

# Free HTTP proxy (e.g. scraped from xicidaili.com); every request goes through it.
proxies = {'http' : '36.7.172.18:82'}
# Desktop-Chrome User-Agent so the site serves the regular page instead of blocking us.
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
def save_img(url):
    """Download the image at *url* through the module proxy and save it
    under ./images/, named after the second-to-last URL path segment.

    Prints a message and returns without writing anything when the
    server does not answer 200.
    """
    import os  # local import: only this function needs it

    r = requests.get(url, proxies=proxies, headers=headers, stream=True)
    if 200 != r.status_code:
        print('unable to open {}'.format(url))
        return
    # Bug fix: open() raises FileNotFoundError when ./images does not exist yet.
    os.makedirs('./images', exist_ok=True)
    filename = './images/{}.jpg'.format(url.split('/')[-2])
    with open(filename, 'wb') as f:
        # stream=True was requested, so write in chunks instead of r.content
        # (the original materialized the whole body, defeating streaming).
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
def get_pic_url(url):
    """Fetch *url* and return the ``src`` links of all thumbnail images.

    Returns an empty list when the page cannot be retrieved (non-200 status).
    """
    response = requests.get(url, proxies = proxies, headers = headers)
    if 200 != response.status_code:
        return []
    page = BeautifulSoup(response.text, 'lxml')
    # Every <img class="entry-thumbnail"> on the page carries one image link.
    return [tag['src'] for tag in page.select('img.entry-thumbnail')]
if __name__ == '__main__':
    # Crawl page 1 only (range(1, 2)); widen the range to fetch more pages.
    for page in range(1, 2):
        url = 'http://weheartit.com/inspirations/taylorswift?page={}'.format(page)
        # Bug fix: the original line ended in a stray ':' — a syntax error.
        imgs = get_pic_url(url)
        print('{} images are found in Page {}.'.format(len(imgs), page))
        for p in imgs:
            save_img(p)