平时喜欢看篮球,所以经常逛虎扑,这几天写爬虫,所以就想写个爬虫爬一下虎扑帖子里面的图片。
一般来说,帖子里面图片是这样的:
打开网页源代码,看到图片在网页中的路径如下:
根据图片在网页中的存储路径,可以采用正则表达式、BeautifulSoup(bs4)或者 XPath 表达式来提取图片链接;此处采用正则表达式,表达式如下:
# Captures the image URL of an <img> tag whose source ends in "/format,webp".
# The printed listing showed ".?" where ".*?" was intended (the two asterisks
# were consumed as a markdown emphasis pair), which made the pattern unable to
# span the attributes between "<img " and the URL.
# NOTE(review): the "?" after the capture group is kept as printed, which makes
# the group optional; it may originally have been an escaped literal "\?" --
# confirm against a live page.
img1Pattern = re.compile(r'<img .*?(http://i.*?)?/format,webp')
全部代码如下,使用时请把存储路径(filepath)换成自己的本地路径:
import os
import re
import threading
import time
from urllib.request import urlopen, urlretrieve

import requests
# Base location used both for reading the saved listing pages and for writing
# the downloaded images. The value is a placeholder (it literally reads
# "local path") and must be replaced with a real directory. It is prepended to
# names by plain string concatenation, so it presumably needs to end with a
# path separator -- confirm when configuring.
filepath ="本地路径"
# 获取网页中的链接
def get_url():
    """Collect thread links from the locally saved listing pages.

    Reads the 21 files named ``filepath + '网页结构' + N + '.html'`` for
    N = 1..21 as UTF-8 text and extracts every ``href`` that points at a
    ``https://bbs.hupu.com/...html`` page.

    Returns:
        A list of thread URLs in file order, then in order of appearance;
        duplicates are kept.

    Raises:
        OSError: if one of the listing files cannot be opened.
    """
    # Compiled once instead of on every iteration; the pattern text is unchanged.
    link_pattern = re.compile('" href="(https://bbs.hupu.com/.*?.html)"')
    totalurl = []
    for i in range(1, 22):
        url = filepath + '网页结构' + str(i) + '.html'
        print(url)
        # Context manager so the handle is closed even if read() fails.
        with open(url, 'r', encoding='utf-8') as page:
            html = page.read()
        totalurl += link_pattern.findall(html)
        # The pages are read from disk, so no pause between reads is needed
        # (the earlier version slept 5 seconds per file).
    print(len(totalurl))
    return totalurl
# Patterns used by getImgUrls(). The published listing truncated these
# literals (everything after "re.compile('" was lost for the two image
# patterns, and the tags around the title pattern were stripped), so they are
# reconstructions.
# NOTE(review): the http pattern follows the expression quoted in the article
# text; the https variant and the <title> pattern are assumptions -- confirm
# against a live thread page.
_IMG_HTTP_PATTERN = re.compile(r'<img .*?(http://i.*?)?/format,webp')
_IMG_HTTPS_PATTERN = re.compile(r'<img .*?(https://i.*?)?/format,webp')
_TITLE_PATTERN = re.compile(r'<title>(.*?)</title>', re.S)
# Characters that are not safe inside a single directory name.
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _extract_image_urls(html):
    """Return the non-empty image URLs captured by both image patterns."""
    found = _IMG_HTTP_PATTERN.findall(html) + _IMG_HTTPS_PATTERN.findall(html)
    # The capture group is optional, so a match can yield an empty string.
    return [link for link in found if link]


def _thread_directory(html):
    """Return the download directory for a page, or None without a usable title."""
    match = _TITLE_PATTERN.search(html)
    if match is None:
        return None
    # The last three characters are dropped as in the earlier version.
    # NOTE(review): presumably this trims a site suffix -- confirm.
    name = _UNSAFE_NAME_CHARS.sub('_', match.group(1)[:-3]).strip()
    if not name:
        return None
    # Plain concatenation, matching how get_url() builds its paths.
    return filepath + name


def getImgUrls():
    """Download the images of every thread returned by ``get_url()``.

    Each thread page is fetched with ``requests``; its image URLs are
    extracted, a directory derived from the page title is created under
    ``filepath``, and the images are saved there as ``0.jpg``, ``1.jpg``, ...
    in order of successful download.

    This is best-effort: a failed page request, an unusable title, a directory
    that cannot be created, or a failed image download is printed and skipped,
    and processing continues with the next item.

    Unlike the earlier version, the last thread and the last image of each
    thread are no longer skipped, and the full page source is not printed.

    Returns:
        None.
    """
    for url in get_url():
        print(url)
        try:
            # A timeout keeps one stalled connection from hanging the run.
            html = requests.get(url, timeout=30).text
        except requests.RequestException as e:
            print(e)
            continue

        path = _thread_directory(html)
        if path is None:
            print('no usable title, skipped: ' + url)
            continue
        print(path)

        try:
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(path, exist_ok=True)
        except OSError as e:
            print(e)
            continue

        saved = 0
        for img in _extract_image_urls(html):
            imgPath = os.path.join(path, str(saved) + '.jpg')
            try:
                urlretrieve(img, imgPath)
            except (OSError, ValueError) as e:
                # URLError/HTTPError are OSError subclasses; ValueError covers
                # malformed URLs. One bad image must not abort the thread.
                print(e)
                continue
            saved += 1
# Run the downloader only when the file is executed as a script. The dunder
# names had lost their underscores, which made this line raise NameError.
if __name__ == '__main__':
    getImgUrls()