import os
import re
import wget
def get_url(fname, patt, charset=None):
    """Return the first regex match found on each line of a text file.

    Args:
        fname: Path of the text file to scan.
        patt: Regular expression (pattern string) to search for.
        charset: Encoding passed to ``open``; ``None`` uses the platform
            default.

    Returns:
        A list with the full matched text (``match.group()``) of the first
        match on every line that contains one, in file order. Lines without
        a match contribute nothing; later matches on the same line are not
        collected.
    """
    cpatt = re.compile(patt)  # compile once, reuse for every line
    patt_list = []
    # Read the file the caller asked for, not a module-level global.
    with open(fname, encoding=charset) as fobj:
        for line in fobj:
            m = cpatt.search(line)
            if m:
                patt_list.append(m.group())
    return patt_list
if __name__ == '__main__':
    # Download the Sogou home page once, then fetch every image URL found
    # in it into the same directory.
    dir_sogou = '/tmp/sogou'
    fname_sogou = os.path.join(dir_sogou, 'sogou.html')
    urlsogou = 'http://www.sogou.com'

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_sogou, exist_ok=True)

    # Reuse a previously downloaded copy of the page if one is present.
    if not os.path.exists(fname_sogou):
        wget.download(urlsogou, fname_sogou)

    # Raw string so that \w and \. reach the regex engine untouched.
    img_patt = r'(http|https)://[\w./-]+\.(jpg|jpeg|gif|png)'
    img_list = get_url(fname_sogou, img_patt)

    for img_url in img_list:
        try:
            wget.download(img_url, dir_sogou)
        except Exception as err:
            # Best effort: one failed image must not stop the rest, but
            # report it instead of hiding it. Catching Exception (not a
            # bare except) lets Ctrl-C and SystemExit still end the run.
            print('failed to download %s: %s' % (img_url, err))