目标:获取来自每个国家的香水品牌名单
技术路线:urllib + re + MySQL(pymysql)
代码:
fragrantica.py
import urllib.request
import re
import pymysql
def getData(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page content as ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Set the request header.
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Kept from the original: installing the opener makes it the process-wide
    # default, so the urlopen() call below (and any later one) sends the header.
    urllib.request.install_opener(opener)
    # Fetch the page content; the ``with`` block closes the response even if
    # reading or decoding fails (the original never closed it).
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
def getLink(data):
    """Collect absolute URLs for the ``.html`` pages listed in ``<option>`` tags.

    Args:
        data: HTML text to scan.

    Returns:
        A list of strings, each ``https://www.fragrantica.asia`` followed by
        one matching option value, in document order. Empty when nothing
        matches.
    """
    # ``[^"\n]*`` keeps the capture inside a single attribute value, so an
    # earlier ``<option value="">`` can no longer swallow everything up to the
    # next ``.html"``; ``\.`` matches a literal dot instead of any character.
    option_values = re.findall(r'<option value="([^"\n]*\.html)"', data)
    # Build the full address for every relative link.
    return ['https://www.fragrantica.asia' + value for value in option_values]
def _parse_brand(anchor):
    """Return ``[html, logo, company]`` for one anchor snippet, or ``None`` if a part is missing."""
    link = re.search(r'href="(.*?\.html)"', anchor)      # brand page address
    logo = re.search(r'src="(.*?\.jpg)"', anchor)        # brand logo
    name = re.search(r'\.html">(.*?) <br', anchor)       # brand name
    if not (link and logo and name):
        return None
    return ['https://www.fragrantica.asia' + link.group(1), logo.group(1), name.group(1)]


def getHtml(data):
    """Extract one brand record ``[html, logo, company]`` from a page.

    Every ``<a href="...jpg"`` snippet in *data* is inspected; snippets that
    lack a link, a logo or a name are skipped instead of raising
    ``IndexError``.

    NOTE(review): the indentation of the original was lost. It is assumed that
    the ``return`` sat after the loop, so the record of the LAST usable snippet
    is returned. Confirm whether the first snippet was intended instead.

    Args:
        data: HTML text of the page.

    Returns:
        ``[html, logo, company]``: absolute brand URL, logo address as found
        in the page, and brand name.

    Raises:
        ValueError: If *data* contains no usable brand snippet (the original
            failed here with an unbound-variable error).
    """
    record = None
    for anchor in re.findall(r'<a href=".*?jpg"', data):
        parsed = _parse_brand(anchor)
        if parsed is not None:
            record = parsed
    if record is None:
        raise ValueError('no brand entry found in page data')
    return record
def getInsert(c_list):
    """Insert one brand record into the ``fgt`` table.

    Args:
        c_list: Sequence whose first three items are, in order, the brand page
            URL (``html``), the logo address (``logo``) and the brand name
            (``company``).

    Raises:
        IndexError: If *c_list* has fewer than three items.
        pymysql.MySQLError: If connecting or executing the statement fails.
    """
    # Read the record first so a malformed one fails before a connection is opened.
    html = c_list[0]
    logo = c_list[1]
    company = c_list[2]
    # Insert statement; values are passed as parameters, never formatted in.
    sql = 'insert into fgt (html,company,logo) VALUES (%s, %s, %s)'
    # Create the connection.
    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='123456',db='db', charset="utf8")
    try:
        # Create the cursor; the ``with`` block closes it even on error.
        with conn.cursor() as cur:
            cur.execute(sql, (html, company, logo))
        # PyMySQL does not autocommit by default; without this the row is
        # discarded when the connection closes.
        conn.commit()
    finally:
        conn.close()
def main(start_url='https://www.fragrantica.asia/country/China.html'):
    """Crawl every country page linked from *start_url* and store one brand each.

    The start page is fetched with ``getData``, its ``<option>`` links are
    collected with ``getLink``, and for every linked page the record returned
    by ``getHtml`` is written with ``getInsert`` and then printed.

    NOTE(review): the indentation of the original was lost. It is assumed that
    the fetch, insert and print statements all belonged to the loop body.

    Args:
        start_url: Page whose ``<option>`` list names the country pages.
            Defaults to the address the original hard-coded.
    """
    start_page = getData(start_url)
    for country_url in getLink(start_page):
        record = getHtml(getData(country_url))
        getInsert(record)
        print(record)
# Script entry point: starts the crawl as soon as this module is executed.
# NOTE(review): there is no ``if __name__ == "__main__":`` guard, so importing
# this module also runs the crawl.
main()