# Simple crawler: scrapes fund info from eastmoney.com into a MySQL database. (简单爬虫,爬取基金信息,使用了mysql数据库)
import datetime
import uuid
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import pymysql
from bs4 import BeautifulSoup
# Establish the database connection.
# Connect straight to the `fund` schema instead of connecting to `mysql`
# and issuing a separate "USE fund".
# utf8mb4 covers the full Unicode range (MySQL's "utf8" is a 3-byte subset),
# which matters for Chinese fund names.
# NOTE(review): credentials are hard-coded; move to config/env for production.
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234',
                       db='fund', charset='utf8mb4')
cur = conn.cursor()
def getFundNumStr(num_str):
    """Left-pad a fund number with zeros to the standard 6-digit code width.

    Args:
        num_str: the fund number as a string, e.g. "419".

    Returns:
        The code padded with leading zeros to at least 6 characters,
        e.g. "000419". Strings already 6+ characters long are returned
        unchanged.
    """
    # str.zfill replaces the original manual prepend loop and lets us stop
    # shadowing the builtin `str` (the original parameter was named `str`).
    return num_str.zfill(6)
# Fetch the data
def getFundData(url):
    """Fetch a fund detail page and extract its name, estimated value and time.

    Args:
        url: fund detail page URL, e.g. "http://fund.eastmoney.com/000419.html".

    Returns:
        A list [name, value, time] on success, where
          name  -- fund name with the trailing "(code)" suffix stripped,
          value -- estimated net-value text from the "gz_gsz" element
                   ('--' when the site shows no estimate),
          time  -- quote time with the century prefixed (the page shows a
                   2-digit year inside parentheses), or '--' verbatim;
        None when the page cannot be fetched or lacks the expected elements.
    """
    # URLError also covers HTTPError (its subclass), so this handles both
    # HTTP error statuses and lower-level connection failures.  The `with`
    # block closes the response — the original leaked the connection.
    try:
        with urlopen(url) as response:
            bsObj = BeautifulSoup(response, "lxml")
    except URLError:
        return None
    # Extract the target data; any missing element means this is not a
    # standard fund page (e.g. an unused code), so report "no data".
    try:
        # Title looks like "FundName(code)" — keep only the name part.
        name = bsObj.find("div", {"class": "fundDetail-tit"}).div.get_text()
        name = name[:name.index("(")]
        value = bsObj.find("", {"id": "gz_gsz"}).get_text()
        time = bsObj.find("", {"id": "gz_gztime"}).get_text()
        if time != '--':
            # Page shows "(YY-MM-DD HH:MM)"; prefix "20" for a 4-digit year.
            time = "20" + time[time.index("(") + 1:time.index(")")]
        data = [name, value, time]
    except AttributeError:
        return None
    return data
# If this is a new fund, save its basic info.
def saveNewFundInfo(code, name):
    """Insert a fund_info row for `code` unless one already exists.

    Args:
        code: six-digit fund code string.
        name: fund display name.

    Side effects:
        May execute an INSERT through the module-level cursor `cur`;
        committing is left to the caller.
    """
    # (code,) is a proper one-element parameter tuple — the original passed
    # the bare string `(code)`, which pymysql happens to accept but is
    # fragile and misleading.
    cur.execute("SELECT * FROM fund_info WHERE code = %s", (code,))
    if cur.rowcount == 0:
        cur.execute("INSERT INTO fund_info (code,name) VALUES (%s, %s)",
                    (code, name))
try:
    # Sweep candidate fund codes; the scrape was (re)started from 419.
    # NOTE(review): this hammers the site with up to ~1M requests and no
    # delay — consider rate-limiting before running for real.
    for num in range(419, 1000000):
        fund_code = getFundNumStr(str(num))
        url = "http://fund.eastmoney.com/" + fund_code + ".html"
        data = getFundData(url)
        if data is not None:
            # `record_id` instead of `id`: avoid shadowing the builtin.
            record_id = str(uuid.uuid1()).replace("-", "")
            today = datetime.datetime.now().strftime('%Y-%m-%d')
            saveNewFundInfo(fund_code, data[0])
            # '--' means the site currently shows no estimated value.
            if data[1] != '--':
                cur.execute(
                    "INSERT INTO fund_day_data "
                    "(id,code,data,data_time,create_time,update_time) "
                    "VALUES (%s, %s,%s, %s, %s, %s)",
                    (record_id, fund_code, float(data[1]), data[2],
                     today, today))
            # Commit on the connection directly (clearer than going through
            # cur.connection, which is the same object).
            conn.commit()
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
finally:
    # Always release DB resources, even on Ctrl-C or a crash mid-sweep.
    cur.close()
    conn.close()