coding=utf-8
import requests
from bs4 import BeautifulSoup
import re
import pymongo
import json
import pandas as pd
import numpy as np
import xlrd
import datetime
from pyecharts import Line,Grid,EffectScatter,Overlap
from pyecharts.echarts.axis import YAxisLabel
def onedayPriceSpider(spiderDay):
for m in range(1,4):
url = "http://www.hfzgncp.com.cn/index.php?m=content&c=index&a=lists&catid=59&sendtime="+str(spiderDay)+"&page="+m
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)' +\
'Gecko/20100101 Firefox/61.0'}
r = requests.get(url,headers=headers)
html = r.text.encode(encoding='utf_8').decode()
soup = BeautifulSoup(html,"lxml")
table = soup.find('table', attrs={'class' :{'h_list_table r_list_table'}})
#print(table)
#查看表格数据行数
height = len(table.findAll(lambda tag:tag.name=='tr' and
len(tag.findAll('td'))>=1))
if height <=0:
return
print('height:'+str(height))
#收集表头
columns = [x.text for x in table.tr.findAll('th')]
columns = [x.replace('\xa0','') for x in columns]
#print(columns)
#构造dataframe 准备存储表格
width = len(columns) #表头列数
#print('width:'+str(width))
df = pd.DataFrame(data = np.full((height,width),'',dtype = 'U'),columns = columns)
rows = [row for row in table.findAll('tr') if row.find('td')!=None]
#逐行分析表格
for i in range(len(rows)):
cells = rows[i].findAll('td')
if len(cells) == width:
df.iloc[i] = [cell.text.replace(' ','').replace('\n','') for cell in cells] #去点空格和换行
else:
w=len(cells)
df.iloc[i,width-w:] = [cell.text.replace(' ','').replace('\n','') for cell in cells]
#print(df)
sendtime = soup.find('input', attrs={'id' :{'sendtime'}})['value'].rstrip('/-') #获取数据时间
sendtimeStr=re.sub("\-","",sendtime)
#print(sendtimeStr)
outputfilePath="D:/xlsx/"+sendtimeStr+".xlsx"
df.to_excel(outputfilePath)
xlsx_data=xlrd.open_workbook(outputfilePath)
xlsx_table=xlsx_data.sheet_by_name(u'Sheet1')
columnIndex = None
rowIndex = None
for j in range(xlsx_table.ncols):
for i in range(height):
if(xlsx_table.cell_value(i, j) == '平均价'):
columnIndex = j
break
if(xlsx_table.cell_value(i, j) == '草鱼'):
rowIndex = i
break
print(xlsx_table.cell_value(rowIndex, columnIndex))
price_data.append(xlsx_table.cell_value(rowIndex, columnIndex))
price_date.append(sendtime)
#client= pymongo.MongoClient()
#获取一个数据库
#db=client.priceSpider
#创建 或获取一个集合,并在collection下新建books
#account=db.prcie
#data=xlrd.open_workbook("D:/"+sendtimeStr+".xlsx")
#table=data.sheets()[0]
#读取excel第一行数据作为存入mongodb的字段名
#rowstag=table.row_values(0)
#nrows=table.nrows
#print('-------------nrows----------------'+str(nrows))
#ncols=table.ncols #print rows
#returnData={}
#for i in range(1,nrows):
#将字段名和excel数据存储为字典形式,并转换为json格式
#returnData[i]=json.dumps(dict(zip(rowstag,table.row_values(i))))
#通过编解码还原数据
#returnData[i]=json.loads(returnData[i])
#print returnData[i]
#account.insert(returnData[i])
#return daySpider
def getColumnIndex(table, columnName):
columnIndex = None
for i in range(table.ncols):
if(table.cell_value(0, i) == columnName):
columnIndex = i
break
return columnIndex
def getPriceLine(price_date,price_data):
attr = price_date
v1 = price_data
line = Line('价格走势图')
yAxis: {
type: 'value',
min: 400,
max: 700
}
line.add('鲫鱼',attr,v1,is_smooth = True,mark_point = ['max','min'],mark_line=["average"],yaxis_formatter="元")
grid =Grid()
grid.add(line,grid_top="10%")
grid.render('./鲫鱼.html')
#es = EffectScatter()
#es.add('',attr,v1,effect_scale=8) #闪烁
#overlop = Overlap()
#overlop.add(line) #必须先添加line,在添加es
#overlop.add(es)
#overlop.render('./line-es01.html')
if name=="main":
price_data = []
price_date = []
spiderDaynumber =10
for i in range(spiderDaynumber):
spiderDay = datetime.date.today()- datetime.timedelta(days=i+1)
print(spiderDay)
onedayPriceSpider(spiderDay)
print(price_data)
print(price_date)
getPriceLine(price_date,price_data)
#xlsfile= 'F:/test_AutoTesting/TestCase/RunList.xlsx'
# table = readExcelDataByName(xlsfile, 'Sheet1')[0]
#获取第一行的值
# testcase_id = table.cell_value(1, getColumnIndex(table,'平均价'))