参考内容:
续上一篇
8. Labeling of data part 1
本部分将 stock_price 和 sp500 的波动情况用百分比表示;代码中标有 #new 的行是本次新增的内容。
import logging
import os
import time
from datetime import datetime

import pandas as pd
# Root directory of the data set; Key_Stats reads the '_KeyStats' folder beneath it.
# Absolute path: cd into the data folder and run `pwd` to obtain it.
path = "/home/sum/share/Ubuntu_DeepLearning/intraQuarter"
def _extract_stat(source, gather):
    """Return the raw table-cell text that follows the ``gather`` label.

    Returns the string 'NoValue' when the label does not occur in ``source``.
    Raises IndexError when the label is present but no ``yfnc_tabledata1``
    cell follows it.
    """
    # The label may be closed by </td> or </th> and may be followed by a
    # newline, so split on "label:" first and on the value cell second.
    parts = source.split(gather + ':')
    if len(parts) < 2:
        return 'NoValue'
    return parts[1].split('<td class="yfnc_tabledata1">')[1].split('</td>')[0]


def _sp500_close(sp500_df, unix_time):
    """Return the "Adjusted Close" of the row dated on ``unix_time``'s day.

    Some snapshots were pulled on a weekend, for which the index has no row;
    in that case the day 259200 seconds (three days) earlier is used instead.
    Raises LookupError when neither day is present in ``sp500_df``.
    """
    for offset in (0, 259200):
        day = datetime.fromtimestamp(unix_time - offset).strftime('%Y-%m-%d')
        matches = sp500_df.loc[sp500_df.index == day, "Adjusted Close"]
        if len(matches) > 0:
            return float(matches.iloc[0])
    raise LookupError("no S&P 500 row for timestamp %s" % unix_time)


def Key_Stats(gather="Total Debt/Equity (mrq)", data_path=None,
              sp500_csv="YAHOO-INDEX_GSPC.csv"):
    """Collect one statistic per saved snapshot and label it with price moves.

    Every sub-directory of ``<data_path>/_KeyStats`` is treated as one ticker
    and every file in it as one HTML snapshot named ``%Y%m%d%H%M%S.html``.
    For each snapshot the ``gather`` statistic, the stock price and the
    S&P 500 "Adjusted Close" are extracted, together with the percentage
    change of both prices relative to the ticker's first usable snapshot.
    The result is written to a CSV file in the current working directory,
    named after ``gather`` with spaces, parentheses and slashes removed and
    '.8.csv' appended; the file name is printed before writing.

    Args:
        gather: Label of the statistic to extract from each snapshot.
        data_path: Directory containing '_KeyStats'. Defaults to the
            module-level ``path``.
        sp500_csv: CSV file of S&P 500 quotes; its first column is used as
            the date index and it must have an "Adjusted Close" column.

    Raises:
        ValueError: If a file name does not match ``%Y%m%d%H%M%S.html``.
        OSError: If the S&P 500 CSV or a snapshot file cannot be read.

    Snapshots whose statistic, stock price or S&P 500 value cannot be parsed
    are skipped (best effort) and reported at DEBUG level via ``logging``.
    """
    log = logging.getLogger(__name__)
    root = path if data_path is None else data_path
    statspath = root + '/_KeyStats'
    # os.walk order depends on the file system, so sort to keep the root
    # directory first and the tickers in a stable order.
    stock_list = sorted(x[0] for x in os.walk(statspath))
    # DataFrame.from_csv was removed from pandas; this is its documented
    # equivalent (first column as a parsed date index).
    sp500_df = pd.read_csv(sp500_csv, index_col=0, parse_dates=True)

    rows = []
    for each_dir in stock_list[1:]:  # element 0 is the _KeyStats root itself
        ticker = os.path.basename(each_dir)
        # Falsy sentinel: the baseline is (re)set by the first non-zero price.
        starting_stock_value = False
        starting_sp500_value = False
        # File names are timestamps, so sorting them makes the percentage
        # baseline the earliest snapshot instead of an arbitrary one.
        for file_name in sorted(os.listdir(each_dir)):
            date_stamp = datetime.strptime(file_name, '%Y%m%d%H%M%S.html')
            unix_time = time.mktime(date_stamp.timetuple())
            full_file_path = os.path.join(each_dir, file_name)
            with open(full_file_path, 'r') as handle:
                source = handle.read()
            try:
                value = _extract_stat(source, gather)
                sp500_value = _sp500_close(sp500_df, unix_time)
                stock_price = float(
                    source.split('</small><big><b>')[1].split('</b></big>')[0])
                if not starting_stock_value:
                    starting_stock_value = stock_price
                if not starting_sp500_value:
                    starting_sp500_value = sp500_value
                stock_p_change = ((stock_price - starting_stock_value)
                                  / starting_stock_value) * 100
                sp500_p_change = ((sp500_value - starting_sp500_value)
                                  / starting_sp500_value) * 100
                # float(value) comes last on purpose: a snapshot with a
                # usable price still sets the baseline even when its
                # statistic is missing or non-numeric.
                rows.append({'Date': date_stamp,
                             'Unix': unix_time,
                             'Ticker': ticker,
                             'DE Ratio': float(value),
                             'Price': stock_price,
                             'stock_p_change': stock_p_change,
                             'SP500': sp500_value,
                             'sp500_p_change': sp500_p_change})
            except (LookupError, TypeError, ValueError,
                    ZeroDivisionError) as exc:
                # Some snapshots lack the price or the statistic; skip them.
                log.debug("Skipping %s: %r", full_file_path, exc)

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in current pandas.
    df = pd.DataFrame(rows, columns=['Date',
                                     'Unix',
                                     'Ticker',
                                     'DE Ratio',
                                     'Price',
                                     'stock_p_change',
                                     'SP500',
                                     'sp500_p_change'])
    save = gather.translate(str.maketrans('', '', ' ()/')) + '.8.csv'
    print(save)
    df.to_csv(save)
# Build the CSV for the default statistic, "Total Debt/Equity (mrq)".
Key_Stats()