一、Mean-Variance-Optimization模型
均值方差模型:由于国内股市不允许卖空,这里主要讨论卖空限制下的均值方差模型(给定期望收益率下的风险最小化),它等价于一个含有不等式约束(权重非负)的二次规划问题。这类问题需要用数值优化算法求解;这里没有自己编写优化过程,而是直接使用 Python 的凸优化包 cvxopt。
理论基础方面主要是一些数学推导,涉及矩阵求导和运筹学,这里mark几篇推导比较详细的文章
知乎大神@丹尼尔的两篇文章:Mean-Variance Optimization 1 - 矩阵微分,Mean-Variance Optimization 2-优化过程
MIT课件:Portfolio Theory
cvxopt:cvxopt求解二次规划
二、选股流程
1.数据获取、导入和清洗
数据清洗主要包括:脏数据去除,如分隔行;日期筛选;复牌股票停牌期间数据填充
股票数据来源:通达信,PC端——系统——数据导出——高级导出,添加品种,选择前复权,设置导出目录。这里股票池选择的是上证A股中的1200只。
数据导入:
def data_read(file,sn):
    """Load the first ``sn`` stock files and stack them into one DataFrame.

    Each stock is read from ``<path>/<code>.txt`` (GBK-encoded, no header
    row) using the module-level ``path`` and ``names``. A ``股票代码``
    column holding the file stem is added to every frame so that rows stay
    attributable after concatenation.

    Args:
        file: list of stock codes (file names without extension).
        sn: number of leading entries of ``file`` to load.

    Returns:
        DataFrame containing the rows of all loaded files, re-indexed 0..n-1.
    """
    import os  # local import: joins the path with the platform's separator

    frames = []
    for code in file[:sn]:
        frame = pd.read_csv(os.path.join(path, code + '.txt'),
                            index_col=None, encoding='gbk', names=names)
        # Tag every row with its stock code (direct assignment replaces the
        # former "fill a None column" two-step).
        frame['股票代码'] = code
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)
查看数据集情况,发现数据集时间跨度比较大,同时含有类似分隔行的无效信息(例如内容为“数据来源:通达信”的行)
# Quick inspection of the loaded data; the return values are not assigned.
# file[:sn]
data.head()  # first rows of the combined data set
data.tail()  # last rows of the combined data set
# data.股票代码.value_counts()
# data.describe()
剔除分隔行
# Rows whose date field holds the '数据来源:通达信' marker are not price data.
separator_rows = data.index[data.日期 == '数据来源:通达信']
df = data.drop(index=separator_rows)
筛选2018年1月到2020年6月的股票数据
# Parse the date column, then keep 2018-01-01 (inclusive) to 2020-07-01 (exclusive).
df['日期'] = pd.to_datetime(df['日期'])
on_or_after_start = df.日期 >= '2018-01-01'
before_end = df.日期 < '2020-07-01'
df = df[on_or_after_start & before_end]
月度收益计算:从日度数据中选出每只股票每月的第一个交易日和最后一个交易日
# Group the rows by (year, month, stock code). `.groups` maps each key to the
# index LABELS of its rows, in order of appearance.
monthly_groups = list(df.groupby([df.日期.dt.year, df.日期.dt.month, df.股票代码.values]).groups.values())
month_end_list = [int(each[-1]) for each in monthly_groups]    # label of each group's last row
month_start_list = [int(each[0]) for each in monthly_groups]   # label of each group's first row
# Select by label with .loc: after dropping separator rows and filtering by
# date the index has gaps, so labels are no longer valid positions for .iloc.
# NOTE(review): assumes the index labels are unique integers — confirm upstream.
df1 = df.loc[month_end_list,:].copy().set_index(['日期']) # month-end rows
df2 = df.loc[month_start_list,:].copy().set_index(['日期']) # month-start rows
补全复牌股票停牌期间数据,从通达信下载的股票数据不包含停牌期间的信息
# Insert rows for months that are missing for a stock and fill them from
# neighbouring months. A stock with fewer rows than the number of months in
# the 2018-01..2020-06 window is treated as having gaps.
expected_months = (2020-2018)*12 + 6
dl1 = []
dl2 = []
for f in file[:sn]:
    dft1 = df1.loc[df1.股票代码==f,:]
    if len(dft1) < expected_months:
        # Offset objects replace the 'BM' string alias, which newer pandas
        # versions deprecate.
        dft1 = dft1.resample(pd.offsets.BMonthEnd()).asfreq()
        # Forward-fill everything but the first row, then back-fill everything
        # but the last row. The results are re-assembled explicitly because an
        # inplace fill on `dft1.iloc[...]` only changes a temporary object.
        dft1 = pd.concat([dft1.iloc[:1], dft1.iloc[1:].ffill()])
        dft1 = pd.concat([dft1.iloc[:-1].bfill(), dft1.iloc[-1:]])
    dl1.append(dft1)
    dft2 = df2.loc[df2.股票代码==f,:]
    if len(dft2) < expected_months:
        dft2 = dft2.resample(pd.offsets.BMonthBegin()).asfreq()
        dft2 = pd.concat([dft2.iloc[:1], dft2.iloc[1:].ffill()])
        dft2 = pd.concat([dft2.iloc[:-1].bfill(), dft2.iloc[-1:]])
    dl2.append(dft2)
df1 = pd.concat(dl1,ignore_index=False)
df2 = pd.concat(dl2,ignore_index=False)
由于实际选股时需要每月更改股票池,而不是像我做的这样一次性选连续两年半的股票数据作分析,所以实际上还要剔除当月的停牌股票(不能买入)和新股(无历史数据,不适用于MVO的选股范围)。
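计算月度收益率,公式为 $r_{i,t} = \dfrac{P^{\mathrm{close}}_{i,t}}{P^{\mathrm{open}}_{i,t}} - 1$,其中 $P^{\mathrm{close}}_{i,t}$ 为股票 $i$ 在第 $t$ 个月最后一个交易日的收盘价,$P^{\mathrm{open}}_{i,t}$ 为该月第一个交易日的开盘价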
# Monthly return per stock: month-end close divided by month-start open, minus 1.
data_dict = {}
for f in file[:sn]:
    month_end_close = df1.loc[df1.股票代码==f,['收盘价']].values
    month_start_open = df2.loc[df2.股票代码==f,['开盘价']].values
    data_dict[f] = (month_end_close / month_start_open - 1).flatten()
# One column per stock, one row per month.
Return = pd.DataFrame(data_dict)
上述流程封装
def data_read(file,sn):
    """Load, clean and screen the first ``sn`` stock files.

    Each stock is read from ``<path>/<code>.txt`` (GBK-encoded, no header
    row) using the module-level ``path`` and ``names``. For every file the
    function drops the '数据来源:通达信' marker rows, parses the date column,
    and keeps the stock only if it has a row dated 2018-01-02, 2018-01-31
    and 2020-06-30. Kept stocks are tagged with a ``股票代码`` column and
    restricted to 2018-01-01 <= date < 2020-07-01.

    Args:
        file: list of stock codes (file names without extension).
        sn: number of leading entries of ``file`` to consider.

    Returns:
        tuple ``(data, fl)``: ``data`` is the concatenation of all kept
        stocks with a fresh 0..n-1 index, ``fl`` is the list of kept codes
        in the order they were read.
    """
    import os  # local import: joins the path with the platform's separator

    required_dates = ('2018-01-02', '2018-01-31', '2020-06-30')
    dl = []
    fl = []
    for f in file[:sn]:
        df = pd.read_csv(os.path.join(path, f + '.txt'),
                         index_col=None, encoding='gbk', names=names)
        # Remove the separator rows before parsing dates.
        df = df.drop(index=df.index[df.日期 == '数据来源:通达信'])
        df['日期'] = pd.to_datetime(df['日期'])
        # The back-test window must be covered: skip stocks that lack a row
        # on any of the required dates.
        if not all((df.日期 == day).any() for day in required_dates):
            continue
        df['股票代码'] = f
        df = df[(df.日期 >= '2018-01-01') & (df.日期 < '2020-07-01')]  # date window
        dl.append(df)
        fl.append(f)
    return pd.concat(dl,ignore_index=True),fl
def data_cleaning(df,long,fl):
    """Build month-start rows, month-end rows and the monthly return matrix.

    Args:
        df: DataFrame of daily rows with at least the columns ``日期``
            (datetime), ``开盘价``, ``收盘价`` and ``股票代码``. It is not
            modified; a re-indexed copy is used internally.
        long: ``[start_time, end_time]`` time range. Accepted for interface
            compatibility; the month count below is hard-coded to the
            2018-01..2020-06 window and does not consult this argument.
        fl: list of stock codes to process (the screened stock pool).

    Returns:
        dict with keys ``'df1'`` (month-end rows indexed by date), ``'df2'``
        (month-start rows indexed by date) and ``'Return'`` (DataFrame with
        one column per code in ``fl``; each value is month-end close divided
        by month-start open, minus 1).
    """
    # Number of months a gap-free stock has in the hard-coded window.
    expected_months = (2020 - 2018) * 12 + 6

    # Work on a 0..n-1 indexed copy so that the group labels below are also
    # valid row positions, without touching the caller's frame.
    df = df.reset_index(drop=True)

    # First and last row of every (year, month, stock) group.
    monthly_groups = list(df.groupby([df.日期.dt.year, df.日期.dt.month, df.股票代码.values]).groups.values())
    month_end_list = [int(each[-1]) for each in monthly_groups]
    month_start_list = [int(each[0]) for each in monthly_groups]
    df1 = df.iloc[month_end_list,:].set_index(['日期'])    # month end
    df2 = df.iloc[month_start_list,:].set_index(['日期'])  # month start

    # Fill in months that are missing for a stock. Iterate over the `fl`
    # argument (not module-level state) so the function is self-contained.
    month_end_rule = pd.offsets.BMonthEnd()
    month_start_rule = pd.offsets.BMonthBegin()
    dl1 = []
    dl2 = []
    for f in fl:
        dl1.append(_complete_monthly_rows(df1.loc[df1.股票代码==f,:], month_end_rule, expected_months))
        dl2.append(_complete_monthly_rows(df2.loc[df2.股票代码==f,:], month_start_rule, expected_months))
    df1 = pd.concat(dl1,ignore_index=False)
    df2 = pd.concat(dl2,ignore_index=False)

    # Monthly return per stock.
    data_dict = {f:(df1.loc[df1.股票代码==f,['收盘价']].values / df2.loc[df2.股票代码==f,['开盘价']].values - 1).flatten() for f in fl}
    Return = pd.DataFrame(data_dict)
    return {'df1':df1,'df2':df2,'Return':Return}


def _complete_monthly_rows(frame, rule, expected_months):
    """Insert rows for missing months of one stock and fill them from neighbours.

    Frames that already have ``expected_months`` rows are returned unchanged.
    Otherwise the frame is re-sampled onto ``rule`` (a pandas date offset),
    rows after the first are back-filled, and rows before the last are then
    forward-filled.
    """
    if len(frame) >= expected_months:
        return frame
    # NOTE(review): asfreq keeps only rows whose date falls exactly on the
    # offset's dates — confirm this matches the trading calendar in use.
    frame = frame.resample(rule).asfreq()
    if len(frame) <= 2:
        return frame  # no interior rows to fill
    # Re-assemble explicitly: an inplace fill on `frame.iloc[...]` would only
    # change a temporary object and leave `frame` untouched.
    frame = pd.concat([frame.iloc[:1], frame.iloc[1:].bfill()])
    frame = pd.concat([frame.iloc[:-1].ffill(), frame.iloc[-1:]])
    return frame
2.最优化问题求解
权重计算(非负约束二次规划求解)
def weight_opt(R,Cov,R0):
    """Solve the long-only minimum-variance problem for a target return.

    The quadratic program handed to ``solvers.qp`` has ``Cov`` as its
    quadratic term and a zero linear term, subject to ``-w <= 0``
    (non-negative weights), ``R' w = R0`` and ``sum(w) = 1``.
    The solver's status field is not inspected.

    Args:
        R: expected return vector, n x 1.
        Cov: covariance matrix of returns, n x n.
        R0: target expected return of the portfolio.

    Returns:
        array holding the solver's ``'x'`` entry (the stock weights).
    """
    n = len(R)

    # Objective terms.
    quadratic_term = matrix(Cov)
    linear_term = matrix(np.zeros((n, 1)))

    # Inequality block: -I w <= 0.
    ineq_lhs = matrix(np.diag(-np.ones(n)))
    ineq_rhs = matrix(np.zeros((n, 1)))

    # Equality block: first row fixes the return, second row the budget.
    eq_lhs = matrix(np.concatenate((R.T, np.ones((1, n))), axis=0))
    eq_rhs = matrix(np.array([[R0],[1]]))

    solution = solvers.qp(quadratic_term, linear_term, ineq_lhs, ineq_rhs, eq_lhs, eq_rhs)
    return np.array(solution['x'])
3.策略回测和可视化
策略回测,用过去11个月的月度数据训练出的最优权重在当月验证,train:test = 11:1.
# 回测函数
def looking_back(train,test,z=11,tr=0.003,re=0.05):
    """Back-test the optimised portfolio against an equal-weight benchmark.

    For every training window the weights from ``weight_opt`` are pruned
    (weights not above ``re`` become 0) and rescaled to sum to 1, then
    applied to the matching entry of ``test``.

    Args:
        train: list of dicts, each with ``'R'`` (expected return vector) and
            ``'Return'`` (object providing ``.cov().values``).
        test: list of return vectors, one per training window.
        z: number of look-back months. Kept for interface compatibility;
            not used inside this function.
        tr: turnover fee; every cumulative return is multiplied by ``1 - tr``.
        re: weight threshold below which a stock is dropped.

    Returns:
        tuple ``(opt_weight, opt_return, equal_return, total_opt_return,
        total_equal_return)`` of lists.

    Raises:
        ZeroDivisionError: if no weight of a window exceeds ``re``.
    """
    from itertools import accumulate

    opt_weight = [
        _prune_and_renormalize(weight_opt(x['R'], x['Return'].cov().values, x['R'].mean()), re)
        for x in train
    ]
    # `.item()` extracts the single value; calling float() on a 1-element
    # array is deprecated in recent NumPy releases.
    opt_return = [np.dot(w.T, r).item() for w, r in zip(opt_weight, test)]
    equal_return = [np.dot(1/len(r)*np.ones(len(r)), r).item() for r in test]
    # Running sums in a single pass instead of re-summing every prefix.
    total_opt_return = [running * (1-tr) for running in accumulate(opt_return)]
    total_equal_return = [running * (1-tr) for running in accumulate(equal_return)]
    return opt_weight,opt_return,equal_return,total_opt_return,total_equal_return


def _prune_and_renormalize(weight, re):
    """Zero every weight not above ``re`` and rescale the rest to sum to 1."""
    kept = [w if w > re else 0 for w in np.asarray(weight, dtype=float).ravel().tolist()]
    total = sum(kept)
    if total == 0:
        raise ZeroDivisionError(f'no weight exceeds the pruning threshold re={re}')
    return np.array([w / total for w in kept]).reshape(np.shape(weight))
绘制时间——累计收益率趋势图
# 绘图
def pict(m='opt',opt_return=None,total_opt_return=None,mtkl_return=None,total_mtkl_return=None):
    """Plot cumulative return over time for one strategy plus the benchmark.

    Reads ``z``, ``equal_return`` and ``total_equal_return`` from module
    scope in addition to its arguments.

    Args:
        m: ``'mtkl'`` plots the ``mtkl`` series in red; any other value plots
            the mean-variance (``mvo``) series in blue.
        opt_return: per-period returns of the mvo strategy (only its length
            is used, for the x labels).
        total_opt_return: cumulative returns of the mvo strategy.
        mtkl_return: per-period returns of the mtkl strategy (only its length
            is used).
        total_mtkl_return: cumulative returns of the mtkl strategy.

    Returns:
        None. The figure is drawn on the current matplotlib figure.
    """
    def month_labels(count):
        # Label i is the month i+z positions after January 2018.
        return [f'20{18+int((i+z)/12)}-{(i+z)%12+1}' for i in range(count)]

    plt.figure(figsize=(10,8))
    plt.grid(True)
    plt.xticks(rotation=45)
    if m == 'mtkl':
        plt.plot(month_labels(len(mtkl_return)),total_mtkl_return,c='r',marker='o',linestyle='-',label='mtkl')
    else:
        plt.plot(month_labels(len(opt_return)),total_opt_return,c='b',marker='o',linestyle='-',label='mvo')
    # Equal-weight benchmark, plotted at integer positions 0..n-1.
    # NOTE(review): assumed to be drawn for both values of `m`; the original
    # indentation was lost — confirm.
    plt.plot(range(len(equal_return)),total_equal_return,c='g',marker='o',linestyle='-',label='equal')
    plt.legend(loc='best')
    # Set the axis labels on the plot; plain assignments to local variables
    # named xlabel/ylabel had no effect on the figure.
    plt.xlabel('time')
    plt.ylabel('return')
    return None
三、实际运行
# Global settings
path = r'C:\Users\lenovo\Desktop\上证50数据' # alternative: path = r'C:\Users\lenovo\Desktop\沪深A股数据'
names = ['日期','开盘价','最高价','最低价','收盘价','成交量','成交额']
file = [os.path.splitext(f)[0] for f in os.listdir(path)] # alternative: file = glob.glob(os.path.join(path,'**#**.txt'))
long = ('2018-01-01','2020-07-01')
sn = 1000
z = 11
tr = 3/1000 # transaction fee
re = 5/100 # weights below re are discarded (set to 0)

# Load and clean the data, then take the monthly return matrix.
dr = data_read(file,sn)
data, fl = dr[0], dr[1]
cleaned = data_cleaning(data,long,fl)
Return = cleaned['Return']

# Rolling windows: each z-month window is a training set, the following
# month is its validation vector.
n_assets = len(Return.columns)
train = [
    {'Return':Return[i:i+z],'R':Return[i:i+z].values.mean(axis=0).reshape(n_assets,1)}
    for i in range(0,len(Return)-z)
]
test = [r.reshape(n_assets,1) for r in Return.values[z:]]

opt_weight,opt_return,equal_return,total_opt_return,total_equal_return = looking_back(train,test,tr=tr,re=re)

# Month label for every back-tested period.
timeseries = [f'20{18+int((i+z)/12)}年{(i+z)%12+1}月' for i in range(len(opt_return))] # mtkl_return

# One [month, stock code, weight] record per non-zero weight.
data0 = []
for j, weights in enumerate(opt_weight):
    flat = weights.flatten()
    for i in np.argwhere(flat != 0).flatten().tolist():
        data0.append([timeseries[j],fl[i],round(flat[i],4)])
columns = ['日期','股票代码','权重']

pict(opt_return=opt_return,total_opt_return=total_opt_return)
四、总结和改进
从累计收益率看,结果比不过等权重组合。后续改进尝试:月度数据改为季度数据;股票池直接使用沪深300、上证50成分股;模型算法方面调优,例如采用下半方差、VaR等风险度量,以及改进期望收益率和协方差的估计方法等。