1. Analytic Hierarchy Process (AHP)
1.1 Code (MATLAB)
function [] = AHP()
%% AHP weight calculation in MATLAB
clc
clear
%% Model input
A = [1 2 6; 1/2 1 4; 1/6 1/4 1];   % pairwise judgment matrix
%% Consistency check
n = size(A,1);
[v,d] = eig(A);
[r,idx] = max(real(diag(d)));      % eig does not sort: pick the largest (principal) eigenvalue
CI = (r-n)/(n-1);
RI = [0 0 0.58 0.90 1.12 1.24 1.32 1.41 1.45 1.49 1.52 1.54 1.56 1.58 1.59];
CR = CI/RI(n);
if CR < 0.10
    CR_Result = 'pass';
else
    CR_Result = 'fail';
end
%% Weight vector: normalized principal eigenvector
w = real(v(:,idx))/sum(real(v(:,idx)));
w = w';
%% Report
disp('Judgment-matrix weight report:');
disp(['Consistency index CI: ' num2str(CI)]);
disp(['Consistency ratio CR: ' num2str(CR)]);
disp(['Consistency check: ' CR_Result]);
disp(['Principal eigenvalue: ' num2str(r)]);
disp(['Weight vector: ' num2str(w)]);
end
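For reference, the quantities the script reports are the standard AHP consistency measures, with λ_max the principal eigenvalue and n the matrix order:

$$CI = \frac{\lambda_{\max} - n}{n - 1}, \qquad CR = \frac{CI}{RI(n)}, \qquad CR < 0.10 \;\Rightarrow\; \text{acceptable consistency}$$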
1.2 Write-up in a paper (2016B)
1. Build the hierarchical structure model
2. Solve the model
Construct the judgment matrix C1–P
(figure: 排序和总排序的一致性.png — consistency of single-level and overall rankings)
1.3 Theory link
2. Principal Component Analysis (PCA)
2.1 Code (Python; the decomposition is hand-rolled, sklearn only supplies the dataset and scaling)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler

digits = load_digits()
X_digits = pd.DataFrame(digits.data)

def PCA_math(X_digits, topNfeat):
    plt.style.use("bmh")
    plt.rc('font', family='SimHei', size=5)
    pd.set_option('display.max_columns', 4000)
    pd.set_option('display.width', 4000)
    pd.set_option('display.max_colwidth', 4000)
    # standardize, then center (centering is a no-op after standardization)
    X = StandardScaler().fit_transform(X_digits)
    meanVals = np.mean(X, axis=0)  # column means
    meanRemoved = X - meanVals
    covMat = np.cov(meanRemoved, rowvar=False)
    eigVals, eigVects = np.linalg.eigh(covMat)  # covariance matrix is symmetric; eigh keeps everything real
    eigValInd = np.argsort(-eigVals)  # component indices, largest eigenvalue first
    sum_eig = np.sum(eigVals)
    # NB: these shares belong to the components (PC1, PC2, ...), not to the original
    # features, so labeling them with feature names (as the original did) is misleading
    print("Explained-variance share of each component:")
    for rank, i in enumerate(eigValInd):
        print("PC" + str(rank + 1) + ":\t" + str(eigVals[i] / sum_eig))
    # bar chart of the variance shares, sorted
    data = pd.DataFrame(eigVals[eigValInd]) / sum_eig
    fig = plt.figure()
    ax = fig.add_subplot(111)
    data[0].plot(kind='bar')
    plt.show()
    eigValInd = eigValInd[:topNfeat]  # keep the top-N components (the original off-by-one kept N+1)
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved @ redEigVects  # projection into the reduced space
    reconMat = lowDDataMat @ redEigVects.T + meanVals  # reconstruction in the original space
    return lowDDataMat, reconMat

PCA_math(X_digits, 10)
2.2 Approximate PCA as written in papers
Compute the influence weights;
keep the n components with the largest shares.
3. Factor Analysis
3.1 Code (Python)
import numpy as np
import pandas as pd
from factor_analyzer import factor_analyzer, Rotator
# toy dataset
X_digits = pd.DataFrame([[1,1,2,5,6,7],[2,2,2,6,7,8],[3,3,4,7,8,9]])
# adequacy tests: overall KMO and Bartlett's sphericity
print(factor_analyzer.calculate_kmo(X_digits)[1])
P_s = factor_analyzer.calculate_bartlett_sphericity(X_digits)[1]
if P_s < 0.05:
    print("Factor analysis is applicable to these variables")
else:
    print("Factor analysis is not applicable")
# extraction settings:
n_factors = 2            # number of factors to extract
method = 'principal'     # principal-component extraction
rotation = 'varimax'     # varimax rotation
fa = factor_analyzer.FactorAnalyzer(n_factors=n_factors, method=method, rotation=rotation)
fa.fit(X_digits)
# communalities
print(fa.get_communalities())
# contributions (eigenvalues, variance contribution, cumulative contribution)
var = fa.get_factor_variance()
# loadings; fa already rotated with varimax, so this extra Rotator pass is redundant but harmless
rotator = Rotator()
load_matrix = rotator.fit_transform(fa.loadings_)
# factor score coefficient matrix (regression method: R^{-1} Λ;
# pinv is used because the toy correlation matrix is singular)
corr = np.matrix(X_digits.corr().values)
load_matrix = np.matrix(load_matrix)
pr_matrix_score = np.dot(np.linalg.pinv(corr), load_matrix)
pr_matrix_score_t = pr_matrix_score.T
means = X_digits.mean()
stds = X_digits.std()
columns_len = len(X_digits.columns)
adict = {}
for i in range(columns_len):
    adict[i] = [means[i], stds[i]]
def get_Fac_score(x, no, adict):
    # score of factor `no` for one row: standardized values times score coefficients
    arr = pr_matrix_score_t[no].tolist()[0]
    s = 0
    for i in range(columns_len):
        s += (x[i] - adict[i][0]) / adict[i][1] * arr[i]
    return s
for i in range(n_factors):
    X_digits['FAC' + str(i+1)] = X_digits.apply(get_Fac_score, args=(i, adict), axis=1)
# composite score: variance-contribution-weighted average of the factor scores
X_digits['FAC'] = X_digits.apply(lambda x: (x['FAC1']*var[1][0] + x['FAC2']*var[1][1]) / var[2][1], axis=1)
print(X_digits)
3.2 Reference link for paper writing (not a paper)
3.3 Papers
3.3.1 (2017B)
See also grey relational analysis. In my view, the emphasis lies after the correlation results are obtained: analyzing the conclusions and how each factor affects the final outcome.
4. Clustering
4.0 Important: justify why there are n clusters!!!
4.0.1 Paper (2017 B)
4.0.2 Use clustering metrics from machine learning
4.0.2.1 Optimized k-means: ISODATA
K: desired final number of clusters;
k: initial number of clusters;
θN: minimum number of samples per cluster — clusters with fewer samples are discarded;
θs: maximum standard deviation of any feature within a cluster — above this, the cluster may be split;
θc: minimum distance between two cluster centers — below this, the two clusters are merged;
L: maximum number of cluster pairs that may be merged in a single merge step;
I: number of iterations.
Code: see Machine Learning in Action~~~~ (a lighter, metric-based sketch follows below)
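The ISODATA reference code is not reproduced here. As a simpler way to justify the cluster count (the point of 4.0.2), a minimal silhouette-score sweep over candidate k — plain sklearn, not from the original notes — looks like:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score

X = load_iris().data
for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    # silhouette lies in [-1, 1]; larger means tighter, better-separated clusters
    print(k, silhouette_score(X, labels))

Report the k where the silhouette peaks (or where an elbow appears in the inertia) as the justification for the cluster count.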
4.1 Drawing a hierarchical clustering dendrogram
4.1.1 (Python code)
import numpy as np
import pandas as pd
from scipy.cluster import hierarchy  # hierarchical clustering and dendrogram plotting
from scipy import cluster
import matplotlib.pyplot as plt
plt.style.use("bmh")
plt.rc('font', family='SimHei', size=4)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
# toy dataset (`pd.np` was removed from pandas; use numpy directly)
df = pd.DataFrame(np.random.normal(size=(10, 4)))
name = ["object" + str(i) for i in range(len(df))]
# draw the dendrogram
df.index = name
Z = hierarchy.linkage(df, method='ward', metric='euclidean')
hierarchy.dendrogram(Z, labels=df.index)
height = 2
plt.axhline(y=height, color='r', linestyle='-')  # cut line
label = cluster.hierarchy.cut_tree(Z, height=height)  # cluster labels at that cut
label = label.reshape(label.size,)
plt.show()
(figure: dendrogram)
4.2 K-means
4.2.1 (Python code)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:, :4]  # use all 4 feature dimensions
print(X.shape)
plt.figure(12)
# plot the raw data distribution
plt.subplot(211)
plt.scatter(X[:, 0], X[:, 1], c="red", marker='o', label='samples')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
estimator = KMeans(n_clusters=3)  # build the clusterer
estimator.fit(X)  # cluster
label_pred = estimator.labels_  # cluster labels
# plot the k-means result
x0 = X[label_pred == 0]
x1 = X[label_pred == 1]
x2 = X[label_pred == 2]
plt.subplot(212)
plt.scatter(x0[:, 0], x0[:, 1], c="red", marker='o', label='label0')
plt.scatter(x1[:, 0], x1[:, 1], c="green", marker='*', label='label1')
plt.scatter(x2[:, 0], x2[:, 1], c="blue", marker='+', label='label2')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
plt.show()
'''
# variant used on contest data (dataset column names kept as-is)
class_n = 3
estimator = KMeans(n_clusters=class_n)                 # build the clusterer
estimator.fit(data[["任务gps 纬度","任务gps经度"]])     # cluster on latitude/longitude
label_pred = estimator.labels_
color = ["red","yellow","green","blue","gray"]
data["距离"] = None
data["media_1"] = None
data["media_2"] = None
plt.figure(figsize=[8, 8])
for i in range(class_n):
    x0 = data[["任务gps 纬度", "任务gps经度","任务标价","任务执行情况"]][label_pred == i].values
    print(i, data[["任务gps 纬度", "任务gps经度","任务标价","任务执行情况"]][label_pred == i].describe())
    median = np.mean(x0, axis=0)  # cluster center (mean point)
    # rough degrees-to-meters conversion (~110 km per degree)
    data["距离"][label_pred == i] = np.linalg.norm(x0 - median, axis=1) * 110*1000
    data["media_1"][label_pred == i] = median[0]
    data["media_2"][label_pred == i] = median[1]
    plt.scatter(x0[:,0], x0[:, 1], c=color[i], label='label'+str(i))
    plt.scatter(median[0], median[1], color="black", marker='+', s=100)
plt.xlim([22, 24])
plt.ylim([112.2, 114.75])
'''
(figure: kmeans)
4.2.1.1 Layered (multi-pass) clustering with k-means
'''
# first-pass clustering
class_n = 20
estimator = KMeans(n_clusters=class_n)              # build the clusterer
estimator.fit(data[["任务gps 纬度","任务gps经度"]])  # cluster
label_pred = estimator.labels_
data["cluster"] = -1
for i in range(class_n):
    x0 = data[["任务gps 纬度", "任务gps经度"]][label_pred == i].values
    data["cluster"][label_pred == i] = i
data.to_csv("cluster_1.csv", index=False)
'''
-----------------------------------------------------------------------------------------------------------------------
'''
# second-pass clustering: re-split clusters that are still too large
counter = dict(collections.Counter(data['cluster']))
key = list(counter.items())
for i in range(len(counter)):
    cluster_name = key[i][0]
    if key[i][1] > 4:
        data_1 = data[data["cluster"] == cluster_name]
        iter = True
        class_n = 2
        num_iter = 0
        while iter:
            num_iter += 1
            iter = False
            estimator = KMeans(n_clusters=class_n)  # build the clusterer
            # NB: column names differ from the first pass ("任务GPS纬度" vs "任务gps 纬度") — unify to match your data
            estimator.fit(data_1[["任务GPS纬度","任务GPS经度"]])  # cluster
            label_pred = estimator.labels_
            if list(dict(collections.Counter(label_pred)).items())[-1][1] <= 2:
                iter = True
                class_n -= 1
                class_n = max(class_n, 2)
            else:
                break
            if num_iter == 10:  # give up and split randomly
                label_pred = np.random.randint(0, 2, len(data_1))
                break
        data_1["cluster"] = data["cluster"].max() + label_pred + 1
        data[data["cluster"] == cluster_name] = data_1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["cluster"] = le.fit_transform(data["cluster"])  # relabel clusters to 0..k-1
data.to_csv("result_cluster6.csv", index=False)
'''
------------------------------------------------------------------------------------------------------
'''
# when further splitting fails, assign random sub-labels
import collections
counter = dict(collections.Counter(data['cluster']))
key = list(counter.items())
for i in range(len(counter)):
    cluster_name = key[i][0]
    if key[i][1] > 4:
        data_1 = data[data["cluster"] == cluster_name]
        label_pred = np.random.randint(0, 3, len(data_1))
        data_1["cluster"] = data["cluster"].max() + label_pred + 1
        data[data["cluster"] == cluster_name] = data_1
data.to_csv("result_cluster8.csv", index=False)
print(data["cluster"].value_counts())
'''
4.2.2 Other clustering links
k-means, AGNES hierarchical clustering, DBSCAN
For flowcharts, see the clustering chapter of the "watermelon book" (Zhou Zhihua, Machine Learning).
4.3 Papers
4.3.1 (2016B)
4.3.2 (2017B)
Clustering on latitude/longitude.
5. Discriminant Analysis
5.1 Linear Discriminant Analysis
5.1.1 (Python code)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = datasets.load_iris()['data']
target = datasets.load_iris()['target']
X_train, X_test, y_train, y_test = train_test_split(iris, target, test_size=0.25, random_state=0, stratify=target)
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train, y_train)
print(LDA.score(X_train, y_train))  # training accuracy
print(LDA.predict(X_test))
print(LDA.score(X_test, y_test))    # test accuracy

# project the data with the fitted discriminant functions and plot
import matplotlib.pyplot as plt
X = np.vstack((X_train, X_test))
Y = np.vstack((y_train.reshape(y_train.size, 1),
               y_test.reshape(y_test.size, 1)))
converted_X = np.dot(X, np.transpose(LDA.coef_)) + LDA.intercept_
fig = plt.figure()
ax = fig.add_subplot(111)
colors = 'rgb'
markers = 'o*s'
for target, color, marker in zip([0, 1, 2], colors, markers):
    pos = (Y == target).ravel()
    Xc = converted_X[pos, :]
    ax.scatter(Xc[:, 0], Xc[:, 1], color=color, marker=marker, label="Label%d" % target)
ax.legend(loc="best")
fig.suptitle("Iris After LDA")
plt.show()
5.2 Other discriminant analyses are available through sklearn
5.2.1 Gaussian discriminant analysis (sketch below)
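The notes leave this empty; as a minimal sketch, sklearn's QuadraticDiscriminantAnalysis can stand in for Gaussian discriminant analysis (it fits one Gaussian per class, each with its own covariance):

from sklearn import datasets
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)
QDA = QuadraticDiscriminantAnalysis(store_covariance=True)  # one Gaussian per class
QDA.fit(X_train, y_train)
print(QDA.score(X_test, y_test))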
6. Time-Series Models
6.1 Order selection
Order selection rules (assuming the series has already been first-differenced); an automated check follows below:
① If the PACF tails off at order p and the ACF tails off at order q, use ARIMA(p,1,q).
② If the PACF tails off and the ACF cuts off after lag q, use MA(q).
③ If the PACF cuts off after lag p and the ACF tails off, use AR(p).
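When the plots are ambiguous, an information-criterion search automates these rules. A minimal sketch with statsmodels, where y stands in for the differenced series:

import numpy as np
import statsmodels.tsa.stattools as st

y = np.random.randn(200)  # placeholder for the differenced series
res = st.arma_order_select_ic(y, max_ar=4, max_ma=4, ic=['aic', 'bic'])
print(res.aic_min_order, res.bic_min_order)  # (p, q) minimizing each criterion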
6.2 Seasonal ARIMA
6.2.1 (Python code)
# coding=UTF-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
input_file = "data/data_1.xlsx"
data_frame = pd.read_excel(input_file, index_col=0)
data_frame["时间"] = pd.to_datetime(data_frame["时间"])
data_frame.index = data_frame.时间
dataSet = data_frame['流量'][data_frame['地市'] == "C"].dropna()
dataSet = dataSet.resample("D").mean()
# build the index for the 30 days following the last observation
forecast_data = []
for i in range(1, 31):
    forecast_data.append(dataSet.index[-1] + pd.Timedelta(days=i))  # Timestamp + int raises in modern pandas
forecast_data = pd.DataFrame(forecast_data)
forecast_data.columns = ["时间"]
forecast_data.index = forecast_data.时间
# plot the raw series
dataSet.plot()
plt.show()
# demean the series (demeaning does not change the dependence structure)
mean = dataSet.mean()
dataSet_kill_mean = dataSet - mean
# first-difference to obtain a stationary series
df_kill_mean_1 = dataSet_kill_mean.diff(1).dropna()
df_kill_mean_1.plot()
plt.show()
# ADF test: is the differenced series stationary?
import statsmodels.tsa.stattools as ts
adf_summary = ts.adfuller(np.array(df_kill_mean_1).reshape(-1))  # run the ADF test and print the result
print(adf_summary)
# Reading the output: compare the ADF statistic against the 1%, 5% and 10% critical values;
# if it falls below all three, the unit-root null is firmly rejected, i.e. the series is stationary.
# Here the first-differenced series is taken to satisfy the stationarity requirement.
# (Its stationarity is not great, so it may fail the white-noise check; that check is skipped here.
# If the white-noise test p-value exceeds 0.05, difference the series a second time.)
# white-noise (Ljung-Box) check
import statsmodels.api as sm
r, q, p = sm.tsa.acf(df_kill_mean_1, nlags=40, qstat=True)  # fix nlags so the table below lines up
data = np.c_[range(1, 41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', 'AC', 'Q', 'Prob(>Q)'])
# If most Prob(>Q) values fall below 0.05, reject the white-noise null: autocorrelation
# remains and is worth modeling. If instead all p-values exceed 0.05, the series is
# white noise (no autocorrelation) and there is nothing left to fit.
print(table.set_index('lag'))
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# ACF plot (lags sets the largest lag on the x-axis)
plot_acf(df_kill_mean_1, lags=24).show()
# PACF plot
plot_pacf(df_kill_mean_1, lags=24).show()
plt.show()
# Neither plot tails off or cuts off cleanly,
# so the series likely carries a seasonal component.
# Decompose the series into its components:
# Trend, Seasonal and Residual parts
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(df_kill_mean_1, period=30)  # older statsmodels spells this freq=30
trend = decomposition.trend        # trend part
seasonal = decomposition.seasonal  # seasonal part
residual = decomposition.resid     # residual part
decomposition.plot()
plt.show()
# Seasonal model: ARIMA(p,d,q)×(P,D,Q)_S; after differencing it behaves like ARMA(P·S+p, Q·S+q)
# "grid search" over the parameter combinations
'''
import itertools
p = q = range(0, 3)  # p and q rarely need to exceed 2
d = range(1, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 7) for x in list(itertools.product(p, d, q))]
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(dataSet,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            print('ARIMA{}x{} - AIC:{}'.format(param, param_seasonal, results.aic))
        except:
            continue
'''
# best found: ARIMA(2, 1, 2)x(2, 1, 2, 7) - AIC:-762.9389508705663
import statsmodels.api as sm
mod = sm.tsa.statespace.SARIMAX(dataSet, order=(2, 1, 2), seasonal_order=(2, 1, 2, 7),
                                enforce_stationarity=True, enforce_invertibility=True)
result = mod.fit(maxiter=200, disp=30)
predict_sunspots = result.forecast(30)
forecast = np.array(predict_sunspots[:]).reshape(-1)
forecast_data["流量"] = forecast
forecast_data = forecast_data['流量']
forecast_data.plot(c="red", label='forecast')
dataSet.plot(c="blue", label='Original')
plt.show()
For other (non-seasonal) models, the training part of the code can be replaced with:
def arma_predict(dataset, number):
    data = list(dataset.rentNumber)
    from statsmodels.tsa.arima_model import ARMA  # removed in statsmodels >= 0.13; use statsmodels.tsa.arima.model.ARIMA there
    """
    # automatic order selection:
    import statsmodels.tsa.stattools as st
    order = st.arma_order_select_ic(data, max_ar=2, max_ma=2, ic=['aic', 'bic', 'hqic'])
    """
    model = ARMA(data, order=(1, 1))
    result_arma = model.fit(disp=-1, method='css')
    predict = result_arma.predict(len(data)-number, len(data))  # covers the last number+1 points
    RMSE = np.sqrt(((predict - data[len(data)-number-1:])**2).sum() / (number+1))
    plot_results(predict, data[len(data)-number-1:])  # plot_results: user-defined helper, not shown
    return predict, RMSE
(figures: PACF plot, ACF plot, seasonal/trend decomposition, forecast plot)
Honestly, I have never seen such a bad, ugly forecast plot, but since this model writes up nicely in a paper... I'll live with it... (any off-the-shelf LSTM would look better)
6.2.2 (reference link, not a paper)
7. Cellular Automata
7.1 Paper (2016B)
Knowledge overview and analysis
Descriptive parameters of traffic flow
Model construction
7.2 Code (to be added; a minimal sketch follows below)
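Until the real code lands here, a minimal sketch of the Rule-184 traffic cellular automaton — a standard single-lane traffic CA, chosen for illustration and not taken from the 2016B paper:

import numpy as np

def rule184_step(road):
    # road: 1-D 0/1 array (1 = car); a car advances iff the cell ahead is empty
    right = np.roll(road, -1)  # cell ahead (periodic boundary)
    left = np.roll(road, 1)    # cell behind
    nxt = np.zeros_like(road)
    nxt[(road == 1) & (right == 1)] = 1  # blocked car stays put
    nxt[(road == 0) & (left == 1)] = 1   # the car behind moves in
    return nxt

road = (np.random.rand(50) < 0.4).astype(int)  # initial density 0.4
for _ in range(20):
    print("".join(".#"[c] for c in road))
    road = rule184_step(road)

Backward-moving jam waves show up directly in the printed space-time diagram, which is the qualitative behavior the traffic-flow discussion in 7.1 relies on.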
8. Fuzzy Comprehensive Evaluation
8.1 Paper
Fuzzy comprehensive evaluation model
8.2 The method is simple enough that no full code is attached; the core step is sketched below.
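For the record, the core computation is just a weight vector composed with a membership matrix. A minimal sketch with made-up numbers, using the weighted-average operator M(·,+):

import numpy as np

W = np.array([0.3, 0.5, 0.2])   # factor weights (e.g. from AHP or the entropy method)
R = np.array([[0.6, 0.3, 0.1],  # memberships of factor 1 in grades {good, fair, poor}
              [0.2, 0.5, 0.3],
              [0.1, 0.4, 0.5]])
B = W @ R                        # composite membership over the grades
print(B, "-> grade:", ["good", "fair", "poor"][int(np.argmax(B))])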
9. BP Neural Network + Stacking
9.1 Code
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
'''BP neural network + stacking (first-level out-of-fold predictions)'''
Y = data[["任务标价"]].values
del data["任务标价"]
scalarX = MinMaxScaler()
data = scalarX.fit_transform(data)
N = 5
oof = np.zeros((data.shape[0], 1))
#oof_sub = np.zeros((test_df.shape[0],1))
# StratifiedKFold with a constant label degenerates to a plain shuffled KFold here
kf = StratifiedKFold(n_splits=N, random_state=42, shuffle=True)
for j, (train_in, test_in) in enumerate(kf.split(data, np.zeros(len(data)))):
    X_tr, X_val, y_tr, y_val = data[train_in], data[test_in], Y[train_in], Y[test_in]
    modelfile = 'modelweight.model'
    model = Sequential()  # sequential model
    model.add(Dense(12, input_dim=11, kernel_initializer='uniform'))  # input layer (11 features assumed)
    model.add(Activation('relu'))  # activation
    model.add(Dense(1))            # output layer
    model.compile(loss='mae', optimizer=Adam())  # compile; with loss='mae', val_loss is the validation MAE
    check_point = ModelCheckpoint(modelfile, monitor="val_loss", verbose=1, save_weights_only=True,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=10)
    model.fit(X_tr, y_tr, epochs=7000, batch_size=800,
              validation_data=(X_val, y_val),
              callbacks=[check_point, early_stop], shuffle=True)  # train
    model.save_weights(modelfile)  # save the model weights
    oof[test_in] = model.predict(X_val, batch_size=800)  # out-of-fold predictions
    #oof_sub += model.predict(test_X, batch_size=1024)
result_error = np.abs(np.asarray(oof) - np.asarray(Y))
error = np.sum(result_error) / len(Y)  # mean absolute error over all folds
data = pd.read_csv("data.csv")[["任务号码"]]
data["predict"] = oof
data["predict"] = data["predict"].apply(lambda x: int(x*2 + 0.5)/2)  # round to the nearest 0.5
data["Y"] = Y
data["mae"] = result_error
data.to_csv("BP_2.csv")
9.2.1 Paper (BP part, 2016 D6, Shenzhen Cup)
9.2.2 Paper (stacking part)
(figure: STACKING.png)
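The script in 9.1 only produces the first-level out-of-fold predictions; a minimal second-level (meta-learner) step on top of it — assuming oof and Y from the code above, with a linear model chosen arbitrarily — might look like:

import numpy as np
from sklearn.linear_model import LinearRegression

meta_X = np.hstack([oof])  # add more columns here, one per base model
meta = LinearRegression().fit(meta_X, Y.ravel())
final_pred = meta.predict(meta_X)  # second-level (stacked) prediction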
10 Plotting
10.1 Simple plot with annotations
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("BP_3.csv")
sample = data.iloc[:60]
plt.rc('font', family='Microsoft YaHei', size=10)
plt.plot(sample.predict, label="adjusted")
plt.plot(sample.Y, label="Y")
plt.plot(sample.final, label="final", c="r")
plt.legend(bbox_to_anchor=(0., 1.04, 1., .104), loc=0, ncol=3, mode="expand", borderaxespad=-2.)
plt.ylim([60, 80])
plt.xlabel("index")
plt.ylabel("price")
plt.title("BP price adjustment for unfinished tasks")
plt.show()
(figure: 调整图.png)
10.2 Histogram and kernel-density plot
import seaborn as sns
plt.rc('font', family='Microsoft YaHei', size=20)
plt.figure(figsize=[8, 8])
# fragment: i indexes a feature; distplot overlays histogram + KDE (deprecated in newer seaborn — use histplot/kdeplot)
ax1 = sns.distplot(data_clean["circle_proportion"+str(i)][data_clean["任务执行情况"] == 1], color='blue', label="finished")
ax1 = sns.distplot(data_clean["circle_proportion"+str(i)][data_clean["任务执行情况"] == 0], color='red', label="unfinished")
plt.title("circle_proportion"+str(i))
plt.show()
10.3 Scatter plot
# fragment from the k-means loop in 4.2 (there, x0 and median change per cluster)
color = ["red","yellow","green","blue","gray"]
for i in range(len(color)):
    plt.scatter(x0[:,0], x0[:, 1], c=color[i], label='label'+str(i))
    plt.scatter(median[0], median[1], color="black", marker='+', s=100)
plt.xlim([22, 24])
plt.ylim([112.2, 114.75])
plt.show()
(figure: kmean_task.png)
10.4 Drawing circles
# concentric circles of radius r0, r0+add, ... centered at (x0[0,0], x0[0,1])
color = ["y","blue","red","gray","green"]
r0 = 3000.0
add = 1000
for i in range(5):
    r = r0 + i*add
    a, b = (x0[0,0], x0[0,1])
    x = np.arange(a-r, a+r, 0.01)
    y = b + np.sqrt(r**2 - (x - a)**2)   # upper half of the circle
    y1 = b - np.sqrt(r**2 - (x - a)**2)  # lower half
    plt.plot(x, y, c=color[i])
    plt.plot(x, y1, c=color[i])
plt.show()
10.5 Box plot and count plot
# fragments: ax is an existing Axes object
sns.boxplot(data=price_rate, x="任务标价", y="rate", ax=ax)
sns.countplot(data=price_rate, x="任务标价", ax=ax, hue="任务执行情况")
10.6 Time-series plot
data["Speed"] = data.groupby("DataAsOf")["Speed"].transform("mean") / 12
data = data.drop_duplicates(subset=["DataAsOf"])  # the original discarded the return value
dataSet = data[['Speed']]
dataSet.index = pd.to_datetime(data.DataAsOf)
dataSet = dataSet["2016-12-01":"2016-12-03"]
dataSet = dataSet.resample('5min').mean()
dataSet["Speed"] = (5.1 - dataSet["Speed"]) * 1.2
plt.scatter(dataSet.index, dataSet["Speed"], marker=".")
plt.xlim((pd.to_datetime("2016-12-01 00:00:00"), pd.to_datetime("2016-12-03 23:59:59")))
plt.vlines(x=pd.to_datetime("2016-12-02 00:00:00"), ymin=0, ymax=100, colors="c", linestyles="dashed")
plt.vlines(x=pd.to_datetime("2016-12-03 00:00:00"), ymin=0, ymax=100, colors="c", linestyles="dashed")
plt.ylim((0, 5))
plt.xticks(rotation=90)  # rotate the tick labels
plt.xlabel("time")
plt.ylabel("flow")
plt.title("Three-day time vs. flow scatter for one district")
plt.show()
11 Calling the Baidu Maps API
import json
from urllib.request import urlopen

def getlnglat(data):
    print(data)
    ak = "MtmcuLOtlGEt477UpbfTv2mb6h52qWoP"
    # the original had '¢er=' — an HTML-entity-mangled '&center='
    url = 'http://api.map.baidu.com/traffic/v1/around?ak=' + ak + '&center=' + str(data[1]) + "," + str(data[2]) \
          + '&radius=200'
    print(url)
    req = urlopen(url)
    res = req.read().decode()
    temp = json.loads(res)
    print(temp)
    return temp

for i in data.values:
    getlnglat(i)
12 Entropy Weight Method
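The code below implements the standard entropy-weight formulas. With m samples and x'_{ij} the min-max-normalized data:

$$p_{ij} = \frac{x'_{ij}}{\sum_{i=1}^{m} x'_{ij}}, \qquad e_j = -\frac{1}{\ln m}\sum_{i=1}^{m} p_{ij}\ln p_{ij}, \qquad w_j = \frac{1 - e_j}{\sum_j (1 - e_j)}$$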
# code for problem 1
import numpy as np
import pandas as pd
m, n = data.shape
# step 1: read the data and min-max normalize if not already standardized
data1 = (data - data.min()) / (data.max() - data.min())
data1 = data1.values  # DataFrame -> matrix (`as_matrix` was removed from pandas)
k = 1 / np.log(m)
yij = data1.sum(axis=0)
# step 2: compute p_ij
pij = data1 / yij
test = pij * np.log(pij)
test = np.nan_to_num(test)  # treat 0*log(0) as 0
# information entropy of each indicator
ej = -k * (test.sum(axis=0))
# entropy weights
wi = (1 - ej) / np.sum(1 - ej)
w = pd.DataFrame()
list_w = ["距离","流量","小区内路网密度","速度","主干道数目","小区道路数"]
for i in range(len(list_w)):
    w[list_w[i]] = [wi[i]]
13 Grey Relational Analysis
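The loops below compute the grey relational coefficients of each comparison series x_i against the reference series x_0. With Δ_i(j) = |x_i(j) − x_0(j)| and resolution coefficient ρ = 0.5, they implement:

$$\xi_i(j) = \frac{\min_{i,j}\Delta_i(j) + \rho\,\max_{i,j}\Delta_i(j)}{\Delta_i(j) + \rho\,\max_{i,j}\Delta_i(j)}, \qquad r_i = \frac{1}{n}\sum_{j=1}^{n} \xi_i(j)$$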
import numpy as np
import pandas as pd
# min-max normalize
gray = (gray - gray.min()) / (gray.max() - gray.min())
std = gray[["流量"]].iloc[:, 0]  # reference series
ce = gray[["距离","小区内路网密度","速度","主干道数目","小区道路数"]].iloc[:, :]  # comparison series
n = ce.shape[0]
m = ce.shape[1]  # row/column counts
# absolute differences against the reference series
a = np.zeros([m, n])
for i in range(m):
    for j in range(n):
        a[i, j] = abs(ce.iloc[j, i] - std[j])
# max and min over the whole difference matrix
c = np.amax(a)
d = np.amin(a)
# grey relational coefficients (resolution coefficient 0.5)
result = np.zeros([m, n])
for i in range(m):
    for j in range(n):
        result[i, j] = (d + 0.5*c) / (a[i, j] + 0.5*c)
# row means: grey relational grade of each factor
result2 = np.zeros(m)
for i in range(m):
    result2[i] = np.mean(result[i, :])
RT = pd.DataFrame(result2)
RT.index = ["距离","小区内路网密度","速度","主干道数目","小区道路数"]
print(RT)
14 Linear (Polynomial) Regression
# degree-4 polynomial least-squares fit of 流量 (flow) on 距离 (distance)
reg = np.polyfit(data["距离"], data["流量"], 4)
data["Q_distance"] = np.polyval(reg, data["距离"])
15 Analysis of Variance (ANOVA)
15.1 One-way ANOVA
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings("ignore")
'''fabricate toy data'''
df = pd.DataFrame()
df["group"] = np.random.randint(0, 8, 10)
df["A"] = np.random.randint(-3, 5, 10)
df["B"] = np.random.randint(-7, 2, 10)
df["C"] = np.random.randint(5, 8, 10)
print(df)
# C(group) marks group as categorical; without it the factor is treated as a numeric regressor
# A
anova_reA = anova_lm(ols('A ~ C(group)', data=df[['group','A']]).fit())
print(anova_reA)
# B
anova_reB = anova_lm(ols('B ~ C(group)', data=df[['group','B']]).fit())
print(anova_reB)
# C
anova_reC = anova_lm(ols('C ~ C(group)', data=df[['group','C']]).fit())
print(anova_reC)
15.2 Two-way ANOVA
In ANOVA terms, treat drink color as factor A affecting sales volume and sales region as factor B. Analyzing factors A and B simultaneously is a two-way ANOVA.
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
# with the beverage data the formula would read: sales ~ C(color) + C(region)
formula = 'group ~ A + B'
anova_results = anova_lm(ols(formula, df).fit())
print(anova_results)
              df  sum_sq  mean_sq         F    PR(>F)
a            4.0  335.36    83.84  3.874307  0.021886
b            4.0  199.36    49.84  2.303142  0.103195
Residual    16.0  346.24    21.64       NaN       NaN
Factor A: p = 0.021886 < 0.05, so the null is rejected — drink color has a significant effect on sales.
Factor B: p = 0.103195 > 0.05, so the null cannot be rejected — there is no sufficient evidence that sales region affects sales.
Having established that color matters, which color is it?
Use Tukey's method for multiple comparisons on color:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
print(pairwise_tukeyhsd(df['group'], df['a']))  # endog = sales, groups = color factor `a` (columns from the beverage data, not the toy df above)
16. Confidence Intervals
import numpy as np
import pandas as pd
'''
draw a sample of size 200
'''
np.random.seed(42)
coffee_full = pd.read_csv('coffee_dataset.csv')
coffee_red = coffee_full.sample(200)  # this is the only data you might actually get in the real world.
coffee_red.head()
'''
mean height of coffee drinkers in the sample
'''
coffee_red[coffee_red['drinks_coffee'] == True]['height'].mean()
'''
bootstrap: resample the observed sample repeatedly and collect the statistic
to build a sampling distribution
'''
boot_means = []
for _ in range(10000):
    bootsample = coffee_red.sample(200, replace=True)  # resample the sample, not the full population
    mean = bootsample[bootsample['drinks_coffee'] == False]['height'].mean()
    boot_means.append(mean)
'''
95% confidence interval for the population mean from the sampling distribution
'''
np.percentile(boot_means, 2.5), np.percentile(boot_means, 97.5)
16.1 Plotting confidence-interval error bars
import matplotlib.pyplot as plt
import numpy as np
x_ticks = ("Thing 1", "Thing 2", "Other thing", "Yet another thing")
x_1 = np.arange(1, 5)
x_2 = x_1 + 0.1  # offset the second series so the two sets of bars don't overlap
y_1 = np.random.choice(np.arange(1, 7, 0.1), 4)  # point estimates
y_2 = np.random.choice(np.arange(1, 7, 0.1), 4)
err_1 = np.random.choice(np.arange(0.5, 3, 0.1), 4)  # interval half-widths
err_2 = np.random.choice(np.arange(0.5, 3, 0.1), 4)
plt.errorbar(x=x_1, y=y_1, yerr=err_1, color="black", capsize=3,
linestyle="None",
marker="s", markersize=7, mfc="black", mec="black")
plt.errorbar(x=x_2, y=y_2, yerr=err_2, color="gray", capsize=3,
linestyle="None",
marker="s", markersize=7, mfc="gray", mec="gray")
plt.xticks(x_1, x_ticks, rotation=90)
plt.tight_layout()
plt.show()
17 Queueing Theory
17.1 M/M/s (MATLAB)
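The script below prints the standard M/M/s steady-state quantities; with ρ = λ/μ and ρ_s = ρ/s, the formulas it implements are:

$$P_0=\left[\sum_{i=0}^{s-1}\frac{\rho^i}{i!}+\frac{\rho^s}{s!\,(1-\rho_s)}\right]^{-1},\qquad L_q=\frac{\rho^s\rho_s}{s!\,(1-\rho_s)^2}P_0,\qquad L=L_q+\rho,\qquad W=\frac{L}{\lambda},\qquad W_q=\frac{L_q}{\lambda}$$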
function MMS()
% M/M/s queue: steady-state performance metrics
s = 2;        % number of servers
mu = 4;       % service rate per server (customers/hour)
lambda = 3;   % arrival rate (customers/hour)
ro = lambda/mu;
ros = ro/s;
sum1 = 0;
for i = 0:(s-1)
    sum1 = sum1 + ro^i/factorial(i);
end
sum2 = ro^s/factorial(s)/(1-ros);
p0 = 1/(sum1+sum2);                   % probability of an empty system
p = ro^s*p0/factorial(s)/(1-ros);     % probability an arrival must wait (Erlang C)
Lq = p*ros/(1-ros);                   % mean queue length
L = Lq + ro;                          % mean number in system
W = L/lambda;                         % mean time in system
Wq = Lq/lambda;                       % mean waiting time
fprintf('Mean number waiting in queue: %5.2f\n', Lq)
fprintf('Mean number in system: %5.2f\n', L)
fprintf('Mean time in system: %5.2f minutes\n', W*60)
fprintf('Mean waiting time: %5.2f minutes\n', Wq*60)
17.2 M/M/1 simulation (MATLAB)
function MM1()
clear
clc
%*****************************************
% initialize the customer source
%*****************************************
% total simulation time
Total_time = 10;
% maximum queue length
N = 100000;
% arrival and service rates
lambda = 10;  % arrivals per unit time (one hour)
mu = 6;       % customers served per server per hour
% mean interarrival and mean service times
arr_mean = 1/lambda;
ser_mean = 1/mu;
arr_num = round(Total_time*lambda*2);
events = [];
% exponentially distributed interarrival gaps
events(1,:) = exprnd(arr_mean,1,arr_num);
% arrival instants = cumulative sum of the gaps
events(1,:) = cumsum(events(1,:));
% exponentially distributed service times
events(2,:) = exprnd(ser_mean,1,arr_num);
% number of simulated customers, i.e. arrivals within the horizon
len_sim = sum(events(1,:) <= Total_time);
%*****************************************
% first customer
%*****************************************
% the first customer is served immediately, no waiting
events(3,1) = 0;
% departure instant = arrival instant + service time
events(4,1) = events(1,1)+events(2,1);
% certainly admitted; one customer in the system, so the flag is 1
events(5,1) = 1;
% index list of customers admitted so far
member = [1];
for i = 2:arr_num
    % stop once customer i arrives past the simulation horizon
    if events(1,i) > Total_time
        break;
    else
        number = sum(events(4,member) > events(1,i));
        % system full: reject customer i, flag set to 0
        if number >= N+1
            events(5,i) = 0;
        else
            if number == 0
                % system empty: customer i is served immediately, zero wait
                events(3,i) = 0;
                % departure = arrival + service time
                events(4,i) = events(1,i)+events(2,i);
                % flag set to 1
                events(5,i) = 1;
                member = [member,i];
            % someone is in service and the queue is not full: customer i joins
            else
                len_mem = length(member);
                % waiting time = previous customer's departure - own arrival
                events(3,i) = events(4,member(len_mem))-events(1,i);
                % departure = previous customer's departure + own service time
                events(4,i) = events(4,member(len_mem))+events(2,i);
                % flag records how many customers are now in the system
                events(5,i) = number+1;
                member = [member,i];
            end
        end
    end
end
% total customers that entered the system by the end of the simulation
len_mem = length(member);
%*****************************************
% output
%*****************************************
% arrival- and departure-instant step curves for all admitted customers
% (stairs: 2-D stair plot)
stairs([0 events(1,member)],0:len_mem);
hold on;
stairs([0 events(4,member)],0:len_mem,'.-r');
legend('arrival time','departure time');
hold off;
grid on;
% waiting-time and sojourn-time curves (plot: 2-D line plot)
figure;
plot(1:len_mem,events(3,member),'r-*',1:len_mem,events(2,member)+events(3,member),'k-');
legend('waiting time','sojourn time');
grid on;