# coding: utf-8
# In[ ]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.model_selection import train_test_split
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import *
import sklearn.ensemble
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
import itertools
import sys
from sklearn.feature_selection import f_classif
import warnings
from sklearn.externals import joblib
warnings.filterwarnings('ignore')
# --- Command-line arguments ---
# usage: python f-score_svm.py input.csv outputname cv_folds n_jobs distance
path = ""
inputname = sys.argv[1]                  # input CSV file name
outputname = sys.argv[2].split(".")[0]   # output tag (extension stripped)
name = outputname
# BUG FIX: argv values arrive as strings but are used as numbers below
# (cv folds, n_jobs, and the modulo step `len(select_list) % distance`),
# so convert them to int here.
cross_times = int(sys.argv[3])   # number of cross-validation folds
cpu_values = int(sys.argv[4])    # number of parallel jobs for GridSearchCV
distance = int(sys.argv[5])      # feature-selection step size (must be > 0)
# Example values for interactive testing:
# inputname="test.csv"
# outputname="test_2RFH"
# name=outputname
# cross_times=2
# cpu_values=12
# distance=10
# In[ ]:
def performance(labelArr, predictArr):
    """Compute confusion-matrix metrics for binary 0/1 labels.

    labelArr[i] is the actual value; predictArr[i] is the predicted value.
    Entries other than 0/1 are ignored.

    Returns: (precision, recall, SN, SP, GM, TP, TN, FP, FN)
      SN = sensitivity (TP rate), SP = specificity (TN rate),
      GM = geometric mean of recall and SP.
      Each ratio falls back to 0 when its denominator is zero.
    """
    TP = TN = FP = FN = 0.
    # Tally the four confusion-matrix cells.
    for actual, predicted in zip(labelArr, predictArr):
        if actual == 1:
            if predicted == 1:
                TP += 1.
            elif predicted == 0:
                FN += 1.
        elif actual == 0:
            if predicted == 1:
                FP += 1.
            elif predicted == 0:
                TN += 1.
    SN = TP / (TP + FN) if (TP + FN) != 0 else 0         # Sensitivity = TP/P, P = TP + FN
    SP = TN / (FP + TN) if (FP + TN) != 0 else 0         # Specificity = TN/N, N = TN + FP
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    GM = math.sqrt(recall * SP)
    # MCC is computed elsewhere via sklearn.metrics.matthews_corrcoef.
    return precision, recall, SN, SP, GM, TP, TN, FP, FN
# In[ ]:
"""
cross validation and f-score and xgboost
"""
# --- Load data and rank features by ANOVA F-score ---
datapath = path + inputname
classifier = "svm_f-score"
mode = "crossvalidation"
print("start")
# The CSV is expected to hold positive samples in the first half of the
# rows and negatives in the second half (no header, no index column).
train_data = pd.read_csv(datapath, header=None, index_col=None)
half = len(train_data) // 2
# Label vector: first half 1, second half 0.
# (Replaces the Python-2-only map/xrange construction; works on 2 and 3.)
Y = np.array([1] * half + [0] * half)
F, pval = f_classif(train_data, Y)      # ANOVA F-value per feature column
idx = np.argsort(F)
selected_list_ = idx[::-1]              # feature indices, best score first
F_sort_value = [F[e] for e in selected_list_]
print(F_sort_value)
print(selected_list_)
# Persist the ranked (score, dimension) table for inspection.
feature_dimension_and_scores = np.array([F_sort_value, selected_list_]).T
pd.DataFrame(feature_dimension_and_scores).to_excel(
    "feature_dimension_and_score.xlsx", index=False, header=["score", "dimension"])
print("After filtering Nan and inf")
# Drop features whose F-score is NaN or inf (e.g. zero-variance columns).
selected_list_ = [a for a, b in zip(selected_list_, F_sort_value)
                  if not (math.isnan(b) or math.isinf(b))]
print(selected_list_)
# --- Bookkeeping for the incremental feature-selection search ---
bestACC = 0
bestC = 0
bestgamma = 0
best_dimension = 0
row0 = [u'特征集', u'样本个数', u'分类器', u'Accuracy', u'Precision', u'Recall', u'SN', u'SP',
        u'Gm', u'F_measure', u'F_score', u'MCC', u'ROC曲线面积', u'tp', u'fn', u'fp', u'tn']
all_dimension_results = [row0]
select_list = []
best_savedata = ""
# First column of the probability table is the true label vector.
prediction_probability = [Y.astype(int)]
# --- Incremental feature selection with grid-searched RBF-SVM ---
# Add one ranked feature at a time; every `distance` features (and at the
# last feature) tune C/gamma via cross-validated grid search and record the
# out-of-fold performance of the current feature subset.
for select_num, temp_data in enumerate(selected_list_):
    train_data2 = train_data.values
    select_list.append(int(temp_data))
    # argv-derived parameters may still be strings; coerce once (idempotent).
    step = int(distance)
    folds = int(cross_times)
    jobs = int(cpu_values)
    if len(select_list) % step == 0 or len(select_list) == len(selected_list_):
        X = np.array(pd.DataFrame(train_data2).iloc[:, select_list])
        # Grid over powers of two; lists (not py2 map objects) so the grid
        # is reusable and valid on Python 3 as well.
        parameters = {'kernel': ['rbf'],
                      'C': [2 ** x for x in np.linspace(-2, 5, 7)],
                      'gamma': [2 ** x for x in np.linspace(-5, 2, 7)]}
        clf = GridSearchCV(svm.SVC(), parameters, cv=folds, n_jobs=jobs, scoring='accuracy')
        clf.fit(X, Y)
        C = clf.best_params_['C']
        gamma = clf.best_params_['gamma']
        print("the best C and gamma are:", C, gamma)
        # Out-of-fold predictions (labels and probabilities) with the
        # tuned hyper-parameters.
        y_predict = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma),
                                      X, Y, cv=folds, n_jobs=jobs)
        y_predict_prob = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma, probability=True),
                                           X, Y, cv=folds, n_jobs=jobs, method='predict_proba')
        prediction_probability.append(y_predict_prob[:, 1])  # P(class=1) column
        ROC_AUC_area = metrics.roc_auc_score(Y, y_predict)
        ACC = metrics.accuracy_score(Y, y_predict)
        precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
        F1_Score = metrics.f1_score(Y, y_predict)
        F_measure = F1_Score
        MCC = metrics.matthews_corrcoef(Y, y_predict)
        pos = TP + FN
        neg = FP + TN
        savedata = [str(select_num + 1),
                    "positive:" + str(pos) + "," + "negative:" + str(neg),
                    'svm', ACC, precision, recall, SN, SP, GM, F_measure,
                    F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN]
        # Track the subset with the best cross-validated accuracy.
        if ACC > bestACC:
            bestACC = ACC
            bestC = C
            bestgamma = gamma
            best_savedata = savedata
            best_dimension = X.shape[1]
        print(savedata)
        print(X.shape[1])
        all_dimension_results.append(savedata)
# --- Report the best subset, retrain on it, and persist all outputs ---
print("bestACC", bestACC)
print("bestC", bestC)
print("bestgamma", bestgamma)
print("best_dimension", best_dimension)
print("all_useful_dimensions_number", len(all_dimension_results))
# Retrain a final SVM on the best-scoring feature subset: the first
# `best_dimension` entries of the ranked feature list.
final_select_list = np.array(selected_list_)[:best_dimension]
final_X = pd.DataFrame(train_data.values).iloc[:, list(final_select_list)]
clf = svm.SVC(kernel='rbf', C=bestC, gamma=bestgamma)
clf.fit(final_X, Y)
joblib.dump(clf, path + classifier + "_" + mode + "_" + outputname + "_" + '.model')
# Column headers for the per-subset out-of-fold probability table.
temp = [str(e) for e in range(1, len(all_dimension_results))]
prediction_probability_list = ['labels'] + temp
prediction_probability = np.array(prediction_probability).T.tolist()
# BUG FIX: the original `prediction_probability[:][0] = ...` assigned into
# a throwaway copy and had no effect (labels were written as floats after
# the numpy upcast).  Convert the label entry of every row in place.
for row in prediction_probability:
    row[0] = int(row[0])
pd.DataFrame(prediction_probability).to_csv(
    path + classifier + "_" + mode + "_" + outputname + "_" + 'predict.csv',
    header=prediction_probability_list, index=False)
pd.DataFrame(all_dimension_results).to_excel(
    path + 'cross_validation_' + classifier + "_" + outputname + '.xlsx',
    sheet_name="_crossvalidation", header=None, index=False)
# Keep row 0 (header) plus the best subset's row.  BUG FIX: use integer
# division with int-coerced `distance` so the index is an int (the original
# `/` mixed an int with the argv string and would yield a float on py3).
best_row = int(best_dimension) // int(distance)
pd.DataFrame(np.array(all_dimension_results)[[0, best_row], :]).to_excel(
    path + 'cross_validation_' + classifier + "_" + outputname + "_" + 'best.xlsx',
    sheet_name="_crossvalidation", header=None, index=False)
# Usage example:
#   python f-score_svm.py input.csv outputname crossvalidation_values cpu_values distance
# 其中:
# - f-score_svm.py: 程序名字
# - input.csv: 输入文件,以csv结尾
# - outputname: 输出文件的标记
# - crossvalidation_values: 几折交叉验证,如十折交叉验证填:10
# - cpu_values: 多进程数,需要查清自己电脑有多少cpu,建议使用少于自己电脑cpu数的值,如有4个cpu, 可以填3
# - distance: 特征选择过程的幅度,一般填1,不可为0