The demo.sh file: passing in arguments
The demo.sh file provided by the author is a bash script intended to run on Linux.
#!/bin/bash
# This file trains all the models presented here.
echo "python scatter_net.py --data data/8_layer_tio2 --output_folder results/8_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 250 --percent_val .2 --patience 10"
python scatter_net.py --data data/8_layer_tio2 --output_folder results/8_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 250 --percent_val .2 --patience 10
echo "python scatter_net.py --data data/7_layer_tio2 --output_folder results/7_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 225 --percent_val .2 --patience 10"
python scatter_net.py --data data/7_layer_tio2 --output_folder results/7_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 225 --percent_val .2 --patience 10
echo "python scatter_net.py --data data/6_layer_tio2 --output_folder results/6_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 225 --percent_val .2 --patience 10"
python scatter_net.py --data data/6_layer_tio2 --output_folder results/6_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 225 --percent_val .2 --patience 10
echo "python scatter_net.py --data data/5_layer_tio2 --output_folder results/5_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 200 --percent_val .2 --patience 10"
python scatter_net.py --data data/5_layer_tio2 --output_folder results/5_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 200 --percent_val .2 --patience 10
echo "python scatter_net.py --data data/4_layer_tio2 --output_folder results/4_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 125 --percent_val .2 --patience 10"
python scatter_net.py --data data/4_layer_tio2 --output_folder results/4_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 125 --percent_val .2 --patience 10
echo "python scatter_net.py --data data/3_layer_tio2 --output_folder results/3_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 100 --percent_val .2 --patience 10"
python scatter_net.py --data data/3_layer_tio2 --output_folder results/3_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 100 --percent_val .2 --patience 10
echo "python scatter_net.py --data data/2_layer_tio2 --output_folder results/2_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 100 --percent_val .2 --patience 10"
python scatter_net.py --data data/2_layer_tio2 --output_folder results/2_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 100 --percent_val .2 --patience 10
The file trains models for nanoparticles with 2 to 8 alternating TiO2/silica layers. At the end of scatter_net.py, the author writes a dedicated argparse parser that parses the incoming arguments:
if __name__=="__main__":
    parser = argparse.ArgumentParser(description="Physics Net Training")
    parser.add_argument("--data",type=str,default='data/5_layer_tio2') # Where the data file is. Note: This assumes a file of _val.csv and .csv
    parser.add_argument("--reuse_weights",type=str,default='False') # Whether to load the weights or not. Note this just needs to be set to true, then the output folder directed to the same location.
    parser.add_argument("--output_folder",type=str,default='results/5_layer_tio2') # Where to output the results to. Note: No / at the end.
    parser.add_argument("--weight_name_load",type=str,default="") # This would be something that goes in front of w_1.txt. This would be used in saving the weights. In most cases, just leave this as is, it will naturally take care of it.
    parser.add_argument("--weight_name_save",type=str,default="") # Similar to above, but for saving now.
    parser.add_argument("--n_batch",type=int,default=100) # Batch size
    parser.add_argument("--numEpochs",type=int,default=5000) # Max number of epochs to run, if the patience condition is not met.
    parser.add_argument("--lr_rate",type=float,default=.001) # Learning rate.
    parser.add_argument("--lr_decay",type=float,default=.7) # Learning rate decay. It decays by this factor every epoch.
    parser.add_argument("--num_layers",default=4) # Number of layers in the network.
    parser.add_argument("--n_hidden",default=225) # Number of neurons per layer. Fully connected layers.
    parser.add_argument("--percent_val",default=.2) # Amount of the data to split for validation/test. The validation/test are both split equally.
    parser.add_argument("--patience",default=10) # Patience for stopping. If validation loss has not decreased in this many steps, it will stop the training.
    parser.add_argument("--compare",default='False') # Whether it should output the comparison or not.
    parser.add_argument("--sample_val",default='True') # Whether it should sample from validation or not, for the purposes of graphing.
    parser.add_argument("--spect_to_sample",type=int,default=300) # Zero indexing for this. Position in the data file to sample from (note it will take from validation)
    parser.add_argument("--matchSpectrum",default='False') # If it should match an already existing spectrum file.
    parser.add_argument("--match_test_file",default='results/2_layer_tio2/test_47.5_45.3') # Location of the file with the spectrum in it.
    parser.add_argument("--designSpectrum",default='False') # If it should
    parser.add_argument("--design_test_file",default='data/test_gen_spect.csv') # This is a file that should contain 0's and 1's where it should maximize and not maximize.
    args = parser.parse_args() # an object whose attributes hold the parsed values
    dict = vars(args) # vars() returns args's attributes as key-value pairs, i.e. a dict (note: the name shadows the built-in dict)
    print(dict)
    for key,value in dict.items():
        if (dict[key]=="False"):
            dict[key] = False
        elif dict[key]=="True":
            dict[key] = True
        try:
            if dict[key].is_integer():
                dict[key] = int(dict[key])
            else:
                dict[key] = float(dict[key])
        except:
            pass
    print (dict)
    # The loop above converts 'True'/'False' strings to bools and whole-number floats to ints;
    # values it cannot coerce (ints and plain strings, which have no .is_integer()) pass through unchanged.
    #Note that reuse MUST be set to true.
    if (dict['compare'] or dict['matchSpectrum'] or dict['designSpectrum']):
        if dict['reuse_weights'] != True:
            print("Reuse weights must be set true for comparison, matching, or designing. Setting it to true....")
            time.sleep(1)
            dict['reuse_weights'] = True
    kwargs = {
        'data':dict['data'],
        'reuse_weights':dict['reuse_weights'],
        'output_folder':dict['output_folder'],
        'weight_name_save':dict['weight_name_save'],
        'weight_name_load':dict['weight_name_load'],
        'n_batch':dict['n_batch'],
        'numEpochs':dict['numEpochs'],
        'lr_rate':dict['lr_rate'],
        'lr_decay':dict['lr_decay'],
        'num_layers':int(dict['num_layers']),
        'n_hidden':int(dict['n_hidden']),
        'percent_val':dict['percent_val'],
        'patienceLimit':dict['patience'],
        'compare':dict['compare'],
        'sample_val':dict['sample_val'],
        'spect_to_sample':dict['spect_to_sample'],
        'matchSpectrum':dict['matchSpectrum'],
        'match_test_file':dict['match_test_file'],
        'designSpectrum':dict['designSpectrum'],
        'design_test_file':dict['design_test_file']
    } # build the kwargs dict
    if kwargs['designSpectrum'] == True: # run the spectrum-design routine when requested
        design_spectrum(**kwargs) # the dict defined above is unpacked into keyword arguments
    elif kwargs['matchSpectrum'] == True: # run the spectrum-matching routine when requested
        match_spectrum(**kwargs)
    else:
        main(**kwargs) # main() trains the model
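Before dissecting this block, note the **kwargs unpacking in the last lines: the dict is expanded into keyword arguments. A minimal illustration of that mechanism (the train function below is a hypothetical stand-in for the repo's main()):

def train(data, n_batch, lr_rate):
    print(data, n_batch, lr_rate)

kwargs = {'data': 'data/5_layer_tio2', 'n_batch': 100, 'lr_rate': .0006}
train(**kwargs)  # equivalent to train(data='data/5_layer_tio2', n_batch=100, lr_rate=.0006)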
To understand this block, let's print a few intermediate results:
- print(args) yields a Namespace object. Python uses something called namespaces to keep track of variables: a namespace is a dictionary whose keys are the variable names and whose values are those variables' values.
Namespace(compare='False', data='data/5_layer_tio2', designSpectrum='False', design_test_file='data/test_gen_spect.csv', lr_decay=0.7, lr_rate=0.001, matchSpectrum='False', match_test_file='results/2_layer_tio2/test_47.5_45.3', n_batch=100, n_hidden=225, numEpochs=5000, num_layers=4, output_folder='results/5_layer_tio2', patience=10, percent_val=0.2, reuse_weights='False', sample_val='True', spect_to_sample=300, weight_name_load='', weight_name_save='')
- print(vars(args)) uses the vars() function to render the args object as a standard dict:
{'sample_val': 'True', 'weight_name_load': '', 'compare': 'False', 'match_test_file': 'results/2_layer_tio2/test_47.5_45.3', 'patience': 10, 'numEpochs': 5000, 'design_test_file': 'data/test_gen_spect.csv', 'matchSpectrum': 'False', 'n_batch': 100, 'spect_to_sample': 300, 'output_folder': 'results/5_layer_tio2', 'n_hidden': 225, 'percent_val': 0.2, 'designSpectrum': 'False', 'num_layers': 4, 'lr_rate': 0.001, 'weight_name_save': '', 'reuse_weights': 'False', 'data': 'data/5_layer_tio2', 'lr_decay': 0.7}
- After processing the dict, the values have been converted to suitable bool, int, or float types; for example, 'True' arrives as a str and becomes the bool True:
{'sample_val': True, 'weight_name_load': '', 'compare': False, 'match_test_file': 'results/2_layer_tio2/test_47.5_45.3', 'patience': 10, 'numEpochs': 5000, 'design_test_file': 'data/test_gen_spect.csv', 'matchSpectrum': False, 'n_batch': 100, 'spect_to_sample': 300, 'output_folder': 'results/5_layer_tio2', 'n_hidden': 225, 'percent_val': 0.2, 'designSpectrum': False, 'num_layers': 4, 'lr_rate': 0.001, 'weight_name_save': '', 'reuse_weights': False, 'data': 'data/5_layer_tio2', 'lr_decay': 0.7}
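To see exactly what the loop does and does not convert, here is a standalone sketch of the same coercion logic (coerce is a hypothetical helper, not part of scatter_net.py):

def coerce(value):
    # mirrors the conversion loop in scatter_net.py's __main__ block
    if value == "False":
        return False
    if value == "True":
        return True
    try:
        if value.is_integer():      # only floats have .is_integer()
            return int(value)       # e.g. 4.0 -> 4
        return float(value)         # e.g. 0.2 stays 0.2
    except AttributeError:
        return value                # ints and strings pass through unchanged

print(coerce("True"))               # True (str -> bool)
print(coerce(4.0))                  # 4 (whole float -> int)
print(coerce("data/5_layer_tio2"))  # unchanged: strings have no .is_integer()

Note that numeric strings are not converted here; that is why the kwargs dict later applies int() explicitly to num_layers and n_hidden.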
Passing arguments on Linux
On Linux, the script passes arguments as keyword options:
python scatter_net.py --data data/5_layer_tio2 --output_folder results/5_layer_tio2 --n_batch 100 --numEpochs 5000 --lr_rate .0006 --lr_decay .99 --num_layers 4 --n_hidden 250 --percent_val .2 --patience 10
- --data data/5_layer_tio2: where the data files live;
- --output_folder results/5_layer_tio2: where the results are written;
- --n_batch 100: a batch size of 100;
- --numEpochs 5000: the maximum number of epochs;
- --lr_rate .0006: the learning rate;
- --lr_decay .99: the learning-rate decay factor;
- --num_layers 4: four network layers;
- --n_hidden 250: 250 neurons per layer;
- --percent_val .2: the fraction of data set aside for validation/test;
- --patience 10: the early-stopping condition.
With these arguments, main(**kwargs) runs; any argument not passed in keeps its default value.
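If you want to check what one of these command lines parses to without running demo.sh, argparse accepts an explicit argument list. A trimmed-down sketch of the repo's parser (only three of its options are reproduced here):

import argparse

parser = argparse.ArgumentParser(description="Physics Net Training")
parser.add_argument("--data", type=str, default='data/5_layer_tio2')
parser.add_argument("--n_batch", type=int, default=100)
parser.add_argument("--lr_rate", type=float, default=.001)

args = parser.parse_args("--data data/8_layer_tio2 --lr_rate .0006".split())
print(args.data, args.n_batch, args.lr_rate)  # data/8_layer_tio2 100 0.0006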
Data handling in main()
On to the main function (scatter_net.py, line 167). Its first part again deals with file names, so we pick up at line 189 (#getting the data); the most important call is at line 191:
train_X, train_Y , test_X, test_Y, val_X, val_Y , x_mean, x_std = get_data(data,percentTest=percent_val)
This calls get_data(), which is defined in scatter_net_core.py:
import numpy as np
from sklearn.model_selection import train_test_split

def get_data(data,percentTest=.2,random_state=42):
    x_file = data+"_val.csv"
    y_file = data+".csv"
    train_X = np.genfromtxt(x_file,delimiter=',') #[0:20000,:]
    train_Y = np.transpose(np.genfromtxt(y_file,delimiter=',')) #[0:20000,:]
    train_x_mean = train_X.mean(axis=0) # train_X is an array; per-column (per-feature) mean over all samples
    train_x_std = train_X.std(axis=0) # per-column standard deviation
    train_X = (train_X-train_X.mean(axis=0))/train_X.std(axis=0) # standardize each input feature to zero mean and unit variance
    X_train, test_X, y_train, test_Y = train_test_split(train_X,train_Y,test_size=float(percentTest),random_state=random_state)
    # split the raw train_X, train_Y into train and test parts at the given ratio
    X_test, X_val, y_test, y_val = train_test_split(test_X,test_Y,test_size=.5,random_state=random_state)
    # split the held-out part 50/50 again: one half becomes the test set, the other the validation set (X_val, y_val)
    return X_train, y_train, X_test, y_test, X_val, y_val, train_x_mean, train_x_std
For each configuration (e.g. 5_layer), the data folder contains both a "_val.csv" and a ".csv" file. The data folder's README reads:
These are all the data files used in the paper.
Note all these were generated using the "ScatterNet_Matlab" directory here in the repository. Be cautious of the order of the harmonics - as the particle gets more layers, more orders must be added to compensate for more modes.
Directory:
Data for n layer particle with alternating silica/TiO2 shells:
n_layer_tio2.csv
n_layer_tio2_val.csv
The _val file indicates what the values of the thicknesses are (in nanometers). The other file - 2_layer_tio2.csv - indicates the values of the spectrum for each corresponding particle. That is, the first line in 2_layer_tio2 corresponds to the first line in 2_layer_tio2_val.
The 2 layer particle has 30k records. The 3,4,5,6,7 layer has 40k. The 8 layer has 50k.
Data for 3 layer jaggregate particle:
jagg_layer_tio2.csv
jagg_layer_tio2_val.csv
Same format as above. The _val file indicates the thickness of the metallic silver core, dielectric layer of silica, and outside layer of the J-Aggregate dye respectively. The last number is the tuned resonance for the J-Aggregate dye.
As described above:
- "_val.csv" stores the input-layer values (X_train): an m × n matrix, where m is the number of training samples and n is the number of input neurons (for 5_layer, n is 5).
- ".csv" stores the output-layer values (Y_train): an m × n matrix, where m is the number of output neurons (here, the discrete frequency points) and n is the number of samples; this is why get_data() transposes it. A shape sanity check is sketched below.
Building the neural network in main()
Weight initialization
First the code, starting at line 200 of scatter_net.py:
x_size = train_X.shape[1] # number of input-layer neurons
y_size = train_Y.shape[1] # number of output-layer neurons
# Symbols
X = tf.placeholder("float", shape=[None, x_size]) # tf.placeholder reserves a 2-D float tensor with x_size columns and an as-yet-unknown number of rows
y = tf.placeholder("float", shape=[None, y_size])
weights = []
biases = []
# Weight initializations
if reuse_weights:
    (weights, biases) = load_weights(output_folder,weight_name_load,num_layers)
    # if reuse_weights is True, load the weights straight from output_folder; when training a model it defaults to False, so the else branch below runs
else:
    for i in xrange(0,num_layers): # each layer's initial weights and biases come from init_weights/init_bias
        if i ==0:
            weights.append(init_weights((x_size,n_hidden)))
        else:
            weights.append(init_weights((n_hidden,n_hidden)))
        biases.append(init_bias(n_hidden))
    weights.append(init_weights((n_hidden,y_size)))
    biases.append(init_bias(y_size))
The input (X) and output (y) above are declared with placeholders; for details on how placeholders work, see: https://www.jianshu.com/p/e4ff91317f7e
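A minimal placeholder round trip in the same TF1 style (a toy sketch, not code from the repo):

import numpy as np
import tensorflow as tf

X = tf.placeholder("float", shape=[None, 5])  # row count stays open until feed time
doubled = X * 2.0
with tf.Session() as sess:
    out = sess.run(doubled, feed_dict={X: np.ones((3, 5))})
    print(out.shape)  # (3, 5): the None dimension became 3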
The initialization uses two functions, init_weights() and init_bias(), which are defined in scatter_net_core.py as follows:
import tensorflow as tf

#As per Xavier init, this should be 2/n(input), though many different initializations can be tried.
def init_weights(shape,stddev=.1):
    """ Weight initialization """
    weights = tf.random_normal(shape, stddev=stddev)
    return tf.Variable(weights)

def init_bias(shape, stddev=.1):
    """ Bias initialization """
    biases = tf.random_normal([shape], stddev=stddev)
    return tf.Variable(biases)
Both simply initialize from a normal distribution via tf.random_normal(). Many weight-initialization schemes exist; for a survey, see: https://zhuanlan.zhihu.com/p/25110150.
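The comment in scatter_net_core.py suggests scaling the stddev by 2/n(input), even though the code uses a fixed stddev of 0.1. A sketch of what that scaled variant could look like (an alternative for illustration, not what the repo actually does):

import numpy as np
import tensorflow as tf

def init_weights_scaled(shape):
    stddev = np.sqrt(2.0 / shape[0])  # variance 2/n_in, per the source comment
    return tf.Variable(tf.random_normal(shape, stddev=stddev))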
Forward propagation
With the network structure, the initial values, and the input layer in hand, we can build the forward network:
# Forward propagation
yhat = forwardprop(X, weights,biases,num_layers)
Here the author defines a forwardprop function in scatter_net.py:
def forwardprop(X, weights, biases, num_layers, dropout=False, minLimit=None, maxLimit=None):
    if minLimit is not None:
        X = tf.maximum(X, minLimit)
        X = tf.minimum(X, maxLimit)
    htemp = None
    for i in xrange(0, num_layers):
        if i ==0:
            htemp = tf.nn.relu(tf.add(tf.matmul(X, weights[i]), biases[i]))
        else:
            htemp = tf.nn.relu(tf.add(tf.matmul(htemp, weights[i]), biases[i]))
    yval = tf.add(tf.matmul(htemp, weights[-1]), biases[-1])
    return yval
The activation function is ReLU; the output layer is linear (no activation).
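For clarity, the same forward pass written in plain NumPy (an illustrative sketch, not code from the repo):

import numpy as np

def forward_numpy(X, weights, biases):
    h = X
    for W, b in zip(weights[:-1], biases[:-1]):
        h = np.maximum(0.0, h @ W + b)   # ReLU hidden layers
    return h @ weights[-1] + biases[-1]  # linear output layer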
Backward propagation
# Backward propagation
dif = tf.abs(y-yhat)
peroff = tf.reduce_mean(dif/tf.abs(y)) # mean relative ("percent off") error
cost = tf.reduce_mean(tf.square(y-yhat)) # mean squared error loss
global_step = tf.Variable(0, trainable=False)
print("LR Rate: " , lr_rate) # learning rate
print(int(train_X.shape[0]/n_batch))
print(lr_decay) # learning rate decay; it decays by this factor every epoch
print("--done--")
learning_rate = tf.train.exponential_decay(lr_rate,global_step,int(train_X.shape[0]/n_batch),lr_decay,staircase=False) # the learning rate is updated with exponential decay
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost,global_step=global_step) # RMSProp updates the weights and biases
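tf.train.exponential_decay computes lr_rate * lr_decay ** (global_step / decay_steps); with staircase=False the exponent is fractional, so the rate shrinks a little at every step rather than once per epoch. A quick check with the demo.sh settings (assuming the 5-layer set: 40k records, 80% of them training data, so 320 batches per epoch):

def decayed_lr(lr_rate, lr_decay, global_step, decay_steps):
    return lr_rate * lr_decay ** (global_step / float(decay_steps))

print(decayed_lr(.0006, .99, 320, 320))  # ~0.000594: one full decay after one epoch
print(decayed_lr(.0006, .99, 160, 320))  # ~0.000597: halfway through the first epoch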
Training
What remains is a loop that carries out the training:
#Now do the training.
step =0; curEpoch =0; cum_loss =0; perinc = 0;
lowVal = 1000000.0 # Just make this some high number. lowVal tracks the best validation loss seen so far;
# if the validation loss fails to improve on it for patience (=10) consecutive checks, training stops.
start_time=time.time()
# A tf.Session is TensorFlow's execution context;
# session.run() evaluates whichever ops or tensors you ask for.
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    if (compare): # Just run a comparison. In demo.sh this defaults to False, so we skip it for now.
        x_set = train_X
        y_set = train_Y
        if sample_val:
            x_set = val_X
            y_set = val_Y
        batch_x = x_set[spect_to_sample : (spect_to_sample+1) ]
        batch_y = y_set[spect_to_sample : (spect_to_sample+1) ]
        mycost = sess.run(cost,feed_dict={X:batch_x,y:batch_y})
        myvals0 = sess.run(yhat,feed_dict={X:batch_x,y:batch_y})
        outputSpectsToFile(output_folder,spect_to_sample,batch_x,batch_y,myvals0,mycost,x_mean,x_std)
        return
    print("======== Iterations started ========")
    while curEpoch < numEpochs: # loop over epochs; once every sample has been seen, curEpoch += 1
        batch_x = train_X[step * n_batch : (step+1) * n_batch] # n_batch is the batch size, i.e. samples per batch; the default is 100
        batch_y = train_Y[step * n_batch : (step+1) * n_batch]
        peroffinc, cuminc, _ = sess.run([peroff,cost,optimizer], feed_dict={X: batch_x, y: batch_y})
        cum_loss += cuminc # cuminc (the cost) is the mean loss over the batch
        perinc += peroffinc
        step += 1 # step increments once per batch, i.e. one iteration
        #End of each epoch.
        if step == int(train_X.shape[0]/n_batch): # train_X.shape[0]/n_batch is the number of batches; when step reaches it, one full pass (epoch) is done
            curEpoch +=1
            cum_loss = cum_loss/float(step) # mean loss over all batches in this epoch
            perinc = perinc/float(step)
            step = 0
            train_loss_file.write(str(float(cum_loss))+"," + str(perinc) + str("\n"))
            # Every 10 epochs, do a validation (note: the original code feeds the test split here).
            if (curEpoch % 10 == 0 or curEpoch == 1):
                val_loss, peroff2 = sess.run([cost,peroff],feed_dict={X:test_X,y:test_Y})
                val_loss_file.write(str(float(val_loss))+","+str(peroff2)+str("\n"))
                val_loss_file.flush()
                train_loss_file.flush()
                if (val_loss > lowVal):
                    patience += 1
                else:
                    patience = 0
                lowVal = min(val_loss,lowVal)
                # print once every 10 epochs
                print("Validation loss: " , str(val_loss) , " per off: " , peroff2)
                print("Epoch: " + str(curEpoch+1) + " : Loss: " + str(cum_loss) + " : " + str(perinc))
                if (patience > patienceLimit):
                    print("Reached patience limit. Terminating")
                    break
            cum_loss = 0
            perinc = 0
    # Save the final weights and biases.
    save_weights(weights,biases,output_folder,weight_name_save,num_layers)
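The early-stopping rule embedded in the loop, distilled into a standalone sketch (a hypothetical helper with the same logic; remember the check runs only every 10 epochs):

def should_stop(val_losses, patience_limit=10):
    best = float("inf")  # plays the role of lowVal
    patience = 0
    for v in val_losses:
        if v > best:     # no improvement on the best loss seen so far
            patience += 1
        else:
            patience = 0
        best = min(v, best)
        if patience > patience_limit:
            return True
    return False

print(should_stop([5.0, 4.0, 3.0] + [3.1] * 11))  # True: 11 checks without improvement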
- Much has been written about batch size, epochs, and iterations. In short, these terms only matter when the dataset is large (in machine learning, that is nearly always), because the data cannot be fed to the machine in one go; instead it is split into small batches that are passed in one after another, with the network weights updated at the end of each step to fit the given data. See: https://www.jianshu.com/p/005d05e18c7d. The arithmetic below puts the demo.sh numbers together.
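Assuming the 5-layer dataset (40k records, of which 80% are used for training):

n_train, n_batch, numEpochs = 32000, 100, 5000
iters_per_epoch = n_train // n_batch     # 320 iterations (batches) per epoch
max_iters = iters_per_epoch * numEpochs  # 1,600,000 iterations at the absolute most
print(iters_per_epoch, max_iters)        # early stopping usually ends training far sooner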