enumerate
一般情况下对一个列表或数组既要遍历索引又要遍历元素时,会这样写:
for i in range (0,len(list)):
print i ,list[i]
但是这种方法有些累赘,使用内置enumerate函数会有更加直接、优美的做法,先看看enumerate的用法:
for index, text in enumerate(list):
print index ,text
magic word %store
You can exchange the variable between different jupyter notebooks.
%store y_test
And you can restore this variable in another jupyter notebook, like:
%store -r y_test
Immutability and mutable variables
I have not found a better way to make a mutable variable immutable, so I convert the list into a tuple, which is an immutable equivalent of a list.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

################################################################################
# Split the training data into `num_folds` folds: X_train_folds and
# y_train_folds each have length num_folds, where y_train_folds[i] is the
# label vector for the points in X_train_folds[i] (np.array_split).
################################################################################
# np.array_split already returns a list of ndarrays, so the empty-list
# initializations in the original were dead code.  The containers are frozen
# as tuples so the cross-validation loop cannot mutate them in place; each
# iteration takes its own mutable copy before carving out the validation fold.
X_train_folds = tuple(np.array_split(X_train, num_folds))
y_train_folds = tuple(np.array_split(y_train, num_folds))

################################################################################
# END OF YOUR CODE                                                             #
################################################################################
# A dictionary holding the accuracies for different values of k found when
# running cross-validation: k_to_accuracies[k] will be a list of length
# num_folds giving the accuracy on each validation fold for that value of k.
k_to_accuracies = {}
################################################################################
# k-fold cross-validation: for each candidate k, train on num_folds - 1 folds
# and evaluate on the held-out fold, storing one accuracy per fold in
# k_to_accuracies[k].
################################################################################
for k in k_choices:
    fold_accuracies = []
    for fold in range(num_folds):
        # Fold `fold` is the validation set; concatenate the rest for training.
        # Slicing + concatenation works for any num_folds (the original code
        # hard-coded a 4-way np.vstack, so it only worked for num_folds == 5)
        # and works whether the fold containers are lists or tuples.
        X_val = X_train_folds[fold]
        y_val = y_train_folds[fold]
        X_cv_train = np.concatenate(
            X_train_folds[:fold] + X_train_folds[fold + 1:])
        y_cv_train = np.concatenate(
            y_train_folds[:fold] + y_train_folds[fold + 1:])
        # Re-train on the remaining folds before predicting: the original code
        # built X_realtrain but never handed it to the classifier, so every
        # fold was evaluated against stale training data.
        classifier.train(X_cv_train, y_cv_train)
        distances = classifier.compute_distances_no_loops(X_val)
        y_val_pred = classifier.predict_labels(distances, k=k)
        # Accuracy over the validation fold.  The original divided the number
        # of correct predictions by num_test (the size of the held-out *test*
        # set), which is the wrong denominator for a validation fold.
        fold_accuracies.append(float(np.mean(y_val_pred == y_val)))
    # The manual `i += 1` / `j += 1` counters in the original were no-ops
    # inside `for` loops and have been dropped.
    k_to_accuracies[k] = fold_accuracies
################################################################################
# END OF YOUR CODE                                                             #
################################################################################
################################################################################
# END OF YOUR CODE #
################################################################################
# Report the cross-validation results, grouped by k in ascending order.
for k, fold_accuracies in sorted(k_to_accuracies.items()):
    for fold_accuracy in fold_accuracies:
        print('k = %d, accuracy = %f' % (k, fold_accuracy))
L2 distance 3 ways
two loop version
# Two-loop version: fill dists one (test point, train point) pair at a time.
# NOTE(review): this stores the *squared* L2 distance (no sqrt).  Since sqrt
# is monotonic, nearest-neighbor rankings are unaffected — confirm callers
# only use dists for ranking.
for i in xrange(num_test):  # Python 2 `xrange`; use range() on Python 3
    for j in xrange(num_train):
        # Squared Euclidean distance between the ith test point and the
        # jth training point, summed over the feature dimension.
        dists[i, j] = (((self.X_train[j] - X[i])**2).sum(0))
return dists
one loop version
# One-loop version: broadcasting X[i] against the whole training matrix
# computes the entire ith row of dists in one vectorized expression.
for i in xrange(num_test):  # Python 2 `xrange`; use range() on Python 3
    # Squared L2 distance (no sqrt) from test point i to every training point.
    dists[i,:] = np.sum((self.X_train - X[i])**2, axis = 1)
    pass  # no-op scaffold statement retained from the original
return dists
no loop version
# No-loop version via the expansion (x - y)^2 = x^2 - 2xy + y^2, summed over
# the feature dimension: the -2xy term is a (num_test, num_train) matrix
# product; the train norms y^2 broadcast along each row, and the test norms
# x^2 (reshaped to a column with np.newaxis) broadcast along each column.
dists = -2*np.dot(X, self.X_train.T) + np.sum(self.X_train**2, axis = 1) \
        +np.sum(X**2, axis = 1)[:, np.newaxis]