cs231n assignment1 SVM 完整代码


  • 这里的SVM指多类线性SVM分类器:用一个权重矩阵W把每张图像的像素向量线性映射成各个类别的得分,再用hinge loss(合页损失)来训练W

  • 第一步:随机生成一个W矩阵(10×3073),用这个矩阵将training图像集(每张图像3072个像素,再加一维恒为1的偏置项,即(3073,49000))转化成(10,49000)的得分矩阵,每一行对应十种类别中的一种

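  • 第二步:计算loss function和W的梯度,简单而言就是对
    $L = \frac{1}{N}\sum_i L_i + \lambda R(W)$ 求导,其中 $L_i$ 为
    $L_i = \sum_{j \neq y_i} \max\bigl(0,\ (x_i W)_j - (x_i W)_{y_i} + \Delta\bigr)$,看起来十分复杂,其实就是分为 $j = y_i$ 和 $j \neq y_i$ 两种情况,两者的梯度只差一个正负号(取 $\Delta = 1$,$R(W) = \sum W^2$):
    $\nabla_{W_{y_i}} L_i = -x_i^T \sum_{j \neq y_i} \mathbb{1}\bigl(x_i W_j - x_i W_{y_i} + 1 > 0\bigr) + 2\lambda W_{y_i}$
    $\nabla_{W_j} L_i = x_i^T\, \mathbb{1}\bigl(x_i W_j - x_i W_{y_i} + 1 > 0\bigr) + 2\lambda W_j \quad (j \neq y_i)$
    (注意:下面的代码把正则项写成 0.5·reg·∑W²,所以代码里正则项的梯度是 reg·W 而不是 2λW)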

  • 第三步:利用 W -= learning_rate * dW 来迭代减少loss function直到loss function收敛,这里用的是SGD(随机梯度下降),即设定迭代步数(num_iters)和每一步随机抽取的training data数量(batch_size)

  • 第四步:利用训练好的W来预测test data的准确率,大概都是在40%左右,调整步长和正则化大小可以适量提升准确率

  • linear_classifier

    import numpy as np
    from cs231n.classifiers.linear_svm import *
    from cs231n.classifiers.softmax import *

    class LinearClassifier:
        """Linear classifier trained with minibatch stochastic gradient descent.

        Data are stored column-wise: ``X`` is D x N (one example per column)
        and the weight matrix ``W`` is K x D, so class scores are ``W.dot(X)``.
        Subclasses provide the objective by overriding :meth:`loss`.
        """

        def __init__(self):
            # K x D weight matrix; created lazily on the first call to train().
            self.W = None

        def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
                  batch_size=200, verbose=False):
            """Optimize ``self.W`` with minibatch SGD.

            Inputs:
            - X: D x N array of training data; each column is a data point.
            - y: 1-dimensional array of length N with labels 0...K-1.
            - learning_rate: (float) step size of each update.
            - reg: (float) regularization strength passed to ``loss``.
            - num_iters: (int) number of SGD steps.
            - batch_size: (int) number of examples sampled per step.
            - verbose: (bool) if true, print the loss every 100 iterations.

            Returns: a list with the minibatch loss of every iteration.
            """
            dim, num_train = X.shape
            # assume y takes values 0...K-1 where K is number of classes
            num_classes = np.max(y) + 1
            if self.W is None:
                # lazily initialize W with small random values
                self.W = np.random.randn(num_classes, dim) * 0.001

            # Sampling without replacement cannot draw more indices than there
            # are examples, so fall back to sampling with replacement then.
            with_replacement = batch_size > num_train

            loss_history = []
            for it in range(num_iters):
                sample_index = np.random.choice(num_train, batch_size,
                                                replace=with_replacement)
                X_batch = X[:, sample_index]
                y_batch = y[sample_index]

                # evaluate loss and gradient on the minibatch
                loss, grad = self.loss(X_batch, y_batch, reg)
                # record the loss so the caller can plot/inspect convergence
                loss_history.append(loss)

                # gradient descent step
                self.W -= learning_rate * grad

                if verbose and it % 100 == 0:
                    print('iteration %d / %d: loss %f' % (it, num_iters, loss))

            return loss_history

        def predict(self, X):
            """Predict labels for the columns of ``X``.

            Inputs:
            - X: D x N array of data; each column is a data point.

            Returns: 1-dimensional array of length N holding, for each column,
            the index of the class with the highest score.
            """
            return np.argmax(np.dot(self.W, X), axis=0)

        def loss(self, X_batch, y_batch, reg):
            """Compute the loss function and its derivative.

            Subclasses will override this.

            Inputs:
            - X_batch: D x N array of data; each column is a data point.
            - y_batch: 1-dimensional array of length N with labels 0...K-1,
              for K classes.
            - reg: (float) regularization strength.

            Returns: A tuple containing:
            - loss as a single float
            - gradient with respect to self.W; an array of the same shape as W
            """
            raise NotImplementedError('subclasses must implement loss()')

    class LinearSVM(LinearClassifier):
        """Linear classifier whose objective is the multiclass SVM loss."""

        def loss(self, X_batch, y_batch, reg):
            """Return the ``svm_loss_vectorized`` result for this minibatch.

            The current weights ``self.W`` are passed together with the batch
            data, the batch labels and the regularization strength.
            """
            weights = self.W
            return svm_loss_vectorized(weights, X_batch, y_batch, reg)

    class Softmax(LinearClassifier):
        """Linear classifier whose objective is Softmax + cross-entropy loss."""

        def loss(self, X_batch, y_batch, reg):
            """Return the ``softmax_loss_vectorized`` result for this minibatch.

            The current weights ``self.W`` are passed together with the batch
            data, the batch labels and the regularization strength.
            """
            weights = self.W
            return softmax_loss_vectorized(weights, X_batch, y_batch, reg)
  • linear_svm
        import numpy as np
        from random import shuffle

    def svm_loss_naive(W, X, y, reg):
        """Structured SVM loss function, naive implementation (with loops).

        Inputs:
        - W: K x D array of weights (10 x 3073 for CIFAR-10 with a bias row).
        - X: D x N array of data; each column is one D-dimensional example
          (3073 x 49000 for the CIFAR-10 training split).
        - y: 1-dimensional array of length N with labels 0...K-1.
        - reg: (float) regularization strength.

        Returns a tuple of:
        - loss as single float
        - gradient with respect to weights W; an array of same shape as W
        """
        dW = np.zeros(W.shape)  # initialize the gradient as zero

        num_classes = W.shape[0]
        num_train = X.shape[1]
        loss = 0.0
        for i in range(num_train):
            x_i = X[:, i]
            scores = W.dot(x_i)  # one score per class for example i
            correct_class_score = scores[y[i]]
            for j in range(num_classes):
                if j == y[i]:
                    # the true class contributes no margin term
                    continue
                margin = scores[j] - correct_class_score + 1  # delta = 1
                if margin > 0:
                    # Only violated margins affect the loss and the gradient:
                    # push the true-class row toward x_i and row j away.
                    loss += margin
                    dW[y[i], :] -= x_i
                    dW[j, :] += x_i

        # The sums run over all training examples; turn them into averages.
        loss /= num_train
        dW /= num_train

        # L2 regularization: 0.5 * reg * sum(W^2), whose gradient is reg * W.
        loss += 0.5 * reg * np.sum(W * W)
        dW += reg * W

        return loss, dW

     def svm_loss_vectorized(W, X, y, reg):
         """Structured SVM loss function, vectorized implementation.

         Inputs:
         - W: K x D array of weights.
         - X: D x N array of data; each column is one example.
         - y: 1-dimensional integer array of length N with labels 0...K-1.
         - reg: (float) regularization strength.

         Returns a tuple of:
         - loss as single float
         - gradient with respect to weights W; an array of same shape as W
         """
         num_train = X.shape[1]
         cols = np.arange(num_train)

         # ---- loss ----
         scores = W.dot(X)                     # K x N class scores
         correct_score = scores[y, cols]       # true-class score per example
         # hinge on every (class, example) pair, delta = 1
         margins = np.maximum(0.0, scores - correct_score + 1.0)
         margins[y, cols] = 0.0                # the true class is not counted
         loss = np.sum(margins) / num_train
         loss += 0.5 * reg * np.sum(W * W)

         # ---- gradient ----
         # 1 where a margin is violated; the true-class entry of each column
         # gets minus the number of violations in that column.
         indicator = (margins > 0).astype(margins.dtype)
         indicator[y, cols] = -np.sum(indicator, axis=0)

         dW = np.zeros(W.shape)
         dW += np.dot(indicator, X.T) / num_train + reg * W

         return loss, dW
  • svm.py
        import random
    import numpy as np
    from cs231n.data_utils import load_CIFAR10
    import matplotlib.pyplot as plt
    import time

    # Default appearance for any figures drawn later in the script.
    plt.rcParams['figure.figsize'] = (10.0, 8.0)  # set default size of plots
    plt.rcParams['image.interpolation'] = 'nearest'
    plt.rcParams['image.cmap'] = 'gray'

    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    num_training = 49000
    num_validation = 1000
    num_test = 1000

    # Validation split: the num_validation examples right after the training
    # examples. Slices select the same rows as an index list without first
    # copying them.
    val_end = num_training + num_validation
    X_val = X_train[num_training:val_end]
    y_val = y_train[num_training:val_end]

    # Training split: the first num_training examples.
    X_train = X_train[:num_training]
    y_train = y_train[:num_training]

    # Test split: the first num_test examples of the test set.
    X_test = X_test[:num_test]
    y_test = y_test[:num_test]

    # Flatten every image into one row vector.
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))

    print('Training data shape:  %s' % (X_train.shape,))
    print('Validation data shape:  %s' % (X_val.shape,))
    print('Test data shape:  %s' % (X_test.shape,))

    # Per-pixel mean over the training images (one value per column of X_train).
    mean_image = np.mean(X_train, axis=0)
    print(mean_image[:10])

    # Centre every split with the *training* mean.
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    print('%s %d' % (X_train.shape, X_train.shape[0]))
    print("np.ones((X_train.shape[0],1))'s shape is %s"
          % (np.ones((X_train.shape[0], 1)).shape,))

    # Bias trick: append a constant 1 to every example, then transpose so that
    # each column is one example (the D x N layout the classifiers expect).
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))]).T
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))]).T
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))]).T

    print('%s %s %s' % (X_train.shape, X_val.shape, X_test.shape))

    # from cs231n.classifiers.linear_svm import svm_loss_naive
    # W = np.random.randn(10,3073)*0.0001
    # print W.shape
    # loss,grad = svm_loss_naive(W,X_train,y_train,0.00001)
    # print 'loss:%f' %(loss,)
    # loss,grad = svm_loss_naive(W,X_train,y_train,0.0)

    # from cs231n.gradient_check import  grad_check_sparse
    # f = lambda w:svm_loss_naive(W,X_train,y_train,0.0)[0]
    # grad_numerical = grad_check_sparse(f,W,grad,10)
    # tic = time.time()
    # loss_native,grad_native = svm_loss_naive(W,X_train,y_train,0.00001)
    # toc = time.time()
    # print 'Naive loss:%e computed in %fs' %(loss_native,toc-tic)
    # from cs231n.classifiers.linear_svm import svm_loss_vectorized
    # tic = time.time()
    # loss_vectorized,_ = svm_loss_vectorized(W,X_train,y_train,0.00001)
    # toc = time.time()
    # print 'Vectorized loss: %e computed in %fs' %(loss_vectorized,toc-tic)
    # print 'difference: %f' %(loss_native - loss_vectorized)
    # tic = time.time()
    # _,grad_naive = svm_loss_naive(W,X_train,y_train,0.00001)
    # toc = time.time()
    # print 'Naive loss and gradient:computed in %fs' %(toc-tic)
    # tic = time.time()
    # _, grad_vectorized = svm_loss_vectorized(W, X_train, y_train, 0.00001)
    # toc = time.time()
    # print 'Vectorized loss and gradient: computed in %fs' % (toc - tic)
    # difference = np.linalg.norm(grad_naive-grad_vectorized,ord='fro')
    # print 'difference:%f' %difference

    from cs231n.classifiers import LinearSVM

    # Train one SVM with fixed hyperparameters and time the run.
    svm = LinearSVM()
    tic = time.time()
    loss_hist = svm.train(X_train, y_train, learning_rate=1e-7, reg=5e4,
                          num_iters=1500, verbose=True)
    toc = time.time()
    print('That took %fs' % (toc - tic))

    # plt.plot(loss_hist)
    # plt.xlabel('Iteration number')
    # plt.ylabel('Loss value')
    # plt.show()

    # Fraction of correctly predicted labels on the training and validation data.
    y_train_pred = svm.predict(X_train)
    print('training accuracy: %f' % (np.mean(y_train == y_train_pred),))
    y_val_pred = svm.predict(X_val)
    print('validation accuracy: %f' % (np.mean(y_val == y_val_pred),))

    # Grid of hyperparameters to try.
    learning_rates = [1e-7, 5e-5]
    regularization_strengths = [5e4, 1e5]

    # results maps (learning_rate, regularization_strength) to
    # (training_accuracy, validation_accuracy).
    results = {}
    best_val = -1    # highest validation accuracy seen so far
    best_svm = None  # the LinearSVM object that achieved best_val
    iters = 2000     # SGD steps per configuration

    for lr in learning_rates:
        for rs in regularization_strengths:
            svm = LinearSVM()
            svm.train(X_train, y_train, learning_rate=lr, reg=rs,
                      num_iters=iters)
            y_train_pred = svm.predict(X_train)
            acc_train = np.mean(y_train == y_train_pred)
            y_val_pred = svm.predict(X_val)
            acc_val = np.mean(y_val == y_val_pred)
            results[(lr, rs)] = (acc_train, acc_val)

            # keep the model that generalizes best to the validation data
            if best_val < acc_val:
                best_val = acc_val
                best_svm = svm

    # Report every configuration, ordered by (lr, reg).
    for lr, reg in sorted(results):
        train_accuracy, val_accuracy = results[(lr, reg)]
        print('lr %e reg %e train accuracy: %f val accuracy: %f' % (
            lr, reg, train_accuracy, val_accuracy))

    print('best validation accuracy achieved during cross-validation: %f' % best_val)

    import math

    # Coordinates for the (currently disabled) scatter plots below:
    # log10 of the learning rate and of the regularization strength of every
    # configuration stored in results.
    x_scatter = [math.log10(x[0]) for x in results]
    y_scatter = [math.log10(x[1]) for x in results]

    # Marker sizes proportional to the training accuracy.
    sz = [results[x][0] * 1500 for x in results]
    # plt.subplot(1,2,1)
    # plt.scatter(x_scatter,y_scatter,sz)
    # plt.xlabel('log learning rate')
    # plt.ylabel('log regularization strength')
    # plt.title('CIFAR-10 training accuracy')
    # sz = [results[x][1]*1500 for x in results]
    # plt.subplot(1,2,2)
    # plt.scatter(x_scatter,y_scatter,sz)
    # plt.xlabel('log learning rate')
    # plt.ylabel('log regularization strength')
    # plt.title('CIFAR-10 validation accuracy')
    # plt.show()

    # Evaluate the best model from the search on the held-out test data.
    y_test_pred = best_svm.predict(X_test)
    test_accuracy = np.mean(y_test == y_test_pred)
    print('linear SVM on raw pixels final test set accuracy: %f' % test_accuracy)

    # Drop the last column of W (the weight of the constant-1 bias feature) and
    # view each class's remaining 3072 weights as a 32 x 32 x 3 image.
    w = best_svm.W[:, :-1]
    w = w.reshape(10, 32, 32, 3)
    w_min, w_max = np.min(w), np.max(w)
    classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    for i in range(10):
        # Rescale the weights of class i to the range 0..255.
        # NOTE(review): in the visible code wimg is computed but never
        # displayed or stored -- confirm whether plotting calls were lost.
        wimg = 255.0 * (w[i].squeeze() - w_min) / (w_max - w_min)

