import numpy as np from sklearn.datasets import fetch_mldata import time import matplotlib.pyplot as plt mnist = fetch_mldata('MNIST original') X = mnist.data.astype(float) Y = mnist.target.astype(float) mask = np.random.permutation(range(np.shape(X)[0])) num_train = 10000 num_test = 500 K = 10 X_train = X[mask[:num_train]] Y_train = Y[mask[:num_train]] X_mean = np.mean(X_train,axis = 0) X_train = (X_train-X_mean)/255 X_test = X[mask[num_train:num_train+num_test]] X_test = (X_test - X_mean)/255 Y_test = Y[mask[num_train:num_train+num_test]] print('X_train',X_train.shape) print('Y_train',Y_train.shape) print('X_test',X_test.shape) print('Y_test',Y_test.shape) ex_image = (np.reshape(X_train[10,:]*255 + X_mean, (28, 28))).astype(np.uint8) plt.imshow(ex_image, interpolation='nearest') # **Computing the distance matrix (num_test x num_train)** # Version 1 (Naive implementation using two for loops) start = time.time() dists_1 = np.zeros((num_test,num_train)) for i in xrange(num_test): for j in xrange(num_train): dists_1[i,j] = np.sqrt(np.square(np.sum(X_test[i,:]-X_train[j,:]))) stop = time.time() time_taken = stop-start print('Time taken with two for loops: {}s'.format(time_taken)) # Version 2(Somewhat better implementation using one for loop) start = time.time() dists_2 = np.zeros((num_test,num_train)) for i in xrange(num_test): dists_2[i,:] = np.sqrt(np.square(np.sum(X_test[i,:]-X_train,axis = 1))) stop = time.time() time_taken = stop-start print('Time taken with just one for loop: {}s'.format(time_taken)) # Version 3 (Fully vectorized implementation with no for loop) start = time.time() dists_3 = np.zeros((num_test,num_train)) A = np.sum(np.square(X_test),axis = 1) B = np.sum(np.square(X_train),axis = 1) C = np.dot(X_test,X_train.T) dists_3 = np.sqrt(A[:,np.newaxis]+B[np.newaxis,:]-2*C) stop = time.time() time_taken = stop-start print('Time taken with no for loops: {}s'.format(time_taken)) sorted_dist_indices = np.argsort(dists_3,axis = 1) closest_k = Y_train[sorted_dist_indices][:,:K].astype(int) Y_pred = np.zeros_like(Y_test) for i in xrange(num_test): Y_pred[i] = np.argmax(np.bincount(closest_k[i,:])) accuracy = (np.where(Y_test-Y_pred == 0)[0].size)/float(num_test) print('Prediction accuracy: {}%'.format(accuracy*100))