Supervised Learning – Digits Classification with a simple CNN¶
Objectives¶
(a) Dataset is from the MNIST dataset of handwritten digits, which has a training set of 60,000 examples, and a test set of 10,000 examples. The digits have been size-normalized and centered in a fixed-size image.
The goal is to determine whether or not an example is a 3, therefore your binary classifier will seek to estimate $y=1$ if the digit is a 3, and $y=0$ otherwise.
(b) Will plot 10 examples of each class (i.e. class $y=0$, which are not 3’s and class $y=1$ which are 3’s), from the training dataset.
(c) Will plot a histogram of samples by class to check class balance and, if an imbalance exists, highlight the potential issues it might cause.
(d) Using cross-validation, I’ll train and test a classifier. Compare the performance against (1) a classifier that randomly guesses the class, and (2) a classifier that guesses that all examples are NOT 3’s. Will then plot the corresponding ROC curves and precision-recall curves.
(f) Using a logistic regression classifier (a linear classifier), I’ll apply lasso regularization and retrain the model and evaluate its performance over a range of values on the regularization coefficient. In addition, as I vary the regularization coefficient, I’ll plot (1) the number of model parameters that are estimated to be nonzero; (2) the logistic regression cost function; (3) $F_1$-score, and (4) area under the curve (AUC).
(a)
import tensorflow as tf
# Load MNIST: 60,000 training and 10,000 test 28x28 grayscale images with digit labels 0-9.
# Downloads the dataset on first use (network I/O).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
import numpy as np
# Binary targets as (n, 1) float column vectors: 1.0 where the digit is a 3, else 0.0.
y_train_bin = (y_train == 3).astype(float).reshape(-1, 1)
y_test_bin = (y_test == 3).astype(float).reshape(-1, 1)
(b)
import matplotlib.pyplot as plt
%matplotlib inline
# (b) Show 10 training examples of each class:
#     row 0 = digits that are not 3 (y=0), row 1 = digits that are 3 (y=1).
fig, ax = plt.subplots(2,10,figsize=(17,10),sharex=True,sharey=True)
for j in range(10):
    # Boolean-mask the training images by class, then take the j-th example of each.
    ax[0,j].imshow(x_train[y_train != 3,:,:][j,:,:], cmap='Greys')
    ax[1,j].imshow(x_train[y_train == 3,:,:][j,:,:], cmap='Greys')
#fig.suptitle("Images grouped by row on if digit is 3 or not")
# With sharex/sharey, clearing the ticks once hides them on every subplot.
plt.xticks(())
plt.yticks(())
plt.show()
(c)
# (c) Bar chart of class frequencies to visualize the 3 / not-3 imbalance.
classes, class_counts = np.unique(y_train_bin, return_counts=True)
bar_positions = np.arange(len(classes))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, class_counts, align='center', alpha=0.5)
plt.xticks(bar_positions, ('Class 0: Not 3', 'Class 1: Is a 3'))
plt.ylabel('Frequency')
plt.xlabel('Class of image')
plt.title('Frequency of Number 3 images')
plt.show()
The dataset has 6,131 images that are classed as 3 and 53,869 images that are not. The dataset is imbalanced, so the model could become biased toward the more frequent class. Accuracy as a metric might give a false impression of model performance: if the model simply assigns all observations to the more frequent class, it can appear to perform better than chance when it actually does not.
(d)
# (d) Cross-validated CNN classifier. Images get an explicit channel axis for Conv2D.
x_train_cnn = x_train.reshape(x_train.shape[0], 28, 28, 1)
x_test_cnn = x_test.reshape(x_test.shape[0], 28, 28, 1)
input_shape = (28, 28, 1)
# Importing the required Keras modules containing model and layers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
from sklearn.model_selection import StratifiedKFold

def build_cnn():
    """Return a freshly-initialized, compiled CNN for binary digit classification.

    Two softmax outputs + sparse categorical cross-entropy, so column 1 of
    predict() is P(y=1), i.e. the probability the digit is a 3.
    """
    model = Sequential()
    model.add(Conv2D(28, kernel_size=(3, 3), input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())  # Flattening the 2D arrays for fully connected layers
    model.add(Dense(128, activation=tf.nn.relu))
    model.add(Dropout(0.2))
    model.add(Dense(2, activation=tf.nn.softmax))
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

prediction_scores = np.empty(y_train_bin.shape[0], dtype='object')
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train_index, val_index in kf.split(x_train_cnn, y_train_bin.ravel()):
    # Extract the training and validation data for this fold
    x_train_cv, x_val_cv = x_train_cnn[train_index], x_train_cnn[val_index]
    y_train_cv = y_train_bin[train_index]
    # BUG FIX: rebuild the model each fold. The original kept training ONE model
    # across folds, so later folds' validation samples had already been trained on
    # in earlier folds (data leakage inflating the CV scores).
    cnn_ml = build_cnn()
    cnn_ml.fit(x=x_train_cv, y=y_train_cv, epochs=10, verbose=0)
    # predict() returns the softmax probabilities; Sequential.predict_proba
    # was removed in TensorFlow/Keras >= 2.6.
    cpred = cnn_ml.predict(x_val_cv, verbose=0)
    # Save the out-of-fold P(y=1) scores for this fold
    prediction_scores[val_index] = cpred[:, 1]

# Final model: retrain from scratch on the full training set, evaluate on the test set.
cnn_ml = build_cnn()
cnn_ml.fit(x=x_train_cnn, y=y_train_bin, epochs=10, verbose=0)
cnn_ml.evaluate(x_test_cnn, y_test_bin)
from sklearn.metrics import roc_curve,roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc

# generate a no skill prediction (majority class): score 0 for every example
ns_probs = [0 for _ in range(len(y_test_bin))]
ns_fpr, ns_tpr, _ = roc_curve(y_test_bin, ns_probs)

# Softmax scores on the test set; predict() replaces Sequential.predict_proba,
# which was removed in TensorFlow/Keras >= 2.6 (column 1 is P(y=1)).
y_val_cat_prob = cnn_ml.predict(x_test_cnn)
fpr, tpr, thresholds = roc_curve(y_test_bin, y_val_cat_prob[:, 1])
cnn_precision, cnn_recall, _ = precision_recall_curve(y_test_bin, y_val_cat_prob[:, 1])
# Area under the precision-recall curve (recall on the x-axis).
cnn_auc = auc(cnn_recall, cnn_precision)
def plot_roc_curve(fpr, tpr, ns_fpr, ns_tpr):
    """Plot the model's ROC curve against two baselines.

    Args:
        fpr, tpr: ROC points for the model.
        ns_fpr, ns_tpr: ROC points for the no-skill (random-guess) classifier.
    """
    plt.figure(figsize=(10, 6), dpi=100)
    plt.plot(fpr, tpr, marker='.', label='Model')
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Random Guess')
    # Classifying everything as 0 gives FPR = TPR = 0: a single point at the origin.
    plt.plot(0, 0, marker='o', markersize=10, color="red", label='Classify all as 0')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    # Hide the right and top spines. BUG FIX: the original also hid the LEFT
    # spine, contradicting its own comment and erasing the y-axis line.
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plt.show()
def plot_precision_recall_curve(y_true, ml_recall, ml_precision):
    """Plot the model's precision-recall curve against two baselines.

    Args:
        y_true: true binary labels, used to place the random-guess baseline
            (precision of a random guess equals the positive-class prevalence).
        ml_recall, ml_precision: PR points for the model.
    """
    plt.figure(figsize=(10, 6), dpi=100)
    # A random classifier's precision equals the fraction of positives.
    random_guess = len(y_true[y_true == 1]) / len(y_true)
    plt.plot(ml_recall, ml_precision, marker='.', label='Model')
    plt.plot([0, 1], [random_guess, random_guess], linestyle='--', label='Random Guess')
    # Classifying everything as 0 yields recall 0 (precision undefined; shown at origin).
    plt.plot(0, 0, marker='o', markersize=10, color="red", label='Classify all as 0')
    plt.axis([0, 1, 0, 1])
    # axis labels
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    # Hide the right and top spines. BUG FIX: the original also hid the LEFT
    # spine, contradicting its own comment and erasing the y-axis line.
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    # show the plot
    plt.show()
# Draw the ROC and precision-recall comparisons of the CNN against the two baselines.
plot_roc_curve (fpr,tpr,ns_fpr,ns_tpr)
plot_precision_recall_curve(y_test_bin,cnn_recall,cnn_precision)
The model performs better than both a random classifier and one that classifies all examples as zero, as shown by the precision-recall and ROC curves.
The ROC curve highlights that the model is performing very well, with an accuracy of 0.994. The issue, though, is that because the dataset is imbalanced, the ROC curve overestimates performance. As the precision-recall curve shows, achieving higher levels of recall costs more precision than one would initially expect from looking at the ROC curve alone.
(f)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score,roc_auc_score
import numpy as np
import warnings
# NOTE(review): this silences ALL warnings globally — presumably to hide sklearn
# convergence warnings during the regularization sweep, but it can mask real problems.
warnings.simplefilter('ignore')
def confidence_probabilities(X, w):
    """Return sigmoid(X @ w): P(y=1) for each row of the design matrix X.

    Args:
        X: array of shape (m, p) — feature matrix (include a ones column for the intercept).
        w: array of shape (p,) — weight vector.
    Returns:
        array of shape (m,) of probabilities in (0, 1).
    """
    z = np.dot(X, w)
    g = 1 / (1 + np.exp(-z))
    return g

def cost(X, Y, w):
    """Mean binary cross-entropy of the logistic model with weights w.

    BUG FIX: ravel Y (and the predictions) so a column-vector label array of
    shape (m, 1) does not broadcast against the (m,) prediction vector into an
    (m, m) matrix — the original silently summed m^2 terms and returned a
    wrong cost when called with the (m, 1) label arrays used in this notebook.
    """
    m = len(X)
    prediction = np.ravel(confidence_probabilities(X, w))
    Y = np.ravel(Y)
    J = (1 / m) * np.sum((-Y * np.log(prediction)) - ((1 - Y) * np.log(1 - prediction)))
    return J
# (f) Sweep the L1 regularization over C = 2^-15 ... 2^4 and record sparsity,
# cost, F1 and AUC at each setting (all on the training data).
x_train_lr = x_train.reshape(len(x_train), -1)
from sklearn.linear_model import LogisticRegression

y_train_flat = np.ravel(y_train_bin)  # sklearn expects 1-D label arrays
weights, params, scores, costs, nz_weights_cnt, f1_scores, aucs, reg_strength = [], [], [], [], [], [], [], []
for c in np.arange(-15, 5, dtype=float):
    c_f = 2 ** c
    reg_strength.append(1.0 / c_f)  # regularization strength lambda = 1 / C
    # BUG FIX: the default 'lbfgs' solver does not support the L1 penalty
    # (ValueError in scikit-learn >= 0.22); 'liblinear' handles l1.
    lr_lasso = LogisticRegression(random_state=0, penalty='l1', C=c_f, solver='liblinear')
    lr_lasso.fit(x_train_lr, y_train_flat)
    y_pred = lr_lasso.predict(x_train_lr)
    y_pred_prob = lr_lasso.predict_proba(x_train_lr)
    # Full weight vector: intercept first, then the per-pixel coefficients.
    w = np.r_[lr_lasso.intercept_, lr_lasso.coef_[0]]
    costs.append(cost(np.c_[np.ones(x_train_lr.shape[0]), x_train_lr], y_train_flat, w))
    weights.append(w)
    params.append(c_f)
    # L1 drives coefficients exactly to zero — count the survivors.
    nz_weights_cnt.append(np.count_nonzero(lr_lasso.coef_[0]))
    f1_scores.append(f1_score(y_train_flat, y_pred))
    aucs.append(roc_auc_score(y_train_flat, y_pred_prob[:, 1]))
import matplotlib.pyplot as plt
# 2x2 summary of how the regularization strength (lambda = 1/C) affects sparsity,
# cost, F1 and AUC. Typos in the original user-facing labels are corrected
# ("None-zero" -> "Non-zero", "Regulization" -> "Regularization").
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(17, 10))
ax[0, 0].plot(reg_strength, nz_weights_cnt)
ax[0, 0].set_xscale('log')
ax[0, 0].set_title('Number of Non-zero weights')
ax[0, 1].plot(reg_strength, costs)
ax[0, 1].set_xscale('log')
ax[0, 1].set_title('Costs')
ax[1, 0].plot(reg_strength, f1_scores)
ax[1, 0].set_xscale('log')
ax[1, 0].set_title('F1-Scores')
ax[1, 0].set_xlabel('Regularization Strength')
ax[1, 1].plot(reg_strength, aucs)
ax[1, 1].set_xscale('log')
ax[1, 1].set_title('AUC achieved')
ax[1, 1].set_xlabel('Regularization Strength')
fig.suptitle('Effect of Regularization Strength on the different components')
The graphs above highlight the following as regularization strength increases:
- The number of non-zero weights and the cost decrease.
- The $F_1$-score and AUC initially remain relatively flat (with the $F_1$-score even slightly increasing at some points), even as the number of non-zero weights decreases, but eventually both start decreasing.
The implication is that LASSO implicitly performs feature selection, and if an appropriate strength is chosen, a more parsimonious (simpler) model can be obtained that performs equally as well as (if not better than) one with all the features.