MNIST

# Fetch the MNIST dataset (70,000 grayscale 28x28 digit images) from OpenML.
from sklearn.datasets import fetch_openml  # was missing: fetch_openml was used without import
import matplotlib as mpl
import matplotlib.pyplot as plt

mnist = fetch_openml('mnist_784', version=1, parser='pandas')
# was missing: X and y were used below but never extracted from the fetched bunch.
# X: (70000, 784) DataFrame of pixel intensities; y: Series of label strings '0'-'9'.
X, y = mnist.data, mnist.target

some_digit = X.iloc[0].to_numpy()  # .iloc[0] returns a Series; to_numpy() flattens to a 784-vector
some_digit_image = some_digit.reshape(28, 28)
Xd = X.iloc[[0]]  # list-of-positions selection returns a one-row DataFrame

print(type(Xd))
plt.imshow(some_digit_image, cmap=mpl.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()

# MNIST is already shuffled and conventionally split: first 60,000 train, last 10,000 test.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

Training a Binary Classifier

# Reduce the task to binary classification: "is this digit a 5?"
# The labels are strings ('0'..'9'), hence the comparison against the string '5'.
from sklearn.linear_model import SGDClassifier

y_train_5 = y_train == '5'
y_test_5 = y_test == '5'

# Stochastic Gradient Descent classifier; fixed seed makes the shuffling reproducible.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict(Xd)

Measuring Accuracy Using Cross-Validation

from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the binary "5-detector".
cross_val_score(sgd_clf, X_train, y_train_5, scoring="accuracy", cv=3)

Accuracy is generally not the preferred performance measure for classifiers, especially when you are dealing with skewed datasets (i.e., when some classes are much more frequent than others).

Confusion Matrix

  • Counts the number of times instances of class A are classified as class B.
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

import numpy as np


# was missing: never_5_clf was used below but never defined.
class Never5Classifier(BaseEstimator):
    """Dummy baseline that predicts "not a 5" (False) for every instance.

    Subclassing BaseEstimator supplies get_params/set_params so the
    estimator can be cloned by cross_val_predict.
    """

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        return np.zeros((len(X),), dtype=bool)


never_5_clf = Never5Classifier()

# Our classifier: out-of-fold predictions via K-fold cross-validation,
# so every training instance is predicted by a model that never saw it.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, y_train_pred, labels=[True, False])

# Nonsense baseline classifier: always predicts "not 5". Its accuracy is high
# (~90%) only because the dataset is skewed — hence the confusion matrix.
y_train_pred_never = cross_val_predict(never_5_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, y_train_pred_never, labels=[True, False])
  • Precision: (true positives) / (true positives + false positives)
  • Recall (sensitivity, true positive rate): (true positives) / (true positives + false negatives)
  • F1 Score: harmonic mean of precision and recall