MNIST

MNIST (“Modified National Institute of Standards and Technology”) is the de facto “hello world” dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike.

# Session setup and raw-data load.
# NOTE(review): setwd() and rm(list = ls()) tie the script to one machine and
# wipe the caller's workspace; they are kept because the rest of the file
# reads and writes files relative to this folder.
setwd("D:/mnist")
rm(list = ls())  # start from an empty workspace (was issued twice; once is enough)
# memory.size(max = TRUE) only *reports* the peak memory (in MB) that R has
# obtained from the OS so far; it does not raise any limit. Windows-only.
memory.size(max = TRUE)
## [1] 32.19
# Training set: one row per image, `label` followed by the pixel columns.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
str(train)
## 'data.frame':    42000 obs. of  785 variables:
##  $ label   : int  1 0 1 4 0 0 7 3 5 3 ...
##  $ pixel0  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel1  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel2  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel3  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel4  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel5  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel6  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel7  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel8  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel9  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel10 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel11 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel12 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel13 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel14 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel15 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel16 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel17 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel18 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel19 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel20 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel21 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel22 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel23 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel24 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel25 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel26 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel27 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel28 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel29 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel30 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel31 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel32 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel33 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel34 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel35 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel36 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel37 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel38 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel39 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel40 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel41 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel42 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel43 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel44 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel45 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel46 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel47 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel48 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel49 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel50 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel51 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel52 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel53 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel54 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel55 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel56 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel57 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel58 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel59 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel60 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel61 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel62 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel63 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel64 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel65 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel66 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel67 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel68 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel69 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel70 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel71 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel72 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel73 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel74 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel75 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel76 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel77 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel78 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel79 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel80 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel81 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel82 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel83 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel84 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel85 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel86 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel87 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel88 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel89 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel90 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel91 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel92 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel93 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel94 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel95 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel96 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pixel97 : int  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
# Keep the digit labels in their own vector, then drop that column so that
# `train` holds pixel columns only.
mylabel <- train[["label"]]
train[["label"]] <- NULL

# Unlabeled test images.
test <- read.csv("test.csv", stringsAsFactors = FALSE)

Check out the Pics

library(EBImage)  # image processing library; provides flip()

# Sizes are taken from the data instead of being hard-coded (42000 / 784), so
# a different number of rows can no longer be silently recycled or truncated
# when the array is built.
n_images <- nrow(train)
n_pixels <- ncol(train)
stopifnot(n_pixels == 28 * 28)  # label column must already be removed

# Reverse the pixel-column order; per the original author's note this,
# together with flip() below, is only there so the digits display the right
# way round (optional).
myorder <- rev(seq_len(n_pixels))
newtrain <- train[, myorder]

# unlist() walks the data frame column by column, so with the image index as
# the first array dimension, im[i, , ] holds the 28 x 28 pixels of image i.
myviz <- as.vector(unlist(newtrain))
im <- array(myviz, dim = c(n_images, 28, 28))
newim <- flip(im)

par(mfrow = c(3, 3))  # 3 x 3 grid of plots per page
par(ask = TRUE)       # wait for ENTER before drawing the next page

# Show (up to) the first 100 images in gray scale to verify the layout.
for (i in seq_len(min(100, n_images))) {
  image(1:28, 1:28, newim[i, , ], col = gray((0:255) / 255))
}

### Outlier Analysis: should I oversample 1’s and 7’s?

# Don't forget to analyze outliers before you do anything!!
# x outliers: for each of the first 30 columns of the reordered training
# matrix, record which observation holds the column minimum and maximum.
# NOTE(review): which.min()/which.max() return the first index on ties, so
# constant columns all point at the same row - confirm this is what is wanted.
new <- as.matrix(newtrain)
minit <- matrix(0, nrow = 30)  # row index of the minimum, one per column
maxit <- matrix(0, nrow = 30)  # row index of the maximum, one per column

for (j in seq_len(30)) {
  pixel_col <- new[, j]
  maxit[j] <- which.max(pixel_col)
  minit[j] <- which.min(pixel_col)
}

# Look only at the 60 candidate pictures (30 minima + 30 maxima), nine per
# page, advancing on ENTER. The x-axis label is the observation's row index.
par(mfrow = c(3, 3))
par(ask = TRUE)
shades <- gray((0:255) / 255)
for (j in seq_len(30)) {
  low_row <- minit[j]
  high_row <- maxit[j]
  image(1:28, 1:28, newim[low_row, , ], col = shades, xlab = low_row)
  image(1:28, 1:28, newim[high_row, , ], col = shades, xlab = high_row)
}

# Digit labels of the flagged observations.
mylabel[minit]
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
mylabel[maxit]
##  [1] 1 1 1 1 7 7 9 7 9 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 1 1 1 1 1 1

### One technique you might want to try is data augmentation: add warped or otherwise transformed copies of the digit images to the training set. In this way, you deliberately introduce variation that the model can then learn to handle. But I am not going to do that here.

NN Simple Example

library(nnet)  #neural net library
library(e1071) 
## Warning: package 'e1071' was built under R version 3.3.3
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
# Seed the RNG so the run is repeatable. The original `set.seed=1234` only
# created a variable called `set.seed`; it never called the function, so the
# random stream was not fixed.
set.seed(1234)
tn <- read.csv("train.csv")   # raw training data: label + pixel columns
test <- read.csv("test.csv")  # unlabeled test data
tn$label <- factor(tn$label)  # factor response => classification

#tmodel=tune.nnet(label~., data=tn, size=1:10, MaxNWts=100000, trace=TRUE, maxit=200)

# Single hidden layer with 4 units, at most 200 iterations. MaxNWts is set to
# 3200 because the fit reports 3190 weights.
mynn <- nnet(label ~ ., data = tn, size = 4, maxit = 200, MaxNWts = 3200,
             trace = TRUE)
## # weights:  3190
## initial  value 101665.709387 
## iter  10 value 74067.250609
## iter  20 value 72228.813799
## iter  30 value 71828.742134
## iter  40 value 68718.559163
## iter  50 value 64921.714329
## iter  60 value 62250.103581
## iter  70 value 60718.154658
## iter  80 value 59293.646990
## iter  90 value 58942.950135
## iter 100 value 58767.553394
## iter 110 value 58174.810657
## iter 120 value 57034.228503
## iter 130 value 56814.670134
## iter 140 value 56517.634644
## iter 150 value 55150.652660
## iter 160 value 54940.333479
## iter 170 value 54875.420881
## iter 180 value 54706.036880
## iter 190 value 54406.288739
## iter 200 value 53228.702193
## final  value 53228.702193 
## stopped after 200 iterations
# In-sample predictions of the network (evaluated on the data it was trained
# on, so this is an optimistic estimate).
# predict(type = "class") returns plain character labels, and classes the
# network never predicts are simply absent. Coercing to a factor with the
# reference levels keeps both arguments of confusionMatrix() aligned and
# avoids its "levels are not in the same order" warnings.
# NOTE(review): newer caret releases are believed to reject mismatched levels
# outright - confirm against the installed version.
mypredict <- factor(predict(mynn, tn, type = "class"),
                    levels = levels(tn$label))
nntable <- table(mypredict, tn$label)
confusionMatrix(mypredict, tn$label)
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(mypredict, tn$label): Levels are not in
## the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 3864   59 2933  742   18 2491  728   29 2647   51
##          1    2 4556  178  311   54  288   44  191  934  124
##          2    0    0    0    0    0    0    0    0    0    0
##          3  126   59  422 3121    0  833    1   23  251   45
##          4    3    2  216   41 3449   53  219  320  149 3451
##          5    0    0    0    1    0    0    0    0    0    0
##          6  135    0  374   40  521  102 3144   72   64  187
##          7    2    8   54   94   29   28    1 3766   18  330
##          8    0    0    0    1    0    0    0    0    0    0
##          9    0    0    0    0    1    0    0    0    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5214          
##                  95% CI : (0.5166, 0.5262)
##     No Information Rate : 0.1115          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4674          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity           0.93514   0.9727  0.00000  0.71731  0.84700
## Specificity           0.74390   0.9430  1.00000  0.95325  0.88257
## Pos Pred Value        0.28491   0.6818      NaN  0.63942  0.43642
## Neg Pred Value        0.99058   0.9964  0.90055  0.96686  0.98173
## Prevalence            0.09838   0.1115  0.09945  0.10360  0.09695
## Detection Rate        0.09200   0.1085  0.00000  0.07431  0.08212
## Detection Prevalence  0.32290   0.1591  0.00000  0.11621  0.18817
## Balanced Accuracy     0.83952   0.9579  0.50000  0.83528  0.86479
##                       Class: 5 Class: 6 Class: 7  Class: 8  Class: 9
## Sensitivity          0.000e+00  0.75997  0.85571 0.000e+00 0.000e+00
## Specificity          1.000e+00  0.96052  0.98500 1.000e+00 1.000e+00
## Pos Pred Value       0.000e+00  0.67773  0.86975 0.000e+00 0.000e+00
## Neg Pred Value       9.096e-01  0.97342  0.98314 9.033e-01 9.003e-01
## Prevalence           9.036e-02  0.09850  0.10479 9.674e-02 9.971e-02
## Detection Rate       0.000e+00  0.07486  0.08967 0.000e+00 0.000e+00
## Detection Prevalence 2.381e-05  0.11045  0.10310 2.381e-05 2.381e-05
## Balanced Accuracy    5.000e-01  0.86024  0.92036 5.000e-01 5.000e-01
# Submission file for the neural net: one row per test image, numbered from 1.
# The row count comes from `test` rather than a hard-coded 28000, so the frame
# always matches the number of predictions.
submitframe <- data.frame(
  ImageId = seq_len(nrow(test)),
  Label = predict(mynn, test, type = "class"),
  stringsAsFactors = FALSE
)
write.csv(submitframe, "firstsubmit.csv", row.names = FALSE)

Random Forest

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:EBImage':
## 
##     combine
#library(party)
# Random forest on all pixel columns: 100 trees, 10 candidate variables per
# split, each tree limited to 10 terminal nodes, no importance computed.
myrf <- randomForest(
  label ~ .,
  data = tn,
  ntree = 100,
  mtry = 10,
  maxnodes = 10,
  importance = FALSE
)
#mycf = cforest(label ~ ., data = tn, controls=cforest_unbiased(ntree=1000, mtry=10))  #party package
# Print the fit summary (OOB error rate and confusion matrix).
print(myrf)
## 
## Call:
##  randomForest(formula = label ~ ., data = tn, ntree = 100, mtry = 10,      maxnodes = 10, importance = FALSE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 10
## 
##         OOB estimate of  error rate: 31.44%
## Confusion matrix:
##      0    1    2    3    4   5    6    7    8    9 class.error
## 0 3921   24   10   53   12   2   56   24   26    4  0.05106486
## 1    1 4637   24    8    1   0    3    8    2    0  0.01003416
## 2  192  561 2715  140  101   4  200  197   34   33  0.35001197
## 3  136  418  137 3257   26   4   52  164   52  105  0.25143645
## 4   45  260   14   21 2716   1  165  284   14  552  0.33300589
## 5  360  688   43 1357  122 475  145  231  142  232  0.87483531
## 6  217  428   34   46   91   6 3251   40    8   16  0.21416485
## 7   24  284   37    6   54   0    5 3901    2   88  0.11361054
## 8   97 1014  218  589   73   6  163  115 1567  221  0.61432439
## 9   62  312   14   82  504   0   53  786   18 2357  0.43720153
# Submission file for the random forest.
# The original stored the forest's predictions in a new `label` column of
# `rfsubmit` but then wrote `submitframe` (the neural-net predictions) to
# disk, so secondsubmit.csv was a copy of the first submission. Build the
# frame from the forest's predictions and write that frame.
rfsubmit <- data.frame(
  ImageId = seq_len(nrow(test)),
  Label = predict(myrf, test, type = "class")
)
write.csv(rfsubmit, "secondsubmit.csv", row.names = FALSE)

Convolutional Neural Net

###You will need to run this to install mxnet###

#cran <- getOption("repos")
#cran["dmlc"] <- "https://s3-us-west-2.amazonaws.com/apache-mxnet/R/CRAN/"
#options(repos = cran)
#install.packages("mxnet",dependencies = T)


library(mxnet)
## Warning: package 'mxnet' was built under R version 3.4.0
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.3.3
mx.set.seed(0)  # fix mxnet's own random stream

# Features: every column except the label, scaled from 0-255 to [0, 1].
# Feeding raw intensities with learning.rate = 0.1 is the likely reason the
# logged training accuracy fell from round to round.
X <- as.matrix(tn[, -1]) / 255
# Labels: tn$label is a factor at this point, so go through character to
# recover the digits 0-9 rather than the internal level codes 1-10.
y <- as.numeric(as.character(tn[, 1]))

# Fully connected network (mx.mlp) with one hidden layer of 100 tanh units and
# a softmax output. out_node must equal the number of classes (10 digits); the
# original asked for 100 output nodes. Rows are observations, so the layout is
# declared explicitly instead of relying on auto-detection.
model <- mx.mlp(X, y, hidden_node = 100, out_node = 10, activation = "tanh",
                out_activation = "softmax", num.round = 20,
                array.layout = "rowmajor", learning.rate = 0.1,
                momentum = 0.9, eval.metric = mx.metric.accuracy)
## Warning in mx.model.select.layout.train(X, y): Auto detect layout of input matrix, use rowmajor..
## Start training with 1 devices
## [1] Train-accuracy=0.31176162347561
## [2] Train-accuracy=0.259403495440729
## [3] Train-accuracy=0.213098404255319
## [4] Train-accuracy=0.176481762917933
## [5] Train-accuracy=0.199966755319149
## [6] Train-accuracy=0.238316869300912
## [7] Train-accuracy=0.243992211246201
## [8] Train-accuracy=0.223594224924012
## [9] Train-accuracy=0.213407104863222
## [10] Train-accuracy=0.197972074468085
## [11] Train-accuracy=0.179948708206687
## [12] Train-accuracy=0.197188449848024
## [13] Train-accuracy=0.197758358662614
## [14] Train-accuracy=0.195265007598784
## [15] Train-accuracy=0.189043503039514
## [16] Train-accuracy=0.205737082066869
## [17] Train-accuracy=0.206212006079027
## [18] Train-accuracy=0.208847834346505
## [19] Train-accuracy=0.209417743161094
## [20] Train-accuracy=0.210984992401216