Here, we need to identify the digit in given images. We have a total of 70,000 images, of which 49,000 are training images labelled with the correct digit and the remaining 21,000 are unlabelled test images. The task is to predict the digit for each test image.

About the Data

The data set used for this problem is the popular MNIST data set, developed by Yann LeCun, Corinna Cortes and Christopher Burges for evaluating machine learning models on the handwritten digit classification problem.

Download and tidy up the data

I downloaded the data from the Analytics Vidhya website and stored it locally.

tr <- read.csv("train.csv")
test <- read.csv("Test_fCbTej3.csv")
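
A quick look at the structure of what was read is a cheap sanity check. The expected shapes below are assumptions based on how the files are used later (a filename column first, plus a label column in the training file):

str(tr)     #expect 49,000 rows: a filename column and a digit label
head(test)  #expect a filename column only; the labels are what we must predict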

library(png)
tr_im <- data.frame(1:784)
for (i in 1:dim(tr)[1]){
        filename <- as.character(tr[i,1])       #to identify the image
        im <- readPNG(filename)
        greyim <- im[,,1]+im[,,2]+im[,,3]       #collapse to greyscale by summing the colour channels
        greyim <- as.vector(greyim/max(greyim)) #flatten the matrix to a vector and rescale to [0,1]
        tr_im[,i] <- greyim                     #store as a column in the data frame
        names(tr_im)[i] <- filename
}

filename <- names(tr_im)
tr_im <- t(tr_im)      #transpose so that each image is a row
tr_im <- cbind(filename,tr_im)
tr_im <- data.frame(tr_im)
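
At this point it is worth plotting one image to check nothing was scrambled by the reshaping. A minimal sketch, assuming the standard 28x28 MNIST size; the transpose-and-flip inside image() is only there so the digit displays upright.

px <- as.numeric(as.character(unlist(tr_im[1,-1])))  #784 pixel values of the first image
m <- matrix(px, nrow=28, ncol=28)                    #reshape the vector back into 28x28
image(t(m)[,28:1], col=grey.colors(256), axes=FALSE) #display the digit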

#similarly for the test data
test_im <- data.frame(1:784)
for (i in 1:dim(test)[1]){
        filename <-as.character(test[i,1])
        im <- readPNG(filename)
        greyim <- im[,,1]+im[,,2]+im[,,3]
        greyim <- as.vector(greyim/max(greyim))
        test_im[,i] <-greyim
        names(test_im)[i]<-filename
}
filename <- names(test_im)
test_im <- t(test_im)
test_im <- cbind(filename,test_im)
test_im <- data.frame(test_im)

tr <- merge(tr,tr_im)              #attach the pixel columns to the labels, matching on filename
test <- merge(test,test_im)        #likewise for the test data, matching on filename


tr$label <- as.factor(tr$label)
#the cbind() with the character filenames coerced every pixel column to character,
#so convert the pixel columns back to numeric
tr[,3:786] <- sapply(tr[,3:786],function(x) as.numeric(as.character(x)))
test[,2:785] <- sapply(test[,2:785],function(x) as.numeric(as.character(x)))

Learning from the data

library(mlr)
#make a small task to compare a few classifiers
train1 <- tr[1:5000,]
trainTask <- makeClassifTask(data=train1[,2:786],target="label")

#list multiclass classifiers available in mlr
l <- listLearners(trainTask, properties = "multiclass")
print(l$short.name)   #the column is short.name; l$shortnames would return NULL
#choose a few and compare accuracy on small part of data
r = generateLearningCurveData(list("classif.rpart","classif.knn","classif.fnn","classif.gbm","classif.ksvm","classif.xgboost"),
                              task = trainTask, percs = seq(0.2, 1, by = 0.2),
                              measures = acc,
                              resampling = makeResampleDesc(method = "Subsample", stratify = TRUE, iters = 3),
                              show.info = TRUE)
print(r$task)
## Supervised task: train1[, 2:786]
## Type: classif
## Target: label
## Observations: 5000
## Features:
## numerics  factors  ordered 
##      784        0        0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Classes: 10
##   0   1   2   3   4   5   6   7   8   9 
## 459 569 503 500 498 449 517 525 507 473 
## Positive class: NA
plotLearningCurve(r)

#choose best classifier
lrn <- as.character(r$data$learner[which.max(r$data$acc)])
#create learner
model<- makeLearner(lrn,predict.type="response")
#set up 3-fold stratified cross-validation
set_cv <- makeResampleDesc("CV", stratify = TRUE, iters = 3L)
#make validation task for chosen learning model
val <- tr[30001:49000,]
valTask <- makeClassifTask(data=val[,2:786],target="label")

#train the model on the small task
ksvm1 <- train(model,trainTask)

#validate
p1<- predict(ksvm1,valTask)

#accuracy on validation set
sum(p1$data$response==val$label)/nrow(val)
## [1] 0.9456842

This gives almost 95% accuracy on the validation set, using only a fraction of the data. Before tuning, it is worth seeing where the remaining errors lie; see the sketch below.
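
A quick sketch using mlr's built-in helpers: performance() recomputes the accuracy from the prediction object, and calculateConfusionMatrix() shows which digits get mistaken for which.

performance(p1, measures = acc)   #same accuracy, via mlr's helper
calculateConfusionMatrix(p1)      #rows are the true digits, columns the predictions
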
Let’s tune the model:

ps = makeParamSet(
  makeNumericParam("C", lower=0.1,upper=3),
  makeNumericParam("sigma",lower=0.001,upper=0.1)
)
ctrl = makeTuneControlRandom(maxit = 100L)
rdesc = makeResampleDesc("CV", iters = 3L)
res = tuneParams("classif.ksvm", task = trainTask, resampling = rdesc,
  par.set = ps, control = ctrl, measures = acc)
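
Evaluating 100 random configurations under 3-fold cross-validation is what makes this take hours. A sketch of how the same search could be parallelized with parallelMap, which integrates with mlr; the four local cores are an assumption, and the tuneParams() call itself is unchanged:

library(parallelMap)
parallelStartSocket(4)  #start 4 local worker processes; mlr distributes the resampling over them
res = tuneParams("classif.ksvm", task = trainTask, resampling = rdesc,
  par.set = ps, control = ctrl, measures = acc)
parallelStop()          #shut the workers down again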

#The best result was for C=2.851, sigma=0.0217 with mean acc=0.9517993
#Now use these values
tuned <- setHyperPars(model,par.vals=res$x)

ksvm2 <- train(tuned,trainTask)

#validate
p2<- predict(ksvm2,valTask)

#accuracy on validation set
sum(p2$data$response==val$label)/nrow(val)
## [1] 0.9592632

As was to be expected, tuning gives an increase in accuracy. I tuned on only a fraction of the data, and even that took a couple of hours, so I am assuming the parameter values found there are also near-optimal for the whole data set. Let's now train on all the data with these parameter values:

#use all data 

trainTask <- makeClassifTask(data=tr[,2:786],target="label")
ksvm3 <- train(tuned,trainTask)

#make the test set match the training format
test$label <- rep("0",21000)                #dummy labels; makeClassifTask requires a target column
names(test)[2:785] <- names(train1)[3:786]  #align pixel column names with the training data
testTask <- makeClassifTask(data=test,target="label")
p3 <- predict(ksvm3,testTask)
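
Finally, the predictions need to be written out for submission. A minimal sketch; the two-column filename/label format and the headers are assumptions, so check the contest's sample submission file.

sub <- data.frame(filename = test$filename, label = p3$data$response)  #assumed format
write.csv(sub, "submission.csv", row.names = FALSE)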

This gave an accuracy of 0.9844 on the test set, as reported on the Analytics Vidhya public leaderboard, good for sixth place. An impressive result, achieved without even trying deep learning!