This R Markdown deals with the Digit Recognizer problem made available on Kaggle. I have solved the probelm using xgboost regression. Although the model provides good results with the initial parameters, I have also gone forwards and performed Principal Component Analysis to help reduce the no. of variables used in the model. Post that I have used Cross-Validation to identify the ideal parameters for the model.

More details about the problem statement and the data can be found here

Lets dive into the problem

Data Initialization

library(xgboost)
library(caret)

train_raw <- read.csv("/Users/singh3ab/Desktop/Kaggle/train.csv")
test_raw  <- read.csv("/Users/singh3ab/Desktop/Kaggle/test.csv")

set.seed(150689)
split <- createDataPartition(y=train_raw$label,p=0.7,list=FALSE)
train_data <- train_raw[split,]
test_data  <- train_raw[-split,]

train_X <- train_data[,-1]
train_Y <- train_data[,1]
test_X  <- test_data[,-1]
test_Y  <- test_data[,1]


digitClasses <- as.character(0:9)
numClasses <- length(digitClasses)

XGBoost Classification Model Training

param1 <- list("objective" = "multi:softmax",    # multiclass classification 
               "eval_metric" = "merror",    # evaluation metric 
               "nthread" = 8,   # number of threads to be used 
               "max_depth" = 10,    # maximum depth of tree 
               "eta" = 0.2,    # step size shrinkage 
               "gamma" = 0,    # minimum loss reduction 
               "subsample" = 0.66,    # part of data instances to grow tree 
               "colsample_bytree" = 0.75,  # subsample ratio of columns when constructing each tree 
               "min_child_weight" = 5  # minimum sum of instance weight needed in a child 
)

set.seed(13579)
system.time(xgb_m1 <- xgboost(data=xgb.DMatrix(model.matrix(~.,data=train_X),label=train_Y),
                         verbose=1,
                         num_class=numClasses,
                         nrounds=200,
                         early.stop.round=3,
                         params=param1))

Prediction on Test Data

system.time(xgb_pred1 <- predict(xgb_m1,newdata=xgb.DMatrix(model.matrix(~.,data=test_X),label=test_Y)))
##    user  system elapsed 
##    1.30    0.06    0.53
confusionMatrix(xgb_pred1,test_Y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 1219    0    7    1    2    4    7    5    2    7
##          1    0 1392    5    5    3    0    1    6    6    4
##          2    2    3 1198   22    3    2    0    9    6    2
##          3    1    6    6 1228    0   14    0    3    8   13
##          4    1    1    4    2 1172    2    6    5    8   13
##          5    1    2    0   11    0 1080    9    1    5    2
##          6    7    1    5    1    8    9 1227    0    6    0
##          7    0    1   10    6    1    2    0 1265    2   19
##          8   10    4    5   24    1    7    5    6 1185    8
##          9    1    0    5   12   24    6    0   18   17 1162
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9628         
##                  95% CI : (0.9593, 0.966)
##     No Information Rate : 0.1119         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9586         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.98148   0.9872  0.96225  0.93598  0.96540  0.95915
## Specificity           0.99692   0.9973  0.99568  0.99548  0.99631  0.99730
## Pos Pred Value        0.97209   0.9789  0.96071  0.96013  0.96540  0.97210
## Neg Pred Value        0.99797   0.9984  0.99586  0.99258  0.99631  0.99600
## Prevalence            0.09859   0.1119  0.09883  0.10415  0.09637  0.08939
## Detection Rate        0.09677   0.1105  0.09510  0.09748  0.09304  0.08573
## Detection Prevalence  0.09955   0.1129  0.09899  0.10153  0.09637  0.08820
## Balanced Accuracy     0.98920   0.9923  0.97897  0.96573  0.98086  0.97822
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.97769   0.9598  0.95181  0.94472
## Specificity           0.99674   0.9964  0.99383  0.99270
## Pos Pred Value        0.97073   0.9686  0.94422  0.93333
## Neg Pred Value        0.99753   0.9953  0.99471  0.99401
## Prevalence            0.09963   0.1046  0.09883  0.09764
## Detection Rate        0.09740   0.1004  0.09407  0.09224
## Detection Prevalence  0.10034   0.1037  0.09963  0.09883
## Balanced Accuracy     0.98721   0.9781  0.97282  0.96871

Principal Component Analysis

data <- train_raw[,-1] / 255.0             # Scale all pixel values to [0,1]
covX <- cov(data)                          # Covariance matrix
pca <- prcomp(covX)                        # Perform PCA

#Variance Explained
var_exp <- as.data.frame(pca$sdev^2/sum(pca$sdev^2))
var_exp <- cbind(c(1:ncol(data)),var_exp,cumsum(var_exp[,1]))
colnames(var_exp) <- c("Principal_Components","Variance","Cumulative_Variance")
#Plotting the Variance Curves
#Individual Variance
plot(var_exp$Principal_Components,var_exp$Variance,type='b',xlim=c(0,100),pch=16,xlab = "Principal Componets",ylab = "Variance",main = 'Principal Components vs Variance')

#Cumulative Variance
plot(var_exp$Principal_Components,var_exp$Cumulative_Variance,type='b',xlim=c(0,100),pch=16,xlab = "Principal Componets",ylab = "Cumulative Variance",main = 'Principal Components vs Cumulative Variance')

var_exp[50:60,]
##    Principal_Components     Variance Cumulative_Variance
## 50                   50 0.0003355017           0.9929776
## 51                   51 0.0003227039           0.9933003
## 52                   52 0.0003128051           0.9936131
## 53                   53 0.0002820244           0.9938951
## 54                   54 0.0002668536           0.9941620
## 55                   55 0.0002571440           0.9944191
## 56                   56 0.0002380925           0.9946572
## 57                   57 0.0002311834           0.9948884
## 58                   58 0.0002145745           0.9951030
## 59                   59 0.0002101487           0.9953131
## 60                   60 0.0001984946           0.9955116

From the variance plot, we see that 60 PCs explain ~99.5% of the data. So we will use 60 instead of 784

Model Training with CV & 60 PCs

pca_fin <- pca$rotation[,1:60]        # Rotaion matrix (784x60)

PCA <- function(X) {                # Reduce observations from N x 784 to N x 60
  as.matrix(X/255) %*% pca_fin
}


train_pca_X <- PCA(train_X)
test_pca_X  <- PCA(test_X)


xgb.grid <- expand.grid(
  nrounds=100,
  max_depth=c(5,10,15),
  eta=c(0.5,0.2,0.1),
  gamma=c(0,0.5),
  colsample_bytree=0.75,
  min_child_weight=5
)

xgb.trcontrol <- trainControl(
  method="cv",
  number=3,
  verboseIter=TRUE,
  returnData=FALSE,
  returnResamp="all",
  classProbs=TRUE,
  allowParallel=TRUE
)

system.time(xgb_m2 <- train(x=train_pca_X,y=train_Y,
                     verbose=1,
                     objective="multi:softmax",
                     eval_metric="merror",
                     early.stop.round=5,
                     num_class=numClasses,
                     watchlist=list(train=xgb.DMatrix(data=train_pca_X,label=train_Y)),
                     trControl=xgb.trcontrol,
                     tuneGrid=xgb.grid,
                     method="xgbTree"
))

Model Validation

#Traning Data
xgb_pred_train <- predict(xgb_m2,train_pca_X)
train_cm <- confusionMatrix(xgb_pred_train,train_Y)
train_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 2889    0    0    0    1    0    0    0    0    0
##          1    0 3274    0    0    0    0    0    0    0    0
##          2    0    0 2931    2    0    0    0    1    0    1
##          3    0    0    0 3031    0    0    0    0    2    1
##          4    0    0    0    0 2854    0    0    0    0    3
##          5    0    0    0    0    0 2668    0    0    0    1
##          6    0    0    0    0    0    0 2882    0    0    0
##          7    0    0    1    1    1    0    0 3081    1    2
##          8    1    0    0    3    0    1    0    0 2815    0
##          9    0    0    0    2    2    0    0    1    0 2950
## 
## Overall Statistics
##                                           
##                Accuracy : 0.999           
##                  95% CI : (0.9986, 0.9994)
##     No Information Rate : 0.1113          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9989          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.99965   1.0000  0.99966   0.9974  0.99860  0.99963
## Specificity           0.99996   1.0000  0.99985   0.9999  0.99989  0.99996
## Pos Pred Value        0.99965   1.0000  0.99864   0.9990  0.99895  0.99963
## Neg Pred Value        0.99996   1.0000  0.99996   0.9997  0.99985  0.99996
## Prevalence            0.09829   0.1113  0.09972   0.1034  0.09720  0.09077
## Detection Rate        0.09826   0.1113  0.09968   0.1031  0.09706  0.09074
## Detection Prevalence  0.09829   0.1113  0.09982   0.1032  0.09717  0.09077
## Balanced Accuracy     0.99981   1.0000  0.99975   0.9986  0.99924  0.99979
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           1.00000   0.9994  0.99894   0.9973
## Specificity           1.00000   0.9998  0.99981   0.9998
## Pos Pred Value        1.00000   0.9981  0.99823   0.9983
## Neg Pred Value        1.00000   0.9999  0.99989   0.9997
## Prevalence            0.09802   0.1049  0.09584   0.1006
## Detection Rate        0.09802   0.1048  0.09574   0.1003
## Detection Prevalence  0.09802   0.1050  0.09591   0.1005
## Balanced Accuracy     1.00000   0.9996  0.99937   0.9986
#Test Data
xgb_pred_test <- predict(xgb_m2,test_pca_X)
test_cm <- confusionMatrix(xgb_pred_test,test_Y)
test_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 1216    0    9    0    2    6   10    4    2    9
##          1    0 1390    2    2    4    0    2    7    9    2
##          2    2    2 1178   21    7    2    0   15   12    0
##          3    4    5   10 1214    0   26    0    2   24   16
##          4    1    1    8    3 1159   10    5    8    5   16
##          5    4    2    3   27    0 1047   15    1   14    6
##          6    7    1    7    3    7   17 1219    0    6    0
##          7    0    3   11    5    2    1    0 1255    4   17
##          8    7    6   15   24    3   10    4    4 1153   12
##          9    1    0    2   13   30    7    0   22   16 1152
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9513         
##                  95% CI : (0.9474, 0.955)
##     No Information Rate : 0.1119         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9458         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.97907   0.9858  0.94618  0.92530  0.95470  0.92984
## Specificity           0.99630   0.9975  0.99463  0.99229  0.99499  0.99372
## Pos Pred Value        0.96661   0.9803  0.95077  0.93313  0.95313  0.93566
## Neg Pred Value        0.99771   0.9982  0.99410  0.99132  0.99517  0.99312
## Prevalence            0.09859   0.1119  0.09883  0.10415  0.09637  0.08939
## Detection Rate        0.09653   0.1103  0.09351  0.09637  0.09201  0.08312
## Detection Prevalence  0.09987   0.1126  0.09836  0.10328  0.09653  0.08883
## Balanced Accuracy     0.98768   0.9917  0.97041  0.95880  0.97484  0.96178
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.97131  0.95220  0.92610  0.93659
## Specificity           0.99577  0.99619  0.99251  0.99199
## Pos Pred Value        0.96212  0.96687  0.93134  0.92679
## Neg Pred Value        0.99682  0.99442  0.99190  0.99313
## Prevalence            0.09963  0.10463  0.09883  0.09764
## Detection Rate        0.09677  0.09963  0.09153  0.09145
## Detection Prevalence  0.10058  0.10304  0.09828  0.09867
## Balanced Accuracy     0.98354  0.97419  0.95931  0.96429

So from the output above we can see that both training & testing have an accuracy higher than 95%, with the training data accuracy being 99.9% & the testing data accuracy being 95.13%.