Digit Recognizer

Lets dive into the problem

Data Initialization

library(xgboost)
library(caret)

train_raw <- read.csv("/Users/singh3ab/Desktop/Kaggle/train.csv")
test_raw  <- read.csv("/Users/singh3ab/Desktop/Kaggle/test.csv")

set.seed(150689)
split <- createDataPartition(y=train_raw$label,p=0.7,list=FALSE)
train_data <- train_raw[split,]
test_data  <- train_raw[-split,]

train_X <- train_data[,-1]
train_Y <- train_data[,1]
test_X  <- test_data[,-1]
test_Y  <- test_data[,1]


digitClasses <- as.character(0:9)
numClasses <- length(digitClasses)

XGBoost Classification Model Training

param1 <- list("objective" = "multi:softmax",    # multiclass classification 
               "eval_metric" = "merror",    # evaluation metric 
               "nthread" = 8,   # number of threads to be used 
               "max_depth" = 10,    # maximum depth of tree 
               "eta" = 0.2,    # step size shrinkage 
               "gamma" = 0,    # minimum loss reduction 
               "subsample" = 0.66,    # part of data instances to grow tree 
               "colsample_bytree" = 0.75,  # subsample ratio of columns when constructing each tree 
               "min_child_weight" = 5  # minimum sum of instance weight needed in a child 
)

set.seed(13579)
system.time(xgb_m1 <- xgboost(data=xgb.DMatrix(model.matrix(~.,data=train_X),label=train_Y),
                         verbose=1,
                         num_class=numClasses,
                         nrounds=200,
                         early.stop.round=3,
                         params=param1))

Prediction on Test Data

system.time(xgb_pred1 <- predict(xgb_m1,newdata=xgb.DMatrix(model.matrix(~.,data=test_X),label=test_Y)))

##    user  system elapsed 
##    1.30    0.06    0.53

confusionMatrix(xgb_pred1,test_Y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 1219    0    7    1    2    4    7    5    2    7
##          1    0 1392    5    5    3    0    1    6    6    4
##          2    2    3 1198   22    3    2    0    9    6    2
##          3    1    6    6 1228    0   14    0    3    8   13
##          4    1    1    4    2 1172    2    6    5    8   13
##          5    1    2    0   11    0 1080    9    1    5    2
##          6    7    1    5    1    8    9 1227    0    6    0
##          7    0    1   10    6    1    2    0 1265    2   19
##          8   10    4    5   24    1    7    5    6 1185    8
##          9    1    0    5   12   24    6    0   18   17 1162
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9628         
##                  95% CI : (0.9593, 0.966)
##     No Information Rate : 0.1119         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9586         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.98148   0.9872  0.96225  0.93598  0.96540  0.95915
## Specificity           0.99692   0.9973  0.99568  0.99548  0.99631  0.99730
## Pos Pred Value        0.97209   0.9789  0.96071  0.96013  0.96540  0.97210
## Neg Pred Value        0.99797   0.9984  0.99586  0.99258  0.99631  0.99600
## Prevalence            0.09859   0.1119  0.09883  0.10415  0.09637  0.08939
## Detection Rate        0.09677   0.1105  0.09510  0.09748  0.09304  0.08573
## Detection Prevalence  0.09955   0.1129  0.09899  0.10153  0.09637  0.08820
## Balanced Accuracy     0.98920   0.9923  0.97897  0.96573  0.98086  0.97822
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.97769   0.9598  0.95181  0.94472
## Specificity           0.99674   0.9964  0.99383  0.99270
## Pos Pred Value        0.97073   0.9686  0.94422  0.93333
## Neg Pred Value        0.99753   0.9953  0.99471  0.99401
## Prevalence            0.09963   0.1046  0.09883  0.09764
## Detection Rate        0.09740   0.1004  0.09407  0.09224
## Detection Prevalence  0.10034   0.1037  0.09963  0.09883
## Balanced Accuracy     0.98721   0.9781  0.97282  0.96871

Principal Component Analysis

data <- train_raw[,-1] / 255.0             # Scale all pixel values to [0,1]
covX <- cov(data)                          # Covariance matrix
pca <- prcomp(covX)                        # Perform PCA

#Variance Explained
var_exp <- as.data.frame(pca$sdev^2/sum(pca$sdev^2))
var_exp <- cbind(c(1:ncol(data)),var_exp,cumsum(var_exp[,1]))
colnames(var_exp) <- c("Principal_Components","Variance","Cumulative_Variance")
#Plotting the Variance Curves
#Individual Variance
plot(var_exp$Principal_Components,var_exp$Variance,type='b',xlim=c(0,100),pch=16,xlab = "Principal Componets",ylab = "Variance",main = 'Principal Components vs Variance')

#Cumulative Variance
plot(var_exp$Principal_Components,var_exp$Cumulative_Variance,type='b',xlim=c(0,100),pch=16,xlab = "Principal Componets",ylab = "Cumulative Variance",main = 'Principal Components vs Cumulative Variance')

var_exp[50:60,]

##    Principal_Components     Variance Cumulative_Variance
## 50                   50 0.0003355017           0.9929776
## 51                   51 0.0003227039           0.9933003
## 52                   52 0.0003128051           0.9936131
## 53                   53 0.0002820244           0.9938951
## 54                   54 0.0002668536           0.9941620
## 55                   55 0.0002571440           0.9944191
## 56                   56 0.0002380925           0.9946572
## 57                   57 0.0002311834           0.9948884
## 58                   58 0.0002145745           0.9951030
## 59                   59 0.0002101487           0.9953131
## 60                   60 0.0001984946           0.9955116

From the variance plot, we see that 60 PCs explain ~99.5% of the data. So we will use 60 instead of 784

Model Training with CV & 60 PCs

pca_fin <- pca$rotation[,1:60]        # Rotaion matrix (784x60)

PCA <- function(X) {                # Reduce observations from N x 784 to N x 60
  as.matrix(X/255) %*% pca_fin
}


train_pca_X <- PCA(train_X)
test_pca_X  <- PCA(test_X)


xgb.grid <- expand.grid(
  nrounds=100,
  max_depth=c(5,10,15),
  eta=c(0.5,0.2,0.1),
  gamma=c(0,0.5),
  colsample_bytree=0.75,
  min_child_weight=5
)

xgb.trcontrol <- trainControl(
  method="cv",
  number=3,
  verboseIter=TRUE,
  returnData=FALSE,
  returnResamp="all",
  classProbs=TRUE,
  allowParallel=TRUE
)

system.time(xgb_m2 <- train(x=train_pca_X,y=train_Y,
                     verbose=1,
                     objective="multi:softmax",
                     eval_metric="merror",
                     early.stop.round=5,
                     num_class=numClasses,
                     watchlist=list(train=xgb.DMatrix(data=train_pca_X,label=train_Y)),
                     trControl=xgb.trcontrol,
                     tuneGrid=xgb.grid,
                     method="xgbTree"
))

Model Validation

#Traning Data
xgb_pred_train <- predict(xgb_m2,train_pca_X)
train_cm <- confusionMatrix(xgb_pred_train,train_Y)
train_cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 2889    0    0    0    1    0    0    0    0    0
##          1    0 3274    0    0    0    0    0    0    0    0
##          2    0    0 2931    2    0    0    0    1    0    1
##          3    0    0    0 3031    0    0    0    0    2    1
##          4    0    0    0    0 2854    0    0    0    0    3
##          5    0    0    0    0    0 2668    0    0    0    1
##          6    0    0    0    0    0    0 2882    0    0    0
##          7    0    0    1    1    1    0    0 3081    1    2
##          8    1    0    0    3    0    1    0    0 2815    0
##          9    0    0    0    2    2    0    0    1    0 2950
## 
## Overall Statistics
##                                           
##                Accuracy : 0.999           
##                  95% CI : (0.9986, 0.9994)
##     No Information Rate : 0.1113          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9989          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.99965   1.0000  0.99966   0.9974  0.99860  0.99963
## Specificity           0.99996   1.0000  0.99985   0.9999  0.99989  0.99996
## Pos Pred Value        0.99965   1.0000  0.99864   0.9990  0.99895  0.99963
## Neg Pred Value        0.99996   1.0000  0.99996   0.9997  0.99985  0.99996
## Prevalence            0.09829   0.1113  0.09972   0.1034  0.09720  0.09077
## Detection Rate        0.09826   0.1113  0.09968   0.1031  0.09706  0.09074
## Detection Prevalence  0.09829   0.1113  0.09982   0.1032  0.09717  0.09077
## Balanced Accuracy     0.99981   1.0000  0.99975   0.9986  0.99924  0.99979
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           1.00000   0.9994  0.99894   0.9973
## Specificity           1.00000   0.9998  0.99981   0.9998
## Pos Pred Value        1.00000   0.9981  0.99823   0.9983
## Neg Pred Value        1.00000   0.9999  0.99989   0.9997
## Prevalence            0.09802   0.1049  0.09584   0.1006
## Detection Rate        0.09802   0.1048  0.09574   0.1003
## Detection Prevalence  0.09802   0.1050  0.09591   0.1005
## Balanced Accuracy     1.00000   0.9996  0.99937   0.9986

#Test Data
xgb_pred_test <- predict(xgb_m2,test_pca_X)
test_cm <- confusionMatrix(xgb_pred_test,test_Y)
test_cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 1216    0    9    0    2    6   10    4    2    9
##          1    0 1390    2    2    4    0    2    7    9    2
##          2    2    2 1178   21    7    2    0   15   12    0
##          3    4    5   10 1214    0   26    0    2   24   16
##          4    1    1    8    3 1159   10    5    8    5   16
##          5    4    2    3   27    0 1047   15    1   14    6
##          6    7    1    7    3    7   17 1219    0    6    0
##          7    0    3   11    5    2    1    0 1255    4   17
##          8    7    6   15   24    3   10    4    4 1153   12
##          9    1    0    2   13   30    7    0   22   16 1152
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9513         
##                  95% CI : (0.9474, 0.955)
##     No Information Rate : 0.1119         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9458         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.97907   0.9858  0.94618  0.92530  0.95470  0.92984
## Specificity           0.99630   0.9975  0.99463  0.99229  0.99499  0.99372
## Pos Pred Value        0.96661   0.9803  0.95077  0.93313  0.95313  0.93566
## Neg Pred Value        0.99771   0.9982  0.99410  0.99132  0.99517  0.99312
## Prevalence            0.09859   0.1119  0.09883  0.10415  0.09637  0.08939
## Detection Rate        0.09653   0.1103  0.09351  0.09637  0.09201  0.08312
## Detection Prevalence  0.09987   0.1126  0.09836  0.10328  0.09653  0.08883
## Balanced Accuracy     0.98768   0.9917  0.97041  0.95880  0.97484  0.96178
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.97131  0.95220  0.92610  0.93659
## Specificity           0.99577  0.99619  0.99251  0.99199
## Pos Pred Value        0.96212  0.96687  0.93134  0.92679
## Neg Pred Value        0.99682  0.99442  0.99190  0.99313
## Prevalence            0.09963  0.10463  0.09883  0.09764
## Detection Rate        0.09677  0.09963  0.09153  0.09145
## Detection Prevalence  0.10058  0.10304  0.09828  0.09867
## Balanced Accuracy     0.98354  0.97419  0.95931  0.96429

So from the output above we can see that both training & testing have an accuracy higher than 95%, with the training data accuracy being 99.9% & the testing data accuracy being 95.13%.