More details about the problem statement and the data can be found here
library(xgboost)
library(caret)
train_raw <- read.csv("/Users/singh3ab/Desktop/Kaggle/train.csv")
test_raw <- read.csv("/Users/singh3ab/Desktop/Kaggle/test.csv")
set.seed(150689)
split <- createDataPartition(y=train_raw$label,p=0.7,list=FALSE)
train_data <- train_raw[split,]
test_data <- train_raw[-split,]
train_X <- train_data[,-1]
train_Y <- train_data[,1]
test_X <- test_data[,-1]
test_Y <- test_data[,1]
digitClasses <- as.character(0:9)
numClasses <- length(digitClasses)
param1 <- list("objective" = "multi:softmax", # multiclass classification
"eval_metric" = "merror", # evaluation metric
"nthread" = 8, # number of threads to be used
"max_depth" = 10, # maximum depth of tree
"eta" = 0.2, # step size shrinkage
"gamma" = 0, # minimum loss reduction
"subsample" = 0.66, # part of data instances to grow tree
"colsample_bytree" = 0.75, # subsample ratio of columns when constructing each tree
"min_child_weight" = 5 # minimum sum of instance weight needed in a child
)
set.seed(13579)
system.time(xgb_m1 <- xgboost(data=xgb.DMatrix(model.matrix(~.,data=train_X),label=train_Y),
verbose=1,
num_class=numClasses,
nrounds=200,
early.stop.round=3,
params=param1))
system.time(xgb_pred1 <- predict(xgb_m1,newdata=xgb.DMatrix(model.matrix(~.,data=test_X),label=test_Y)))
## user system elapsed
## 1.30 0.06 0.53
confusionMatrix(xgb_pred1,test_Y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 1219 0 7 1 2 4 7 5 2 7
## 1 0 1392 5 5 3 0 1 6 6 4
## 2 2 3 1198 22 3 2 0 9 6 2
## 3 1 6 6 1228 0 14 0 3 8 13
## 4 1 1 4 2 1172 2 6 5 8 13
## 5 1 2 0 11 0 1080 9 1 5 2
## 6 7 1 5 1 8 9 1227 0 6 0
## 7 0 1 10 6 1 2 0 1265 2 19
## 8 10 4 5 24 1 7 5 6 1185 8
## 9 1 0 5 12 24 6 0 18 17 1162
##
## Overall Statistics
##
## Accuracy : 0.9628
## 95% CI : (0.9593, 0.966)
## No Information Rate : 0.1119
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9586
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.98148 0.9872 0.96225 0.93598 0.96540 0.95915
## Specificity 0.99692 0.9973 0.99568 0.99548 0.99631 0.99730
## Pos Pred Value 0.97209 0.9789 0.96071 0.96013 0.96540 0.97210
## Neg Pred Value 0.99797 0.9984 0.99586 0.99258 0.99631 0.99600
## Prevalence 0.09859 0.1119 0.09883 0.10415 0.09637 0.08939
## Detection Rate 0.09677 0.1105 0.09510 0.09748 0.09304 0.08573
## Detection Prevalence 0.09955 0.1129 0.09899 0.10153 0.09637 0.08820
## Balanced Accuracy 0.98920 0.9923 0.97897 0.96573 0.98086 0.97822
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.97769 0.9598 0.95181 0.94472
## Specificity 0.99674 0.9964 0.99383 0.99270
## Pos Pred Value 0.97073 0.9686 0.94422 0.93333
## Neg Pred Value 0.99753 0.9953 0.99471 0.99401
## Prevalence 0.09963 0.1046 0.09883 0.09764
## Detection Rate 0.09740 0.1004 0.09407 0.09224
## Detection Prevalence 0.10034 0.1037 0.09963 0.09883
## Balanced Accuracy 0.98721 0.9781 0.97282 0.96871
data <- train_raw[,-1] / 255.0 # Scale all pixel values to [0,1]
covX <- cov(data) # Covariance matrix
pca <- prcomp(covX) # Perform PCA
#Variance Explained
var_exp <- as.data.frame(pca$sdev^2/sum(pca$sdev^2))
var_exp <- cbind(c(1:ncol(data)),var_exp,cumsum(var_exp[,1]))
colnames(var_exp) <- c("Principal_Components","Variance","Cumulative_Variance")
#Plotting the Variance Curves
#Individual Variance
plot(var_exp$Principal_Components,var_exp$Variance,type='b',xlim=c(0,100),pch=16,xlab = "Principal Componets",ylab = "Variance",main = 'Principal Components vs Variance')
#Cumulative Variance
plot(var_exp$Principal_Components,var_exp$Cumulative_Variance,type='b',xlim=c(0,100),pch=16,xlab = "Principal Componets",ylab = "Cumulative Variance",main = 'Principal Components vs Cumulative Variance')
var_exp[50:60,]
## Principal_Components Variance Cumulative_Variance
## 50 50 0.0003355017 0.9929776
## 51 51 0.0003227039 0.9933003
## 52 52 0.0003128051 0.9936131
## 53 53 0.0002820244 0.9938951
## 54 54 0.0002668536 0.9941620
## 55 55 0.0002571440 0.9944191
## 56 56 0.0002380925 0.9946572
## 57 57 0.0002311834 0.9948884
## 58 58 0.0002145745 0.9951030
## 59 59 0.0002101487 0.9953131
## 60 60 0.0001984946 0.9955116
From the variance plot, we see that 60 PCs explain ~99.5% of the data. So we will use 60 instead of 784
pca_fin <- pca$rotation[,1:60] # Rotaion matrix (784x60)
PCA <- function(X) { # Reduce observations from N x 784 to N x 60
as.matrix(X/255) %*% pca_fin
}
train_pca_X <- PCA(train_X)
test_pca_X <- PCA(test_X)
xgb.grid <- expand.grid(
nrounds=100,
max_depth=c(5,10,15),
eta=c(0.5,0.2,0.1),
gamma=c(0,0.5),
colsample_bytree=0.75,
min_child_weight=5
)
xgb.trcontrol <- trainControl(
method="cv",
number=3,
verboseIter=TRUE,
returnData=FALSE,
returnResamp="all",
classProbs=TRUE,
allowParallel=TRUE
)
system.time(xgb_m2 <- train(x=train_pca_X,y=train_Y,
verbose=1,
objective="multi:softmax",
eval_metric="merror",
early.stop.round=5,
num_class=numClasses,
watchlist=list(train=xgb.DMatrix(data=train_pca_X,label=train_Y)),
trControl=xgb.trcontrol,
tuneGrid=xgb.grid,
method="xgbTree"
))
#Traning Data
xgb_pred_train <- predict(xgb_m2,train_pca_X)
train_cm <- confusionMatrix(xgb_pred_train,train_Y)
train_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 2889 0 0 0 1 0 0 0 0 0
## 1 0 3274 0 0 0 0 0 0 0 0
## 2 0 0 2931 2 0 0 0 1 0 1
## 3 0 0 0 3031 0 0 0 0 2 1
## 4 0 0 0 0 2854 0 0 0 0 3
## 5 0 0 0 0 0 2668 0 0 0 1
## 6 0 0 0 0 0 0 2882 0 0 0
## 7 0 0 1 1 1 0 0 3081 1 2
## 8 1 0 0 3 0 1 0 0 2815 0
## 9 0 0 0 2 2 0 0 1 0 2950
##
## Overall Statistics
##
## Accuracy : 0.999
## 95% CI : (0.9986, 0.9994)
## No Information Rate : 0.1113
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9989
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.99965 1.0000 0.99966 0.9974 0.99860 0.99963
## Specificity 0.99996 1.0000 0.99985 0.9999 0.99989 0.99996
## Pos Pred Value 0.99965 1.0000 0.99864 0.9990 0.99895 0.99963
## Neg Pred Value 0.99996 1.0000 0.99996 0.9997 0.99985 0.99996
## Prevalence 0.09829 0.1113 0.09972 0.1034 0.09720 0.09077
## Detection Rate 0.09826 0.1113 0.09968 0.1031 0.09706 0.09074
## Detection Prevalence 0.09829 0.1113 0.09982 0.1032 0.09717 0.09077
## Balanced Accuracy 0.99981 1.0000 0.99975 0.9986 0.99924 0.99979
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 1.00000 0.9994 0.99894 0.9973
## Specificity 1.00000 0.9998 0.99981 0.9998
## Pos Pred Value 1.00000 0.9981 0.99823 0.9983
## Neg Pred Value 1.00000 0.9999 0.99989 0.9997
## Prevalence 0.09802 0.1049 0.09584 0.1006
## Detection Rate 0.09802 0.1048 0.09574 0.1003
## Detection Prevalence 0.09802 0.1050 0.09591 0.1005
## Balanced Accuracy 1.00000 0.9996 0.99937 0.9986
#Test Data
xgb_pred_test <- predict(xgb_m2,test_pca_X)
test_cm <- confusionMatrix(xgb_pred_test,test_Y)
test_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 1216 0 9 0 2 6 10 4 2 9
## 1 0 1390 2 2 4 0 2 7 9 2
## 2 2 2 1178 21 7 2 0 15 12 0
## 3 4 5 10 1214 0 26 0 2 24 16
## 4 1 1 8 3 1159 10 5 8 5 16
## 5 4 2 3 27 0 1047 15 1 14 6
## 6 7 1 7 3 7 17 1219 0 6 0
## 7 0 3 11 5 2 1 0 1255 4 17
## 8 7 6 15 24 3 10 4 4 1153 12
## 9 1 0 2 13 30 7 0 22 16 1152
##
## Overall Statistics
##
## Accuracy : 0.9513
## 95% CI : (0.9474, 0.955)
## No Information Rate : 0.1119
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9458
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.97907 0.9858 0.94618 0.92530 0.95470 0.92984
## Specificity 0.99630 0.9975 0.99463 0.99229 0.99499 0.99372
## Pos Pred Value 0.96661 0.9803 0.95077 0.93313 0.95313 0.93566
## Neg Pred Value 0.99771 0.9982 0.99410 0.99132 0.99517 0.99312
## Prevalence 0.09859 0.1119 0.09883 0.10415 0.09637 0.08939
## Detection Rate 0.09653 0.1103 0.09351 0.09637 0.09201 0.08312
## Detection Prevalence 0.09987 0.1126 0.09836 0.10328 0.09653 0.08883
## Balanced Accuracy 0.98768 0.9917 0.97041 0.95880 0.97484 0.96178
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.97131 0.95220 0.92610 0.93659
## Specificity 0.99577 0.99619 0.99251 0.99199
## Pos Pred Value 0.96212 0.96687 0.93134 0.92679
## Neg Pred Value 0.99682 0.99442 0.99190 0.99313
## Prevalence 0.09963 0.10463 0.09883 0.09764
## Detection Rate 0.09677 0.09963 0.09153 0.09145
## Detection Prevalence 0.10058 0.10304 0.09828 0.09867
## Balanced Accuracy 0.98354 0.97419 0.95931 0.96429
So from the output above we can see that both training & testing have an accuracy higher than 95%, with the training data accuracy being 99.9% & the testing data accuracy being 95.13%.