library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(klaR)
## Loading required package: MASS
library(naivebayes)
## naivebayes 0.9.6 loaded
This project performs digit recognition on handwritten-digit images using machine learning. Three algorithms are used: naive Bayes, random forest and SVM. By constructing different classifiers and comparing their performance, this study chooses the best one to predict the image digits.
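All the algorithms are trained on the first 20 principal components obtained by reducing the dimensions of the original dataset. The major data cleaning includes the following two steps: removing the pixel columns whose values are all 0, and projecting the remaining columns onto their principal components (PCA).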
# Input files: the labelled training sample and the sample to predict.
train <- "Kaggle-digit-train-sample-small-1400.csv"
pred <- "Kaggle-digit-test-sample1000.csv"

# Load both comma-separated files; the first row holds the column names.
trainDf <- read.csv(file = train, header = TRUE, sep = ",")
predDf <- read.csv(file = pred, header = TRUE, sep = ",")

# The label is read as an integer; store it as a factor so that the models
# treat the task as classification.
trainDf[["label"]] <- as.factor(trainDf[["label"]])
# Data cleaning: drop the columns whose values are all 0.
# Column 1 is the label, columns 2..785 are summed one by one.
zeroCol <- colSums(trainDf[, 2:785])

# Positions (within trainDf, hence the +1 for the label column) of the
# columns whose total is positive.
zeronum <- which(zeroCol > 0) + 1

# Keep the label followed by the retained columns.
trainDf0 <- trainDf[, c(1, zeronum)]
# PCA on every column except the label (column 1), centred and scaled.
pca <- prcomp(x = trainDf0[, -1], center = TRUE, scale. = TRUE)

# Project the training rows onto the principal components.
pc_scores <- predict(pca, newdata = trainDf0)
pcs <- as.data.frame(pc_scores)

# Keep the first 20 components and re-attach the class label.
pcs1 <- pcs[, seq_len(20)]
pcs1[["label"]] <- as.factor(trainDf0[["label"]])
head(pcs1)
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 2.144014 14.3684391 8.981672 -1.655067 2.4234601 1.5913458
## 2 7.707694 -2.0777231 -3.655695 2.936247 6.3794573 -1.9886359
## 3 -5.558268 8.3590357 -2.884201 9.115514 2.5036470 0.4973012
## 4 2.199615 0.6672391 -1.163542 -7.937492 -0.8826763 -6.8821894
## 5 -5.203437 -8.8061093 6.230741 1.665698 4.4121017 1.5961947
## 6 -12.500652 13.8806815 -18.265989 4.562380 1.7565104 10.7341932
## PC7 PC8 PC9 PC10 PC11 PC12
## 1 4.5273981 -9.0439748 9.6839723 3.9275278 7.34976321 4.9960456
## 2 1.9662906 1.1200049 -1.2653708 -1.0494600 -0.42082051 -0.9346120
## 3 0.5597759 -2.9595400 -1.7854735 13.8317641 -1.06634512 2.3552781
## 4 -0.9475827 0.9951338 0.5480592 -0.1772277 -1.49886036 0.2746483
## 5 5.1903765 -1.6140891 -1.8117808 5.3434150 -0.01639397 -1.1688908
## 6 -6.8777339 -6.4154007 -7.0967063 7.6892506 8.95217804 -5.7245288
## PC13 PC14 PC15 PC16 PC17 PC18
## 1 4.4461421 -1.2954902 0.027406739 1.9282573 -0.2319545 0.8106651
## 2 -0.8473155 1.8070304 -0.370526389 -0.1635908 0.8471344 1.0133916
## 3 -0.3753569 9.4051045 -7.440950334 1.6775289 -8.2673479 4.8958219
## 4 1.5257361 -0.7653945 -0.002122787 -0.1748738 0.2647101 1.1843010
## 5 -4.1995923 -2.6065327 2.037016918 -1.6942583 3.5965902 0.2723162
## 6 1.6897618 -5.0826689 -0.131055428 2.2452819 3.8628339 6.2408094
## PC19 PC20 label
## 1 0.554055222 -0.51303866 7
## 2 -0.007060992 -0.85088997 1
## 3 -7.259290927 -1.46222194 6
## 4 0.156271897 -0.10987729 5
## 5 -0.600623181 -0.08510289 0
## 6 -3.940859742 -8.16357694 6
# Stratified 70/30 split of the PCA scores into training and hold-out sets.
# createDataPartition() samples at random, so fix the RNG seed first:
# without it every run produces a different split and the accuracies
# reported in the text cannot be reproduced.
set.seed(1818)
train_index <- createDataPartition(pcs1$label, p = 0.7, list = FALSE)

# Rows selected by the partition form the training set.
train_df <- pcs1[train_index, ]
dim(train_df)
## [1] 982 21
# All remaining rows form the hold-out (test) set.
test_df <- pcs1[-train_index, ]
dim(test_df)
## [1] 418 21
# Class proportions in the full data, for comparison with the training set.
prop.table(table(pcs1$label))
##
## 0 1 2 3 4 5
## 0.10142857 0.11928571 0.09285714 0.10000000 0.09357143 0.09071429
## 6 7 8 9
## 0.10357143 0.09785714 0.09785714 0.10285714
# Class proportions within the training split.
prop.table(table(train_df[["label"]]))
##
## 0 1 2 3 4 5
## 0.10183299 0.11914460 0.09266802 0.09979633 0.09368635 0.09063136
## 6 7 8 9
## 0.10386965 0.09775967 0.09775967 0.10285132
Naive Bayes using default hyperparameters. The accuracy is 0.7512, which is much higher than the No Information Rate (0.1196).
# Naive Bayes with caret's default tuning.
# Seed the RNG immediately before train() so that the resamples are
# reproducible and, by using the same seed as the SVM fit in this script,
# identical across the models that resamples() compares later.
set.seed(1818)
Start1 <- Sys.time()
model_nb1 <- train(label ~ ., data = train_df, method = "nb")
End1 <- Sys.time() - Start1  # elapsed fitting time (difftime)

# Predict the class of every hold-out row and compare with the true labels.
predict_nb1 <- predict(model_nb1, newdata = test_df, type = "raw")
confusionMatrix(predict_nb1, test_df$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 37 0 0 0 1 0 1 1 0 1
## 1 0 46 0 0 1 0 0 0 2 1
## 2 0 1 30 1 0 0 2 1 5 2
## 3 0 0 2 30 1 4 1 1 3 1
## 4 0 0 1 1 22 2 0 2 0 5
## 5 0 0 2 3 3 29 0 1 2 0
## 6 3 0 2 0 0 1 39 1 0 0
## 7 1 0 1 0 0 0 0 25 0 5
## 8 0 3 1 6 0 2 0 2 28 0
## 9 1 0 0 1 11 0 0 7 1 28
##
## Overall Statistics
##
## Accuracy : 0.7512
## 95% CI : (0.7069, 0.7919)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7233
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.88095 0.9200 0.76923 0.71429 0.56410 0.76316
## Specificity 0.98936 0.9891 0.96834 0.96543 0.97098 0.97105
## Pos Pred Value 0.90244 0.9200 0.71429 0.69767 0.66667 0.72500
## Neg Pred Value 0.98674 0.9891 0.97606 0.96800 0.95584 0.97619
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.08852 0.1100 0.07177 0.07177 0.05263 0.06938
## Detection Prevalence 0.09809 0.1196 0.10048 0.10287 0.07895 0.09569
## Balanced Accuracy 0.93516 0.9546 0.86878 0.83986 0.76754 0.86711
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.9070 0.60976 0.68293 0.65116
## Specificity 0.9813 0.98143 0.96286 0.94400
## Pos Pred Value 0.8478 0.78125 0.66667 0.57143
## Neg Pred Value 0.9892 0.95855 0.96543 0.95935
## Prevalence 0.1029 0.09809 0.09809 0.10287
## Detection Rate 0.0933 0.05981 0.06699 0.06699
## Detection Prevalence 0.1100 0.07656 0.10048 0.11722
## Balanced Accuracy 0.9442 0.79559 0.82290 0.79758
After tuning, the accuracy is the same (0.7512).
# Naive Bayes tuned with 3-fold cross-validation over a grid of
# fL, usekernel and adjust values.
# The folds are drawn at random, so seed the RNG to make the selected
# hyperparameters and the reported accuracy reproducible.
set.seed(1818)
Start2 <- Sys.time()
model_nb2 <- train(
  label ~ .,
  data = train_df,
  method = "nb",
  trControl = trainControl(method = "cv", number = 3),
  tuneGrid = expand.grid(
    fL = 1:3,
    usekernel = c(TRUE, FALSE),
    adjust = 1:3
  )
)
End2 <- Sys.time() - Start2  # elapsed fitting time (difftime)

# Evaluate the tuned model on the hold-out set.
predict_nb2 <- predict(model_nb2, newdata = test_df, type = "raw")
confusionMatrix(predict_nb2, test_df$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 37 0 0 0 1 0 1 1 0 1
## 1 0 46 0 0 1 0 0 0 2 1
## 2 0 1 30 1 0 0 2 1 5 2
## 3 0 0 2 30 1 4 1 1 3 1
## 4 0 0 1 1 22 2 0 2 0 5
## 5 0 0 2 3 3 29 0 1 2 0
## 6 3 0 2 0 0 1 39 1 0 0
## 7 1 0 1 0 0 0 0 25 0 5
## 8 0 3 1 6 0 2 0 2 28 0
## 9 1 0 0 1 11 0 0 7 1 28
##
## Overall Statistics
##
## Accuracy : 0.7512
## 95% CI : (0.7069, 0.7919)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7233
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.88095 0.9200 0.76923 0.71429 0.56410 0.76316
## Specificity 0.98936 0.9891 0.96834 0.96543 0.97098 0.97105
## Pos Pred Value 0.90244 0.9200 0.71429 0.69767 0.66667 0.72500
## Neg Pred Value 0.98674 0.9891 0.97606 0.96800 0.95584 0.97619
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.08852 0.1100 0.07177 0.07177 0.05263 0.06938
## Detection Prevalence 0.09809 0.1196 0.10048 0.10287 0.07895 0.09569
## Balanced Accuracy 0.93516 0.9546 0.86878 0.83986 0.76754 0.86711
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.9070 0.60976 0.68293 0.65116
## Specificity 0.9813 0.98143 0.96286 0.94400
## Pos Pred Value 0.8478 0.78125 0.66667 0.57143
## Neg Pred Value 0.9892 0.95855 0.96543 0.95935
## Prevalence 0.1029 0.09809 0.09809 0.10287
## Detection Rate 0.0933 0.05981 0.06699 0.06699
## Detection Prevalence 0.1100 0.07656 0.10048 0.11722
## Balanced Accuracy 0.9442 0.79559 0.82290 0.79758
With random forest, the accuracy improves to 0.8828, which is higher than naive Bayes.
# Random forest with caret's default tuning.
# Both the resampling and the forest itself are random; seed the RNG right
# before train() so the fit is reproducible and uses the same resamples as
# the other models compared with resamples() (same seed as the SVM fit).
set.seed(1818)
Start3 <- Sys.time()
model_rf <- train(label ~ ., data = train_df, method = "rf")
End3 <- Sys.time() - Start3  # elapsed fitting time (difftime)

# Hold-out predictions, then print the resampling summary of the fit.
pred_rf <- predict(model_rf, newdata = test_df)
model_rf
## Random Forest
##
## 982 samples
## 20 predictor
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 982, 982, 982, 982, 982, 982, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8437271 0.8259598
## 11 0.8174232 0.7966961
## 20 0.7908915 0.7672097
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Hold-out performance of the random forest: predictions vs. true labels.
confusionMatrix(data = pred_rf, reference = test_df$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 41 0 0 0 0 1 1 2 0 0
## 1 0 49 1 1 0 0 0 1 1 1
## 2 0 0 35 0 0 0 1 1 1 1
## 3 0 0 0 36 1 1 0 1 4 1
## 4 0 0 1 1 35 2 0 1 0 4
## 5 0 0 0 1 0 32 0 0 2 0
## 6 1 0 2 0 0 0 41 0 0 0
## 7 0 0 0 0 0 1 0 33 0 2
## 8 0 1 0 3 0 1 0 0 33 0
## 9 0 0 0 0 3 0 0 2 0 34
##
## Overall Statistics
##
## Accuracy : 0.8828
## 95% CI : (0.848, 0.912)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8696
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.97619 0.9800 0.89744 0.85714 0.89744 0.84211
## Specificity 0.98936 0.9864 0.98945 0.97872 0.97625 0.99211
## Pos Pred Value 0.91111 0.9074 0.89744 0.81818 0.79545 0.91429
## Neg Pred Value 0.99732 0.9973 0.98945 0.98396 0.98930 0.98433
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09809 0.1172 0.08373 0.08612 0.08373 0.07656
## Detection Prevalence 0.10766 0.1292 0.09330 0.10526 0.10526 0.08373
## Balanced Accuracy 0.98278 0.9832 0.94344 0.91793 0.93684 0.91711
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.95349 0.80488 0.80488 0.79070
## Specificity 0.99200 0.99204 0.98674 0.98667
## Pos Pred Value 0.93182 0.91667 0.86842 0.87179
## Neg Pred Value 0.99465 0.97906 0.97895 0.97625
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.09809 0.07895 0.07895 0.08134
## Detection Prevalence 0.10526 0.08612 0.09091 0.09330
## Balanced Accuracy 0.97274 0.89846 0.89581 0.88868
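SVM with a linear kernel. Data preprocessing: standardize the values (center and scale). Model tuning: bootstrap resampling (25 repetitions) over a grid of cost values C.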
# Linear SVM on centred and scaled predictors, tuned over the cost C with
# 25 bootstrap resamples.
set.seed(1818)
Start4 <- Sys.time()
model_svm_linear <- train(
  label ~ .,
  data = train_df,
  method = "svmLinear",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "boot", number = 25),
  # The cost must be strictly positive: C = 0 removes the misclassification
  # penalty and does not give a usable fit, so the grid starts at 0.05
  # instead of 0 (which only produced a failed/NA grid point).
  tuneGrid = expand.grid(C = seq(0.05, 1, 0.05))
)
End4 <- Sys.time() - Start4  # elapsed fitting time (difftime)

# Evaluate on the hold-out set.
predict_svm_linear <- predict(model_svm_linear, newdata = test_df)
confusionMatrix(predict_svm_linear, test_df$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 41 0 0 0 0 0 1 0 0 0
## 1 0 48 1 1 0 2 0 1 1 0
## 2 1 1 34 0 1 1 1 1 4 1
## 3 0 0 0 35 0 2 0 0 3 0
## 4 0 0 1 1 34 1 1 0 0 1
## 5 0 0 0 2 0 30 0 0 5 0
## 6 0 0 1 0 0 1 39 0 0 0
## 7 0 0 2 0 0 0 0 34 0 6
## 8 0 0 0 2 0 0 1 0 26 0
## 9 0 1 0 1 4 1 0 5 2 35
##
## Overall Statistics
##
## Accuracy : 0.8517
## 95% CI : (0.8139, 0.8844)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8351
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.97619 0.9600 0.87179 0.83333 0.87179 0.78947
## Specificity 0.99734 0.9837 0.97098 0.98670 0.98681 0.98158
## Pos Pred Value 0.97619 0.8889 0.75556 0.87500 0.87179 0.81081
## Neg Pred Value 0.99734 0.9945 0.98660 0.98148 0.98681 0.97900
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09809 0.1148 0.08134 0.08373 0.08134 0.07177
## Detection Prevalence 0.10048 0.1292 0.10766 0.09569 0.09330 0.08852
## Balanced Accuracy 0.98677 0.9718 0.92139 0.91002 0.92930 0.88553
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.90698 0.82927 0.63415 0.81395
## Specificity 0.99467 0.97878 0.99204 0.96267
## Pos Pred Value 0.95122 0.80952 0.89655 0.71429
## Neg Pred Value 0.98939 0.98138 0.96144 0.97832
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.09330 0.08134 0.06220 0.08373
## Detection Prevalence 0.09809 0.10048 0.06938 0.11722
## Balanced Accuracy 0.95082 0.90402 0.81309 0.88831
Random forest has the highest accuracy.
# Gather the resampling results of the three fitted models under short
# display names, then summarise their accuracy and kappa distributions.
model_comparison <- resamples(
  list(
    NB = model_nb1,
    RF = model_rf,
    SVMLinear = model_svm_linear
  )
)
summary(object = model_comparison)
##
## Call:
## summary.resamples(object = model_comparison)
##
## Models: NB, RF, SVMLinear
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NB 0.6963788 0.7351351 0.7391304 0.7408546 0.7521127 0.7774481 0
## RF 0.8111111 0.8304598 0.8423913 0.8437271 0.8579235 0.8771429 0
## SVMLinear 0.8133705 0.8333333 0.8392371 0.8430103 0.8571429 0.8694444 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NB 0.6625067 0.7054332 0.7093255 0.7115437 0.7243667 0.7520357 0
## RF 0.7895660 0.8107911 0.8245444 0.8259598 0.8418639 0.8631495 0
## SVMLinear 0.7924300 0.8143252 0.8207711 0.8252274 0.8410204 0.8542673 0
SVM is the fastest algorithm.
# Fitting time per algorithm.
# Each End* value is a difftime created with automatic units (seconds for
# short runs, minutes for longer ones), so convert every value to seconds
# explicitly before combining them; otherwise the figures may not be
# comparable.
# NOTE(review): the "NB" entry uses End2 (the tuned naive Bayes fit), not
# End1 (the default fit used in the model comparison) - confirm intent.
Duration <- as.difftime(
  vapply(list(End2, End3, End4), as.numeric, numeric(1), units = "secs"),
  units = "secs"
)
Algorithim <- c("NB", "RF", "SVM")

# One row per algorithm, named by the algorithm label.
data.frame(Duration = Duration, row.names = Algorithim)
## Duration
## NB 49.69468 secs
## RF 57.37902 secs
## SVM 35.05078 secs
The following step uses the random forest model to predict the results. Link to Kaggle: https://www.kaggle.com/wwwwwwc/kernel2b26715859?scriptVersionId=23243581
# Score the Kaggle test file with the random forest and build the
# submission table (ImageId, Label).
test <- read.csv("test.csv", header = TRUE, sep = ",")
set.seed(100)
# The unused 1000-row sample index that used to be drawn here was removed:
# every row of `test` is scored below.

# Keep the same columns that were retained for training. `zeronum` holds
# positions in the training frame, whose first column is the label; the
# shift by one presumably reflects that test.csv has no label column -
# verify against the file.
predDf <- test[, zeronum - 1]

# Project onto the training PCA and keep the first 20 components, exactly
# as was done for the training data.
pred_pcs <- as.data.frame(predict(pca, newdata = predDf))
pred_pcs <- pred_pcs[, 1:20]

predict_rf <- predict(model_rf, newdata = pred_pcs)
Label <- as.factor(predict_rf)

# One id per scored row instead of a hard-coded 1:28000.
ImageId <- seq_len(nrow(pred_pcs))

# Convert the factor through its level text to recover the digits 0-9.
# Binding the factor directly (cbind) stores the internal level codes
# 1-10, which shifted every prediction by one and produced labels of 10.
predict_result <- data.frame(
  ImageId = ImageId,
  Label = as.integer(as.character(Label))
)
predict_result <- predict_result[order(predict_result$ImageId, decreasing = FALSE), ]
#write.csv(predict_result,'predict_result.csv')
head(predict_result)
## ImageId Label
## 1 1 3
## 2 2 1
## 3 3 10
## 4 4 5
## 5 5 4
## 6 6 8