library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(klaR)
## Loading required package: MASS
library(naivebayes)
## naivebayes 0.9.6 loaded

Section 1: Introduction

This project performs digit recognition on handwriting images using machine learning. Three algorithms are used: naive Bayes, random forest and SVM. By constructing different classifiers and comparing their performance, this study chooses the best one to predict the digits in the images.

Data Preparation

All the algorithms are trained on the first 20 principal components obtained by reducing the dimensionality of the original dataset. Data preparation consists of the following two steps:

  • Remove redundant columns: delete columns where all the values are 0
  • Reduce dimension: use PCA to reduce the dimension, since there are currently too many variables
# Paths of the labelled training sample and the unlabelled prediction sample.
# NOTE(review): the name `train` shadows caret::train as a variable; calls such
# as train(...) still resolve to the function, but the reuse is easy to misread.
train <- 'Kaggle-digit-train-sample-small-1400.csv'
pred <- 'Kaggle-digit-test-sample1000.csv'

# Read both datasets.
trainDf <- read.csv(train, header = TRUE, sep = ',')
predDf <- read.csv(pred, header = TRUE, sep = ',')
trainDf$label <- as.factor(trainDf$label) # change label from integer to factor

# Data cleaning: drop every pixel column whose values are all 0.
# Column 1 is kept as-is below (the label), so the sums are taken over all
# remaining columns rather than a hard-coded 2:785 range.
zeroCol <- colSums(trainDf[, -1])
# +1 converts a position among the pixel columns back to a position in trainDf.
zeronum <- which(zeroCol > 0) + 1
trainDf0 <- trainDf[, c(1, zeronum)]

# PCA on the centred and scaled pixel columns; keep the first 20 components.
pca <- prcomp(trainDf0[, -1], scale. = TRUE, center = TRUE)
pcs <- as.data.frame(predict(pca, newdata = trainDf0))
pcs1 <- pcs[, 1:20]
# trainDf0$label is already a factor, so no further conversion is needed.
pcs1$label <- trainDf0$label
head(pcs1)
##          PC1        PC2        PC3       PC4        PC5        PC6
## 1   2.144014 14.3684391   8.981672 -1.655067  2.4234601  1.5913458
## 2   7.707694 -2.0777231  -3.655695  2.936247  6.3794573 -1.9886359
## 3  -5.558268  8.3590357  -2.884201  9.115514  2.5036470  0.4973012
## 4   2.199615  0.6672391  -1.163542 -7.937492 -0.8826763 -6.8821894
## 5  -5.203437 -8.8061093   6.230741  1.665698  4.4121017  1.5961947
## 6 -12.500652 13.8806815 -18.265989  4.562380  1.7565104 10.7341932
##          PC7        PC8        PC9       PC10        PC11       PC12
## 1  4.5273981 -9.0439748  9.6839723  3.9275278  7.34976321  4.9960456
## 2  1.9662906  1.1200049 -1.2653708 -1.0494600 -0.42082051 -0.9346120
## 3  0.5597759 -2.9595400 -1.7854735 13.8317641 -1.06634512  2.3552781
## 4 -0.9475827  0.9951338  0.5480592 -0.1772277 -1.49886036  0.2746483
## 5  5.1903765 -1.6140891 -1.8117808  5.3434150 -0.01639397 -1.1688908
## 6 -6.8777339 -6.4154007 -7.0967063  7.6892506  8.95217804 -5.7245288
##         PC13       PC14         PC15       PC16       PC17      PC18
## 1  4.4461421 -1.2954902  0.027406739  1.9282573 -0.2319545 0.8106651
## 2 -0.8473155  1.8070304 -0.370526389 -0.1635908  0.8471344 1.0133916
## 3 -0.3753569  9.4051045 -7.440950334  1.6775289 -8.2673479 4.8958219
## 4  1.5257361 -0.7653945 -0.002122787 -0.1748738  0.2647101 1.1843010
## 5 -4.1995923 -2.6065327  2.037016918 -1.6942583  3.5965902 0.2723162
## 6  1.6897618 -5.0826689 -0.131055428  2.2452819  3.8628339 6.2408094
##           PC19        PC20 label
## 1  0.554055222 -0.51303866     7
## 2 -0.007060992 -0.85088997     1
## 3 -7.259290927 -1.46222194     6
## 4  0.156271897 -0.10987729     5
## 5 -0.600623181 -0.08510289     0
## 6 -3.940859742 -8.16357694     6

Construct Train and Test Data

# Seed the RNG so the 70/30 split below is the same on every run; without it
# the partition (and every accuracy figure derived from it) changes per run.
set.seed(1818)
train_index <- createDataPartition(pcs1$label, p = 0.7, list = FALSE)
train_df <- pcs1[train_index, ]
dim(train_df)
## [1] 982  21
# Everything not selected for training becomes the hold-out test set.
test_df <- pcs1[-train_index, ]
dim(test_df)
## [1] 418  21
# Compare class proportions in the full data and in the training split.
prop.table(table(pcs1$label))
## 
##          0          1          2          3          4          5 
## 0.10142857 0.11928571 0.09285714 0.10000000 0.09357143 0.09071429 
##          6          7          8          9 
## 0.10357143 0.09785714 0.09785714 0.10285714
prop.table(table(train_df$label))
## 
##          0          1          2          3          4          5 
## 0.10183299 0.11914460 0.09266802 0.09979633 0.09368635 0.09063136 
##          6          7          8          9 
## 0.10386965 0.09775967 0.09775967 0.10285132

Section 2: naive Bayes method

Naive Bayesian

Naive Bayes with default hyperparameters. The accuracy is 0.7512, which is much higher than the No Information Rate (0.1196).

# Baseline naive Bayes: fit with caret's default resampling and tuning grid,
# timing the fit, then score the hold-out set.
Start1 <- Sys.time()
model_nb1 <- train(
  label ~ .,
  data = train_df,
  method = "nb"
)
End1 <- Sys.time() - Start1

# Class predictions on the test split, cross-tabulated against the true labels.
predict_nb1 <- predict(model_nb1, newdata = test_df, type = "raw")
confusionMatrix(data = predict_nb1, reference = test_df$label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3  4  5  6  7  8  9
##          0 37  0  0  0  1  0  1  1  0  1
##          1  0 46  0  0  1  0  0  0  2  1
##          2  0  1 30  1  0  0  2  1  5  2
##          3  0  0  2 30  1  4  1  1  3  1
##          4  0  0  1  1 22  2  0  2  0  5
##          5  0  0  2  3  3 29  0  1  2  0
##          6  3  0  2  0  0  1 39  1  0  0
##          7  1  0  1  0  0  0  0 25  0  5
##          8  0  3  1  6  0  2  0  2 28  0
##          9  1  0  0  1 11  0  0  7  1 28
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7512          
##                  95% CI : (0.7069, 0.7919)
##     No Information Rate : 0.1196          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7233          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.88095   0.9200  0.76923  0.71429  0.56410  0.76316
## Specificity           0.98936   0.9891  0.96834  0.96543  0.97098  0.97105
## Pos Pred Value        0.90244   0.9200  0.71429  0.69767  0.66667  0.72500
## Neg Pred Value        0.98674   0.9891  0.97606  0.96800  0.95584  0.97619
## Prevalence            0.10048   0.1196  0.09330  0.10048  0.09330  0.09091
## Detection Rate        0.08852   0.1100  0.07177  0.07177  0.05263  0.06938
## Detection Prevalence  0.09809   0.1196  0.10048  0.10287  0.07895  0.09569
## Balanced Accuracy     0.93516   0.9546  0.86878  0.83986  0.76754  0.86711
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity            0.9070  0.60976  0.68293  0.65116
## Specificity            0.9813  0.98143  0.96286  0.94400
## Pos Pred Value         0.8478  0.78125  0.66667  0.57143
## Neg Pred Value         0.9892  0.95855  0.96543  0.95935
## Prevalence             0.1029  0.09809  0.09809  0.10287
## Detection Rate         0.0933  0.05981  0.06699  0.06699
## Detection Prevalence   0.1100  0.07656  0.10048  0.11722
## Balanced Accuracy      0.9442  0.79559  0.82290  0.79758

Tune Model

After tuning, the accuracy is the same.

# Tuned naive Bayes: 3-fold cross-validation over an explicit grid of
# fL (Laplace factor), usekernel (kernel density on/off) and adjust (bandwidth).
# NOTE(review): all predictors here are numeric principal components; fL
# presumably has no effect on them, so the three fL values may only add
# fitting time -- confirm against the klaR NaiveBayes documentation.
Start2 <- Sys.time()
nb_control <- trainControl(method = "cv", number = 3)
nb_grid <- expand.grid(
  fL = 1:3,
  usekernel = c(TRUE, FALSE),
  adjust = 1:3
)
model_nb2 <- train(
  label ~ .,
  data = train_df,
  method = "nb",
  trControl = nb_control,
  tuneGrid = nb_grid
)
End2 <- Sys.time() - Start2

# Evaluate the tuned model on the hold-out set.
predict_nb2 <- predict(model_nb2, newdata = test_df, type = "raw")
confusionMatrix(data = predict_nb2, reference = test_df$label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3  4  5  6  7  8  9
##          0 37  0  0  0  1  0  1  1  0  1
##          1  0 46  0  0  1  0  0  0  2  1
##          2  0  1 30  1  0  0  2  1  5  2
##          3  0  0  2 30  1  4  1  1  3  1
##          4  0  0  1  1 22  2  0  2  0  5
##          5  0  0  2  3  3 29  0  1  2  0
##          6  3  0  2  0  0  1 39  1  0  0
##          7  1  0  1  0  0  0  0 25  0  5
##          8  0  3  1  6  0  2  0  2 28  0
##          9  1  0  0  1 11  0  0  7  1 28
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7512          
##                  95% CI : (0.7069, 0.7919)
##     No Information Rate : 0.1196          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7233          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.88095   0.9200  0.76923  0.71429  0.56410  0.76316
## Specificity           0.98936   0.9891  0.96834  0.96543  0.97098  0.97105
## Pos Pred Value        0.90244   0.9200  0.71429  0.69767  0.66667  0.72500
## Neg Pred Value        0.98674   0.9891  0.97606  0.96800  0.95584  0.97619
## Prevalence            0.10048   0.1196  0.09330  0.10048  0.09330  0.09091
## Detection Rate        0.08852   0.1100  0.07177  0.07177  0.05263  0.06938
## Detection Prevalence  0.09809   0.1196  0.10048  0.10287  0.07895  0.09569
## Balanced Accuracy     0.93516   0.9546  0.86878  0.83986  0.76754  0.86711
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity            0.9070  0.60976  0.68293  0.65116
## Specificity            0.9813  0.98143  0.96286  0.94400
## Pos Pred Value         0.8478  0.78125  0.66667  0.57143
## Neg Pred Value         0.9892  0.95855  0.96543  0.95935
## Prevalence             0.1029  0.09809  0.09809  0.10287
## Detection Rate         0.0933  0.05981  0.06699  0.06699
## Detection Prevalence   0.1100  0.07656  0.10048  0.11722
## Balanced Accuracy      0.9442  0.79559  0.82290  0.79758

Section 3: Random Forest method

Random Forest

The accuracy improves to 0.8828, which is higher than that of naive Bayes.

# Random forest with caret's default settings; the fit is timed so it can be
# compared with the other algorithms later.
Start3 <- Sys.time()
model_rf <- train(
  label ~ .,
  data = train_df,
  method = "rf"
)
End3 <- Sys.time() - Start3

# Hold-out predictions, then print the resampling summary of the fitted model.
pred_rf <- predict(model_rf, newdata = test_df)
model_rf
## Random Forest 
## 
## 982 samples
##  20 predictor
##  10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 982, 982, 982, 982, 982, 982, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8437271  0.8259598
##   11    0.8174232  0.7966961
##   20    0.7908915  0.7672097
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
confusionMatrix(data = pred_rf, reference = test_df$label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3  4  5  6  7  8  9
##          0 41  0  0  0  0  1  1  2  0  0
##          1  0 49  1  1  0  0  0  1  1  1
##          2  0  0 35  0  0  0  1  1  1  1
##          3  0  0  0 36  1  1  0  1  4  1
##          4  0  0  1  1 35  2  0  1  0  4
##          5  0  0  0  1  0 32  0  0  2  0
##          6  1  0  2  0  0  0 41  0  0  0
##          7  0  0  0  0  0  1  0 33  0  2
##          8  0  1  0  3  0  1  0  0 33  0
##          9  0  0  0  0  3  0  0  2  0 34
## 
## Overall Statistics
##                                         
##                Accuracy : 0.8828        
##                  95% CI : (0.848, 0.912)
##     No Information Rate : 0.1196        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8696        
##                                         
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.97619   0.9800  0.89744  0.85714  0.89744  0.84211
## Specificity           0.98936   0.9864  0.98945  0.97872  0.97625  0.99211
## Pos Pred Value        0.91111   0.9074  0.89744  0.81818  0.79545  0.91429
## Neg Pred Value        0.99732   0.9973  0.98945  0.98396  0.98930  0.98433
## Prevalence            0.10048   0.1196  0.09330  0.10048  0.09330  0.09091
## Detection Rate        0.09809   0.1172  0.08373  0.08612  0.08373  0.07656
## Detection Prevalence  0.10766   0.1292  0.09330  0.10526  0.10526  0.08373
## Balanced Accuracy     0.98278   0.9832  0.94344  0.91793  0.93684  0.91711
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.95349  0.80488  0.80488  0.79070
## Specificity           0.99200  0.99204  0.98674  0.98667
## Pos Pred Value        0.93182  0.91667  0.86842  0.87179
## Neg Pred Value        0.99465  0.97906  0.97895  0.97625
## Prevalence            0.10287  0.09809  0.09809  0.10287
## Detection Rate        0.09809  0.07895  0.07895  0.08134
## Detection Prevalence  0.10526  0.08612  0.09091  0.09330
## Balanced Accuracy     0.97274  0.89846  0.89581  0.88868

Section 4: Support Vector Machine (SVM)

Linear Kernel

Data preprocessing: standardize the values (center and scale). Model tuning: bootstrap resampling (25 repetitions).

# Linear-kernel SVM: predictors are centred and scaled, and the cost parameter
# C is tuned with 25 bootstrap resamples. The fit is timed for the comparison.
set.seed(1818)
Start4 <- Sys.time()
model_svm_linear <- train(label ~ ., data = train_df,
                          method = "svmLinear",
                          preProcess = c("center", "scale"),
                          trControl = trainControl(method = "boot", number = 25),
                          # The grid starts at 0.05, not 0: the cost must be
                          # strictly positive, and C = 0 makes every resample
                          # fit for that candidate fail.
                          tuneGrid = expand.grid(C = seq(0.05, 1, 0.05)))
End4 <- Sys.time() - Start4

# Evaluate the selected model on the hold-out set.
predict_svm_linear <- predict(model_svm_linear, newdata = test_df)
confusionMatrix(predict_svm_linear, test_df$label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3  4  5  6  7  8  9
##          0 41  0  0  0  0  0  1  0  0  0
##          1  0 48  1  1  0  2  0  1  1  0
##          2  1  1 34  0  1  1  1  1  4  1
##          3  0  0  0 35  0  2  0  0  3  0
##          4  0  0  1  1 34  1  1  0  0  1
##          5  0  0  0  2  0 30  0  0  5  0
##          6  0  0  1  0  0  1 39  0  0  0
##          7  0  0  2  0  0  0  0 34  0  6
##          8  0  0  0  2  0  0  1  0 26  0
##          9  0  1  0  1  4  1  0  5  2 35
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8517          
##                  95% CI : (0.8139, 0.8844)
##     No Information Rate : 0.1196          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8351          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.97619   0.9600  0.87179  0.83333  0.87179  0.78947
## Specificity           0.99734   0.9837  0.97098  0.98670  0.98681  0.98158
## Pos Pred Value        0.97619   0.8889  0.75556  0.87500  0.87179  0.81081
## Neg Pred Value        0.99734   0.9945  0.98660  0.98148  0.98681  0.97900
## Prevalence            0.10048   0.1196  0.09330  0.10048  0.09330  0.09091
## Detection Rate        0.09809   0.1148  0.08134  0.08373  0.08134  0.07177
## Detection Prevalence  0.10048   0.1292  0.10766  0.09569  0.09330  0.08852
## Balanced Accuracy     0.98677   0.9718  0.92139  0.91002  0.92930  0.88553
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.90698  0.82927  0.63415  0.81395
## Specificity           0.99467  0.97878  0.99204  0.96267
## Pos Pred Value        0.95122  0.80952  0.89655  0.71429
## Neg Pred Value        0.98939  0.98138  0.96144  0.97832
## Prevalence            0.10287  0.09809  0.09809  0.10287
## Detection Rate        0.09330  0.08134  0.06220  0.08373
## Detection Prevalence  0.09809  0.10048  0.06938  0.11722
## Balanced Accuracy     0.95082  0.90402  0.81309  0.88831

Section 5: Algorithm performance comparison

Random Forest has the highest accuracy.

# Gather the resampling results of the three fitted models under short labels
# and summarise their accuracy and kappa distributions.
# NOTE(review): the models were fitted in separate train() calls; whether they
# share identical resample indices is not established here -- confirm before
# treating the comparison as paired.
models_to_compare <- list(
  NB = model_nb1,
  RF = model_rf,
  SVMLinear = model_svm_linear
)
model_comparison <- resamples(models_to_compare)
summary(model_comparison)
## 
## Call:
## summary.resamples(object = model_comparison)
## 
## Models: NB, RF, SVMLinear 
## Number of resamples: 25 
## 
## Accuracy 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## NB        0.6963788 0.7351351 0.7391304 0.7408546 0.7521127 0.7774481    0
## RF        0.8111111 0.8304598 0.8423913 0.8437271 0.8579235 0.8771429    0
## SVMLinear 0.8133705 0.8333333 0.8392371 0.8430103 0.8571429 0.8694444    0
## 
## Kappa 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## NB        0.6625067 0.7054332 0.7093255 0.7115437 0.7243667 0.7520357    0
## RF        0.7895660 0.8107911 0.8245444 0.8259598 0.8418639 0.8631495    0
## SVMLinear 0.7924300 0.8143252 0.8207711 0.8252274 0.8410204 0.8542673    0

SVM is the fastest algorithm.

# Training time per algorithm, reported in seconds.
# Each End* value comes from `Sys.time() - start`, whose unit is chosen
# automatically (secs, mins, ...). Convert every value to seconds explicitly so
# the combined vector cannot mix units if one fit takes longer than a minute.
# NOTE(review): the NB row uses End2 (the tuned model), while the accuracy
# comparison uses model_nb1, whose timing is End1 -- confirm which is intended.
Duration <- as.difftime(
  vapply(
    list(End2, End3, End4),
    function(d) as.numeric(d, units = "secs"),
    numeric(1)
  ),
  units = "secs"
)
Algorithim <- c('NB', 'RF', 'SVM')
# Algorithm names are the row labels; pass them by name instead of positionally.
data.frame(Duration = Duration, row.names = Algorithim)
##          Duration
## NB  49.69468 secs
## RF  57.37902 secs
## SVM 35.05078 secs

Section 6:Kaggle test result

The next step is to use the Random Forest model to predict the labels of the Kaggle test set. Link to the Kaggle kernel: https://www.kaggle.com/wwwwwwc/kernel2b26715859?scriptVersionId=23243581

# Score the Kaggle test file with the random forest and build the submission
# table (one row per image: ImageId, predicted digit).
test <- read.csv('test.csv', header = TRUE, sep = ',')
set.seed(100)
# NOTE(review): this sample is not used anywhere below; it is kept so the
# variable and the RNG state stay as they were for any later code.
test_sample_index <- sample(seq_len(nrow(test)), size = 1000, replace = TRUE)
# zeronum holds column positions in the training frame, where column 1 is the
# label; subtracting 1 assumes test.csv has the same pixel columns without a
# label column -- TODO confirm.
predDf <- test[, zeronum - 1]
# Project onto the training PCA and keep the same 20 components the model saw.
pred_pcs <- as.data.frame(predict(pca, newdata = predDf))
pred_pcs <- pred_pcs[, 1:20]
predict_rf <- predict(model_rf, newdata = pred_pcs)
# predict_rf is a factor with levels '0'..'9'. Go through character so the
# digit itself is stored; combining the factor directly (as cbind did) yields
# the level codes 1..10 instead of the digits 0..9.
Label <- as.integer(as.character(predict_rf))
# One id per test row, derived from the data rather than a hard-coded 28000.
ImageId <- seq_len(nrow(test))
# Rows are already in ascending ImageId order, so no re-sorting is needed.
predict_result <- data.frame(ImageId = ImageId, Label = Label)
#write.csv(predict_result, 'predict_result.csv', row.names = FALSE)
head(predict_result)
##   ImageId Label
## 1       1     2
## 2       2     0
## 3       3     9
## 4       4     4
## 5       5     3
## 6       6     7