#Library

library(readr)
library(caret)
library(RColorBrewer)
library(doMC)
library(neuralnet)
library(FNN)
library(EBImage)
# Read the training and test sets from CSV files in the working directory.
train <- read_csv(file = "train.csv")
test <- read_csv(file = "test.csv")

# Report rows x columns of each set, then preview the first training rows.
dim(train)
## [1] 42000   785
dim(test)
## [1] 28000   784
head(train, n = 6L)
## # A tibble: 6 x 785
##   label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9
##   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     1      0      0      0      0      0      0      0      0      0      0
## 2     0      0      0      0      0      0      0      0      0      0      0
## 3     1      0      0      0      0      0      0      0      0      0      0
## 4     4      0      0      0      0      0      0      0      0      0      0
## 5     0      0      0      0      0      0      0      0      0      0      0
## 6     0      0      0      0      0      0      0      0      0      0      0
## # ... with 774 more variables: pixel10 <dbl>, pixel11 <dbl>, pixel12 <dbl>,
## #   pixel13 <dbl>, pixel14 <dbl>, pixel15 <dbl>, pixel16 <dbl>, pixel17 <dbl>,
## #   pixel18 <dbl>, pixel19 <dbl>, pixel20 <dbl>, pixel21 <dbl>, pixel22 <dbl>,
## #   pixel23 <dbl>, pixel24 <dbl>, pixel25 <dbl>, pixel26 <dbl>, pixel27 <dbl>,
## #   pixel28 <dbl>, pixel29 <dbl>, pixel30 <dbl>, pixel31 <dbl>, pixel32 <dbl>,
## #   pixel33 <dbl>, pixel34 <dbl>, pixel35 <dbl>, pixel36 <dbl>, pixel37 <dbl>,
## #   pixel38 <dbl>, pixel39 <dbl>, pixel40 <dbl>, pixel41 <dbl>, pixel42 <dbl>,
## #   pixel43 <dbl>, pixel44 <dbl>, pixel45 <dbl>, pixel46 <dbl>, pixel47 <dbl>,
## #   pixel48 <dbl>, pixel49 <dbl>, pixel50 <dbl>, pixel51 <dbl>, pixel52 <dbl>,
## #   pixel53 <dbl>, pixel54 <dbl>, pixel55 <dbl>, pixel56 <dbl>, pixel57 <dbl>,
## #   pixel58 <dbl>, pixel59 <dbl>, pixel60 <dbl>, pixel61 <dbl>, pixel62 <dbl>,
## #   pixel63 <dbl>, pixel64 <dbl>, pixel65 <dbl>, pixel66 <dbl>, pixel67 <dbl>,
## #   pixel68 <dbl>, pixel69 <dbl>, pixel70 <dbl>, pixel71 <dbl>, pixel72 <dbl>,
## #   pixel73 <dbl>, pixel74 <dbl>, pixel75 <dbl>, pixel76 <dbl>, pixel77 <dbl>,
## #   pixel78 <dbl>, pixel79 <dbl>, pixel80 <dbl>, pixel81 <dbl>, pixel82 <dbl>,
## #   pixel83 <dbl>, pixel84 <dbl>, pixel85 <dbl>, pixel86 <dbl>, pixel87 <dbl>,
## #   pixel88 <dbl>, pixel89 <dbl>, pixel90 <dbl>, pixel91 <dbl>, pixel92 <dbl>,
## #   pixel93 <dbl>, pixel94 <dbl>, pixel95 <dbl>, pixel96 <dbl>, pixel97 <dbl>,
## #   pixel98 <dbl>, pixel99 <dbl>, pixel100 <dbl>, pixel101 <dbl>,
## #   pixel102 <dbl>, pixel103 <dbl>, pixel104 <dbl>, pixel105 <dbl>,
## #   pixel106 <dbl>, pixel107 <dbl>, pixel108 <dbl>, pixel109 <dbl>, ...
# Preview the first rows of the test set (same pixel columns, no label).
head(test, n = 6L)
## # A tibble: 6 x 784
##   pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 pixel10
##    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
## 1      0      0      0      0      0      0      0      0      0      0       0
## 2      0      0      0      0      0      0      0      0      0      0       0
## 3      0      0      0      0      0      0      0      0      0      0       0
## 4      0      0      0      0      0      0      0      0      0      0       0
## 5      0      0      0      0      0      0      0      0      0      0       0
## 6      0      0      0      0      0      0      0      0      0      0       0
## # ... with 773 more variables: pixel11 <dbl>, pixel12 <dbl>, pixel13 <dbl>,
## #   pixel14 <dbl>, pixel15 <dbl>, pixel16 <dbl>, pixel17 <dbl>, pixel18 <dbl>,
## #   pixel19 <dbl>, pixel20 <dbl>, pixel21 <dbl>, pixel22 <dbl>, pixel23 <dbl>,
## #   pixel24 <dbl>, pixel25 <dbl>, pixel26 <dbl>, pixel27 <dbl>, pixel28 <dbl>,
## #   pixel29 <dbl>, pixel30 <dbl>, pixel31 <dbl>, pixel32 <dbl>, pixel33 <dbl>,
## #   pixel34 <dbl>, pixel35 <dbl>, pixel36 <dbl>, pixel37 <dbl>, pixel38 <dbl>,
## #   pixel39 <dbl>, pixel40 <dbl>, pixel41 <dbl>, pixel42 <dbl>, pixel43 <dbl>,
## #   pixel44 <dbl>, pixel45 <dbl>, pixel46 <dbl>, pixel47 <dbl>, pixel48 <dbl>,
## #   pixel49 <dbl>, pixel50 <dbl>, pixel51 <dbl>, pixel52 <dbl>, pixel53 <dbl>,
## #   pixel54 <dbl>, pixel55 <dbl>, pixel56 <dbl>, pixel57 <dbl>, pixel58 <dbl>,
## #   pixel59 <dbl>, pixel60 <dbl>, pixel61 <dbl>, pixel62 <dbl>, pixel63 <dbl>,
## #   pixel64 <dbl>, pixel65 <dbl>, pixel66 <dbl>, pixel67 <dbl>, pixel68 <dbl>,
## #   pixel69 <dbl>, pixel70 <dbl>, pixel71 <dbl>, pixel72 <dbl>, pixel73 <dbl>,
## #   pixel74 <dbl>, pixel75 <dbl>, pixel76 <dbl>, pixel77 <dbl>, pixel78 <dbl>,
## #   pixel79 <dbl>, pixel80 <dbl>, pixel81 <dbl>, pixel82 <dbl>, pixel83 <dbl>,
## #   pixel84 <dbl>, pixel85 <dbl>, pixel86 <dbl>, pixel87 <dbl>, pixel88 <dbl>,
## #   pixel89 <dbl>, pixel90 <dbl>, pixel91 <dbl>, pixel92 <dbl>, pixel93 <dbl>,
## #   pixel94 <dbl>, pixel95 <dbl>, pixel96 <dbl>, pixel97 <dbl>, pixel98 <dbl>,
## #   pixel99 <dbl>, pixel100 <dbl>, pixel101 <dbl>, pixel102 <dbl>,
## #   pixel103 <dbl>, pixel104 <dbl>, pixel105 <dbl>, pixel106 <dbl>,
## #   pixel107 <dbl>, pixel108 <dbl>, pixel109 <dbl>, pixel110 <dbl>, ...

# Convert the label column to a factor so it is treated as a class, not a
# number. The column is addressed by name rather than by position.

train$label <- as.factor(train$label)

head(train["label"])
## # A tibble: 6 x 1
##   label
##   <fct>
## 1 1    
## 2 0    
## 3 1    
## 4 4    
## 5 0    
## 6 0

# The remaining (pixel) columns need to stay numeric. A column's class does
# not depend on which rows are selected, so inspect the columns directly.

head(sapply(train, class))
##     label    pixel0    pixel1    pixel2    pixel3    pixel4 
##  "factor" "numeric" "numeric" "numeric" "numeric" "numeric"

# Keep untouched copies of both sets; the plots further down use the full
# pixel grid from these copies.

train_orig <- train
test_orig <- test

# Prepare for training and testing: find near-zero-variance columns in the
# training set and remove the same columns from both sets.

nzv.data <- nearZeroVar(train, saveMetrics = TRUE)
drop.cols <- rownames(nzv.data)[nzv.data$nzv]
train <- train[, setdiff(names(train), drop.cols)]
test <- test[, setdiff(names(test), drop.cols)]

# See the data: for each digit 0-9, sum every pixel over all training rows
# carrying that label, rescale the sums to 0-255, and draw the result as a
# 28 x 28 image on a 4 x 3 grid.

BNW <- c("white", "black")
CUSTOM_BNW <- colorRampPalette(colors = BNW)
# par() returns the previous values of the settings it changes; keep them so
# the layout, margins and axis suppression do not leak into later plots.
old_par <- par(mfrow = c(4, 3), pty = "s", mar = c(1, 1, 1, 1),
               xaxt = "n", yaxt = "n")
images_digits_0_9 <- array(dim = c(10, 28 * 28))
for (digit in 0:9) {
  # Select rows with a plain logical vector; comparing the one-column tibble
  # train_orig[, 1] would produce a logical matrix instead.
  is_digit <- train_orig$label == digit
  # colSums() replaces apply(..., 2, sum) on the data frame.
  pixel_sums <- colSums(train_orig[is_digit, -1])
  images_digits_0_9[digit + 1, ] <- pixel_sums / max(pixel_sums) * 255
  z <- array(images_digits_0_9[digit + 1, ], dim = c(28, 28))
  # Reverse the column order so image() draws the digit upright.
  z <- z[, 28:1]
  image(1:28, 1:28, z, main = digit, col = CUSTOM_BNW(256))
}
par(old_par)

# Show what share of the training set each digit makes up, as a pie chart
# whose slices are labelled "<digit> (<rounded percent>%)".

CUSTOM_BNW_PLOT <- colorRampPalette(brewer.pal(10, "Set3"))
LabTable <- table(train_orig$label)
par(mfrow = c(1, 1))
percentage <- round(prop.table(LabTable) * 100)
labels <- paste0(names(LabTable), " (", percentage, "%)")
pie(
  LabTable,
  labels = labels,
  col = CUSTOM_BNW_PLOT(10),
  main = "Percentage of Digits (Training Set)"
)

# To keep model fitting fast, train on a stratified 10% sample of the rows
# and validate on a stratified 10% sample of the remaining rows (about 9% of
# the full set).

set.seed(43210)

trainIndex <- createDataPartition(train$label, p = 0.1, list = FALSE, times = 1)
# Derive the row numbers from the data instead of hard-coding 42000, so the
# index bookkeeping stays correct if the input file has a different size.
allindices <- seq_len(nrow(train))
training <- train[trainIndex, ]
validating <- train[-trainIndex, ]
# Row numbers (in the full training table) of the rows left out of training.
valid0_index <- allindices[!allindices %in% trainIndex]

validIndex <- createDataPartition(validating$label, p = 0.1, list = FALSE, times = 1)
validating <- validating[validIndex, ]
# Map the validation rows back to their row numbers in the full training table.
original_validIndex <- valid0_index[validIndex]

# Use SVM: fit a radial-kernel SVM on the training subset with 4-fold
# cross-validation, then evaluate it on the validation subset.

# Register 3 cores for the parallel backend used by train().
registerDoMC(cores = 3)
tc <- trainControl(method = "cv", number = 4, verboseIter = FALSE,
                   allowParallel = TRUE)
modSVMR1 <- train(label ~ ., data = training, method = "svmRadial",
                  trControl = tc)
# Decode the predicted factor through its level labels ("0".."9") rather than
# through level positions minus one, which only works when the levels happen
# to be exactly 0..9 in order.
SVMRadial_predict1 <- as.numeric(as.character(predict(modSVMR1, newdata = validating)))
# Reuse the reference levels: a bare factor() would drop any digit that was
# never predicted, giving confusionMatrix() mismatched factor levels.
confusionMatrix(
  factor(SVMRadial_predict1, levels = levels(validating$label)),
  validating$label
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 361   0   3   1   1   1   1   2   0   2
##          1   0 419   3   0   1   1   0   3   7   1
##          2   2   0 345   8   0   1   2   5   6   1
##          3   1   2   2 351   0   8   1   1   7   6
##          4   2   1   4   0 341   3   3   7   1   9
##          5   0   0   1  19   1 314   3   2   5   4
##          6   4   0   2   5   6   7 362   0   4   0
##          7   0   0   8   3   0   1   0 364   0  11
##          8   2   0   8   4   1   3   1   2 330   1
##          9   0   0   0   1  16   3   0  10   6 342
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9329          
##                  95% CI : (0.9244, 0.9406)
##     No Information Rate : 0.1116          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9254          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.97043   0.9929  0.91755  0.89541  0.92916  0.91813
## Specificity           0.99678   0.9952  0.99266  0.99174  0.99122  0.98983
## Pos Pred Value        0.97043   0.9632  0.93243  0.92612  0.91914  0.89971
## Neg Pred Value        0.99678   0.9991  0.99092  0.98796  0.99238  0.99185
## Prevalence            0.09833   0.1116  0.09939  0.10362  0.09701  0.09040
## Detection Rate        0.09543   0.1108  0.09120  0.09278  0.09014  0.08300
## Detection Prevalence  0.09833   0.1150  0.09781  0.10019  0.09807  0.09225
## Balanced Accuracy     0.98360   0.9941  0.95511  0.94358  0.96019  0.95398
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.97051  0.91919  0.90164  0.90716
## Specificity           0.99179  0.99321  0.99356  0.98943
## Pos Pred Value        0.92821  0.94057  0.93750  0.90476
## Neg Pred Value        0.99676  0.99058  0.98951  0.98972
## Prevalence            0.09860  0.10468  0.09675  0.09966
## Detection Rate        0.09569  0.09622  0.08723  0.09040
## Detection Prevalence  0.10309  0.10230  0.09305  0.09992
## Balanced Accuracy     0.98115  0.95620  0.94760  0.94830