# Load libraries
library(readr)
library(caret)
library(RColorBrewer)
library(doMC)
library(neuralnet)
library(FNN)
library(EBImage)
# Read the training and test CSV files from the working directory and
# report the dimensions (rows, columns) of each.
train <- read_csv("train.csv")
test <- read_csv("test.csv")
dim(train)
dim(test)
## [1] 42000 785
## [1] 28000 784
# Preview the first six rows of the training data.
head(train, n = 6L)
## # A tibble: 6 x 785
## label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 0 0 0 0
## 4 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## # ... with 774 more variables: pixel10 <dbl>, pixel11 <dbl>, pixel12 <dbl>,
## # pixel13 <dbl>, pixel14 <dbl>, pixel15 <dbl>, pixel16 <dbl>, pixel17 <dbl>,
## # pixel18 <dbl>, pixel19 <dbl>, pixel20 <dbl>, pixel21 <dbl>, pixel22 <dbl>,
## # pixel23 <dbl>, pixel24 <dbl>, pixel25 <dbl>, pixel26 <dbl>, pixel27 <dbl>,
## # pixel28 <dbl>, pixel29 <dbl>, pixel30 <dbl>, pixel31 <dbl>, pixel32 <dbl>,
## # pixel33 <dbl>, pixel34 <dbl>, pixel35 <dbl>, pixel36 <dbl>, pixel37 <dbl>,
## # pixel38 <dbl>, pixel39 <dbl>, pixel40 <dbl>, pixel41 <dbl>, pixel42 <dbl>,
## # pixel43 <dbl>, pixel44 <dbl>, pixel45 <dbl>, pixel46 <dbl>, pixel47 <dbl>,
## # pixel48 <dbl>, pixel49 <dbl>, pixel50 <dbl>, pixel51 <dbl>, pixel52 <dbl>,
## # pixel53 <dbl>, pixel54 <dbl>, pixel55 <dbl>, pixel56 <dbl>, pixel57 <dbl>,
## # pixel58 <dbl>, pixel59 <dbl>, pixel60 <dbl>, pixel61 <dbl>, pixel62 <dbl>,
## # pixel63 <dbl>, pixel64 <dbl>, pixel65 <dbl>, pixel66 <dbl>, pixel67 <dbl>,
## # pixel68 <dbl>, pixel69 <dbl>, pixel70 <dbl>, pixel71 <dbl>, pixel72 <dbl>,
## # pixel73 <dbl>, pixel74 <dbl>, pixel75 <dbl>, pixel76 <dbl>, pixel77 <dbl>,
## # pixel78 <dbl>, pixel79 <dbl>, pixel80 <dbl>, pixel81 <dbl>, pixel82 <dbl>,
## # pixel83 <dbl>, pixel84 <dbl>, pixel85 <dbl>, pixel86 <dbl>, pixel87 <dbl>,
## # pixel88 <dbl>, pixel89 <dbl>, pixel90 <dbl>, pixel91 <dbl>, pixel92 <dbl>,
## # pixel93 <dbl>, pixel94 <dbl>, pixel95 <dbl>, pixel96 <dbl>, pixel97 <dbl>,
## # pixel98 <dbl>, pixel99 <dbl>, pixel100 <dbl>, pixel101 <dbl>,
## # pixel102 <dbl>, pixel103 <dbl>, pixel104 <dbl>, pixel105 <dbl>,
## # pixel106 <dbl>, pixel107 <dbl>, pixel108 <dbl>, pixel109 <dbl>, ...
# Preview the first six rows of the test data.
head(test, n = 6L)
## # A tibble: 6 x 784
## pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 pixel10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## # ... with 773 more variables: pixel11 <dbl>, pixel12 <dbl>, pixel13 <dbl>,
## # pixel14 <dbl>, pixel15 <dbl>, pixel16 <dbl>, pixel17 <dbl>, pixel18 <dbl>,
## # pixel19 <dbl>, pixel20 <dbl>, pixel21 <dbl>, pixel22 <dbl>, pixel23 <dbl>,
## # pixel24 <dbl>, pixel25 <dbl>, pixel26 <dbl>, pixel27 <dbl>, pixel28 <dbl>,
## # pixel29 <dbl>, pixel30 <dbl>, pixel31 <dbl>, pixel32 <dbl>, pixel33 <dbl>,
## # pixel34 <dbl>, pixel35 <dbl>, pixel36 <dbl>, pixel37 <dbl>, pixel38 <dbl>,
## # pixel39 <dbl>, pixel40 <dbl>, pixel41 <dbl>, pixel42 <dbl>, pixel43 <dbl>,
## # pixel44 <dbl>, pixel45 <dbl>, pixel46 <dbl>, pixel47 <dbl>, pixel48 <dbl>,
## # pixel49 <dbl>, pixel50 <dbl>, pixel51 <dbl>, pixel52 <dbl>, pixel53 <dbl>,
## # pixel54 <dbl>, pixel55 <dbl>, pixel56 <dbl>, pixel57 <dbl>, pixel58 <dbl>,
## # pixel59 <dbl>, pixel60 <dbl>, pixel61 <dbl>, pixel62 <dbl>, pixel63 <dbl>,
## # pixel64 <dbl>, pixel65 <dbl>, pixel66 <dbl>, pixel67 <dbl>, pixel68 <dbl>,
## # pixel69 <dbl>, pixel70 <dbl>, pixel71 <dbl>, pixel72 <dbl>, pixel73 <dbl>,
## # pixel74 <dbl>, pixel75 <dbl>, pixel76 <dbl>, pixel77 <dbl>, pixel78 <dbl>,
## # pixel79 <dbl>, pixel80 <dbl>, pixel81 <dbl>, pixel82 <dbl>, pixel83 <dbl>,
## # pixel84 <dbl>, pixel85 <dbl>, pixel86 <dbl>, pixel87 <dbl>, pixel88 <dbl>,
## # pixel89 <dbl>, pixel90 <dbl>, pixel91 <dbl>, pixel92 <dbl>, pixel93 <dbl>,
## # pixel94 <dbl>, pixel95 <dbl>, pixel96 <dbl>, pixel97 <dbl>, pixel98 <dbl>,
## # pixel99 <dbl>, pixel100 <dbl>, pixel101 <dbl>, pixel102 <dbl>,
## # pixel103 <dbl>, pixel104 <dbl>, pixel105 <dbl>, pixel106 <dbl>,
## # pixel107 <dbl>, pixel108 <dbl>, pixel109 <dbl>, pixel110 <dbl>, ...
# Convert the label column to a factor so it is treated as a class label
# rather than a number, then preview the converted column.
train$label <- as.factor(train$label)
head(train["label"])
## # A tibble: 6 x 1
## label
## <fct>
## 1 1
## 2 0
## 3 1
## 4 4
## 5 0
## 6 0
# The remaining columns need to be numeric; show the class of the first few
# columns to confirm.
head(vapply(train[1, ], class, FUN.VALUE = character(1)))
## label pixel0 pixel1 pixel2 pixel3 pixel4
## "factor" "numeric" "numeric" "numeric" "numeric" "numeric"
# Keep copies of both data sets as they are before any columns are removed.
train_orig <- train
test_orig <- test

# Prepare for training and testing: compute near-zero-variance metrics on
# the training data, then remove the flagged columns from both data sets so
# they keep the same predictors.
nzv.data <- nearZeroVar(train, saveMetrics = TRUE)
drop.cols <- rownames(nzv.data)[nzv.data$nzv]
train <- train[, setdiff(names(train), drop.cols)]
test <- test[, setdiff(names(test), drop.cols)]
# Visualise the data: for each digit 0-9, sum the pixel values of all
# training images with that label, rescale the sums to 0-255 and draw the
# result as a 28 x 28 image on a white-to-black palette.
BNW <- c("white", "black")
CUSTOM_BNW <- colorRampPalette(colors = BNW)
par(mfrow = c(4, 3), pty = "s", mar = c(1, 1, 1, 1), xaxt = "n", yaxt = "n")
images_digits_0_9 <- array(dim = c(10, 28 * 28))
for (digit in 0:9) {
  # Compare the label vector itself (not a one-column tibble) so the row
  # filter is a plain logical vector rather than a logical matrix.
  is_digit <- train_orig$label == digit
  # colSums() sums each pixel column directly, without apply()'s generic
  # per-column function calls on a coerced data frame.
  pixel_sums <- colSums(train_orig[is_digit, -1])
  images_digits_0_9[digit + 1, ] <- pixel_sums / max(pixel_sums) * 255
  z <- array(images_digits_0_9[digit + 1, ], dim = c(28, 28))
  # image() draws the first column at the bottom, so reverse the column
  # order to flip the picture vertically.
  z <- z[, 28:1]
  image(1:28, 1:28, z, main = digit, col = CUSTOM_BNW(256))
}
# Show what percentage of the training set each digit accounts for, as a
# pie chart with one slice per digit.
CUSTOM_BNW_PLOT <- colorRampPalette(brewer.pal(10, "Set3"))
LabTable <- table(train_orig$label)
par(mfrow = c(1, 1))
percentage <- round(prop.table(LabTable) * 100)
labels <- paste0(names(LabTable), " (", percentage, "%)")
pie(
  LabTable,
  labels = labels,
  col = CUSTOM_BNW_PLOT(10),
  main = "Percentage of Digits (Training Set)"
)
# To keep the run time short, train on a 10% stratified sample of the data
# and validate on a 10% stratified sample of the rows that were NOT used
# for training (i.e. roughly 9% of the full data set).
set.seed(43210)
trainIndex <- createDataPartition(train$label, p = 0.1, list = FALSE, times = 1)
# Derive the row indices from the data instead of hard-coding the row count.
allindices <- seq_len(nrow(train))
training <- train[trainIndex, ]
validating <- train[-trainIndex, ]
# Row numbers (in the full data set) of the rows left after removing the
# training sample; same order as the rows of `validating` at this point.
valid0_index <- setdiff(allindices, trainIndex)
validIndex <- createDataPartition(
  validating$label,
  p = 0.1, list = FALSE, times = 1
)
validating <- validating[validIndex, ]
# Map the validation sample back to row numbers of the full data set.
original_validIndex <- valid0_index[validIndex]
# Fit a radial-kernel SVM with 4-fold cross-validation, running the
# resampling in parallel on 3 cores, then evaluate it on the validation set.
registerDoMC(cores = 3)
tc <- trainControl(
  method = "cv", number = 4,
  verboseIter = FALSE, allowParallel = TRUE
)
modSVMR1 <- train(label ~ ., data = training, method = "svmRadial", trControl = tc)
# Recover the predicted digits from the factor's level labels. Going through
# as.character() does not depend on the integer codes happening to equal
# digit + 1.
SVMRadial_predict1 <- as.numeric(
  as.character(predict(modSVMR1, newdata = validating))
)
# Give the prediction factor the same levels as the reference so the
# confusion matrix stays aligned even if some digit is never predicted.
confusionMatrix(
  factor(SVMRadial_predict1, levels = levels(validating$label)),
  validating$label
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 361 0 3 1 1 1 1 2 0 2
## 1 0 419 3 0 1 1 0 3 7 1
## 2 2 0 345 8 0 1 2 5 6 1
## 3 1 2 2 351 0 8 1 1 7 6
## 4 2 1 4 0 341 3 3 7 1 9
## 5 0 0 1 19 1 314 3 2 5 4
## 6 4 0 2 5 6 7 362 0 4 0
## 7 0 0 8 3 0 1 0 364 0 11
## 8 2 0 8 4 1 3 1 2 330 1
## 9 0 0 0 1 16 3 0 10 6 342
##
## Overall Statistics
##
## Accuracy : 0.9329
## 95% CI : (0.9244, 0.9406)
## No Information Rate : 0.1116
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9254
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.97043 0.9929 0.91755 0.89541 0.92916 0.91813
## Specificity 0.99678 0.9952 0.99266 0.99174 0.99122 0.98983
## Pos Pred Value 0.97043 0.9632 0.93243 0.92612 0.91914 0.89971
## Neg Pred Value 0.99678 0.9991 0.99092 0.98796 0.99238 0.99185
## Prevalence 0.09833 0.1116 0.09939 0.10362 0.09701 0.09040
## Detection Rate 0.09543 0.1108 0.09120 0.09278 0.09014 0.08300
## Detection Prevalence 0.09833 0.1150 0.09781 0.10019 0.09807 0.09225
## Balanced Accuracy 0.98360 0.9941 0.95511 0.94358 0.96019 0.95398
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.97051 0.91919 0.90164 0.90716
## Specificity 0.99179 0.99321 0.99356 0.98943
## Pos Pred Value 0.92821 0.94057 0.93750 0.90476
## Neg Pred Value 0.99676 0.99058 0.98951 0.98972
## Prevalence 0.09860 0.10468 0.09675 0.09966
## Detection Rate 0.09569 0.09622 0.08723 0.09040
## Detection Prevalence 0.10309 0.10230 0.09305 0.09992
## Balanced Accuracy 0.98115 0.95620 0.94760 0.94830