# Packages used throughout: readr for I/O, ggplot2 for plots,
# caret for folds and metrics, class for k-NN, e1071 for SVMs
library(readr)
library(ggplot2)
library(caret)
library(class)
library(e1071)
train <- read_csv("~/Desktop/Digit Recognizer/train (1).csv")
test <- read_csv("~/Desktop/Digit Recognizer/test (1).csv")
# Quick look at one training row: each image is 28x28 pixels stored as 784 columns
m <- matrix(unlist(train[11, -1]), nrow = 28, byrow = TRUE)
image(m, col = grey.colors(255))

Visualization of the digits
digit <- function(x){
  m <- matrix(unlist(x), nrow = 28, byrow = TRUE)
  # Rotate the matrix so image() draws the digit upright
  m <- t(apply(m, 2, rev))
  image(m, col = grey.colors(255))
}
# Plot the first 60 training digits, 12 per page
par(mfrow = c(3, 4))
for(i in 1:60){
  digit(train[i, -1])
}

Use principal components to speed up the process
PCTrain <- prcomp(train[, -1], center = TRUE, scale. = FALSE)
# Cumulative proportion of variance explained by the first k principal components
VarianceExplained <- data.frame(Number = 1:784,
                                Variance = cumsum(PCTrain$sdev^2 / sum(PCTrain$sdev^2)))
ggplot(VarianceExplained, aes(x = Number, y = Variance)) + geom_point()
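To read the cutoff from the table rather than eyeballing the plot, a one-line check using the VarianceExplained frame built above:
# Smallest number of components whose cumulative proportion reaches 99%
which(VarianceExplained$Variance >= 0.99)[1]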

# The first 331 PCs explain 99% of the variance
#VarianceExplained[331,]
#PCs<-as.data.frame(cbind(label=as.factor(train$label),PCTrain$x[,1:331]))
#PCs$label<-as.factor(PCs$label)
PCs <- as.data.frame(cbind(label = as.factor(train$label), PCTrain$x))
# cbind() coerces the factor to its integer codes, so digits 0-9 become classes 1-10;
# this is why the confusion matrices below report classes 1-10
PCs$label <- as.factor(PCs$label)
5-fold cross-validation
set.seed(1)
# Stratified assignment: each row gets a fold id from 1 to 5
folds <- createFolds(y = factor(PCs$label), k = 5, list = FALSE)
PCs$fold <- folds
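A quick sanity check, not part of the original run, that the stratified folds are balanced across digit classes:
# Rows per fold and per class; counts should be nearly equal down each column
table(PCs$fold, PCs$label)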
K-NN
# Hold out fold 1 for validation, train on the remaining four folds
valid.data <- subset(PCs, fold == 1)
train.data <- subset(PCs, fold != 1)
# 10-nearest-neighbours on the first 331 PCs (columns 2:332); using all validation
# rows rather than a hard-coded 1:8400, since stratified fold sizes can vary slightly
knn.predy <- knn(train = train.data[, 2:332], test = valid.data[, 2:332], cl = train.data$label, k = 10)
knn.y <- valid.data$label
TenNN <- confusionMatrix(knn.predy, knn.y)
TenNN
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 824 0 3 1 0 0 2 1 4 4
## 2 0 933 13 4 10 4 1 19 5 2
## 3 0 0 793 3 0 0 0 2 3 3
## 4 0 1 5 836 0 12 0 0 15 12
## 5 0 1 1 0 778 0 0 0 5 7
## 6 1 0 0 11 0 724 4 0 13 2
## 7 1 0 1 0 3 10 821 0 4 0
## 8 0 1 19 9 3 0 0 848 1 13
## 9 0 1 1 3 0 2 0 0 745 2
## 10 0 0 0 4 20 7 0 10 17 792
##
## Overall Statistics
##
## Accuracy : 0.9636
## 95% CI : (0.9593, 0.9675)
## No Information Rate : 0.1115
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9595
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.99758 0.9957 0.94856 0.95982 0.95577 0.95389
## Specificity 0.99802 0.9922 0.99855 0.99402 0.99815 0.99594
## Pos Pred Value 0.98212 0.9415 0.98632 0.94892 0.98232 0.95894
## Neg Pred Value 0.99974 0.9995 0.99434 0.99535 0.99527 0.99542
## Prevalence 0.09833 0.1115 0.09952 0.10369 0.09690 0.09036
## Detection Rate 0.09810 0.1111 0.09440 0.09952 0.09262 0.08619
## Detection Prevalence 0.09988 0.1180 0.09571 0.10488 0.09429 0.08988
## Balanced Accuracy 0.99780 0.9940 0.97356 0.97692 0.97696 0.97491
## Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 0.99155 0.9636 0.91749 0.94624
## Specificity 0.99749 0.9939 0.99881 0.99233
## Pos Pred Value 0.97738 0.9485 0.98806 0.93176
## Neg Pred Value 0.99907 0.9957 0.99124 0.99404
## Prevalence 0.09857 0.1048 0.09667 0.09964
## Detection Rate 0.09774 0.1010 0.08869 0.09429
## Detection Prevalence 0.10000 0.1064 0.08976 0.10119
## Balanced Accuracy 0.99452 0.9788 0.95815 0.96928
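The confusion matrix above scores fold 1 only. A minimal sketch of the full 5-fold loop, assuming the same PCs frame and k = 10, averages the per-fold accuracies:
# Accuracy of 10-NN on each held-out fold, then the cross-validated mean
cv.acc <- sapply(1:5, function(f){
  tr <- subset(PCs, fold != f)
  va <- subset(PCs, fold == f)
  pred <- knn(train = tr[, 2:332], test = va[, 2:332], cl = tr$label, k = 10)
  mean(pred == va$label)
})
mean(cv.acc)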
SVMs
# Same split as above: fold 1 held out for validation
valid.data <- subset(PCs, fold == 1)
train.data <- subset(PCs, fold != 1)
# Radial-kernel SVM on the 331 PCs; the cost value is kept as-is from the original run
svmfit <- svm(x = train.data[, 2:332], y = train.data$label, kernel = "radial", cost = 1538065)
svm.y <- valid.data$label
svm.predy <- predict(svmfit, valid.data[, 2:332])
rSVM <- confusionMatrix(svm.predy, svm.y)
rSVM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 811 0 2 2 1 0 2 1 3 4
## 2 0 918 1 1 4 3 0 5 3 3
## 3 3 3 803 13 9 4 3 15 5 4
## 4 0 5 5 824 1 16 1 3 5 15
## 5 3 1 5 0 780 1 2 5 3 15
## 6 1 4 0 9 0 720 4 1 5 5
## 7 4 1 5 1 4 6 815 0 4 0
## 8 0 0 7 6 1 0 0 841 0 16
## 9 4 5 8 8 3 5 1 1 780 5
## 10 0 0 0 7 11 4 0 8 4 770
##
## Overall Statistics
##
## Accuracy : 0.9598
## 95% CI : (0.9553, 0.9639)
## No Information Rate : 0.1115
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9553
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.98184 0.9797 0.96053 0.9460 0.95823 0.94862
## Specificity 0.99802 0.9973 0.99220 0.9932 0.99539 0.99620
## Pos Pred Value 0.98184 0.9787 0.93155 0.9417 0.95706 0.96128
## Neg Pred Value 0.99802 0.9975 0.99562 0.9938 0.99552 0.99490
## Prevalence 0.09833 0.1115 0.09952 0.1037 0.09690 0.09036
## Detection Rate 0.09655 0.1093 0.09560 0.0981 0.09286 0.08571
## Detection Prevalence 0.09833 0.1117 0.10262 0.1042 0.09702 0.08917
## Balanced Accuracy 0.98993 0.9885 0.97636 0.9696 0.97681 0.97241
## Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 0.98430 0.9557 0.96059 0.91995
## Specificity 0.99670 0.9960 0.99473 0.99550
## Pos Pred Value 0.97024 0.9656 0.95122 0.95771
## Neg Pred Value 0.99828 0.9948 0.99578 0.99118
## Prevalence 0.09857 0.1048 0.09667 0.09964
## Detection Rate 0.09702 0.1001 0.09286 0.09167
## Detection Prevalence 0.10000 0.1037 0.09762 0.09571
## Balanced Accuracy 0.99050 0.9758 0.97766 0.95773
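The cost of 1538065 is taken as given here; a sketch of how such a value could be searched for with e1071's tune() on a subsample (the grid below is illustrative, not the original search):
# Coarse grid search over cost and gamma on the first 2000 training rows (illustrative)
tuned <- tune(svm,
              train.x = train.data[1:2000, 2:332],
              train.y = train.data$label[1:2000],
              kernel = "radial",
              ranges = list(cost = 10^(0:6), gamma = c(0.005, 0.02, 0.05)))
tuned$best.parameters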
Quick Radial SVM Classification
#train <- read_csv("~/Desktop/Digit Recognizer/train (1).csv")
#test <- read_csv("~/Desktop/Digit Recognizer/test (1).csv")
Converting the test set into 331 PCs (note the caveat after this block).
#PCTest<-prcomp(test, center = TRUE, scale=FALSE)
# Calculate the variance explained by each principal component
#VarianceExplained<-as.data.frame(PCTest$sdev^2/sum(PCTest$sdev^2))
#VarianceExplained<-cbind(1:784, cumsum(VarianceExplained))
#colnames(VarianceExplained)<-c("Number", "Variance")
#VarianceExplained<-as.data.frame(VarianceExplained)
#ggplot(VarianceExplained, aes(x=Number, y=Variance))+geom_point()
#VarianceExplained[331,]
#PCsTest<-as.data.frame(PCTest$x[,1:331])
#knn.predy<-knn(train = train[, -1], test = test, cl = train$label, k=10)
#svmfit<-svm(label ~ . ,data = train, kernel="radial", cost=1538065 , gamma = 0.02)
#svm.predy<-predict(svmfit, test)
#svmfit
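One caveat on the commented block above: prcomp() run on the test set produces its own rotation, so those components would not line up with the features the model was trained on. A safer sketch, assuming the PCTrain object and the PC-space svmfit fitted earlier, projects the test images through the training rotation before predicting, then writes Kaggle's ImageId/Label submission format:
# Project test images onto the *training* principal components
PCsTest <- as.data.frame(predict(PCTrain, newdata = test)[, 1:331])
# Predict with the fitted radial SVM and map classes 1-10 back to digits 0-9
svm.predy <- predict(svmfit, PCsTest)
submission <- data.frame(ImageId = 1:nrow(test),
                         Label = as.integer(as.character(svm.predy)) - 1)
write.csv(submission, "submission.csv", row.names = FALSE)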