library(readr)    # read_csv
library(ggplot2)  # variance plot below
library(caret)    # createFolds, confusionMatrix
library(class)    # knn
library(e1071)    # svm

train <- read_csv("~/Desktop/Digit Recognizer/train (1).csv")
test <- read_csv("~/Desktop/Digit Recognizer/test (1).csv")

# Render one training image: drop the label, reshape the 784 pixels to 28x28
m <- matrix(unlist(train[11, -1]), nrow = 28, byrow = TRUE)
image(m, col = grey.colors(255))

Visualization of the digits

digit <- function(x){
  # reshape one row of pixels to 28x28, then rotate so image() draws the digit upright
  m <- matrix(unlist(x), nrow = 28, byrow = TRUE)
  m <- t(apply(m, 2, rev))
  image(m, col = grey.colors(255))
}

# Show the first 60 training digits, twelve per 3x4 page
for(page in 0:4){
  par(mfrow = c(3, 4))
  for(i in (12 * page + 1):(12 * page + 12)){
    digit(train[i, -1])
  }
}

Use principal components to speed up the process

# PCA on the pixel columns; the pixels share a 0-255 scale, so no rescaling
PCTrain <- prcomp(train[, -1], center = TRUE, scale = FALSE)

# Cumulative proportion of variance explained by the principal components
VarianceExplained <- data.frame(
  Number   = 1:784,
  Variance = cumsum(PCTrain$sdev^2 / sum(PCTrain$sdev^2))
)

ggplot(VarianceExplained, aes(x=Number, y=Variance))+geom_point()
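
The 99% cutoff can be read off the plot, or located directly from the Variance column just computed:

which(VarianceExplained$Variance >= 0.99)[1]  # first PC count reaching 99%; should match the 331 noted below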

# 331 PCs explain 99% of the variance

#VarianceExplained[331,]

#PCs<-as.data.frame(cbind(label=as.factor(train$label),PCTrain$x[,1:331]))

#PCs$label<-as.factor(PCs$label)


# cbind() coerces the factor to its integer codes, so digits 0-9 become 1-10;
# the labels are re-factored below, which is why the confusion matrices report
# classes 1-10 rather than 0-9
PCs <- as.data.frame(cbind(label = as.factor(train$label), PCTrain$x))

PCs$label <- as.factor(PCs$label)
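
As an aside, a construction that skips the coercion round-trip would keep the original 0-9 digit labels; it is left commented out here so the 1-10 levels used in the output below stay intact:

# PCs <- data.frame(label = factor(train$label), PCTrain$x)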

5-fold cross-validation

set.seed(1)
# stratified fold assignment so each fold keeps the digit proportions
folds <- createFolds(y = factor(PCs$label), k = 5, list = FALSE)
PCs$fold <- folds
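
A quick sanity check that the folds are class-balanced (each digit should be split roughly evenly across the five folds):

table(PCs$label, PCs$fold)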

K-NN

    valid.data <- subset(PCs, fold == 1)
    train.data <- subset(PCs, fold != 1)

    # k-NN on the first 331 PCs (columns 2:332); each fold holds exactly
    # 8,400 of the 42,000 rows, so no hard-coded row index is needed
    knn.predy <- knn(train = train.data[, 2:332], test = valid.data[, 2:332],
                     cl = train.data$label, k = 10)
    knn.y <- valid.data$label

    TenNN <- confusionMatrix(knn.predy, knn.y)

    TenNN
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2   3   4   5   6   7   8   9  10
##         1  824   0   3   1   0   0   2   1   4   4
##         2    0 933  13   4  10   4   1  19   5   2
##         3    0   0 793   3   0   0   0   2   3   3
##         4    0   1   5 836   0  12   0   0  15  12
##         5    0   1   1   0 778   0   0   0   5   7
##         6    1   0   0  11   0 724   4   0  13   2
##         7    1   0   1   0   3  10 821   0   4   0
##         8    0   1  19   9   3   0   0 848   1  13
##         9    0   1   1   3   0   2   0   0 745   2
##         10   0   0   0   4  20   7   0  10  17 792
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9636          
##                  95% CI : (0.9593, 0.9675)
##     No Information Rate : 0.1115          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9595          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity           0.99758   0.9957  0.94856  0.95982  0.95577  0.95389
## Specificity           0.99802   0.9922  0.99855  0.99402  0.99815  0.99594
## Pos Pred Value        0.98212   0.9415  0.98632  0.94892  0.98232  0.95894
## Neg Pred Value        0.99974   0.9995  0.99434  0.99535  0.99527  0.99542
## Prevalence            0.09833   0.1115  0.09952  0.10369  0.09690  0.09036
## Detection Rate        0.09810   0.1111  0.09440  0.09952  0.09262  0.08619
## Detection Prevalence  0.09988   0.1180  0.09571  0.10488  0.09429  0.08988
## Balanced Accuracy     0.99780   0.9940  0.97356  0.97692  0.97696  0.97491
##                      Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity           0.99155   0.9636  0.91749   0.94624
## Specificity           0.99749   0.9939  0.99881   0.99233
## Pos Pred Value        0.97738   0.9485  0.98806   0.93176
## Neg Pred Value        0.99907   0.9957  0.99124   0.99404
## Prevalence            0.09857   0.1048  0.09667   0.09964
## Detection Rate        0.09774   0.1010  0.08869   0.09429
## Detection Prevalence  0.10000   0.1064  0.08976   0.10119
## Balanced Accuracy     0.99452   0.9788  0.95815   0.96928
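
Only fold 1 is evaluated above. A sketch of the full 5-fold loop for k-NN, averaging validation accuracy over the folds (same objects as above; expect a long runtime on all 42,000 rows):

    knn.acc <- numeric(5)
    for(f in 1:5){
      valid.f <- subset(PCs, fold == f)
      train.f <- subset(PCs, fold != f)
      pred.f <- knn(train = train.f[, 2:332], test = valid.f[, 2:332],
                    cl = train.f$label, k = 10)
      knn.acc[f] <- mean(pred.f == valid.f$label)
    }
    mean(knn.acc)  # cross-validated accuracy estimate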

SVMs

    valid.data <- subset(PCs, fold == 1)
    train.data <- subset(PCs, fold != 1)

    # radial-kernel SVM on the same 331 PCs; the very large cost means
    # very little regularization
    svmfit <- svm(x = train.data[, 2:332], y = train.data$label,
                  kernel = "radial", cost = 1538065)
    svm.y <- valid.data$label
    svm.predy <- predict(svmfit, valid.data[, 2:332])

    rSVM <- confusionMatrix(svm.predy, svm.y)
    rSVM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2   3   4   5   6   7   8   9  10
##         1  811   0   2   2   1   0   2   1   3   4
##         2    0 918   1   1   4   3   0   5   3   3
##         3    3   3 803  13   9   4   3  15   5   4
##         4    0   5   5 824   1  16   1   3   5  15
##         5    3   1   5   0 780   1   2   5   3  15
##         6    1   4   0   9   0 720   4   1   5   5
##         7    4   1   5   1   4   6 815   0   4   0
##         8    0   0   7   6   1   0   0 841   0  16
##         9    4   5   8   8   3   5   1   1 780   5
##         10   0   0   0   7  11   4   0   8   4 770
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9598          
##                  95% CI : (0.9553, 0.9639)
##     No Information Rate : 0.1115          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9553          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity           0.98184   0.9797  0.96053   0.9460  0.95823  0.94862
## Specificity           0.99802   0.9973  0.99220   0.9932  0.99539  0.99620
## Pos Pred Value        0.98184   0.9787  0.93155   0.9417  0.95706  0.96128
## Neg Pred Value        0.99802   0.9975  0.99562   0.9938  0.99552  0.99490
## Prevalence            0.09833   0.1115  0.09952   0.1037  0.09690  0.09036
## Detection Rate        0.09655   0.1093  0.09560   0.0981  0.09286  0.08571
## Detection Prevalence  0.09833   0.1117  0.10262   0.1042  0.09702  0.08917
## Balanced Accuracy     0.98993   0.9885  0.97636   0.9696  0.97681  0.97241
##                      Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity           0.98430   0.9557  0.96059   0.91995
## Specificity           0.99670   0.9960  0.99473   0.99550
## Pos Pred Value        0.97024   0.9656  0.95122   0.95771
## Neg Pred Value        0.99828   0.9948  0.99578   0.99118
## Prevalence            0.09857   0.1048  0.09667   0.09964
## Detection Rate        0.09702   0.1001  0.09286   0.09167
## Detection Prevalence  0.10000   0.1037  0.09762   0.09571
## Balanced Accuracy     0.99050   0.9758  0.97766   0.95773
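
The cost of 1538065 is unusually large; whether it is near-optimal is easy to probe. A hedged tuning sketch with e1071::tune.svm on a subsample (the grid values are illustrative, not taken from the original analysis):

    set.seed(1)
    sub <- sample(nrow(train.data), 5000)  # subsample to keep tuning tractable
    tuned <- tune.svm(x = train.data[sub, 2:332], y = train.data$label[sub],
                      kernel = "radial",
                      gamma = c(0.005, 0.01, 0.02), cost = 10^(1:4))
    summary(tuned)  # best cost/gamma found on the subsample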

Quick Radial SVM Classification

#train <- read_csv("~/Desktop/Digit Recognizer/train (1).csv")
#test <- read_csv("~/Desktop/Digit Recognizer/test (1).csv")

Projecting the test set onto 331 PCs. A caveat: the commented code below recomputes PCA on the test set itself, which produces a different rotation than the training PCA, so its components are not comparable to the ones the models were trained on; the training rotation should be applied to the test data instead (see the sketch at the end of this section).

#PCTest<-prcomp(test, center = TRUE, scale=FALSE)

# Calculate the variance explained by each principal component
#VarianceExplained<-as.data.frame(PCTest$sdev^2/sum(PCTest$sdev^2))
#VarianceExplained<-cbind(1:784, cumsum(VarianceExplained))
#colnames(VarianceExplained)<-c("Number", "Variance")
#VarianceExplained<-as.data.frame(VarianceExplained)

#ggplot(VarianceExplained, aes(x=Number, y=Variance))+geom_point()

#VarianceExplained[331,]

#PCsTest<-as.data.frame(PCTest$x[,1:331])
#knn.predy<-knn(train = train[, -1], test = test, cl = train$label, k=10)
#svmfit<-svm(label ~ . ,data = train, kernel="radial", cost=1538065 , gamma = 0.02)

#svm.predy<-predict(svmfit, test)

#svmfit
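
For a submission, a sketch of the consistent pipeline: project the test images with the training rotation via predict() on the prcomp object, refit the SVM on the training PCs, and predict (the cost is reused from the fit above, and the submission file name is illustrative):

# project test images onto the training principal components
PCsTest <- as.data.frame(predict(PCTrain, newdata = test)[, 1:331])

# refit the radial SVM on the first 331 training PCs
svmfit <- svm(x = PCs[, 2:332], y = PCs$label, kernel = "radial", cost = 1538065)
svm.predy <- predict(svmfit, PCsTest)

# map factor levels 1-10 back to digits 0-9 and write a Kaggle submission
submission <- data.frame(ImageId = 1:nrow(PCsTest),
                         Label = as.integer(as.character(svm.predy)) - 1)
write.csv(submission, "submission.csv", row.names = FALSE)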