Data on handwritten zip-code digits from the United States Postal Service is divided into the data sets train (7,291 observations) and test (2,007 observations). Of the 257 columns in both data sets, the first contains the known label, which is simply the identity of each handwritten image, from 0 to 9. The remaining 256 columns represent, for each digit, a 16x16 greyscale map flattened into a vector of length 256, on the scale [-1, 1] corresponding to [black, white]. The map is obtained by subdividing the digit entry box into a 16x16 grid and then scanning.

I aim to practice parameter tuning for SVMs with different kernels using the caret package. I also fit an MLP neural network (and, later, a CNN) with Keras to compare accuracy against the tuned SVMs.

library(keras)
library(caret)
library(MLmetrics)
library(kernlab)

1. Import Data

train <- read.table("train.txt", header = F)
test <- read.table("test.txt", header = F)
dim(train)
## [1] 7291  257
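
As a quick sanity check, one row can be reshaped back into its 16x16 map and plotted. This is a minimal sketch; byrow = TRUE assumes row-major flattening, so if the digit appears transposed, flip that flag.

digit <- matrix(unlist(train[1, -1]), nrow = 16, ncol = 16, byrow = TRUE)
image(t(apply(digit, 2, rev)), col = grey(seq(0, 1, length.out = 256)),
      axes = FALSE, main = paste("Label:", train[1, 1]))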
library(doParallel)
cl <- makeCluster(2)        # parallel backend with 2 workers
registerDoParallel(cl)      # caret will use it for the cross-validation folds
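
Once the heavy model fits below are done, the workers can be released and caret returned to sequential execution:

stopCluster(cl)
registerDoSEQ()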

2. Model Building

# PCA on the 256 pixel columns; by default caret keeps enough components
# to explain 95% of the variance (thresh = 0.95)
preProcValues <- preProcess(train[,-1], method = "pca")
# apply the rotation to the training pixels
transformed_train <- predict(preProcValues, train[,-1])
transformed_train$response <- train[,1]
# convert the response to a factor with levels 0-9 for classification
transformed_train$response <- as.factor(transformed_train$response)
# apply the same training-fitted PCA rotation to the test set (no re-fitting)
transformed_test <- predict(preProcValues, test[,-1])
transformed_test$response <- test[,1]
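
PCA reduces the 256 pixel columns to however many components the 95% cutoff requires (the model summaries below report 107 predictors); a quick dimension check:

dim(transformed_train)   # rows, retained components + response column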

2.1 “Polynomial” Kernel with Tuning Parameters

  • Tune over C and degree, with scale held at 1.
tune.poly <- expand.grid(C = c(0.1, 1, 10, 100), degree = c(1, 2, 3), scale = 1)
my_Control <- trainControl(method = "cv", number = 5)
set.seed(123)
sel.poly <- train(response ~ ., data = transformed_train, method = "svmPoly",
                  trControl = my_Control, tuneGrid = tune.poly)
# the response is the last column of the transformed test frame
caret_poly_predict <- predict(sel.poly, transformed_test[, -ncol(transformed_test)])

Accuracy with Test Set.

mean(caret_poly_predict == transformed_test$response)
## [1] 0.9402093
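
Beyond overall accuracy, caret's confusionMatrix() breaks the errors down by digit; the response in the test frame is still numeric, so coerce it to a factor first:

confusionMatrix(caret_poly_predict, as.factor(transformed_test$response))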

Model Summary

sel.poly
## Support Vector Machines with Polynomial Kernel 
## 
## 7291 samples
##  107 predictor
##   10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 5833, 5832, 5832, 5834, 5833 
## Resampling results across tuning parameters:
## 
##   C      degree  Accuracy   Kappa    
##     0.1  1       0.9502098  0.9441941
##     0.1  2       0.9691406  0.9654179
##     0.1  3       0.9255237  0.9162976
##     1.0  1       0.9443133  0.9375773
##     1.0  2       0.9688664  0.9651101
##     1.0  3       0.9255237  0.9162976
##    10.0  1       0.9433535  0.9365008
##    10.0  2       0.9688664  0.9651101
##    10.0  3       0.9255237  0.9162976
##   100.0  1       0.9433535  0.9365023
##   100.0  2       0.9688664  0.9651101
##   100.0  3       0.9255237  0.9162976
## 
## Tuning parameter 'scale' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 2, scale = 1 and C = 0.1.

2.2 “Radial” Kernel with Tuning Parameters

  • Tune over C and sigma.
tune.radial <- expand.grid(C = c(0.01, 0.1, 1, 5, 10), sigma = c(0.001, 0.01, 0.1, 1, 5))
my_Control <- trainControl(method = "cv", number = 5)
set.seed(123)
sel.radial <- train(response ~ ., data = transformed_train, method = "svmRadial",
                    trControl = my_Control, tuneGrid = tune.radial)
caret_radial_predict <- predict(sel.radial, transformed_test[, -ncol(transformed_test)])

Accuracy with Test Set.

mean(caret_radial_predict == transformed_test$response)
## [1] 0.9297459

Model Summary after Tuning Parameters

sel.radial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 7291 samples
##  107 predictor
##   10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 5833, 5832, 5832, 5834, 5833 
## Resampling results across tuning parameters:
## 
##   C      sigma  Accuracy   Kappa       
##    0.01  0.001  0.1637635  0.0000000000
##    0.01  0.010  0.2988611  0.1650721756
##    0.01  0.100  0.2298704  0.0809388289
##    0.01  1.000  0.1637635  0.0000000000
##    0.01  5.000  0.1637635  0.0000000000
##    0.10  0.001  0.5654917  0.4957597232
##    0.10  0.010  0.8237535  0.8021227766
##    0.10  0.100  0.2833618  0.1461871873
##    0.10  1.000  0.1637635  0.0000000000
##    0.10  5.000  0.1637635  0.0000000000
##    1.00  0.001  0.9470577  0.9406707583
##    1.00  0.010  0.9429413  0.9360605311
##    1.00  0.100  0.3587982  0.2406672474
##    1.00  1.000  0.2116296  0.0586358464
##    1.00  5.000  0.1637635  0.0000000000
##    5.00  0.001  0.9602243  0.9554333240
##    5.00  0.010  0.9488399  0.9426749094
##    5.00  0.100  0.3779995  0.2646527724
##    5.00  1.000  0.2178004  0.0662018849
##    5.00  5.000  0.1640379  0.0003365733
##   10.00  0.001  0.9615955  0.9569673134
##   10.00  0.010  0.9489771  0.9428286455
##   10.00  0.100  0.3779995  0.2646527724
##   10.00  1.000  0.2178004  0.0662018849
##   10.00  5.000  0.1640379  0.0003365733
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.001 and C = 10.
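
Note that the selected sigma = 0.001 sits at the lower edge of the grid and C = 10 at the upper edge, which suggests the optimum may lie outside the grid. A follow-up grid is a cheap sanity check; the values below are illustrative, not from the original analysis:

tune.radial2 <- expand.grid(C = c(10, 50, 100), sigma = c(0.0005, 0.001, 0.002))
set.seed(123)
sel.radial2 <- train(response ~ ., data = transformed_train, method = "svmRadial",
                     trControl = my_Control, tuneGrid = tune.radial2)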

2.3 Deep Learning MLP

train <- read.table("train.txt", header = F)
test <- read.table("test.txt", header = F)
x_train <- train[,2:257]
y_train <- train[,1]
x_test <- test[,2:257]
y_test <- test[,1]
x_train <- as.matrix(x_train)
x_test <- as.matrix(x_test)
# one-hot encode the 10 digit classes; to_categorical already returns a matrix
y_train <- to_categorical(y_train, 10)
y_test <- to_categorical(y_test, 10)

Step 1: Define the Model

model <- keras_model_sequential() 
model %>% 
  layer_dense(units = 256, activation = 'relu', input_shape = c(256)) %>% 
  layer_dropout(rate = 0.4) %>% 
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 64, activation = 'relu') %>%
  layer_dropout(rate = 0.25) %>%
  layer_dense(units = 10, activation = 'softmax')
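
Before compiling, summary(model) prints each layer's output shape and parameter count, a quick check that the architecture matches the intent:

summary(model)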

Step 2: Compile the Model

model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = 'adam',
  metrics = c('accuracy')
)

Step 3: Fit to Training Dataset

history <- fit(
  object           = model, 
  x                = as.matrix(x_train), 
  y                = y_train,
  batch_size       = 50,
  epochs           = 35,
  validation_split = 0.3
)
print(history)
## Trained on 5,103 samples (batch_size=50, epochs=35)
## Final epoch (plot to see history):
##     loss: 0.02772
##      acc: 0.992
## val_loss: 0.1451
##  val_acc: 0.9698
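
As the printout suggests, plot(history) draws the loss and accuracy curves for the training and validation splits:

plot(history)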
yhat_keras_class_vec <- predict_classes(object = model, x = as.matrix(x_test)) %>%
    as.vector()
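
Note: predict_classes() has been removed in recent releases of Keras/TensorFlow (2.6 onward). With a current installation, the equivalent is to take the arg-max of the predicted class probabilities; the same substitution applies to the CNN prediction in section 2.4:

probs <- predict(model, as.matrix(x_test))
yhat_keras_class_vec <- max.col(probs) - 1   # max.col is 1-based; classes are 0..9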

Accuracy of MLP

mean(yhat_keras_class_vec == test[,1])
## [1] 0.9392128

2.4 CNN (Convolutional Neural Network)

train <- read.table("train.txt", header = F)
test <- read.table("test.txt", header = F)
train <- data.matrix(train)
test <- data.matrix(test)
# reshape each 256-vector into a 16x16x1 array: the conv layers expect
# input of shape (samples, height, width, channels)
x_train_cnn <- array_reshape(train[, 2:257], c(nrow(train), 16, 16, 1))
# one-hot encode the 10 digit classes
y_train_cnn <- to_categorical(train[, 1], 10)
model <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu",
                input_shape = c(16, 16, 1)) %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_flatten() %>%
  layer_dropout(rate=0.5) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")
model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = 'adam',
  metrics = c('accuracy')
)
history <- model %>% fit(
  x_train_cnn, y_train_cnn,
  epochs = 10,
  batch_size = 32
)
history
## Trained on 7,291 samples (batch_size=32, epochs=10)
## Final epoch (plot to see history):
## loss: 0.07905
##  acc: 0.9764
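
One difference from the MLP run: no validation data is held out here, so only training metrics are reported. A sketch of the same fit with a held-out 30%, mirroring Step 3 above:

history <- model %>% fit(
  x_train_cnn, y_train_cnn,
  epochs = 10,
  batch_size = 32,
  validation_split = 0.3
)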
x_test_cnn <- array_reshape(data.matrix(test[, 2:257]), c(nrow(test), 16, 16, 1))
yhat_keras_class_vec <- predict_classes(object = model, x = x_test_cnn) %>%
    as.vector()

Accuracy of CNN

mean(yhat_keras_class_vec == test[,1])
## [1] 0.9586447
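
For reference, the test-set accuracies of the four models side by side (all from the runs above):

Model                    Test accuracy
Polynomial-kernel SVM    0.9402
Radial-kernel SVM        0.9297
MLP (Keras)              0.9392
CNN (Keras)              0.9586

The CNN, which exploits the 2-D structure of the pixel grid rather than treating the 256 pixels as independent features, gives the best test accuracy here.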