Exploratory Data Analysis

# Show the type and the first few values of every column in letterdata.
str(letterdata)
## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...
# Compact column-wise preview of the same data frame (row/column counts first).
glimpse(letterdata)
## Rows: 20,000
## Columns: 17
## $ letter <fct> T, I, D, N, G, S, B, A, J, M, X, O, G, M, R, F, O, C, T, J, J, …
## $ xbox   <int> 2, 5, 4, 7, 2, 4, 4, 1, 2, 11, 3, 6, 4, 6, 5, 6, 3, 7, 6, 2, 1,…
## $ ybox   <int> 8, 12, 11, 11, 1, 11, 2, 1, 2, 15, 9, 13, 9, 9, 9, 9, 4, 10, 11…
## $ width  <int> 3, 3, 6, 6, 3, 5, 5, 3, 4, 13, 5, 4, 6, 8, 5, 5, 4, 5, 6, 3, 2,…
## $ height <int> 5, 7, 8, 6, 1, 8, 4, 2, 4, 9, 7, 7, 7, 6, 7, 4, 3, 5, 8, 3, 2, …
## $ onpix  <int> 1, 2, 6, 3, 1, 3, 4, 1, 2, 7, 4, 4, 6, 9, 6, 3, 2, 2, 5, 1, 1, …
## $ xbar   <int> 8, 10, 10, 5, 8, 8, 8, 8, 10, 13, 8, 6, 7, 7, 6, 10, 8, 6, 6, 1…
## $ ybar   <int> 13, 5, 6, 9, 6, 8, 7, 2, 6, 2, 7, 7, 8, 8, 11, 6, 7, 8, 11, 6, …
## $ x2bar  <int> 0, 5, 2, 4, 6, 6, 6, 2, 2, 6, 3, 6, 6, 6, 7, 3, 7, 6, 5, 3, 2, …
## $ y2bar  <int> 6, 4, 6, 6, 6, 9, 6, 2, 6, 2, 8, 3, 2, 5, 3, 5, 5, 8, 6, 6, 5, …
## $ xybar  <int> 6, 13, 10, 4, 6, 5, 7, 8, 12, 12, 5, 10, 6, 7, 7, 10, 7, 11, 11…
## $ x2ybar <int> 10, 3, 3, 4, 5, 6, 6, 2, 4, 1, 6, 7, 5, 5, 3, 5, 6, 7, 9, 4, 5,…
## $ xy2bar <int> 8, 9, 7, 10, 9, 6, 6, 8, 8, 9, 8, 9, 11, 8, 9, 7, 8, 11, 4, 9, …
## $ xedge  <int> 0, 2, 3, 6, 1, 0, 2, 1, 1, 8, 2, 5, 4, 8, 2, 3, 2, 2, 3, 0, 0, …
## $ xedgey <int> 8, 8, 7, 10, 7, 8, 8, 6, 6, 1, 8, 9, 8, 9, 7, 9, 8, 8, 12, 7, 7…
## $ yedge  <int> 0, 4, 3, 2, 5, 9, 7, 2, 1, 1, 6, 5, 7, 8, 5, 6, 3, 5, 2, 1, 0, …
## $ yedgex <int> 8, 10, 9, 8, 10, 7, 10, 7, 7, 8, 7, 8, 8, 6, 11, 9, 8, 9, 4, 7,…
# Per-column summaries: level counts for the letter factor, and
# min/quartiles/mean/max for each integer column.
summary(letterdata)
##      letter           xbox             ybox            width       
##  U      :  813   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  D      :  805   1st Qu.: 3.000   1st Qu.: 5.000   1st Qu.: 4.000  
##  P      :  803   Median : 4.000   Median : 7.000   Median : 5.000  
##  T      :  796   Mean   : 4.024   Mean   : 7.035   Mean   : 5.122  
##  M      :  792   3rd Qu.: 5.000   3rd Qu.: 9.000   3rd Qu.: 6.000  
##  A      :  789   Max.   :15.000   Max.   :15.000   Max.   :15.000  
##  (Other):15202                                                     
##      height           onpix             xbar             ybar     
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.0  
##  1st Qu.: 4.000   1st Qu.: 2.000   1st Qu.: 6.000   1st Qu.: 6.0  
##  Median : 6.000   Median : 3.000   Median : 7.000   Median : 7.0  
##  Mean   : 5.372   Mean   : 3.506   Mean   : 6.898   Mean   : 7.5  
##  3rd Qu.: 7.000   3rd Qu.: 5.000   3rd Qu.: 8.000   3rd Qu.: 9.0  
##  Max.   :15.000   Max.   :15.000   Max.   :15.000   Max.   :15.0  
##                                                                   
##      x2bar            y2bar            xybar            x2ybar      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.: 4.000   1st Qu.: 7.000   1st Qu.: 5.000  
##  Median : 4.000   Median : 5.000   Median : 8.000   Median : 6.000  
##  Mean   : 4.629   Mean   : 5.179   Mean   : 8.282   Mean   : 6.454  
##  3rd Qu.: 6.000   3rd Qu.: 7.000   3rd Qu.:10.000   3rd Qu.: 8.000  
##  Max.   :15.000   Max.   :15.000   Max.   :15.000   Max.   :15.000  
##                                                                     
##      xy2bar           xedge            xedgey           yedge       
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 7.000   1st Qu.: 1.000   1st Qu.: 8.000   1st Qu.: 2.000  
##  Median : 8.000   Median : 3.000   Median : 8.000   Median : 3.000  
##  Mean   : 7.929   Mean   : 3.046   Mean   : 8.339   Mean   : 3.692  
##  3rd Qu.: 9.000   3rd Qu.: 4.000   3rd Qu.: 9.000   3rd Qu.: 5.000  
##  Max.   :15.000   Max.   :15.000   Max.   :15.000   Max.   :15.000  
##                                                                     
##      yedgex      
##  Min.   : 0.000  
##  1st Qu.: 7.000  
##  Median : 8.000  
##  Mean   : 7.801  
##  3rd Qu.: 9.000  
##  Max.   :15.000  
## 
# Count missing values across the whole data frame.
sum(is.na(letterdata))
## [1] 0
# Number of rows and columns in the full data set.
dim(letterdata)
## [1] 20000    17

Removing the dependent variable (the letter label)

# Drop the class label so that only the predictor columns remain.
letter1 <- select(letterdata, -letter)

# Confirm the column count after dropping the label.
dim(letter1)
## [1] 20000    16
# Reshape the predictors to long format (one row per value, labelled by its
# source column in `variable`) so all of them can be drawn in a single plot.
newdata <- melt(letter1)
## Using  as id variables

Plots for the variables

# One box per predictor, drawn from the long-format data.
boxplot(value ~ variable, data = newdata)

# Pairwise panel plot of all 16 predictors.
# NOTE(review): pch = "," draws each point as a literal comma; "." may have
# been intended -- confirm.
pairs.panels(
  letter1[c(
    "xbox", "ybox", "width", "height", "onpix", "xbar", "ybar", "x2bar",
    "y2bar", "xybar", "x2ybar", "xy2bar", "xedge", "xedgey", "yedge", "yedgex"
  )],
  pch = ","
)

Splitting the data

# Positional split: the first 16,000 rows train the model, the last 4,000 rows
# are held out for testing.
# NOTE(review): this assumes the rows are already in random order -- confirm.
letter_train <- letterdata[seq_len(16000), ]
letter_test <- letterdata[seq.int(16001, 20000), ]

Training the model

# Fit an SVM classifier that predicts `letter` from all other columns, using
# the "vanilladot" (linear) kernel with default kernel parameters.
letter_class <- ksvm(letter ~ ., data = letter_train, kernel ="vanilladot")
##  Setting default kernel parameters

Evaluating the model performance

# Predict a letter for every row of the held-out test set.
letter_pred <- predict(letter_class, letter_test)

# Peek at the first few predicted labels.
head(letter_pred)
## [1] U N V X N H
## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

The diagonal values indicate the number of records where the predicted letter matches the true value.

# Confusion matrix: rows are the predicted letters, columns are the actual
# letters in the test set.
table(letter_pred, letter_test$letter)
##            
## letter_pred   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q
##           A 144   0   0   0   0   0   0   0   0   1   0   0   1   2   2   0   5
##           B   0 121   0   5   2   0   1   2   0   0   1   0   1   0   0   2   2
##           C   0   0 120   0   4   0  10   2   2   0   1   3   0   0   2   0   0
##           D   2   2   0 156   0   1   3  10   4   3   4   3   0   5   5   3   1
##           E   0   0   5   0 127   3   1   1   0   0   3   4   0   0   0   0   2
##           F   0   0   0   0   0 138   2   2   6   0   0   0   0   0   0  16   0
##           G   1   1   2   1   9   2 123   2   0   0   1   2   1   0   1   2   8
##           H   0   0   0   1   0   1   0 102   0   2   3   2   3   4  20   0   2
##           I   0   1   0   0   0   1   0   0 141   8   0   0   0   0   0   1   0
##           J   0   1   0   0   0   1   0   2   5 128   0   0   0   0   1   1   3
##           K   1   1   9   0   0   0   2   5   0   0 118   0   0   2   0   1   0
##           L   0   0   0   0   2   0   1   1   0   0   0 133   0   0   0   0   1
##           M   0   0   1   1   0   0   1   1   0   0   0   0 135   4   0   0   0
##           N   0   0   0   0   0   1   0   1   0   0   0   0   0 145   0   0   0
##           O   1   0   2   1   0   0   1   2   0   1   0   0   0   1  99   3   3
##           P   0   0   0   1   0   2   1   0   0   0   0   0   0   0   2 130   0
##           Q   0   0   0   0   0   0   8   2   0   0   0   3   0   0   3   1 124
##           R   0   7   0   0   1   0   3   8   0   0  13   0   0   1   1   1   0
##           S   1   1   0   0   1   0   3   0   1   1   0   1   0   0   0   0  14
##           T   0   0   0   0   3   2   0   0   0   0   1   0   0   0   0   0   0
##           U   1   0   3   1   0   0   0   2   0   0   0   0   0   0   1   0   0
##           V   0   0   0   0   0   1   3   4   0   0   0   0   1   2   1   0   3
##           W   0   0   0   0   0   0   1   0   0   0   0   0   2   0   0   0   0
##           X   0   1   0   0   2   0   0   1   3   0   1   6   0   0   1   0   0
##           Y   3   0   0   0   0   0   0   1   0   0   0   0   0   0   0   7   0
##           Z   2   0   0   0   1   0   0   0   3   4   0   0   0   0   0   0   0
##            
## letter_pred   R   S   T   U   V   W   X   Y   Z
##           A   0   1   1   1   0   1   0   0   1
##           B   3   5   0   0   2   0   1   0   0
##           C   0   0   0   0   0   0   0   0   0
##           D   4   0   0   0   0   0   3   3   1
##           E   0  10   0   0   0   0   2   0   3
##           F   0   3   0   0   1   0   1   2   0
##           G   2   4   3   0   0   0   1   0   0
##           H   3   0   3   0   2   0   0   1   0
##           I   0   3   0   0   0   0   5   1   1
##           J   0   2   0   0   0   0   1   0   6
##           K   7   0   1   3   0   0   5   0   0
##           L   0   5   0   0   0   0   0   0   1
##           M   0   0   0   3   0   8   0   0   0
##           N   3   0   0   1   0   2   0   0   0
##           O   0   0   0   3   0   0   0   0   0
##           P   0   0   0   0   0   0   0   1   0
##           Q   0   5   0   0   0   0   0   2   0
##           R 138   0   1   0   1   0   0   0   0
##           S   0 101   3   0   0   0   2   0  10
##           T   0   3 133   1   0   0   0   2   2
##           U   0   0   0 152   0   0   1   1   0
##           V   1   0   0   0 126   1   0   4   0
##           W   0   0   0   4   4 127   0   0   0
##           X   0   1   0   0   0   0 137   1   1
##           Y   0   0   3   0   0   0   0 127   0
##           Z   0  18   3   0   0   0   0   0 132
# TRUE where the predicted letter equals the actual test label.
agreement <- letter_pred == letter_test$letter

# Counts of incorrect (FALSE) and correct (TRUE) predictions.
table(agreement)
## agreement
## FALSE  TRUE 
##   643  3357

Proportion of correct predictions (accuracy)

# Convert the counts to proportions; the TRUE share is the test-set accuracy.
prop.table(table(agreement))
## agreement
##   FALSE    TRUE 
## 0.16075 0.83925

Improving the model performance by changing the SVM kernel function to Gaussian RBF kernel

# Fix the RNG seed so the fit below is reproducible.
# NOTE(review): presumably needed because ksvm estimates the RBF kernel width
# from a random sample when no kpar is supplied -- confirm in the kernlab docs.
set.seed(12345)

# Refit the classifier with the Gaussian RBF kernel ("rbfdot"); every other
# setting is left at its default.
letter_class2 <- ksvm(letter ~ ., data = letter_train, kernel = "rbfdot")

# Predict letters for the test rows with the RBF model.

letter_pred2 <- predict(letter_class2, letter_test)

Comparing the accuracy

# TRUE where the RBF model's prediction equals the actual test label.
agreement2 <- letter_pred2 == letter_test$letter

# Counts of incorrect (FALSE) and correct (TRUE) predictions for the RBF model.
table(agreement2)
## agreement2
## FALSE  TRUE 
##   278  3722

Proportion of correct predictions (accuracy)

# Convert the counts to proportions; the TRUE share is the RBF model's
# test-set accuracy.
prop.table(table(agreement2))
## agreement2
##  FALSE   TRUE 
## 0.0695 0.9305

Mapping accuracy against the SVM cost for the RBF kernel

# Candidate values for the SVM cost parameter C: 1, then 5 to 50 in steps of 5.
cost_values <- c(1, seq(from = 5, to = 50, by = 5))

# Fit one RBF-kernel SVM per cost value and record its accuracy on letter_test.
# vapply() guarantees a numeric vector with exactly one accuracy per cost value
# (sapply() would silently change its return type on unexpected input).
# NOTE(review): accuracy is measured on the test set, so choosing C from these
# numbers gives an optimistic estimate -- consider a separate validation split.
accuracy_values <- vapply(cost_values, function(cost) {
  # Same seed for every fit so the runs differ only in C.
  set.seed(12345)
  model <- ksvm(letter ~ ., data = letter_train, kernel = "rbfdot", C = cost)
  predicted <- predict(model, letter_test)
  # The mean of a logical vector is the proportion of TRUE values, i.e. the
  # share of test rows classified correctly.
  mean(predicted == letter_test$letter)
}, numeric(1))

The visualisation suggests that model accuracy can be improved by using a cost value between 40 and 50.

# Plot test-set accuracy against the cost value as points joined by lines.
# The argument is `type`, not `types`: the misspelt name is not recognised by
# plot(), so it only produced "not a graphical parameter" warnings and the
# connecting lines were never drawn.
plot(cost_values, accuracy_values, type = "b", col = cost_values, pch = 19)