Support Vector Machine in R

Exploratory Data Analysis

# Print the structure of letterdata: each column's name, type and first values.
str(letterdata)

## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...

# Compact column-per-line preview of letterdata (row/column counts plus the
# leading values of every column).
glimpse(letterdata)

## Rows: 20,000
## Columns: 17
## $ letter <fct> T, I, D, N, G, S, B, A, J, M, X, O, G, M, R, F, O, C, T, J, J, …
## $ xbox   <int> 2, 5, 4, 7, 2, 4, 4, 1, 2, 11, 3, 6, 4, 6, 5, 6, 3, 7, 6, 2, 1,…
## $ ybox   <int> 8, 12, 11, 11, 1, 11, 2, 1, 2, 15, 9, 13, 9, 9, 9, 9, 4, 10, 11…
## $ width  <int> 3, 3, 6, 6, 3, 5, 5, 3, 4, 13, 5, 4, 6, 8, 5, 5, 4, 5, 6, 3, 2,…
## $ height <int> 5, 7, 8, 6, 1, 8, 4, 2, 4, 9, 7, 7, 7, 6, 7, 4, 3, 5, 8, 3, 2, …
## $ onpix  <int> 1, 2, 6, 3, 1, 3, 4, 1, 2, 7, 4, 4, 6, 9, 6, 3, 2, 2, 5, 1, 1, …
## $ xbar   <int> 8, 10, 10, 5, 8, 8, 8, 8, 10, 13, 8, 6, 7, 7, 6, 10, 8, 6, 6, 1…
## $ ybar   <int> 13, 5, 6, 9, 6, 8, 7, 2, 6, 2, 7, 7, 8, 8, 11, 6, 7, 8, 11, 6, …
## $ x2bar  <int> 0, 5, 2, 4, 6, 6, 6, 2, 2, 6, 3, 6, 6, 6, 7, 3, 7, 6, 5, 3, 2, …
## $ y2bar  <int> 6, 4, 6, 6, 6, 9, 6, 2, 6, 2, 8, 3, 2, 5, 3, 5, 5, 8, 6, 6, 5, …
## $ xybar  <int> 6, 13, 10, 4, 6, 5, 7, 8, 12, 12, 5, 10, 6, 7, 7, 10, 7, 11, 11…
## $ x2ybar <int> 10, 3, 3, 4, 5, 6, 6, 2, 4, 1, 6, 7, 5, 5, 3, 5, 6, 7, 9, 4, 5,…
## $ xy2bar <int> 8, 9, 7, 10, 9, 6, 6, 8, 8, 9, 8, 9, 11, 8, 9, 7, 8, 11, 4, 9, …
## $ xedge  <int> 0, 2, 3, 6, 1, 0, 2, 1, 1, 8, 2, 5, 4, 8, 2, 3, 2, 2, 3, 0, 0, …
## $ xedgey <int> 8, 8, 7, 10, 7, 8, 8, 6, 6, 1, 8, 9, 8, 9, 7, 9, 8, 8, 12, 7, 7…
## $ yedge  <int> 0, 4, 3, 2, 5, 9, 7, 2, 1, 1, 6, 5, 7, 8, 5, 6, 3, 5, 2, 1, 0, …
## $ yedgex <int> 8, 10, 9, 8, 10, 7, 10, 7, 7, 8, 7, 8, 8, 6, 11, 9, 8, 9, 4, 7,…

# Per-column summary: level counts for the letter factor and
# min/quartiles/mean/max for each integer feature.
summary(letterdata)

##      letter           xbox             ybox            width       
##  U      :  813   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  D      :  805   1st Qu.: 3.000   1st Qu.: 5.000   1st Qu.: 4.000  
##  P      :  803   Median : 4.000   Median : 7.000   Median : 5.000  
##  T      :  796   Mean   : 4.024   Mean   : 7.035   Mean   : 5.122  
##  M      :  792   3rd Qu.: 5.000   3rd Qu.: 9.000   3rd Qu.: 6.000  
##  A      :  789   Max.   :15.000   Max.   :15.000   Max.   :15.000  
##  (Other):15202                                                     
##      height           onpix             xbar             ybar     
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.0  
##  1st Qu.: 4.000   1st Qu.: 2.000   1st Qu.: 6.000   1st Qu.: 6.0  
##  Median : 6.000   Median : 3.000   Median : 7.000   Median : 7.0  
##  Mean   : 5.372   Mean   : 3.506   Mean   : 6.898   Mean   : 7.5  
##  3rd Qu.: 7.000   3rd Qu.: 5.000   3rd Qu.: 8.000   3rd Qu.: 9.0  
##  Max.   :15.000   Max.   :15.000   Max.   :15.000   Max.   :15.0  
##                                                                   
##      x2bar            y2bar            xybar            x2ybar      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.: 4.000   1st Qu.: 7.000   1st Qu.: 5.000  
##  Median : 4.000   Median : 5.000   Median : 8.000   Median : 6.000  
##  Mean   : 4.629   Mean   : 5.179   Mean   : 8.282   Mean   : 6.454  
##  3rd Qu.: 6.000   3rd Qu.: 7.000   3rd Qu.:10.000   3rd Qu.: 8.000  
##  Max.   :15.000   Max.   :15.000   Max.   :15.000   Max.   :15.000  
##                                                                     
##      xy2bar           xedge            xedgey           yedge       
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 7.000   1st Qu.: 1.000   1st Qu.: 8.000   1st Qu.: 2.000  
##  Median : 8.000   Median : 3.000   Median : 8.000   Median : 3.000  
##  Mean   : 7.929   Mean   : 3.046   Mean   : 8.339   Mean   : 3.692  
##  3rd Qu.: 9.000   3rd Qu.: 4.000   3rd Qu.: 9.000   3rd Qu.: 5.000  
##  Max.   :15.000   Max.   :15.000   Max.   :15.000   Max.   :15.000  
##                                                                     
##      yedgex      
##  Min.   : 0.000  
##  1st Qu.: 7.000  
##  Median : 8.000  
##  Mean   : 7.801  
##  3rd Qu.: 9.000  
##  Max.   :15.000  
##

# Total number of missing values across all columns of letterdata.
sum(is.na(letterdata))

## [1] 0

# Number of rows and columns in letterdata.
dim(letterdata)

## [1] 20000    17

Removing the dependent feature

# Drop the class label column so that only the predictor columns remain.
letter1 <- select(letterdata, -letter)

# Confirm the dimensions after removing the label.
dim(letter1)

## [1] 20000    16

### Exploratory data analysis: reshape the predictors with melt() so they can
### be plotted together (the boxplot call uses the `variable` and `value`
### columns of the result).
### NOTE(review): melt() is presumably reshape2::melt -- confirm which package
### is attached.
newdata <- melt(letter1)

## Using  as id variables

Plots for the variables

# One box per predictor: distribution of `value` grouped by `variable`.
boxplot(value ~ variable, data = newdata)

# Pairwise panel plot of the 16 predictor columns.
# NOTE(review): pch = "," draws every point as a comma character; "." may have
# been intended -- confirm before changing.
pairs.panels(
  letter1[c(
    "xbox", "ybox", "width", "height", "onpix", "xbar", "ybar", "x2bar",
    "y2bar", "xybar", "x2ybar", "xy2bar", "xedge", "xedgey", "yedge", "yedgex"
  )],
  pch = ","
)

Splitting the data

# Sequential hold-out split: the first 16000 rows train the model and rows
# 16001 to 20000 are kept for testing.
# NOTE(review): this assumes the rows of letterdata are already in random
# order -- confirm, otherwise sample the row indices instead.
letter_train <- letterdata[seq_len(16000), ]
letter_test <- letterdata[seq(from = 16001, to = 20000), ]

Training the data

# Fit the baseline SVM: `letter` is modelled from all other columns of the
# training set using the "vanilladot" kernel.
# NOTE(review): ksvm is presumably kernlab::ksvm, where "vanilladot" is the
# linear kernel -- confirm against the attached packages.
letter_class <- ksvm(letter ~ ., data = letter_train, kernel ="vanilladot")

##  Setting default kernel parameters

Evaluating the model performance

# Predict a letter for every row of the held-out test set.
letter_pred <- predict(letter_class, letter_test)

# Peek at the first few predicted letters.
head(letter_pred)

## [1] U N V X N H
## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

The diagonal values indicate the number of records where the predicted letter matches the true value.

# Cross-tabulate the predicted letters (rows) against the actual test letters
# (columns).
table(letter_pred, letter_test$letter)

##            
## letter_pred   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q
##           A 144   0   0   0   0   0   0   0   0   1   0   0   1   2   2   0   5
##           B   0 121   0   5   2   0   1   2   0   0   1   0   1   0   0   2   2
##           C   0   0 120   0   4   0  10   2   2   0   1   3   0   0   2   0   0
##           D   2   2   0 156   0   1   3  10   4   3   4   3   0   5   5   3   1
##           E   0   0   5   0 127   3   1   1   0   0   3   4   0   0   0   0   2
##           F   0   0   0   0   0 138   2   2   6   0   0   0   0   0   0  16   0
##           G   1   1   2   1   9   2 123   2   0   0   1   2   1   0   1   2   8
##           H   0   0   0   1   0   1   0 102   0   2   3   2   3   4  20   0   2
##           I   0   1   0   0   0   1   0   0 141   8   0   0   0   0   0   1   0
##           J   0   1   0   0   0   1   0   2   5 128   0   0   0   0   1   1   3
##           K   1   1   9   0   0   0   2   5   0   0 118   0   0   2   0   1   0
##           L   0   0   0   0   2   0   1   1   0   0   0 133   0   0   0   0   1
##           M   0   0   1   1   0   0   1   1   0   0   0   0 135   4   0   0   0
##           N   0   0   0   0   0   1   0   1   0   0   0   0   0 145   0   0   0
##           O   1   0   2   1   0   0   1   2   0   1   0   0   0   1  99   3   3
##           P   0   0   0   1   0   2   1   0   0   0   0   0   0   0   2 130   0
##           Q   0   0   0   0   0   0   8   2   0   0   0   3   0   0   3   1 124
##           R   0   7   0   0   1   0   3   8   0   0  13   0   0   1   1   1   0
##           S   1   1   0   0   1   0   3   0   1   1   0   1   0   0   0   0  14
##           T   0   0   0   0   3   2   0   0   0   0   1   0   0   0   0   0   0
##           U   1   0   3   1   0   0   0   2   0   0   0   0   0   0   1   0   0
##           V   0   0   0   0   0   1   3   4   0   0   0   0   1   2   1   0   3
##           W   0   0   0   0   0   0   1   0   0   0   0   0   2   0   0   0   0
##           X   0   1   0   0   2   0   0   1   3   0   1   6   0   0   1   0   0
##           Y   3   0   0   0   0   0   0   1   0   0   0   0   0   0   0   7   0
##           Z   2   0   0   0   1   0   0   0   3   4   0   0   0   0   0   0   0
##            
## letter_pred   R   S   T   U   V   W   X   Y   Z
##           A   0   1   1   1   0   1   0   0   1
##           B   3   5   0   0   2   0   1   0   0
##           C   0   0   0   0   0   0   0   0   0
##           D   4   0   0   0   0   0   3   3   1
##           E   0  10   0   0   0   0   2   0   3
##           F   0   3   0   0   1   0   1   2   0
##           G   2   4   3   0   0   0   1   0   0
##           H   3   0   3   0   2   0   0   1   0
##           I   0   3   0   0   0   0   5   1   1
##           J   0   2   0   0   0   0   1   0   6
##           K   7   0   1   3   0   0   5   0   0
##           L   0   5   0   0   0   0   0   0   1
##           M   0   0   0   3   0   8   0   0   0
##           N   3   0   0   1   0   2   0   0   0
##           O   0   0   0   3   0   0   0   0   0
##           P   0   0   0   0   0   0   0   1   0
##           Q   0   5   0   0   0   0   0   2   0
##           R 138   0   1   0   1   0   0   0   0
##           S   0 101   3   0   0   0   2   0  10
##           T   0   3 133   1   0   0   0   2   2
##           U   0   0   0 152   0   0   1   1   0
##           V   1   0   0   0 126   1   0   4   0
##           W   0   0   0   4   4 127   0   0   0
##           X   0   1   0   0   0   0 137   1   1
##           Y   0   0   3   0   0   0   0 127   0
##           Z   0  18   3   0   0   0   0   0 132

# Logical vector: TRUE where the predicted letter equals the actual letter of
# the corresponding test row.
agreement <- letter_pred == letter_test$letter

# Counts of incorrect (FALSE) and correct (TRUE) predictions.
table(agreement)

## agreement
## FALSE  TRUE 
##   643  3357

Percentage of accuracy

# The same counts expressed as proportions; the TRUE share is the accuracy.
prop.table(table(agreement))

## agreement
##   FALSE    TRUE 
## 0.16075 0.83925

Improving the model performance by changing the SVM kernel function to Gaussian RBF kernel

# Fix the RNG seed before fitting the second model.
set.seed(12345)

# Same formula and training data as letter_class, but with the "rbfdot" kernel.
letter_class2 <- ksvm(letter ~ ., data = letter_train, kernel = "rbfdot")

### Predict the letters of the held-out test rows with the RBF-kernel model.

letter_pred2 <- predict(letter_class2, letter_test)

Comparing the accuracy

# TRUE where the RBF-kernel prediction equals the actual test letter.
agreement2 <- letter_pred2 == letter_test$letter

# Counts of incorrect (FALSE) and correct (TRUE) predictions.
table(agreement2)

## agreement2
## FALSE  TRUE 
##   278  3722

Percentage of accuracy

# Proportions of incorrect and correct predictions for the RBF-kernel model;
# the TRUE share is the accuracy.
prop.table(table(agreement2))

## agreement2
##  FALSE   TRUE 
## 0.0695 0.9305

Mapping accuracy against the SVM cost for the RBF kernel

# Candidate cost values: 1, then 5 to 50 in steps of 5.
cost_values <- c(1, seq(from = 5, to = 50, by = 5))

# For each cost value, refit the "rbfdot" SVM on the training set and record
# the fraction of test rows whose predicted letter matches the actual letter.
# vapply() (rather than sapply()) guarantees a plain numeric vector with one
# accuracy per cost value.
accuracy_values <- vapply(cost_values, function(cost) {
  # Reseed before every fit so each cost value starts from the same RNG state.
  set.seed(12345)
  m <- ksvm(letter ~ ., data = letter_train, kernel = "rbfdot", C = cost)
  pred <- predict(m, letter_test)
  # Summing the logical comparison counts the correct predictions directly.
  sum(pred == letter_test$letter) / nrow(letter_test)
}, numeric(1))

Visualisation shows that the model accuracy can be improved with the cost value between 40 and 50.

# Plot test-set accuracy against the SVM cost value.
# The argument is `type`, not `types`: type = "b" draws both the points and the
# lines joining them. The misspelt `types` was not matched to `type`, so the
# plot fell back to points only and emitted graphical-parameter warnings.
plot(cost_values, accuracy_values, type = "b", col = cost_values, pch = 19)

Support Vector Machine in R

Spencer Madamedon

2025-01-28

Exploratory Data Analysis

Removing the dependent feature

Plots for the variables

Splitting the data

Training the data

Evaluating the model performance

The diagonal values indicate the number of records where the predicted letter matches the true value.

Percentage of accuracy

Improving the model performance by changing the SVM kernel function to Gaussian RBF kernel

Comparing the accuracy

Percentage of accuracy

Mapping accuracy against the SVM cost for the RBF kernel

Visualisation shows that the model accuracy can be improved with the cost value between 40 and 50.