##### Part 2: Support Vector Machines -------------------
## Example: Optical Character Recognition ----

## Step 2: Exploring and preparing the data ----
# read in data and examine structure
letters <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml11/letterdata.csv")
str(letters)
## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...
# divide into training and test data
letters_train <- letters[1:16000, ]
letters_test  <- letters[16001:20000, ]

## Step 3: Training a model on the data ----
# begin by training a simple linear SVM
library(kernlab)
letter_classifier <- ksvm(letter ~ ., data = letters_train,
                          kernel = "vanilladot")
##  Setting default kernel parameters
## Step 4: Evaluating model performance ----
# predictions on testing dataset

letter_predictions <- predict(letter_classifier, letters_test)

table<-table(letters_test$letter, letter_predictions); table
##    letter_predictions
##       A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q
##   A 144   0   0   2   0   0   1   0   0   0   1   0   0   0   1   0   0
##   B   0 121   0   2   0   0   1   0   1   1   1   0   0   0   0   0   0
##   C   0   0 120   0   5   0   2   0   0   0   9   0   1   0   2   0   0
##   D   0   5   0 156   0   0   1   1   0   0   0   0   1   0   1   1   0
##   E   0   2   4   0 127   0   9   0   0   0   0   2   0   0   0   0   0
##   F   0   0   0   1   3 138   2   1   1   1   0   0   0   1   0   2   0
##   G   0   1  10   3   1   2 123   0   0   0   2   1   1   0   1   1   8
##   H   0   2   2  10   1   2   2 102   0   2   5   1   1   1   2   0   2
##   I   0   0   2   4   0   6   0   0 141   5   0   0   0   0   0   0   0
##   J   1   0   0   3   0   0   0   2   8 128   0   0   0   0   1   0   0
##   K   0   1   1   4   3   0   1   3   0   0 118   0   0   0   0   0   0
##   L   0   0   3   3   4   0   2   2   0   0   0 133   0   0   0   0   3
##   M   1   1   0   0   0   0   1   3   0   0   0   0 135   0   0   0   0
##   N   2   0   0   5   0   0   0   4   0   0   2   0   4 145   1   0   0
##   O   2   0   2   5   0   0   1  20   0   1   0   0   0   0  99   2   3
##   P   0   2   0   3   0  16   2   0   1   1   1   0   0   0   3 130   1
##   Q   5   2   0   1   2   0   8   2   0   3   0   1   0   0   3   0 124
##   R   0   3   0   4   0   0   2   3   0   0   7   0   0   3   0   0   0
##   S   1   5   0   0  10   3   4   0   3   2   0   5   0   0   0   0   5
##   T   1   0   0   0   0   0   3   3   0   0   1   0   0   0   0   0   0
##   U   1   0   0   0   0   0   0   0   0   0   3   0   3   1   3   0   0
##   V   0   2   0   0   0   1   0   2   0   0   0   0   0   0   0   0   0
##   W   1   0   0   0   0   0   0   0   0   0   0   0   8   2   0   0   0
##   X   0   1   0   3   2   1   1   0   5   1   5   0   0   0   0   0   0
##   Y   0   0   0   3   0   2   0   1   1   0   0   0   0   0   0   1   2
##   Z   1   0   0   1   3   0   0   0   1   6   0   1   0   0   0   0   0
##    letter_predictions
##       R   S   T   U   V   W   X   Y   Z
##   A   0   1   0   1   0   0   0   3   2
##   B   7   1   0   0   0   0   1   0   0
##   C   0   0   0   3   0   0   0   0   0
##   D   0   0   0   1   0   0   0   0   0
##   E   1   1   3   0   0   0   2   0   1
##   F   0   0   2   0   1   0   0   0   0
##   G   3   3   0   0   3   1   0   0   0
##   H   8   0   0   2   4   0   1   1   0
##   I   0   1   0   0   0   0   3   0   3
##   J   0   1   0   0   0   0   0   0   4
##   K  13   0   1   0   0   0   1   0   0
##   L   0   1   0   0   0   0   6   0   0
##   M   0   0   0   0   1   2   0   0   0
##   N   1   0   0   0   2   0   0   0   0
##   O   1   0   0   1   1   0   1   0   0
##   P   1   0   0   0   0   0   0   7   0
##   Q   0  14   0   0   3   0   0   0   0
##   R 138   0   0   0   1   0   0   0   0
##   S   0 101   3   0   0   0   1   0  18
##   T   1   3 133   0   0   0   0   3   3
##   U   0   0   1 152   0   4   0   0   0
##   V   1   0   0   0 126   4   0   0   0
##   W   0   0   0   0   1 127   0   0   0
##   X   0   2   0   1   0   0 137   0   0
##   Y   0   0   2   1   4   0   1 127   0
##   Z   0  10   2   0   0   0   1   0 132
# We're interested in the results by Letter:

i = (1:26)
j = cbind(i,i)
table[j]/rowSums(table)
##         A         B         C         D         E         F         G 
## 0.9230769 0.8897059 0.8450704 0.9341317 0.8355263 0.9019608 0.7500000 
##         H         I         J         K         L         M         N 
## 0.6754967 0.8545455 0.8648649 0.8082192 0.8471338 0.9375000 0.8734940 
##         O         P         Q         R         S         T         U 
## 0.7122302 0.7738095 0.7380952 0.8571429 0.6273292 0.8807947 0.9047619 
##         V         W         X         Y         Z 
## 0.9264706 0.9136691 0.8616352 0.8758621 0.8354430
# Notice how tricky the "H" is. The only letter below 70% accuracy

# construct a vector of TRUE/FALSE indicating correct/incorrect predictions
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE  TRUE 
##   643  3357
prop.table(table(agreement))
## agreement
##   FALSE    TRUE 
## 0.16075 0.83925
## Step 5: Improving model performance ----
set.seed(2996)  
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")

letter_predictions_rbf <- predict(letter_classifier_rbf, letters_test)

table2<-table(letters_test$letter, letter_predictions_rbf)

table2[j]/rowSums(table2)
##         A         B         C         D         E         F         G 
## 0.9679487 0.9411765 0.9295775 0.9640719 0.9013158 0.9673203 0.9390244 
##         H         I         J         K         L         M         N 
## 0.8278146 0.9151515 0.9189189 0.9041096 0.9044586 0.9583333 0.9036145 
##         O         P         Q         R         S         T         U 
## 0.9280576 0.8392857 0.9404762 0.9316770 0.9440994 0.9271523 0.9583333 
##         V         W         X         Y         Z 
## 0.9632353 0.9712230 0.9622642 0.9517241 0.9493671
agreement_rbf <- letter_predictions_rbf == letters_test$letter
table(agreement_rbf)
## agreement_rbf
## FALSE  TRUE 
##   277  3723
prop.table(table(agreement_rbf))
## agreement_rbf
##   FALSE    TRUE 
## 0.06925 0.93075
table[j]/rowSums(table(letters_test$letter, letter_predictions))
##         A         B         C         D         E         F         G 
## 0.9230769 0.8897059 0.8450704 0.9341317 0.8355263 0.9019608 0.7500000 
##         H         I         J         K         L         M         N 
## 0.6754967 0.8545455 0.8648649 0.8082192 0.8471338 0.9375000 0.8734940 
##         O         P         Q         R         S         T         U 
## 0.7122302 0.7738095 0.7380952 0.8571429 0.6273292 0.8807947 0.9047619 
##         V         W         X         Y         Z 
## 0.9264706 0.9136691 0.8616352 0.8758621 0.8354430
# Much Better! "H" still the "laggard" but now only one under .90.