##### Part 2: Support Vector Machines -------------------
## Example: Optical Character Recognition ----
## Step 2: Exploring and preparing the data ----
# read in data and examine structure
letters <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml11/letterdata.csv",
                    stringsAsFactors = TRUE)  # letter must be a factor; R >= 4.0 no longer converts strings by default
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
# divide into training (first 16,000 rows) and test (last 4,000 rows) sets
letters_train <- letters[1:16000, ]
letters_test <- letters[16001:20000, ]
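# The slice above assumes the rows arrive in random order; if that were not
# guaranteed, a shuffled split would be safer. A minimal sketch (the seed and
# the *_alt object names are illustrative and unused below):
set.seed(123)
train_idx <- sample(nrow(letters), 16000)
letters_train_alt <- letters[train_idx, ]
letters_test_alt  <- letters[-train_idx, ]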
## Step 3: Training a model on the data ----
# begin by training a simple linear SVM
library(kernlab)
letter_classifier <- ksvm(letter ~ ., data = letters_train,
                          kernel = "vanilladot")
## Setting default kernel parameters
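# printing the fitted ksvm object reports the kernel, the number of support
# vectors, and the training error (output omitted here):
print(letter_classifier)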
## Step 4: Evaluating model performance ----
# predictions on testing dataset
letter_predictions <- predict(letter_classifier, letters_test)
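# predictions come back as a factor with the same 26 levels as letter;
# a quick peek at the first few (output omitted):
head(letter_predictions)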
# store the confusion matrix under its own name (avoid masking base::table)
confusion <- table(letters_test$letter, letter_predictions)
confusion
## letter_predictions
## A B C D E F G H I J K L M N O P Q
## A 144 0 0 2 0 0 1 0 0 0 1 0 0 0 1 0 0
## B 0 121 0 2 0 0 1 0 1 1 1 0 0 0 0 0 0
## C 0 0 120 0 5 0 2 0 0 0 9 0 1 0 2 0 0
## D 0 5 0 156 0 0 1 1 0 0 0 0 1 0 1 1 0
## E 0 2 4 0 127 0 9 0 0 0 0 2 0 0 0 0 0
## F 0 0 0 1 3 138 2 1 1 1 0 0 0 1 0 2 0
## G 0 1 10 3 1 2 123 0 0 0 2 1 1 0 1 1 8
## H 0 2 2 10 1 2 2 102 0 2 5 1 1 1 2 0 2
## I 0 0 2 4 0 6 0 0 141 5 0 0 0 0 0 0 0
## J 1 0 0 3 0 0 0 2 8 128 0 0 0 0 1 0 0
## K 0 1 1 4 3 0 1 3 0 0 118 0 0 0 0 0 0
## L 0 0 3 3 4 0 2 2 0 0 0 133 0 0 0 0 3
## M 1 1 0 0 0 0 1 3 0 0 0 0 135 0 0 0 0
## N 2 0 0 5 0 0 0 4 0 0 2 0 4 145 1 0 0
## O 2 0 2 5 0 0 1 20 0 1 0 0 0 0 99 2 3
## P 0 2 0 3 0 16 2 0 1 1 1 0 0 0 3 130 1
## Q 5 2 0 1 2 0 8 2 0 3 0 1 0 0 3 0 124
## R 0 3 0 4 0 0 2 3 0 0 7 0 0 3 0 0 0
## S 1 5 0 0 10 3 4 0 3 2 0 5 0 0 0 0 5
## T 1 0 0 0 0 0 3 3 0 0 1 0 0 0 0 0 0
## U 1 0 0 0 0 0 0 0 0 0 3 0 3 1 3 0 0
## V 0 2 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0
## W 1 0 0 0 0 0 0 0 0 0 0 0 8 2 0 0 0
## X 0 1 0 3 2 1 1 0 5 1 5 0 0 0 0 0 0
## Y 0 0 0 3 0 2 0 1 1 0 0 0 0 0 0 1 2
## Z 1 0 0 1 3 0 0 0 1 6 0 1 0 0 0 0 0
## letter_predictions
## R S T U V W X Y Z
## A 0 1 0 1 0 0 0 3 2
## B 7 1 0 0 0 0 1 0 0
## C 0 0 0 3 0 0 0 0 0
## D 0 0 0 1 0 0 0 0 0
## E 1 1 3 0 0 0 2 0 1
## F 0 0 2 0 1 0 0 0 0
## G 3 3 0 0 3 1 0 0 0
## H 8 0 0 2 4 0 1 1 0
## I 0 1 0 0 0 0 3 0 3
## J 0 1 0 0 0 0 0 0 4
## K 13 0 1 0 0 0 1 0 0
## L 0 1 0 0 0 0 6 0 0
## M 0 0 0 0 1 2 0 0 0
## N 1 0 0 0 2 0 0 0 0
## O 1 0 0 1 1 0 1 0 0
## P 1 0 0 0 0 0 0 7 0
## Q 0 14 0 0 3 0 0 0 0
## R 138 0 0 0 1 0 0 0 0
## S 0 101 3 0 0 0 1 0 18
## T 1 3 133 0 0 0 0 3 3
## U 0 0 1 152 0 4 0 0 0
## V 1 0 0 0 126 4 0 0 0
## W 0 0 0 0 1 127 0 0 0
## X 0 2 0 1 0 0 137 0 0
## Y 0 0 2 1 4 0 1 127 0
## Z 0 10 2 0 0 0 1 0 132
# We're interested in the results by letter: per-letter accuracy is the
# diagonal of the confusion matrix (correct predictions) over each row total
diag(confusion) / rowSums(confusion)
## A B C D E F G
## 0.9230769 0.8897059 0.8450704 0.9341317 0.8355263 0.9019608 0.7500000
## H I J K L M N
## 0.6754967 0.8545455 0.8648649 0.8082192 0.8471338 0.9375000 0.8734940
## O P Q R S T U
## 0.7122302 0.7738095 0.7380952 0.8571429 0.6273292 0.8807947 0.9047619
## V W X Y Z
## 0.9264706 0.9136691 0.8616352 0.8758621 0.8354430
# Notice how tricky "H" and "S" are: the only letters below 70% accuracy,
# with "S" the lowest at about 62.7%.
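# To pull the weakest letters out programmatically, sort the accuracies
# ascending and take the head (output omitted; per the table above the
# three lowest are S, H, and O):
head(sort(diag(confusion) / rowSums(confusion)), 3)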
# construct a vector of TRUE/FALSE indicating correct/incorrect predictions
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 643 3357
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.16075 0.83925
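# equivalently, the mean of a logical vector is its proportion of TRUEs,
# so overall accuracy is simply:
mean(agreement)
## [1] 0.83925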
## Step 5: Improving model performance ----
# retrain with the Gaussian RBF kernel; the automatic sigma estimate samples
# the data, so set a seed for reproducibility
set.seed(2996)
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")
letter_predictions_rbf <- predict(letter_classifier_rbf, letters_test)
confusion_rbf <- table(letters_test$letter, letter_predictions_rbf)
diag(confusion_rbf) / rowSums(confusion_rbf)
## A B C D E F G
## 0.9679487 0.9411765 0.9295775 0.9640719 0.9013158 0.9673203 0.9390244
## H I J K L M N
## 0.8278146 0.9151515 0.9189189 0.9041096 0.9044586 0.9583333 0.9036145
## O P Q R S T U
## 0.9280576 0.8392857 0.9404762 0.9316770 0.9440994 0.9271523 0.9583333
## V W X Y Z
## 0.9632353 0.9712230 0.9622642 0.9517241 0.9493671
agreement_rbf <- letter_predictions_rbf == letters_test$letter
table(agreement_rbf)
## agreement_rbf
## FALSE TRUE
## 277 3723
prop.table(table(agreement_rbf))
## agreement_rbf
## FALSE TRUE
## 0.06925 0.93075
# for comparison, recall the linear kernel's per-letter accuracy
diag(confusion) / rowSums(confusion)
## A B C D E F G
## 0.9230769 0.8897059 0.8450704 0.9341317 0.8355263 0.9019608 0.7500000
## H I J K L M N
## 0.6754967 0.8545455 0.8648649 0.8082192 0.8471338 0.9375000 0.8734940
## O P Q R S T U
## 0.7122302 0.7738095 0.7380952 0.8571429 0.6273292 0.8807947 0.9047619
## V W X Y Z
## 0.9264706 0.9136691 0.8616352 0.8758621 0.8354430
# Much better! "H" is still the laggard, but now only "H" and "P" fall below 90%.
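# Accuracy can often be pushed a little further by tuning the cost parameter C.
# A sketch using kernlab's built-in k-fold cross-validation (C = 5 is
# illustrative, not tuned; output omitted):
letter_classifier_rbf_c5 <- ksvm(letter ~ ., data = letters_train,
                                 kernel = "rbfdot", C = 5, cross = 5)
cross(letter_classifier_rbf_c5)  # estimated cross-validation error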