# Show the compact structure of the letter data: size, column names and types.
str(object = letterdata)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
# Preview every column together with its type and leading values.
glimpse(x = letterdata)
## Rows: 20,000
## Columns: 17
## $ letter <fct> T, I, D, N, G, S, B, A, J, M, X, O, G, M, R, F, O, C, T, J, J, …
## $ xbox <int> 2, 5, 4, 7, 2, 4, 4, 1, 2, 11, 3, 6, 4, 6, 5, 6, 3, 7, 6, 2, 1,…
## $ ybox <int> 8, 12, 11, 11, 1, 11, 2, 1, 2, 15, 9, 13, 9, 9, 9, 9, 4, 10, 11…
## $ width <int> 3, 3, 6, 6, 3, 5, 5, 3, 4, 13, 5, 4, 6, 8, 5, 5, 4, 5, 6, 3, 2,…
## $ height <int> 5, 7, 8, 6, 1, 8, 4, 2, 4, 9, 7, 7, 7, 6, 7, 4, 3, 5, 8, 3, 2, …
## $ onpix <int> 1, 2, 6, 3, 1, 3, 4, 1, 2, 7, 4, 4, 6, 9, 6, 3, 2, 2, 5, 1, 1, …
## $ xbar <int> 8, 10, 10, 5, 8, 8, 8, 8, 10, 13, 8, 6, 7, 7, 6, 10, 8, 6, 6, 1…
## $ ybar <int> 13, 5, 6, 9, 6, 8, 7, 2, 6, 2, 7, 7, 8, 8, 11, 6, 7, 8, 11, 6, …
## $ x2bar <int> 0, 5, 2, 4, 6, 6, 6, 2, 2, 6, 3, 6, 6, 6, 7, 3, 7, 6, 5, 3, 2, …
## $ y2bar <int> 6, 4, 6, 6, 6, 9, 6, 2, 6, 2, 8, 3, 2, 5, 3, 5, 5, 8, 6, 6, 5, …
## $ xybar <int> 6, 13, 10, 4, 6, 5, 7, 8, 12, 12, 5, 10, 6, 7, 7, 10, 7, 11, 11…
## $ x2ybar <int> 10, 3, 3, 4, 5, 6, 6, 2, 4, 1, 6, 7, 5, 5, 3, 5, 6, 7, 9, 4, 5,…
## $ xy2bar <int> 8, 9, 7, 10, 9, 6, 6, 8, 8, 9, 8, 9, 11, 8, 9, 7, 8, 11, 4, 9, …
## $ xedge <int> 0, 2, 3, 6, 1, 0, 2, 1, 1, 8, 2, 5, 4, 8, 2, 3, 2, 2, 3, 0, 0, …
## $ xedgey <int> 8, 8, 7, 10, 7, 8, 8, 6, 6, 1, 8, 9, 8, 9, 7, 9, 8, 8, 12, 7, 7…
## $ yedge <int> 0, 4, 3, 2, 5, 9, 7, 2, 1, 1, 6, 5, 7, 8, 5, 6, 3, 5, 2, 1, 0, …
## $ yedgex <int> 8, 10, 9, 8, 10, 7, 10, 7, 7, 8, 7, 8, 8, 6, 11, 9, 8, 9, 4, 7,…
# Per-column summary: level counts for the factor column, five-number summary
# plus mean for the integer columns.
summary(object = letterdata)
## letter xbox ybox width
## U : 813 Min. : 0.000 Min. : 0.000 Min. : 0.000
## D : 805 1st Qu.: 3.000 1st Qu.: 5.000 1st Qu.: 4.000
## P : 803 Median : 4.000 Median : 7.000 Median : 5.000
## T : 796 Mean : 4.024 Mean : 7.035 Mean : 5.122
## M : 792 3rd Qu.: 5.000 3rd Qu.: 9.000 3rd Qu.: 6.000
## A : 789 Max. :15.000 Max. :15.000 Max. :15.000
## (Other):15202
## height onpix xbar ybar
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.0
## 1st Qu.: 4.000 1st Qu.: 2.000 1st Qu.: 6.000 1st Qu.: 6.0
## Median : 6.000 Median : 3.000 Median : 7.000 Median : 7.0
## Mean : 5.372 Mean : 3.506 Mean : 6.898 Mean : 7.5
## 3rd Qu.: 7.000 3rd Qu.: 5.000 3rd Qu.: 8.000 3rd Qu.: 9.0
## Max. :15.000 Max. :15.000 Max. :15.000 Max. :15.0
##
## x2bar y2bar xybar x2ybar
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 4.000 1st Qu.: 7.000 1st Qu.: 5.000
## Median : 4.000 Median : 5.000 Median : 8.000 Median : 6.000
## Mean : 4.629 Mean : 5.179 Mean : 8.282 Mean : 6.454
## 3rd Qu.: 6.000 3rd Qu.: 7.000 3rd Qu.:10.000 3rd Qu.: 8.000
## Max. :15.000 Max. :15.000 Max. :15.000 Max. :15.000
##
## xy2bar xedge xedgey yedge
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 7.000 1st Qu.: 1.000 1st Qu.: 8.000 1st Qu.: 2.000
## Median : 8.000 Median : 3.000 Median : 8.000 Median : 3.000
## Mean : 7.929 Mean : 3.046 Mean : 8.339 Mean : 3.692
## 3rd Qu.: 9.000 3rd Qu.: 4.000 3rd Qu.: 9.000 3rd Qu.: 5.000
## Max. :15.000 Max. :15.000 Max. :15.000 Max. :15.000
##
## yedgex
## Min. : 0.000
## 1st Qu.: 7.000
## Median : 8.000
## Mean : 7.801
## 3rd Qu.: 9.000
## Max. :15.000
##
# Total number of missing cells across the whole data frame.
letterdata %>% is.na() %>% sum()
## [1] 0
# Rows and columns of the full data set (label column included).
letterdata %>% dim()
## [1] 20000 17
# Predictor-only copy: drop the `letter` label column.
letter1 <- select(letterdata, -letter)
letter1 %>% dim()
## [1] 20000 16
### Exploratory data analysis
# Reshape the predictors to long form (one row per variable/value pair) so all
# of them can be drawn side by side in a single boxplot.
newdata <- letter1 %>% melt()
## Using as id variables
boxplot(value ~ variable, data = newdata)
# Scatterplot matrix of all sixteen predictors.
pairs.panels(
  letter1[, c(
    "xbox", "ybox", "width", "height",
    "onpix", "xbar", "ybar", "x2bar",
    "y2bar", "xybar", "x2ybar", "xy2bar",
    "xedge", "xedgey", "yedge", "yedgex"
  )],
  pch = ","
)
# Positional split: the first 16,000 rows train the model, the remaining
# 4,000 rows are held out for testing.
letter_train <- letterdata[seq_len(16000), ]
letter_test <- letterdata[seq(16001, 20000), ]
# Fit an SVM with the "vanilladot" kernel, using every other column as a
# predictor of `letter`.
letter_class <- ksvm(letter ~ ., data = letter_train, kernel = "vanilladot")
## Setting default kernel parameters
# Predict the letter for each held-out row and peek at the first six results.
letter_pred <- predict(letter_class, newdata = letter_test)
head(letter_pred, n = 6L)
## [1] U N V X N H
## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# Confusion matrix: predicted letters in rows, true letters in columns.
table(letter_pred, letter_test$letter)
##
## letter_pred A B C D E F G H I J K L M N O P Q
## A 144 0 0 0 0 0 0 0 0 1 0 0 1 2 2 0 5
## B 0 121 0 5 2 0 1 2 0 0 1 0 1 0 0 2 2
## C 0 0 120 0 4 0 10 2 2 0 1 3 0 0 2 0 0
## D 2 2 0 156 0 1 3 10 4 3 4 3 0 5 5 3 1
## E 0 0 5 0 127 3 1 1 0 0 3 4 0 0 0 0 2
## F 0 0 0 0 0 138 2 2 6 0 0 0 0 0 0 16 0
## G 1 1 2 1 9 2 123 2 0 0 1 2 1 0 1 2 8
## H 0 0 0 1 0 1 0 102 0 2 3 2 3 4 20 0 2
## I 0 1 0 0 0 1 0 0 141 8 0 0 0 0 0 1 0
## J 0 1 0 0 0 1 0 2 5 128 0 0 0 0 1 1 3
## K 1 1 9 0 0 0 2 5 0 0 118 0 0 2 0 1 0
## L 0 0 0 0 2 0 1 1 0 0 0 133 0 0 0 0 1
## M 0 0 1 1 0 0 1 1 0 0 0 0 135 4 0 0 0
## N 0 0 0 0 0 1 0 1 0 0 0 0 0 145 0 0 0
## O 1 0 2 1 0 0 1 2 0 1 0 0 0 1 99 3 3
## P 0 0 0 1 0 2 1 0 0 0 0 0 0 0 2 130 0
## Q 0 0 0 0 0 0 8 2 0 0 0 3 0 0 3 1 124
## R 0 7 0 0 1 0 3 8 0 0 13 0 0 1 1 1 0
## S 1 1 0 0 1 0 3 0 1 1 0 1 0 0 0 0 14
## T 0 0 0 0 3 2 0 0 0 0 1 0 0 0 0 0 0
## U 1 0 3 1 0 0 0 2 0 0 0 0 0 0 1 0 0
## V 0 0 0 0 0 1 3 4 0 0 0 0 1 2 1 0 3
## W 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0
## X 0 1 0 0 2 0 0 1 3 0 1 6 0 0 1 0 0
## Y 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0 7 0
## Z 2 0 0 0 1 0 0 0 3 4 0 0 0 0 0 0 0
##
## letter_pred R S T U V W X Y Z
## A 0 1 1 1 0 1 0 0 1
## B 3 5 0 0 2 0 1 0 0
## C 0 0 0 0 0 0 0 0 0
## D 4 0 0 0 0 0 3 3 1
## E 0 10 0 0 0 0 2 0 3
## F 0 3 0 0 1 0 1 2 0
## G 2 4 3 0 0 0 1 0 0
## H 3 0 3 0 2 0 0 1 0
## I 0 3 0 0 0 0 5 1 1
## J 0 2 0 0 0 0 1 0 6
## K 7 0 1 3 0 0 5 0 0
## L 0 5 0 0 0 0 0 0 1
## M 0 0 0 3 0 8 0 0 0
## N 3 0 0 1 0 2 0 0 0
## O 0 0 0 3 0 0 0 0 0
## P 0 0 0 0 0 0 0 1 0
## Q 0 5 0 0 0 0 0 2 0
## R 138 0 1 0 1 0 0 0 0
## S 0 101 3 0 0 0 2 0 10
## T 0 3 133 1 0 0 0 2 2
## U 0 0 0 152 0 0 1 1 0
## V 1 0 0 0 126 1 0 4 0
## W 0 0 0 4 4 127 0 0 0
## X 0 1 0 0 0 0 137 1 1
## Y 0 0 3 0 0 0 0 127 0
## Z 0 18 3 0 0 0 0 0 132
# Per test row: does the first model's prediction match the true letter?
agreement <- letter_test[["letter"]] == letter_pred
# Number of misses (FALSE) and hits (TRUE).
table(agreement)
## agreement
## FALSE TRUE
## 643 3357
# The same tally expressed as proportions of the test set.
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.16075 0.83925
# Fix the RNG state before fitting so the run can be repeated.
set.seed(12345)
# Second model: same formula and training data, but with the "rbfdot" kernel.
letter_class2 <- ksvm(letter ~ ., data = letter_train, kernel = "rbfdot")
### Predict on the held-out rows
letter_pred2 <- predict(letter_class2, newdata = letter_test)
# Per test row: does the second model's prediction match the true letter?
agreement2 <- letter_test[["letter"]] == letter_pred2
table(agreement2)
## agreement2
## FALSE TRUE
## 278 3722
# Hit and miss rates as proportions of the test set.
prop.table(table(agreement2))
## agreement2
## FALSE TRUE
## 0.0695 0.9305
# Tune the cost parameter C of the RBF-kernel SVM: fit one model per candidate
# cost on the training rows and record its accuracy on the held-out test rows.
cost_values <- c(1, seq(from = 5, to = 50, by = 5))

# vapply() (rather than sapply()) guarantees a plain numeric vector holding
# exactly one accuracy per cost value.
accuracy_values <- vapply(cost_values, function(x) {
  # Re-seed before every fit so each cost value starts from the same RNG state.
  set.seed(12345)
  m <- ksvm(letter ~ ., data = letter_train, kernel = "rbfdot", C = x)
  pred <- predict(m, letter_test)
  # The mean of the logical match vector is the share of correct predictions.
  mean(pred == letter_test$letter)
}, numeric(1))

# Accuracy as a function of cost. The argument is `type`, not `types`: the
# misspelled name is not a plot() argument, so the intended points-plus-lines
# style ("b") was never applied.
plot(cost_values, accuracy_values, type = "b", col = cost_values, pch = 19)