set.seed(1)
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- 1 * (x1^2 - x2^2 > 0)
plot(x1, x2, xlab = "x1", ylab = "x2", col = (4 - y), pch = (3 - y))
set.seed(1)
glm.fit <- glm(y ~ x1 + x2, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.179 -1.139 -1.112 1.206 1.257
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.087260 0.089579 -0.974 0.330
## x1 0.196199 0.316864 0.619 0.536
## x2 -0.002854 0.305712 -0.009 0.993
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.18 on 499 degrees of freedom
## Residual deviance: 691.79 on 497 degrees of freedom
## AIC: 697.79
##
## Number of Fisher Scoring iterations: 3
data <- data.frame(x1 = x1, x2 = x2, y = y)
glm.probs <- predict(glm.fit, data, type = "response")
glm.preds <- rep(0, 500)
glm.preds[glm.probs > 0.48] <- 1
plot(data[glm.preds == 1, ]$x1, data[glm.preds == 1, ]$x2, col = (4 - 1), pch = (3 - 1), xlab = "X1", ylab = "X2")
points(data[glm.preds == 0, ]$x1, data[glm.preds == 0, ]$x2, col = (4 - 0), pch = (3 - 0))
glm.fit1 <- glm(y ~ poly(x1, 2) + poly(x2, 2) + I(x1 * x2), family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.fit1)
##
## Call:
## glm(formula = y ~ poly(x1, 2) + poly(x2, 2) + I(x1 * x2), family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.240e-04 -2.000e-08 -2.000e-08 2.000e-08 1.163e-03
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -102.2 4302.0 -0.024 0.981
## poly(x1, 2)1 2715.3 141109.5 0.019 0.985
## poly(x1, 2)2 27218.5 842987.2 0.032 0.974
## poly(x2, 2)1 -279.7 97160.4 -0.003 0.998
## poly(x2, 2)2 -28693.0 875451.3 -0.033 0.974
## I(x1 * x2) -206.4 41802.8 -0.005 0.996
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9218e+02 on 499 degrees of freedom
## Residual deviance: 3.5810e-06 on 494 degrees of freedom
## AIC: 12
##
## Number of Fisher Scoring iterations: 25
glm.probs1 <- predict(glm.fit1, data, type = "response")
glm.preds1 <- rep(0, 500)
glm.preds1[glm.probs1 > 0.48] <- 1
plot(data[glm.preds1 == 1, ]$x1, data[glm.preds1 == 1, ]$x2, col = (4 - 1), pch = (3 - 1), xlab = "X1", ylab = "X2")
points(data[glm.preds1 == 0, ]$x1, data[glm.preds1 == 0, ]$x2, col = (4 - 0), pch = (3 - 0))
library(e1071)
svm.fit <- svm(as.factor(y) ~ x1 + x2,data, kernel = "linear", cost = 0.01)
svm.preds <- predict(svm.fit, data)
plot(data[svm.preds == 0, ]$x1, data[svm.preds == 0, ]$x2, col = (4 - 0), pch = (3 - 0), xlab = "X1", ylab = "X2")
points(data[svm.preds == 1, ]$x1, data[svm.preds == 1, ]$x2, col = (4 - 1), pch = (3 - 1))
svm.fit1 <- svm(as.factor(y) ~ x1 + x2, data, kernel = "radial", gamma = 1)
svm.preds1 <- predict(svm.fit1, data)
plot(data[svm.preds1 == 0, ]$x1, data[svm.preds1 == 0, ]$x2, col = (4 - 0), pch = (3 - 0), xlab = "X1", ylab = "X2")
points(data[svm.preds1 == 1, ]$x1, data[svm.preds1 == 1, ]$x2, col = (4 - 1), pch = (3 - 1))
logistic regression w/ interaction terms and SVM non linear are good at identifying non linear decision boundaries. SVM with linear kernel and logistic without interactions are not effective in identifying non linear decision boundaries. we only tuned gamma for radial kernel so factors such as best interactions and tuning should be taken into consideration
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(e1071)
attach(Auto)
newvar <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto$mpglevel <- as.factor(newvar)
set.seed(1)
tuneout <- tune(svm, mpglevel ~ ., data = Auto, kernel = "linear", ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tuneout)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.01025641
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.07653846 0.03617137
## 2 1e-01 0.04596154 0.03378238
## 3 1e+00 0.01025641 0.01792836
## 4 5e+00 0.02051282 0.02648194
## 5 1e+01 0.02051282 0.02648194
## 6 1e+02 0.03076923 0.03151981
cross-validation error is lowest when cost = 1.
set.seed(1)
tuneout1 <- tune(svm, mpglevel ~ ., data = Auto, kernel = "polynomial", ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100), degree = c(2, 3, 4)))
summary(tuneout1)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 100 2
##
## - best performance: 0.3013462
##
## - Detailed performance results:
## cost degree error dispersion
## 1 1e-02 2 0.5511538 0.04366593
## 2 1e-01 2 0.5511538 0.04366593
## 3 1e+00 2 0.5511538 0.04366593
## 4 5e+00 2 0.5511538 0.04366593
## 5 1e+01 2 0.5130128 0.08963366
## 6 1e+02 2 0.3013462 0.09961961
## 7 1e-02 3 0.5511538 0.04366593
## 8 1e-01 3 0.5511538 0.04366593
## 9 1e+00 3 0.5511538 0.04366593
## 10 5e+00 3 0.5511538 0.04366593
## 11 1e+01 3 0.5511538 0.04366593
## 12 1e+02 3 0.3446154 0.09821588
## 13 1e-02 4 0.5511538 0.04366593
## 14 1e-01 4 0.5511538 0.04366593
## 15 1e+00 4 0.5511538 0.04366593
## 16 5e+00 4 0.5511538 0.04366593
## 17 1e+01 4 0.5511538 0.04366593
## 18 1e+02 4 0.5511538 0.04366593
lowest cv error obtained for degree = 2 and cost = 100
set.seed(1)
tuneout2 <- tune(svm, mpglevel ~ ., data = Auto, kernel = "radial", ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100), gamma = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tuneout2)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 100 0.01
##
## - best performance: 0.01282051
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-02 1e-02 0.55115385 0.04366593
## 2 1e-01 1e-02 0.08929487 0.04382379
## 3 1e+00 1e-02 0.07403846 0.03522110
## 4 5e+00 1e-02 0.04852564 0.03303346
## 5 1e+01 1e-02 0.02557692 0.02093679
## 6 1e+02 1e-02 0.01282051 0.01813094
## 7 1e-02 1e-01 0.21711538 0.09865227
## 8 1e-01 1e-01 0.07903846 0.03874545
## 9 1e+00 1e-01 0.05371795 0.03525162
## 10 5e+00 1e-01 0.02820513 0.03299190
## 11 1e+01 1e-01 0.03076923 0.03375798
## 12 1e+02 1e-01 0.03583333 0.02759051
## 13 1e-02 1e+00 0.55115385 0.04366593
## 14 1e-01 1e+00 0.55115385 0.04366593
## 15 1e+00 1e+00 0.06384615 0.04375618
## 16 5e+00 1e+00 0.05884615 0.04020934
## 17 1e+01 1e+00 0.05884615 0.04020934
## 18 1e+02 1e+00 0.05884615 0.04020934
## 19 1e-02 5e+00 0.55115385 0.04366593
## 20 1e-01 5e+00 0.55115385 0.04366593
## 21 1e+00 5e+00 0.49493590 0.04724924
## 22 5e+00 5e+00 0.48217949 0.05470903
## 23 1e+01 5e+00 0.48217949 0.05470903
## 24 1e+02 5e+00 0.48217949 0.05470903
## 25 1e-02 1e+01 0.55115385 0.04366593
## 26 1e-01 1e+01 0.55115385 0.04366593
## 27 1e+00 1e+01 0.51794872 0.05063697
## 28 5e+00 1e+01 0.51794872 0.04917316
## 29 1e+01 1e+01 0.51794872 0.04917316
## 30 1e+02 1e+01 0.51794872 0.04917316
## 31 1e-02 1e+02 0.55115385 0.04366593
## 32 1e-01 1e+02 0.55115385 0.04366593
## 33 1e+00 1e+02 0.55115385 0.04366593
## 34 5e+00 1e+02 0.55115385 0.04366593
## 35 1e+01 1e+02 0.55115385 0.04366593
## 36 1e+02 1e+02 0.55115385 0.04366593
lowest cv error obtained for gamma = 0.01 and cost = 100
svm.linear <- svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm.poly <- svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 100, degree = 2)
svm.radial <- svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 100, gamma = 0.01)
plotpairs = function(fit) {
for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
}
}
plotpairs(svm.linear)
plotpairs(svm.poly)
plotpairs(svm.radial)
attach(OJ)
set.seed(1)
row.number <- sample(nrow(OJ), 800)
OJ_train <- OJ[row.number, ]
OJ_test <- OJ[-row.number, ]
set.seed(1)
ojsvmlinear = svm(Purchase ~ ., kernel = "linear", data = OJ_train, cost = 0.01)
summary(ojsvmlinear)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ_train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 435
##
## ( 219 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
SVM creates 435 support vectors. 219 belong to the CH level 216 belong to the MM level
ojsvm.train.pred <- predict(ojsvmlinear, OJ_train)
table(OJ_train$Purchase, ojsvm.train.pred)
## ojsvm.train.pred
## CH MM
## CH 420 65
## MM 75 240
(75+65)/(420+65+75+240)
## [1] 0.175
The training error rate is 17.5%
ojsvm.test.pred <- predict(ojsvmlinear, OJ_test)
table(OJ_test$Purchase, ojsvm.test.pred)
## ojsvm.test.pred
## CH MM
## CH 153 15
## MM 33 69
(33+15)/(153+15+33+69)
## [1] 0.1777778
The training error rate is 17.78%
set.seed(1)
ojtuneout <- tune(svm, Purchase ~ ., data = OJ_train, kernel = "linear", ranges = list(cost = 10^seq(-2, 1, by = 0.25)))
summary(ojtuneout)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 3.162278
##
## - best performance: 0.16875
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.17625 0.02853482
## 2 0.01778279 0.17625 0.03143004
## 3 0.03162278 0.17125 0.02829041
## 4 0.05623413 0.17625 0.02853482
## 5 0.10000000 0.17250 0.03162278
## 6 0.17782794 0.17125 0.02829041
## 7 0.31622777 0.17125 0.02889757
## 8 0.56234133 0.17125 0.02703521
## 9 1.00000000 0.17500 0.02946278
## 10 1.77827941 0.17375 0.02729087
## 11 3.16227766 0.16875 0.03019037
## 12 5.62341325 0.17375 0.03304563
## 13 10.00000000 0.17375 0.03197764
optimal cost = 3.162278.
ojsvmlinear1 <- svm(Purchase ~ ., kernel = "linear", data = OJ_train, cost = ojtuneout$best.parameter$cost)
ojsvm.train.pred1 <- predict(ojsvmlinear1, OJ_train)
table(OJ_train$Purchase, ojsvm.train.pred1)
## ojsvm.train.pred1
## CH MM
## CH 423 62
## MM 70 245
(70+62)/(423+62+70+245)
## [1] 0.165
The train error rate decreased to 16.5%
ojsvm.test.pred1 <- predict(ojsvmlinear1, OJ_test)
table(OJ_test$Purchase, ojsvm.test.pred1)
## ojsvm.test.pred1
## CH MM
## CH 156 12
## MM 29 73
(29+12)/(156+12+29+73)
## [1] 0.1518519
The test error rate decreased to 15.19%
set.seed(1)
ojsvmradial <- svm(Purchase ~ ., kernel = "radial", data = OJ_train)
summary(ojsvmradial)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ_train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 373
##
## ( 188 185 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
ojsvm.train.pred2 <- predict(ojsvmradial, OJ_train)
table(OJ_train$Purchase, ojsvm.train.pred2)
## ojsvm.train.pred2
## CH MM
## CH 441 44
## MM 77 238
(77+44)/(441+44+77+238)
## [1] 0.15125
ojsvm.test.pred2 <- predict(ojsvmradial, OJ_test)
table(OJ_test$Purchase, ojsvm.test.pred2)
## ojsvm.test.pred2
## CH MM
## CH 151 17
## MM 33 69
(33+17)/(151+17+33+69)
## [1] 0.1851852
The radial kernel with default gamma creates 373 support vectors 188 belong to level CH 185 belong to level MM
The classifier has a training error = 15.13%, test error = 18.52%. Train error lower than linear kernel Test error higher than linear kernel
set.seed(1)
ojtuneout1 <- tune(svm, Purchase ~ ., data = OJ_train, kernel = "radial", ranges = list(cost = 10^seq(-2,
1, by = 0.25)))
summary(ojtuneout1)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.5623413
##
## - best performance: 0.16875
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.39375 0.04007372
## 2 0.01778279 0.39375 0.04007372
## 3 0.03162278 0.35750 0.05927806
## 4 0.05623413 0.19500 0.02443813
## 5 0.10000000 0.18625 0.02853482
## 6 0.17782794 0.18250 0.03291403
## 7 0.31622777 0.17875 0.03230175
## 8 0.56234133 0.16875 0.02651650
## 9 1.00000000 0.17125 0.02128673
## 10 1.77827941 0.17625 0.02079162
## 11 3.16227766 0.17750 0.02266912
## 12 5.62341325 0.18000 0.02220485
## 13 10.00000000 0.18625 0.02853482
ojsvmradial1 <- svm(Purchase ~ ., kernel = "radial", data = OJ_train, cost = ojtuneout1$best.parameter$cost)
summary(ojsvmradial1)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ_train, kernel = "radial", cost = ojtuneout1$best.parameter$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.5623413
##
## Number of Support Vectors: 397
##
## ( 200 197 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
ojsvm.train.pred3 <- predict(ojsvmradial1, OJ_train)
table(OJ_train$Purchase, ojsvm.train.pred3)
## ojsvm.train.pred3
## CH MM
## CH 437 48
## MM 71 244
(71+48)/(437+48+71+244)
## [1] 0.14875
The train error decreased to 14.88%
ojsvm.test.pred3 <- predict(ojsvmradial1, OJ_test)
table(OJ_test$Purchase, ojsvm.test.pred3)
## ojsvm.test.pred3
## CH MM
## CH 150 18
## MM 30 72
(30+18)/(150+18+30+72)
## [1] 0.1777778
The train error decreased to 17.78%.
set.seed(1)
ojsvm.poly <- svm(Purchase ~ ., kernel = "polynomial", data = OJ_train, degree = 2)
summary(ojsvm.poly)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ_train, kernel = "polynomial",
## degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 447
##
## ( 225 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The polynomial kernel with default gamma creates 447 support vectors 225 belonging to level CH 222 belong to level MM
ojsvm.train.pred4 <- predict(ojsvm.poly, OJ_train)
table(OJ_train$Purchase, ojsvm.train.pred4)
## ojsvm.train.pred4
## CH MM
## CH 449 36
## MM 110 205
(110+36)/(449+36+110+205)
## [1] 0.1825
The train error is 18.25%
ojsvm.test.pred4 <- predict(ojsvm.poly, OJ_test)
table(OJ_test$Purchase, ojsvm.test.pred4)
## ojsvm.test.pred4
## CH MM
## CH 153 15
## MM 45 57
(45+15)/(153+15+45+57)
## [1] 0.2222222
The test error is 22.22%.
set.seed(1)
ojtune.out2 <- tune(svm, Purchase ~ ., data = OJ_train, kernel = "polynomial", degree = 2, ranges = list(cost = 10^seq(-2,
1, by = 0.25)))
summary(ojtune.out2)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 3.162278
##
## - best performance: 0.1775
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.39125 0.04210189
## 2 0.01778279 0.37125 0.03537988
## 3 0.03162278 0.36500 0.03476109
## 4 0.05623413 0.33750 0.04714045
## 5 0.10000000 0.32125 0.05001736
## 6 0.17782794 0.24500 0.04758034
## 7 0.31622777 0.19875 0.03972562
## 8 0.56234133 0.20500 0.03961621
## 9 1.00000000 0.20250 0.04116363
## 10 1.77827941 0.18500 0.04199868
## 11 3.16227766 0.17750 0.03670453
## 12 5.62341325 0.18375 0.03064696
## 13 10.00000000 0.18125 0.02779513
ojsvm.poly1 <- svm(Purchase ~ ., kernel = "polynomial", degree = 2, data = OJ_train, cost = ojtune.out2$best.parameter$cost)
summary(ojsvm.poly1)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ_train, kernel = "polynomial",
## degree = 2, cost = ojtune.out2$best.parameter$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 3.162278
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 385
##
## ( 197 188 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
ojsvm.train.pred5 <- predict(ojsvm.poly1, OJ_train)
table(OJ_train$Purchase, ojsvm.train.pred5)
## ojsvm.train.pred5
## CH MM
## CH 451 34
## MM 90 225
(90+34)/(451+34+90+225)
## [1] 0.155
The train error rate decreased to 15.5%
ojsvm.test.pred5 <- predict(ojsvm.poly1, OJ_test)
table(OJ_test$Purchase, ojsvm.test.pred5)
## ojsvm.test.pred5
## CH MM
## CH 154 14
## MM 41 61
(41+14)/(154+14+41+61)
## [1] 0.2037037
The test error rate decreased to 20.37%
SVM model with the radial kernel and optimal cost seems to give the best results