(A)
set.seed(1)
x1 = runif(500) - 0.5
x2 = runif(500) - 0.5
y = 1*(x1^2 - x2^2 > 0)
(B)
plot(x1,x2, col = ifelse(y == 1,"green", "blue"))
(C)
glm.fit = glm(y ~ x1+x2, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.179 -1.139 -1.112 1.206 1.257
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.087260 0.089579 -0.974 0.330
## x1 0.196199 0.316864 0.619 0.536
## x2 -0.002854 0.305712 -0.009 0.993
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.18 on 499 degrees of freedom
## Residual deviance: 691.79 on 497 degrees of freedom
## AIC: 697.79
##
## Number of Fisher Scoring iterations: 3
(D) Because the model contains only linear terms, its decision boundary is linear; since the true boundary |x1| = |x2| is non-linear, the fitted boundary barely shows, appearing as a nearly vertical split near the left side of the plot (a sketch that overlays the implied line follows the plot below).
df <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
glm.pred <- predict(glm.fit, df, type = "response")
glm.class = ifelse(glm.pred >= 0.47, 1, 0)
pos = df[glm.class == 1,]
neg = df[glm.class == 0,]
plot(pos$x1, pos$x2, col = "green", xlab = "x1", ylab = "x2")
points(neg$x1, neg$x2, col = "blue")
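To make the fitted boundary visible, one can overlay the line implied by the logistic-regression coefficients at the 0.47 cutoff (a minimal sketch using glm.fit from above):
# the boundary is where the linear predictor equals log(0.47/0.53),
# i.e. b0 + b1*x1 + b2*x2 = log(0.47/0.53); solve for x2
b <- coef(glm.fit)
abline(a = (log(0.47/0.53) - b["(Intercept)"]) / b["x2"], b = -b["x1"] / b["x2"], lty = 2)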
(E)
glm.fit1 <- glm(y ~ poly(x1, 2) + log(x2) + I(x1 * x2), family = 'binomial')
## Warning in log(x2): NaNs produced
summary(glm.fit1)
##
## Call:
## glm(formula = y ~ poly(x1, 2) + log(x2) + I(x1 * x2), family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.07660 -0.30318 -0.07824 0.26039 1.78835
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.0989 0.7937 -6.424 1.33e-10 ***
## poly(x1, 2)1 22.0302 15.1692 1.452 0.146
## poly(x1, 2)2 70.5508 9.7403 7.243 4.38e-13 ***
## log(x2) -3.1893 0.4915 -6.489 8.63e-11 ***
## I(x1 * x2) -7.6014 7.2496 -1.049 0.294
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 344.97 on 249 degrees of freedom
## Residual deviance: 123.09 on 245 degrees of freedom
## (250 observations deleted due to missingness)
## AIC: 133.09
##
## Number of Fisher Scoring iterations: 7
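The warning arises because log(x2) is undefined for the roughly half of the observations with negative x2, which is why 250 observations are deleted from the fit. An alternative specification (a sketch only, not used in the rest of this analysis) keeps all observations by using squared terms, which also match the true boundary x1^2 - x2^2 = 0:
# may warn that fitted probabilities of 0 or 1 occurred, since these
# terms can separate the two classes almost perfectly
glm.fit.alt <- glm(y ~ poly(x1, 2) + poly(x2, 2) + I(x1 * x2), family = "binomial")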
(F)
log.prob <- predict(glm.fit1, df, type = 'response')
## Warning in log(x2): NaNs produced
log.pred <- rep(0,500)
log.pred[log.prob>0.47] = 1
plot(df[log.pred==1,]$x1, df[log.pred==1,]$x2, col = 'blue', xlab = 'X1', ylab = 'X2')
points(df[log.pred==0,]$x1, df[log.pred==0,]$x2, col = 'green')
(G)
library(e1071)
svm.fit <- svm(as.factor(y)~ x1 + x2, df, kernel = 'linear', cost = 0.1)
svm.pred <- predict(svm.fit, df)
plot(df[svm.pred==0,]$x1, df[svm.pred==0,]$x2, col = "blue", xlab = "X1", ylab = "X2")
points(df[svm.pred==1,]$x1, df[svm.pred==1,]$x2, col = "green")
(H)
svm.fit2 <- svm(as.factor(y)~ x1 + x2, df, kernel = 'radial', gamma = 1)
svm.pred2 <- predict(svm.fit2, df)
plot(df[svm.pred2==0,]$x1, df[svm.pred2==0,]$x2, col = "red", xlab = "X1", ylab = "X2")
points(df[svm.pred2==1,]$x1, df[svm.pred2==1,]$x2, col = 'black')
(I) Both the radial-kernel SVM and the logistic regression with non-linear terms recover the non-linear decision boundary, and their predicted classes closely match the true classes of the simulated data. The SVM finds this boundary without the analyst specifying any transformations, whereas logistic regression does so only when suitable non-linear terms are supplied, which illustrates the value of SVMs with non-linear kernels.
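As a rough check of this claim, one can compute the share of training points whose predicted class matches the true class (a sketch reusing log.prob, svm.fit2, and df from above):
# rows with negative x2 are dropped for the logistic fit, since log(x2) is NaN there
c(logistic = mean(ifelse(log.prob > 0.47, 1, 0) == y, na.rm = TRUE),
  svm = mean(predict(svm.fit2, df) == df$y))
## Exercise 7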
library(ISLR)
library(e1071)
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
(A)
med.gas = median(Auto$mpg)
gas.var = ifelse(Auto$mpg > med.gas, 1, 0)
Auto$mpglevel = as.factor(gas.var)
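A quick sanity check (optional): the median split should give two groups of roughly equal size.
table(Auto$mpglevel)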
(B) The cross-validation error is minimized at cost = 1.
set.seed(1)
tune.out = tune(svm, mpglevel ~ ., data = Auto, kernel = "linear", ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.01025641
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.07653846 0.03617137
## 2 1e-01 0.04596154 0.03378238
## 3 1e+00 0.01025641 0.01792836
## 4 5e+00 0.02051282 0.02648194
## 5 1e+01 0.02051282 0.02648194
## 6 1e+02 0.03076923 0.03151981
tune.out$best.parameters
## cost
## 3 1
(C) For the SVM with a radial kernel, the cross-validation error is minimized at cost = 100 and gamma = 0.01; for the SVM with a polynomial kernel, it is minimized at cost = 100 and degree = 2.
set.seed(1)
tuneout.rad <- tune(svm, mpglevel~., data = Auto, kernel = "radial", ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100), gamma = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tuneout.rad)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 100 0.01
##
## - best performance: 0.01282051
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-02 1e-02 0.55115385 0.04366593
## 2 1e-01 1e-02 0.08929487 0.04382379
## 3 1e+00 1e-02 0.07403846 0.03522110
## 4 5e+00 1e-02 0.04852564 0.03303346
## 5 1e+01 1e-02 0.02557692 0.02093679
## 6 1e+02 1e-02 0.01282051 0.01813094
## 7 1e-02 1e-01 0.21711538 0.09865227
## 8 1e-01 1e-01 0.07903846 0.03874545
## 9 1e+00 1e-01 0.05371795 0.03525162
## 10 5e+00 1e-01 0.02820513 0.03299190
## 11 1e+01 1e-01 0.03076923 0.03375798
## 12 1e+02 1e-01 0.03583333 0.02759051
## 13 1e-02 1e+00 0.55115385 0.04366593
## 14 1e-01 1e+00 0.55115385 0.04366593
## 15 1e+00 1e+00 0.06384615 0.04375618
## 16 5e+00 1e+00 0.05884615 0.04020934
## 17 1e+01 1e+00 0.05884615 0.04020934
## 18 1e+02 1e+00 0.05884615 0.04020934
## 19 1e-02 5e+00 0.55115385 0.04366593
## 20 1e-01 5e+00 0.55115385 0.04366593
## 21 1e+00 5e+00 0.49493590 0.04724924
## 22 5e+00 5e+00 0.48217949 0.05470903
## 23 1e+01 5e+00 0.48217949 0.05470903
## 24 1e+02 5e+00 0.48217949 0.05470903
## 25 1e-02 1e+01 0.55115385 0.04366593
## 26 1e-01 1e+01 0.55115385 0.04366593
## 27 1e+00 1e+01 0.51794872 0.05063697
## 28 5e+00 1e+01 0.51794872 0.04917316
## 29 1e+01 1e+01 0.51794872 0.04917316
## 30 1e+02 1e+01 0.51794872 0.04917316
## 31 1e-02 1e+02 0.55115385 0.04366593
## 32 1e-01 1e+02 0.55115385 0.04366593
## 33 1e+00 1e+02 0.55115385 0.04366593
## 34 5e+00 1e+02 0.55115385 0.04366593
## 35 1e+01 1e+02 0.55115385 0.04366593
## 36 1e+02 1e+02 0.55115385 0.04366593
tuneout.rad$best.parameters
## cost gamma
## 6 100 0.01
set.seed(1)
tuneout.poly <- tune(svm, mpglevel~., data = Auto, kernel = "polynomial", ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100), degree = c(2,3,4)))
summary(tuneout.poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 100 2
##
## - best performance: 0.3013462
##
## - Detailed performance results:
## cost degree error dispersion
## 1 1e-02 2 0.5511538 0.04366593
## 2 1e-01 2 0.5511538 0.04366593
## 3 1e+00 2 0.5511538 0.04366593
## 4 5e+00 2 0.5511538 0.04366593
## 5 1e+01 2 0.5130128 0.08963366
## 6 1e+02 2 0.3013462 0.09961961
## 7 1e-02 3 0.5511538 0.04366593
## 8 1e-01 3 0.5511538 0.04366593
## 9 1e+00 3 0.5511538 0.04366593
## 10 5e+00 3 0.5511538 0.04366593
## 11 1e+01 3 0.5511538 0.04366593
## 12 1e+02 3 0.3446154 0.09821588
## 13 1e-02 4 0.5511538 0.04366593
## 14 1e-01 4 0.5511538 0.04366593
## 15 1e+00 4 0.5511538 0.04366593
## 16 5e+00 4 0.5511538 0.04366593
## 17 1e+01 4 0.5511538 0.04366593
## 18 1e+02 4 0.5511538 0.04366593
tuneout.poly$best.parameters
## cost degree
## 6 100 2
(D)
svm.linear <- svm(mpglevel~., data=Auto, kernel="linear", cost=1)
svm.rad <- svm(mpglevel~., data=Auto, kernel="radial", cost=10, gamma=0.01)
svm.poly <- svm(mpglevel~., data=Auto, kernel="polynomial", cost=1, degree=2)
plotpairs <- function(autofit) {
  # plot the fitted SVM against mpg and each remaining predictor
  for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
    plot(autofit, Auto, as.formula(paste("mpg~", name, sep = "")))
  }
}
plotpairs(svm.linear)
plotpairs(svm.rad)
plotpairs(svm.poly)
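For reference, the in-sample (training) error rates of the three fits can be compared directly; a quick sketch (these are not cross-validated errors):
sapply(list(linear = svm.linear, radial = svm.rad, poly = svm.poly),
       function(fit) mean(predict(fit, Auto) != Auto$mpglevel))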
## Exercise 8
attach(OJ)
str(OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
(A)
set.seed(1)
index <- sample(nrow(OJ), 800)
oj.train <- OJ[index,]
oj.test <- OJ[-index,]
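A quick check of the split sizes (OJ has 1070 rows, so the test set should contain the remaining 270 observations):
c(train = nrow(oj.train), test = nrow(oj.test))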
(B). The support vector classifier has 435 support vectors out of the 800 training observations: 219 belong to class CH and the remaining 216 to class MM.
library(e1071)
oj.svm = svm(Purchase ~ ., kernel = "linear", data = oj.train, cost = 0.01)
summary(oj.svm)
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 435
##
## ( 219 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
(C). The training error rate is 17.5% and the test error rate is about 17.8%.
train.pred = predict(oj.svm, oj.train)
table(oj.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 420 65
## MM 75 240
(75 + 65)/(420 + 65 + 75 + 240)
## [1] 0.175
test.pred = predict(oj.svm, oj.test)
table(oj.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 153 15
## MM 33 69
(15 + 33)/(153 + 15 + 33 + 69)
## [1] 0.1777778
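The same error rates can be computed more compactly from the predictions above (equivalent to the table-based calculations):
mean(train.pred != oj.train$Purchase)  # training error
mean(test.pred != oj.test$Purchase)    # test error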
(D). Using the tune() function, the optimal cost is 0.1.
set.seed(1)
oj.tune <- tune(svm, Purchase ~ ., data = oj.train, kernel = "linear", ranges = list(cost=c(0.001, 0.01, 0.1, 1, 5, 10)))
summary(oj.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.1725
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.31250 0.04124790
## 2 1e-02 0.17625 0.02853482
## 3 1e-01 0.17250 0.03162278
## 4 1e+00 0.17500 0.02946278
## 5 5e+00 0.17250 0.03162278
## 6 1e+01 0.17375 0.03197764
oj.tune$best.parameters
## cost
## 3 0.1
(E). With the tuned cost, the training and test error rates drop to 16.5% and about 16.3%, respectively.
oj.svm1 <- svm(Purchase ~ ., kernel = "linear", data = oj.train, cost = oj.tune$best.parameters$cost)
train.pred1 <- predict(oj.svm1, oj.train)
table(oj.train$Purchase, train.pred1)
## train.pred1
## CH MM
## CH 422 63
## MM 69 246
(63 + 69)/(422 + 63 + 69 + 246)
## [1] 0.165
test.pred1 <- predict(oj.svm1, oj.test)
table(oj.test$Purchase, test.pred1)
## test.pred1
## CH MM
## CH 155 13
## MM 31 71
(13 + 31)/(155 + 13 + 31 + 71)
## [1] 0.162963
(F)
set.seed(1)
ojrad.svm <- svm(Purchase ~ ., data = oj.train, kernel = "radial")
summary(ojrad.svm )
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 373
##
## ( 188 185 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred2 <- predict(ojrad.svm, oj.train)
table(oj.train$Purchase, train.pred2)
## train.pred2
## CH MM
## CH 441 44
## MM 77 238
(44 + 77)/(441 + 44 + 77 + 238)
## [1] 0.15125
test.pred2 <- predict(ojrad.svm, oj.test)
table(oj.test$Purchase, test.pred2)
## test.pred2
## CH MM
## CH 151 17
## MM 33 69
(17 + 33)/(151 + 17 + 33 + 69)
## [1] 0.1851852
The radial-kernel SVM has 373 support vectors: 188 belong to class CH and the remaining 185 to class MM. This classifier has a training error rate of about 15.1% and a test error rate of about 18.5%.
set.seed(1)
ojtune.rad = tune(svm, Purchase ~ ., data = oj.train, kernel = "radial", ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 100)))
summary(ojtune.rad)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.17125
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.39375 0.04007372
## 2 1e-02 0.39375 0.04007372
## 3 1e-01 0.18625 0.02853482
## 4 1e+00 0.17125 0.02128673
## 5 5e+00 0.18000 0.02220485
## 6 1e+02 0.21750 0.04048319
ojtune.rad$best.parameters
## cost
## 4 1
ojrad.svm1 <- svm(Purchase ~ ., kernel = "radial", data = oj.train, cost = ojtune.rad$best.parameters$cost)
train.pred3 <- predict(ojrad.svm1, oj.train)
table(oj.train$Purchase, train.pred3)
## train.pred3
## CH MM
## CH 441 44
## MM 77 238
(44+77)/(441+44+77+238)
## [1] 0.15125
test.pred3 <- predict(ojrad.svm1, oj.test)
table(oj.test$Purchase, test.pred3)
## test.pred3
## CH MM
## CH 151 17
## MM 33 69
(17+33)/(151+17+33+69)
## [1] 0.1851852
After tuning, the training and test error rates are unchanged, because the selected cost of 1 is the same as the default cost used in the initial radial fit.
(G)
ojpoly.svm <- svm(Purchase ~ ., kernel = "poly", data = oj.train, degree=2)
summary(ojpoly.svm)
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "poly", degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 447
##
## ( 225 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred4 <- predict(ojpoly.svm, oj.train)
table(oj.train$Purchase, train.pred4)
## train.pred4
## CH MM
## CH 449 36
## MM 110 205
(36+110)/(449+36+110+205)
## [1] 0.1825
test.pred4 <- predict(ojpoly.svm, oj.test)
table(oj.test$Purchase, test.pred4)
## test.pred4
## CH MM
## CH 153 15
## MM 45 57
(15+45)/(153+15+45+57)
## [1] 0.2222222
The polynomial kernel with degree 2 gives 447 support vectors, of which 225 belong to class CH and the remaining 222 to class MM. This classifier has a training error rate of 18.25% and a test error rate of about 22.2%.
set.seed(1)
ojtune.poly <- tune(svm, Purchase ~ ., data = oj.train, kernel = "poly", degree = 2, ranges = list(cost=c(0.001, 0.01, 0.1, 1, 5, 10)))
summary(ojtune.poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.18125
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.39375 0.04007372
## 2 1e-02 0.39125 0.04210189
## 3 1e-01 0.32125 0.05001736
## 4 1e+00 0.20250 0.04116363
## 5 5e+00 0.18250 0.03496029
## 6 1e+01 0.18125 0.02779513
ojtune.poly$best.parameters
## cost
## 6 10
ojpoly.svm1 = svm(Purchase~., data = oj.train, kernel = "poly", cost = 10, degree = 2)
summary(ojpoly.svm1 )
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "poly", cost = 10,
## degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 10
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 340
##
## ( 171 169 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred5 <- predict(ojpoly.svm1, oj.train)
table(oj.train$Purchase, train.pred5)
## train.pred5
## CH MM
## CH 447 38
## MM 82 233
(38+82)/(447+38+82+233)
## [1] 0.15
test.pred5 <- predict(ojpoly.svm1, oj.test)
table(oj.test$Purchase, test.pred5)
## test.pred5
## CH MM
## CH 154 14
## MM 37 65
(14+37)/(154+14+37+65)
## [1] 0.1888889
(H). The tuned polynomial SVM (cost = 10) gives the lowest training error rate (15%), but the tuned linear SVM gives the lowest test error rate (about 16.3%, versus about 18.5% for the radial kernel and 18.9% for the tuned polynomial kernel), so the linear kernel appears to perform best on this data.
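A compact side-by-side check of the tuned fits' test error rates (reusing the model objects above):
sapply(list(linear = oj.svm1, radial = ojrad.svm1, poly = ojpoly.svm1),
       function(fit) mean(predict(fit, oj.test) != oj.test$Purchase))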