Q.5

(a) Generate a data set with n = 500 and p = 2, such that the observations belong to two classes with a quadratic decision boundary between them.

# no seed is set, so the simulated data (and all results below) vary between runs
x1 = runif(500) - 0.5
x2 = runif(500) - 0.5
y = 1 * (x1^2 - x2^2 > 0)

(b) Plot the observations, colored according to their class labels.

library(ggplot2)

ggplot() + geom_point(
  mapping = aes(x1[y == 0], x2[y == 0]),
  col = "blue",
  alpha = 0.5,
  size = 2
) + geom_point(
  mapping = aes(x1[y == 1], x2[y == 1]),
  col = "orange",
  alpha = 0.8,
  size = 2
) + labs(x = "x1", y = "x2")

(c) Fit a logistic regression model

df=as.data.frame(cbind(x1, x2, y))

glm.fit = glm(y ~ ., data = df, family = "binomial")
summary(glm.fit)
## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.23819  -1.17504  -0.00174   1.16878   1.24401  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.0007834  0.0895614   0.009    0.993
## x1           0.2995040  0.3063108   0.978    0.328
## x2          -0.0351100  0.3040163  -0.115    0.908
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 693.15  on 499  degrees of freedom
## Residual deviance: 692.19  on 497  degrees of freedom
## AIC: 698.19
## 
## Number of Fisher Scoring iterations: 3

(d) Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be linear.

glm.probs = predict(glm.fit, newdata = df, type = "response")
glm.preds = ifelse(glm.probs > 0.5, 1, 0)
pred1 = df[glm.preds == 1, ]
pred2 = df[glm.preds == 0, ]

ggplot() +
  geom_point(
    data = pred1,
    mapping = aes(x1, x2),
    col = "slateblue",
    alpha = 0.8,
    size = 2
  ) +
  geom_point(
    data = pred2,
    mapping = aes(x1, x2),
    col = "skyblue2",
    alpha = 0.8,
    size = 2
  ) 
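Since the fitted logistic model is linear in x1 and x2, its 0.5-probability decision boundary is the line b0 + b1*x1 + b2*x2 = 0. A minimal sketch that overlays this line on the predicted class labels (reusing glm.fit and glm.preds from above) is:

# decision boundary of the logistic fit: x2 = -(b0 + b1*x1)/b2
b <- coef(glm.fit)
ggplot(df, aes(x1, x2)) +
  geom_point(aes(col = factor(glm.preds)), alpha = 0.8, size = 2) +
  geom_abline(intercept = -b[1] / b[3], slope = -b[2] / b[3], linetype = "dashed") +
  labs(x = "x1", y = "x2", col = "predicted class")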

(e) Now fit a logistic regression model to the data using non-linear functions of X1 and X2 as predictors (e.g. X1^2, X1×X2, log(X2), and so forth).

glm.fit2 = glm(y ~ I(x1^2) + I(x2^2) + I(x1^3) + I(x2^3) + (x1 * x2), family = "binomial", data=df)
summary(glm.fit2)
## 
## Call:
## glm(formula = y ~ I(x1^2) + I(x2^2) + I(x1^3) + I(x2^3) + (x1 * 
##     x2), family = "binomial", data = df)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.001709   0.000000   0.000000   0.000000   0.008549  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)      3.485    594.422   0.006    0.995
## I(x1^2)      24078.366 554808.978   0.043    0.965
## I(x2^2)     -24657.267 565474.427  -0.044    0.965
## I(x1^3)      -2910.808 178065.069  -0.016    0.987
## I(x2^3)       2033.627 144241.483   0.014    0.989
## x1             175.599  21641.734   0.008    0.994
## x2             -72.037  14395.457  -0.005    0.996
## x1:x2          727.445  37194.532   0.020    0.984
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.9315e+02  on 499  degrees of freedom
## Residual deviance: 8.0412e-05  on 492  degrees of freedom
## AIC: 16
## 
## Number of Fisher Scoring iterations: 25

(f) Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be obviously non-linear. If it is not, then repeat (a)-(e) until you come up with an example in which the predicted class labels are obviously non-linear.

glm.probs2 = predict(glm.fit2, newdata = df, type = "response")
glm.preds2 = ifelse(glm.probs2 > 0.5, 1, 0)

pred3 = df[glm.preds2 == 1, ]
pred4 = df[glm.preds2 == 0, ]

ggplot() +
  geom_point(
    data = pred3,
    mapping = aes(x1, x2),
    col = "yellowgreen",
    alpha = 0.8,
    size = 2
  ) +
  geom_point(
    data = pred4,
    mapping = aes(x1, x2),
    col = "turquoise4",
    alpha = 0.5,
    size = 2
  )

(g) Fit a support vector classifier to the data with X1 and X2 as predictors. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.

library(e1071)

svm.fit <- svm(as.factor(y)~ ., data=df, kernel ="linear", cost=0.1)

summary(svm.fit)
## 
## Call:
## svm(formula = as.factor(y) ~ ., data = df, kernel = "linear", cost = 0.1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
## 
## Number of Support Vectors:  493
## 
##  ( 246 247 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svm.preds = predict(svm.fit, df)

pred5 = df[svm.preds == 1, ]
pred6 = df[svm.preds == 0, ]

ggplot() +
  geom_point(
    data = pred5,
    mapping = aes(x1, x2),
    col = "skyblue" ,
    size = 2, alpha=0.8
  ) +
  geom_point(
    data = pred6,
    mapping = aes(x1, x2),
    col = "orange2",
    size = 2, alpha=0.6
  ) 

plot(svm.fit, df)

(h) Fit an SVM using a non-linear kernel to the data. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.

svm.fit2 <- svm(as.factor(y) ~ x1 + x2, data = df, kernel = "radial", cost = 0.1)
summary(svm.fit2)
## 
## Call:
## svm(formula = as.factor(y) ~ x1 + x2, data = df, kernel = "radial", 
##     cost = 0.1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.1 
## 
## Number of Support Vectors:  320
## 
##  ( 160 160 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svm.preds2 = predict(svm.fit2, df)

pred7 = df[svm.preds2 == 1, ]
pred8 = df[svm.preds2 == 0, ]

ggplot() +
  geom_point(
    data = pred7,
    mapping = aes(x1, x2),
    col = "skyblue" ,
    size = 2, alpha=0.8
  ) +
  geom_point(
    data = pred8,
    mapping = aes(x1, x2),
    col = "orange2",
    size = 2, alpha=0.6
  ) 

plot(svm.fit2, df)

Q.7

(a) Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.

library(ISLR)
data("Auto")

mpg01 = ifelse(Auto$mpg > median(Auto$mpg), 1,0)
Auto01 = data.frame(Auto[-c(1,9)], mpg01)

(b) Fit a support vector classifier to the data with various values of cost, in order to predict whether a car gets high or low gas mileage. Report the cross-validation errors associated with different values of this parameter. Comment on your results.

set.seed(1)
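# note: mpg01 is numeric here, so tune()/svm() perform eps-regression and the "error"
# reported below is a mean squared error; wrapping the response in as.factor(mpg01)
# would force C-classification instead (the same applies to the tuning runs in (c))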
tune.lin = tune(svm, mpg01 ~ ., data = Auto01, kernel = "linear",
                ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.lin)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.01
## 
## - best performance: 0.1053223 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.1088870 0.02541796
## 2 1e-02 0.1053223 0.03162078
## 3 1e-01 0.1083165 0.03461157
## 4 1e+00 0.1100350 0.03552713
## 5 5e+00 0.1102002 0.03559065
## 6 1e+01 0.1101804 0.03557982
## 7 1e+02 0.1101708 0.03556061
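The cross-validation results stored in tune.lin can also be inspected graphically; a minimal sketch, using the performances data frame that tune() returns and the ggplot2 package loaded earlier, is:

# plot 10-fold CV error against cost on a log scale
ggplot(tune.lin$performances, aes(x = cost, y = error)) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  labs(x = "cost (log scale)", y = "10-fold CV error")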

Cost = 0.01 results in the lowest cross-validation error rate.

(c) Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of gamma and degree and cost. Comment on your results.

tune.rad <- tune(svm, mpg01 ~ ., data = Auto01, kernel = "radial", ranges = list(cost = c(0.1, 1, 5, 10), gamma = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune.rad)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     1
## 
## - best performance: 0.06031574 
## 
## - Detailed performance results:
##    cost gamma      error  dispersion
## 1   0.1 1e-02 0.09893598 0.036615111
## 2   1.0 1e-02 0.08754791 0.042547338
## 3   5.0 1e-02 0.08480617 0.045493029
## 4  10.0 1e-02 0.08642087 0.046948300
## 5   0.1 1e-01 0.07233360 0.037968503
## 6   1.0 1e-01 0.07477629 0.041412967
## 7   5.0 1e-01 0.07091706 0.037875175
## 8  10.0 1e-01 0.06693812 0.034121153
## 9   0.1 1e+00 0.08114160 0.026627777
## 10  1.0 1e+00 0.06031574 0.028798351
## 11  5.0 1e+00 0.06735518 0.027624894
## 12 10.0 1e+00 0.06986628 0.028134940
## 13  0.1 5e+00 0.31384670 0.049494935
## 14  1.0 5e+00 0.10575613 0.014517033
## 15  5.0 5e+00 0.11102705 0.021541907
## 16 10.0 5e+00 0.11186421 0.022222646
## 17  0.1 1e+01 0.42414373 0.055935817
## 18  1.0 1e+01 0.15301406 0.011420254
## 19  5.0 1e+01 0.15472405 0.015294250
## 20 10.0 1e+01 0.15472408 0.015294221
## 21  0.1 1e+02 0.49304390 0.052133504
## 22  1.0 1e+02 0.24947282 0.002756306
## 23  5.0 1e+02 0.24947267 0.002755672
## 24 10.0 1e+02 0.24947267 0.002755672

Cost = 1 and gamma = 1 result in the lowest cross-validation error rate for the radial kernel.

tune.poly <- tune(svm, mpg01 ~ ., data = Auto01, kernel = "polynomial", ranges = list(cost = c(0.1, 1, 5, 10), degree = c(2, 3, 4)))
summary(tune.poly)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree
##    10      3
## 
## - best performance: 0.1113738 
## 
## - Detailed performance results:
##    cost degree     error dispersion
## 1   0.1      2 0.2269920 0.03627644
## 2   1.0      2 0.1651022 0.02910030
## 3   5.0      2 0.1565969 0.04467648
## 4  10.0      2 0.1592405 0.04433953
## 5   0.1      3 0.1376712 0.03070863
## 6   1.0      3 0.1228948 0.03188548
## 7   5.0      3 0.1130214 0.02872703
## 8  10.0      3 0.1113738 0.02617738
## 9   0.1      4 0.1943853 0.04498174
## 10  1.0      4 0.1546828 0.03542809
## 11  5.0      4 0.1626235 0.06784320
## 12 10.0      4 0.1665109 0.08650442

Cost = 10 and degree = 3 result in the lowest cross-validation error rate for the polynomial kernel.
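To compare the three kernels directly, the best cross-validated error of each tuning run can be collected from the tune objects (these are the values already reported above):

# best 10-fold CV error for each kernel
c(linear = tune.lin$best.performance,
  radial = tune.rad$best.performance,
  polynomial = tune.poly$best.performance)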

(d) Make some plots to back up your assertions in (b) and (c). Hint: In the lab, we used the plot() function for svm objects only in cases with p = 2. When p > 2, you can use the plot() function to create plots displaying pairs of variables at a time.

svm.lin = svm(as.factor(mpg01) ~ ., data = Auto01, kernel = "linear", cost = 0.01)
svm.poly = svm(as.factor(mpg01) ~ ., data = Auto01, kernel = "polynomial", cost = 10, 
    degree = 3)
svm.rad = svm(as.factor(mpg01) ~ ., data = Auto01, kernel = "radial", cost = 1, gamma = 1)
Auto02 = data.frame(Auto01, Auto[c(1)])

# predictor means, used for slicing in the plots below
m.acc <- mean(Auto$acceleration)
m.wt <- mean(Auto$weight)
m.hp <- mean(Auto$horsepower)
m.dp <- mean(Auto$displacement)
m.mpg <- mean(Auto$mpg)

Plots:
# linear
plot(
  svm.lin,
  Auto02,
  mpg ~ acceleration,
  main = "Linear: mpg~acceleration",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.lin,
  Auto02,
  mpg ~ displacement,
  main = "Linear: mpg~displacement",
  slice = list(acceleration = m.acc, horsepower = m.hp)
)

plot(
  svm.lin,
  Auto02,
  mpg ~ horsepower,
  main = "Linear: mpg~horsepower",
  slice = list(acceleration = m.acc, displacement = m.dp)
)

plot(
  svm.lin,
  Auto02,
  mpg ~ weight,
  main = "Linear: mpg~weight",
  slice = list(acceleration = m.acc, displacement = m.dp)
)

plot(
  svm.lin,
  Auto02,
  mpg ~ cylinders,
  main = "Linear: mpg~cylinders",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.lin,
  Auto02,
  mpg ~ origin,
  main = "Linear: mpg~origin",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.lin,
  Auto02,
  mpg ~ year,
  main = "Linear: mpg~year",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

# Radial plots
plot(
  svm.rad,
  Auto02,
  mpg ~ acceleration,
  main = "Radial: mpg~acceleration",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.rad,
  Auto02,
  mpg ~ displacement,
  main = "Radial: mpg~displacement",
  slice = list(acceleration = m.acc, horsepower = m.hp)
)

plot(
  svm.rad,
  Auto02,
  mpg ~ horsepower,
  main = "Radial: mpg~horsepower",
  slice = list(acceleration = m.acc, displacement = m.dp)
)

plot(
  svm.rad,
  Auto02,
  mpg ~ weight,
  main = "Radial: mpg~weight",
  slice = list(acceleration = m.acc, displacement = m.dp)
)

plot(
  svm.rad,
  Auto02,
  mpg ~ cylinders,
  main = "Linear: mpg~cylinders",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.rad,
  Auto02,
  mpg ~ origin,
  main = "Linear: mpg~origin",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.rad,
  Auto02,
  mpg ~ year,
  main = "Linear: mpg~year",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

# Polynomial
plot(
  svm.poly,
  Auto02,
  mpg ~ acceleration,
  main = "Polynomial: mpg~acceleration",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.poly,
  Auto02,
  mpg ~ displacement,
  main = "Polynomial: mpg~displacement",
  slice = list(acceleration = m.acc, horsepower = m.hp)
)

plot(
  svm.poly,
  Auto02,
  mpg ~ horsepower,
  main = "Polynomial: mpg~horsepower",
  slice = list(acceleration = m.acc, displacement = m.dp)
)

plot(
  svm.poly,
  Auto02,
  mpg ~ weight,
  main = "Polynomial: mpg~weight",
  slice = list(acceleration = m.acc, displacement = m.dp)
)

plot(
  svm.poly,
  Auto02,
  mpg ~ cylinders,
  main = "Linear: mpg~cylinders",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.poly,
  Auto02,
  mpg ~ origin,
  main = "Linear: mpg~origin",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

plot(
  svm.poly,
  Auto02,
  mpg ~ year,
  main = "Linear: mpg~year",
  slice = list(displacement = m.dp, horsepower = m.hp)
)

Q.8

library(ISLR)
data("OJ")

oj = OJ

(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.

# no seed is set before sampling, so the split (and the error rates below) will
# differ from run to run
inTrain = sample(1070, 800)
train = oj[inTrain, ]
test = oj[-inTrain, ]

(b) Fit a support vector classifier to the training data using cost=0.01, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics, and describe the results obtained.

svm.lin2 = svm(Purchase ~ ., kernel = "linear", data = train, cost = 0.01)
summary(svm.lin2)
## 
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "linear", cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  426
## 
##  ( 213 213 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

(c) What are the training and test error rates?

test.pred1=predict(svm.lin2, test)
train.pred1=predict(svm.lin2, train)

TrainErr = mean(train.pred1 != train$Purchase)
TrainErr
## [1] 0.16625
TestErr = mean(test.pred1 != test$Purchase)
TestErr
## [1] 0.1851852
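A confusion matrix gives a slightly more detailed picture than the overall error rate (the counts depend on the random split and are not shown here):

# cross-tabulate predicted vs. actual Purchase on the test set
table(predicted = test.pred1, actual = test$Purchase)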

(d) Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10.

set.seed(1)
tune.lin2 = tune(svm, Purchase~., data=train, kernel ="linear", ranges =list(cost=seq(0.01, 10, 0.333)))

op = tune.lin2$best.parameters$cost
op
## [1] 1.009

(e) Compute the training and test error rates using this new value for cost.

svm.lin4 = svm(Purchase ~ ., kernel = "linear", data = train, cost = op)

test.pred2=predict(svm.lin4, test)
train.pred2=predict(svm.lin4, train)

TrainErr = mean(train.pred2 != train$Purchase)
TrainErr
## [1] 0.16375
TestErr = mean(test.pred2 != test$Purchase)
TestErr
## [1] 0.1814815

(f) Repeat parts (b) through (e) using a support vector machine with a radial kernel. Use the default value for gamma.

svm.rad2 = svm(Purchase ~ ., kernel = "radial", data = train, cost = 0.01)

test.pred3=predict(svm.rad2, test)
train.pred3=predict(svm.rad2, train)

TrainErr = mean(train.pred3 != train$Purchase)
TrainErr
## [1] 0.38125
TestErr = mean(test.pred3 != test$Purchase)
TestErr
## [1] 0.4148148
Accuracy = 1 - TestErr
Accuracy
## [1] 0.5851852
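# note: the cost below is tuned with kernel = "linear"; tuning with kernel = "radial"
# instead would be more consistent with the radial SVM that is then refit with it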
tune.lin3 = tune(svm, Purchase ~ ., data = train, kernel = "linear", ranges = list(cost = seq(0.01, 10, 0.333)))

op = tune.lin3$best.parameters$cost

svm.rad3 = svm(Purchase ~ ., kernel = "radial", data = train, cost = op)

train.pred4 = predict(svm.rad3, train)
test.pred4 = predict(svm.rad3, test)

TestErr = mean(test.pred4 != test$Purchase)
TestErr
## [1] 0.2
TrainErr = mean(train.pred4 != train$Purchase)
TrainErr
## [1] 0.14625

(g) Repeat parts (b) through (e) using a support vector machine with a polynomial kernel. Set degree=2.

svm.poly2 = svm(Purchase ~ ., kernel = "polynomial", data = train, cost = 0.01, degree=2)

train.pred5=predict(svm.poly2, train)
test.pred5=predict(svm.poly2, test)

TestErr = mean(test.pred5 != test$Purchase)
TestErr
## [1] 0.4074074
TrainErr = mean(train.pred5 != train$Purchase)
TrainErr
## [1] 0.35625
tune.poly2 = tune(svm, Purchase ~ ., data = train, kernel = "polynomial", ranges = list(cost = seq(0.01, 10, 0.333)), degree = 2)

op = tune.poly2$best.parameters$cost

svm.poly3 = svm(Purchase ~ ., kernel = "polynomial", data = train, cost = op, degree = 2)

train.pred6 = predict(svm.poly3, train)
test.pred6 = predict(svm.poly3, test)

TrainErr = mean(train.pred6 != train$Purchase)
TrainErr
## [1] 0.13625
TestErr = mean(test.pred6 != test$Purchase)
TestErr
## [1] 0.2296296
Accuracy = 1 - TestErr
Accuracy
## [1] 0.7703704

(h) Overall, which approach seems to give the best results on this data?

# Radial Model
TestErr = mean(test.pred4 != test$Purchase)
TestErr
## [1] 0.2
TrainErr = mean(train.pred4 != train$Purchase)
TrainErr
## [1] 0.14625
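For a side-by-side comparison, the test error rates of the three tuned models from parts (e)-(g) can be collected into a single vector (reusing the prediction objects computed above):

# test error of the tuned linear, radial, and polynomial SVMs
c(linear = mean(test.pred2 != test$Purchase),
  radial = mean(test.pred4 != test$Purchase),
  polynomial = mean(test.pred6 != test$Purchase))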

Comparing the test error rates of the tuned models, the support vector classifier with a linear kernel actually performs best on this train/test split (test error about 0.18), followed by the radial kernel (0.20) and the polynomial kernel (0.23). The radial and polynomial SVMs achieve lower training error but generalize slightly worse here, so the linear kernel gives the best results on this data.