R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Question 5

We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.

library(ISLR)
library(tree)
library(rpart)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(1)
x1 = runif(500) - 0.5   # 500 points, uniform on [-0.5, 0.5]
x2 = runif(500) - 0.5
y = 1 * (x1^2 - x2^2 > 0)   # class 1 when x1^2 > x2^2, giving a non-linear class boundary

Plotting the observations coloured by class shows that the true boundary between the two classes is clearly non-linear.

plot(x1[y == 0], x2[y == 0], col = "red", xlab = "X1", ylab = "X2")
points(x1[y == 1], x2[y == 1], col = "blue")

df = data.frame(x1 = x1, x2 = x2, y = as.factor(y))
glm_fit = glm(y ~ ., data = df, family = "binomial")

glm_prob = predict(glm_fit, newdata = df, type = "response")
glm_pred = ifelse(glm_prob > 0.5, 1, 0)
ggplot(data = df, mapping = aes(x1, x2)) +
  geom_point(mapping = aes(colour = as.factor(glm_pred)))
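
With only linear terms in the model, logistic regression can only produce a linear decision boundary, so the predicted classes split the plane along a straight line rather than along the true non-linear boundary.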

glm_fit_2 = glm(y ~ poly(x1, 2) + poly(x2, 2), data = df, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_prob_2 = predict(glm_fit_2, newdata = df, type = "response")
glm_pred_2 = ifelse(glm_prob_2 > 0.5, 1, 0)
ggplot(data = df, mapping = aes(x1, x2)) +
  geom_point(mapping = aes(colour = as.factor(glm_pred_2)))
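
The warnings occur because y is a deterministic function of x1^2 - x2^2: once quadratic terms are available the classes are perfectly separable, so the fitted probabilities are pushed to 0 or 1 and the coefficient estimates diverge. The predicted classes nonetheless trace out a non-linear boundary close to the true one.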

library(e1071)
svm_lin = svm(y ~ ., data = df, kernel = "linear", cost = 0.01)
plot(svm_lin, df)

svm_rad = svm(y ~ ., data = df, kernel = "radial", gamma = 1)
plot(svm_rad, df)
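
As the plots suggest, the linear kernel cannot capture the non-linear class boundary, whereas the radial kernel recovers it well.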

Question 7

attach(Auto)
## The following object is masked from package:lubridate:
## 
##     origin
## The following object is masked from package:ggplot2:
## 
##     mpg
mpg_med = median(Auto$mpg)
Auto$mpglevel = as.factor(ifelse(Auto$mpg > mpg_med, 1, 0))   # 1 = above-median mileage

library(e1071)
set.seed(1)
tune_out = tune(svm, mpg ~ ., data = Auto, kernel = "linear",
    ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 8.981009 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-02 10.305990   5.295587
## 2 1e-01  8.981009   4.750742
## 3 1e+00  9.647184   4.313908
## 4 5e+00 10.149220   4.755080
## 5 1e+01 10.306219   4.953047
## 6 1e+02 10.684083   5.080506
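
Note that the formula mpg ~ . makes this (and the two tuning calls that follow) a support vector regression on the continuous mpg variable, so the reported errors are cross-validated mean squared errors, and the derived mpglevel factor is included among the predictors. Since the question concerns classifying high versus low mileage, a minimal sketch of the corresponding classification tuning is shown below (tune_class is a new object name; mpg and the car-name factor are excluded because mpglevel is derived from mpg):

set.seed(1)
# tune a support vector classifier on the binary mileage label instead;
# drop mpg (mpglevel is constructed from it) and the high-cardinality name factor
tune_class = tune(svm, mpglevel ~ . - mpg - name, data = Auto, kernel = "linear",
    ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune_class)
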
set.seed(2)
tune_out = tune(svm, mpg ~ ., data = Auto, kernel = "polynomial",
    ranges = list(cost = c(0.1, 1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree
##    10      2
## 
## - best performance: 50.95606 
## 
## - Detailed performance results:
##    cost degree    error dispersion
## 1   0.1      2 61.59446   13.60292
## 2   1.0      2 60.15304   13.79293
## 3   5.0      2 55.06386   15.19391
## 4  10.0      2 50.95606   15.72388
## 5   0.1      3 61.71831   13.56940
## 6   1.0      3 61.39833   13.54758
## 7   5.0      3 59.99304   13.43208
## 8  10.0      3 58.28857   13.27760
## 9   0.1      4 61.75343   13.57197
## 10  1.0      4 61.74822   13.57317
## 11  5.0      4 61.72510   13.57851
## 12 10.0      4 61.69626   13.58520
set.seed(33)
tune_out = tune(svm, mpg ~ ., data = Auto, kernel = "radial",
    ranges = list(cost = c(0.1, 1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree
##    10      2
## 
## - best performance: 7.294024 
## 
## - Detailed performance results:
##    cost degree     error dispersion
## 1   0.1      2 22.211472   8.440215
## 2   1.0      2 10.118616   4.286046
## 3   5.0      2  7.649506   2.993534
## 4  10.0      2  7.294024   2.818545
## 5   0.1      3 22.211472   8.440215
## 6   1.0      3 10.118616   4.286046
## 7   5.0      3  7.649506   2.993534
## 8  10.0      3  7.294024   2.818545
## 9   0.1      4 22.211472   8.440215
## 10  1.0      4 10.118616   4.286046
## 11  5.0      4  7.649506   2.993534
## 12 10.0      4  7.294024   2.818545
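
The degree parameter applies only to polynomial kernels and is ignored by the radial kernel, which is why each cost value shows identical error across the three degree settings above. For a radial kernel the relevant tuning parameter is gamma; a minimal sketch of that search (tune_rad is a new object name and the gamma grid is an illustrative assumption):

set.seed(33)
# for the radial kernel, tune cost together with gamma rather than degree
tune_rad = tune(svm, mpg ~ ., data = Auto, kernel = "radial",
    ranges = list(cost = c(0.1, 1, 5, 10), gamma = c(0.01, 0.1, 1)))
summary(tune_rad)
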
svm_lin = svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm_poly = svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 10, 
    degree = 2)
svm_radial = svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10, gamma = 0.01)
plotpairs = function(fit) {
    # plot the fitted SVM in the (mpg, predictor) plane for every predictor
    # other than mpg itself, the derived label, and the car name
    for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
        plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
    }
}
plotpairs(svm_lin)

plotpairs(svm_poly)

plotpairs(svm_radial)

Question 8

attach(OJ)
set.seed(1)
data_Train = sample(nrow(OJ), 800)   # 800 of the 1070 observations for training
oj_train = OJ[data_Train, ]
oj_test = OJ[-data_Train, ]          # remaining 270 observations form the test set

svc = svm(Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
summary(svc)
## 
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  435
## 
##  ( 219 216 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
pred_train = predict(svc, oj_train)
table(oj_train$Purchase, pred_train)
##     pred_train
##       CH  MM
##   CH 420  65
##   MM  75 240
pred_test = predict(svc, oj_test)
table(oj_test$Purchase, pred_test)
##     pred_test
##       CH  MM
##   CH 153  15
##   MM  33  69
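
With cost = 0.01, the training error rate is (65 + 75)/800 = 17.5% and the test error rate is (15 + 33)/270 ≈ 17.8%.
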
set.seed(1)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "linear",
    ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(tune_svc)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.1725 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.17625 0.02853482
## 2  0.10 0.17250 0.03162278
## 3  1.00 0.17500 0.02946278
## 4 10.00 0.17375 0.03197764
svm_lin_1 = svm(Purchase ~ ., kernel = "linear", data = oj_train, cost = tune_svc$best.parameters$cost)
pred_train_1 = predict(svm_lin_1, oj_train)
table(oj_train$Purchase, pred_train_1)
##     pred_train_1
##       CH  MM
##   CH 423  62
##   MM  69 246
test_pred_1 = predict(svm_lin_1, oj_test)
table(oj_test$Purchase, test_pred_1)
##     test_pred_1
##       CH  MM
##   CH 156  12
##   MM  28  74
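
Refitting with the cross-validated cost lowers the training error rate to (62 + 69)/800 ≈ 16.4% and the test error rate to (12 + 28)/270 ≈ 14.8%.
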
set.seed(1)
svm_rad_1 = svm(Purchase ~ ., data = oj_train, kernel = "radial")
summary(svm_rad_1)
## 
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  373
## 
##  ( 188 185 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
pred_train_2 = predict(svm_rad_1, oj_train)
table(oj_train$Purchase, pred_train_2)
##     pred_train_2
##       CH  MM
##   CH 441  44
##   MM  77 238
test_pred_2 = predict(svm_rad_1, oj_test)
table(oj_test$Purchase, test_pred_2)
##     test_pred_2
##       CH  MM
##   CH 151  17
##   MM  33  69
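
With the default cost of 1, the radial kernel yields a training error rate of (44 + 77)/800 ≈ 15.1% and a test error rate of (17 + 33)/270 ≈ 18.5%.
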
svm_rad_1 = svm(Purchase ~ ., data = oj_train, kernel = "radial", cost = tune_svc$best.parameters$cost)   # reuse the best cost from the linear-kernel CV
pred_train_3 = predict(svm_rad_1, oj_train)
table(oj_train$Purchase, pred_train_3)
test_pred_3 = predict(svm_rad_1, oj_test)
table(oj_test$Purchase, test_pred_3)
##     test_pred_3
##       CH  MM
##   CH 150  18
##   MM  37  65
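
With the borrowed cost of 0.1 the radial kernel's test error rate rises to (18 + 37)/270 ≈ 20.4%, so the default cost actually does better on this split.
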
svm_pol_1 = svm(Purchase ~ ., kernel = "poly", data = oj_train, degree = 2)
summary(svm_pol_1)
## 
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "poly", degree = 2)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  447
## 
##  ( 225 222 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
pred_train_2 = predict(svm_pol_1, oj_train)
table(oj_train$Purchase, pred_train_2)
##     pred_train_2
##       CH  MM
##   CH 449  36
##   MM 110 205
test_pred_3 = predict(svm_pol_1, oj_test)
table(oj_test$Purchase, test_pred_3)
##     test_pred_3
##       CH  MM
##   CH 153  15
##   MM  45  57
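
The degree-2 polynomial kernel does worst so far: a training error rate of (36 + 110)/800 ≈ 18.3% and a test error rate of (15 + 45)/270 ≈ 22.2%.
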
set.seed(1)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "poly", degree = 2,
    ranges = list(cost = 10^seq(-2, 1, by = 0.25)))
summary(tune_svc) 
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##      cost
##  3.162278
## 
## - best performance: 0.1775 
## 
## - Detailed performance results:
##           cost   error dispersion
## 1   0.01000000 0.39125 0.04210189
## 2   0.01778279 0.37125 0.03537988
## 3   0.03162278 0.36500 0.03476109
## 4   0.05623413 0.33750 0.04714045
## 5   0.10000000 0.32125 0.05001736
## 6   0.17782794 0.24500 0.04758034
## 7   0.31622777 0.19875 0.03972562
## 8   0.56234133 0.20500 0.03961621
## 9   1.00000000 0.20250 0.04116363
## 10  1.77827941 0.18500 0.04199868
## 11  3.16227766 0.17750 0.03670453
## 12  5.62341325 0.18375 0.03064696
## 13 10.00000000 0.18125 0.02779513
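
The best cross-validated cost for the polynomial kernel is about 3.16, with a CV error rate of 17.75%. A minimal sketch of refitting at that cost and checking the test predictions (svm_pol_2 and test_pred_4 are new names; output not shown):

svm_pol_2 = svm(Purchase ~ ., data = oj_train, kernel = "poly", degree = 2,
    cost = tune_svc$best.parameters$cost)
test_pred_4 = predict(svm_pol_2, oj_test)
table(oj_test$Purchase, test_pred_4)

Comparing test error rates across the kernels tried here (tuned linear ≈ 14.8%, radial ≈ 18.5–20.4%, polynomial ≈ 22.2%), the tuned linear support vector classifier performs best on the OJ data for this train/test split.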