This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
library(ISLR)
library(tree)
library(rpart)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(1)
x1 = runif(500) - 0.5
x2 = runif(500) - 0.5
y=1*(x1^2-x2^2>0)
The plot below shows the simulated observations coloured by class: red points have y = 0 and blue points have y = 1.
plot(x1[y==0],x2[y==0],col="red",xlab="X1",ylab="X2")
points(x1[y==1],x2[y==1],col="blue")
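Since y = 1 exactly when x1^2 > x2^2, the true class boundary is the pair of diagonal lines x2 = x1 and x2 = -x1. If desired, they can be overlaid on the plot (a small addition, not part of the original figure; it must be run in the same chunk as the plot above so it draws on the same device):
abline(a = 0, b = 1, lty = 2)   # x2 = x1
abline(a = 0, b = -1, lty = 2)  # x2 = -x1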
df=data.frame(x1 = x1, x2 = x2, y = as.factor(y))
glm_fit=glm(y~.,data=df, family='binomial')
glm_prob=predict(glm_fit,newdata=df,type='response')
glm_pred=ifelse(glm_prob>0.5,1,0)
ggplot(df, aes(x1, x2)) +
  geom_point(aes(colour = as.factor(glm_pred)))
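Because this model uses only linear terms in x1 and x2, its decision boundary is a straight line, which cannot follow the quadratic boundary that actually separates the classes. The training error rate gives a quick numeric summary of the misfit (a minimal sketch; the exact value depends on the simulated data):
# Fraction of training observations misclassified by the linear logistic fit
mean(glm_pred != y)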
glm_fit_2=glm(y~poly(x1,2)+poly(x2,2),data=df,family='binomial')
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_prob_2=predict(glm_fit_2,newdata=df,type='response')
glm_pred_2=ifelse(glm_prob_2>0.5,1,0)
ggplot(data = df, mapping = aes(x1, x2)) +
geom_point(data = df, mapping = aes(colour = glm_pred_2))
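The glm.fit warnings above are a symptom of (near-)perfect separation: with quadratic terms the model can reproduce the true boundary x1^2 - x2^2 = 0, so the fitted probabilities are pushed towards 0 and 1 and the coefficient estimates do not converge. The predictions themselves remain usable, as a quick cross-tabulation shows (a sketch; the exact counts depend on the simulation):
# Predicted versus true classes for the quadratic logistic fit
table(glm_pred_2, y)
summary(glm_prob_2)  # fitted probabilities pile up near 0 and 1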
library(e1071)
svm_lin=svm(y~.,data=df,kernel='linear',cost=0.01)
plot(svm_lin,df)
svm_rad=svm(y~.,data=df,kernel='radial',gamma=1)
plot(svm_rad,df)
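For comparison with the logistic fits, the training error of each SVM can be computed in the same way (a sketch; the values depend on the simulation and no particular numbers are claimed):
# Training error rates of the linear and radial SVMs on the simulated data
mean(predict(svm_lin, df) != df$y)
mean(predict(svm_rad, df) != df$y)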
attach(Auto)
## The following object is masked from package:lubridate:
##
## origin
## The following object is masked from package:ggplot2:
##
## mpg
mpg_med = median(Auto$mpg)
bin.var = ifelse(Auto$mpg > mpg_med, 1, 0)
Auto$mpglevel = as.factor(bin.var)
library(e1071)
set.seed(1)
tune_out = tune(svm, mpg~., data = Auto, kernel = "linear", ranges = list(cost = c(0.01,
0.1, 1, 5, 10, 100)))
summary(tune_out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 8.981009
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 10.305990 5.295587
## 2 1e-01 8.981009 4.750742
## 3 1e+00 9.647184 4.313908
## 4 5e+00 10.149220 4.755080
## 5 1e+01 10.306219 4.953047
## 6 1e+02 10.684083 5.080506
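A note on this tune: because mpg is numeric, svm() fits a regression here (eps-regression), so the error column above is cross-validated mean squared error for predicting mpg, not a classification error rate; mpglevel, which is derived from mpg, is also among the predictors. To tune the support vector classifier on the binary response instead, one could run something along these lines (a sketch only; tune_cls is a new name, mpg and name are dropped for illustration, and no results are claimed):
set.seed(1)
tune_cls = tune(svm, mpglevel ~ . - mpg - name, data = Auto, kernel = "linear",
                ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune_cls)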
set.seed(2)
tune_out = tune(svm, mpg ~ ., data = Auto, kernel = "polynomial", ranges = list(cost = c(0.1,
1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 50.95606
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 61.59446 13.60292
## 2 1.0 2 60.15304 13.79293
## 3 5.0 2 55.06386 15.19391
## 4 10.0 2 50.95606 15.72388
## 5 0.1 3 61.71831 13.56940
## 6 1.0 3 61.39833 13.54758
## 7 5.0 3 59.99304 13.43208
## 8 10.0 3 58.28857 13.27760
## 9 0.1 4 61.75343 13.57197
## 10 1.0 4 61.74822 13.57317
## 11 5.0 4 61.72510 13.57851
## 12 10.0 4 61.69626 13.58520
set.seed(33)
tune_out = tune(svm, mpg ~ ., data = Auto, kernel = "radial", ranges = list(cost = c(0.1,
1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 7.294024
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 22.211472 8.440215
## 2 1.0 2 10.118616 4.286046
## 3 5.0 2 7.649506 2.993534
## 4 10.0 2 7.294024 2.818545
## 5 0.1 3 22.211472 8.440215
## 6 1.0 3 10.118616 4.286046
## 7 5.0 3 7.649506 2.993534
## 8 10.0 3 7.294024 2.818545
## 9 0.1 4 22.211472 8.440215
## 10 1.0 4 10.118616 4.286046
## 11 5.0 4 7.649506 2.993534
## 12 10.0 4 7.294024 2.818545
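Note that degree has no effect on the radial kernel, which is why the rows above repeat identically across degree for each value of cost; the relevant kernel parameter for the radial basis is gamma. A variant of this tune that actually varies the kernel width might look like the following (a sketch only; tune_rad is a new name and no results are claimed):
set.seed(33)
tune_rad = tune(svm, mpg ~ ., data = Auto, kernel = "radial",
                ranges = list(cost = c(0.1, 1, 5, 10), gamma = c(0.01, 0.1, 1)))
summary(tune_rad)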
svm_lin = svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm_poly = svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 10,
degree = 2)
svm_radial = svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10, gamma = 0.01)
# For each predictor other than mpg, mpglevel, and the car name, plot the fitted
# SVM in the two-dimensional slice spanned by mpg and that predictor.
plotpairs = function(fit) {
  for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
    plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
  }
}
plotpairs(svm_lin)
plotpairs(svm_poly)
plotpairs(svm_radial)
attach(OJ)
set.seed(1)
data_Train = sample(nrow(OJ), 800)
oj_train = OJ[data_Train,]
oj_test = OJ[-data_Train,]
svc=svm(Purchase~.,data=oj_train,kernel='linear',cost=0.01)
summary(svc)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 435
##
## ( 219 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
pred_train = predict(svc, oj_train)
(t<-table(oj_train$Purchase, pred_train))
## pred_train
## CH MM
## CH 420 65
## MM 75 240
pred_test = predict(svc, oj_test)
table(oj_test$Purchase, pred_test)
## pred_test
## CH MM
## CH 153 15
## MM 33 69
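From these two tables the support vector classifier with cost = 0.01 misclassifies (65 + 75)/800 = 17.5% of the training observations and (15 + 33)/270 ≈ 17.8% of the test observations. The same rates can be computed directly from the predictions; the small helper below is not part of the original analysis, just a convenience for the comparisons that follow:
# Misclassification rate: fraction of observations whose predicted class differs from the truth
err_rate = function(truth, pred) mean(truth != pred)
err_rate(oj_train$Purchase, pred_train)  # 140/800 = 0.175
err_rate(oj_test$Purchase, pred_test)    # 48/270, about 0.178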
set.seed(1)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "linear", ranges = list(cost = c(0.01,0.1,1,10)))
summary(tune_svc)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.1725
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17625 0.02853482
## 2 0.10 0.17250 0.03162278
## 3 1.00 0.17500 0.02946278
## 4 10.00 0.17375 0.03197764
svm_lin_1 = svm(Purchase ~ ., kernel = "linear", data = oj_train, cost = tune_svc$best.parameters$cost)
pred_train_1 = predict(svm_lin_1, oj_train)
table(oj_train$Purchase, pred_train_1)
## pred_train_1
## CH MM
## CH 423 62
## MM 69 246
test_pred_1 = predict(svm_lin_1, oj_test)
(t<-table(oj_test$Purchase, test_pred_1))
## test_pred_1
## CH MM
## CH 156 12
## MM 28 74
set.seed(1)
svm_rad_1 = svm(Purchase ~ ., data = oj_train, kernel = "radial")
summary(svm_rad_1)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 373
##
## ( 188 185 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
pred_train_2 = predict(svm_rad_1, oj_train)
table(oj_train$Purchase, pred_train_2)
## pred_train_2
## CH MM
## CH 441 44
## MM 77 238
test_pred_2 = predict(svm_rad_1, oj_test)
table(oj_test$Purchase, test_pred_2)
## test_pred_2
## CH MM
## CH 151 17
## MM 33 69
svm_rad_1 = svm(Purchase ~ ., data = oj_train, kernel = "radial", cost = tune_svc$best.parameters$cost)
pred_train = predict(svm_rad_1, oj_train)
table(oj_train$Purchase, pred_train)
## pred_train_1
## CH MM
## CH 423 62
## MM 69 246
test_pred_3 = predict(svm_rad_1, oj_test)
(t<-table(oj_test$Purchase, test_pred_3))
## test_pred_3
## CH MM
## CH 150 18
## MM 37 65
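Note that the cost used for this radial fit is the one selected by the linear-kernel tune (tune_svc) above; the radial kernel itself was not tuned. An analogous cross-validation over cost for the radial kernel could be run as follows (a sketch only; tune_rad_oj is a new name and no results are claimed):
set.seed(1)
tune_rad_oj = tune(svm, Purchase ~ ., data = oj_train, kernel = "radial",
                   ranges = list(cost = 10^seq(-2, 1, by = 0.25)))
summary(tune_rad_oj)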
svm_pol_1 = svm(Purchase ~ ., kernel = "poly", data = oj_train, degree=2)
summary(svm_pol_1)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "poly", degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 447
##
## ( 225 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
pred_train_2 = predict(svm_pol_1, oj_train)
(t<-table(oj_train$Purchase, pred_train_2))
## pred_train_2
## CH MM
## CH 449 36
## MM 110 205
test_pred_3 = predict(svm_pol_1, oj_test)
(t<-table(oj_test$Purchase, test_pred_3))
## test_pred_3
## CH MM
## CH 153 15
## MM 45 57
set.seed(1)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "poly", degree = 2, ranges = list(cost = 10^seq(-2, 1, by = 0.25)))
summary(tune_svc)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 3.162278
##
## - best performance: 0.1775
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.39125 0.04210189
## 2 0.01778279 0.37125 0.03537988
## 3 0.03162278 0.36500 0.03476109
## 4 0.05623413 0.33750 0.04714045
## 5 0.10000000 0.32125 0.05001736
## 6 0.17782794 0.24500 0.04758034
## 7 0.31622777 0.19875 0.03972562
## 8 0.56234133 0.20500 0.03961621
## 9 1.00000000 0.20250 0.04116363
## 10 1.77827941 0.18500 0.04199868
## 11 3.16227766 0.17750 0.03670453
## 12 5.62341325 0.18375 0.03064696
## 13 10.00000000 0.18125 0.02779513
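The tune selects cost ≈ 3.16 for the degree-2 polynomial kernel. A natural follow-up, not included in the output above, is to refit at that cost and compare the resulting test error with the linear and radial fits (a sketch; svm_pol_2 is a new name and no results are claimed):
svm_pol_2 = svm(Purchase ~ ., data = oj_train, kernel = "polynomial", degree = 2,
                cost = tune_svc$best.parameters$cost)
mean(oj_test$Purchase != predict(svm_pol_2, oj_test))  # test misclassification rate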