##5. We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
x1=runif (500) -0.5
x2=runif (500) -0.5
y=1*(x1^2-x2^2 > 0)
plot(x1, x2, col=(4-y))
model.log <- glm(y~ x1 + x2, family = "binomial")
summary(model.log)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4377 -1.1516 -0.9283 1.1725 1.4544
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.02589 0.09032 -0.287 0.7744
## x1 0.70982 0.32517 2.183 0.0290 *
## x2 0.69319 0.32154 2.156 0.0311 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 693.08 on 499 degrees of freedom
## Residual deviance: 684.35 on 497 degrees of freedom
## AIC: 690.35
##
## Number of Fisher Scoring iterations: 4
data5 <- data.frame(x1,x2, y=as.factor(y))
probs <- predict(model.log, data5, type = 'response')
preds <- ifelse(probs >= 0.5, 1,0)
plot(x1, x2, col=(4-preds))
x3 <- as.numeric(I(x1^2))
x4 <- as.numeric(I(x2^2))
x5 <- as.numeric(I(x1 * x2))
#x6 <- as.numeric(I(log(x2)))
data5e <-data.frame(x1,x2,x3,x4,x5, y)
model.log2 <- glm(y~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2), family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#model.log2 <- glm(y~., data = data5e, family = "binomial")
y_class <- predict(model.log2, data5e, type = 'response')
preds2 <- ifelse(y_class >= 0.5, 1,0)
plot(x1, x2, col=(4-preds2))
grid()
library(e1071)
model.svm <- svm(y~., data=data5, kernel = "linear", cost = 10, scale = FALSE)
plot(model.svm,data5)
library(e1071)
model.svmr <- svm(y~., data=data5, kernel = "radial", gamma = 1, cost = .01)
plot(model.svmr,data5)
##7. In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'purrr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Auto2 = Auto
med_mpg = median(Auto2$mpg)
Auto2 = mutate(Auto2, mpg01 = ifelse(mpg > med_mpg, 1,0))
Auto2$mpg01 = as.factor(Auto2$mpg01)
Auto2$origin = as.factor(Auto2$origin)
Auto2 = subset(Auto2, select = -c(mpg,name))
str(Auto2)
## 'data.frame': 392 obs. of 8 variables:
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ mpg01 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#table(Auto2$mpg01)
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Set up Repeated k-fold Cross Validation
train_control1 <- trainControl(method="repeatedcv", number=10, repeats=3) ##(taking too long with repeats = 3)
Creating SVM model with c held constant at a value of 1: Accuracy = 0.913977
svm7 <- train(mpg01 ~., data = Auto2, method = "svmLinear", trControl = train_control1, preProcess = c ("center","scale"))
svm7
## Support Vector Machines with Linear Kernel
##
## 392 samples
## 7 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (8), scaled (8)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 353, 352, 354, 354, 353, 353, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9113417 0.8226492
##
## Tuning parameter 'C' was held constant at a value of 1
Results saved for svm7 C held constant svmLinear
res1 <- as_tibble(svm7$results[which.min(svm7$results[,2]),])
res1
## # A tibble: 1 x 5
## C Accuracy Kappa AccuracySD KappaSD
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.911 0.823 0.0456 0.0912
Creating SVM model with tuneGrid for “C”: Accuracy = 0.9200675, final value of C = 1.895263 SVM7a is slightly more accurate than SVM7 when using cross-validation for grid values of C
svm7a <- train(mpg01 ~., data = Auto2, method = "svmLinear", trControl = train_control1, preProcess = c ("center","scale"), tuneGrid = expand.grid(C = seq(0.01,2,length = 20)))
svm7a
## Support Vector Machines with Linear Kernel
##
## 392 samples
## 7 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (8), scaled (8)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 352, 354, 353, 353, 352, 352, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.0100000 0.9096727 0.8192702
## 0.1147368 0.9054611 0.8108582
## 0.2194737 0.9037506 0.8074549
## 0.3242105 0.9038147 0.8075664
## 0.4289474 0.9021491 0.8042376
## 0.5336842 0.9055038 0.8109369
## 0.6384211 0.9037731 0.8074967
## 0.7431579 0.9063158 0.8125934
## 0.8478947 0.9089249 0.8177970
## 0.9526316 0.9123673 0.8246989
## 1.0573684 0.9131781 0.8263150
## 1.1621053 0.9106129 0.8211991
## 1.2668421 0.9132006 0.8263745
## 1.3715789 0.9132006 0.8263745
## 1.4763158 0.9123673 0.8247079
## 1.5810526 0.9149100 0.8297687
## 1.6857895 0.9148662 0.8296809
## 1.7905263 0.9157434 0.8314353
## 1.8952632 0.9157434 0.8314353
## 2.0000000 0.9157434 0.8314353
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1.790526.
#svm7a$bestTune
res2 <- as_tibble(svm7a$results[which.min(svm7a$results[,2]),])
res2
## # A tibble: 1 x 5
## C Accuracy Kappa AccuracySD KappaSD
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.429 0.902 0.804 0.0362 0.0723
Radial: I used Caret (based on class R Lab to identify the optimal gamma and C for the radial SVM model) Optimal Gamma: 0.2228925 C: 8
svm7r <- train(mpg01 ~., data = Auto2, method = "svmRadial", trControl = train_control1, preProcess = c ("center","scale"), tuneLength = 10)
svm7r$bestTune
## sigma C
## 7 0.1593645 16
res3 <- as_tibble(svm7r$results[which.min(svm7r$results[,2]),])
res3
## # A tibble: 1 x 6
## sigma C Accuracy Kappa AccuracySD KappaSD
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.159 0.25 0.908 0.816 0.0584 0.116
Polynomial
svm7p <- train(mpg01 ~., data = Auto2, method = "svmPoly", trControl = train_control1, preProcess = c ("center","scale"), tuneLength = 4)
svm7p$bestTune
## degree scale C
## 44 3 0.1 2
res4 <- as_tibble(svm7p$results[which.min(svm7p$results[,2]),])
res4
## # A tibble: 1 x 7
## degree scale C Accuracy Kappa AccuracySD KappaSD
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.001 0.25 0.702 0.411 0.124 0.240
Accuracy results per svm model Comments: Poly took way too long to run with tunelength of 10, so I changed it to 4. Accuracy is fairly close, SVM linear however has the highest accuracy.
df<-tibble(Model=c('SVM Linear','SVM Linear w/ choice of cost','SVM Radial','SVM Poly'),Accuracy=c(svm7$results[2][[1]],res2$Accuracy,res3$Accuracy,res4$Accuracy))
df #%>% arrange(Accuracy)
## # A tibble: 4 x 2
## Model Accuracy
## <chr> <dbl>
## 1 SVM Linear 0.911
## 2 SVM Linear w/ choice of cost 0.902
## 3 SVM Radial 0.908
## 4 SVM Poly 0.702
svm linear with optimal C chosen via Caret
plot(svm7a)
svm radial with optimal parameters chosen via Caret
plot(svm7r)
svm poly with optimal parameters chosen via Caret
plot(svm7p)
library(ISLR)
str(OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
Training and test set
set.seed(22)
train.oj=sample(1:nrow(OJ), 800)
OJ.train = OJ[train.oj,]
OJ.test = OJ[-train.oj,]
svm8 = svm(Purchase~., data = OJ.train, kernel = 'linear', cost = 0.01, scale = FALSE)
summary(svm8)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "linear", cost = 0.01,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 621
##
## ( 310 311 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
svm8trainerror <- mean(predict(svm8, OJ.train) != OJ.train$Purchase)
svm8testerror <- mean(predict(svm8, OJ.test) != OJ.test$Purchase)
error<-tibble(Model=c('Train Error','Test Error'),Error=c(svm8trainerror,svm8testerror))
error
## # A tibble: 2 x 2
## Model Error
## <chr> <dbl>
## 1 Train Error 0.236
## 2 Test Error 0.219
set.seed(22)
tune.out=tune(svm,Purchase~., data = OJ.train, kernel = 'linear',ranges=list(cost=c(0.001, 0.01, 0.1, 1,10)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.18375
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.34500 0.05342440
## 2 1e-02 0.18625 0.03304563
## 3 1e-01 0.18375 0.03866254
## 4 1e+00 0.18375 0.03775377
## 5 1e+01 0.18750 0.03952847
best model for svm with range of C
svm.best <- tune.out$best.model
summary(svm.best)
##
## Call:
## best.tune(method = svm, train.x = Purchase ~ ., data = OJ.train,
## ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
##
## Number of Support Vectors: 365
##
## ( 182 183 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
svm8atrainerror <- mean(predict(svm.best, OJ.train) != OJ.train$Purchase)
svm8atesterror <- mean(predict(svm.best, OJ.test) != OJ.test$Purchase)
error1<-tibble(Model=c('Train Error SVM Best','Test Error SVM Best'),Error=c(svm8atrainerror,svm8atesterror))
error1
## # A tibble: 2 x 2
## Model Error
## <chr> <dbl>
## 1 Train Error SVM Best 0.176
## 2 Test Error SVM Best 0.141
svm8r=svm(Purchase~., data=OJ.train, kernel ="radial", gamma=1, cost=1)
summary(svm8r)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "radial", gamma = 1,
## cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 501
##
## ( 230 271 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
svm8rtrainerror <- mean(predict(svm8r, OJ.train) != OJ.train$Purchase)
svm8rtesterror <- mean(predict(svm8r, OJ.test) != OJ.test$Purchase)
error.r<-tibble(Model=c('Train Error radial','Test Error radial'),Error=c(svm8rtrainerror,svm8rtesterror))
error.r
## # A tibble: 2 x 2
## Model Error
## <chr> <dbl>
## 1 Train Error radial 0.119
## 2 Test Error radial 0.196
using caret for the optimal radial model
svm8ro <- train(Purchase ~., data = OJ.train, method = "svmRadial", trControl = train_control1, preProcess = c ("center","scale"), tuneLength = 10)
svm8ro$bestTune
## sigma C
## 2 0.05532364 0.5
svm8rotrainerror <- mean(predict(svm8ro, OJ.train) != OJ.train$Purchase)
svm8rotesterror <- mean(predict(svm8ro, OJ.test) != OJ.test$Purchase)
error.ro<-tibble(Model=c('Train Error radial','Test Error radial'),Error=c(svm8rotrainerror,svm8rotesterror))
error.ro
## # A tibble: 2 x 2
## Model Error
## <chr> <dbl>
## 1 Train Error radial 0.17
## 2 Test Error radial 0.159
svm8p=svm(Purchase~., data=OJ.train, kernel ="polynomial", degree=2, cost=1)
summary(svm8r)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "radial", gamma = 1,
## cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 501
##
## ( 230 271 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
svm8ptrainerror <- mean(predict(svm8p, OJ.train) != OJ.train$Purchase)
svm8ptesterror <- mean(predict(svm8p, OJ.test) != OJ.test$Purchase)
error.p<-tibble(Model=c('Train Error poly','Test Error poly'),Error=c(svm8ptrainerror,svm8ptesterror))
error.p
## # A tibble: 2 x 2
## Model Error
## <chr> <dbl>
## 1 Train Error poly 0.194
## 2 Test Error poly 0.167
svm8po <- train(Purchase ~., data = OJ.train, method = "svmPoly", trControl = train_control1, preProcess = c ("center","scale"), tuneLength = 4)
svm8potrainerror <- mean(predict(svm8po, OJ.train) != OJ.train$Purchase)
svm8potesterror <- mean(predict(svm8po, OJ.test) != OJ.test$Purchase)
error.po<-tibble(Model=c('Train Error polyo','Test Error polyo'),Error=c(svm8potrainerror,svm8potesterror))
error.po
## # A tibble: 2 x 2
## Model Error
## <chr> <dbl>
## 1 Train Error polyo 0.169
## 2 Test Error polyo 0.144
Poly seems to be the best approach, though the training error for radial (standard) had the best error results but not on the test dataset. Overall, as the data becomes more non-linear, the non-linear kernals performed better than the linear kernal. Tuning the parameters for radial and poly produced the best performance, in conclusion the non-linear kernal models with optimal parameters performed the best