library(glmnet)
library(e1071)
library(ISLR)
library(caret)
# set the seed
set.seed(1)
# Generate a dataset
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- 1 * (x1^2 - x2^2 > 0)
# Plot the observations
plot(x1,x2,col=ifelse(y,'red','blue'),xlab='X1',ylab='X2')
# Create a dataframe
dat = data.frame(x1, x2, y = as.factor(y))
# Fit a logistic regression with x1 and x2 as predictors
glm.fit = glm(y~., data = dat, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = y ~ ., family = "binomial", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.179 -1.139 -1.112 1.206 1.257
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.087260 0.089579 -0.974 0.330
## x1 0.196199 0.316864 0.619 0.536
## x2 -0.002854 0.305712 -0.009 0.993
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.18 on 499 degrees of freedom
## Residual deviance: 691.79 on 497 degrees of freedom
## AIC: 697.79
##
## Number of Fisher Scoring iterations: 3
# Use the predict function to get the predicted probabilities
glm.preds = predict(glm.fit, newdata = dat, type ="response")
# Plot the observations: blue if the predicted probability is at least 0.5, red otherwise
plot(x1,x2,col=ifelse(glm.preds>=0.5,'blue','red'),xlab='X1',ylab='X2')
The plot above shows a linear decision boundary, as expected from a logistic regression with only linear terms.
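For reference, the true boundary x1^2 = x2^2 consists of the two lines x2 = x1 and x2 = -x1; overlaying them on the plot of predicted classes (an illustrative check, not part of the original output) makes the mismatch with the linear fit easy to see.
plot(x1,x2,col=ifelse(glm.preds>=0.5,'blue','red'),xlab='X1',ylab='X2')
# dashed lines mark the true quadratic boundary x2 = x1 and x2 = -x1
abline(a=0,b=1,lty=2)
abline(a=0,b=-1,lty=2)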
glm.fit2 = glm(y~I(x1*x2) + poly(x2,2) + poly(x1,2), data=dat, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
A log transformation of the predictors was not tried because it produces NaNs (x1 and x2 take negative values).
# Use the predict function to get the predicted probabilities
glm.preds2 = predict(glm.fit2, newdata = dat, type ="response")
# Plot the observations: blue if the predicted probability is at least 0.5, red otherwise
plot(x1,x2,col=ifelse(glm.preds2>=0.5,'blue','red'),xlab='X1',ylab='X2')
The predicted classes now closely match the true non-linear decision boundary.
# Support vector classifier (linear kernel) with a small cost
svm.fit=svm(y~.,data=dat,kernel='linear',cost=0.01)
svm.preds=predict(svm.fit,newdata=dat,type='response')
plot(x1,x2,col=ifelse(svm.preds!=0,'blue','red'),xlab='X1',ylab='X2')
This support vector classifier (even with low cost) classifies all points to a single class.
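A quick tabulation of the predicted classes (an illustrative check) confirms this:
# all (or nearly all) 500 predictions should fall into a single class
table(svm.preds)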
# SVM with a non-linear (radial) kernel
svm.fit2=svm(y~.,data=dat,kernel='radial',gamma=1)
svm.preds2=predict(svm.fit2,newdata=dat,type='response')
plot(x1,x2,col=ifelse(svm.preds2!=0,'blue','red'),xlab='X1',ylab='X2')
# Confusion matrix for the non-linear logistic regression
table(y,as.integer(glm.preds2>=.5))
##
## y 0 1
## 0 261 0
## 1 0 239
# Overall accuracy of the non-linear logistic regression
(261+239)/500
## [1] 1
# Note: glm.preds2 != 0 is TRUE for every observation here, so this table only recovers the class counts of y
table(y,as.integer(glm.preds2!=0))
##
## y 1
## 0 261
## 1 239
# Confusion matrix for the radial-kernel SVM
table(y,svm.preds2)
## svm.preds2
## y 0 1
## 0 258 3
## 1 11 228
# Overall accuracy of the radial-kernel SVM
(258+228)/500
## [1] 0.972
We may conclude that the SVM with a non-linear (radial) kernel and the logistic regression with non-linear functions of X1 and X2 both recover the true boundary well, while the SVM with a linear kernel and the logistic regression without non-linear terms perform poorly.
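To back up this comparison, the training accuracy of each fit can be computed directly (an illustrative summary, not part of the original output; the logit_* and svm_* names are just labels):
# training accuracy of each model on the simulated data
c(logit_linear    = mean((glm.preds  >= 0.5) == y),
  logit_nonlinear = mean((glm.preds2 >= 0.5) == y),
  svm_linear      = mean(as.character(svm.preds)  == y),
  svm_radial      = mean(as.character(svm.preds2) == y))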
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
# Create a binary response: 1 if mpg is above its median, 0 otherwise
Auto$mpglevel = as.factor(ifelse(Auto$mpg > median(Auto$mpg), 1, 0))
str(Auto)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpglevel : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
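Before tuning, a quick check (illustrative) that the median split gives roughly balanced classes:
table(Auto$mpglevel)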
set.seed(2)
# Grid of cost values; tune a linear-kernel SVM with 10-fold cross-validation
costlist <- list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100))
svm.tune=tune(svm,mpglevel~.,data=Auto,ranges=costlist,kernel='linear')
summary(svm.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.01012821
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.09679487 0.04882280
## 2 1e-02 0.07641026 0.04620210
## 3 1e-01 0.04076923 0.04206765
## 4 1e+00 0.01012821 0.01780775
## 5 5e+00 0.02032051 0.02336409
## 6 1e+01 0.02288462 0.02807216
## 7 1e+02 0.03314103 0.02424635
svm.tune$best.parameters
## cost
## 4 1
For the linear kernel, cross-validation selects cost = 1, with an error rate of 0.01012821.
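A visual check of the tuning results (not shown in the original output) can be drawn directly from the tune object:
# cross-validation error as a function of cost for the linear kernel
plot(svm.tune)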
gammalist <- c(0.01, 0.1, 1, 5, 10, 100)
degreelist <- c(2,3,4,5)
svm.tunep=tune(svm,mpglevel~.,data=Auto,kernel='polynomial',ranges=costlist,gamma=gammalist,degree=degreelist)
summary(svm.tunep)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 100
##
## - best performance: 0.01525641
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.54083333 0.03127850
## 2 1e-02 0.54083333 0.03127850
## 3 1e-01 0.08903846 0.03954371
## 4 1e+00 0.07128205 0.03497074
## 5 5e+00 0.04570513 0.04554847
## 6 1e+01 0.02544872 0.01178918
## 7 1e+02 0.01525641 0.02138845
svm.tunep$best.parameters
## cost
## 7 100
For the polynomial kernel, the lowest cross-validation error (0.01525641) is obtained at a cost of 100.
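Note that in the call above only cost appears in ranges, so only cost is cross-validated; gamma and degree are passed as plain arguments. A sketch of how all three could be tuned jointly (not run here, and considerably slower because of the larger grid; svm.tunep2 is just an illustrative name):
svm.tunep2 <- tune(svm, mpglevel ~ ., data = Auto, kernel = 'polynomial',
                   ranges = list(cost   = c(0.001, 0.01, 0.1, 1, 5, 10, 100),
                                 gamma  = gammalist,
                                 degree = degreelist))
summary(svm.tunep2)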
svm.tuner=tune(svm,mpglevel~.,data=Auto,ranges=costlist,kernel='radial',gamma=gammalist)
summary(svm.tuner)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 100
##
## - best performance: 0.01525641
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.56121795 0.03931537
## 2 1e-02 0.56121795 0.03931537
## 3 1e-01 0.08929487 0.04053081
## 4 1e+00 0.07397436 0.03703948
## 5 5e+00 0.04839744 0.03496088
## 6 1e+01 0.02288462 0.02505026
## 7 1e+02 0.01525641 0.02138845
svm.tuner$best.parameters
## cost
## 7 100
For the radial kernel, the lowest cross-validation error (0.01525641) is likewise obtained at a cost of 100.
# Helper: plot the fitted SVM's classification regions of mpg against every other predictor
plotpairs = function(Auto.svmfit) {
  for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
    plot(Auto.svmfit, Auto, as.formula(paste("mpg~", name, sep = "")))
  }
}
# Fit the SVMs at the chosen parameter values
svm.linear = svm(mpglevel~., data=Auto, kernel="linear", cost=1)
svm.radial = svm(mpglevel~., data=Auto, kernel="radial", cost=100, gamma=0.01)
svm.polynomial = svm(mpglevel~., data=Auto, kernel="polynomial", cost=100, degree=2)
# Plots for linear SVM
plotpairs(svm.linear)
# Plots for radial SVM with gamma = 0.01
plotpairs(svm.radial)
# Plots for polynomial SVM with degree=2
plotpairs(svm.polynomial)
set.seed(3)
# Split OJ into a training set of ~800 observations and a test set with the rest
oj.intrain <- createDataPartition(OJ$Purchase, p = 0.746, list = FALSE)
oj.train <- OJ[oj.intrain,]
oj.test <- OJ[-oj.intrain,]
dim(oj.train)
## [1] 800 18
oj.svm <- svm(Purchase~., data = oj.train, kernel = "linear", cost = 0.01)
summary(oj.svm)
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 444
##
## ( 223 221 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The support vector classifier with cost = 0.01 uses 444 support vectors, 223 from class CH and 221 from class MM.
# Get predictions for training dataset
oj.train.preds <- predict(oj.svm, oj.train)
table(oj.train$Purchase, oj.train.preds)
## oj.train.preds
## CH MM
## CH 434 54
## MM 78 234
# Calculate training error rate
mse_lsvm_reg_tr=(54+78)/800
mse_lsvm_reg_tr
## [1] 0.165
Let's calculate the test error rate.
# Get predictions for test dataset
oj.test.preds <- predict(oj.svm, oj.test)
table(oj.test$Purchase, oj.test.preds)
## oj.test.preds
## CH MM
## CH 145 20
## MM 27 78
# calculate test error rate
mse_lsvm_reg_T=(20+27)/270
mse_lsvm_reg_T
## [1] 0.1740741
The training error rate is 0.165 and the test error rate is 0.1740741.
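The same rates can be computed without reading counts off the confusion matrix; a small helper (illustrative, not used in the rest of the write-up):
# misclassification rate: fraction of observations whose predicted class differs from the observed one
err_rate <- function(obs, pred) mean(obs != pred)
err_rate(oj.train$Purchase, oj.train.preds)  # should reproduce 0.165
err_rate(oj.test$Purchase, oj.test.preds)    # should reproduce 0.1740741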
set.seed(4)
oj.svm.tune = tune(svm, Purchase ~., data = oj.train, kernel = "linear", ranges = list(cost=c(0.001, 0.01, 0.1, 1, 5, 10)))
summary(oj.svm.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.16
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.32875 0.09948653
## 2 1e-02 0.17125 0.02949223
## 3 1e-01 0.16250 0.03061862
## 4 1e+00 0.16000 0.03525699
## 5 5e+00 0.16250 0.03632416
## 6 1e+01 0.16375 0.03653860
oj.svm.tune$best.parameters
## cost
## 4 1
The optimal cost value is 1, with a cross-validation error of 0.16.
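As an aside (illustrative), tune() also keeps a model refit at the best parameters, so the refit below could equivalently be taken straight from the tuning object:
# the best model refit on the full training set at the selected cost
summary(oj.svm.tune$best.model)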
# Run the svm fit with cost of 1 based on tuning
oj.svm.optimal = svm(Purchase ~., kernel = "linear", data = oj.train, cost = oj.svm.tune$best.parameters$cost)
# Predictions with training dataset
oj.svm.optimal.preds = predict(oj.svm.optimal, oj.train)
# Print confusion matrix for training predictions
table(oj.train$Purchase, oj.svm.optimal.preds)
## oj.svm.optimal.preds
## CH MM
## CH 431 57
## MM 68 244
#Calculate the training error rate
mse_lsvm_best_tr=(57+68)/800
mse_lsvm_best_tr
## [1] 0.15625
# Predictions with test dataset
oj.svm.optimal.preds2 = predict(oj.svm.optimal, oj.test)
# Print confusion matrix for test predictions
table(oj.test$Purchase, oj.svm.optimal.preds2)
## oj.svm.optimal.preds2
## CH MM
## CH 144 21
## MM 26 79
mse_lsvm_best_T=(21+26)/270
mse_lsvm_best_T
## [1] 0.1740741
The training and test error rates for the tuned linear SVM with cost = 1 are 0.15625 and 0.1740741, respectively.
# Fit an SVM with a radial kernel and cost = 0.01, as in 8b
oj.svm.rad = svm(Purchase~., data = oj.train, kernel = "radial", cost = 0.01)
summary(oj.svm.rad)
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "radial", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 628
##
## ( 316 312 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
# training predictions for radial svm
oj.rad.train.pred = predict(oj.svm.rad, oj.train)
# Print confusion matrix for training predictions for radial svm
table(oj.train$Purchase, oj.rad.train.pred)
## oj.rad.train.pred
## CH MM
## CH 488 0
## MM 312 0
# calculate training error rate
mse_Rsvm_reg_tr=(0+312)/800
mse_Rsvm_reg_tr
## [1] 0.39
# TEST dataset predictions for radial svm
oj.rad.test.pred = predict(oj.svm.rad, oj.test)
# Print confusion matrix for test predictions for radial svm
table(oj.test$Purchase, oj.rad.test.pred)
## oj.rad.test.pred
## CH MM
## CH 165 0
## MM 105 0
mse_Rsvm_reg_T=(0+105)/270
mse_Rsvm_reg_T
## [1] 0.3888889
The training and test error rates for the radial SVM with cost = 0.01 are 0.39 and 0.3888889, respectively; with such a small cost the model predicts CH for every observation.
# Tune the radial SVM over the same cost grid
oj.rad.tune = tune(svm, Purchase~., data = oj.train, kernel = "radial", ranges = list(cost=c(0.001, 0.01, 0.1, 1, 5, 10)))
summary(oj.rad.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.1775
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.39000 0.03425801
## 2 1e-02 0.39000 0.03425801
## 3 1e-01 0.17750 0.02486072
## 4 1e+00 0.17750 0.02687419
## 5 5e+00 0.19125 0.02703521
## 6 1e+01 0.18875 0.02853482
oj.rad.tune$best.parameters
## cost
## 3 0.1
After tuning the radial SVM, the best cost is 0.1, with a cross-validation error of 0.1775.
Let's refit the radial SVM with this best cost.
# Running the SVM RADIAL with best cost of 0.1
oj.svm.rad2 = svm(Purchase~., data = oj.train, kernel = "radial", cost = oj.rad.tune$best.parameters$cost)
summary(oj.svm.rad2)
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "radial", cost = oj.rad.tune$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.1
##
## Number of Support Vectors: 552
##
## ( 277 275 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The radial SVM with cost = 0.1 uses 552 support vectors, 277 from class CH and 275 from class MM.
Let's get the training and test error rates.
# Radial SVM with best cost of 0.1 - Training confusion matrix
oj.train.rad.pred2 = predict(oj.svm.rad2, oj.train)
table(oj.train$Purchase, oj.train.rad.pred2)
## oj.train.rad.pred2
## CH MM
## CH 441 47
## MM 87 225
# Radial SVM with best cost of 0.1 - Training error rate
mse_Rsvm_best_tr=(47+87)/800
mse_Rsvm_best_tr
## [1] 0.1675
# Radial SVM with best cost of 0.1 - TEST confusion matrix
oj.test.rad.pred2 = predict(oj.svm.rad2, oj.test)
table(oj.test$Purchase, oj.test.rad.pred2)
## oj.test.rad.pred2
## CH MM
## CH 147 18
## MM 30 75
# Radial SVM with best cost of 0.1 - TEST error rate
mse_Rsvm_best_T=(18+30)/270
mse_Rsvm_best_T
## [1] 0.1777778
For the radial SVM with the best cost of 0.1, the training error rate is 0.1675 and the test error rate is 0.1777778.
# Run the SVM with POLYNOMIAL kernel and degree=2 and cost = 0.01
oj.svm.poly = svm(Purchase~., data = oj.train, kernel = "poly", cost = 0.01, degree = 2)
summary(oj.svm.poly)
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "poly", cost = 0.01,
## degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 630
##
## ( 318 312 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The polynomial SVM with cost = 0.01 and degree = 2 uses 630 support vectors, 318 from class CH and 312 from class MM. Let's find the training and test error rates.
# Polynomial with cost=0.01 and degree=2 - Training confusion matrix
oj.poly.train.pred = predict(oj.svm.poly, oj.train)
table(oj.train$Purchase, oj.poly.train.pred)
## oj.poly.train.pred
## CH MM
## CH 485 3
## MM 292 20
# Polynomial with cost=0.01 and degree=2 - Training error rate
mse_Psvm_reg_tr=(3+292)/800
mse_Psvm_reg_tr
## [1] 0.36875
# Polynomial with cost=0.01 and degree=2 - TEST confusion matrix
oj.poly.test.pred = predict(oj.svm.poly, oj.test)
table(oj.test$Purchase, oj.poly.test.pred)
## oj.poly.test.pred
## CH MM
## CH 161 4
## MM 96 9
# Polynomial with cost=0.01 and degree=2 - Test error rate
mse_Psvm_reg_T=(4+96)/270
mse_Psvm_reg_T
## [1] 0.3703704
For the polynomial SVM with cost = 0.01 and degree = 2, the training error rate is 0.36875 and the test error rate is 0.3703704; at this cost, nearly every observation is predicted as CH.
Let's find the best cost.
oj.poly.tune = tune(svm, Purchase~., data = oj.train, kernel = "polynomial", ranges = list(cost=c(0.001, 0.01, 0.1, 1, 5, 10)), degree = 2)
summary(oj.poly.tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.18625
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.39000 0.04281744
## 2 1e-02 0.37875 0.05466120
## 3 1e-01 0.33000 0.07051399
## 4 1e+00 0.20625 0.04611655
## 5 5e+00 0.19375 0.04379958
## 6 1e+01 0.18625 0.04387878
oj.poly.tune$best.parameters
## cost
## 6 10
After tuning the polynomial SVM, the best cost is 10, with a cross-validation error of 0.18625.
Let's find the training and test error rates for this best cost of 10 with degree = 2.
oj.svm.poly2 = svm(Purchase~., data = oj.train, kernel = "polynomial", cost = oj.poly.tune$best.parameters$cost, degree = 2)
summary(oj.svm.poly2)
##
## Call:
## svm(formula = Purchase ~ ., data = oj.train, kernel = "polynomial",
## cost = oj.poly.tune$best.parameters$cost, degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 10
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 352
##
## ( 181 171 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The polynomial SVM with degree = 2 and cost = 10 uses 352 support vectors, 181 from class CH and 171 from class MM.
Let's find the training and test error rates.
# Polynomial with cost=10 and degree=2 - Training confusion matrix
oj.train.poly.pred2 = predict(oj.svm.poly2, oj.train)
table(oj.train$Purchase, oj.train.poly.pred2)
## oj.train.poly.pred2
## CH MM
## CH 445 43
## MM 76 236
# Polynomial with cost=10 and degree=2 - Training error rate
mse_Psvm_best_tr=(43+76)/800
mse_Psvm_best_tr
## [1] 0.14875
# Polynomial with cost=10 and degree=2 - Test confusion matrix
oj.test.poly.pred2 = predict(oj.svm.poly2, oj.test)
table(oj.test$Purchase, oj.test.poly.pred2)
## oj.test.poly.pred2
## CH MM
## CH 149 16
## MM 27 78
# Polynomial with cost=10 and degree=2 - Test error rate
mse_Psvm_best_T=(16+27)/270
mse_Psvm_best_T
## [1] 0.1592593
For the polynomial SVM with cost = 10 and degree = 2, the training error rate is 0.14875 and the test error rate is 0.1592593.
Let's collect all of the error rates in a single matrix.
# Get all the error rates into a matrix and print it
MSE_all <- matrix(c(mse_lsvm_reg_tr, mse_lsvm_reg_T, mse_lsvm_best_tr, mse_lsvm_best_T,
                    mse_Rsvm_reg_tr, mse_Rsvm_reg_T, mse_Rsvm_best_tr, mse_Rsvm_best_T,
                    mse_Psvm_reg_tr, mse_Psvm_reg_T, mse_Psvm_best_tr, mse_Psvm_best_T))
dimnames(MSE_all) = list(c("mse_lsvm_reg_tr", "mse_lsvm_reg_T", "mse_lsvm_best_tr", "mse_lsvm_best_T",
                           "mse_Rsvm_reg_tr", "mse_Rsvm_reg_T", "mse_Rsvm_best_tr", "mse_Rsvm_best_T",
                           "mse_Psvm_reg_tr", "mse_Psvm_reg_T", "mse_Psvm_best_tr", "mse_Psvm_best_T"))
MSE_all
## [,1]
## mse_lsvm_reg_tr 0.1650000
## mse_lsvm_reg_T 0.1740741
## mse_lsvm_best_tr 0.1562500
## mse_lsvm_best_T 0.1740741
## mse_Rsvm_reg_tr 0.3900000
## mse_Rsvm_reg_T 0.3888889
## mse_Rsvm_best_tr 0.1675000
## mse_Rsvm_best_T 0.1777778
## mse_Psvm_reg_tr 0.3687500
## mse_Psvm_reg_T 0.3703704
## mse_Psvm_best_tr 0.1487500
## mse_Psvm_best_T 0.1592593
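The same numbers are easier to compare when laid out as a model-by-split table (an illustrative reshaping of the values above; err_table is just a new name):
err_table <- matrix(c(mse_lsvm_reg_tr,  mse_lsvm_reg_T,
                      mse_lsvm_best_tr, mse_lsvm_best_T,
                      mse_Rsvm_reg_tr,  mse_Rsvm_reg_T,
                      mse_Rsvm_best_tr, mse_Rsvm_best_T,
                      mse_Psvm_reg_tr,  mse_Psvm_reg_T,
                      mse_Psvm_best_tr, mse_Psvm_best_T),
                    ncol = 2, byrow = TRUE,
                    dimnames = list(c("linear, cost 0.01",  "linear, cost 1",
                                      "radial, cost 0.01",  "radial, cost 0.1",
                                      "poly d2, cost 0.01", "poly d2, cost 10"),
                                    c("train", "test")))
err_table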
Based on the test error rates printed above, the polynomial SVM with degree = 2 and cost = 10 achieves the lowest test error and appears to be the best model for this data.