Homework 1: SVM

R Markdown

Load the required libraries and read the juice.csv file into `j`.

# Install pacman only when it is missing. requireNamespace() checks that the
# package is available without attaching it; pacman is only ever called
# through the `pacman::` prefix, so it does not need to be attached.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(tidyverse, e1071, caret)

# Explicit library() calls keep the script's dependencies visible.
library("tidyverse")
library("e1071")
library("caret")

# Read the juice data set; the path is relative to the working directory.
j <- read_csv("juice.csv")

## Parsed with column specification:
## cols(
##   Purchase = col_character(),
##   WeekofPurchase = col_double(),
##   StoreID = col_double(),
##   PriceCH = col_double(),
##   PriceMM = col_double(),
##   DiscCH = col_double(),
##   DiscMM = col_double(),
##   SpecialCH = col_double(),
##   SpecialMM = col_double(),
##   LoyalCH = col_double(),
##   SalePriceMM = col_double(),
##   SalePriceCH = col_double(),
##   PriceDiff = col_double(),
##   Store7 = col_character(),
##   PctDiscMM = col_double(),
##   PctDiscCH = col_double(),
##   ListPriceDiff = col_double(),
##   STORE = col_double()
## )

# Preview the first two rows of the data
head(j, n = 2)

Create a training set containing a random sample of 80% of the observations in the “juice.csv” data set using createDataPartition(). Create a test data set containing the remaining observations.

# Data manipulation
# Remove the columns that are left out of the model
j <- j[, !(names(j) %in% c("PriceCH", "PriceMM", "Store7",
                           "ListPriceDiff", "STORE"))]
# The response is read in as character; store it as a factor
j[["Purchase"]] <- as.factor(j[["Purchase"]])

# Seeded 80/20 split on the response
set.seed(123)
trainindex <- createDataPartition(j[["Purchase"]], p = 0.8, list = FALSE)
training <- j[trainindex, ]
testing <- j[-trainindex, ]

Fit a SVM model to the training data using cost=0.01, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics, and describe the results obtained.

# Linear-kernel SVM with cost = 0.01; Purchase is the response and every
# other column of `training` is a predictor.
set.seed(123)
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "linear",
  cost = 0.01
)
summary(svm_cost)

## 
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "linear", 
##     cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  446
## 
##  ( 222 224 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

The SVM was fit with a linear kernel and is of type C-classification. The summary shows cost = 0.01 and a total of 446 support vectors: 222 from the CH class and 224 from the MM class.

What are the training and test error rates?

# Training error
pred_train <- predict(svm_cost, newdata = training)
# In-class method: cross-tabulate predicted labels against actual labels
conf.matrix <- table(
  Predicted = pred_train,
  Actual = training[["Purchase"]]
)
conf.matrix

##          Actual
## Predicted  CH  MM
##        CH 429  83
##        MM  59 229

# Misclassification rate = off-diagonal count / total count. Computed from
# conf.matrix rather than hand-copied numbers so it stays correct if the data
# or the seed changes.
(sum(conf.matrix) - sum(diag(conf.matrix))) / sum(conf.matrix)

## [1] 0.1775

# Simpler method: share of training rows whose prediction differs from the
# actual label
train.error <- mean(pred_train != training[["Purchase"]])
train.error

## [1] 0.1775

# Test error
pred_test <- predict(svm_cost, newdata = testing)
# In-class method: cross-tabulate predicted labels against actual labels
conf.matrix <- table(
  Predicted = pred_test,
  Actual = testing[["Purchase"]]
)
conf.matrix

##          Actual
## Predicted  CH  MM
##        CH 106  16
##        MM  16  62

# Misclassification rate = off-diagonal count / total count. Computed from
# conf.matrix rather than hand-copied numbers so it stays correct if the data
# or the seed changes.
(sum(conf.matrix) - sum(diag(conf.matrix))) / sum(conf.matrix)

## [1] 0.16

# Simpler method: share of test rows whose prediction differs from the
# actual label
test.error <- mean(pred_test != testing[["Purchase"]])
test.error

## [1] 0.16

The training error is 17.75% and the test error is 16%.

Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10.

# Cross-validated search for the best cost with a LINEAR kernel.
# The argument must be spelled `kernel`: a misspelled `kernal` is swallowed by
# `...` and silently ignored, so svm() falls back to its default radial kernel.
# NOTE(review): the tuning output printed after this chunk was produced by the
# misspelled call (it matches the radial results); re-knit to refresh it.
set.seed(123)
tunesvm <- tune(
  svm, Purchase ~ .,
  data = training,
  kernel = "linear",
  ranges = list(cost = c(seq(0.01, 0.1, by = 0.01),
                         seq(0.1, 1, by = 0.1),
                         seq(1, 10, by = 1)))
)
# Cost of the model with the lowest cross-validation error
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.7
## 
## - best performance: 0.1775 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.39000 0.04031129
## 2   0.02 0.39000 0.04031129
## 3   0.03 0.27625 0.03143004
## 4   0.04 0.22250 0.02554952
## 5   0.05 0.19250 0.03689324
## 6   0.06 0.18500 0.03670453
## 7   0.07 0.18750 0.03004626
## 8   0.08 0.19125 0.02829041
## 9   0.09 0.19000 0.02687419
## 10  0.10 0.19125 0.02829041
## 11  0.10 0.19125 0.02829041
## 12  0.20 0.19000 0.03525699
## 13  0.30 0.18000 0.03641962
## 14  0.40 0.17875 0.03488573
## 15  0.50 0.18125 0.03547789
## 16  0.60 0.17750 0.03622844
## 17  0.70 0.17750 0.03763863
## 18  0.80 0.17875 0.03729108
## 19  0.90 0.17750 0.03525699
## 20  1.00 0.17875 0.03230175
## 21  1.00 0.17875 0.03230175
## 22  2.00 0.18375 0.03120831
## 23  3.00 0.18375 0.02766993
## 24  4.00 0.18625 0.02461509
## 25  5.00 0.18875 0.02598744
## 26  6.00 0.19000 0.02486072
## 27  7.00 0.19000 0.02266912
## 28  8.00 0.19000 0.02486072
## 29  9.00 0.19125 0.02360703
## 30 10.00 0.19125 0.02433134

# List every cost value that ties for the lowest cross-validation error
data <- tunesvm[["performances"]]
data[data[["error"]] == min(data[["error"]]), ]

Compute and report the training and test error rates using this new value for cost.

# Refit the linear SVM with the tuned cost and record both error rates
set.seed(123)
svm_cost <- svm(Purchase ~ ., data = training, kernel = "linear",
                cost = dynamic.cost)

pred_train <- predict(svm_cost, newdata = training)
pred_test <- predict(svm_cost, newdata = testing)
train.error <- mean(pred_train != training[["Purchase"]])
test.error <- mean(pred_test != testing[["Purchase"]])

# One-row summary, combined with the other kernels at the end
linear <- data.frame(
  "Kernal" = "Linear",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
linear

Repeat parts (2.) through (5.) using a support vector machine with a radial kernel. Use the default value for gamma.

## STEP 2 (MODEL FITTING)
# Radial-kernel SVM with cost = 0.01; gamma is left at its default
set.seed(123)
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "radial",
  cost = 0.01
)
summary(svm_cost)

## 
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "radial", 
##     cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.01 
## 
## Number of Support Vectors:  627
## 
##  ( 312 315 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

The SVM was fit with a radial kernel and is of type C-classification. The summary shows cost = 0.01 and a total of 627 support vectors: 312 from the CH class and 315 from the MM class.

## STEP 3 (Training and Testing Error)
# Training error: share of training rows predicted incorrectly
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
train.error

## [1] 0.39

#39% is training error

# Test error: share of test rows predicted incorrectly
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
test.error

## [1] 0.39

39% is the testing error

## STEP 4 (TUNE FUNCTION)
# Cross-validated search for the best cost with a radial kernel.
# The argument must be spelled `kernel`; a misspelled `kernal` is swallowed by
# `...` and ignored. Radial is also svm()'s default kernel, so the earlier
# misspelling did not change these results.
set.seed(123)
tunesvm <- tune(svm, Purchase ~ ., data = training, kernel = "radial",
                ranges = list(cost = c(seq(0.01, 0.1, by = 0.01),
                                       seq(0.1, 1, by = 0.1),
                                       seq(1, 10, by = 1))))
# Cost of the model with the lowest cross-validation error
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.7
## 
## - best performance: 0.1775 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.39000 0.04031129
## 2   0.02 0.39000 0.04031129
## 3   0.03 0.27625 0.03143004
## 4   0.04 0.22250 0.02554952
## 5   0.05 0.19250 0.03689324
## 6   0.06 0.18500 0.03670453
## 7   0.07 0.18750 0.03004626
## 8   0.08 0.19125 0.02829041
## 9   0.09 0.19000 0.02687419
## 10  0.10 0.19125 0.02829041
## 11  0.10 0.19125 0.02829041
## 12  0.20 0.19000 0.03525699
## 13  0.30 0.18000 0.03641962
## 14  0.40 0.17875 0.03488573
## 15  0.50 0.18125 0.03547789
## 16  0.60 0.17750 0.03622844
## 17  0.70 0.17750 0.03763863
## 18  0.80 0.17875 0.03729108
## 19  0.90 0.17750 0.03525699
## 20  1.00 0.17875 0.03230175
## 21  1.00 0.17875 0.03230175
## 22  2.00 0.18375 0.03120831
## 23  3.00 0.18375 0.02766993
## 24  4.00 0.18625 0.02461509
## 25  5.00 0.18875 0.02598744
## 26  6.00 0.19000 0.02486072
## 27  7.00 0.19000 0.02266912
## 28  8.00 0.19000 0.02486072
## 29  9.00 0.19125 0.02360703
## 30 10.00 0.19125 0.02433134

# List every cost value that ties for the lowest cross-validation error
data <- tunesvm[["performances"]]
data[data[["error"]] == min(data[["error"]]), ]

## STEP 5 (TRAIN & TEST ERRORS FOR COST = dynamic cost)
# Refit the radial SVM with the tuned cost
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "radial",
  cost = dynamic.cost
)

# Error rates on the training and test sets
pred_train <- predict(svm_cost, newdata = training)
pred_test <- predict(svm_cost, newdata = testing)
train.error <- mean(pred_train != training[["Purchase"]])
test.error <- mean(pred_test != testing[["Purchase"]])

# One-row summary, combined with the other kernels at the end
radial <- data.frame(
  "Kernal" = "Radial",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
radial

Repeat parts (2.) through (5.) using a support vector machine with a polynomial kernel. Set degree=2.

## STEP 2 (MODEL FITTING)
# Degree-2 polynomial-kernel SVM with cost = 0.01
set.seed(123)
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  cost = 0.01
)
summary(svm_cost)

## 
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "polynomial", 
##     degree = 2, cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.01 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  628
## 
##  ( 312 316 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

The SVM was fit with a polynomial kernel (degree 2) and is of type C-classification. The summary shows cost = 0.01 and a total of 628 support vectors: 312 from the CH class and 316 from the MM class.

## STEP 3 (Training and Testing Error)
# Training error: share of training rows predicted incorrectly
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
train.error

## [1] 0.36

36% is training error

# Test error: share of test rows predicted incorrectly
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
test.error

## [1] 0.385

38.5% is the testing error

## STEP 4 (TUNE FUNCTION)
# Cross-validated search for the best cost with a degree-2 POLYNOMIAL kernel.
# The argument must be spelled `kernel`: a misspelled `kernal` is swallowed by
# `...` and silently ignored, so svm() falls back to its default radial kernel
# and `degree` has no effect.
# NOTE(review): the tuning output printed after this chunk was produced by the
# misspelled call (it matches the radial results); re-knit to refresh it.
set.seed(123)
tunesvm <- tune(
  svm, Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  ranges = list(cost = c(seq(0.01, 0.1, by = 0.01),
                         seq(0.1, 1, by = 0.1),
                         seq(1, 10, by = 1)))
)
# Cost of the model with the lowest cross-validation error
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.7
## 
## - best performance: 0.1775 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.39000 0.04031129
## 2   0.02 0.39000 0.04031129
## 3   0.03 0.27625 0.03143004
## 4   0.04 0.22250 0.02554952
## 5   0.05 0.19250 0.03689324
## 6   0.06 0.18500 0.03670453
## 7   0.07 0.18750 0.03004626
## 8   0.08 0.19125 0.02829041
## 9   0.09 0.19000 0.02687419
## 10  0.10 0.19125 0.02829041
## 11  0.10 0.19125 0.02829041
## 12  0.20 0.19000 0.03525699
## 13  0.30 0.18000 0.03641962
## 14  0.40 0.17875 0.03488573
## 15  0.50 0.18125 0.03547789
## 16  0.60 0.17750 0.03622844
## 17  0.70 0.17750 0.03763863
## 18  0.80 0.17875 0.03729108
## 19  0.90 0.17750 0.03525699
## 20  1.00 0.17875 0.03230175
## 21  1.00 0.17875 0.03230175
## 22  2.00 0.18375 0.03120831
## 23  3.00 0.18375 0.02766993
## 24  4.00 0.18625 0.02461509
## 25  5.00 0.18875 0.02598744
## 26  6.00 0.19000 0.02486072
## 27  7.00 0.19000 0.02266912
## 28  8.00 0.19000 0.02486072
## 29  9.00 0.19125 0.02360703
## 30 10.00 0.19125 0.02433134

# List every cost value that ties for the lowest cross-validation error
data <- tunesvm[["performances"]]
data[data[["error"]] == min(data[["error"]]), ]

## STEP 5 (TRAIN & TEST ERRORS FOR COST = dynamic cost)
# Refit the degree-2 polynomial SVM with the tuned cost
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  cost = dynamic.cost
)

# Error rates on the training and test sets
pred_train <- predict(svm_cost, newdata = training)
pred_test <- predict(svm_cost, newdata = testing)
train.error <- mean(pred_train != training[["Purchase"]])
test.error <- mean(pred_test != testing[["Purchase"]])

# One-row summary, combined with the other kernels at the end
polynomial <- data.frame(
  "Kernal" = "Polynomial",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
polynomial

Overall, which approach seems to give the best results on this data?

# Stack the per-kernel summary rows into one comparison table
final <- rbind(
  linear,
  radial,
  polynomial
)
final

The radial kernel performs slightly better than the linear kernel and gives the best results on this data, with a training error of 15.75% and a test error of 15.5%.

Homework 1: SVM

Sushanth Chintalapati

9/19/2019

R Markdown