R Markdown

Load the required libraries and read the juice.csv file into `j`.

# Install pacman only when it is missing. requireNamespace() tests for the
# package without attaching it; pacman is only ever called as pacman::p_load.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load installs (if needed) and attaches the modelling packages.
pacman::p_load(tidyverse, e1071, caret)
library("tidyverse")
library("e1071")
library("caret")
# Read the juice purchase data from the working directory.
j <- read_csv("juice.csv")
## Parsed with column specification:
## cols(
##   Purchase = col_character(),
##   WeekofPurchase = col_double(),
##   StoreID = col_double(),
##   PriceCH = col_double(),
##   PriceMM = col_double(),
##   DiscCH = col_double(),
##   DiscMM = col_double(),
##   SpecialCH = col_double(),
##   SpecialMM = col_double(),
##   LoyalCH = col_double(),
##   SalePriceMM = col_double(),
##   SalePriceCH = col_double(),
##   PriceDiff = col_double(),
##   Store7 = col_character(),
##   PctDiscMM = col_double(),
##   PctDiscCH = col_double(),
##   ListPriceDiff = col_double(),
##   STORE = col_double()
## )
  1. Create a training set containing a random sample of 80% of the observations in the “juice.csv” data set using createDataPartition(). Create a test data set containing the remaining observations.
# Recode Store7 as numeric 0/1 ("No" -> 0, anything else -> 1) and make the
# response a factor so svm() performs classification.
j[["Store7"]] <- as.numeric(j[["Store7"]] != "No")
j[["Purchase"]] <- as.factor(j[["Purchase"]])

# 80/20 split on Purchase; seeded so the split is repeatable.
set.seed(123)
trainindex <- createDataPartition(j[["Purchase"]], p = 0.8, list = FALSE)
testing <- j[-trainindex, ]
training <- j[trainindex, ]
  2. Fit an SVM model to the training data using cost=0.01, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics, and describe the results obtained.
# Linear-kernel SVM on all predictors with the cost fixed at 0.01.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "linear",
  cost = 0.01
)
summary(svm_cost)
## 
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "linear", 
##     cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  443
## 
##  ( 221 222 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

The SVM was fit with a linear kernel and is of type C-classification. The cost is 0.01, and there are 443 support vectors in total: 221 from the CH class and 222 from the MM class.

  3. What are the training and test error rates?
# Training error, in-class method: cross-tabulate the model's predictions on
# the training rows against the observed Purchase labels.
pred_train <- predict(svm_cost, newdata = training)
conf.matrix <- table(Predicted = pred_train, Actual = training[["Purchase"]])
conf.matrix
##          Actual
## Predicted  CH  MM
##        CH 429  84
##        MM  59 228
# Misclassification rate = off-diagonal count / total count, computed from
# conf.matrix itself instead of numbers retyped from its printed output.
(sum(conf.matrix) - sum(diag(conf.matrix))) / sum(conf.matrix)
## [1] 0.17875
# Shortcut: the mean of the mismatch indicator is the error rate.
train.error <- mean(pred_train != training[["Purchase"]])
train.error
## [1] 0.17875
# Test error, in-class method: same cross-tabulation on the held-out rows.
pred_test <- predict(svm_cost, newdata = testing)
conf.matrix <- table(Predicted = pred_test, Actual = testing[["Purchase"]])
conf.matrix
##          Actual
## Predicted  CH  MM
##        CH 106  17
##        MM  16  61
# Misclassification rate = off-diagonal count / total count, computed from
# conf.matrix itself instead of numbers retyped from its printed output.
(sum(conf.matrix) - sum(diag(conf.matrix))) / sum(conf.matrix)
## [1] 0.165
# Shortcut: the mean of the mismatch indicator is the error rate.
test.error <- mean(pred_test != testing[["Purchase"]])
test.error
## [1] 0.165

The training error is 17.875% and the testing error is 16.5%.

  4. Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10.
# Tune the cost of the LINEAR SVM by cross-validation over 0.01-10.
# Fix: the argument was spelled `kernal`, which svm() does not recognise, so
# the default radial kernel was tuned instead of the linear one.
# NOTE(review): the output recorded below this chunk predates the fix (its
# 0.39 error at cost 0.01 matches the radial fit, not the linear one);
# re-knit to refresh it.
# Seeded because tune() draws random cross-validation folds.
set.seed(123)
tunesvm <- tune(
  svm, Purchase ~ .,
  data = training,
  kernel = "linear",
  # 0.01-0.09, 0.1-0.9, 1-10: same coverage as before without evaluating
  # 0.1 and 1 twice.
  ranges = list(cost = c((1:9) / 100, (1:9) / 10, 1:10))
)
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     3
## 
## - best performance: 0.18 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.39000 0.04556741
## 2   0.02 0.39000 0.04556741
## 3   0.03 0.38625 0.04505013
## 4   0.04 0.27375 0.04059026
## 5   0.05 0.22500 0.04487637
## 6   0.06 0.20500 0.04684490
## 7   0.07 0.20375 0.04450733
## 8   0.08 0.20000 0.03996526
## 9   0.09 0.19750 0.03944053
## 10  0.10 0.19500 0.04090979
## 11  0.10 0.19500 0.04090979
## 12  0.20 0.19250 0.03917553
## 13  0.30 0.18375 0.04210189
## 14  0.40 0.18250 0.04533824
## 15  0.50 0.18500 0.04116363
## 16  0.60 0.18625 0.04016027
## 17  0.70 0.18375 0.03955042
## 18  0.80 0.18750 0.04249183
## 19  0.90 0.18875 0.04348132
## 20  1.00 0.19000 0.03944053
## 21  1.00 0.19000 0.03944053
## 22  2.00 0.18125 0.04093101
## 23  3.00 0.18000 0.05210833
## 24  4.00 0.18375 0.04825065
## 25  5.00 0.18875 0.05015601
## 26  6.00 0.19000 0.04958158
## 27  7.00 0.19000 0.05027701
## 28  8.00 0.19125 0.05172376
## 29  9.00 0.18750 0.05270463
## 30 10.00 0.19000 0.05489890
# Show every row of the tuning grid that attains the lowest CV error.
data <- tunesvm[["performances"]]
data[data[["error"]] == min(data[["error"]]), ]
  5. Compute and report the training and test error rates using this new value for cost.
# Refit the linear SVM at the tuned cost and record both error rates in a
# one-row summary frame.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "linear",
  cost = dynamic.cost
)

pred_train <- predict(svm_cost, newdata = training)
pred_test <- predict(svm_cost, newdata = testing)
train.error <- mean(pred_train != training[["Purchase"]])
test.error <- mean(pred_test != testing[["Purchase"]])

linear <- data.frame(
  "Kernal" = "Linear",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
linear
  6. Repeat parts (2.) through (5.) using a support vector machine with a radial kernel. Use the default value for gamma.
##STEP 2 (MODEL FITTING)
# Radial-kernel SVM on all predictors, cost fixed at 0.01, gamma left at the
# svm() default.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "radial",
  cost = 0.01
)
summary(svm_cost)
## 
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "radial", 
##     cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.01 
## 
## Number of Support Vectors:  630
## 
##  ( 312 318 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

The SVM was fit with a radial kernel and is of type C-classification. The cost is 0.01, and there are 630 support vectors in total: 312 from the CH class and 318 from the MM class.

##STEP 3 (Training and Testing Error)
# Share of training rows the radial SVM misclassifies.
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
train.error
## [1] 0.39

#39% is training error

# Share of held-out rows the radial SVM misclassifies.
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
test.error
## [1] 0.39

39% is the testing error

##STEP 4 (TUNE FUNCTION)
# Tune the cost of the RADIAL SVM by cross-validation over 0.01-10.
# Fix: the argument was spelled `kernal`, which svm() does not recognise; the
# radial kernel was used only because it is svm()'s default. It is now
# requested explicitly.
# NOTE(review): the seed and de-duplicated grid change the fold assignment and
# row count, so re-knit to refresh the output recorded below this chunk.
# Seeded because tune() draws random cross-validation folds.
set.seed(123)
tunesvm <- tune(
  svm, Purchase ~ .,
  data = training,
  kernel = "radial",
  # 0.01-0.09, 0.1-0.9, 1-10: same coverage as before without evaluating
  # 0.1 and 1 twice.
  ranges = list(cost = c((1:9) / 100, (1:9) / 10, 1:10))
)
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     2
## 
## - best performance: 0.18375 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.39000 0.04241004
## 2   0.02 0.39000 0.04241004
## 3   0.03 0.38375 0.04126894
## 4   0.04 0.25875 0.04860913
## 5   0.05 0.22125 0.03586723
## 6   0.06 0.20375 0.03998698
## 7   0.07 0.20125 0.04185375
## 8   0.08 0.20125 0.04466309
## 9   0.09 0.20250 0.04479893
## 10  0.10 0.20125 0.04185375
## 11  0.10 0.20125 0.04185375
## 12  0.20 0.19000 0.04594683
## 13  0.30 0.18750 0.05496211
## 14  0.40 0.19000 0.04958158
## 15  0.50 0.19250 0.05596378
## 16  0.60 0.18500 0.05296750
## 17  0.70 0.18375 0.05337563
## 18  0.80 0.18625 0.05318012
## 19  0.90 0.18875 0.05415064
## 20  1.00 0.18750 0.05621141
## 21  1.00 0.18750 0.05621141
## 22  2.00 0.18375 0.05104804
## 23  3.00 0.18500 0.04851976
## 24  4.00 0.18500 0.04556741
## 25  5.00 0.18875 0.04581439
## 26  6.00 0.18750 0.04750731
## 27  7.00 0.19000 0.04632314
## 28  8.00 0.18750 0.04714045
## 29  9.00 0.19000 0.04706674
## 30 10.00 0.19125 0.04450733
# Show every row of the tuning grid that attains the lowest CV error.
data <- tunesvm[["performances"]]
data[data[["error"]] == min(data[["error"]]), ]
##STEP 5 (TRAIN &TEST ERRORS FOR COST = dynamic cost)
# Refit the radial SVM at the tuned cost and record both error rates in a
# one-row summary frame.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "radial",
  cost = dynamic.cost
)

pred_train <- predict(svm_cost, newdata = training)
pred_test <- predict(svm_cost, newdata = testing)
train.error <- mean(pred_train != training[["Purchase"]])
test.error <- mean(pred_test != testing[["Purchase"]])

radial <- data.frame(
  "Kernal" = "Radial",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
radial
  7. Repeat parts (2.) through (5.) using a support vector machine with a polynomial kernel. Set degree=2.
##STEP 2 (MODEL FITTING)
# Degree-2 polynomial-kernel SVM on all predictors, cost fixed at 0.01.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  cost = 0.01
)
summary(svm_cost)
## 
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "polynomial", 
##     degree = 2, cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.01 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  628
## 
##  ( 312 316 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

The SVM was fit with a polynomial kernel of degree 2 and is of type C-classification. The cost is 0.01, and there are 628 support vectors in total: 312 from the CH class and 316 from the MM class.

##STEP 3 (Training and Testing Error)
# Share of training rows the polynomial SVM misclassifies.
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
train.error
## [1] 0.365

36.5% is training error

# Share of held-out rows the polynomial SVM misclassifies.
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
test.error
## [1] 0.39

39% is the testing error

##STEP 4 (TUNE FUNCTION)
# Tune the cost of the degree-2 POLYNOMIAL SVM by cross-validation over
# 0.01-10.
# Fix: the argument was spelled `kernal`, which svm() does not recognise, so
# the default radial kernel was tuned and the polynomial kernel never was.
# NOTE(review): the output recorded below this chunk predates the fix;
# re-knit to refresh it.
# Seeded because tune() draws random cross-validation folds.
set.seed(123)
tunesvm <- tune(
  svm, Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  # 0.01-0.09, 0.1-0.9, 1-10: same coverage as before without evaluating
  # 0.1 and 1 twice.
  ranges = list(cost = c((1:9) / 100, (1:9) / 10, 1:10))
)
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     2
## 
## - best performance: 0.18125 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.39000 0.04362084
## 2   0.02 0.39000 0.04362084
## 3   0.03 0.38500 0.04958158
## 4   0.04 0.26750 0.04937104
## 5   0.05 0.21875 0.05245699
## 6   0.06 0.20625 0.02716334
## 7   0.07 0.19375 0.03830162
## 8   0.08 0.19000 0.03670453
## 9   0.09 0.19000 0.03622844
## 10  0.10 0.19000 0.03622844
## 11  0.10 0.19000 0.03622844
## 12  0.20 0.18875 0.04185375
## 13  0.30 0.18625 0.04466309
## 14  0.40 0.18375 0.04860913
## 15  0.50 0.18750 0.04526159
## 16  0.60 0.18875 0.04348132
## 17  0.70 0.19000 0.04281744
## 18  0.80 0.18750 0.04526159
## 19  0.90 0.18375 0.04604120
## 20  1.00 0.18375 0.04450733
## 21  1.00 0.18375 0.04450733
## 22  2.00 0.18125 0.04535738
## 23  3.00 0.18875 0.04875178
## 24  4.00 0.19000 0.04706674
## 25  5.00 0.19000 0.04816061
## 26  6.00 0.19125 0.04896498
## 27  7.00 0.19125 0.04896498
## 28  8.00 0.19500 0.04866267
## 29  9.00 0.19250 0.04794383
## 30 10.00 0.19375 0.04759858
# Show every row of the tuning grid that attains the lowest CV error.
data <- tunesvm[["performances"]]
data[data[["error"]] == min(data[["error"]]), ]
##STEP 5 (TRAIN &TEST ERRORS FOR COST = dynamic cost)
# Refit the degree-2 polynomial SVM at the tuned cost and record both error
# rates in a one-row summary frame.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  cost = dynamic.cost
)

pred_train <- predict(svm_cost, newdata = training)
pred_test <- predict(svm_cost, newdata = testing)
train.error <- mean(pred_train != training[["Purchase"]])
test.error <- mean(pred_test != testing[["Purchase"]])

polynomial <- data.frame(
  "Kernal" = "Polynomial",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
polynomial
  8. Overall, which approach seems to give the best results on this data?
# Stack the three one-row kernel summaries into a single comparison table.
final <- do.call(rbind, list(linear, radial, polynomial))
final

Radial gives the best training error of 14.75% and linear gives the best testing error of 15%.