Loading the required libraries and reading the juice.csv file into R.
if(!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, e1071, caret)  #p_load installs any missing packages and loads them
j <- read_csv("juice.csv")
## Parsed with column specification:
## cols(
## Purchase = col_character(),
## WeekofPurchase = col_double(),
## StoreID = col_double(),
## PriceCH = col_double(),
## PriceMM = col_double(),
## DiscCH = col_double(),
## DiscMM = col_double(),
## SpecialCH = col_double(),
## SpecialMM = col_double(),
## LoyalCH = col_double(),
## SalePriceMM = col_double(),
## SalePriceCH = col_double(),
## PriceDiff = col_double(),
## Store7 = col_character(),
## PctDiscMM = col_double(),
## PctDiscCH = col_double(),
## ListPriceDiff = col_double(),
## STORE = col_double()
## )
j$Store7 <- ifelse(j$Store7 == "No", 0, 1)
j$Purchase <- as.factor(j$Purchase)
set.seed(123)
trainindex <- createDataPartition(j$Purchase, p=0.8, list= FALSE)
training <- j[trainindex, ]
testing <- j[-trainindex, ]
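Because createDataPartition() samples within each level of Purchase, the CH/MM proportions should be preserved on both sides of the split. A quick sanity check (a minimal sketch; this output is not shown in the original run):
#Check that the stratified split preserved the class balance
prop.table(table(training$Purchase))
prop.table(table(testing$Purchase))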
svm_cost <- svm(Purchase~., data=training, kernel = "linear", cost = 0.01)
summary(svm_cost)
##
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "linear",
## cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 443
##
## ( 221 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The SVM is fit with a linear kernel and is of type C-classification, with cost = 0.01 and 443 support vectors in total: 221 from the CH class and 222 from the MM class.
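The same counts can be read off the fitted object directly; e1071 stores the total and per-class support-vector counts in the tot.nSV and nSV components (a minimal sketch of the accessors):
#Support-vector counts stored on the fitted svm object
svm_cost$tot.nSV  #total number of support vectors
svm_cost$nSV      #per-class counts, in the order of levels(training$Purchase)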
#Train Error
pred_train <- predict(svm_cost, training)
#In-class method (confusion matrix)
conf.matrix <- table(Predicted = pred_train, Actual = training$Purchase)
conf.matrix
## Actual
## Predicted CH MM
## CH 429 84
## MM 59 228
(84+59)/800
## [1] 0.17875
#Simpler method
train.error <- mean(pred_train != training$Purchase)
train.error
## [1] 0.17875
#Test Error
pred_test <- predict(svm_cost, testing)
#In-class method (confusion matrix)
conf.matrix <- table(Predicted = pred_test, Actual = testing$Purchase)
conf.matrix
## Actual
## Predicted CH MM
## CH 106 17
## MM 16 61
(17+16)/200
## [1] 0.165
#Simpler method
test.error <- mean(pred_test != testing$Purchase)
test.error
## [1] 0.165
The training error is 17.875% and the testing error is 16.5%.
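The same predict-and-compare pattern repeats for every kernel below, so a small helper keeps the error computation in one place (a hypothetical convenience function, not part of the original analysis):
#Hypothetical helper: misclassification rate of a fitted model on a data set
error_rate <- function(model, data) {
  mean(predict(model, data) != data$Purchase)
}
error_rate(svm_cost, training)  #should reproduce train.error
error_rate(svm_cost, testing)   #should reproduce test.error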
tunesvm <- tune(svm, Purchase~., data = training, kernel = "linear",
                ranges = list(cost = c(seq(0.01, 0.1, by = 0.01),
                                       seq(0.1, 1, by = 0.1),
                                       seq(1, 10, by = 1))))
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 3
##
## - best performance: 0.18
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.04556741
## 2 0.02 0.39000 0.04556741
## 3 0.03 0.38625 0.04505013
## 4 0.04 0.27375 0.04059026
## 5 0.05 0.22500 0.04487637
## 6 0.06 0.20500 0.04684490
## 7 0.07 0.20375 0.04450733
## 8 0.08 0.20000 0.03996526
## 9 0.09 0.19750 0.03944053
## 10 0.10 0.19500 0.04090979
## 11 0.10 0.19500 0.04090979
## 12 0.20 0.19250 0.03917553
## 13 0.30 0.18375 0.04210189
## 14 0.40 0.18250 0.04533824
## 15 0.50 0.18500 0.04116363
## 16 0.60 0.18625 0.04016027
## 17 0.70 0.18375 0.03955042
## 18 0.80 0.18750 0.04249183
## 19 0.90 0.18875 0.04348132
## 20 1.00 0.19000 0.03944053
## 21 1.00 0.19000 0.03944053
## 22 2.00 0.18125 0.04093101
## 23 3.00 0.18000 0.05210833
## 24 4.00 0.18375 0.04825065
## 25 5.00 0.18875 0.05015601
## 26 6.00 0.19000 0.04958158
## 27 7.00 0.19000 0.05027701
## 28 8.00 0.19125 0.05172376
## 29 9.00 0.18750 0.05270463
## 30 10.00 0.19000 0.05489890
#Pull out the row(s) of the tuning results with the lowest CV error
perf <- tunesvm$performances
perf[perf$error == min(perf$error), ]
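tune() already records the winner, so the same information is available without filtering the performance table by hand:
#Equivalent accessors on the tune object
tunesvm$best.parameters   #cost value with the lowest CV error
tunesvm$best.performance  #the corresponding CV error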
svm_cost <- svm(Purchase~., data=training,kernel = "linear", cost = dynamic.cost)
pred_train <- predict(svm_cost, training)
train.error <- mean(pred_train != training$Purchase)
pred_test <- predict(svm_cost, testing)
test.error <- mean(pred_test != testing$Purchase)
linear <- data.frame("Kernel" = "Linear", "Cost" = dynamic.cost,
                     "Training Error" = train.error,
                     "Testing Error" = test.error)
linear
## STEP 2 (MODEL FITTING, RADIAL KERNEL)
svm_cost <- svm(Purchase~., data=training, kernel = "radial", cost = 0.01)
summary(svm_cost)
##
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "radial",
## cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 630
##
## ( 312 318 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The SVM is fit with a radial kernel and is of type C-classification, with cost = 0.01 and 630 support vectors in total: 312 from the CH class and 318 from the MM class.
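Note that the radial kernel has a second hyperparameter, gamma, which is left at its e1071 default of 1 / (number of predictors) throughout; only cost is tuned below. The value actually used can be read off the fitted object (a minimal check; with the 17 predictors here it should be 1/17):
#Default gamma used by the radial kernel
svm_cost$gamma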
## STEP 3 (Training and Testing Error)
#train error
pred_train <- predict(svm_cost, training)
train.error <- mean(pred_train != training$Purchase)
train.error
## [1] 0.39
#The training error is 39%
#test error
pred_test <- predict(svm_cost, testing)
test.error <- mean(pred_test != testing$Purchase)
test.error
## [1] 0.39
The testing error is 39%.
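Training and testing error are both about 39%, which is roughly the share of MM purchases in the data; this suggests the heavily constrained cost = 0.01 fit is predicting the majority class CH for almost every observation. The prediction tables make this easy to verify (a sketch; this output is not shown in the original run):
#Check whether the low-cost radial fit collapses to the majority class
table(pred_train)
table(pred_test)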
## STEP 4 (TUNE FUNCTION)
tunesvm <- tune(svm, Purchase~., data = training, kernel = "radial",
                ranges = list(cost = c(seq(0.01, 0.1, by = 0.01),
                                       seq(0.1, 1, by = 0.1),
                                       seq(1, 10, by = 1))))
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 2
##
## - best performance: 0.18375
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.04241004
## 2 0.02 0.39000 0.04241004
## 3 0.03 0.38375 0.04126894
## 4 0.04 0.25875 0.04860913
## 5 0.05 0.22125 0.03586723
## 6 0.06 0.20375 0.03998698
## 7 0.07 0.20125 0.04185375
## 8 0.08 0.20125 0.04466309
## 9 0.09 0.20250 0.04479893
## 10 0.10 0.20125 0.04185375
## 11 0.10 0.20125 0.04185375
## 12 0.20 0.19000 0.04594683
## 13 0.30 0.18750 0.05496211
## 14 0.40 0.19000 0.04958158
## 15 0.50 0.19250 0.05596378
## 16 0.60 0.18500 0.05296750
## 17 0.70 0.18375 0.05337563
## 18 0.80 0.18625 0.05318012
## 19 0.90 0.18875 0.05415064
## 20 1.00 0.18750 0.05621141
## 21 1.00 0.18750 0.05621141
## 22 2.00 0.18375 0.05104804
## 23 3.00 0.18500 0.04851976
## 24 4.00 0.18500 0.04556741
## 25 5.00 0.18875 0.04581439
## 26 6.00 0.18750 0.04750731
## 27 7.00 0.19000 0.04632314
## 28 8.00 0.18750 0.04714045
## 29 9.00 0.19000 0.04706674
## 30 10.00 0.19125 0.04450733
#Pull out the row(s) of the tuning results with the lowest CV error
perf <- tunesvm$performances
perf[perf$error == min(perf$error), ]
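e1071 also provides a plot method for tune objects, which makes the error-versus-cost profile easier to read than the 30-row table (a minimal sketch):
#Visualize CV error across the cost grid
plot(tunesvm)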
## STEP 5 (TRAIN & TEST ERRORS FOR COST = dynamic.cost)
svm_cost <- svm(Purchase~., data=training, kernel = "radial", cost = dynamic.cost)
#train error
pred_train <- predict(svm_cost, training)
train.error <- mean(pred_train != training$Purchase)
#test error
pred_test <- predict(svm_cost, testing)
test.error <- mean(pred_test != testing$Purchase)
radial <- data.frame("Kernel" = "Radial", "Cost" = dynamic.cost,
                     "Training Error" = train.error,
                     "Testing Error" = test.error)
radial
## STEP 2 (MODEL FITTING, POLYNOMIAL KERNEL)
svm_cost <- svm(Purchase~., data=training, kernel = "polynomial", degree = 2, cost = 0.01)
summary(svm_cost)
##
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "polynomial",
## degree = 2, cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 628
##
## ( 312 316 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The SVM is fit with a polynomial kernel of degree 2 and is of type C-classification, with cost = 0.01 and 628 support vectors in total: 312 from the CH class and 316 from the MM class.
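The degree is held fixed at 2 here and only cost is tuned below. If desired, tune() can search over cost and degree jointly by adding degree to ranges (a hedged sketch with an illustrative grid; tune_poly and its grid are not part of the original analysis):
#Sketch: tune cost and degree jointly
tune_poly <- tune(svm, Purchase~., data = training, kernel = "polynomial",
                  ranges = list(cost = c(0.01, 0.1, 1, 10),
                                degree = c(2, 3)))
summary(tune_poly)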
## STEP 3 (Training and Testing Error)
#train error
pred_train <- predict(svm_cost, training)
train.error <- mean(pred_train != training$Purchase)
train.error
## [1] 0.365
The training error is 36.5%.
#test error
pred_test <- predict(svm_cost, testing)
test.error <- mean(pred_test != testing$Purchase)
test.error
## [1] 0.39
The testing error is 39%.
## STEP 4 (TUNE FUNCTION)
tunesvm <- tune(svm, Purchase~., data = training, kernel = "polynomial", degree = 2,
                ranges = list(cost = c(seq(0.01, 0.1, by = 0.01),
                                       seq(0.1, 1, by = 0.1),
                                       seq(1, 10, by = 1))))
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 2
##
## - best performance: 0.18125
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.04362084
## 2 0.02 0.39000 0.04362084
## 3 0.03 0.38500 0.04958158
## 4 0.04 0.26750 0.04937104
## 5 0.05 0.21875 0.05245699
## 6 0.06 0.20625 0.02716334
## 7 0.07 0.19375 0.03830162
## 8 0.08 0.19000 0.03670453
## 9 0.09 0.19000 0.03622844
## 10 0.10 0.19000 0.03622844
## 11 0.10 0.19000 0.03622844
## 12 0.20 0.18875 0.04185375
## 13 0.30 0.18625 0.04466309
## 14 0.40 0.18375 0.04860913
## 15 0.50 0.18750 0.04526159
## 16 0.60 0.18875 0.04348132
## 17 0.70 0.19000 0.04281744
## 18 0.80 0.18750 0.04526159
## 19 0.90 0.18375 0.04604120
## 20 1.00 0.18375 0.04450733
## 21 1.00 0.18375 0.04450733
## 22 2.00 0.18125 0.04535738
## 23 3.00 0.18875 0.04875178
## 24 4.00 0.19000 0.04706674
## 25 5.00 0.19000 0.04816061
## 26 6.00 0.19125 0.04896498
## 27 7.00 0.19125 0.04896498
## 28 8.00 0.19500 0.04866267
## 29 9.00 0.19250 0.04794383
## 30 10.00 0.19375 0.04759858
#Pull out the row(s) of the tuning results with the lowest CV error
perf <- tunesvm$performances
perf[perf$error == min(perf$error), ]
## STEP 5 (TRAIN & TEST ERRORS FOR COST = dynamic.cost)
svm_cost <- svm(Purchase~., data=training, kernel = "polynomial", degree = 2, cost = dynamic.cost)
#train error
pred_train <- predict(svm_cost, training)
train.error <- mean(pred_train != training$Purchase)
#test error
pred_test <- predict(svm_cost, testing)
test.error <- mean(pred_test != testing$Purchase)
polynomial <- data.frame("Kernel" = "Polynomial", "Cost" = dynamic.cost,
                         "Training Error" = train.error,
                         "Testing Error" = test.error)
polynomial
final <- rbind(linear, radial, polynomial)
final
The radial kernel gives the best training error (14.75%), while the linear kernel gives the best testing error (15%).
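Since the goal is out-of-sample performance, the kernel would normally be chosen by the lowest testing error; this can be read off the comparison table programmatically (a minimal sketch using the final data frame built above):
#Pick the kernel with the lowest testing error
final[which.min(final$Testing.Error), ]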