Load the required libraries and read the juice.csv file into the data frame `j`.
if(!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, e1071, caret)
library("tidyverse")
library("e1071")
library("caret")
# Read the juice purchase data; readr guesses the column types.
j <- read_csv(file = "juice.csv")
## Parsed with column specification:
## cols(
## Purchase = col_character(),
## WeekofPurchase = col_double(),
## StoreID = col_double(),
## PriceCH = col_double(),
## PriceMM = col_double(),
## DiscCH = col_double(),
## DiscMM = col_double(),
## SpecialCH = col_double(),
## SpecialMM = col_double(),
## LoyalCH = col_double(),
## SalePriceMM = col_double(),
## SalePriceCH = col_double(),
## PriceDiff = col_double(),
## Store7 = col_character(),
## PctDiscMM = col_double(),
## PctDiscCH = col_double(),
## ListPriceDiff = col_double(),
## STORE = col_double()
## )
# Peek at the first two rows of the raw data.
head(j, n = 2)
# Data manipulation ----
# Drop the columns that are excluded from the analysis.
j <- j[, !(names(j) %in% c("PriceCH", "PriceMM", "Store7", "ListPriceDiff", "STORE"))]
# Convert the response to a factor.
j[["Purchase"]] <- as.factor(j[["Purchase"]])
# 80/20 train/test split on the response; the seed is fixed so the split
# is reproducible.
set.seed(123)
trainindex <- createDataPartition(j[["Purchase"]], p = 0.8, list = FALSE)
training <- j[trainindex, ]
testing <- j[-trainindex, ]
# Baseline fit: linear-kernel SVM on all remaining predictors, cost = 0.01.
set.seed(123)
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "linear",
  cost = 0.01
)
summary(svm_cost)
##
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "linear",
## cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 446
##
## ( 222 224 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The SVM is fit with a linear kernel and is of type C-classification. The summary also shows cost = 0.01 and 446 support vectors in total: 222 from the CH class and 224 from the MM class.
# Training error ----
# Predicted classes for the rows the model was fitted on.
pred_train <- predict(svm_cost, newdata = training)
# In-class method: cross-tabulate predicted against actual labels.
conf.matrix <- table(Predicted = pred_train, Actual = training[["Purchase"]])
conf.matrix
## Actual
## Predicted CH MM
## CH 429 83
## MM 59 229
# Misclassification rate from the confusion matrix: off-diagonal count
# divided by the total. Computed from conf.matrix rather than from counts
# typed in by hand, so it stays correct if the data, seed or model change.
(sum(conf.matrix) - sum(diag(conf.matrix))) / sum(conf.matrix)
## [1] 0.1775
# Simpler method: share of training rows whose prediction differs from
# the actual label.
train.error <- mean(pred_train != training[["Purchase"]])
train.error
## [1] 0.1775
# Test error ----
# Predicted classes for the held-out rows.
pred_test <- predict(svm_cost, newdata = testing)
# In-class method: cross-tabulate predicted against actual labels.
conf.matrix <- table(Predicted = pred_test, Actual = testing[["Purchase"]])
conf.matrix
## Actual
## Predicted CH MM
## CH 106 16
## MM 16 62
# Misclassification rate from the confusion matrix: off-diagonal count
# divided by the total. Computed from conf.matrix rather than from counts
# typed in by hand, so it stays correct if the data, seed or model change.
(sum(conf.matrix) - sum(diag(conf.matrix))) / sum(conf.matrix)
## [1] 0.16
# Simpler method: share of held-out rows whose prediction differs from
# the actual label.
test.error <- mean(pred_test != testing[["Purchase"]])
test.error
## [1] 0.16
The training error is 17.75% and the testing error is 16%.
# Tune the cost of the linear-kernel SVM by cross-validation.
set.seed(123)
# Cost grid: steps of 0.01 up to 0.1, 0.1 up to 1, and 1 up to 10. Each
# sub-sequence starts one step past the previous end so that 0.1 and 1 are
# evaluated only once.
cost_grid <- c(
  seq(0.01, 0.1, by = 0.01),
  seq(0.2, 1, by = 0.1),
  seq(2, 10, by = 1)
)
# The argument must be spelled `kernel`: a misspelled name is absorbed by
# `...` without any error and the default kernel is tuned instead.
tunesvm <- tune(
  svm,
  Purchase ~ .,
  data = training,
  kernel = "linear",
  ranges = list(cost = cost_grid)
)
# Cost of the best model found by the grid search; reused for the refit.
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.7
##
## - best performance: 0.1775
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.04031129
## 2 0.02 0.39000 0.04031129
## 3 0.03 0.27625 0.03143004
## 4 0.04 0.22250 0.02554952
## 5 0.05 0.19250 0.03689324
## 6 0.06 0.18500 0.03670453
## 7 0.07 0.18750 0.03004626
## 8 0.08 0.19125 0.02829041
## 9 0.09 0.19000 0.02687419
## 10 0.10 0.19125 0.02829041
## 11 0.10 0.19125 0.02829041
## 12 0.20 0.19000 0.03525699
## 13 0.30 0.18000 0.03641962
## 14 0.40 0.17875 0.03488573
## 15 0.50 0.18125 0.03547789
## 16 0.60 0.17750 0.03622844
## 17 0.70 0.17750 0.03763863
## 18 0.80 0.17875 0.03729108
## 19 0.90 0.17750 0.03525699
## 20 1.00 0.17875 0.03230175
## 21 1.00 0.17875 0.03230175
## 22 2.00 0.18375 0.03120831
## 23 3.00 0.18375 0.02766993
## 24 4.00 0.18625 0.02461509
## 25 5.00 0.18875 0.02598744
## 26 6.00 0.19000 0.02486072
## 27 7.00 0.19000 0.02266912
## 28 8.00 0.19000 0.02486072
## 29 9.00 0.19125 0.02360703
## 30 10.00 0.19125 0.02433134
# Show the grid rows that reach the lowest cross-validated error.
data <- tunesvm$performances
data[data$error == min(data$error), ]
# Refit the linear-kernel SVM at the tuned cost.
set.seed(123)
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "linear",
  cost = dynamic.cost
)
# Misclassification rates on the training and held-out rows.
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
# One-row summary, later stacked with the other kernels.
linear <- data.frame(
  "Kernal" = "Linear",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
linear
## STEP 2 (model fitting)
# Radial-kernel SVM on all remaining predictors, cost = 0.01.
set.seed(123)
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "radial",
  cost = 0.01
)
summary(svm_cost)
##
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "radial",
## cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 627
##
## ( 312 315 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The SVM is fit with a radial kernel and is of type C-classification. It has cost = 0.01 and 627 support vectors in total: 312 from the CH class and 315 from the MM class.
## STEP 3 (training and testing error)
# Training error: share of training rows that are misclassified.
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
train.error
## [1] 0.39
# The training error printed above is 39%.
# Test error: share of held-out rows that are misclassified.
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
test.error
## [1] 0.39
The testing error is 39%.
## STEP 4 (tune function)
# Tune the cost of the radial-kernel SVM by cross-validation.
set.seed(123)
# Cost grid: steps of 0.01 up to 0.1, 0.1 up to 1, and 1 up to 10. Each
# sub-sequence starts one step past the previous end so that 0.1 and 1 are
# evaluated only once.
cost_grid <- c(
  seq(0.01, 0.1, by = 0.01),
  seq(0.2, 1, by = 0.1),
  seq(2, 10, by = 1)
)
# The argument must be spelled `kernel`: a misspelled name is absorbed by
# `...` without any error, so the requested kernel would only be used by
# coincidence with the default.
tunesvm <- tune(
  svm,
  Purchase ~ .,
  data = training,
  kernel = "radial",
  ranges = list(cost = cost_grid)
)
# Cost of the best model found by the grid search; reused for the refit.
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.7
##
## - best performance: 0.1775
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.04031129
## 2 0.02 0.39000 0.04031129
## 3 0.03 0.27625 0.03143004
## 4 0.04 0.22250 0.02554952
## 5 0.05 0.19250 0.03689324
## 6 0.06 0.18500 0.03670453
## 7 0.07 0.18750 0.03004626
## 8 0.08 0.19125 0.02829041
## 9 0.09 0.19000 0.02687419
## 10 0.10 0.19125 0.02829041
## 11 0.10 0.19125 0.02829041
## 12 0.20 0.19000 0.03525699
## 13 0.30 0.18000 0.03641962
## 14 0.40 0.17875 0.03488573
## 15 0.50 0.18125 0.03547789
## 16 0.60 0.17750 0.03622844
## 17 0.70 0.17750 0.03763863
## 18 0.80 0.17875 0.03729108
## 19 0.90 0.17750 0.03525699
## 20 1.00 0.17875 0.03230175
## 21 1.00 0.17875 0.03230175
## 22 2.00 0.18375 0.03120831
## 23 3.00 0.18375 0.02766993
## 24 4.00 0.18625 0.02461509
## 25 5.00 0.18875 0.02598744
## 26 6.00 0.19000 0.02486072
## 27 7.00 0.19000 0.02266912
## 28 8.00 0.19000 0.02486072
## 29 9.00 0.19125 0.02360703
## 30 10.00 0.19125 0.02433134
# Show the grid rows that reach the lowest cross-validated error.
data <- tunesvm$performances
data[data$error == min(data$error), ]
## STEP 5 (train and test errors for cost = dynamic cost)
# Refit the radial-kernel SVM at the tuned cost.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "radial",
  cost = dynamic.cost
)
# Training error.
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
# Test error.
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
# One-row summary, later stacked with the other kernels.
radial <- data.frame(
  "Kernal" = "Radial",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
radial
## STEP 2 (model fitting)
# Degree-2 polynomial-kernel SVM on all remaining predictors, cost = 0.01.
set.seed(123)
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  cost = 0.01
)
summary(svm_cost)
##
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "polynomial",
## degree = 2, cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 628
##
## ( 312 316 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The SVM is fit with a polynomial kernel and is of type C-classification. The summary also shows cost = 0.01 and 628 support vectors in total: 312 from the CH class and 316 from the MM class.
## STEP 3 (training and testing error)
# Training error: share of training rows that are misclassified.
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
train.error
## [1] 0.36
The training error is 36%.
# Test error: share of held-out rows that are misclassified.
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
test.error
## [1] 0.385
The testing error is 38.5%.
## STEP 4 (tune function)
# Tune the cost of the degree-2 polynomial-kernel SVM by cross-validation.
set.seed(123)
# Cost grid: steps of 0.01 up to 0.1, 0.1 up to 1, and 1 up to 10. Each
# sub-sequence starts one step past the previous end so that 0.1 and 1 are
# evaluated only once.
cost_grid <- c(
  seq(0.01, 0.1, by = 0.01),
  seq(0.2, 1, by = 0.1),
  seq(2, 10, by = 1)
)
# The argument must be spelled `kernel`: a misspelled name is absorbed by
# `...` without any error and the default kernel is tuned instead of the
# polynomial one.
tunesvm <- tune(
  svm,
  Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  ranges = list(cost = cost_grid)
)
# Cost of the best model found by the grid search; reused for the refit.
dynamic.cost <- tunesvm$best.model$cost
summary(tunesvm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.7
##
## - best performance: 0.1775
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.04031129
## 2 0.02 0.39000 0.04031129
## 3 0.03 0.27625 0.03143004
## 4 0.04 0.22250 0.02554952
## 5 0.05 0.19250 0.03689324
## 6 0.06 0.18500 0.03670453
## 7 0.07 0.18750 0.03004626
## 8 0.08 0.19125 0.02829041
## 9 0.09 0.19000 0.02687419
## 10 0.10 0.19125 0.02829041
## 11 0.10 0.19125 0.02829041
## 12 0.20 0.19000 0.03525699
## 13 0.30 0.18000 0.03641962
## 14 0.40 0.17875 0.03488573
## 15 0.50 0.18125 0.03547789
## 16 0.60 0.17750 0.03622844
## 17 0.70 0.17750 0.03763863
## 18 0.80 0.17875 0.03729108
## 19 0.90 0.17750 0.03525699
## 20 1.00 0.17875 0.03230175
## 21 1.00 0.17875 0.03230175
## 22 2.00 0.18375 0.03120831
## 23 3.00 0.18375 0.02766993
## 24 4.00 0.18625 0.02461509
## 25 5.00 0.18875 0.02598744
## 26 6.00 0.19000 0.02486072
## 27 7.00 0.19000 0.02266912
## 28 8.00 0.19000 0.02486072
## 29 9.00 0.19125 0.02360703
## 30 10.00 0.19125 0.02433134
# Show the grid rows that reach the lowest cross-validated error.
data <- tunesvm$performances
data[data$error == min(data$error), ]
## STEP 5 (train and test errors for cost = dynamic cost)
# Refit the degree-2 polynomial-kernel SVM at the tuned cost.
svm_cost <- svm(
  Purchase ~ .,
  data = training,
  kernel = "polynomial",
  degree = 2,
  cost = dynamic.cost
)
# Training error.
pred_train <- predict(svm_cost, newdata = training)
train.error <- mean(pred_train != training[["Purchase"]])
# Test error.
pred_test <- predict(svm_cost, newdata = testing)
test.error <- mean(pred_test != testing[["Purchase"]])
# One-row summary for this kernel.
polynomial <- data.frame(
  "Kernal" = "Polynomial",
  "Cost" = dynamic.cost,
  "Training Error" = train.error,
  "Testing Error" = test.error
)
polynomial
# Stack the three per-kernel summaries into one comparison table.
final <- rbind(linear, radial, polynomial)
final
The radial kernel performs slightly better than the linear kernel and appears to be the most effective of the three, giving a 15.75% training error and a 15.5% testing error.