library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(bestglm)
## Warning: package 'bestglm' was built under R version 4.0.5
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.5
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.0.5
## Loaded ROSE 0.0-4
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(mclust)
## Warning: package 'mclust' was built under R version 4.0.5
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
##
## map
library(readxl)
library(rminer)
## Warning: package 'rminer' was built under R version 4.0.5
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(corrplot)
## corrplot 0.90 loaded
df_bb_train <- read_excel("BBBC-Train.xlsx")
df_bb_test <- read_excel("BBBC-Test.xlsx")
df_bb_train <- df_bb_train[,-1] # drop the observation-number column
df_bb_test <- df_bb_test[,-1]
df_bb_testY <- df_bb_test[,1] # keep the test response (Choice) separately
df_bb_test <- df_bb_test[,-1] # remove Choice from the test covariates
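# Sanity check (sketch, not in the original run): after dropping Choice, the
# test covariates should line up exactly with the training columns
stopifnot(identical(names(df_bb_train)[-1], names(df_bb_test)))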
dim(df_bb_train)
## [1] 1600 11
dim(df_bb_test)
## [1] 2300 10
anyNA(df_bb_train)
## [1] FALSE
df_bb_train$Choice <- as.factor(df_bb_train$Choice)
df_bb_train$Gender <- as.factor(df_bb_train$Gender)
# (the test-set Choice column was split off into df_bb_testY above)
df_bb_test$Gender <- as.factor(df_bb_test$Gender)
df_bb_testY$Choice <- as.factor(df_bb_testY$Choice)
table(df_bb_testY$Choice)
##
## 0 1
## 2096 204
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.0.5
# one-hot encoding check for Gender (the factor coding is used directly in the models below)
dummy_data <- fastDummies::dummy_cols(df_bb_train[,c(2)], remove_first_dummy = TRUE)
names(dummy_data)
## [1] "Gender" "Gender_1"
clPairs(df_bb_train[,3:11], cl = df_bb_train$Choice)
clp <- clPairs(df_bb_train[,3:11], cl = df_bb_train$Choice, lower.panel = NULL)
clPairsLegend(0.1, 0.6, class = clp$class,
col = clp$col, pch = clp$pch,
title = "Bookbinders data")
head(df_bb_train)
## # A tibble: 6 x 11
## Choice Gender Amount_purchased Frequency Last_purchase First_purchase P_Child
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 113 8 1 8 0
## 2 1 1 418 6 11 66 0
## 3 1 1 336 18 6 32 2
## 4 1 1 180 16 5 42 2
## 5 1 0 320 2 3 18 0
## 6 1 1 268 4 1 4 0
## # ... with 4 more variables: P_Youth <dbl>, P_Cook <dbl>, P_DIY <dbl>,
## # P_Art <dbl>
The variables in the data frame are:
Choice - Did the customer buy the book (1) or not (0)?
Gender - 1 = male, 0 = female.
Amount_purchased - Total amount spent at the BBB Club.
Frequency - Total frequency of purchases at the BBB Club.
Last_purchase - Months since last purchase.
First_purchase - Months since first purchase.
P_Child - Number of children's books purchased.
P_Youth - Number of youth books purchased.
P_Cook - Number of cooking books purchased.
P_DIY - Number of do-it-yourself books purchased.
P_Art - Number of art books purchased.
dim(df_bb_train)
## [1] 1600 11
names(df_bb_train)
## [1] "Choice" "Gender" "Amount_purchased" "Frequency"
## [5] "Last_purchase" "First_purchase" "P_Child" "P_Youth"
## [9] "P_Cook" "P_DIY" "P_Art"
str(df_bb_train)
## tibble [1,600 x 11] (S3: tbl_df/tbl/data.frame)
## $ Choice : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Gender : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 1 2 2 ...
## $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
## $ Frequency : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
## $ Last_purchase : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
## $ First_purchase : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
## $ P_Child : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
## $ P_Youth : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
## $ P_Cook : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
## $ P_DIY : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
## $ P_Art : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
summary(df_bb_train)
## Choice Gender Amount_purchased Frequency Last_purchase
## 0:1200 0: 546 Min. : 15.0 Min. : 2.00 Min. : 1.000
## 1: 400 1:1054 1st Qu.:126.8 1st Qu.: 6.00 1st Qu.: 1.000
## Median :203.0 Median :12.00 Median : 2.000
## Mean :200.9 Mean :12.31 Mean : 3.199
## 3rd Qu.:273.0 3rd Qu.:16.00 3rd Qu.: 4.000
## Max. :474.0 Max. :36.00 Max. :12.000
## First_purchase P_Child P_Youth P_Cook
## Min. : 2.00 Min. :0.0000 Min. :0.0000 Min. :0.00
## 1st Qu.:12.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00
## Median :18.00 Median :0.0000 Median :0.0000 Median :0.00
## Mean :22.58 Mean :0.7394 Mean :0.3375 Mean :0.76
## 3rd Qu.:30.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.00
## Max. :96.00 Max. :8.0000 Max. :4.0000 Max. :6.00
## P_DIY P_Art
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.000
## Mean :0.3912 Mean :0.425
## 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :4.0000 Max. :5.000
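# Worth noting before modeling (sketch, not in the original run): buyers are
# 25% of the training rows but only ~8.9% of the test set tabulated earlier,
# so raw accuracy on the test set is dominated by the majority class
prop.table(table(df_bb_train$Choice))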
hist(df_bb_train$Amount_purchased, freq = FALSE)
lines(density(df_bb_train$Amount_purchased), col = 4)
boxplot(df_bb_train$Amount_purchased)
points(mean(df_bb_train$Amount_purchased, na.rm=TRUE), col="red")
table(df_bb_train$P_Art,df_bb_train$Choice)
##
## 0 1
## 0 924 176
## 1 232 134
## 2 38 59
## 3 4 25
## 4 2 5
## 5 0 1
table(df_bb_train$Last_purchase)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 586 499 72 67 62 60 43 50 42 42 43 34
table(df_bb_train$First_purchase)
##
## 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40
## 55 58 66 99 113 126 115 143 100 70 71 54 39 45 57 47 34 33 40 27
## 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80
## 16 25 19 14 20 10 13 7 10 11 11 3 17 8 5 4 2 1 4 1
## 82 84 86 96
## 3 2 1 1
hist(df_bb_train$Last_purchase, breaks = seq(0.5, 12.5, by = 1), freq = TRUE)
axis(1, at = seq(1, 12, by = 1))
table(df_bb_train$Gender)
##
## 0 1
## 546 1054
tab <- table(df_bb_train$Gender)
prop.table(tab)*100
##
## 0 1
## 34.125 65.875
barplot(tab, names.arg = c("Female","Male"), col = "#69b3a2",
        xlab = "Gender", ylab = "Frequency",
        font.lab = 1, col.lab = "orange", cex.lab = 1)
# rcorr() needs a numeric matrix, so Choice is dropped and the Gender factor is
# re-coded back to 0/1 before computing the correlations
cor_5 <- rcorr(as.matrix(cbind(Gender = as.numeric(as.character(df_bb_train$Gender)),
                               df_bb_train[, 3:11])))
M <- cor_5$r
p_mat <- cor_5$P
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M, method = "color", col = col(200),
type = "upper", order = "hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
# Combine with significance level
p.mat = p_mat, sig.level = 0.01,
# hide correlation coefficient on the principal diagonal
diag = FALSE
)
plot( Amount_purchased ~ Last_purchase, data = df_bb_train,xlab="Last Purchased",ylab="Amount Purchased")
plot( Amount_purchased ~ Gender, data = df_bb_train,xlab="Gender",ylab="Amount Purchased")
plot( Choice ~ Gender, data = df_bb_train, xlab="Gender", ylab="Choice")
tab <- table(df_bb_train$Gender, df_bb_train$Choice)
mosaicplot(tab, xlab = "Gender", ylab = "Choice")
prop.table(tab, 1)*100
##
## 0 1
## 0 66.48352 33.51648
## 1 79.41176 20.58824
# Scaling the numeric covariates before we start the modeling
df_bb_train_scaled <- scale(df_bb_train[,3:11], scale = TRUE, center = TRUE)
# keep the result a data frame so Gender and Choice stay factors
df_bb_train_scaled <- data.frame(df_bb_train_scaled, Gender = df_bb_train$Gender, Choice = df_bb_train$Choice)
# Model Selection using AIC and BIC
#----------------------------------------
#Full Model
glm_full_bw <- glm(Choice ~ ., data = df_bb_train, family = "binomial")
#Null Model
glm_null_bw <- glm(Choice ~ 1, data = df_bb_train, family = "binomial")
# stepwise selection with AIC & BIC
#**************************************
#**Null to Full step wise selection****
#**************************************
bw_AIC <- step(glm_null_bw, scope = list(upper = glm_full_bw),
               direction = "both", test = "Chisq", trace = F)
# stepwise selection with BIC (k = log(n) penalty)
bw_BIC <- step(glm_null_bw, scope = list(upper = glm_full_bw),
               direction = "both", test = "Chisq", trace = F,
               k = log(nrow(df_bb_train)))
summary(bw_AIC)
##
## Call:
## glm(formula = Choice ~ P_Art + Frequency + Gender + P_Cook +
## P_DIY + Amount_purchased + P_Child + Last_purchase + P_Youth,
## family = "binomial", data = df_bb_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.44132 -0.66647 -0.43745 -0.01855 2.72460
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2833949 0.2062721 -1.374 0.1695
## P_Art 0.6643371 0.1255243 5.292 1.21e-07 ***
## Frequency -0.0903261 0.0106304 -8.497 < 2e-16 ***
## Gender1 -0.8660575 0.1373268 -6.307 2.85e-10 ***
## P_Cook -0.9330131 0.1190073 -7.840 4.51e-15 ***
## P_DIY -0.9101106 0.1433591 -6.348 2.17e-10 ***
## Amount_purchased 0.0018357 0.0007908 2.321 0.0203 *
## P_Child -0.8181807 0.1163377 -7.033 2.02e-12 ***
## Last_purchase 0.5536689 0.0784519 7.057 1.70e-12 ***
## P_Youth -0.6424923 0.1432548 -4.485 7.29e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1393.5 on 1590 degrees of freedom
## AIC: 1413.5
##
## Number of Fisher Scoring iterations: 5
summary(bw_BIC)
##
## Call:
## glm(formula = Choice ~ P_Art + Frequency + Gender + P_Cook +
## P_DIY, family = "binomial", data = df_bb_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.37774 -0.69368 -0.47955 -0.03998 2.74167
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.02017 0.15704 0.128 0.89781
## P_Art 1.25836 0.09628 13.070 < 2e-16 ***
## Frequency -0.08719 0.01037 -8.403 < 2e-16 ***
## Gender1 -0.79063 0.13339 -5.927 3.09e-09 ***
## P_Cook -0.28822 0.07011 -4.111 3.94e-05 ***
## P_DIY -0.28378 0.10537 -2.693 0.00708 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1459.9 on 1594 degrees of freedom
## AIC: 1471.9
##
## Number of Fisher Scoring iterations: 5
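# Side-by-side comparison of the two selected models (sketch; the BIC model is
# nested in the AIC model, so a likelihood-ratio test applies)
AIC(bw_AIC, bw_BIC)
BIC(bw_AIC, bw_BIC)
anova(bw_BIC, bw_AIC, test = "Chisq")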
#Defining the training controls for multiple models
fitControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats=3
)
# Training the logistic regression model with repeated 10-fold CV
model_lr <- train(Choice ~ ., method = 'glm', data = df_bb_train, trControl = fitControl)
summary(model_lr)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.38586 -0.66728 -0.43696 -0.02242 2.72238
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3515281 0.2143839 -1.640 0.1011
## Gender1 -0.8632319 0.1374499 -6.280 3.38e-10 ***
## Amount_purchased 0.0018641 0.0007918 2.354 0.0186 *
## Frequency -0.0755142 0.0165937 -4.551 5.35e-06 ***
## Last_purchase 0.6117713 0.0938127 6.521 6.97e-11 ***
## First_purchase -0.0147792 0.0128027 -1.154 0.2483
## P_Child -0.8112489 0.1167067 -6.951 3.62e-12 ***
## P_Youth -0.6370422 0.1433778 -4.443 8.87e-06 ***
## P_Cook -0.9230066 0.1194814 -7.725 1.12e-14 ***
## P_DIY -0.9058697 0.1437025 -6.304 2.90e-10 ***
## P_Art 0.6861124 0.1270176 5.402 6.60e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1392.2 on 1589 degrees of freedom
## AIC: 1414.2
##
## Number of Fisher Scoring iterations: 5
# Predicting on the training data with the logistic regression model
pred_lr <- predict(object = model_lr, df_bb_train)
# Checking the (in-sample) accuracy; note that caret's confusionMatrix()
# expects (predictions, reference), so passing the truth first, as here,
# transposes the table relative to the later test-set calls
confusionMatrix(df_bb_train$Choice, pred_lr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1120 80
## 1 240 160
##
## Accuracy : 0.8
## 95% CI : (0.7795, 0.8193)
## No Information Rate : 0.85
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3846
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8235
## Specificity : 0.6667
## Pos Pred Value : 0.9333
## Neg Pred Value : 0.4000
## Prevalence : 0.8500
## Detection Rate : 0.7000
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.7451
##
## 'Positive' Class : 0
##
varImp(model_lr) %>% ggplot() + geom_vline(xintercept = 55)
# createDataPartition() returns a one-column matrix; drop it to a vector so
# tibble row-indexing does not warn
index <- caret::createDataPartition(df_bb_train$Choice, p = 0.8, list = FALSE)[, 1]
X_train <- df_bb_train[index,]
X_val <- df_bb_train[-index,]
X_train_tmp <- X_train[,1:2]
X_train_scaled <- scale(X_train[,3:11], scale = TRUE, center = TRUE)
X_train_scaled <- cbind(X_train_tmp,X_train_scaled)
# Training the logistic regression model on the scaled covariates
model_lr_scaled <- train(Choice ~ ., method = 'glm', data = X_train_scaled, trControl = fitControl)
summary(model_lr_scaled)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.39540 -0.65485 -0.42257 -0.02631 2.72639
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.82398 0.11965 -6.887 5.70e-12 ***
## Gender1 -0.94576 0.15607 -6.060 1.36e-09 ***
## Amount_purchased 0.21835 0.08543 2.556 0.010593 *
## Frequency -0.55649 0.14906 -3.733 0.000189 ***
## Last_purchase 1.95980 0.32652 6.002 1.95e-09 ***
## First_purchase -0.31326 0.24239 -1.292 0.196213
## P_Child -0.95572 0.14175 -6.742 1.56e-11 ***
## P_Youth -0.36535 0.10032 -3.642 0.000271 ***
## P_Cook -0.94119 0.14221 -6.618 3.63e-11 ***
## P_DIY -0.62619 0.10991 -5.698 1.22e-08 ***
## P_Art 0.52693 0.10833 4.864 1.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1439.6 on 1279 degrees of freedom
## Residual deviance: 1092.7 on 1269 degrees of freedom
## AIC: 1114.7
##
## Number of Fisher Scoring iterations: 5
# Predicting with the scaled-covariate model on the *unscaled* validation split
pred_lr_scaled <- predict(object = model_lr_scaled, X_val)
# Checking the accuracy; the scale mismatch between training and validation
# inputs is what drives the poor numbers below
confusionMatrix(X_val$Choice, pred_lr_scaled)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16 224
## 1 3 77
##
## Accuracy : 0.2906
## 95% CI : (0.2415, 0.3437)
## No Information Rate : 0.9406
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0152
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.84211
## Specificity : 0.25581
## Pos Pred Value : 0.06667
## Neg Pred Value : 0.96250
## Prevalence : 0.05937
## Detection Rate : 0.05000
## Detection Prevalence : 0.75000
## Balanced Accuracy : 0.54896
##
## 'Positive' Class : 0
##
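# The collapse in accuracy above comes from predicting on the unscaled X_val
# with a model fit to scaled covariates. A minimal fix (sketch, not part of the
# original run; object names here are ours): reuse the training means and SDs
# on the validation covariates before predicting.
train_num <- scale(X_train[, 3:11])  # keeps "scaled:center"/"scaled:scale" attributes
X_val_num <- scale(X_val[, 3:11],
                   center = attr(train_num, "scaled:center"),
                   scale  = attr(train_num, "scaled:scale"))
X_val_scaled <- cbind(X_val[, 1:2], X_val_num)
pred_lr_scaled_fixed <- predict(model_lr_scaled, X_val_scaled)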
varImp(model_lr_scaled)
## glm variable importance
##
## Overall
## P_Child 100.00
## P_Cook 97.73
## Gender1 87.48
## Last_purchase 86.41
## P_DIY 80.83
## P_Art 65.53
## Frequency 44.79
## P_Youth 43.11
## Amount_purchased 23.18
## First_purchase 0.00
# Predicting with the scaled-covariate model on the (unscaled) test set;
# the same scale mismatch degrades accuracy here as well
pred_lr_test_s <- predict(object = model_lr_scaled, df_bb_test)
confusionMatrix(pred_lr_test_s, df_bb_testY$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 197 7
## 1 1899 197
##
## Accuracy : 0.1713
## 95% CI : (0.1561, 0.1873)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0115
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.09399
## Specificity : 0.96569
## Pos Pred Value : 0.96569
## Neg Pred Value : 0.09399
## Prevalence : 0.91130
## Detection Rate : 0.08565
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.52984
##
## 'Positive' Class : 0
##
# Training the logistic regression model on the unscaled training split
model_lr_train <- train(Choice ~ ., method = 'glm', data = X_train, trControl = fitControl)
summary(model_lr_train)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.39540 -0.65485 -0.42257 -0.02631 2.72639
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4543298 0.2464849 -1.843 0.065295 .
## Gender1 -0.9457572 0.1560697 -6.060 1.36e-09 ***
## Amount_purchased 0.0022921 0.0008968 2.556 0.010593 *
## Frequency -0.0701082 0.0187791 -3.733 0.000189 ***
## Last_purchase 0.6551532 0.1091559 6.002 1.95e-09 ***
## First_purchase -0.0191319 0.0148032 -1.292 0.196213
## P_Child -0.9107911 0.1350843 -6.742 1.56e-11 ***
## P_Youth -0.5747093 0.1578094 -3.642 0.000271 ***
## P_Cook -0.9214163 0.1392192 -6.618 3.63e-11 ***
## P_DIY -0.9235212 0.1620917 -5.698 1.22e-08 ***
## P_Art 0.6996152 0.1438373 4.864 1.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1439.6 on 1279 degrees of freedom
## Residual deviance: 1092.7 on 1269 degrees of freedom
## AIC: 1114.7
##
## Number of Fisher Scoring iterations: 5
# Predicting using the logistic regression model
pred_lr_val <- predict(object = model_lr_train, X_val)
# Checking the accuracy on the validation split
confusionMatrix(X_val$Choice, pred_lr_val)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 218 22
## 1 50 30
##
## Accuracy : 0.775
## 95% CI : (0.7252, 0.8196)
## No Information Rate : 0.8375
## P-Value [Acc > NIR] : 0.998568
##
## Kappa : 0.3208
##
## Mcnemar's Test P-Value : 0.001463
##
## Sensitivity : 0.8134
## Specificity : 0.5769
## Pos Pred Value : 0.9083
## Neg Pred Value : 0.3750
## Prevalence : 0.8375
## Detection Rate : 0.6813
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.6952
##
## 'Positive' Class : 0
##
varImp(model_lr_train)
## glm variable importance
##
## Overall
## P_Child 100.00
## P_Cook 97.73
## Gender1 87.48
## Last_purchase 86.41
## P_DIY 80.83
## P_Art 65.53
## Frequency 44.79
## P_Youth 43.11
## Amount_purchased 23.18
## First_purchase 0.00
# Reduced model using the significant terms from the full fit:
# P_Cook + P_Child + Last_purchase + P_DIY + Gender + P_Art + Frequency
model_lr_train <- train(Choice ~ P_Cook + P_Child + Last_purchase + P_DIY + Gender + P_Art + Frequency,
                        method = 'glm', data = X_train, trControl = fitControl)
summary(model_lr_train)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.36861 -0.65670 -0.44225 -0.01068 2.80924
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.03777 0.18361 -0.206 0.837
## P_Cook -0.69320 0.12091 -5.733 9.86e-09 ***
## P_Child -0.69589 0.11885 -5.855 4.77e-09 ***
## Last_purchase 0.39240 0.06879 5.705 1.17e-08 ***
## P_DIY -0.72775 0.15005 -4.850 1.23e-06 ***
## Gender1 -0.92497 0.15418 -5.999 1.98e-09 ***
## P_Art 0.88655 0.13029 6.804 1.02e-11 ***
## Frequency -0.08761 0.01158 -7.564 3.91e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1439.6 on 1279 degrees of freedom
## Residual deviance: 1114.9 on 1272 degrees of freedom
## AIC: 1130.9
##
## Number of Fisher Scoring iterations: 5
# Predicting using the reduced logistic regression model
pred_lr_val <- predict(object = model_lr_train, X_val)
# Checking the accuracy on the validation split
confusionMatrix(X_val$Choice, pred_lr_val)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 218 22
## 1 53 27
##
## Accuracy : 0.7656
## 95% CI : (0.7153, 0.811)
## No Information Rate : 0.8469
## P-Value [Acc > NIR] : 0.999949
##
## Kappa : 0.2823
##
## Mcnemar's Test P-Value : 0.000532
##
## Sensitivity : 0.8044
## Specificity : 0.5510
## Pos Pred Value : 0.9083
## Neg Pred Value : 0.3375
## Prevalence : 0.8469
## Detection Rate : 0.6813
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.6777
##
## 'Positive' Class : 0
##
varImp(model_lr_train)
## glm variable importance
##
## Overall
## Frequency 100.00
## P_Art 72.00
## Gender1 42.35
## P_Child 37.03
## P_Cook 32.54
## Last_purchase 31.49
## P_DIY 0.00
pred_lr_test <- predict(object = model_lr_train, df_bb_test)
# Checking the accuracy on the test set
confusionMatrix(pred_lr_test, df_bb_testY$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1979 130
## 1 117 74
##
## Accuracy : 0.8926
## 95% CI : (0.8792, 0.905)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9991
##
## Kappa : 0.316
##
## Mcnemar's Test P-Value : 0.4451
##
## Sensitivity : 0.9442
## Specificity : 0.3627
## Pos Pred Value : 0.9384
## Neg Pred Value : 0.3874
## Prevalence : 0.9113
## Detection Rate : 0.8604
## Detection Prevalence : 0.9170
## Balanced Accuracy : 0.6535
##
## 'Positive' Class : 0
##
# Further reduced logistic regression model: P_Cook + P_Art + Frequency
model_lr_train <- train(Choice ~ P_Cook + P_Art + Frequency,
                        method = 'glm', data = X_train, trControl = fitControl)
summary(model_lr_train)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.48499 -0.70234 -0.51679 -0.06692 2.81261
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.57580 0.14757 -3.902 9.55e-05 ***
## P_Cook -0.26379 0.07593 -3.474 0.000513 ***
## P_Art 1.18765 0.10213 11.629 < 2e-16 ***
## Frequency -0.08563 0.01133 -7.559 4.07e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1439.6 on 1279 degrees of freedom
## Residual deviance: 1192.6 on 1276 degrees of freedom
## AIC: 1200.6
##
## Number of Fisher Scoring iterations: 5
# Predicting using the three-variable logistic regression model
pred_lr_val <- predict(object = model_lr_train, X_val)
# Checking the accuracy on the validation split
confusionMatrix(X_val$Choice, pred_lr_val)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 228 12
## 1 58 22
##
## Accuracy : 0.7812
## 95% CI : (0.7319, 0.8253)
## No Information Rate : 0.8938
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2784
##
## Mcnemar's Test P-Value : 7.51e-08
##
## Sensitivity : 0.7972
## Specificity : 0.6471
## Pos Pred Value : 0.9500
## Neg Pred Value : 0.2750
## Prevalence : 0.8938
## Detection Rate : 0.7125
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.7221
##
## 'Positive' Class : 0
##
varImp(model_lr_train)
## glm variable importance
##
## Overall
## P_Art 100.00
## Frequency 50.09
## P_Cook 0.00
pred_lr_test <- predict(object = model_lr_train, df_bb_test)
# Checking the accuracy on the test set
confusionMatrix(pred_lr_test, df_bb_testY$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1985 139
## 1 111 65
##
## Accuracy : 0.8913
## 95% CI : (0.8779, 0.9037)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.99954
##
## Kappa : 0.2832
##
## Mcnemar's Test P-Value : 0.08771
##
## Sensitivity : 0.9470
## Specificity : 0.3186
## Pos Pred Value : 0.9346
## Neg Pred Value : 0.3693
## Prevalence : 0.9113
## Detection Rate : 0.8630
## Detection Prevalence : 0.9235
## Balanced Accuracy : 0.6328
##
## 'Positive' Class : 0
##
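# ROCR is loaded at the top but never used; a sketch (object names are ours) of
# an ROC curve and AUC for this reduced logistic model on the test set, using
# class-1 probabilities rather than hard 0/1 labels
prob_lr_test <- predict(model_lr_train, df_bb_test, type = "prob")[, "1"]
pred_obj <- ROCR::prediction(prob_lr_test, df_bb_testY$Choice)
plot(ROCR::performance(pred_obj, "tpr", "fpr")); abline(0, 1, lty = 2)
ROCR::performance(pred_obj, "auc")@y.values[[1]]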
# SVM with e1071's default radial kernel
model_svm <- svm(Choice ~ ., data = X_train)
summary(model_svm)
##
## Call:
## svm(formula = Choice ~ ., data = X_train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 646
##
## ( 290 356 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
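# Sketch: e1071::tune.svm can grid-search cost/gamma for the radial kernel
# (the grids below are illustrative choices, not from the original analysis)
tuned_svm <- e1071::tune.svm(Choice ~ ., data = X_train,
                             cost = 2^(-1:3), gamma = 2^(-4:0))
tuned_svm$best.parameters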
val_pred_svm<-predict(model_svm,X_val)
confusionMatrix(X_val$Choice,val_pred_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 230 10
## 1 62 18
##
## Accuracy : 0.775
## 95% CI : (0.7252, 0.8196)
## No Information Rate : 0.9125
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.234
##
## Mcnemar's Test P-Value : 1.851e-09
##
## Sensitivity : 0.7877
## Specificity : 0.6429
## Pos Pred Value : 0.9583
## Neg Pred Value : 0.2250
## Prevalence : 0.9125
## Detection Rate : 0.7188
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.7153
##
## 'Positive' Class : 0
##
test_pred_svm<-predict(model_svm,df_bb_test)
confusionMatrix(test_pred_svm,df_bb_testY$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2019 154
## 1 77 50
##
## Accuracy : 0.8996
## 95% CI : (0.8866, 0.9116)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9766
##
## Kappa : 0.2511
##
## Mcnemar's Test P-Value : 5.72e-07
##
## Sensitivity : 0.9633
## Specificity : 0.2451
## Pos Pred Value : 0.9291
## Neg Pred Value : 0.3937
## Prevalence : 0.9113
## Detection Rate : 0.8778
## Detection Prevalence : 0.9448
## Balanced Accuracy : 0.6042
##
## 'Positive' Class : 0
##
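# ROSE is loaded at the top but never used; given the low specificity on class 1
# above, a rebalancing sketch (assumed, not part of the original run)
set.seed(1)
bal_train <- ROSE::ovun.sample(Choice ~ ., data = as.data.frame(X_train),
                               method = "both", N = nrow(X_train), p = 0.5)$data
model_svm_bal <- svm(Choice ~ ., data = bal_train)
confusionMatrix(predict(model_svm_bal, df_bb_test), df_bb_testY$Choice)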
svm_polyCl <- svm(formula = Choice ~ .,
                  data = X_train,
                  type = 'C-classification',
                  kernel = 'polynomial')
summary(svm_polyCl)
##
## Call:
## svm(formula = Choice ~ ., data = X_train, type = "C-classification",
## kernel = "polynomial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 595
##
## ( 282 313 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
val_pred_svmR<-predict(svm_polyCl,X_val)
confusionMatrix(X_val$Choice,val_pred_svmR)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 232 8
## 1 67 13
##
## Accuracy : 0.7656
## 95% CI : (0.7153, 0.811)
## No Information Rate : 0.9344
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1713
##
## Mcnemar's Test P-Value : 2.124e-11
##
## Sensitivity : 0.7759
## Specificity : 0.6190
## Pos Pred Value : 0.9667
## Neg Pred Value : 0.1625
## Prevalence : 0.9344
## Detection Rate : 0.7250
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.6975
##
## 'Positive' Class : 0
##
test_pred_svmR<-predict(svm_polyCl,df_bb_test)
confusionMatrix(test_pred_svmR,df_bb_testY$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2034 171
## 1 62 33
##
## Accuracy : 0.8987
## 95% CI : (0.8856, 0.9107)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9834
##
## Kappa : 0.1742
##
## Mcnemar's Test P-Value : 1.491e-12
##
## Sensitivity : 0.9704
## Specificity : 0.1618
## Pos Pred Value : 0.9224
## Neg Pred Value : 0.3474
## Prevalence : 0.9113
## Detection Rate : 0.8843
## Detection Prevalence : 0.9587
## Balanced Accuracy : 0.5661
##
## 'Positive' Class : 0
##
# Fit an SVM with a cross-validated grid search over polynomial-kernel parameters
train_control <- trainControl(method = "repeatedcv", number = 2, repeats = 1)
svm_cv <- train(Choice ~ ., data = X_train, method = "svmPoly",
                trControl = train_control, preProcess = c("center", "scale"),
                tuneLength = 4)
# Print the best tuning parameter sigma and C that maximizes model accuracy
svm_cv$bestTune
## degree scale C
## 13 1 1 0.25
svm_cv$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 0.25
##
## Polynomial kernel function.
## Hyperparameters : degree = 1 scale = 1 offset = 1
##
## Number of Support Vectors : 583
##
## Objective Function Value : -143.5434
## Training error : 0.192187
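# Sketch: the resampled accuracy for every degree/scale/C combination sits in
# svm_cv$results; sorting it shows how flat the tuning surface is
head(svm_cv$results[order(-svm_cv$results$Accuracy), ])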
pred_svm_poly<-predict(svm_cv,df_bb_test)
confusionMatrix(pred_svm_poly,df_bb_testY$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2010 146
## 1 86 58
##
## Accuracy : 0.8991
## 95% CI : (0.8861, 0.9111)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.9802316
##
## Kappa : 0.2805
##
## Mcnemar's Test P-Value : 0.0001073
##
## Sensitivity : 0.9590
## Specificity : 0.2843
## Pos Pred Value : 0.9323
## Neg Pred Value : 0.4028
## Prevalence : 0.9113
## Detection Rate : 0.8739
## Detection Prevalence : 0.9374
## Balanced Accuracy : 0.6216
##
## 'Positive' Class : 0
##
# Re-read the training data with Choice kept numeric for linear regression
bb_train <- read_excel("BBBC-Train.xlsx")
bb_train <- bb_train[,-1]
X_train_reg <- bb_train[index,]
X_val_reg <- bb_train[-index,]
# Training a linear regression (linear probability) model on the 0/1 response
model_lreg_train <- train(Choice ~ ., method = 'lm', data = X_train_reg, trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
summary(model_lreg_train)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9736 -0.2440 -0.1101 0.1557 1.0600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3542684 0.0347210 10.203 < 2e-16 ***
## Gender -0.1396575 0.0223084 -6.260 5.25e-10 ***
## Amount_purchased 0.0003144 0.0001227 2.563 0.010494 *
## Frequency -0.0087599 0.0024504 -3.575 0.000364 ***
## Last_purchase 0.1005807 0.0154818 6.497 1.18e-10 ***
## First_purchase -0.0022340 0.0020597 -1.085 0.278293
## P_Child -0.1374368 0.0183788 -7.478 1.40e-13 ***
## P_Youth -0.0879054 0.0221839 -3.963 7.83e-05 ***
## P_Cook -0.1374552 0.0189914 -7.238 7.88e-13 ***
## P_DIY -0.1358874 0.0218755 -6.212 7.09e-10 ***
## P_Art 0.1187456 0.0214811 5.528 3.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3753 on 1269 degrees of freedom
## Multiple R-squared: 0.2552, Adjusted R-squared: 0.2493
## F-statistic: 43.48 on 10 and 1269 DF, p-value: < 2.2e-16
# Predicting with the linear regression model on the validation split
pred_lreg_val <- predict(object = model_lreg_train, X_val_reg)
pred_lreg_df <- data.frame(pred_lreg_val)
pred_lreg_df <- mutate(pred_lreg_df, ID = row_number())
ggplot(pred_lreg_df) + geom_point(aes(ID, pred_lreg_val)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 2) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 2) +
  labs(x = "Validation Set Observations", y = "Regression Predicted Values")
bb_test_reg <- read_excel("BBBC-Test.xlsx")
bb_test_reg <- bb_test_reg[,-1]
# Predicting with the linear regression model on the test set
pred_lreg_test <- predict(object = model_lreg_train, bb_test_reg)
pred_lreg_df_t <- data.frame(pred_lreg_test)
pred_lreg_df_t <- mutate(pred_lreg_df_t, ID = row_number())
ggplot(pred_lreg_df_t) + geom_point(aes(ID, pred_lreg_test)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 2) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 2) +
  labs(x = "Test Set Observations", y = "Regression Predicted Values")
# Accuracy of the 0.5-threshold classification of the regression predictions
linear_preds_test <- ifelse(pred_lreg_test >= 0.5, 1, 0)
confusionMatrix(as.factor(linear_preds_test),df_bb_testY$Choice)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2000 141
## 1 96 63
##
## Accuracy : 0.897
## 95% CI : (0.8838, 0.9091)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.992022
##
## Kappa : 0.2921
##
## Mcnemar's Test P-Value : 0.004262
##
## Sensitivity : 0.9542
## Specificity : 0.3088
## Pos Pred Value : 0.9341
## Neg Pred Value : 0.3962
## Prevalence : 0.9113
## Detection Rate : 0.8696
## Detection Prevalence : 0.9309
## Balanced Accuracy : 0.6315
##
## 'Positive' Class : 0
##
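# Wrap-up sketch (the list names are ours): test-set accuracy of each fitted
# model collected in one place for comparison
test_preds <- list(logistic_reduced = pred_lr_test,
                   svm_radial = test_pred_svm,
                   svm_poly = test_pred_svmR,
                   svm_poly_cv = pred_svm_poly,
                   linear_0.5 = factor(linear_preds_test, levels = c(0, 1)))
sapply(test_preds, function(p) confusionMatrix(p, df_bb_testY$Choice)$overall["Accuracy"])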