library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(leaps) 
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(bestglm)
library(ROCR)
library(ROSE)
## Loaded ROSE 0.0-4
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
library(readxl)
library(rminer)
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
## 
##     impute
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(corrplot)
## corrplot 0.90 loaded
# Load the BBBC training and test sets (the first column is an observation ID)
df_bb_train <- read_excel("BBBC-Train.xlsx")
df_bb_test <- read_excel("BBBC-Test.xlsx")

df_bb_train <- df_bb_train[,-1]  # drop the ID column
df_bb_test <- df_bb_test[,-1]    # drop the ID column
df_bb_testY <- df_bb_test[,1]    # keep the test response (Choice) separately
df_bb_test <- df_bb_test[,-1]    # remove Choice from the test predictors
dim(df_bb_train)
## [1] 1600   11
dim(df_bb_test)
## [1] 2300   10
anyNA(df_bb_train)
## [1] FALSE
# Treat the binary variables as factors
df_bb_train$Choice = as.factor(df_bb_train$Choice)
df_bb_train$Gender = as.factor(df_bb_train$Gender)
df_bb_test$Gender = as.factor(df_bb_test$Gender)
df_bb_testY$Choice = as.factor(df_bb_testY$Choice)
table(df_bb_testY$Choice)
## 
##    0    1 
## 2096  204
library(fastDummies)
# One-hot encode Gender (exploratory; the factor itself is used in the models below)
dummy_data <- fastDummies::dummy_cols(df_bb_train[,c(2)], remove_first_dummy = TRUE)
names(dummy_data)
## [1] "Gender"   "Gender_1"
# Pairwise scatterplots of the numeric covariates, colored by Choice
clPairs(df_bb_train[,3:11], cl = df_bb_train$Choice)

clp <- clPairs(df_bb_train[,3:11], cl = df_bb_train$Choice, lower.panel = NULL)

clPairsLegend(0.1, 0.6, class = clp$class, 
              col = clp$col, pch = clp$pch, 
              title = "Bookbinders data")

head(df_bb_train)
## # A tibble: 6 x 11
##   Choice Gender Amount_purchased Frequency Last_purchase First_purchase P_Child
##   <fct>  <fct>             <dbl>     <dbl>         <dbl>          <dbl>   <dbl>
## 1 1      1                   113         8             1              8       0
## 2 1      1                   418         6            11             66       0
## 3 1      1                   336        18             6             32       2
## 4 1      1                   180        16             5             42       2
## 5 1      0                   320         2             3             18       0
## 6 1      1                   268         4             1              4       0
## # ... with 4 more variables: P_Youth <dbl>, P_Cook <dbl>, P_DIY <dbl>,
## #   P_Art <dbl>

The variables in the data frame are:

Choice - Did the customer buy the book (1) or not (0)?
Gender - 0 = Female, 1 = Male.
Amount_purchased - Total amount spent at the BBB Club.
Frequency - Total frequency of purchases at the BBB Club.
Last_purchase - Months since last purchase.
First_purchase - Months since first purchase.
P_Child - Number of children's books purchased.
P_Youth - Number of youth books purchased.
P_Cook - Number of cooking books purchased.
P_DIY - Number of do-it-yourself books purchased.
P_Art - Number of art books purchased.

dim(df_bb_train)
## [1] 1600   11
names(df_bb_train)
##  [1] "Choice"           "Gender"           "Amount_purchased" "Frequency"       
##  [5] "Last_purchase"    "First_purchase"   "P_Child"          "P_Youth"         
##  [9] "P_Cook"           "P_DIY"            "P_Art"
str(df_bb_train)
## tibble [1,600 x 11] (S3: tbl_df/tbl/data.frame)
##  $ Choice          : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Gender          : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 1 2 2 ...
##  $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
##  $ Frequency       : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
##  $ Last_purchase   : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
##  $ First_purchase  : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
##  $ P_Child         : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
##  $ P_Youth         : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
##  $ P_Cook          : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
##  $ P_DIY           : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
##  $ P_Art           : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
summary(df_bb_train)
##  Choice   Gender   Amount_purchased   Frequency     Last_purchase   
##  0:1200   0: 546   Min.   : 15.0    Min.   : 2.00   Min.   : 1.000  
##  1: 400   1:1054   1st Qu.:126.8    1st Qu.: 6.00   1st Qu.: 1.000  
##                    Median :203.0    Median :12.00   Median : 2.000  
##                    Mean   :200.9    Mean   :12.31   Mean   : 3.199  
##                    3rd Qu.:273.0    3rd Qu.:16.00   3rd Qu.: 4.000  
##                    Max.   :474.0    Max.   :36.00   Max.   :12.000  
##  First_purchase     P_Child          P_Youth           P_Cook    
##  Min.   : 2.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:12.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00  
##  Median :18.00   Median :0.0000   Median :0.0000   Median :0.00  
##  Mean   :22.58   Mean   :0.7394   Mean   :0.3375   Mean   :0.76  
##  3rd Qu.:30.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.00  
##  Max.   :96.00   Max.   :8.0000   Max.   :4.0000   Max.   :6.00  
##      P_DIY            P_Art      
##  Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.000  
##  Mean   :0.3912   Mean   :0.425  
##  3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :4.0000   Max.   :5.000
# Distribution of Amount_purchased with a kernel density overlay
hist(df_bb_train$Amount_purchased, freq = FALSE)
lines(density(df_bb_train$Amount_purchased), col = 4)

boxplot(df_bb_train$Amount_purchased)
points(mean(df_bb_train$Amount_purchased, na.rm=TRUE), col="red")

table(df_bb_train$P_Art,df_bb_train$Choice)
##    
##       0   1
##   0 924 176
##   1 232 134
##   2  38  59
##   3   4  25
##   4   2   5
##   5   0   1
table(df_bb_train$Last_purchase)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 586 499  72  67  62  60  43  50  42  42  43  34
table(df_bb_train$First_purchase)
## 
##   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30  32  34  36  38  40 
##  55  58  66  99 113 126 115 143 100  70  71  54  39  45  57  47  34  33  40  27 
##  42  44  46  48  50  52  54  56  58  60  62  64  66  68  70  72  74  76  78  80 
##  16  25  19  14  20  10  13   7  10  11  11   3  17   8   5   4   2   1   4   1 
##  82  84  86  96 
##   3   2   1   1
hist(df_bb_train$Last_purchase, breaks = seq(0.5, 12.5, by = 1), freq = TRUE)
axis(1, at = seq(1, 12, by = 1))

table(df_bb_train$Gender)
## 
##    0    1 
##  546 1054
tab <- table(df_bb_train$Gender)
prop.table(tab)*100
## 
##      0      1 
## 34.125 65.875
barplot(tab, names.arg = c("Female", "Male"), col = "#69b3a2",
        xlab = "Gender", ylab = "Frequency",
        font.lab = 1, col.lab = "orange", cex.lab = 1)

cor_5 <- rcorr(as.matrix(df_bb_train[,-1]))
M <- cor_5$r
p_mat <- cor_5$P
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M, method = "color", col = col(200),  
         type = "upper", order = "hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
         # Combine with significance level
         p.mat = p_mat, sig.level = 0.01,  
         # hide correlation coefficient on the principal diagonal
         diag = FALSE 
         )

plot(Amount_purchased ~ Last_purchase, data = df_bb_train, xlab = "Last Purchase", ylab = "Amount Purchased")

plot(Amount_purchased ~ Gender, data = df_bb_train, xlab = "Gender", ylab = "Amount Purchased")

plot(Choice ~ Gender, data = df_bb_train, xlab = "Gender", ylab = "Choice")

tab <- table(df_bb_train$Gender, df_bb_train$Choice)
mosaicplot(tab, xlab = "Gender", ylab = "Choice")

prop.table(tab, 1)*100
##    
##            0        1
##   0 66.48352 33.51648
##   1 79.41176 20.58824
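
Both the training set (1,200 non-buyers vs. 400 buyers) and the test set (2,096 vs. 204) are imbalanced toward non-buyers. One option, not pursued below, is to rebalance the training data before modeling; a minimal sketch with ROSE (already loaded above; the target size N = 2400 is illustrative):

# Randomly oversample the minority class until the data has N = 2400 rows,
# i.e. 1,200 of each class
set.seed(1)
df_bb_train_bal <- ovun.sample(Choice ~ ., data = df_bb_train,
                               method = "over", N = 2400)$data
table(df_bb_train_bal$Choice)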

# Scale the numeric covariates before we start the modeling; as.data.frame
# keeps the factors intact when Gender and Choice are bound back on

df_bb_train_scaled <- as.data.frame(scale(df_bb_train[,3:11], scale = TRUE, center = TRUE))

df_bb_train_scaled <- cbind(df_bb_train_scaled, Gender = df_bb_train$Gender, Choice = df_bb_train$Choice)
# Model selection using AIC and BIC
# ----------------------------------------

# Full model
glm_full_bw <- glm(Choice ~ ., data = df_bb_train, family = "binomial")

# Null model
glm_null_bw <- glm(Choice ~ 1, data = df_bb_train, family = "binomial")

# Stepwise (null-to-full) selection with AIC
bw_AIC <- step(glm_null_bw, scope = list(upper = glm_full_bw),
               direction = "both", test = "Chisq", trace = F)

# Stepwise selection with BIC (penalty k = log(n))
bw_BIC <- step(glm_null_bw, scope = list(upper = glm_full_bw),
               direction = "both", test = "Chisq", trace = F, k = log(nrow(df_bb_train)))

summary(bw_AIC)
## 
## Call:
## glm(formula = Choice ~ P_Art + Frequency + Gender + P_Cook + 
##     P_DIY + Amount_purchased + P_Child + Last_purchase + P_Youth, 
##     family = "binomial", data = df_bb_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.44132  -0.66647  -0.43745  -0.01855   2.72460  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.2833949  0.2062721  -1.374   0.1695    
## P_Art             0.6643371  0.1255243   5.292 1.21e-07 ***
## Frequency        -0.0903261  0.0106304  -8.497  < 2e-16 ***
## Gender1          -0.8660575  0.1373268  -6.307 2.85e-10 ***
## P_Cook           -0.9330131  0.1190073  -7.840 4.51e-15 ***
## P_DIY            -0.9101106  0.1433591  -6.348 2.17e-10 ***
## Amount_purchased  0.0018357  0.0007908   2.321   0.0203 *  
## P_Child          -0.8181807  0.1163377  -7.033 2.02e-12 ***
## Last_purchase     0.5536689  0.0784519   7.057 1.70e-12 ***
## P_Youth          -0.6424923  0.1432548  -4.485 7.29e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1393.5  on 1590  degrees of freedom
## AIC: 1413.5
## 
## Number of Fisher Scoring iterations: 5
summary(bw_BIC)
## 
## Call:
## glm(formula = Choice ~ P_Art + Frequency + Gender + P_Cook + 
##     P_DIY, family = "binomial", data = df_bb_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.37774  -0.69368  -0.47955  -0.03998   2.74167  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.02017    0.15704   0.128  0.89781    
## P_Art        1.25836    0.09628  13.070  < 2e-16 ***
## Frequency   -0.08719    0.01037  -8.403  < 2e-16 ***
## Gender1     -0.79063    0.13339  -5.927 3.09e-09 ***
## P_Cook      -0.28822    0.07011  -4.111 3.94e-05 ***
## P_DIY       -0.28378    0.10537  -2.693  0.00708 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1459.9  on 1594  degrees of freedom
## AIC: 1471.9
## 
## Number of Fisher Scoring iterations: 5
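Since each step() result is an ordinary fitted glm, the two selected models can also be compared head-to-head on both criteria:

# Both criteria for the AIC-selected and BIC-selected models
AIC(bw_AIC, bw_BIC)
BIC(bw_AIC, bw_BIC)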
# Define the training control for the caret models: 10-fold CV repeated 3 times
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3)
# Train the logistic regression model on the full training set
model_lr <- train(Choice ~ ., method = 'glm', data = df_bb_train, trControl = fitControl)

summary(model_lr)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.38586  -0.66728  -0.43696  -0.02242   2.72238  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.3515281  0.2143839  -1.640   0.1011    
## Gender1          -0.8632319  0.1374499  -6.280 3.38e-10 ***
## Amount_purchased  0.0018641  0.0007918   2.354   0.0186 *  
## Frequency        -0.0755142  0.0165937  -4.551 5.35e-06 ***
## Last_purchase     0.6117713  0.0938127   6.521 6.97e-11 ***
## First_purchase   -0.0147792  0.0128027  -1.154   0.2483    
## P_Child          -0.8112489  0.1167067  -6.951 3.62e-12 ***
## P_Youth          -0.6370422  0.1433778  -4.443 8.87e-06 ***
## P_Cook           -0.9230066  0.1194814  -7.725 1.12e-14 ***
## P_DIY            -0.9058697  0.1437025  -6.304 2.90e-10 ***
## P_Art             0.6861124  0.1270176   5.402 6.60e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1392.2  on 1589  degrees of freedom
## AIC: 1414.2
## 
## Number of Fisher Scoring iterations: 5
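Because the coefficients above are on the log-odds scale, exponentiating them gives odds ratios; a quick sketch using the glm object stored inside the caret fit:

# Odds ratios and Wald 95% confidence intervals from the underlying glm
exp(coef(model_lr$finalModel))
exp(confint.default(model_lr$finalModel))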
# Predict on the training data with the logistic regression model
pred_lr <- predict(object = model_lr, df_bb_train)

# Check the (in-sample) accuracy; note the truth is passed as the first
# argument here, so the rows labeled "Prediction" are the observed classes
confusionMatrix(df_bb_train$Choice, pred_lr)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1120   80
##          1  240  160
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7795, 0.8193)
##     No Information Rate : 0.85            
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3846          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8235          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.4000          
##              Prevalence : 0.8500          
##          Detection Rate : 0.7000          
##    Detection Prevalence : 0.7500          
##       Balanced Accuracy : 0.7451          
##                                           
##        'Positive' Class : 0               
## 
# Plot variable importance, with a reference line drawn at 55
varImp(model_lr) %>% ggplot() + geom_vline(xintercept = 55)

# 80/20 stratified split of the training data; [,1] drops the matrix
# dimension so tibble row-indexing does not warn
index <- caret::createDataPartition(df_bb_train$Choice, p = 0.8, list = FALSE)[, 1]
X_train <- df_bb_train[index,]
X_val <- df_bb_train[-index,]

# Standardize the numeric covariates of the split training set
X_train_tmp <- X_train[,1:2]
X_train_scaled <- scale(X_train[,3:11], scale = TRUE, center = TRUE)
X_train_scaled <- cbind(X_train_tmp, X_train_scaled)

# Train the logistic regression model on the scaled covariates
model_lr_scaled <- train(Choice ~ ., method = 'glm', data = X_train_scaled, trControl = fitControl)

summary(model_lr_scaled)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.39540  -0.65485  -0.42257  -0.02631   2.72639  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.82398    0.11965  -6.887 5.70e-12 ***
## Gender1          -0.94576    0.15607  -6.060 1.36e-09 ***
## Amount_purchased  0.21835    0.08543   2.556 0.010593 *  
## Frequency        -0.55649    0.14906  -3.733 0.000189 ***
## Last_purchase     1.95980    0.32652   6.002 1.95e-09 ***
## First_purchase   -0.31326    0.24239  -1.292 0.196213    
## P_Child          -0.95572    0.14175  -6.742 1.56e-11 ***
## P_Youth          -0.36535    0.10032  -3.642 0.000271 ***
## P_Cook           -0.94119    0.14221  -6.618 3.63e-11 ***
## P_DIY            -0.62619    0.10991  -5.698 1.22e-08 ***
## P_Art             0.52693    0.10833   4.864 1.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1439.6  on 1279  degrees of freedom
## Residual deviance: 1092.7  on 1269  degrees of freedom
## AIC: 1114.7
## 
## Number of Fisher Scoring iterations: 5
# Predict on the validation set with the scaled logistic regression model
pred_lr_scaled <- predict(object = model_lr_scaled, X_val)

# Check the accuracy (X_val is unscaled, which is why the results below collapse)
confusionMatrix(X_val$Choice, pred_lr_scaled)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  16 224
##          1   3  77
##                                           
##                Accuracy : 0.2906          
##                  95% CI : (0.2415, 0.3437)
##     No Information Rate : 0.9406          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0152          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.84211         
##             Specificity : 0.25581         
##          Pos Pred Value : 0.06667         
##          Neg Pred Value : 0.96250         
##              Prevalence : 0.05937         
##          Detection Rate : 0.05000         
##    Detection Prevalence : 0.75000         
##       Balanced Accuracy : 0.54896         
##                                           
##        'Positive' Class : 0               
## 
varImp(model_lr_scaled)
## glm variable importance
## 
##                  Overall
## P_Child           100.00
## P_Cook             97.73
## Gender1            87.48
## Last_purchase      86.41
## P_DIY              80.83
## P_Art              65.53
## Frequency          44.79
## P_Youth            43.11
## Amount_purchased   23.18
## First_purchase      0.00
pred_lr_test_s <- predict(object = model_lr_scaled, df_bb_test)

# Check the accuracy (again predicting on raw, unscaled covariates)
confusionMatrix(pred_lr_test_s, df_bb_testY$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  197    7
##          1 1899  197
##                                           
##                Accuracy : 0.1713          
##                  95% CI : (0.1561, 0.1873)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0115          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.09399         
##             Specificity : 0.96569         
##          Pos Pred Value : 0.96569         
##          Neg Pred Value : 0.09399         
##              Prevalence : 0.91130         
##          Detection Rate : 0.08565         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.52984         
##                                           
##        'Positive' Class : 0               
## 
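The collapse in accuracy above comes from a train/predict mismatch: model_lr_scaled was fit on standardized covariates, while X_val and df_bb_test are raw. A minimal sketch of applying the training-set transform to new data with caret::preProcess (column indices follow this document's layout; pred_fixed is a hypothetical name):

# Learn centering/scaling on the training covariates only, then apply the
# identical transform to the test covariates before predicting
pp <- preProcess(X_train[, 3:11], method = c("center", "scale"))
df_bb_test_std <- df_bb_test
df_bb_test_std[, 2:10] <- predict(pp, df_bb_test[, 2:10])
pred_fixed <- predict(model_lr_scaled, df_bb_test_std)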
# Train the logistic regression model on the unscaled split
model_lr_train <- train(Choice ~ ., method = 'glm', data = X_train, trControl = fitControl)

summary(model_lr_train)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.39540  -0.65485  -0.42257  -0.02631   2.72639  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.4543298  0.2464849  -1.843 0.065295 .  
## Gender1          -0.9457572  0.1560697  -6.060 1.36e-09 ***
## Amount_purchased  0.0022921  0.0008968   2.556 0.010593 *  
## Frequency        -0.0701082  0.0187791  -3.733 0.000189 ***
## Last_purchase     0.6551532  0.1091559   6.002 1.95e-09 ***
## First_purchase   -0.0191319  0.0148032  -1.292 0.196213    
## P_Child          -0.9107911  0.1350843  -6.742 1.56e-11 ***
## P_Youth          -0.5747093  0.1578094  -3.642 0.000271 ***
## P_Cook           -0.9214163  0.1392192  -6.618 3.63e-11 ***
## P_DIY            -0.9235212  0.1620917  -5.698 1.22e-08 ***
## P_Art             0.6996152  0.1438373   4.864 1.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1439.6  on 1279  degrees of freedom
## Residual deviance: 1092.7  on 1269  degrees of freedom
## AIC: 1114.7
## 
## Number of Fisher Scoring iterations: 5
# Predict on the validation set
pred_lr_val <- predict(object = model_lr_train, X_val)

# Check the accuracy
confusionMatrix(X_val$Choice, pred_lr_val)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 218  22
##          1  50  30
##                                           
##                Accuracy : 0.775           
##                  95% CI : (0.7252, 0.8196)
##     No Information Rate : 0.8375          
##     P-Value [Acc > NIR] : 0.998568        
##                                           
##                   Kappa : 0.3208          
##                                           
##  Mcnemar's Test P-Value : 0.001463        
##                                           
##             Sensitivity : 0.8134          
##             Specificity : 0.5769          
##          Pos Pred Value : 0.9083          
##          Neg Pred Value : 0.3750          
##              Prevalence : 0.8375          
##          Detection Rate : 0.6813          
##    Detection Prevalence : 0.7500          
##       Balanced Accuracy : 0.6952          
##                                           
##        'Positive' Class : 0               
## 
varImp(model_lr_train)
## glm variable importance
## 
##                  Overall
## P_Child           100.00
## P_Cook             97.73
## Gender1            87.48
## Last_purchase      86.41
## P_DIY              80.83
## P_Art              65.53
## Frequency          44.79
## P_Youth            43.11
## Amount_purchased   23.18
## First_purchase      0.00
# Reduced model: P_Cook + P_Child + Last_purchase + P_DIY + Gender + P_Art + Frequency

# Train the logistic regression model on the reduced set of predictors
model_lr_train <- train(Choice ~ P_Cook + P_Child + Last_purchase + P_DIY + Gender + P_Art + Frequency,
                        method = 'glm', data = X_train, trControl = fitControl)

summary(model_lr_train)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.36861  -0.65670  -0.44225  -0.01068   2.80924  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.03777    0.18361  -0.206    0.837    
## P_Cook        -0.69320    0.12091  -5.733 9.86e-09 ***
## P_Child       -0.69589    0.11885  -5.855 4.77e-09 ***
## Last_purchase  0.39240    0.06879   5.705 1.17e-08 ***
## P_DIY         -0.72775    0.15005  -4.850 1.23e-06 ***
## Gender1       -0.92497    0.15418  -5.999 1.98e-09 ***
## P_Art          0.88655    0.13029   6.804 1.02e-11 ***
## Frequency     -0.08761    0.01158  -7.564 3.91e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1439.6  on 1279  degrees of freedom
## Residual deviance: 1114.9  on 1272  degrees of freedom
## AIC: 1130.9
## 
## Number of Fisher Scoring iterations: 5
# Predict on the validation set
pred_lr_val <- predict(object = model_lr_train, X_val)

# Check the accuracy
confusionMatrix(X_val$Choice, pred_lr_val)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 218  22
##          1  53  27
##                                          
##                Accuracy : 0.7656         
##                  95% CI : (0.7153, 0.811)
##     No Information Rate : 0.8469         
##     P-Value [Acc > NIR] : 0.999949       
##                                          
##                   Kappa : 0.2823         
##                                          
##  Mcnemar's Test P-Value : 0.000532       
##                                          
##             Sensitivity : 0.8044         
##             Specificity : 0.5510         
##          Pos Pred Value : 0.9083         
##          Neg Pred Value : 0.3375         
##              Prevalence : 0.8469         
##          Detection Rate : 0.6813         
##    Detection Prevalence : 0.7500         
##       Balanced Accuracy : 0.6777         
##                                          
##        'Positive' Class : 0              
## 
varImp(model_lr_train)
## glm variable importance
## 
##               Overall
## Frequency      100.00
## P_Art           72.00
## Gender1         42.35
## P_Child         37.03
## P_Cook          32.54
## Last_purchase   31.49
## P_DIY            0.00
pred_lr_test <- predict(object = model_lr_train, df_bb_test)

# Check the accuracy on the held-out test set
confusionMatrix(pred_lr_test, df_bb_testY$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1979  130
##          1  117   74
##                                          
##                Accuracy : 0.8926         
##                  95% CI : (0.8792, 0.905)
##     No Information Rate : 0.9113         
##     P-Value [Acc > NIR] : 0.9991         
##                                          
##                   Kappa : 0.316          
##                                          
##  Mcnemar's Test P-Value : 0.4451         
##                                          
##             Sensitivity : 0.9442         
##             Specificity : 0.3627         
##          Pos Pred Value : 0.9384         
##          Neg Pred Value : 0.3874         
##              Prevalence : 0.9113         
##          Detection Rate : 0.8604         
##    Detection Prevalence : 0.9170         
##       Balanced Accuracy : 0.6535         
##                                          
##        'Positive' Class : 0              
## 
# Further reduced model: keep only P_Cook, P_Art, and Frequency

# Train the logistic regression model on three predictors
model_lr_train <- train(Choice ~ P_Cook + P_Art + Frequency,
                        method = 'glm', data = X_train, trControl = fitControl)

summary(model_lr_train)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.48499  -0.70234  -0.51679  -0.06692   2.81261  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.57580    0.14757  -3.902 9.55e-05 ***
## P_Cook      -0.26379    0.07593  -3.474 0.000513 ***
## P_Art        1.18765    0.10213  11.629  < 2e-16 ***
## Frequency   -0.08563    0.01133  -7.559 4.07e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1439.6  on 1279  degrees of freedom
## Residual deviance: 1192.6  on 1276  degrees of freedom
## AIC: 1200.6
## 
## Number of Fisher Scoring iterations: 5
# Predict on the validation set
pred_lr_val <- predict(object = model_lr_train, X_val)

# Check the accuracy
confusionMatrix(X_val$Choice, pred_lr_val)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 228  12
##          1  58  22
##                                           
##                Accuracy : 0.7812          
##                  95% CI : (0.7319, 0.8253)
##     No Information Rate : 0.8938          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2784          
##                                           
##  Mcnemar's Test P-Value : 7.51e-08        
##                                           
##             Sensitivity : 0.7972          
##             Specificity : 0.6471          
##          Pos Pred Value : 0.9500          
##          Neg Pred Value : 0.2750          
##              Prevalence : 0.8938          
##          Detection Rate : 0.7125          
##    Detection Prevalence : 0.7500          
##       Balanced Accuracy : 0.7221          
##                                           
##        'Positive' Class : 0               
## 
varImp(model_lr_train)
## glm variable importance
## 
##           Overall
## P_Art      100.00
## Frequency   50.09
## P_Cook       0.00
pred_lr_test <- predict(object = model_lr_train, df_bb_test)

# Check the accuracy on the held-out test set
confusionMatrix(pred_lr_test, df_bb_testY$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1985  139
##          1  111   65
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8779, 0.9037)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.99954         
##                                           
##                   Kappa : 0.2832          
##                                           
##  Mcnemar's Test P-Value : 0.08771         
##                                           
##             Sensitivity : 0.9470          
##             Specificity : 0.3186          
##          Pos Pred Value : 0.9346          
##          Neg Pred Value : 0.3693          
##              Prevalence : 0.9113          
##          Detection Rate : 0.8630          
##    Detection Prevalence : 0.9235          
##       Balanced Accuracy : 0.6328          
##                                           
##        'Positive' Class : 0               
## 
# Radial-kernel SVM with default cost and gamma
model_svm <- svm(Choice ~ ., data = X_train)
summary(model_svm)
## 
## Call:
## svm(formula = Choice ~ ., data = X_train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  646
## 
##  ( 290 356 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
val_pred_svm<-predict(model_svm,X_val)
confusionMatrix(X_val$Choice,val_pred_svm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 230  10
##          1  62  18
##                                           
##                Accuracy : 0.775           
##                  95% CI : (0.7252, 0.8196)
##     No Information Rate : 0.9125          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.234           
##                                           
##  Mcnemar's Test P-Value : 1.851e-09       
##                                           
##             Sensitivity : 0.7877          
##             Specificity : 0.6429          
##          Pos Pred Value : 0.9583          
##          Neg Pred Value : 0.2250          
##              Prevalence : 0.9125          
##          Detection Rate : 0.7188          
##    Detection Prevalence : 0.7500          
##       Balanced Accuracy : 0.7153          
##                                           
##        'Positive' Class : 0               
## 
test_pred_svm<-predict(model_svm,df_bb_test)
confusionMatrix(test_pred_svm,df_bb_testY$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2019  154
##          1   77   50
##                                           
##                Accuracy : 0.8996          
##                  95% CI : (0.8866, 0.9116)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9766          
##                                           
##                   Kappa : 0.2511          
##                                           
##  Mcnemar's Test P-Value : 5.72e-07        
##                                           
##             Sensitivity : 0.9633          
##             Specificity : 0.2451          
##          Pos Pred Value : 0.9291          
##          Neg Pred Value : 0.3937          
##              Prevalence : 0.9113          
##          Detection Rate : 0.8778          
##    Detection Prevalence : 0.9448          
##       Balanced Accuracy : 0.6042          
##                                           
##        'Positive' Class : 0               
## 
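The radial SVM above uses the e1071 defaults (cost = 1, gamma = 1 / number of features). A small grid search with e1071::tune.svm could check whether other settings do better; a sketch with illustrative grid values:

# Illustrative grid search over gamma and cost (tune.svm cross-validates internally)
set.seed(1)
svm_tuned <- tune.svm(Choice ~ ., data = X_train,
                      gamma = 10^(-3:0), cost = 10^(0:2))
summary(svm_tuned)
svm_tuned$best.model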
# Polynomial-kernel SVM (default degree 3)
svm_polyCl = svm(formula = Choice ~ .,
                 data = X_train,
                 type = 'C-classification',
                 kernel = 'polynomial')

summary(svm_polyCl)
## 
## Call:
## svm(formula = Choice ~ ., data = X_train, type = "C-classification", 
##     kernel = "polynomial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  595
## 
##  ( 282 313 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
val_pred_svmR<-predict(svm_polyCl,X_val)
confusionMatrix(X_val$Choice,val_pred_svmR)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 232   8
##          1  67  13
##                                          
##                Accuracy : 0.7656         
##                  95% CI : (0.7153, 0.811)
##     No Information Rate : 0.9344         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1713         
##                                          
##  Mcnemar's Test P-Value : 2.124e-11      
##                                          
##             Sensitivity : 0.7759         
##             Specificity : 0.6190         
##          Pos Pred Value : 0.9667         
##          Neg Pred Value : 0.1625         
##              Prevalence : 0.9344         
##          Detection Rate : 0.7250         
##    Detection Prevalence : 0.7500         
##       Balanced Accuracy : 0.6975         
##                                          
##        'Positive' Class : 0              
## 
test_pred_svmR<-predict(svm_polyCl,df_bb_test)
confusionMatrix(test_pred_svmR,df_bb_testY$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2034  171
##          1   62   33
##                                           
##                Accuracy : 0.8987          
##                  95% CI : (0.8856, 0.9107)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9834          
##                                           
##                   Kappa : 0.1742          
##                                           
##  Mcnemar's Test P-Value : 1.491e-12       
##                                           
##             Sensitivity : 0.9704          
##             Specificity : 0.1618          
##          Pos Pred Value : 0.9224          
##          Neg Pred Value : 0.3474          
##              Prevalence : 0.9113          
##          Detection Rate : 0.8843          
##    Detection Prevalence : 0.9587          
##       Balanced Accuracy : 0.5661          
##                                           
##        'Positive' Class : 0               
## 
# Tune a polynomial SVM with caret (2-fold CV, one repeat, to keep the search fast)
train_control <- trainControl(method = "repeatedcv", number = 2, repeats = 1)
svm_cv <- train(Choice ~ ., data = X_train, method = "svmPoly",
                trControl = train_control, preProcess = c("center", "scale"),
                tuneLength = 4)
# Print the best tuning parameters (degree, scale, C) that maximize accuracy
svm_cv$bestTune
##    degree scale    C
## 13      1     1 0.25
svm_cv$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 0.25 
## 
## Polynomial kernel function. 
##  Hyperparameters : degree =  1  scale =  1  offset =  1 
## 
## Number of Support Vectors : 583 
## 
## Objective Function Value : -143.5434 
## Training error : 0.192187
pred_svm_poly<-predict(svm_cv,df_bb_test)
confusionMatrix(pred_svm_poly,df_bb_testY$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2010  146
##          1   86   58
##                                           
##                Accuracy : 0.8991          
##                  95% CI : (0.8861, 0.9111)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.9802316       
##                                           
##                   Kappa : 0.2805          
##                                           
##  Mcnemar's Test P-Value : 0.0001073       
##                                           
##             Sensitivity : 0.9590          
##             Specificity : 0.2843          
##          Pos Pred Value : 0.9323          
##          Neg Pred Value : 0.4028          
##              Prevalence : 0.9113          
##          Detection Rate : 0.8739          
##    Detection Prevalence : 0.9374          
##       Balanced Accuracy : 0.6216          
##                                           
##        'Positive' Class : 0               
## 
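The selected degree of 1 makes this effectively a linear kernel. The full resampling grid can be inspected to see how close the alternatives were (using the caret object above):

# Resampled accuracy for every (degree, scale, C) combination tried
head(svm_cv$results[order(-svm_cv$results$Accuracy), ])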
# Re-read the data with Choice left numeric, for a linear probability model
bb_train <- read_excel("BBBC-Train.xlsx")
bb_train <- bb_train[,-1]

X_train_reg <- bb_train[index,]

X_val_reg <- bb_train[-index,]


# Train the linear regression model (caret warns below because the outcome
# only has two values; that is expected for a linear probability model)
model_lreg_train <- train(Choice ~ ., method = 'lm', data = X_train_reg, trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
summary(model_lreg_train)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9736 -0.2440 -0.1101  0.1557  1.0600 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3542684  0.0347210  10.203  < 2e-16 ***
## Gender           -0.1396575  0.0223084  -6.260 5.25e-10 ***
## Amount_purchased  0.0003144  0.0001227   2.563 0.010494 *  
## Frequency        -0.0087599  0.0024504  -3.575 0.000364 ***
## Last_purchase     0.1005807  0.0154818   6.497 1.18e-10 ***
## First_purchase   -0.0022340  0.0020597  -1.085 0.278293    
## P_Child          -0.1374368  0.0183788  -7.478 1.40e-13 ***
## P_Youth          -0.0879054  0.0221839  -3.963 7.83e-05 ***
## P_Cook           -0.1374552  0.0189914  -7.238 7.88e-13 ***
## P_DIY            -0.1358874  0.0218755  -6.212 7.09e-10 ***
## P_Art             0.1187456  0.0214811   5.528 3.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3753 on 1269 degrees of freedom
## Multiple R-squared:  0.2552, Adjusted R-squared:  0.2493 
## F-statistic: 43.48 on 10 and 1269 DF,  p-value: < 2.2e-16
# Predict on the validation set with the linear model
pred_lreg_val <- predict(object = model_lreg_train, X_val_reg)

pred_lreg_df <- data.frame(pred_lreg_val)
pred_lreg_df <- mutate(pred_lreg_df, ID = row_number())

ggplot(pred_lreg_df) + geom_point(aes(ID, pred_lreg_val)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 2) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 2) +
  labs(x = "Validation Set Observations", y = "Regression Predicted Values")

bb_test_reg <- read_excel("BBBC-Test.xlsx")
bb_test_reg <- bb_test_reg[,-1]

# Predict on the test set with the linear model
pred_lreg_test <- predict(object = model_lreg_train, bb_test_reg)

pred_lreg_df_t <- data.frame(pred_lreg_test)
pred_lreg_df_t <- mutate(pred_lreg_df_t, ID = row_number())

ggplot(pred_lreg_df_t) + geom_point(aes(ID, pred_lreg_test)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 2) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 2) +
  labs(x = "Test Set Observations", y = "Regression Predicted Values")

# Classify by thresholding the regression scores at 0.5
linear_preds_test <- ifelse(pred_lreg_test >= 0.5, 1, 0)

confusionMatrix(as.factor(linear_preds_test),df_bb_testY$Choice)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2000  141
##          1   96   63
##                                           
##                Accuracy : 0.897           
##                  95% CI : (0.8838, 0.9091)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.992022        
##                                           
##                   Kappa : 0.2921          
##                                           
##  Mcnemar's Test P-Value : 0.004262        
##                                           
##             Sensitivity : 0.9542          
##             Specificity : 0.3088          
##          Pos Pred Value : 0.9341          
##          Neg Pred Value : 0.3962          
##              Prevalence : 0.9113          
##          Detection Rate : 0.8696          
##    Detection Prevalence : 0.9309          
##       Balanced Accuracy : 0.6315          
##                                           
##        'Positive' Class : 0               
##
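The 0.5 cutoff above is only one choice; since ROCR is already loaded, the full ROC curve and AUC of the continuous regression scores can be examined before committing to a threshold (a sketch using the objects above):

# ROC curve and AUC for the linear-model scores on the test set
pred_obj <- prediction(pred_lreg_test, df_bb_testY$Choice)
plot(performance(pred_obj, "tpr", "fpr"))
abline(0, 1, lty = 2)  # chance diagonal
performance(pred_obj, "auc")@y.values[[1]]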