Load Libraries
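
A minimal setup sketch, assuming these are the packages behind the functions used below: readxl (read_excel), olsrr (ols_step_backward_p), car (vif), and caret (confusionMatrix).

library(readxl)  # read_excel()
library(olsrr)   # ols_step_backward_p()
library(car)     # vif()
library(caret)   # confusionMatrix()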

Read the train and test datasets

bbbc_train = read_excel("BBBC-Train.xlsx")[2:12]  # keep columns 2:12, the 11 modeling variables
bbbc_test = read_excel("BBBC-Test.xlsx")[2:12]

Check structure of train dataset

str(bbbc_train)
## tibble [1,600 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Choice          : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender          : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
##  $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
##  $ Frequency       : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
##  $ Last_purchase   : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
##  $ First_purchase  : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
##  $ P_Child         : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
##  $ P_Youth         : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
##  $ P_Cook          : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
##  $ P_DIY           : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
##  $ P_Art           : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...

Check structure of test dataset

str(bbbc_test)
## tibble [2,300 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Choice          : num [1:2300] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender          : num [1:2300] 0 1 1 0 1 0 1 1 1 1 ...
##  $ Amount_purchased: num [1:2300] 287 215 261 24 120 66 42 233 66 199 ...
##  $ Frequency       : num [1:2300] 12 4 2 4 8 2 12 8 12 22 ...
##  $ Last_purchase   : num [1:2300] 4 1 1 1 1 4 1 2 1 1 ...
##  $ First_purchase  : num [1:2300] 24 4 2 4 8 16 12 12 12 22 ...
##  $ P_Child         : num [1:2300] 0 0 0 1 0 0 0 0 0 0 ...
##  $ P_Youth         : num [1:2300] 3 0 0 0 0 0 0 0 0 0 ...
##  $ P_Cook          : num [1:2300] 0 0 0 0 0 1 1 0 0 0 ...
##  $ P_DIY           : num [1:2300] 0 0 0 0 0 1 0 0 0 0 ...
##  $ P_Art           : num [1:2300] 1 1 1 0 1 1 0 0 0 1 ...

Convert the Gender variable to a factor

bbbc_train$Gender = as.factor(bbbc_train$Gender)
bbbc_test$Gender = as.factor(bbbc_test$Gender)

Check for and remove missing (NA) values
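
A quick per-column count of missing values before dropping rows (a sketch; its output is not shown here):

colSums(is.na(bbbc_train))
colSums(is.na(bbbc_test))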

bbbc_train = na.omit(bbbc_train)
bbbc_test = na.omit(bbbc_test)

Distinct values of Choice in the training set

levels(as.factor(bbbc_train$Choice))
## [1] "0" "1"

Distinct values of Gender in the training set

levels(as.factor(bbbc_train$Gender))
## [1] "0" "1"

Distinct values of Amount_purchased and the remaining variables, in both the train and test sets

levels(as.factor(bbbc_train$Amount_purchased))
##   [1] "15"  "16"  "17"  "18"  "21"  "22"  "23"  "24"  "25"  "26"  "27"  "28" 
##  [13] "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36"  "37"  "38"  "39"  "40" 
##  [25] "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48"  "49"  "51"  "52"  "53" 
##  [37] "54"  "55"  "56"  "57"  "59"  "60"  "61"  "62"  "63"  "64"  "65"  "66" 
##  [49] "67"  "68"  "69"  "70"  "71"  "72"  "73"  "74"  "75"  "76"  "77"  "78" 
##  [61] "79"  "80"  "81"  "82"  "83"  "84"  "85"  "86"  "87"  "88"  "89"  "90" 
##  [73] "91"  "92"  "93"  "94"  "95"  "96"  "97"  "98"  "99"  "100" "101" "102"
##  [85] "103" "104" "105" "106" "107" "108" "110" "111" "112" "113" "114" "115"
##  [97] "116" "117" "118" "119" "120" "121" "122" "123" "124" "125" "126" "127"
## [109] "128" "129" "130" "131" "132" "133" "134" "135" "136" "137" "138" "139"
## [121] "140" "141" "142" "143" "144" "145" "146" "147" "148" "149" "150" "151"
## [133] "152" "153" "154" "155" "156" "157" "158" "159" "160" "161" "162" "163"
## [145] "164" "165" "166" "167" "168" "169" "170" "171" "172" "173" "174" "175"
## [157] "176" "177" "178" "179" "180" "181" "182" "183" "184" "185" "186" "187"
## [169] "188" "189" "190" "191" "192" "193" "194" "195" "196" "197" "198" "199"
## [181] "200" "201" "202" "203" "204" "205" "206" "207" "208" "209" "210" "211"
## [193] "212" "213" "214" "215" "216" "217" "218" "219" "220" "221" "222" "223"
## [205] "224" "225" "226" "227" "228" "229" "230" "231" "232" "233" "234" "235"
## [217] "236" "237" "238" "239" "240" "241" "242" "243" "244" "245" "246" "247"
## [229] "248" "249" "250" "251" "252" "253" "254" "255" "256" "257" "258" "259"
## [241] "260" "261" "262" "263" "264" "265" "266" "267" "268" "269" "270" "271"
## [253] "272" "273" "274" "275" "276" "277" "278" "279" "280" "281" "282" "283"
## [265] "284" "285" "286" "287" "288" "289" "290" "291" "292" "293" "294" "295"
## [277] "296" "297" "298" "299" "300" "301" "302" "303" "304" "305" "306" "307"
## [289] "308" "309" "310" "311" "312" "313" "314" "315" "316" "317" "318" "319"
## [301] "320" "321" "322" "323" "324" "325" "326" "327" "328" "329" "330" "331"
## [313] "332" "333" "334" "335" "336" "338" "339" "340" "341" "342" "343" "344"
## [325] "345" "346" "347" "349" "350" "351" "352" "353" "354" "355" "356" "357"
## [337] "360" "361" "364" "366" "367" "369" "370" "371" "372" "373" "374" "375"
## [349] "376" "377" "382" "383" "384" "385" "386" "388" "389" "390" "391" "393"
## [361] "394" "396" "397" "399" "400" "401" "403" "404" "405" "406" "407" "408"
## [373] "410" "413" "414" "418" "425" "428" "432" "434" "437" "444" "447" "451"
## [385] "469" "473" "474"
levels(as.factor(bbbc_train$Frequency))
##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36"
levels(as.factor(bbbc_train$Last_purchase))
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
levels(as.factor(bbbc_train$First_purchase))
##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36" "38" "40" "42" "44" "46" "48" "50" "52" "54" "56" "58" "60"
## [31] "62" "64" "66" "68" "70" "72" "74" "76" "78" "80" "82" "84" "86" "96"
levels(as.factor(bbbc_train$P_Child))
## [1] "0" "1" "2" "3" "4" "5" "6" "8"
levels(as.factor(bbbc_train$P_Youth))
## [1] "0" "1" "2" "3" "4"
levels(as.factor(bbbc_train$P_Cook))
## [1] "0" "1" "2" "3" "4" "5" "6"
levels(as.factor(bbbc_train$P_DIY))
## [1] "0" "1" "2" "3" "4"
levels(as.factor(bbbc_train$P_Art))
## [1] "0" "1" "2" "3" "4" "5"
levels(as.factor(bbbc_test$Choice))
## [1] "0" "1"
levels(as.factor(bbbc_test$Gender))
## [1] "0" "1"
levels(as.factor(bbbc_test$Amount_purchased))
##   [1] "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24"  "25"  "26" 
##  [13] "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36"  "37"  "38"  "39"  "40" 
##  [25] "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48"  "49"  "50"  "51"  "52" 
##  [37] "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60"  "61"  "62"  "63"  "64" 
##  [49] "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72"  "73"  "74"  "75"  "76" 
##  [61] "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84"  "85"  "86"  "87"  "88" 
##  [73] "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96"  "97"  "98"  "99"  "100"
##  [85] "101" "102" "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"
##  [97] "113" "114" "115" "116" "117" "118" "119" "120" "121" "122" "123" "124"
## [109] "125" "126" "127" "128" "129" "130" "131" "132" "133" "134" "135" "136"
## [121] "137" "138" "139" "140" "141" "142" "143" "144" "145" "146" "147" "148"
## [133] "149" "150" "151" "152" "153" "154" "155" "156" "157" "158" "159" "160"
## [145] "161" "162" "163" "164" "165" "166" "167" "168" "169" "170" "171" "172"
## [157] "173" "174" "175" "176" "177" "178" "179" "180" "181" "182" "183" "184"
## [169] "185" "186" "187" "188" "189" "190" "191" "192" "193" "194" "195" "196"
## [181] "197" "198" "199" "200" "201" "202" "203" "204" "205" "206" "207" "208"
## [193] "209" "210" "211" "212" "213" "214" "215" "216" "217" "218" "219" "220"
## [205] "221" "222" "223" "224" "225" "226" "227" "228" "229" "230" "231" "232"
## [217] "233" "234" "235" "236" "237" "238" "239" "240" "241" "242" "243" "244"
## [229] "245" "246" "247" "248" "249" "250" "251" "252" "253" "254" "255" "256"
## [241] "257" "258" "259" "260" "261" "262" "263" "264" "265" "266" "267" "268"
## [253] "269" "270" "271" "272" "273" "274" "275" "276" "277" "278" "279" "280"
## [265] "281" "282" "283" "284" "285" "286" "287" "288" "289" "290" "291" "292"
## [277] "293" "294" "295" "296" "297" "298" "299" "300" "301" "302" "303" "304"
## [289] "305" "306" "307" "308" "309" "310" "311" "312" "313" "314" "315" "316"
## [301] "317" "318" "319" "320" "321" "322" "323" "324" "325" "326" "327" "328"
## [313] "329" "330" "331" "332" "333" "334" "335" "336" "337" "338" "339" "342"
## [325] "343" "344" "345" "346" "347" "348" "350" "351" "352" "353" "355" "356"
## [337] "357" "359" "360" "363" "364" "365" "366" "368" "369" "370" "371" "372"
## [349] "374" "377" "379" "382" "383" "384" "385" "386" "387" "388" "391" "392"
## [361] "393" "396" "397" "398" "399" "400" "401" "402" "404" "405" "406" "407"
## [373] "409" "410" "411" "414" "419" "420" "421" "423" "427" "428" "429" "431"
## [385] "433" "434" "435" "439" "442" "443" "444" "447" "450" "458" "461"
levels(as.factor(bbbc_test$Frequency))
##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36"
levels(as.factor(bbbc_test$Last_purchase))
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
levels(as.factor(bbbc_test$First_purchase))
##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36" "38" "40" "42" "44" "46" "48" "50" "52" "54" "56" "58" "60"
## [31] "62" "64" "66" "68" "70" "72" "74" "76" "78" "80" "84" "88" "96"
levels(as.factor(bbbc_test$P_Child))
## [1] "0" "1" "2" "3" "4" "5" "6" "7"
levels(as.factor(bbbc_test$P_Youth))
## [1] "0" "1" "2" "3" "4" "5"
levels(as.factor(bbbc_test$P_Cook))
## [1] "0" "1" "2" "3" "4" "5" "6"
levels(as.factor(bbbc_test$P_DIY))
## [1] "0" "1" "2" "3" "4"
levels(as.factor(bbbc_test$P_Art))
## [1] "0" "1" "2" "3" "4"
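
The per-variable checks above can be condensed; a compact sketch that counts the distinct values in every column at once:

sapply(bbbc_train, function(x) length(unique(x)))  # distinct values per column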

Fit a linear model with all predictors (a linear probability model, since Choice is 0/1; fitted values are read as probabilities and can fall outside [0, 1])

bbbc_lm = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train)
summary(bbbc_lm)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art, data = bbbc_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9603 -0.2462 -0.1161  0.1622  1.0588 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3642284  0.0307411  11.848  < 2e-16 ***
## Gender1          -0.1309205  0.0200303  -6.536 8.48e-11 ***
## Amount_purchased  0.0002736  0.0001110   2.464   0.0138 *  
## Frequency        -0.0090868  0.0021791  -4.170 3.21e-05 ***
## Last_purchase     0.0970286  0.0135589   7.156 1.26e-12 ***
## First_purchase   -0.0020024  0.0018160  -1.103   0.2704    
## P_Child          -0.1262584  0.0164011  -7.698 2.41e-14 ***
## P_Youth          -0.0963563  0.0201097  -4.792 1.81e-06 ***
## P_Cook           -0.1414907  0.0166064  -8.520  < 2e-16 ***
## P_DIY            -0.1352313  0.0197873  -6.834 1.17e-11 ***
## P_Art             0.1178494  0.0194427   6.061 1.68e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared:  0.2401, Adjusted R-squared:  0.2353 
## F-statistic:  50.2 on 10 and 1589 DF,  p-value: < 2.2e-16
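
Because this is a linear probability model, it is worth checking whether any fitted values escape the unit interval; a quick sketch:

range(fitted(bbbc_lm))  # fitted "probabilities" below 0 or above 1 signal LPM limitations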

Fit a linear regression with all main effects and two-way interaction effects

bbbc_lm1 = lm(Choice ~ (.)^2, data = bbbc_train)
summary(bbbc_lm1)
## 
## Call:
## lm(formula = Choice ~ (.)^2, data = bbbc_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03612 -0.22165 -0.10283  0.06123  0.99352 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.765e-01  6.995e-02   5.382 8.51e-08 ***
## Gender1                         -1.777e-01  5.843e-02  -3.041  0.00240 ** 
## Amount_purchased                 1.989e-04  3.084e-04   0.645  0.51902    
## Frequency                       -7.691e-03  8.245e-03  -0.933  0.35102    
## Last_purchase                    1.533e-01  5.225e-02   2.933  0.00340 ** 
## First_purchase                  -1.035e-02  7.439e-03  -1.392  0.16420    
## P_Child                         -1.630e-01  5.581e-02  -2.920  0.00356 ** 
## P_Youth                         -8.511e-02  6.878e-02  -1.237  0.21610    
## P_Cook                          -1.305e-01  5.775e-02  -2.260  0.02398 *  
## P_DIY                           -1.416e-01  7.104e-02  -1.994  0.04635 *  
## P_Art                            1.835e-01  6.735e-02   2.725  0.00651 ** 
## Gender1:Amount_purchased         2.387e-04  2.396e-04   0.996  0.31922    
## Gender1:Frequency               -3.741e-03  4.702e-03  -0.796  0.42632    
## Gender1:Last_purchase           -6.512e-02  2.931e-02  -2.222  0.02645 *  
## Gender1:First_purchase           7.184e-03  3.981e-03   1.804  0.07135 .  
## Gender1:P_Child                  4.698e-02  3.502e-02   1.341  0.17996    
## Gender1:P_Youth                  4.745e-02  4.279e-02   1.109  0.26766    
## Gender1:P_Cook                   4.097e-02  3.516e-02   1.165  0.24408    
## Gender1:P_DIY                   -2.289e-02  4.233e-02  -0.541  0.58867    
## Gender1:P_Art                    3.922e-02  4.205e-02   0.933  0.35116    
## Amount_purchased:Frequency      -3.406e-06  2.713e-05  -0.126  0.90014    
## Amount_purchased:Last_purchase  -1.530e-05  1.555e-04  -0.098  0.92165    
## Amount_purchased:First_purchase -2.818e-06  2.277e-05  -0.124  0.90155    
## Amount_purchased:P_Child         1.670e-04  1.926e-04   0.867  0.38598    
## Amount_purchased:P_Youth        -4.600e-04  2.276e-04  -2.021  0.04343 *  
## Amount_purchased:P_Cook         -6.230e-05  1.958e-04  -0.318  0.75040    
## Amount_purchased:P_DIY           1.370e-04  2.336e-04   0.587  0.55747    
## Amount_purchased:P_Art           2.441e-04  2.276e-04   1.072  0.28379    
## Frequency:Last_purchase         -1.008e-03  1.802e-03  -0.559  0.57604    
## Frequency:First_purchase         2.641e-04  1.081e-04   2.444  0.01465 *  
## Frequency:P_Child               -2.299e-04  2.894e-03  -0.079  0.93668    
## Frequency:P_Youth               -1.632e-04  3.336e-03  -0.049  0.96100    
## Frequency:P_Cook                -1.263e-04  2.813e-03  -0.045  0.96420    
## Frequency:P_DIY                  2.298e-03  3.546e-03   0.648  0.51706    
## Frequency:P_Art                 -9.074e-03  3.219e-03  -2.819  0.00488 ** 
## Last_purchase:First_purchase     1.856e-04  9.478e-04   0.196  0.84480    
## Last_purchase:P_Child           -3.015e-03  7.146e-03  -0.422  0.67315    
## Last_purchase:P_Youth            1.825e-02  1.325e-02   1.377  0.16877    
## Last_purchase:P_Cook            -7.985e-04  9.259e-03  -0.086  0.93129    
## Last_purchase:P_DIY             -1.875e-02  1.355e-02  -1.383  0.16682    
## Last_purchase:P_Art             -4.667e-03  1.319e-02  -0.354  0.72351    
## First_purchase:P_Child          -1.048e-03  1.701e-03  -0.616  0.53779    
## First_purchase:P_Youth           9.484e-04  2.197e-03   0.432  0.66600    
## First_purchase:P_Cook           -2.934e-04  1.707e-03  -0.172  0.86355    
## First_purchase:P_DIY             1.131e-03  2.229e-03   0.507  0.61191    
## First_purchase:P_Art             9.424e-04  2.047e-03   0.460  0.64525    
## P_Child:P_Youth                 -1.070e-02  2.005e-02  -0.534  0.59364    
## P_Child:P_Cook                   2.717e-02  1.337e-02   2.032  0.04236 *  
## P_Child:P_DIY                    2.246e-02  2.116e-02   1.061  0.28877    
## P_Child:P_Art                   -1.623e-02  1.933e-02  -0.840  0.40114    
## P_Youth:P_Cook                  -3.018e-02  2.109e-02  -1.431  0.15259    
## P_Youth:P_DIY                   -2.040e-02  2.994e-02  -0.681  0.49567    
## P_Youth:P_Art                   -1.887e-02  2.628e-02  -0.718  0.47276    
## P_Cook:P_DIY                     3.197e-03  2.370e-02   0.135  0.89272    
## P_Cook:P_Art                    -2.763e-02  1.973e-02  -1.400  0.16165    
## P_DIY:P_Art                      2.514e-02  2.742e-02   0.917  0.35935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3759 on 1544 degrees of freedom
## Multiple R-squared:  0.2727, Adjusted R-squared:  0.2468 
## F-statistic: 10.53 on 55 and 1544 DF,  p-value: < 2.2e-16

Apply p-value-based backward selection (removal threshold prem = 0.05) to the interaction model

bbbc_lm2 = ols_step_backward_p(bbbc_lm1, prem = 0.05, details = F)
bbbc_lm2$model
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
##              (Intercept)                   Gender1          Amount_purchased  
##                0.3652837                -0.1300602                 0.0004283  
##                Frequency             Last_purchase                   P_Child  
##               -0.0159536                 0.0757656                -0.1051257  
##                  P_Youth                    P_Cook                     P_DIY  
##               -0.0439214                -0.1468073                -0.1319547  
##                    P_Art  Amount_purchased:P_Youth  Frequency:First_purchase  
##                0.2569808                -0.0004654                 0.0002160  
##          Frequency:P_Art     Last_purchase:P_Youth    P_Child:First_purchase  
##               -0.0088380                 0.0095153                -0.0012821  
##           P_Child:P_Cook              P_Cook:P_Art  
##                0.0240791                -0.0294565

Refit the linear regression with the variables retained by backward selection

bbbc_lm2 = lm(data = bbbc_train, Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + 
##     Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + 
##     P_Cook:P_Art, data = bbbc_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97590 -0.22903 -0.10373  0.07595  1.00459 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.653e-01  4.103e-02   8.903  < 2e-16 ***
## Gender1                  -1.301e-01  1.983e-02  -6.558 7.35e-11 ***
## Amount_purchased          4.283e-04  1.259e-04   3.402 0.000687 ***
## Frequency                -1.595e-02  3.045e-03  -5.240 1.82e-07 ***
## Last_purchase             7.577e-02  1.218e-02   6.219 6.37e-10 ***
## P_Child                  -1.051e-01  2.364e-02  -4.447 9.31e-06 ***
## P_Youth                  -4.392e-02  4.269e-02  -1.029 0.303753    
## P_Cook                   -1.468e-01  2.011e-02  -7.302 4.47e-13 ***
## P_DIY                    -1.320e-01  1.973e-02  -6.689 3.11e-11 ***
## P_Art                     2.570e-01  3.250e-02   7.906 4.93e-15 ***
## Amount_purchased:P_Youth -4.654e-04  1.697e-04  -2.743 0.006165 ** 
## Frequency:First_purchase  2.160e-04  7.231e-05   2.987 0.002857 ** 
## Frequency:P_Art          -8.838e-03  1.936e-03  -4.564 5.40e-06 ***
## Last_purchase:P_Youth     9.515e-03  4.796e-03   1.984 0.047437 *  
## P_Child:First_purchase   -1.282e-03  5.160e-04  -2.485 0.013060 *  
## P_Child:P_Cook            2.408e-02  8.469e-03   2.843 0.004524 ** 
## P_Cook:P_Art             -2.946e-02  1.123e-02  -2.622 0.008821 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3746 on 1583 degrees of freedom
## Multiple R-squared:  0.2597, Adjusted R-squared:  0.2523 
## F-statistic: 34.71 on 16 and 1583 DF,  p-value: < 2.2e-16

Cook's distance diagnostic plot

cook_bbbc_lm2 = cooks.distance(bbbc_lm2)
plot(cook_bbbc_lm2, col = "red", pch = 20, cex = 1)
abline(h = 4/nrow(bbbc_train), lty = 2, col = "black")  # common 4/n cutoff: 4/1600 = 0.0025

large_cook_distance = which(cook_bbbc_lm2 > 4/nrow(bbbc_train))  # influential observations

bbbc_lm2_trained = bbbc_train[-large_cook_distance, ]  # refit without them

bbbc_lm2_1 = lm(data = bbbc_lm2_trained, Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2_1)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + 
##     Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + 
##     P_Cook:P_Art, data = bbbc_lm2_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75671 -0.21055 -0.08308  0.07178  0.99302 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.937e-01  3.935e-02  10.005  < 2e-16 ***
## Gender1                  -1.179e-01  1.906e-02  -6.182 8.12e-10 ***
## Amount_purchased          3.403e-04  1.205e-04   2.823 0.004816 ** 
## Frequency                -1.886e-02  2.951e-03  -6.390 2.20e-10 ***
## Last_purchase             7.137e-02  1.295e-02   5.512 4.18e-08 ***
## P_Child                  -1.026e-01  2.370e-02  -4.328 1.60e-05 ***
## P_Youth                  -1.044e-01  4.230e-02  -2.469 0.013654 *  
## P_Cook                   -1.467e-01  1.988e-02  -7.381 2.59e-13 ***
## P_DIY                    -1.292e-01  2.017e-02  -6.407 1.98e-10 ***
## P_Art                     2.995e-01  3.202e-02   9.352  < 2e-16 ***
## Amount_purchased:P_Youth -4.138e-04  1.758e-04  -2.354 0.018690 *  
## Frequency:First_purchase  2.759e-04  7.112e-05   3.880 0.000109 ***
## Frequency:P_Art          -1.081e-02  1.920e-03  -5.631 2.14e-08 ***
## Last_purchase:P_Youth     1.227e-02  4.977e-03   2.465 0.013798 *  
## P_Child:First_purchase   -1.318e-03  5.734e-04  -2.299 0.021629 *  
## P_Child:P_Cook            1.889e-02  9.627e-03   1.962 0.049906 *  
## P_Cook:P_Art             -3.943e-02  1.323e-02  -2.979 0.002934 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3506 on 1508 degrees of freedom
## Multiple R-squared:  0.3055, Adjusted R-squared:  0.2981 
## F-statistic: 41.46 on 16 and 1508 DF,  p-value: < 2.2e-16

Multicollinearity check: VIF values

vif(bbbc_lm2_1)
##                   Gender         Amount_purchased                Frequency 
##                 1.007185                 1.571742                 6.550741 
##            Last_purchase                  P_Child                  P_Youth 
##                16.041588                 6.999756                 7.730242 
##                   P_Cook                    P_DIY                    P_Art 
##                 4.675060                 2.113232                 6.263660 
## Amount_purchased:P_Youth Frequency:First_purchase          Frequency:P_Art 
##                10.366335                 9.185984                 4.366306 
##    Last_purchase:P_Youth   P_Child:First_purchase           P_Child:P_Cook 
##                 6.608138                 9.556593                 4.866074 
##             P_Cook:P_Art 
##                 2.813394

Remove Last_purchase, Amount_purchased:P_Youth, Frequency:First_purchase, and P_Child:First_purchase, which show high VIF values (near or above 10)

bbbc_lm2_2 = lm(data = bbbc_lm2_trained, Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + Last_purchase:P_Youth + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2_2)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Child:P_Cook + P_Cook:P_Art, data = bbbc_lm2_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82113 -0.20684 -0.10215  0.07888  0.94534 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.3573304  0.0320051  11.165  < 2e-16 ***
## Gender1               -0.1176548  0.0194322  -6.055 1.77e-09 ***
## Amount_purchased       0.0003135  0.0001071   2.928 0.003466 ** 
## Frequency             -0.0090326  0.0013688  -6.599 5.71e-11 ***
## P_Child               -0.0507128  0.0138707  -3.656 0.000265 ***
## P_Youth               -0.0977169  0.0311716  -3.135 0.001753 ** 
## P_Cook                -0.0554737  0.0153856  -3.606 0.000322 ***
## P_DIY                 -0.0422865  0.0156734  -2.698 0.007054 ** 
## P_Art                  0.3691289  0.0276425  13.354  < 2e-16 ***
## Frequency:P_Art       -0.0089962  0.0018532  -4.854 1.33e-06 ***
## P_Youth:Last_purchase  0.0089195  0.0043355   2.057 0.039829 *  
## P_Child:P_Cook         0.0130521  0.0083536   1.562 0.118391    
## P_Cook:P_Art          -0.0362256  0.0134283  -2.698 0.007059 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3576 on 1512 degrees of freedom
## Multiple R-squared:  0.2758, Adjusted R-squared:   0.27 
## F-statistic: 47.98 on 12 and 1512 DF,  p-value: < 2.2e-16

Multicollinearity check: VIF values

vif(bbbc_lm2_2)
##                Gender      Amount_purchased             Frequency 
##              1.006296              1.192679              1.354652 
##               P_Child               P_Youth                P_Cook 
##              2.304990              4.036195              2.693090 
##                 P_DIY                 P_Art       Frequency:P_Art 
##              1.227421              4.487716              3.912225 
## P_Youth:Last_purchase        P_Child:P_Cook          P_Cook:P_Art 
##              4.822456              3.522952              2.785372

A second round of p-value-based backward selection on the reduced model:

bbbc_lm2_3 = ols_step_backward_p(bbbc_lm2_2, prem = 0.05, details = FALSE)
bbbc_lm2_3$model
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
##           (Intercept)                Gender1       Amount_purchased  
##             0.3451476             -0.1159939              0.0003101  
##             Frequency                P_Child                P_Youth  
##            -0.0089816             -0.0359693             -0.0962058  
##                P_Cook                  P_DIY                  P_Art  
##            -0.0418821             -0.0401135              0.3697613  
##       Frequency:P_Art  P_Youth:Last_purchase           P_Cook:P_Art  
##            -0.0090406              0.0087674             -0.0361205

Refit with the variables retained:

bbbc_lm2_3 = lm(data = bbbc_lm2_trained, Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + Last_purchase:P_Youth + P_Cook:P_Art)
summary(bbbc_lm2_3)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Cook:P_Art, data = bbbc_lm2_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80923 -0.20561 -0.10483  0.08567  0.93807 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.3451476  0.0310555  11.114  < 2e-16 ***
## Gender1               -0.1159939  0.0194123  -5.975 2.86e-09 ***
## Amount_purchased       0.0003101  0.0001071   2.896 0.003838 ** 
## Frequency             -0.0089816  0.0013690  -6.561 7.34e-11 ***
## P_Child               -0.0359693  0.0101712  -3.536 0.000418 ***
## P_Youth               -0.0962058  0.0311714  -3.086 0.002063 ** 
## P_Cook                -0.0418821  0.0126964  -3.299 0.000994 ***
## P_DIY                 -0.0401135  0.0156191  -2.568 0.010317 *  
## P_Art                  0.3697613  0.0276527  13.372  < 2e-16 ***
## Frequency:P_Art       -0.0090406  0.0018538  -4.877 1.19e-06 ***
## P_Youth:Last_purchase  0.0087674  0.0043365   2.022 0.043375 *  
## P_Cook:P_Art          -0.0361205  0.0134345  -2.689 0.007253 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3578 on 1513 degrees of freedom
## Multiple R-squared:  0.2746, Adjusted R-squared:  0.2693 
## F-statistic: 52.07 on 11 and 1513 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(bbbc_lm2_3, which = 1:4)  # residuals vs fitted, Q-Q, scale-location, Cook's distance

Apply p-value-based backward selection to the model with all predictors

bbbc_lm3 = ols_step_backward_p(bbbc_lm, prem = 0.05, details = F)
bbbc_lm3$model
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
##      (Intercept)           Gender1  Amount_purchased         Frequency  
##        0.3727367        -0.1316464         0.0002742        -0.0110830  
##    Last_purchase           P_Child           P_Youth            P_Cook  
##        0.0894288        -0.1275991        -0.0973642        -0.1433497  
##            P_DIY             P_Art  
##       -0.1365578         0.1150034

Refit the linear regression with the variables retained by backward selection

bbbc_lm3 = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train)
summary(bbbc_lm3)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     data = bbbc_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9802 -0.2452 -0.1157  0.1655  1.0595 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3727367  0.0297590  12.525  < 2e-16 ***
## Gender1          -0.1316464  0.0200208  -6.575 6.56e-11 ***
## Amount_purchased  0.0002742  0.0001110   2.470   0.0136 *  
## Frequency        -0.0110830  0.0012128  -9.138  < 2e-16 ***
## Last_purchase     0.0894288  0.0116772   7.658 3.25e-14 ***
## P_Child          -0.1275991  0.0163571  -7.801 1.11e-14 ***
## P_Youth          -0.0973642  0.0200903  -4.846 1.38e-06 ***
## P_Cook           -0.1433497  0.0165218  -8.676  < 2e-16 ***
## P_DIY            -0.1365578  0.0197520  -6.914 6.82e-12 ***
## P_Art             0.1150034  0.0192719   5.967 2.97e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 1590 degrees of freedom
## Multiple R-squared:  0.2395, Adjusted R-squared:  0.2352 
## F-statistic: 55.63 on 9 and 1590 DF,  p-value: < 2.2e-16

Cook's distance diagnostic plot

cook_bbbc_lm3 = cooks.distance(bbbc_lm3)
plot(cook_bbbc_lm3, col = "red", pch = 20, cex = 1)
abline(h = 4/nrow(bbbc_train), lty = 2, col = "black")  # 4/n cutoff

large_cook_distance = which(cook_bbbc_lm3 > 4/nrow(bbbc_train))

bbbc_lm3_trained = bbbc_train[-large_cook_distance, ]

bbbc_lm3_1 = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
summary(bbbc_lm3_1)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     data = bbbc_lm3_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72682 -0.22418 -0.09923  0.12558  0.98094 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3589669  0.0283022  12.683  < 2e-16 ***
## Gender1          -0.1133411  0.0191199  -5.928 3.79e-09 ***
## Amount_purchased  0.0002985  0.0001063   2.808  0.00505 ** 
## Frequency        -0.0120967  0.0011619 -10.411  < 2e-16 ***
## Last_purchase     0.0837024  0.0122794   6.816 1.34e-11 ***
## P_Child          -0.1273699  0.0168130  -7.576 6.18e-14 ***
## P_Youth          -0.1241418  0.0204237  -6.078 1.53e-09 ***
## P_Cook           -0.1455776  0.0167062  -8.714  < 2e-16 ***
## P_DIY            -0.1276728  0.0198407  -6.435 1.65e-10 ***
## P_Art             0.1490765  0.0196550   7.585 5.78e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3523 on 1517 degrees of freedom
## Multiple R-squared:  0.2935, Adjusted R-squared:  0.2893 
## F-statistic: 70.03 on 9 and 1517 DF,  p-value: < 2.2e-16

Multicollinearity check: VIF values

vif(bbbc_lm3_1)
##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.003393         1.226169         1.009234        15.034206 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         3.617257         1.820319         3.384715         2.089291 
##            P_Art 
##         2.401423
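
As a reminder of what these numbers mean, a predictor's VIF equals 1/(1 - R^2) from regressing it on the other predictors; a sketch for Last_purchase:

# Auxiliary regression of Last_purchase on the remaining predictors
aux = lm(Last_purchase ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
1/(1 - summary(aux)$r.squared)  # matches the vif() value of about 15 above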

Remove Last_purchase, which has a high VIF value (about 15)

bbbc_lm3_2 = lm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
summary(bbbc_lm3_2)
## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7698 -0.2261 -0.1036  0.1216  0.9464 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3584078  0.0287228  12.478  < 2e-16 ***
## Gender1          -0.1108832  0.0194007  -5.715 1.31e-08 ***
## Amount_purchased  0.0003941  0.0001069   3.685 0.000236 ***
## Frequency        -0.0121444  0.0011792 -10.299  < 2e-16 ***
## P_Child          -0.0336805  0.0098271  -3.427 0.000626 ***
## P_Youth          -0.0360643  0.0160518  -2.247 0.024800 *  
## P_Cook           -0.0542681  0.0101315  -5.356 9.80e-08 ***
## P_DIY            -0.0379397  0.0150652  -2.518 0.011892 *  
## P_Art             0.2472133  0.0135798  18.205  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3575 on 1518 degrees of freedom
## Multiple R-squared:  0.2719, Adjusted R-squared:  0.268 
## F-statistic: 70.85 on 8 and 1518 DF,  p-value: < 2.2e-16
vif(bbbc_lm3_2)
##           Gender Amount_purchased        Frequency          P_Child 
##         1.003036         1.204798         1.009197         1.199848 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.091722         1.208629         1.169544         1.112987
par(mfrow = c(2, 2))
plot(bbbc_lm3_2, which = 1:4)  # residuals vs fitted, Q-Q, scale-location, Cook's distance
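
The linear models are not scored on the test set above; a rough out-of-sample check for the final one (a sketch; the 0.5 cutoff is an assumption mirroring the logistic models below):

lm_pred_prob = predict(bbbc_lm3_2, newdata = bbbc_test)  # predicted purchase probabilities
lm_pred_class = ifelse(lm_pred_prob >= 0.5, 1, 0)
table(Predicted = lm_pred_class, Actual = bbbc_test$Choice)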

Convert Choice to a factor for the logistic regression models

bbbc_train$Choice = as.factor(bbbc_train$Choice)
bbbc_test$Choice = as.factor(bbbc_test$Choice)

Full Logistic Regression Model

bbbc_log = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train, family = binomial)
summary(bbbc_log)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art, family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.38586  -0.66728  -0.43696  -0.02242   2.72238  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.3515281  0.2143839  -1.640   0.1011    
## Gender1          -0.8632319  0.1374499  -6.280 3.38e-10 ***
## Amount_purchased  0.0018641  0.0007918   2.354   0.0186 *  
## Frequency        -0.0755142  0.0165937  -4.551 5.35e-06 ***
## Last_purchase     0.6117713  0.0938127   6.521 6.97e-11 ***
## First_purchase   -0.0147792  0.0128027  -1.154   0.2483    
## P_Child          -0.8112489  0.1167067  -6.951 3.62e-12 ***
## P_Youth          -0.6370422  0.1433778  -4.443 8.87e-06 ***
## P_Cook           -0.9230066  0.1194814  -7.725 1.12e-14 ***
## P_DIY            -0.9058697  0.1437025  -6.304 2.90e-10 ***
## P_Art             0.6861124  0.1270176   5.402 6.60e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1392.2  on 1589  degrees of freedom
## AIC: 1414.2
## 
## Number of Fisher Scoring iterations: 5
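
Logistic coefficients are on the log-odds scale; exponentiating turns them into odds ratios (a quick sketch with Wald confidence intervals):

exp(cbind(OR = coef(bbbc_log), confint.default(bbbc_log)))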

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log, newdata = bbbc_test, type = "response")  # predicted purchase probabilities
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5, 1, 0)  # classify at a 0.5 cutoff
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1968  128
##          1  125   79
##                                           
##                Accuracy : 0.89            
##                  95% CI : (0.8765, 0.9025)
##     No Information Rate : 0.91            
##     P-Value [Acc > NIR] : 0.9995          
##                                           
##                   Kappa : 0.324           
##                                           
##  Mcnemar's Test P-Value : 0.8999          
##                                           
##             Sensitivity : 0.38164         
##             Specificity : 0.94028         
##          Pos Pred Value : 0.38725         
##          Neg Pred Value : 0.93893         
##              Prevalence : 0.09000         
##          Detection Rate : 0.03435         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66096         
##                                           
##        'Positive' Class : 1               
## 
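
The predict/threshold/confusionMatrix pattern repeats for every logistic model below; a small helper (hypothetical, mirroring the argument order used throughout) keeps those evaluations consistent:

evaluate_log_model = function(model, test_data, cutoff = 0.5) {
  pred_prob = predict.glm(model, newdata = test_data, type = "response")
  pred_class = ifelse(pred_prob >= cutoff, 1, 0)
  caret::confusionMatrix(as.factor(test_data$Choice), as.factor(pred_class), positive = "1")
}
# e.g. evaluate_log_model(bbbc_log2, bbbc_test)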

Full Logistic Regression with interaction terms

bbbc_log2 = glm(Choice ~ (.)^2, data = bbbc_train, family = binomial)
summary(bbbc_log2)
## 
## Call:
## glm(formula = Choice ~ (.)^2, family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.66574  -0.65647  -0.42036  -0.00011   2.62619  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                     -4.924e-01  4.846e-01  -1.016   0.3096  
## Gender1                         -9.323e-01  4.281e-01  -2.178   0.0294 *
## Amount_purchased                 2.517e-04  2.169e-03   0.116   0.9076  
## Frequency                       -3.397e-02  6.206e-02  -0.547   0.5841  
## Last_purchase                    6.055e-01  3.723e-01   1.627   0.1038  
## First_purchase                  -5.060e-02  5.556e-02  -0.911   0.3625  
## P_Child                         -7.134e-01  4.117e-01  -1.733   0.0832 .
## P_Youth                         -1.269e-02  5.098e-01  -0.025   0.9801  
## P_Cook                          -2.425e-01  4.232e-01  -0.573   0.5665  
## P_DIY                           -5.820e-01  5.164e-01  -1.127   0.2597  
## P_Art                            8.205e-01  4.654e-01   1.763   0.0779 .
## Gender1:Amount_purchased         2.726e-03  1.709e-03   1.596   0.1106  
## Gender1:Frequency               -5.859e-02  3.833e-02  -1.528   0.1264  
## Gender1:Last_purchase           -2.992e-01  2.165e-01  -1.382   0.1669  
## Gender1:First_purchase           4.681e-02  3.148e-02   1.487   0.1370  
## Gender1:P_Child                  2.939e-02  2.576e-01   0.114   0.9092  
## Gender1:P_Youth                  6.852e-02  3.332e-01   0.206   0.8371  
## Gender1:P_Cook                   2.024e-02  2.654e-01   0.076   0.9392  
## Gender1:P_DIY                   -5.411e-01  3.241e-01  -1.670   0.0950 .
## Gender1:P_Art                    3.834e-01  2.830e-01   1.355   0.1755  
## Amount_purchased:Frequency       3.692e-05  2.183e-04   0.169   0.8657  
## Amount_purchased:Last_purchase   1.662e-04  1.162e-03   0.143   0.8862  
## Amount_purchased:First_purchase -3.205e-05  1.743e-04  -0.184   0.8541  
## Amount_purchased:P_Child         1.779e-03  1.448e-03   1.228   0.2193  
## Amount_purchased:P_Youth        -4.112e-03  1.791e-03  -2.296   0.0217 *
## Amount_purchased:P_Cook         -1.025e-03  1.520e-03  -0.674   0.5001  
## Amount_purchased:P_DIY           1.438e-03  1.843e-03   0.780   0.4353  
## Amount_purchased:P_Art           4.243e-04  1.593e-03   0.266   0.7899  
## Frequency:Last_purchase          1.202e-02  1.514e-02   0.794   0.4272  
## Frequency:First_purchase         7.832e-04  9.614e-04   0.815   0.4153  
## Frequency:P_Child               -2.940e-02  2.484e-02  -1.183   0.2366  
## Frequency:P_Youth               -2.877e-02  3.080e-02  -0.934   0.3502  
## Frequency:P_Cook                -3.762e-02  2.471e-02  -1.523   0.1279  
## Frequency:P_DIY                 -6.784e-03  3.043e-02  -0.223   0.8236  
## Frequency:P_Art                 -3.816e-02  2.808e-02  -1.359   0.1742  
## Last_purchase:First_purchase     1.447e-03  7.004e-03   0.207   0.8363  
## Last_purchase:P_Child           -2.727e-02  6.489e-02  -0.420   0.6742  
## Last_purchase:P_Youth            1.975e-01  1.167e-01   1.693   0.0905 .
## Last_purchase:P_Cook            -1.965e-02  7.094e-02  -0.277   0.7817  
## Last_purchase:P_DIY             -1.001e-01  1.043e-01  -0.959   0.3374  
## Last_purchase:P_Art              3.443e-02  1.365e-01   0.252   0.8009  
## First_purchase:P_Child          -8.368e-03  1.401e-02  -0.597   0.5503  
## First_purchase:P_Youth           3.070e-03  1.909e-02   0.161   0.8722  
## First_purchase:P_Cook            3.805e-03  1.363e-02   0.279   0.7801  
## First_purchase:P_DIY             4.674e-03  1.819e-02   0.257   0.7973  
## First_purchase:P_Art            -2.875e-03  1.800e-02  -0.160   0.8731  
## P_Child:P_Youth                 -1.283e-01  1.824e-01  -0.703   0.4820  
## P_Child:P_Cook                   2.110e-01  1.200e-01   1.757   0.0788 .
## P_Child:P_DIY                    3.803e-02  1.746e-01   0.218   0.8276  
## P_Child:P_Art                   -7.824e-02  1.874e-01  -0.418   0.6762  
## P_Youth:P_Cook                  -3.904e-01  1.808e-01  -2.159   0.0309 *
## P_Youth:P_DIY                   -2.996e-01  2.589e-01  -1.157   0.2472  
## P_Youth:P_Art                    3.997e-02  2.307e-01   0.173   0.8625  
## P_Cook:P_DIY                    -8.013e-02  1.985e-01  -0.404   0.6864  
## P_Cook:P_Art                    -1.709e-01  1.856e-01  -0.921   0.3571  
## P_DIY:P_Art                      2.937e-01  2.351e-01   1.250   0.2114  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1340.0  on 1544  degrees of freedom
## AIC: 1452
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1975  121
##          1  124   80
##                                           
##                Accuracy : 0.8935          
##                  95% CI : (0.8801, 0.9058)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.9993          
##                                           
##                   Kappa : 0.3367          
##                                           
##  Mcnemar's Test P-Value : 0.8983          
##                                           
##             Sensitivity : 0.39801         
##             Specificity : 0.94092         
##          Pos Pred Value : 0.39216         
##          Neg Pred Value : 0.94227         
##              Prevalence : 0.08739         
##          Detection Rate : 0.03478         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66947         
##                                           
##        'Positive' Class : 1               
## 

AIC-based backward selection (via step()) on the interaction logistic model

bbbc_log3 = step(bbbc_log2, direction = "backward", trace = FALSE)
summary(bbbc_log3)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY + 
##     Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth + 
##     Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook + 
##     P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, family = binomial, 
##     data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.08079  -0.65926  -0.42548   0.01452   2.70453  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.1777724  0.3168490  -0.561 0.574754    
## Gender1                  -1.2238857  0.3354875  -3.648 0.000264 ***
## Amount_purchased          0.0005102  0.0013783   0.370 0.711236    
## Frequency                -0.0955830  0.0186950  -5.113 3.17e-07 ***
## Last_purchase             0.5393426  0.1044378   5.164 2.41e-07 ***
## First_purchase            0.0112353  0.0164963   0.681 0.495821    
## P_Child                  -1.0831641  0.2356436  -4.597 4.29e-06 ***
## P_Youth                  -0.3039130  0.3238519  -0.938 0.348022    
## P_Cook                   -0.8759147  0.1640133  -5.341 9.27e-08 ***
## P_DIY                    -0.4908395  0.2726998  -1.800 0.071872 .  
## P_Art                     0.7101141  0.1653700   4.294 1.75e-05 ***
## Gender1:Amount_purchased  0.0025071  0.0015265   1.642 0.100504    
## Gender1:P_DIY            -0.5392425  0.2355548  -2.289 0.022065 *  
## Amount_purchased:P_Child  0.0019907  0.0008728   2.281 0.022559 *  
## Amount_purchased:P_Youth -0.0037796  0.0013114  -2.882 0.003951 ** 
## Last_purchase:P_Youth     0.1459361  0.0490033   2.978 0.002901 ** 
## Last_purchase:P_DIY      -0.0772748  0.0393397  -1.964 0.049496 *  
## First_purchase:P_Child   -0.0134187  0.0048060  -2.792 0.005238 ** 
## P_Child:P_Cook            0.1745490  0.0689410   2.532 0.011346 *  
## P_Youth:P_Cook           -0.2992836  0.1421260  -2.106 0.035225 *  
## P_Cook:P_Art             -0.1614371  0.0903445  -1.787 0.073953 .  
## P_DIY:P_Art               0.3362011  0.1614869   2.082 0.037351 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1358.2  on 1578  degrees of freedom
## AIC: 1402.2
## 
## Number of Fisher Scoring iterations: 5

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1973  123
##          1  120   84
##                                           
##                Accuracy : 0.8943          
##                  95% CI : (0.8811, 0.9066)
##     No Information Rate : 0.91            
##     P-Value [Acc > NIR] : 0.9954          
##                                           
##                   Kappa : 0.3508          
##                                           
##  Mcnemar's Test P-Value : 0.8979          
##                                           
##             Sensitivity : 0.40580         
##             Specificity : 0.94267         
##          Pos Pred Value : 0.41176         
##          Neg Pred Value : 0.94132         
##              Prevalence : 0.09000         
##          Detection Rate : 0.03652         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.67423         
##                                           
##        'Positive' Class : 1               
## 

Cook's distance diagnostic plot

cook_distance_log3 = cooks.distance(bbbc_log3)
plot(cook_distance_log3, col = "red", pch = 20, cex = 1)
abline(h = 4/nrow(bbbc_train), lty = 2, col = "black")  # 4/n cutoff

Remove influential observations with Cook's distance above 4/n

large_cook_distance = which(cook_distance_log3 > 4/nrow(bbbc_train))

bbbc_log3_trained = bbbc_train[-large_cook_distance, ]

bbbc_log3_2 = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY + Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth + Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, data = bbbc_log3_trained, family = binomial)
summary(bbbc_log3_2)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY + 
##     Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth + 
##     Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook + 
##     P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, family = binomial, 
##     data = bbbc_log3_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.08528  -0.55742  -0.28775  -0.02826   2.74734  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               0.0045600  0.3819120   0.012 0.990474    
## Gender1                  -1.2870270  0.4036421  -3.189 0.001430 ** 
## Amount_purchased          0.0009134  0.0016268   0.561 0.574471    
## Frequency                -0.1372397  0.0272378  -5.039 4.69e-07 ***
## Last_purchase             0.7056159  0.1516651   4.652 3.28e-06 ***
## First_purchase            0.0090064  0.0241811   0.372 0.709553    
## P_Child                  -1.3932946  0.3178214  -4.384 1.17e-05 ***
## P_Youth                  -0.8798479  0.4633697  -1.899 0.057590 .  
## P_Cook                   -1.1053726  0.1975767  -5.595 2.21e-08 ***
## P_DIY                    -0.6485602  0.3615495  -1.794 0.072839 .  
## P_Art                     0.7290163  0.1954992   3.729 0.000192 ***
## Gender1:Amount_purchased  0.0030971  0.0018839   1.644 0.100182    
## Gender1:P_DIY            -1.2880630  0.3720225  -3.462 0.000536 ***
## Amount_purchased:P_Child  0.0031359  0.0013007   2.411 0.015913 *  
## Amount_purchased:P_Youth -0.0063309  0.0020373  -3.107 0.001887 ** 
## Last_purchase:P_Youth     0.4049317  0.0967919   4.184 2.87e-05 ***
## Last_purchase:P_DIY      -0.1518262  0.0756022  -2.008 0.044619 *  
## First_purchase:P_Child   -0.0260979  0.0077131  -3.384 0.000715 ***
## P_Child:P_Cook            0.2090575  0.1327920   1.574 0.115413    
## P_Youth:P_Cook           -1.1989748  0.3181952  -3.768 0.000165 ***
## P_Cook:P_Art              0.0207649  0.1465806   0.142 0.887347    
## P_DIY:P_Art               0.8832159  0.3110327   2.840 0.004517 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1584.7  on 1502  degrees of freedom
## Residual deviance: 1014.2  on 1481  degrees of freedom
## AIC: 1058.2
## 
## Number of Fisher Scoring iterations: 7

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3_2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1946  150
##          1  120   84
##                                           
##                Accuracy : 0.8826          
##                  95% CI : (0.8687, 0.8955)
##     No Information Rate : 0.8983          
##     P-Value [Acc > NIR] : 0.99328         
##                                           
##                   Kappa : 0.319           
##                                           
##  Mcnemar's Test P-Value : 0.07758         
##                                           
##             Sensitivity : 0.35897         
##             Specificity : 0.94192         
##          Pos Pred Value : 0.41176         
##          Neg Pred Value : 0.92844         
##              Prevalence : 0.10174         
##          Detection Rate : 0.03652         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65045         
##                                           
##        'Positive' Class : 1               
## 

Multicollinearity check: VIF values

vif(bbbc_log3_2)
##                   Gender         Amount_purchased                Frequency 
##                 6.327947                 3.231413                 3.959253 
##            Last_purchase           First_purchase                  P_Child 
##                18.097883                14.781570                 9.903587 
##                  P_Youth                   P_Cook                    P_DIY 
##                 7.031414                 3.818008                 4.813335 
##                    P_Art  Gender:Amount_purchased             Gender:P_DIY 
##                 2.749984                 8.253189                 2.942640 
## Amount_purchased:P_Child Amount_purchased:P_Youth    Last_purchase:P_Youth 
##                13.202736                 8.600721                12.389289 
##      Last_purchase:P_DIY   First_purchase:P_Child           P_Child:P_Cook 
##                10.272898                 9.451090                 4.470351 
##           P_Youth:P_Cook             P_Cook:P_Art              P_DIY:P_Art 
##                 5.266006                 4.369223                 6.828573

Remove the variables and interaction terms with high VIF values

bbbc_log3_3 = glm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY + P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, data = bbbc_log3_trained, family = binomial)
summary(bbbc_log3_3)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY + 
##     P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, 
##     family = binomial, data = bbbc_log3_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.12475  -0.59325  -0.35155  -0.08197   2.61190  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.1466637  0.2597245  -0.565 0.572285    
## Gender1          -0.7279068  0.1719913  -4.232 2.31e-05 ***
## Amount_purchased  0.0033546  0.0009092   3.690 0.000225 ***
## Frequency        -0.1335029  0.0136510  -9.780  < 2e-16 ***
## P_Child          -0.3777533  0.1237906  -3.052 0.002277 ** 
## P_Youth          -0.0992598  0.1901371  -0.522 0.601640    
## P_Cook           -0.4114009  0.1542256  -2.668 0.007641 ** 
## P_DIY            -0.5006975  0.2532521  -1.977 0.048034 *  
## P_Art             1.5406510  0.1541179   9.997  < 2e-16 ***
## Gender1:P_DIY    -0.7268812  0.3192180  -2.277 0.022782 *  
## P_Child:P_Cook    0.0939201  0.0924061   1.016 0.309447    
## P_Youth:P_Cook   -0.2982718  0.1659704  -1.797 0.072314 .  
## P_Cook:P_Art     -0.0919085  0.1247962  -0.736 0.461445    
## P_DIY:P_Art       0.5070186  0.2025299   2.503 0.012300 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1584.7  on 1502  degrees of freedom
## Residual deviance: 1106.2  on 1489  degrees of freedom
## AIC: 1134.2
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3_3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1976  120
##          1  130   74
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8779, 0.9037)
##     No Information Rate : 0.9157          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.3124          
##                                           
##  Mcnemar's Test P-Value : 0.5692          
##                                           
##             Sensitivity : 0.38144         
##             Specificity : 0.93827         
##          Pos Pred Value : 0.36275         
##          Neg Pred Value : 0.94275         
##              Prevalence : 0.08435         
##          Detection Rate : 0.03217         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65986         
##                                           
##        'Positive' Class : 1               
## 

Check multicollinearity with VIF

vif(bbbc_log3_3)
##           Gender Amount_purchased        Frequency          P_Child 
##         1.244340         1.134827         1.052314         1.982088 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.721617         2.835163         3.023992         1.965903 
##     Gender:P_DIY   P_Child:P_Cook   P_Youth:P_Cook     P_Cook:P_Art 
##         3.002732         3.658630         2.453211         4.223305 
##      P_DIY:P_Art 
##         3.828999
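
All VIF values in the reduced model are now below 5, a common rule of thumb for acceptable multicollinearity. As a reminder of what the statistic measures, the VIF for a predictor is 1/(1 - R^2) from regressing that predictor on all the others. A minimal sketch of that definition (illustrative only, not part of the original analysis, using the main-effects predictors and the bbbc_log3_trained data from above):

# Regress one predictor on the others and invert 1 - R^2.
r2 = summary(lm(Frequency ~ Gender + Amount_purchased + P_Child + P_Youth +
                  P_Cook + P_DIY + P_Art, data = bbbc_log3_trained))$r.squared
1 / (1 - r2)  # should be close to vif()'s value for Frequency in a main-effects model
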
bbbc_log3_4 = step(bbbc_log3_3, direction = "backward", trace = FALSE)  # AIC-based backward elimination
summary(bbbc_log3_4)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY + 
##     P_Youth:P_Cook + P_DIY:P_Art, family = binomial, data = bbbc_log3_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.09834  -0.59177  -0.35310  -0.07416   2.60709  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.1748168  0.2481528  -0.704 0.481139    
## Gender1          -0.7243421  0.1717539  -4.217 2.47e-05 ***
## Amount_purchased  0.0033395  0.0009082   3.677 0.000236 ***
## Frequency        -0.1328249  0.0135764  -9.784  < 2e-16 ***
## P_Child          -0.3105357  0.0966700  -3.212 0.001317 ** 
## P_Youth          -0.0940805  0.1890695  -0.498 0.618768    
## P_Cook           -0.3929480  0.1164072  -3.376 0.000736 ***
## P_DIY            -0.4764514  0.2524590  -1.887 0.059128 .  
## P_Art             1.5095302  0.1366903  11.043  < 2e-16 ***
## Gender1:P_DIY    -0.7203885  0.3188350  -2.259 0.023856 *  
## P_Youth:P_Cook   -0.3141036  0.1653946  -1.899 0.057549 .  
## P_DIY:P_Art       0.4675144  0.1975315   2.367 0.017943 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1584.7  on 1502  degrees of freedom
## Residual deviance: 1107.4  on 1491  degrees of freedom
## AIC: 1131.4
## 
## Number of Fisher Scoring iterations: 6
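
Backward elimination drops P_Child:P_Cook and P_Cook:P_Art and lowers the AIC from 1134.2 to 1131.4. A quick way to confirm the reduced model is preferred, assuming both fitted objects are still in the workspace, is to compare the AICs directly and run a likelihood-ratio test on the dropped terms:

AIC(bbbc_log3_3, bbbc_log3_4)                    # lower AIC is preferred
anova(bbbc_log3_4, bbbc_log3_3, test = "Chisq")  # LR test of the dropped interactions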

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3_4, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .18178, 1, 0)  # cutoff from the optimal-threshold analysis below
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1976  120
##          1  130   74
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8779, 0.9037)
##     No Information Rate : 0.9157          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.3124          
##                                           
##  Mcnemar's Test P-Value : 0.5692          
##                                           
##             Sensitivity : 0.38144         
##             Specificity : 0.93827         
##          Pos Pred Value : 0.36275         
##          Neg Pred Value : 0.94275         
##              Prevalence : 0.08435         
##          Detection Rate : 0.03217         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65986         
##                                           
##        'Positive' Class : 1               
## 
par(mfrow=c(2,2))
plot(bbbc_log3_4, which=c(1:4)) 

Optimal cutoff point on the full logistic regression model with interaction effect terms, using the backward selection method

pred = prediction(predict(bbbc_log3_4, newdata = bbbc_test, type = "response"), bbbc_test$Choice)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <- which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))  # index where sensitivity and specificity cross
min.x <- unlist(performance(pred, "sens")@x.values)[min.diff]  # cutoff at the crossing point
min.y <- unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <- min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
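
The threshold above is chosen where the sensitivity and specificity curves cross. An alternative criterion, sketched below on the same ROCR prediction object, is Youden's J statistic, which picks the cutoff maximizing sensitivity + specificity - 1; the two criteria are often close but need not agree exactly:

sens = performance(pred, "sens")
spec = performance(pred, "spec")
j = unlist(sens@y.values) + unlist(spec@y.values) - 1  # Youden's J at each cutoff
unlist(sens@x.values)[which.max(j)]                    # cutoff maximizing J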

Full logistic regression model using backward selection

bbbc_log4 = step(bbbc_log, direction = "backward", trace = FALSE)
summary(bbbc_log4)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.44132  -0.66647  -0.43745  -0.01855   2.72460  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.2833949  0.2062721  -1.374   0.1695    
## Gender1          -0.8660575  0.1373268  -6.307 2.85e-10 ***
## Amount_purchased  0.0018357  0.0007908   2.321   0.0203 *  
## Frequency        -0.0903261  0.0106304  -8.497  < 2e-16 ***
## Last_purchase     0.5536689  0.0784519   7.057 1.70e-12 ***
## P_Child          -0.8181807  0.1163377  -7.033 2.02e-12 ***
## P_Youth          -0.6424923  0.1432548  -4.485 7.29e-06 ***
## P_Cook           -0.9330131  0.1190073  -7.840 4.51e-15 ***
## P_DIY            -0.9101106  0.1433591  -6.348 2.17e-10 ***
## P_Art             0.6643371  0.1255243   5.292 1.21e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1393.5  on 1590  degrees of freedom
## AIC: 1413.5
## 
## Number of Fisher Scoring iterations: 5

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log4, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1973  123
##          1  127   77
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8779, 0.9037)
##     No Information Rate : 0.913           
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.3216          
##                                           
##  Mcnemar's Test P-Value : 0.8495          
##                                           
##             Sensitivity : 0.38500         
##             Specificity : 0.93952         
##          Pos Pred Value : 0.37745         
##          Neg Pred Value : 0.94132         
##              Prevalence : 0.08696         
##          Detection Rate : 0.03348         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66226         
##                                           
##        'Positive' Class : 1               
## 

Cook's distance diagnostic plot

cook_distance_log4 = cooks.distance(bbbc_log4)
plot(cook_distance_log4, col = "red", pch = 20, cex = 1)
abline(h = 4/nrow(bbbc_train), lty = 2, col = "black")  # rule-of-thumb influence cutoff 4/n

Remove outliers

large_cook_distance = which(cooks.distance(bbbc_log4) > (4/nrow(bbbc_train)))  # rows exceeding the 4/n cutoff

bbbc_log4_trained = bbbc_train[-large_cook_distance,]
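
The residual degrees of freedom in the refit below imply that 104 rows were dropped; a quick sanity check using only the objects just created:

length(large_cook_distance)                   # number of flagged observations
nrow(bbbc_train) - nrow(bbbc_log4_trained)    # should match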

bbbc_log4_2 = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_log4_trained, family = binomial) 
summary(bbbc_log4_2)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     family = binomial, data = bbbc_log4_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.12566  -0.52618  -0.29449  -0.07674   2.78007  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.0359902  0.2490070  -0.145  0.88508    
## Gender1          -0.9746586  0.1653465  -5.895 3.75e-09 ***
## Amount_purchased  0.0025809  0.0009611   2.685  0.00725 ** 
## Frequency        -0.1531425  0.0151177 -10.130  < 2e-16 ***
## Last_purchase     0.8096840  0.1094971   7.395 1.42e-13 ***
## P_Child          -1.2429659  0.1634518  -7.604 2.86e-14 ***
## P_Youth          -1.2818878  0.2081274  -6.159 7.31e-10 ***
## P_Cook           -1.3858412  0.1629903  -8.503  < 2e-16 ***
## P_DIY            -1.4611262  0.1967092  -7.428 1.10e-13 ***
## P_Art             0.9484052  0.1614054   5.876 4.21e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1558.29  on 1495  degrees of freedom
## Residual deviance:  999.36  on 1486  degrees of freedom
## AIC: 1019.4
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log4_2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1946  150
##          1  119   85
##                                           
##                Accuracy : 0.883           
##                  95% CI : (0.8692, 0.8959)
##     No Information Rate : 0.8978          
##     P-Value [Acc > NIR] : 0.99024         
##                                           
##                   Kappa : 0.323           
##                                           
##  Mcnemar's Test P-Value : 0.06738         
##                                           
##             Sensitivity : 0.36170         
##             Specificity : 0.94237         
##          Pos Pred Value : 0.41667         
##          Neg Pred Value : 0.92844         
##              Prevalence : 0.10217         
##          Detection Rate : 0.03696         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65204         
##                                           
##        'Positive' Class : 1               
## 

Check multicollinearity with VIF

vif(bbbc_log4_2)
##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.034201         1.191262         1.078824        12.954224 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         3.147356         1.838353         3.218469         2.048614 
##            P_Art 
##         1.984071
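
Last_purchase's VIF of roughly 13 is far above the usual cutoff of 5-10, which is expected: it is the purchase-recency count most directly tied to the per-category purchase totals. A minimal sketch of that check, assuming the column names shown in the structure output:

# Correlation of Last_purchase with the category purchase counts.
cor(bbbc_log4_trained$Last_purchase,
    bbbc_log4_trained[, c("P_Child", "P_Youth", "P_Cook", "P_DIY", "P_Art")])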

Removing variables with high VIF

bbbc_log4_3 = glm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_log4_trained, family = binomial) 
summary(bbbc_log4_3)
## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art, family = binomial, 
##     data = bbbc_log4_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.03973  -0.55738  -0.32735  -0.09369   2.56494  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.0934399  0.2419181  -0.386 0.699315    
## Gender1          -0.9195610  0.1606789  -5.723 1.05e-08 ***
## Amount_purchased  0.0034735  0.0009316   3.729 0.000192 ***
## Frequency        -0.1442607  0.0143420 -10.059  < 2e-16 ***
## P_Child          -0.3050191  0.0955213  -3.193 0.001407 ** 
## P_Youth          -0.3524054  0.1523640  -2.313 0.020727 *  
## P_Cook           -0.4693417  0.0979090  -4.794 1.64e-06 ***
## P_DIY            -0.5691434  0.1467527  -3.878 0.000105 ***
## P_Art             1.7802192  0.1261186  14.115  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1558.3  on 1495  degrees of freedom
## Residual deviance: 1057.9  on 1487  degrees of freedom
## AIC: 1075.9
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log4_3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .16438,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1473  623
##          1   61  143
##                                           
##                Accuracy : 0.7026          
##                  95% CI : (0.6835, 0.7212)
##     No Information Rate : 0.667           
##     P-Value [Acc > NIR] : 0.0001376       
##                                           
##                   Kappa : 0.18            
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.18668         
##             Specificity : 0.96023         
##          Pos Pred Value : 0.70098         
##          Neg Pred Value : 0.70277         
##              Prevalence : 0.33304         
##          Detection Rate : 0.06217         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.57346         
##                                           
##        'Positive' Class : 1               
## 
Odds ratios

OR = exp(coef(bbbc_log4_3))
round(OR, 3)
##      (Intercept)          Gender1 Amount_purchased        Frequency 
##            0.911            0.399            1.003            0.866 
##          P_Child          P_Youth           P_Cook            P_DIY 
##            0.737            0.703            0.625            0.566 
##            P_Art 
##            5.931
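
The odds ratios read multiplicatively: each additional art book purchased multiplies the odds of buying by about 5.9, while each extra cookbook multiplies them by about 0.63, holding the other predictors fixed. To turn the model into a single customer-level probability, a hedged sketch with a hypothetical customer profile (all values below are invented for illustration):

# Hypothetical customer, values invented for illustration only.
new_cust = data.frame(Gender = factor(1, levels = c(0, 1)),
                      Amount_purchased = 200, Frequency = 10,
                      P_Child = 0, P_Youth = 0, P_Cook = 0, P_DIY = 0, P_Art = 2)
predict(bbbc_log4_3, newdata = new_cust, type = "response")  # predicted purchase probability
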
vif(bbbc_log4_3)
##           Gender Amount_purchased        Frequency          P_Child 
##         1.032861         1.169365         1.058566         1.169014 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.087578         1.200231         1.192834         1.292923
par(mfrow=c(2,2))
plot(bbbc_log4_3, which=c(1:4)) 

Check linearity between the continuous predictors and the logit

probabilities <- predict(bbbc_log4_3, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)

# Select only numeric variables
mydata = dplyr::select_if(bbbc_log4_trained, is.numeric) 
predictors <- colnames(mydata)
# Binding the logit and tidying the data for plot
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities))) 
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)

#making the plot
ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'

Optimal cutoff point on the full logistic regression model without interaction effect terms, using the backward selection method

pred = prediction(predict(bbbc_log4_3, newdata = bbbc_test, type = "response"), bbbc_test$Choice)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <- which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x <- unlist(performance(pred, "sens")@x.values)[min.diff]
min.y <- unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <- min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

Linear SVM

set.seed(1)
form1 = Choice ~ .
tuned_svm = tune.svm(form1, data = bbbc_train, kernel = "linear", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm$best.parameters
##    gamma cost
## 61  0.01  0.7
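
tune.svm performs a grid search with 10-fold cross-validation by default; note that a linear kernel does not use gamma, so the grid effectively varies only cost. The full cross-validation error surface is kept on the tuned object (a sketch, using e1071's tune-object fields):

head(tuned_svm$performances)  # CV error and dispersion for each grid point
plot(tuned_svm)               # error surface over the gamma/cost grid
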
bbbc_svm = svm(Choice ~ ., data = bbbc_train, kernel = "linear", gamma = tuned_svm$best.parameters$gamma, cost = tuned_svm$best.parameters$cost)
summary(bbbc_svm)
## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, kernel = "linear", gamma = tuned_svm$best.parameters$gamma, 
##     cost = tuned_svm$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.7 
## 
## Number of Support Vectors:  739
## 
##  ( 367 372 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion Matrix

bbbc_svm_predict1 = predict(bbbc_svm, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict1), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2011   85
##          1  147   57
##                                           
##                Accuracy : 0.8991          
##                  95% CI : (0.8861, 0.9111)
##     No Information Rate : 0.9383          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2768          
##                                           
##  Mcnemar's Test P-Value : 6.206e-05       
##                                           
##             Sensitivity : 0.40141         
##             Specificity : 0.93188         
##          Pos Pred Value : 0.27941         
##          Neg Pred Value : 0.95945         
##              Prevalence : 0.06174         
##          Detection Rate : 0.02478         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66664         
##                                           
##        'Positive' Class : 1               
## 

Polynomial SVM

set.seed(1)
tuned_svm2 = tune.svm(form1, data = bbbc_train, kernel = "polynomial", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm2$best.parameters
##    gamma cost
## 90   0.1  0.9
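
The polynomial kernel's degree stays at its default of 3 here; it could be added to the search grid as well. A minimal sketch (not run in the original analysis, with a smaller grid to keep it cheap):

tuned_svm2b = tune.svm(form1, data = bbbc_train, kernel = "polynomial",
                       degree = 2:4, gamma = c(0.05, 0.1), cost = c(0.5, 1))
tuned_svm2b$best.parameters
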
bbbc_svm2 = svm(formula = form1, data = bbbc_train, kernel = "polynomial", gamma = tuned_svm2$best.parameters$gamma, cost = tuned_svm2$best.parameters$cost)
summary(bbbc_svm2)
## 
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "polynomial", gamma = tuned_svm2$best.parameters$gamma, 
##     cost = tuned_svm2$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.9 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  749
## 
##  ( 356 393 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion matrix

bbbc_svm_predict2 = predict(bbbc_svm2, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict2), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2050   46
##          1  168   36
##                                           
##                Accuracy : 0.907           
##                  95% CI : (0.8943, 0.9185)
##     No Information Rate : 0.9643          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2117          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.43902         
##             Specificity : 0.92426         
##          Pos Pred Value : 0.17647         
##          Neg Pred Value : 0.97805         
##              Prevalence : 0.03565         
##          Detection Rate : 0.01565         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.68164         
##                                           
##        'Positive' Class : 1               
## 

Radial SVM

set.seed(1)
tuned_svm3 = tune.svm(form1, data = bbbc_train, kernel = "radial", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm3$best.parameters
##    gamma cost
## 75  0.05  0.8
bbbc_svm3 = svm(formula = form1, data = bbbc_train, kernel = "radial", gamma = tuned_svm3$best.parameters$gamma, cost = tuned_svm3$best.parameters$cost)
summary(bbbc_svm3)
## 
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "radial", gamma = tuned_svm3$best.parameters$gamma, 
##     cost = tuned_svm3$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.8 
## 
## Number of Support Vectors:  782
## 
##  ( 371 411 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion Matrix

bbbc_svm_predict3 = predict(bbbc_svm3, bbbc_test, type = "response")
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_svm_predict3), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2054   42
##          1  166   38
##                                          
##                Accuracy : 0.9096         
##                  95% CI : (0.8971, 0.921)
##     No Information Rate : 0.9652         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2291         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.47500        
##             Specificity : 0.92523        
##          Pos Pred Value : 0.18627        
##          Neg Pred Value : 0.97996        
##              Prevalence : 0.03478        
##          Detection Rate : 0.01652        
##    Detection Prevalence : 0.08870        
##       Balanced Accuracy : 0.70011        
##                                          
##        'Positive' Class : 1              
## 

Sigmoid SVM

set.seed(1)
tuned_svm4 = tune.svm(form1, data = bbbc_train, kernel = "sigmoid", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm4$best.parameters
##    gamma cost
## 53  0.03  0.6
bbbc_svm4 = svm(formula = form1, data = bbbc_train, kernel = "sigmoid", gamma = tuned_svm4$best.parameters$gamma, cost = tuned_svm4$best.parameters$cost)
summary(bbbc_svm4)
## 
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "sigmoid", gamma = tuned_svm4$best.parameters$gamma, 
##     cost = tuned_svm4$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  0.6 
##      coef.0:  0 
## 
## Number of Support Vectors:  762
## 
##  ( 379 383 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion Matrix

bbbc_svm_predict4 = predict(bbbc_svm4, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict4), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1983  113
##          1  144   60
##                                           
##                Accuracy : 0.8883          
##                  95% CI : (0.8747, 0.9009)
##     No Information Rate : 0.9248          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.2579          
##                                           
##  Mcnemar's Test P-Value : 0.0613          
##                                           
##             Sensitivity : 0.34682         
##             Specificity : 0.93230         
##          Pos Pred Value : 0.29412         
##          Neg Pred Value : 0.94609         
##              Prevalence : 0.07522         
##          Detection Rate : 0.02609         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.63956         
##                                           
##        'Positive' Class : 1               
## 

Logistic Regression Model Analysis on New Mail Campaign

The cell counts below come from the confusion matrix of bbbc_log4_3 at the 0.16438 cutoff, scaled to a prospect list of 50,000 customers.

proportion_00 = 1473/2300
proportion_01 = 623/2300
proportion_10 = 61/2300
proportion_11 = 143/2300

new_customer = 50000

print("Proportion of people who are not going to buy book")
## [1] "Proportion of people who are not going to buy book"
proportion_00
## [1] 0.6404348
print("Proportion of people who we predicted would not buy, but did buy")
## [1] "Proportion of people who we predicted would not buy, but did buy"
proportion_01
## [1] 0.2708696
print("Proportion of people who we predicted would buy, but did not buy")
## [1] "Proportion of people who we predicted would buy, but did not buy"
proportion_10
## [1] 0.02652174
print("Proportion of people who we predicted will buy")
## [1] "Proportion of people who we predicted will buy"
proportion_11
## [1] 0.06217391
print("Number of people who are not going to buy book")
## [1] "Number of people who are not going to buy book"
number_00 = proportion_00*new_customer
number_00
## [1] 32021.74
print("Number of people who we predicted would not buy, but did buy")
## [1] "Number of people who we predicted would not buy, but did buy"
number_01 = proportion_01*new_customer
number_01
## [1] 13543.48
print("Number of people who we predicted would buy, but did not buy")
## [1] "Number of people who we predicted would buy, but did not buy"
number_10 = proportion_10*new_customer
number_10
## [1] 1326.087
print("Number of people who we predicted will buy")
## [1] "Number of people who we predicted will buy"
number_11 = proportion_11*new_customer
number_11
## [1] 3108.696
print("Number of people to be sent mail")
## [1] "Number of people to be sent mail"
number_send = number_10 + number_11
number_send
## [1] 4434.783
print("Number of people who will buy out of the number of people to be sent mail")
## [1] "Number of people who will buy out of the number of people to be sent mail"
number_11
## [1] 3108.696
print("proportion of people who will buy from the number of people sent mail")
## [1] "proportion of people who will buy from the number of people sent mail"
proportion_sent = number_11/number_send
proportion_sent
## [1] 0.7009804
print("cost of sending mail to all in $")
## [1] "cost of sending mail to all in $"
cost_total = number_send * 0.65
cost_total
## [1] 2882.609
print("cost for sending book to total number of people who buy in $")
## [1] "cost for sending book to total number of people who buy in $"
cost_buy = number_11*15
cost_buy
## [1] 46630.43
print("cost overhead in $")
## [1] "cost overhead in $"
cost_overhead = cost_buy*.45
cost_overhead
## [1] 20983.7
print("Total Revenue generated out of books sold in $")
## [1] "Total Revenue generated out of books sold in $"
revenue_total = number_11*31.95
revenue_total
## [1] 99322.83
print("Total Profit made by selecting this logistic model in $")
## [1] "Total Profit made by selecting this logistic model in $"
profit_total = revenue_total - cost_total - cost_buy - cost_overhead
profit_total
## [1] 28826.09
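
The arithmetic above repeats for each scenario, so it can be wrapped in a small helper; a sketch using the cost figures from this analysis ($0.65 per mailing, $15 book cost, 45% overhead on book cost, $31.95 selling price):

campaign_profit = function(n_mailed, n_buyers) {
  cost_mail = 0.65 * n_mailed    # mailing cost for everyone targeted
  cost_book = 15 * n_buyers      # book cost for those who buy
  overhead  = 0.45 * cost_book   # overhead at 45% of book cost
  revenue   = 31.95 * n_buyers
  revenue - cost_mail - cost_book - overhead
}
campaign_profit(number_send, number_11)  # should reproduce profit_total above

The same helper applies unchanged to the SVM and mail-everyone scenarios below.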

SVM Model Analysis on New Mail Campaign

The cell counts below come from the sigmoid SVM's confusion matrix, scaled the same way.

proportion_00_SVM = 1983/2300
proportion_01_SVM = 113/2300
proportion_10_SVM = 144/2300
proportion_11_SVM = 60/2300


number_00_SVM = proportion_00_SVM*new_customer
number_00_SVM
## [1] 43108.7
number_01_SVM = proportion_01_SVM*new_customer
number_01_SVM
## [1] 2456.522
number_10_SVM = proportion_10_SVM*new_customer
number_10_SVM
## [1] 3130.435
number_11_SVM = proportion_11_SVM*new_customer
number_11_SVM
## [1] 1304.348
print("Number of people to be sent mail")
## [1] "Number of people to be sent mail"
number_send_SVM = number_10_SVM + number_11_SVM
number_send_SVM
## [1] 4434.783
print("Number of people who will buy out of the number of people to be sent mail")
## [1] "Number of people who will buy out of the number of people to be sent mail"
number_11_SVM
## [1] 1304.348
print("proportion of people who will buy from the number of people sent mail")
## [1] "proportion of people who will buy from the number of people sent mail"
proportion_sent_SVM = number_11_SVM/number_send_SVM
proportion_sent_SVM
## [1] 0.2941176
print("cost of sending mail to all in $")
## [1] "cost of sending mail to all in $"
cost_total_SVM = number_send_SVM * 0.65
cost_total_SVM
## [1] 2882.609
print("cost for sending book to total number of people who buy in $")
## [1] "cost for sending book to total number of people who buy in $"
cost_buy_SVM = number_11_SVM*15
cost_buy_SVM
## [1] 19565.22
print("cost overhead in $")
## [1] "cost overhead in $"
cost_overhead_SVM = cost_buy_SVM*.45
cost_overhead_SVM
## [1] 8804.348
print("Total Revenue generated out of books sold in $")
## [1] "Total Revenue generated out of books sold in $"
revenue_total_SVM = number_11_SVM*31.95
revenue_total_SVM
## [1] 41673.91
print("Total Profit made by selecting this SVM model in $")
## [1] "Total Profit made by selecting this SVM model in $"
profit_total_SVM = revenue_total_SVM - cost_total_SVM - cost_buy_SVM - cost_overhead_SVM
profit_total_SVM
## [1] 10421.74

Model Analysis on New Mail Campaign if Mail Is Sent to All

summary(bbbc_test$Choice)
##    0    1 
## 2096  204
proportion_1 = 204/2300
proportion_0 = 2096/2300

proportion_1
## [1] 0.08869565
proportion_0
## [1] 0.9113043
total_1 = proportion_1*new_customer
total_0 = proportion_0*new_customer

print("Number of people to be sent mail")
## [1] "Number of people to be sent mail"
send_total = total_1 + total_0
send_total
## [1] 50000
print("Number of people who will buy out of the number of people to be sent mail")
## [1] "Number of people who will buy out of the number of people to be sent mail"
total_1
## [1] 4434.783
print("proportion of people who will buy from the number of people sent mail")
## [1] "proportion of people who will buy from the number of people sent mail"
proportion_sent_total = total_1/send_total
proportion_sent_total
## [1] 0.08869565
print("cost of sending mail to all in $")
## [1] "cost of sending mail to all in $"
cost_total_all = send_total*.65
cost_total_all
## [1] 32500
print("cost for sending book to total number of people who buy in $")
## [1] "cost for sending book to total number of people who buy in $"
cost_buy_all = total_1*15
cost_buy_all
## [1] 66521.74
print("cost overhead in $")
## [1] "cost overhead in $"
cost_overhead_all = cost_buy_all*.45
cost_overhead_all
## [1] 29934.78
print("Total Revenue generated out of books sold in $")
## [1] "Total Revenue generated out of books sold in $"
revenue_total_all = total_1*31.95
revenue_total_all
## [1] 141691.3
print("Total Profit made by sending campaign mail to all in $")
## [1] "Total Profit made by sending campaign mail to all in $"
profit_total_all = revenue_total_all - cost_total_all - cost_buy_all - cost_overhead_all
profit_total_all
## [1] 12734.78
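
Putting the three profit figures side by side makes the comparison explicit (a one-line sketch over the variables computed above); by this accounting the targeted logistic campaign, at roughly $28.8k, clearly beats both mailing everyone (about $12.7k) and the SVM-targeted campaign (about $10.4k):

c(logistic = profit_total, svm = profit_total_SVM, mail_all = profit_total_all)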