Case Study 2

Load Libraries

Read train and test dataset

bbbc_train = read_excel("BBBC-Train.xlsx")[2:12]
bbbc_test = read_excel("BBBC-Test.xlsx")[2:12]

Check structure of train dataset

str(bbbc_train)

## tibble [1,600 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Choice          : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender          : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
##  $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
##  $ Frequency       : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
##  $ Last_purchase   : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
##  $ First_purchase  : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
##  $ P_Child         : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
##  $ P_Youth         : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
##  $ P_Cook          : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
##  $ P_DIY           : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
##  $ P_Art           : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...

Check structure of test dataset

str(bbbc_test)

## tibble [2,300 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Choice          : num [1:2300] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender          : num [1:2300] 0 1 1 0 1 0 1 1 1 1 ...
##  $ Amount_purchased: num [1:2300] 287 215 261 24 120 66 42 233 66 199 ...
##  $ Frequency       : num [1:2300] 12 4 2 4 8 2 12 8 12 22 ...
##  $ Last_purchase   : num [1:2300] 4 1 1 1 1 4 1 2 1 1 ...
##  $ First_purchase  : num [1:2300] 24 4 2 4 8 16 12 12 12 22 ...
##  $ P_Child         : num [1:2300] 0 0 0 1 0 0 0 0 0 0 ...
##  $ P_Youth         : num [1:2300] 3 0 0 0 0 0 0 0 0 0 ...
##  $ P_Cook          : num [1:2300] 0 0 0 0 0 1 1 0 0 0 ...
##  $ P_DIY           : num [1:2300] 0 0 0 0 0 1 0 0 0 0 ...
##  $ P_Art           : num [1:2300] 1 1 1 0 1 1 0 0 0 1 ...

Change Gender variable to factor

bbbc_train$Gender = as.factor(bbbc_train$Gender)
bbbc_test$Gender = as.factor(bbbc_test$Gender)

Check/remove missing na values from dataset

bbbc_train = na.omit(bbbc_train)
bbbc_test = na.omit(bbbc_test)

Choice as factor variable

levels(as.factor(bbbc_train$Choice))

## [1] "0" "1"

Gender as factor variable

levels(as.factor(bbbc_train$Gender))

## [1] "0" "1"

Amount purchased as factor variable

levels(as.factor(bbbc_train$Amount_purchased))

##   [1] "15"  "16"  "17"  "18"  "21"  "22"  "23"  "24"  "25"  "26"  "27"  "28" 
##  [13] "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36"  "37"  "38"  "39"  "40" 
##  [25] "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48"  "49"  "51"  "52"  "53" 
##  [37] "54"  "55"  "56"  "57"  "59"  "60"  "61"  "62"  "63"  "64"  "65"  "66" 
##  [49] "67"  "68"  "69"  "70"  "71"  "72"  "73"  "74"  "75"  "76"  "77"  "78" 
##  [61] "79"  "80"  "81"  "82"  "83"  "84"  "85"  "86"  "87"  "88"  "89"  "90" 
##  [73] "91"  "92"  "93"  "94"  "95"  "96"  "97"  "98"  "99"  "100" "101" "102"
##  [85] "103" "104" "105" "106" "107" "108" "110" "111" "112" "113" "114" "115"
##  [97] "116" "117" "118" "119" "120" "121" "122" "123" "124" "125" "126" "127"
## [109] "128" "129" "130" "131" "132" "133" "134" "135" "136" "137" "138" "139"
## [121] "140" "141" "142" "143" "144" "145" "146" "147" "148" "149" "150" "151"
## [133] "152" "153" "154" "155" "156" "157" "158" "159" "160" "161" "162" "163"
## [145] "164" "165" "166" "167" "168" "169" "170" "171" "172" "173" "174" "175"
## [157] "176" "177" "178" "179" "180" "181" "182" "183" "184" "185" "186" "187"
## [169] "188" "189" "190" "191" "192" "193" "194" "195" "196" "197" "198" "199"
## [181] "200" "201" "202" "203" "204" "205" "206" "207" "208" "209" "210" "211"
## [193] "212" "213" "214" "215" "216" "217" "218" "219" "220" "221" "222" "223"
## [205] "224" "225" "226" "227" "228" "229" "230" "231" "232" "233" "234" "235"
## [217] "236" "237" "238" "239" "240" "241" "242" "243" "244" "245" "246" "247"
## [229] "248" "249" "250" "251" "252" "253" "254" "255" "256" "257" "258" "259"
## [241] "260" "261" "262" "263" "264" "265" "266" "267" "268" "269" "270" "271"
## [253] "272" "273" "274" "275" "276" "277" "278" "279" "280" "281" "282" "283"
## [265] "284" "285" "286" "287" "288" "289" "290" "291" "292" "293" "294" "295"
## [277] "296" "297" "298" "299" "300" "301" "302" "303" "304" "305" "306" "307"
## [289] "308" "309" "310" "311" "312" "313" "314" "315" "316" "317" "318" "319"
## [301] "320" "321" "322" "323" "324" "325" "326" "327" "328" "329" "330" "331"
## [313] "332" "333" "334" "335" "336" "338" "339" "340" "341" "342" "343" "344"
## [325] "345" "346" "347" "349" "350" "351" "352" "353" "354" "355" "356" "357"
## [337] "360" "361" "364" "366" "367" "369" "370" "371" "372" "373" "374" "375"
## [349] "376" "377" "382" "383" "384" "385" "386" "388" "389" "390" "391" "393"
## [361] "394" "396" "397" "399" "400" "401" "403" "404" "405" "406" "407" "408"
## [373] "410" "413" "414" "418" "425" "428" "432" "434" "437" "444" "447" "451"
## [385] "469" "473" "474"

levels(as.factor(bbbc_train$Frequency))

##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36"

levels(as.factor(bbbc_train$Last_purchase))

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"

levels(as.factor(bbbc_train$First_purchase))

##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36" "38" "40" "42" "44" "46" "48" "50" "52" "54" "56" "58" "60"
## [31] "62" "64" "66" "68" "70" "72" "74" "76" "78" "80" "82" "84" "86" "96"

levels(as.factor(bbbc_train$P_Child))

## [1] "0" "1" "2" "3" "4" "5" "6" "8"

levels(as.factor(bbbc_train$P_Youth))

## [1] "0" "1" "2" "3" "4"

levels(as.factor(bbbc_train$P_Cook))

## [1] "0" "1" "2" "3" "4" "5" "6"

levels(as.factor(bbbc_train$P_DIY))

## [1] "0" "1" "2" "3" "4"

levels(as.factor(bbbc_train$P_Art))

## [1] "0" "1" "2" "3" "4" "5"

levels(as.factor(bbbc_test$Choice))

## [1] "0" "1"

levels(as.factor(bbbc_test$Gender))

## [1] "0" "1"

levels(as.factor(bbbc_test$Amount_purchased))

##   [1] "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24"  "25"  "26" 
##  [13] "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36"  "37"  "38"  "39"  "40" 
##  [25] "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48"  "49"  "50"  "51"  "52" 
##  [37] "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60"  "61"  "62"  "63"  "64" 
##  [49] "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72"  "73"  "74"  "75"  "76" 
##  [61] "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84"  "85"  "86"  "87"  "88" 
##  [73] "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96"  "97"  "98"  "99"  "100"
##  [85] "101" "102" "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"
##  [97] "113" "114" "115" "116" "117" "118" "119" "120" "121" "122" "123" "124"
## [109] "125" "126" "127" "128" "129" "130" "131" "132" "133" "134" "135" "136"
## [121] "137" "138" "139" "140" "141" "142" "143" "144" "145" "146" "147" "148"
## [133] "149" "150" "151" "152" "153" "154" "155" "156" "157" "158" "159" "160"
## [145] "161" "162" "163" "164" "165" "166" "167" "168" "169" "170" "171" "172"
## [157] "173" "174" "175" "176" "177" "178" "179" "180" "181" "182" "183" "184"
## [169] "185" "186" "187" "188" "189" "190" "191" "192" "193" "194" "195" "196"
## [181] "197" "198" "199" "200" "201" "202" "203" "204" "205" "206" "207" "208"
## [193] "209" "210" "211" "212" "213" "214" "215" "216" "217" "218" "219" "220"
## [205] "221" "222" "223" "224" "225" "226" "227" "228" "229" "230" "231" "232"
## [217] "233" "234" "235" "236" "237" "238" "239" "240" "241" "242" "243" "244"
## [229] "245" "246" "247" "248" "249" "250" "251" "252" "253" "254" "255" "256"
## [241] "257" "258" "259" "260" "261" "262" "263" "264" "265" "266" "267" "268"
## [253] "269" "270" "271" "272" "273" "274" "275" "276" "277" "278" "279" "280"
## [265] "281" "282" "283" "284" "285" "286" "287" "288" "289" "290" "291" "292"
## [277] "293" "294" "295" "296" "297" "298" "299" "300" "301" "302" "303" "304"
## [289] "305" "306" "307" "308" "309" "310" "311" "312" "313" "314" "315" "316"
## [301] "317" "318" "319" "320" "321" "322" "323" "324" "325" "326" "327" "328"
## [313] "329" "330" "331" "332" "333" "334" "335" "336" "337" "338" "339" "342"
## [325] "343" "344" "345" "346" "347" "348" "350" "351" "352" "353" "355" "356"
## [337] "357" "359" "360" "363" "364" "365" "366" "368" "369" "370" "371" "372"
## [349] "374" "377" "379" "382" "383" "384" "385" "386" "387" "388" "391" "392"
## [361] "393" "396" "397" "398" "399" "400" "401" "402" "404" "405" "406" "407"
## [373] "409" "410" "411" "414" "419" "420" "421" "423" "427" "428" "429" "431"
## [385] "433" "434" "435" "439" "442" "443" "444" "447" "450" "458" "461"

levels(as.factor(bbbc_test$Frequency))

##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36"

levels(as.factor(bbbc_test$Last_purchase))

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"

levels(as.factor(bbbc_test$First_purchase))

##  [1] "2"  "4"  "6"  "8"  "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36" "38" "40" "42" "44" "46" "48" "50" "52" "54" "56" "58" "60"
## [31] "62" "64" "66" "68" "70" "72" "74" "76" "78" "80" "84" "88" "96"

levels(as.factor(bbbc_test$P_Child))

## [1] "0" "1" "2" "3" "4" "5" "6" "7"

levels(as.factor(bbbc_test$P_Youth))

## [1] "0" "1" "2" "3" "4" "5"

levels(as.factor(bbbc_test$P_Cook))

## [1] "0" "1" "2" "3" "4" "5" "6"

levels(as.factor(bbbc_test$P_DIY))

## [1] "0" "1" "2" "3" "4"

levels(as.factor(bbbc_test$P_Art))

## [1] "0" "1" "2" "3" "4"

Apply Linear model with all predictors

bbbc_lm = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train)
summary(bbbc_lm)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art, data = bbbc_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9603 -0.2462 -0.1161  0.1622  1.0588 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3642284  0.0307411  11.848  < 2e-16 ***
## Gender1          -0.1309205  0.0200303  -6.536 8.48e-11 ***
## Amount_purchased  0.0002736  0.0001110   2.464   0.0138 *  
## Frequency        -0.0090868  0.0021791  -4.170 3.21e-05 ***
## Last_purchase     0.0970286  0.0135589   7.156 1.26e-12 ***
## First_purchase   -0.0020024  0.0018160  -1.103   0.2704    
## P_Child          -0.1262584  0.0164011  -7.698 2.41e-14 ***
## P_Youth          -0.0963563  0.0201097  -4.792 1.81e-06 ***
## P_Cook           -0.1414907  0.0166064  -8.520  < 2e-16 ***
## P_DIY            -0.1352313  0.0197873  -6.834 1.17e-11 ***
## P_Art             0.1178494  0.0194427   6.061 1.68e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared:  0.2401, Adjusted R-squared:  0.2353 
## F-statistic:  50.2 on 10 and 1589 DF,  p-value: < 2.2e-16

Apply linear regressions with predictors interaction effect

bbbc_lm1 = lm(Choice ~ (.)^2, data = bbbc_train)
summary(bbbc_lm1)

## 
## Call:
## lm(formula = Choice ~ (.)^2, data = bbbc_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03612 -0.22165 -0.10283  0.06123  0.99352 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.765e-01  6.995e-02   5.382 8.51e-08 ***
## Gender1                         -1.777e-01  5.843e-02  -3.041  0.00240 ** 
## Amount_purchased                 1.989e-04  3.084e-04   0.645  0.51902    
## Frequency                       -7.691e-03  8.245e-03  -0.933  0.35102    
## Last_purchase                    1.533e-01  5.225e-02   2.933  0.00340 ** 
## First_purchase                  -1.035e-02  7.439e-03  -1.392  0.16420    
## P_Child                         -1.630e-01  5.581e-02  -2.920  0.00356 ** 
## P_Youth                         -8.511e-02  6.878e-02  -1.237  0.21610    
## P_Cook                          -1.305e-01  5.775e-02  -2.260  0.02398 *  
## P_DIY                           -1.416e-01  7.104e-02  -1.994  0.04635 *  
## P_Art                            1.835e-01  6.735e-02   2.725  0.00651 ** 
## Gender1:Amount_purchased         2.387e-04  2.396e-04   0.996  0.31922    
## Gender1:Frequency               -3.741e-03  4.702e-03  -0.796  0.42632    
## Gender1:Last_purchase           -6.512e-02  2.931e-02  -2.222  0.02645 *  
## Gender1:First_purchase           7.184e-03  3.981e-03   1.804  0.07135 .  
## Gender1:P_Child                  4.698e-02  3.502e-02   1.341  0.17996    
## Gender1:P_Youth                  4.745e-02  4.279e-02   1.109  0.26766    
## Gender1:P_Cook                   4.097e-02  3.516e-02   1.165  0.24408    
## Gender1:P_DIY                   -2.289e-02  4.233e-02  -0.541  0.58867    
## Gender1:P_Art                    3.922e-02  4.205e-02   0.933  0.35116    
## Amount_purchased:Frequency      -3.406e-06  2.713e-05  -0.126  0.90014    
## Amount_purchased:Last_purchase  -1.530e-05  1.555e-04  -0.098  0.92165    
## Amount_purchased:First_purchase -2.818e-06  2.277e-05  -0.124  0.90155    
## Amount_purchased:P_Child         1.670e-04  1.926e-04   0.867  0.38598    
## Amount_purchased:P_Youth        -4.600e-04  2.276e-04  -2.021  0.04343 *  
## Amount_purchased:P_Cook         -6.230e-05  1.958e-04  -0.318  0.75040    
## Amount_purchased:P_DIY           1.370e-04  2.336e-04   0.587  0.55747    
## Amount_purchased:P_Art           2.441e-04  2.276e-04   1.072  0.28379    
## Frequency:Last_purchase         -1.008e-03  1.802e-03  -0.559  0.57604    
## Frequency:First_purchase         2.641e-04  1.081e-04   2.444  0.01465 *  
## Frequency:P_Child               -2.299e-04  2.894e-03  -0.079  0.93668    
## Frequency:P_Youth               -1.632e-04  3.336e-03  -0.049  0.96100    
## Frequency:P_Cook                -1.263e-04  2.813e-03  -0.045  0.96420    
## Frequency:P_DIY                  2.298e-03  3.546e-03   0.648  0.51706    
## Frequency:P_Art                 -9.074e-03  3.219e-03  -2.819  0.00488 ** 
## Last_purchase:First_purchase     1.856e-04  9.478e-04   0.196  0.84480    
## Last_purchase:P_Child           -3.015e-03  7.146e-03  -0.422  0.67315    
## Last_purchase:P_Youth            1.825e-02  1.325e-02   1.377  0.16877    
## Last_purchase:P_Cook            -7.985e-04  9.259e-03  -0.086  0.93129    
## Last_purchase:P_DIY             -1.875e-02  1.355e-02  -1.383  0.16682    
## Last_purchase:P_Art             -4.667e-03  1.319e-02  -0.354  0.72351    
## First_purchase:P_Child          -1.048e-03  1.701e-03  -0.616  0.53779    
## First_purchase:P_Youth           9.484e-04  2.197e-03   0.432  0.66600    
## First_purchase:P_Cook           -2.934e-04  1.707e-03  -0.172  0.86355    
## First_purchase:P_DIY             1.131e-03  2.229e-03   0.507  0.61191    
## First_purchase:P_Art             9.424e-04  2.047e-03   0.460  0.64525    
## P_Child:P_Youth                 -1.070e-02  2.005e-02  -0.534  0.59364    
## P_Child:P_Cook                   2.717e-02  1.337e-02   2.032  0.04236 *  
## P_Child:P_DIY                    2.246e-02  2.116e-02   1.061  0.28877    
## P_Child:P_Art                   -1.623e-02  1.933e-02  -0.840  0.40114    
## P_Youth:P_Cook                  -3.018e-02  2.109e-02  -1.431  0.15259    
## P_Youth:P_DIY                   -2.040e-02  2.994e-02  -0.681  0.49567    
## P_Youth:P_Art                   -1.887e-02  2.628e-02  -0.718  0.47276    
## P_Cook:P_DIY                     3.197e-03  2.370e-02   0.135  0.89272    
## P_Cook:P_Art                    -2.763e-02  1.973e-02  -1.400  0.16165    
## P_DIY:P_Art                      2.514e-02  2.742e-02   0.917  0.35935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3759 on 1544 degrees of freedom
## Multiple R-squared:  0.2727, Adjusted R-squared:  0.2468 
## F-statistic: 10.53 on 55 and 1544 DF,  p-value: < 2.2e-16

Apply Linear regression with predictors interaction effect and backward selection

bbbc_lm2 = ols_step_backward_p(bbbc_lm1, prem = 0.05, details = F)
bbbc_lm2$model

## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
##              (Intercept)                   Gender1          Amount_purchased  
##                0.3652837                -0.1300602                 0.0004283  
##                Frequency             Last_purchase                   P_Child  
##               -0.0159536                 0.0757656                -0.1051257  
##                  P_Youth                    P_Cook                     P_DIY  
##               -0.0439214                -0.1468073                -0.1319547  
##                    P_Art  Amount_purchased:P_Youth  Frequency:First_purchase  
##                0.2569808                -0.0004654                 0.0002160  
##          Frequency:P_Art     Last_purchase:P_Youth    P_Child:First_purchase  
##               -0.0088380                 0.0095153                -0.0012821  
##           P_Child:P_Cook              P_Cook:P_Art  
##                0.0240791                -0.0294565

Linear regression with selected variables from previous backward selection step

bbbc_lm2 = lm(data = bbbc_train, Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + 
##     Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + 
##     P_Cook:P_Art, data = bbbc_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97590 -0.22903 -0.10373  0.07595  1.00459 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.653e-01  4.103e-02   8.903  < 2e-16 ***
## Gender1                  -1.301e-01  1.983e-02  -6.558 7.35e-11 ***
## Amount_purchased          4.283e-04  1.259e-04   3.402 0.000687 ***
## Frequency                -1.595e-02  3.045e-03  -5.240 1.82e-07 ***
## Last_purchase             7.577e-02  1.218e-02   6.219 6.37e-10 ***
## P_Child                  -1.051e-01  2.364e-02  -4.447 9.31e-06 ***
## P_Youth                  -4.392e-02  4.269e-02  -1.029 0.303753    
## P_Cook                   -1.468e-01  2.011e-02  -7.302 4.47e-13 ***
## P_DIY                    -1.320e-01  1.973e-02  -6.689 3.11e-11 ***
## P_Art                     2.570e-01  3.250e-02   7.906 4.93e-15 ***
## Amount_purchased:P_Youth -4.654e-04  1.697e-04  -2.743 0.006165 ** 
## Frequency:First_purchase  2.160e-04  7.231e-05   2.987 0.002857 ** 
## Frequency:P_Art          -8.838e-03  1.936e-03  -4.564 5.40e-06 ***
## Last_purchase:P_Youth     9.515e-03  4.796e-03   1.984 0.047437 *  
## P_Child:First_purchase   -1.282e-03  5.160e-04  -2.485 0.013060 *  
## P_Child:P_Cook            2.408e-02  8.469e-03   2.843 0.004524 ** 
## P_Cook:P_Art             -2.946e-02  1.123e-02  -2.622 0.008821 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3746 on 1583 degrees of freedom
## Multiple R-squared:  0.2597, Adjusted R-squared:  0.2523 
## F-statistic: 34.71 on 16 and 1583 DF,  p-value: < 2.2e-16

Cooks distance diagnostic plot

cook_bbbc_lm2 = cooks.distance(bbbc_lm2)
plot(cook_bbbc_lm2, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")

large_cook_distance = which(cooks.distance(bbbc_lm2)>4/1600)

bbbc_lm2_trained = bbbc_train[-large_cook_distance,]

bbbc_lm2_1 = lm(data = bbbc_lm2_trained, Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2_1)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + 
##     Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + 
##     P_Cook:P_Art, data = bbbc_lm2_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75671 -0.21055 -0.08308  0.07178  0.99302 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.937e-01  3.935e-02  10.005  < 2e-16 ***
## Gender1                  -1.179e-01  1.906e-02  -6.182 8.12e-10 ***
## Amount_purchased          3.403e-04  1.205e-04   2.823 0.004816 ** 
## Frequency                -1.886e-02  2.951e-03  -6.390 2.20e-10 ***
## Last_purchase             7.137e-02  1.295e-02   5.512 4.18e-08 ***
## P_Child                  -1.026e-01  2.370e-02  -4.328 1.60e-05 ***
## P_Youth                  -1.044e-01  4.230e-02  -2.469 0.013654 *  
## P_Cook                   -1.467e-01  1.988e-02  -7.381 2.59e-13 ***
## P_DIY                    -1.292e-01  2.017e-02  -6.407 1.98e-10 ***
## P_Art                     2.995e-01  3.202e-02   9.352  < 2e-16 ***
## Amount_purchased:P_Youth -4.138e-04  1.758e-04  -2.354 0.018690 *  
## Frequency:First_purchase  2.759e-04  7.112e-05   3.880 0.000109 ***
## Frequency:P_Art          -1.081e-02  1.920e-03  -5.631 2.14e-08 ***
## Last_purchase:P_Youth     1.227e-02  4.977e-03   2.465 0.013798 *  
## P_Child:First_purchase   -1.318e-03  5.734e-04  -2.299 0.021629 *  
## P_Child:P_Cook            1.889e-02  9.627e-03   1.962 0.049906 *  
## P_Cook:P_Art             -3.943e-02  1.323e-02  -2.979 0.002934 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3506 on 1508 degrees of freedom
## Multiple R-squared:  0.3055, Adjusted R-squared:  0.2981 
## F-statistic: 41.46 on 16 and 1508 DF,  p-value: < 2.2e-16

Multicollinearity check VIF values

vif(bbbc_lm2_1)

##                   Gender         Amount_purchased                Frequency 
##                 1.007185                 1.571742                 6.550741 
##            Last_purchase                  P_Child                  P_Youth 
##                16.041588                 6.999756                 7.730242 
##                   P_Cook                    P_DIY                    P_Art 
##                 4.675060                 2.113232                 6.263660 
## Amount_purchased:P_Youth Frequency:First_purchase          Frequency:P_Art 
##                10.366335                 9.185984                 4.366306 
##    Last_purchase:P_Youth   P_Child:First_purchase           P_Child:P_Cook 
##                 6.608138                 9.556593                 4.866074 
##             P_Cook:P_Art 
##                 2.813394

Removing variable Last_purchase, Amount_purchased:P_Youth, Frequency:First_purchase, P_Child:First_purchase, with high VIF value correlation

bbbc_lm2_2 = lm(data = bbbc_lm2_trained, Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + Last_purchase:P_Youth + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2_2)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Child:P_Cook + P_Cook:P_Art, data = bbbc_lm2_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82113 -0.20684 -0.10215  0.07888  0.94534 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.3573304  0.0320051  11.165  < 2e-16 ***
## Gender1               -0.1176548  0.0194322  -6.055 1.77e-09 ***
## Amount_purchased       0.0003135  0.0001071   2.928 0.003466 ** 
## Frequency             -0.0090326  0.0013688  -6.599 5.71e-11 ***
## P_Child               -0.0507128  0.0138707  -3.656 0.000265 ***
## P_Youth               -0.0977169  0.0311716  -3.135 0.001753 ** 
## P_Cook                -0.0554737  0.0153856  -3.606 0.000322 ***
## P_DIY                 -0.0422865  0.0156734  -2.698 0.007054 ** 
## P_Art                  0.3691289  0.0276425  13.354  < 2e-16 ***
## Frequency:P_Art       -0.0089962  0.0018532  -4.854 1.33e-06 ***
## P_Youth:Last_purchase  0.0089195  0.0043355   2.057 0.039829 *  
## P_Child:P_Cook         0.0130521  0.0083536   1.562 0.118391    
## P_Cook:P_Art          -0.0362256  0.0134283  -2.698 0.007059 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3576 on 1512 degrees of freedom
## Multiple R-squared:  0.2758, Adjusted R-squared:   0.27 
## F-statistic: 47.98 on 12 and 1512 DF,  p-value: < 2.2e-16

Multicollinearity check VIF values

vif(bbbc_lm2_2)

##                Gender      Amount_purchased             Frequency 
##              1.006296              1.192679              1.354652 
##               P_Child               P_Youth                P_Cook 
##              2.304990              4.036195              2.693090 
##                 P_DIY                 P_Art       Frequency:P_Art 
##              1.227421              4.487716              3.912225 
## P_Youth:Last_purchase        P_Child:P_Cook          P_Cook:P_Art 
##              4.822456              3.522952              2.785372

bbbc_lm2_3 = ols_step_backward_p(bbbc_lm2_2, prem = 0.05, details = F)
bbbc_lm2_3$model

## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
##           (Intercept)                Gender1       Amount_purchased  
##             0.3451476             -0.1159939              0.0003101  
##             Frequency                P_Child                P_Youth  
##            -0.0089816             -0.0359693             -0.0962058  
##                P_Cook                  P_DIY                  P_Art  
##            -0.0418821             -0.0401135              0.3697613  
##       Frequency:P_Art  P_Youth:Last_purchase           P_Cook:P_Art  
##            -0.0090406              0.0087674             -0.0361205

bbbc_lm2_3 = lm(data = bbbc_lm2_trained,Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + Last_purchase:P_Youth + P_Cook:P_Art)
summary(bbbc_lm2_3)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + 
##     Last_purchase:P_Youth + P_Cook:P_Art, data = bbbc_lm2_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80923 -0.20561 -0.10483  0.08567  0.93807 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.3451476  0.0310555  11.114  < 2e-16 ***
## Gender1               -0.1159939  0.0194123  -5.975 2.86e-09 ***
## Amount_purchased       0.0003101  0.0001071   2.896 0.003838 ** 
## Frequency             -0.0089816  0.0013690  -6.561 7.34e-11 ***
## P_Child               -0.0359693  0.0101712  -3.536 0.000418 ***
## P_Youth               -0.0962058  0.0311714  -3.086 0.002063 ** 
## P_Cook                -0.0418821  0.0126964  -3.299 0.000994 ***
## P_DIY                 -0.0401135  0.0156191  -2.568 0.010317 *  
## P_Art                  0.3697613  0.0276527  13.372  < 2e-16 ***
## Frequency:P_Art       -0.0090406  0.0018538  -4.877 1.19e-06 ***
## P_Youth:Last_purchase  0.0087674  0.0043365   2.022 0.043375 *  
## P_Cook:P_Art          -0.0361205  0.0134345  -2.689 0.007253 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3578 on 1513 degrees of freedom
## Multiple R-squared:  0.2746, Adjusted R-squared:  0.2693 
## F-statistic: 52.07 on 11 and 1513 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(bbbc_lm2_3, which=c(1:4))

## Apply Linear regression with all predictors and backward selection

bbbc_lm3 = ols_step_backward_p(bbbc_lm, prem = 0.05, details = F)
bbbc_lm3$model

## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
##      (Intercept)           Gender1  Amount_purchased         Frequency  
##        0.3727367        -0.1316464         0.0002742        -0.0110830  
##    Last_purchase           P_Child           P_Youth            P_Cook  
##        0.0894288        -0.1275991        -0.0973642        -0.1433497  
##            P_DIY             P_Art  
##       -0.1365578         0.1150034

Linear regression with selected variables from previous backward selection step

bbbc_lm3 = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train)
summary(bbbc_lm3)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     data = bbbc_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9802 -0.2452 -0.1157  0.1655  1.0595 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3727367  0.0297590  12.525  < 2e-16 ***
## Gender1          -0.1316464  0.0200208  -6.575 6.56e-11 ***
## Amount_purchased  0.0002742  0.0001110   2.470   0.0136 *  
## Frequency        -0.0110830  0.0012128  -9.138  < 2e-16 ***
## Last_purchase     0.0894288  0.0116772   7.658 3.25e-14 ***
## P_Child          -0.1275991  0.0163571  -7.801 1.11e-14 ***
## P_Youth          -0.0973642  0.0200903  -4.846 1.38e-06 ***
## P_Cook           -0.1433497  0.0165218  -8.676  < 2e-16 ***
## P_DIY            -0.1365578  0.0197520  -6.914 6.82e-12 ***
## P_Art             0.1150034  0.0192719   5.967 2.97e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3788 on 1590 degrees of freedom
## Multiple R-squared:  0.2395, Adjusted R-squared:  0.2352 
## F-statistic: 55.63 on 9 and 1590 DF,  p-value: < 2.2e-16

Cooks distance diagnostic plot

cook_bbbc_lm3 = cooks.distance(bbbc_lm3)
plot(cook_bbbc_lm3, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")

large_cook_distance = which(cooks.distance(bbbc_lm3)>4/1600)

bbbc_lm3_trained = bbbc_train[-large_cook_distance,]

bbbc_lm3_1 = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
summary(bbbc_lm3_1)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     data = bbbc_lm3_trained)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72682 -0.22418 -0.09923  0.12558  0.98094 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3589669  0.0283022  12.683  < 2e-16 ***
## Gender1          -0.1133411  0.0191199  -5.928 3.79e-09 ***
## Amount_purchased  0.0002985  0.0001063   2.808  0.00505 ** 
## Frequency        -0.0120967  0.0011619 -10.411  < 2e-16 ***
## Last_purchase     0.0837024  0.0122794   6.816 1.34e-11 ***
## P_Child          -0.1273699  0.0168130  -7.576 6.18e-14 ***
## P_Youth          -0.1241418  0.0204237  -6.078 1.53e-09 ***
## P_Cook           -0.1455776  0.0167062  -8.714  < 2e-16 ***
## P_DIY            -0.1276728  0.0198407  -6.435 1.65e-10 ***
## P_Art             0.1490765  0.0196550   7.585 5.78e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3523 on 1517 degrees of freedom
## Multiple R-squared:  0.2935, Adjusted R-squared:  0.2893 
## F-statistic: 70.03 on 9 and 1517 DF,  p-value: < 2.2e-16

Multicollinearity check VIF values

vif(bbbc_lm3_1)

##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.003393         1.226169         1.009234        15.034206 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         3.617257         1.820319         3.384715         2.089291 
##            P_Art 
##         2.401423

Removing variable Last_purchase with high VIF value correlation

bbbc_lm3_2 = lm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
summary(bbbc_lm3_2)

## 
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7698 -0.2261 -0.1036  0.1216  0.9464 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3584078  0.0287228  12.478  < 2e-16 ***
## Gender1          -0.1108832  0.0194007  -5.715 1.31e-08 ***
## Amount_purchased  0.0003941  0.0001069   3.685 0.000236 ***
## Frequency        -0.0121444  0.0011792 -10.299  < 2e-16 ***
## P_Child          -0.0336805  0.0098271  -3.427 0.000626 ***
## P_Youth          -0.0360643  0.0160518  -2.247 0.024800 *  
## P_Cook           -0.0542681  0.0101315  -5.356 9.80e-08 ***
## P_DIY            -0.0379397  0.0150652  -2.518 0.011892 *  
## P_Art             0.2472133  0.0135798  18.205  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3575 on 1518 degrees of freedom
## Multiple R-squared:  0.2719, Adjusted R-squared:  0.268 
## F-statistic: 70.85 on 8 and 1518 DF,  p-value: < 2.2e-16

vif(bbbc_lm3_2)

##           Gender Amount_purchased        Frequency          P_Child 
##         1.003036         1.204798         1.009197         1.199848 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.091722         1.208629         1.169544         1.112987

par(mfrow=c(2,2))
plot(bbbc_lm3_2, which=c(1:4))

#### Choice as factor variable

bbbc_train$Choice = as.factor(bbbc_train$Choice)
bbbc_test$Choice = as.factor(bbbc_test$Choice)

Full Logistic Regression Model

bbbc_log = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train, family = binomial)
summary(bbbc_log)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art, family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.38586  -0.66728  -0.43696  -0.02242   2.72238  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.3515281  0.2143839  -1.640   0.1011    
## Gender1          -0.8632319  0.1374499  -6.280 3.38e-10 ***
## Amount_purchased  0.0018641  0.0007918   2.354   0.0186 *  
## Frequency        -0.0755142  0.0165937  -4.551 5.35e-06 ***
## Last_purchase     0.6117713  0.0938127   6.521 6.97e-11 ***
## First_purchase   -0.0147792  0.0128027  -1.154   0.2483    
## P_Child          -0.8112489  0.1167067  -6.951 3.62e-12 ***
## P_Youth          -0.6370422  0.1433778  -4.443 8.87e-06 ***
## P_Cook           -0.9230066  0.1194814  -7.725 1.12e-14 ***
## P_DIY            -0.9058697  0.1437025  -6.304 2.90e-10 ***
## P_Art             0.6861124  0.1270176   5.402 6.60e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1392.2  on 1589  degrees of freedom
## AIC: 1414.2
## 
## Number of Fisher Scoring iterations: 5

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1968  128
##          1  125   79
##                                           
##                Accuracy : 0.89            
##                  95% CI : (0.8765, 0.9025)
##     No Information Rate : 0.91            
##     P-Value [Acc > NIR] : 0.9995          
##                                           
##                   Kappa : 0.324           
##                                           
##  Mcnemar's Test P-Value : 0.8999          
##                                           
##             Sensitivity : 0.38164         
##             Specificity : 0.94028         
##          Pos Pred Value : 0.38725         
##          Neg Pred Value : 0.93893         
##              Prevalence : 0.09000         
##          Detection Rate : 0.03435         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66096         
##                                           
##        'Positive' Class : 1               
##

Full Logistic Regression with interaction terms

bbbc_log2 = glm(Choice ~ (.)^2, data = bbbc_train, family = binomial)
summary(bbbc_log2)

## 
## Call:
## glm(formula = Choice ~ (.)^2, family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.66574  -0.65647  -0.42036  -0.00011   2.62619  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                     -4.924e-01  4.846e-01  -1.016   0.3096  
## Gender1                         -9.323e-01  4.281e-01  -2.178   0.0294 *
## Amount_purchased                 2.517e-04  2.169e-03   0.116   0.9076  
## Frequency                       -3.397e-02  6.206e-02  -0.547   0.5841  
## Last_purchase                    6.055e-01  3.723e-01   1.627   0.1038  
## First_purchase                  -5.060e-02  5.556e-02  -0.911   0.3625  
## P_Child                         -7.134e-01  4.117e-01  -1.733   0.0832 .
## P_Youth                         -1.269e-02  5.098e-01  -0.025   0.9801  
## P_Cook                          -2.425e-01  4.232e-01  -0.573   0.5665  
## P_DIY                           -5.820e-01  5.164e-01  -1.127   0.2597  
## P_Art                            8.205e-01  4.654e-01   1.763   0.0779 .
## Gender1:Amount_purchased         2.726e-03  1.709e-03   1.596   0.1106  
## Gender1:Frequency               -5.859e-02  3.833e-02  -1.528   0.1264  
## Gender1:Last_purchase           -2.992e-01  2.165e-01  -1.382   0.1669  
## Gender1:First_purchase           4.681e-02  3.148e-02   1.487   0.1370  
## Gender1:P_Child                  2.939e-02  2.576e-01   0.114   0.9092  
## Gender1:P_Youth                  6.852e-02  3.332e-01   0.206   0.8371  
## Gender1:P_Cook                   2.024e-02  2.654e-01   0.076   0.9392  
## Gender1:P_DIY                   -5.411e-01  3.241e-01  -1.670   0.0950 .
## Gender1:P_Art                    3.834e-01  2.830e-01   1.355   0.1755  
## Amount_purchased:Frequency       3.692e-05  2.183e-04   0.169   0.8657  
## Amount_purchased:Last_purchase   1.662e-04  1.162e-03   0.143   0.8862  
## Amount_purchased:First_purchase -3.205e-05  1.743e-04  -0.184   0.8541  
## Amount_purchased:P_Child         1.779e-03  1.448e-03   1.228   0.2193  
## Amount_purchased:P_Youth        -4.112e-03  1.791e-03  -2.296   0.0217 *
## Amount_purchased:P_Cook         -1.025e-03  1.520e-03  -0.674   0.5001  
## Amount_purchased:P_DIY           1.438e-03  1.843e-03   0.780   0.4353  
## Amount_purchased:P_Art           4.243e-04  1.593e-03   0.266   0.7899  
## Frequency:Last_purchase          1.202e-02  1.514e-02   0.794   0.4272  
## Frequency:First_purchase         7.832e-04  9.614e-04   0.815   0.4153  
## Frequency:P_Child               -2.940e-02  2.484e-02  -1.183   0.2366  
## Frequency:P_Youth               -2.877e-02  3.080e-02  -0.934   0.3502  
## Frequency:P_Cook                -3.762e-02  2.471e-02  -1.523   0.1279  
## Frequency:P_DIY                 -6.784e-03  3.043e-02  -0.223   0.8236  
## Frequency:P_Art                 -3.816e-02  2.808e-02  -1.359   0.1742  
## Last_purchase:First_purchase     1.447e-03  7.004e-03   0.207   0.8363  
## Last_purchase:P_Child           -2.727e-02  6.489e-02  -0.420   0.6742  
## Last_purchase:P_Youth            1.975e-01  1.167e-01   1.693   0.0905 .
## Last_purchase:P_Cook            -1.965e-02  7.094e-02  -0.277   0.7817  
## Last_purchase:P_DIY             -1.001e-01  1.043e-01  -0.959   0.3374  
## Last_purchase:P_Art              3.443e-02  1.365e-01   0.252   0.8009  
## First_purchase:P_Child          -8.368e-03  1.401e-02  -0.597   0.5503  
## First_purchase:P_Youth           3.070e-03  1.909e-02   0.161   0.8722  
## First_purchase:P_Cook            3.805e-03  1.363e-02   0.279   0.7801  
## First_purchase:P_DIY             4.674e-03  1.819e-02   0.257   0.7973  
## First_purchase:P_Art            -2.875e-03  1.800e-02  -0.160   0.8731  
## P_Child:P_Youth                 -1.283e-01  1.824e-01  -0.703   0.4820  
## P_Child:P_Cook                   2.110e-01  1.200e-01   1.757   0.0788 .
## P_Child:P_DIY                    3.803e-02  1.746e-01   0.218   0.8276  
## P_Child:P_Art                   -7.824e-02  1.874e-01  -0.418   0.6762  
## P_Youth:P_Cook                  -3.904e-01  1.808e-01  -2.159   0.0309 *
## P_Youth:P_DIY                   -2.996e-01  2.589e-01  -1.157   0.2472  
## P_Youth:P_Art                    3.997e-02  2.307e-01   0.173   0.8625  
## P_Cook:P_DIY                    -8.013e-02  1.985e-01  -0.404   0.6864  
## P_Cook:P_Art                    -1.709e-01  1.856e-01  -0.921   0.3571  
## P_DIY:P_Art                      2.937e-01  2.351e-01   1.250   0.2114  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1340.0  on 1544  degrees of freedom
## AIC: 1452
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1975  121
##          1  124   80
##                                           
##                Accuracy : 0.8935          
##                  95% CI : (0.8801, 0.9058)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.9993          
##                                           
##                   Kappa : 0.3367          
##                                           
##  Mcnemar's Test P-Value : 0.8983          
##                                           
##             Sensitivity : 0.39801         
##             Specificity : 0.94092         
##          Pos Pred Value : 0.39216         
##          Neg Pred Value : 0.94227         
##              Prevalence : 0.08739         
##          Detection Rate : 0.03478         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66947         
##                                           
##        'Positive' Class : 1               
##

Full Logistic regression with interaction terms backward selection

bbbc_log3 = step(bbbc_log2, direction = "backward", trace = FALSE)
summary(bbbc_log3)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY + 
##     Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth + 
##     Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook + 
##     P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, family = binomial, 
##     data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.08079  -0.65926  -0.42548   0.01452   2.70453  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.1777724  0.3168490  -0.561 0.574754    
## Gender1                  -1.2238857  0.3354875  -3.648 0.000264 ***
## Amount_purchased          0.0005102  0.0013783   0.370 0.711236    
## Frequency                -0.0955830  0.0186950  -5.113 3.17e-07 ***
## Last_purchase             0.5393426  0.1044378   5.164 2.41e-07 ***
## First_purchase            0.0112353  0.0164963   0.681 0.495821    
## P_Child                  -1.0831641  0.2356436  -4.597 4.29e-06 ***
## P_Youth                  -0.3039130  0.3238519  -0.938 0.348022    
## P_Cook                   -0.8759147  0.1640133  -5.341 9.27e-08 ***
## P_DIY                    -0.4908395  0.2726998  -1.800 0.071872 .  
## P_Art                     0.7101141  0.1653700   4.294 1.75e-05 ***
## Gender1:Amount_purchased  0.0025071  0.0015265   1.642 0.100504    
## Gender1:P_DIY            -0.5392425  0.2355548  -2.289 0.022065 *  
## Amount_purchased:P_Child  0.0019907  0.0008728   2.281 0.022559 *  
## Amount_purchased:P_Youth -0.0037796  0.0013114  -2.882 0.003951 ** 
## Last_purchase:P_Youth     0.1459361  0.0490033   2.978 0.002901 ** 
## Last_purchase:P_DIY      -0.0772748  0.0393397  -1.964 0.049496 *  
## First_purchase:P_Child   -0.0134187  0.0048060  -2.792 0.005238 ** 
## P_Child:P_Cook            0.1745490  0.0689410   2.532 0.011346 *  
## P_Youth:P_Cook           -0.2992836  0.1421260  -2.106 0.035225 *  
## P_Cook:P_Art             -0.1614371  0.0903445  -1.787 0.073953 .  
## P_DIY:P_Art               0.3362011  0.1614869   2.082 0.037351 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1358.2  on 1578  degrees of freedom
## AIC: 1402.2
## 
## Number of Fisher Scoring iterations: 5

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1973  123
##          1  120   84
##                                           
##                Accuracy : 0.8943          
##                  95% CI : (0.8811, 0.9066)
##     No Information Rate : 0.91            
##     P-Value [Acc > NIR] : 0.9954          
##                                           
##                   Kappa : 0.3508          
##                                           
##  Mcnemar's Test P-Value : 0.8979          
##                                           
##             Sensitivity : 0.40580         
##             Specificity : 0.94267         
##          Pos Pred Value : 0.41176         
##          Neg Pred Value : 0.94132         
##              Prevalence : 0.09000         
##          Detection Rate : 0.03652         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.67423         
##                                           
##        'Positive' Class : 1               
##

Cooks distance diagnostic plot

cook_distance_log3 = cooks.distance(bbbc_log3)
plot(cook_distance_log3, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")

#### Remove outliers

large_cook_distance = which(cooks.distance(bbbc_log3)>(4/1600))

bbbc_log3_trained = bbbc_train[-large_cook_distance,]

bbbc_log3_2 = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY + Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth + Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, data = bbbc_log3_trained, family = binomial)
summary(bbbc_log3_2)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + 
##     P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY + 
##     Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth + 
##     Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook + 
##     P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, family = binomial, 
##     data = bbbc_log3_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.08528  -0.55742  -0.28775  -0.02826   2.74734  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               0.0045600  0.3819120   0.012 0.990474    
## Gender1                  -1.2870270  0.4036421  -3.189 0.001430 ** 
## Amount_purchased          0.0009134  0.0016268   0.561 0.574471    
## Frequency                -0.1372397  0.0272378  -5.039 4.69e-07 ***
## Last_purchase             0.7056159  0.1516651   4.652 3.28e-06 ***
## First_purchase            0.0090064  0.0241811   0.372 0.709553    
## P_Child                  -1.3932946  0.3178214  -4.384 1.17e-05 ***
## P_Youth                  -0.8798479  0.4633697  -1.899 0.057590 .  
## P_Cook                   -1.1053726  0.1975767  -5.595 2.21e-08 ***
## P_DIY                    -0.6485602  0.3615495  -1.794 0.072839 .  
## P_Art                     0.7290163  0.1954992   3.729 0.000192 ***
## Gender1:Amount_purchased  0.0030971  0.0018839   1.644 0.100182    
## Gender1:P_DIY            -1.2880630  0.3720225  -3.462 0.000536 ***
## Amount_purchased:P_Child  0.0031359  0.0013007   2.411 0.015913 *  
## Amount_purchased:P_Youth -0.0063309  0.0020373  -3.107 0.001887 ** 
## Last_purchase:P_Youth     0.4049317  0.0967919   4.184 2.87e-05 ***
## Last_purchase:P_DIY      -0.1518262  0.0756022  -2.008 0.044619 *  
## First_purchase:P_Child   -0.0260979  0.0077131  -3.384 0.000715 ***
## P_Child:P_Cook            0.2090575  0.1327920   1.574 0.115413    
## P_Youth:P_Cook           -1.1989748  0.3181952  -3.768 0.000165 ***
## P_Cook:P_Art              0.0207649  0.1465806   0.142 0.887347    
## P_DIY:P_Art               0.8832159  0.3110327   2.840 0.004517 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1584.7  on 1502  degrees of freedom
## Residual deviance: 1014.2  on 1481  degrees of freedom
## AIC: 1058.2
## 
## Number of Fisher Scoring iterations: 7

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3_2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1946  150
##          1  120   84
##                                           
##                Accuracy : 0.8826          
##                  95% CI : (0.8687, 0.8955)
##     No Information Rate : 0.8983          
##     P-Value [Acc > NIR] : 0.99328         
##                                           
##                   Kappa : 0.319           
##                                           
##  Mcnemar's Test P-Value : 0.07758         
##                                           
##             Sensitivity : 0.35897         
##             Specificity : 0.94192         
##          Pos Pred Value : 0.41176         
##          Neg Pred Value : 0.92844         
##              Prevalence : 0.10174         
##          Detection Rate : 0.03652         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65045         
##                                           
##        'Positive' Class : 1               
##

Multicollinearity check VIF

vif(bbbc_log3_2)

##                   Gender         Amount_purchased                Frequency 
##                 6.327947                 3.231413                 3.959253 
##            Last_purchase           First_purchase                  P_Child 
##                18.097883                14.781570                 9.903587 
##                  P_Youth                   P_Cook                    P_DIY 
##                 7.031414                 3.818008                 4.813335 
##                    P_Art  Gender:Amount_purchased             Gender:P_DIY 
##                 2.749984                 8.253189                 2.942640 
## Amount_purchased:P_Child Amount_purchased:P_Youth    Last_purchase:P_Youth 
##                13.202736                 8.600721                12.389289 
##      Last_purchase:P_DIY   First_purchase:P_Child           P_Child:P_Cook 
##                10.272898                 9.451090                 4.470351 
##           P_Youth:P_Cook             P_Cook:P_Art              P_DIY:P_Art 
##                 5.266006                 4.369223                 6.828573

Removing high collerated VIF value variables and interaction terms

bbbc_log3_3 = glm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY + P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, data = bbbc_log3_trained, family = binomial)
summary(bbbc_log3_3)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY + 
##     P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, 
##     family = binomial, data = bbbc_log3_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.12475  -0.59325  -0.35155  -0.08197   2.61190  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.1466637  0.2597245  -0.565 0.572285    
## Gender1          -0.7279068  0.1719913  -4.232 2.31e-05 ***
## Amount_purchased  0.0033546  0.0009092   3.690 0.000225 ***
## Frequency        -0.1335029  0.0136510  -9.780  < 2e-16 ***
## P_Child          -0.3777533  0.1237906  -3.052 0.002277 ** 
## P_Youth          -0.0992598  0.1901371  -0.522 0.601640    
## P_Cook           -0.4114009  0.1542256  -2.668 0.007641 ** 
## P_DIY            -0.5006975  0.2532521  -1.977 0.048034 *  
## P_Art             1.5406510  0.1541179   9.997  < 2e-16 ***
## Gender1:P_DIY    -0.7268812  0.3192180  -2.277 0.022782 *  
## P_Child:P_Cook    0.0939201  0.0924061   1.016 0.309447    
## P_Youth:P_Cook   -0.2982718  0.1659704  -1.797 0.072314 .  
## P_Cook:P_Art     -0.0919085  0.1247962  -0.736 0.461445    
## P_DIY:P_Art       0.5070186  0.2025299   2.503 0.012300 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1584.7  on 1502  degrees of freedom
## Residual deviance: 1106.2  on 1489  degrees of freedom
## AIC: 1134.2
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3_3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1976  120
##          1  130   74
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8779, 0.9037)
##     No Information Rate : 0.9157          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.3124          
##                                           
##  Mcnemar's Test P-Value : 0.5692          
##                                           
##             Sensitivity : 0.38144         
##             Specificity : 0.93827         
##          Pos Pred Value : 0.36275         
##          Neg Pred Value : 0.94275         
##              Prevalence : 0.08435         
##          Detection Rate : 0.03217         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65986         
##                                           
##        'Positive' Class : 1               
##

Check VIF multicollinearity

vif(bbbc_log3_3)

##           Gender Amount_purchased        Frequency          P_Child 
##         1.244340         1.134827         1.052314         1.982088 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.721617         2.835163         3.023992         1.965903 
##     Gender:P_DIY   P_Child:P_Cook   P_Youth:P_Cook     P_Cook:P_Art 
##         3.002732         3.658630         2.453211         4.223305 
##      P_DIY:P_Art 
##         3.828999

bbbc_log3_4 = step(bbbc_log3_3, direction = "backward", trace = FALSE)
summary(bbbc_log3_4)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY + 
##     P_Youth:P_Cook + P_DIY:P_Art, family = binomial, data = bbbc_log3_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.09834  -0.59177  -0.35310  -0.07416   2.60709  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.1748168  0.2481528  -0.704 0.481139    
## Gender1          -0.7243421  0.1717539  -4.217 2.47e-05 ***
## Amount_purchased  0.0033395  0.0009082   3.677 0.000236 ***
## Frequency        -0.1328249  0.0135764  -9.784  < 2e-16 ***
## P_Child          -0.3105357  0.0966700  -3.212 0.001317 ** 
## P_Youth          -0.0940805  0.1890695  -0.498 0.618768    
## P_Cook           -0.3929480  0.1164072  -3.376 0.000736 ***
## P_DIY            -0.4764514  0.2524590  -1.887 0.059128 .  
## P_Art             1.5095302  0.1366903  11.043  < 2e-16 ***
## Gender1:P_DIY    -0.7203885  0.3188350  -2.259 0.023856 *  
## P_Youth:P_Cook   -0.3141036  0.1653946  -1.899 0.057549 .  
## P_DIY:P_Art       0.4675144  0.1975315   2.367 0.017943 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1584.7  on 1502  degrees of freedom
## Residual deviance: 1107.4  on 1491  degrees of freedom
## AIC: 1131.4
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log3_4, newdata = bbbc_test, type = "response")
book_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .18178,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1976  120
##          1  130   74
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8779, 0.9037)
##     No Information Rate : 0.9157          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.3124          
##                                           
##  Mcnemar's Test P-Value : 0.5692          
##                                           
##             Sensitivity : 0.38144         
##             Specificity : 0.93827         
##          Pos Pred Value : 0.36275         
##          Neg Pred Value : 0.94275         
##              Prevalence : 0.08435         
##          Detection Rate : 0.03217         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65986         
##                                           
##        'Positive' Class : 1               
##

par(mfrow=c(2,2))
plot(bbbc_log3_4, which=c(1:4))

Optimal cutoff point on Full logistic regression model with interaction

effect terms using backward selection method

pred = prediction(predict(bbbc_log3_4, newdata = bbbc_test, type = "response"), bbbc_test$Choice)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

Full Logistic regression model

using backward selection

bbbc_log4 = step(bbbc_log, direction = "backward", trace = FALSE)
summary(bbbc_log4)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     family = binomial, data = bbbc_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.44132  -0.66647  -0.43745  -0.01855   2.72460  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.2833949  0.2062721  -1.374   0.1695    
## Gender1          -0.8660575  0.1373268  -6.307 2.85e-10 ***
## Amount_purchased  0.0018357  0.0007908   2.321   0.0203 *  
## Frequency        -0.0903261  0.0106304  -8.497  < 2e-16 ***
## Last_purchase     0.5536689  0.0784519   7.057 1.70e-12 ***
## P_Child          -0.8181807  0.1163377  -7.033 2.02e-12 ***
## P_Youth          -0.6424923  0.1432548  -4.485 7.29e-06 ***
## P_Cook           -0.9330131  0.1190073  -7.840 4.51e-15 ***
## P_DIY            -0.9101106  0.1433591  -6.348 2.17e-10 ***
## P_Art             0.6643371  0.1255243   5.292 1.21e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1799.5  on 1599  degrees of freedom
## Residual deviance: 1393.5  on 1590  degrees of freedom
## AIC: 1413.5
## 
## Number of Fisher Scoring iterations: 5

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log4, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1973  123
##          1  127   77
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8779, 0.9037)
##     No Information Rate : 0.913           
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.3216          
##                                           
##  Mcnemar's Test P-Value : 0.8495          
##                                           
##             Sensitivity : 0.38500         
##             Specificity : 0.93952         
##          Pos Pred Value : 0.37745         
##          Neg Pred Value : 0.94132         
##              Prevalence : 0.08696         
##          Detection Rate : 0.03348         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66226         
##                                           
##        'Positive' Class : 1               
##

Cooks distance diagnostic plot

cook_distance_log4 = cooks.distance(bbbc_log4)
plot(cook_distance_log4, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")

#### Remove outliers

large_cook_distance = which(cooks.distance(bbbc_log4)>(4/1600))

bbbc_log4_trained = bbbc_train[-large_cook_distance,]

bbbc_log4_2 = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_log4_trained, family = binomial) 
summary(bbbc_log4_2)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, 
##     family = binomial, data = bbbc_log4_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.12566  -0.52618  -0.29449  -0.07674   2.78007  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.0359902  0.2490070  -0.145  0.88508    
## Gender1          -0.9746586  0.1653465  -5.895 3.75e-09 ***
## Amount_purchased  0.0025809  0.0009611   2.685  0.00725 ** 
## Frequency        -0.1531425  0.0151177 -10.130  < 2e-16 ***
## Last_purchase     0.8096840  0.1094971   7.395 1.42e-13 ***
## P_Child          -1.2429659  0.1634518  -7.604 2.86e-14 ***
## P_Youth          -1.2818878  0.2081274  -6.159 7.31e-10 ***
## P_Cook           -1.3858412  0.1629903  -8.503  < 2e-16 ***
## P_DIY            -1.4611262  0.1967092  -7.428 1.10e-13 ***
## P_Art             0.9484052  0.1614054   5.876 4.21e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1558.29  on 1495  degrees of freedom
## Residual deviance:  999.36  on 1486  degrees of freedom
## AIC: 1019.4
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log4_2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1946  150
##          1  119   85
##                                           
##                Accuracy : 0.883           
##                  95% CI : (0.8692, 0.8959)
##     No Information Rate : 0.8978          
##     P-Value [Acc > NIR] : 0.99024         
##                                           
##                   Kappa : 0.323           
##                                           
##  Mcnemar's Test P-Value : 0.06738         
##                                           
##             Sensitivity : 0.36170         
##             Specificity : 0.94237         
##          Pos Pred Value : 0.41667         
##          Neg Pred Value : 0.92844         
##              Prevalence : 0.10217         
##          Detection Rate : 0.03696         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.65204         
##                                           
##        'Positive' Class : 1               
##

Multicollinearity check VIF

vif(bbbc_log4_2)

##           Gender Amount_purchased        Frequency    Last_purchase 
##         1.034201         1.191262         1.078824        12.954224 
##          P_Child          P_Youth           P_Cook            P_DIY 
##         3.147356         1.838353         3.218469         2.048614 
##            P_Art 
##         1.984071

Removing variables with high VIF

bbbc_log4_3 = glm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_log4_trained, family = binomial) 
summary(bbbc_log4_3)

## 
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency + 
##     P_Child + P_Youth + P_Cook + P_DIY + P_Art, family = binomial, 
##     data = bbbc_log4_trained)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.03973  -0.55738  -0.32735  -0.09369   2.56494  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.0934399  0.2419181  -0.386 0.699315    
## Gender1          -0.9195610  0.1606789  -5.723 1.05e-08 ***
## Amount_purchased  0.0034735  0.0009316   3.729 0.000192 ***
## Frequency        -0.1442607  0.0143420 -10.059  < 2e-16 ***
## P_Child          -0.3050191  0.0955213  -3.193 0.001407 ** 
## P_Youth          -0.3524054  0.1523640  -2.313 0.020727 *  
## P_Cook           -0.4693417  0.0979090  -4.794 1.64e-06 ***
## P_DIY            -0.5691434  0.1467527  -3.878 0.000105 ***
## P_Art             1.7802192  0.1261186  14.115  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1558.3  on 1495  degrees of freedom
## Residual deviance: 1057.9  on 1487  degrees of freedom
## AIC: 1075.9
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix

bbbc_log_Pred_Prob = predict.glm(bbbc_log4_3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .16438,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1473  623
##          1   61  143
##                                           
##                Accuracy : 0.7026          
##                  95% CI : (0.6835, 0.7212)
##     No Information Rate : 0.667           
##     P-Value [Acc > NIR] : 0.0001376       
##                                           
##                   Kappa : 0.18            
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.18668         
##             Specificity : 0.96023         
##          Pos Pred Value : 0.70098         
##          Neg Pred Value : 0.70277         
##              Prevalence : 0.33304         
##          Detection Rate : 0.06217         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.57346         
##                                           
##        'Positive' Class : 1               
##

OR= exp(bbbc_log4_3$coefficients)
round(OR, 3)

##      (Intercept)          Gender1 Amount_purchased        Frequency 
##            0.911            0.399            1.003            0.866 
##          P_Child          P_Youth           P_Cook            P_DIY 
##            0.737            0.703            0.625            0.566 
##            P_Art 
##            5.931

vif(bbbc_log4_3)

##           Gender Amount_purchased        Frequency          P_Child 
##         1.032861         1.169365         1.058566         1.169014 
##          P_Youth           P_Cook            P_DIY            P_Art 
##         1.087578         1.200231         1.192834         1.292923

par(mfrow=c(2,2))
plot(bbbc_log4_3, which=c(1:4))

probabilities <- predict(bbbc_log4_3, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)

# Select only numeric variables
mydata = dplyr::select_if(bbbc_log4_trained, is.numeric) 
predictors <- colnames(mydata)
# Binding the logit and tidying the data for plot
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities))) 
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)

#making the plot
ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")

## `geom_smooth()` using formula 'y ~ x'

Optimal cutoff point on Full logistic regression model

without interaction effect terms using backward selection method

pred = prediction(predict(bbbc_log4_3, newdata = bbbc_test, type = "response"), bbbc_test$Choice)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

## Linear SVM

set.seed(1)
form1 = Choice ~ .
tuned_svm = tune.svm(form1, data = bbbc_train,kernel = "linear", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))

tuned_svm$best.parameters

##    gamma cost
## 61  0.01  0.7

bbbc_svm = svm(Choice ~., data = bbbc_train, kernel = "linear", gamma = tuned_svm$best.parameters$gamma, cost = tuned_svm$best.parameters$cost)
summary(bbbc_svm)

## 
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, kernel = "linear", gamma = tuned_svm$best.parameters$gamma, 
##     cost = tuned_svm$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.7 
## 
## Number of Support Vectors:  739
## 
##  ( 367 372 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion Matrix

bbbc_svm_predict1 = predict(bbbc_svm, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict1), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2011   85
##          1  147   57
##                                           
##                Accuracy : 0.8991          
##                  95% CI : (0.8861, 0.9111)
##     No Information Rate : 0.9383          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2768          
##                                           
##  Mcnemar's Test P-Value : 6.206e-05       
##                                           
##             Sensitivity : 0.40141         
##             Specificity : 0.93188         
##          Pos Pred Value : 0.27941         
##          Neg Pred Value : 0.95945         
##              Prevalence : 0.06174         
##          Detection Rate : 0.02478         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.66664         
##                                           
##        'Positive' Class : 1               
##

Polynomial SVM

set.seed(1)
tuned_svm2 = tune.svm(form1, data = bbbc_train,kernel = "polynomial", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))

tuned_svm2$best.parameters

##    gamma cost
## 90   0.1  0.9

bbbc_svm2 = svm(formula = form1, data = bbbc_train,kernel = "polynomial", gamma = tuned_svm2$best.parameters$gamma, cost = tuned_svm2$best.parameters$cost)
summary(bbbc_svm2)

## 
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "polynomial", gamma = tuned_svm2$best.parameters$gamma, 
##     cost = tuned_svm2$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.9 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  749
## 
##  ( 356 393 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion matrix

bbbc_svm_predict2 = predict(bbbc_svm2, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict2), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2050   46
##          1  168   36
##                                           
##                Accuracy : 0.907           
##                  95% CI : (0.8943, 0.9185)
##     No Information Rate : 0.9643          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2117          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.43902         
##             Specificity : 0.92426         
##          Pos Pred Value : 0.17647         
##          Neg Pred Value : 0.97805         
##              Prevalence : 0.03565         
##          Detection Rate : 0.01565         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.68164         
##                                           
##        'Positive' Class : 1               
##

Radial SVM

set.seed(1)
tuned_svm3 = tune.svm(form1, data = bbbc_train,kernel = "radial", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))

tuned_svm3$best.parameters

##    gamma cost
## 75  0.05  0.8

bbbc_svm3 = svm(formula = form1, data = bbbc_train,kernel = "radial", gamma = tuned_svm3$best.parameters$gamma, cost = tuned_svm3$best.parameters$cost)
summary(bbbc_svm3)

## 
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "radial", gamma = tuned_svm3$best.parameters$gamma, 
##     cost = tuned_svm3$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.8 
## 
## Number of Support Vectors:  782
## 
##  ( 371 411 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion Matrix

bbbc_svm_predict3 = predict(bbbc_svm3, bbbc_test, type = "response")
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_svm_predict3), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2054   42
##          1  166   38
##                                          
##                Accuracy : 0.9096         
##                  95% CI : (0.8971, 0.921)
##     No Information Rate : 0.9652         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2291         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.47500        
##             Specificity : 0.92523        
##          Pos Pred Value : 0.18627        
##          Neg Pred Value : 0.97996        
##              Prevalence : 0.03478        
##          Detection Rate : 0.01652        
##    Detection Prevalence : 0.08870        
##       Balanced Accuracy : 0.70011        
##                                          
##        'Positive' Class : 1              
##

Sigmoid SVM

set.seed(1)
tuned_svm4 = tune.svm(form1, data = bbbc_train,kernel = "sigmoid", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))

tuned_svm4$best.parameters

##    gamma cost
## 53  0.03  0.6

bbbc_svm4 = svm(formula = form1, data = bbbc_train,kernel = "sigmoid", gamma = tuned_svm4$best.parameters$gamma, cost = tuned_svm4$best.parameters$cost)
summary(bbbc_svm4)

## 
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "sigmoid", gamma = tuned_svm4$best.parameters$gamma, 
##     cost = tuned_svm4$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  0.6 
##      coef.0:  0 
## 
## Number of Support Vectors:  762
## 
##  ( 379 383 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Confusion Matrix

bbbc_svm_predict3 = predict(bbbc_svm4, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict3), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1983  113
##          1  144   60
##                                           
##                Accuracy : 0.8883          
##                  95% CI : (0.8747, 0.9009)
##     No Information Rate : 0.9248          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.2579          
##                                           
##  Mcnemar's Test P-Value : 0.0613          
##                                           
##             Sensitivity : 0.34682         
##             Specificity : 0.93230         
##          Pos Pred Value : 0.29412         
##          Neg Pred Value : 0.94609         
##              Prevalence : 0.07522         
##          Detection Rate : 0.02609         
##    Detection Prevalence : 0.08870         
##       Balanced Accuracy : 0.63956         
##                                           
##        'Positive' Class : 1               
##

Logistic Regression Model Analysis on New Mail Campaign

proportion_00 = 1473/2300
proportion_01 = 623/2300
proportion_10 = 61/2300
proportion_11 = 143/2300

new_customer = 50000

print("Proportion of people who are not going to buy book")

## [1] "Proportion of people who are not going to buy book"

proportion_00

## [1] 0.6404348

print("Proportion of people who we predicted would not buy, but did buy")

## [1] "Proportion of people who we predicted would not buy, but did buy"

proportion_01

## [1] 0.2708696

print("Proportion of people who we predicted would buy, but did not buy")

## [1] "Proportion of people who we predicted would buy, but did not buy"

proportion_10

## [1] 0.02652174

print("Proportion of people who we predicted will buy")

## [1] "Proportion of people who we predicted will buy"

proportion_11

## [1] 0.06217391

print("Number of people who are not going to buy book")

## [1] "Number of people who are not going to buy book"

number_00 = proportion_00*new_customer
number_00

## [1] 32021.74

print("Number of people who we predicted would not buy, but did buy")

## [1] "Number of people who we predicted would not buy, but did buy"

number_01 = proportion_01*new_customer
number_01

## [1] 13543.48

print("Number of people who we predicted would buy, but did not buy")

## [1] "Number of people who we predicted would buy, but did not buy"

number_10 = proportion_10*new_customer
number_10

## [1] 1326.087

print("Number of people who we predicted will buy")

## [1] "Number of people who we predicted will buy"

number_11 = proportion_11*new_customer
number_11

## [1] 3108.696

print("Number of people to be sent mail")

## [1] "Number of people to be sent mail"

number_send = number_10 + number_11
number_send

## [1] 4434.783

print("Number of people who will buy out of the number of people to be sent mail")

## [1] "Number of people who will buy out of the number of people to be sent mail"

number_11

## [1] 3108.696

print("proportion of people who will buy from the number of people sent mail")

## [1] "proportion of people who will buy from the number of people sent mail"

proportion_sent = number_11/number_send
proportion_sent

## [1] 0.7009804

print("cost of sending mail to all in $")

## [1] "cost of sending mail to all in $"

cost_total = number_send * 0.65
cost_total

## [1] 2882.609

print("cost for sending book to total number of people who buy in $")

## [1] "cost for sending book to total number of people who buy in $"

cost_buy = number_11*15
cost_buy

## [1] 46630.43

print("cost overhead in $")

## [1] "cost overhead in $"

cost_overhead = cost_buy*.45
cost_overhead

## [1] 20983.7

print("Total Revenue generated out of books sold in $")

## [1] "Total Revenue generated out of books sold in $"

revenue_total = number_11*31.95
revenue_total

## [1] 99322.83

print("Total Profit made by selecting this logistic model in $")

## [1] "Total Profit made by selecting this logistic model in $"

profit_total = revenue_total - cost_total - cost_buy - cost_overhead
profit_total

## [1] 28826.09

SVM Model Analysis on New Mail Campaign

proportion_00_SVM = 1983/2300
proportion_01_SVM = 113/2300
proportion_10_SVM = 144/2300
proportion_11_SVM = 60/2300


number_00_SVM = proportion_00_SVM*new_customer
number_00_SVM

## [1] 43108.7

number_01_SVM = proportion_01_SVM*new_customer
number_01_SVM

## [1] 2456.522

number_10_SVM = proportion_10_SVM*new_customer
number_10_SVM

## [1] 3130.435

number_11_SVM = proportion_11_SVM*new_customer
number_11_SVM

## [1] 1304.348

print("Number of people to be sent mail")

## [1] "Number of people to be sent mail"

number_send_SVM = number_10_SVM + number_11_SVM
number_send_SVM

## [1] 4434.783

print("Number of people who will buy out of the number of people to be sent mail")

## [1] "Number of people who will buy out of the number of people to be sent mail"

number_11_SVM

## [1] 1304.348

print("proportion of people who will buy from the number of people sent mail")

## [1] "proportion of people who will buy from the number of people sent mail"

proportion_sent_SVM = number_11_SVM/number_send_SVM
proportion_sent_SVM

## [1] 0.2941176

print("cost of sending mail to all in $")

## [1] "cost of sending mail to all in $"

cost_total_SVM = number_send_SVM * 0.65
cost_total_SVM

## [1] 2882.609

print("cost for sending book to total number of people who buy in $")

## [1] "cost for sending book to total number of people who buy in $"

cost_buy_SVM = number_11_SVM*15
cost_buy_SVM

## [1] 19565.22

print("cost overhead in $")

## [1] "cost overhead in $"

cost_overhead_SVM = cost_buy_SVM*.45
cost_overhead_SVM

## [1] 8804.348

print("Total Revenue generated out of books sold in $")

## [1] "Total Revenue generated out of books sold in $"

revenue_total_SVM = number_11_SVM*31.95
revenue_total_SVM

## [1] 41673.91

print("Total Profit made by selecting this SVM model in $")

## [1] "Total Profit made by selecting this SVM model in $"

profit_total_SVM = revenue_total_SVM - cost_total_SVM - cost_buy_SVM - cost_overhead_SVM
profit_total_SVM

## [1] 10421.74

Model Analysis on New Mail Campaign if mail sent to all

summary(bbbc_test$Choice)

##    0    1 
## 2096  204

proportion_1 = 204/2300
proportion_0 = 2096/2300

proportion_1

## [1] 0.08869565

proportion_0

## [1] 0.9113043

total_1 = proportion_1*new_customer
total_0 = proportion_0*new_customer

print("Number of people to be sent mail")

## [1] "Number of people to be sent mail"

send_total = total_1 + total_0
send_total

## [1] 50000

print("Number of people who will buy out of the number of people to be sent mail")

## [1] "Number of people who will buy out of the number of people to be sent mail"

total_1

## [1] 4434.783

print("proportion of people who will buy from the number of people sent mail")

## [1] "proportion of people who will buy from the number of people sent mail"

proportion_sent_total = total_1/send_total
proportion_sent_total

## [1] 0.08869565

print("cost of sending mail to all in $")

## [1] "cost of sending mail to all in $"

cost_total_all = send_total*.65
cost_total_all

## [1] 32500

print("cost for sending book to total number of people who buy in $")

## [1] "cost for sending book to total number of people who buy in $"

cost_buy_all = total_1*15
cost_buy_all

## [1] 66521.74

print("cost overhead in $")

## [1] "cost overhead in $"

cost_overhead_all = cost_buy_all*.45
cost_overhead_all

## [1] 29934.78

print("Total Revenue generated out of books sold in $")

## [1] "Total Revenue generated out of books sold in $"

revenue_total_all = total_1*31.95
revenue_total_all

## [1] 141691.3

print("Total Profit made by sending campaign mail to all in $")

## [1] "Total Profit made by sending campaign mail to all in $"

profit_total_all = revenue_total_all - cost_total_all - cost_buy_all - cost_overhead_all
profit_total_all

## [1] 12734.78

Case Study 2

Shams Tabrez

3/8/2021

Load Libraries

Read train and test dataset

Check structure of train dataset

Check structure of test dataset

Change Gender variable to factor

Check/remove missing na values from dataset

Choice as factor variable

Gender as factor variable

Amount purchased as factor variable

Apply Linear model with all predictors

Apply linear regressions with predictors interaction effect

Apply Linear regression with predictors interaction effect and backward selection

Linear regression with selected variables from previous backward selection step

Cooks distance diagnostic plot

Multicollinearity check VIF values

Removing variable Last_purchase, Amount_purchased:P_Youth, Frequency:First_purchase, P_Child:First_purchase, with high VIF value correlation

Multicollinearity check VIF values

Linear regression with selected variables from previous backward selection step

Cooks distance diagnostic plot

Multicollinearity check VIF values

Removing variable Last_purchase with high VIF value correlation

Full Logistic Regression Model

Confusion matrix

Full Logistic Regression with interaction terms

Confusion matrix

Full Logistic regression with interaction terms backward selection

Confusion matrix

Cooks distance diagnostic plot

Confusion matrix

Multicollinearity check VIF

Removing high collerated VIF value variables and interaction terms

Confusion matrix

Check VIF multicollinearity

Confusion matrix

Optimal cutoff point on Full logistic regression model with interaction

effect terms using backward selection method

Full Logistic regression model

using backward selection

Confusion matrix

Cooks distance diagnostic plot

Confusion matrix

Multicollinearity check VIF

Removing variables with high VIF

Confusion matrix

Optimal cutoff point on Full logistic regression model

without interaction effect terms using backward selection method

Confusion Matrix

Polynomial SVM

Confusion matrix

Radial SVM

Confusion Matrix

Sigmoid SVM

Confusion Matrix

Logistic Regression Model Analysis on New Mail Campaign

SVM Model Analysis on New Mail Campaign

Model Analysis on New Mail Campaign if mail sent to all