bbbc_train = read_excel("BBBC-Train.xlsx")[2:12]
bbbc_test = read_excel("BBBC-Test.xlsx")[2:12]
str(bbbc_train)
## tibble [1,600 × 11] (S3: tbl_df/tbl/data.frame)
## $ Choice : num [1:1600] 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : num [1:1600] 1 1 1 1 0 1 1 0 1 1 ...
## $ Amount_purchased: num [1:1600] 113 418 336 180 320 268 198 280 393 138 ...
## $ Frequency : num [1:1600] 8 6 18 16 2 4 2 6 12 10 ...
## $ Last_purchase : num [1:1600] 1 11 6 5 3 1 12 2 11 7 ...
## $ First_purchase : num [1:1600] 8 66 32 42 18 4 62 12 50 38 ...
## $ P_Child : num [1:1600] 0 0 2 2 0 0 2 0 3 2 ...
## $ P_Youth : num [1:1600] 1 2 0 0 0 0 3 2 0 3 ...
## $ P_Cook : num [1:1600] 0 3 1 0 0 0 2 0 3 0 ...
## $ P_DIY : num [1:1600] 0 2 1 1 1 0 1 0 0 0 ...
## $ P_Art : num [1:1600] 0 3 2 1 2 0 2 0 2 1 ...
str(bbbc_test)
## tibble [2,300 × 11] (S3: tbl_df/tbl/data.frame)
## $ Choice : num [1:2300] 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : num [1:2300] 0 1 1 0 1 0 1 1 1 1 ...
## $ Amount_purchased: num [1:2300] 287 215 261 24 120 66 42 233 66 199 ...
## $ Frequency : num [1:2300] 12 4 2 4 8 2 12 8 12 22 ...
## $ Last_purchase : num [1:2300] 4 1 1 1 1 4 1 2 1 1 ...
## $ First_purchase : num [1:2300] 24 4 2 4 8 16 12 12 12 22 ...
## $ P_Child : num [1:2300] 0 0 0 1 0 0 0 0 0 0 ...
## $ P_Youth : num [1:2300] 3 0 0 0 0 0 0 0 0 0 ...
## $ P_Cook : num [1:2300] 0 0 0 0 0 1 1 0 0 0 ...
## $ P_DIY : num [1:2300] 0 0 0 0 0 1 0 0 0 0 ...
## $ P_Art : num [1:2300] 1 1 1 0 1 1 0 0 0 1 ...
bbbc_train$Gender = as.factor(bbbc_train$Gender)
bbbc_test$Gender = as.factor(bbbc_test$Gender)
bbbc_train = na.omit(bbbc_train)
bbbc_test = na.omit(bbbc_test)
levels(as.factor(bbbc_train$Choice))
## [1] "0" "1"
levels(as.factor(bbbc_train$Gender))
## [1] "0" "1"
levels(as.factor(bbbc_train$Amount_purchased))
## [1] "15" "16" "17" "18" "21" "22" "23" "24" "25" "26" "27" "28"
## [13] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40"
## [25] "41" "42" "43" "44" "45" "46" "47" "48" "49" "51" "52" "53"
## [37] "54" "55" "56" "57" "59" "60" "61" "62" "63" "64" "65" "66"
## [49] "67" "68" "69" "70" "71" "72" "73" "74" "75" "76" "77" "78"
## [61] "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90"
## [73] "91" "92" "93" "94" "95" "96" "97" "98" "99" "100" "101" "102"
## [85] "103" "104" "105" "106" "107" "108" "110" "111" "112" "113" "114" "115"
## [97] "116" "117" "118" "119" "120" "121" "122" "123" "124" "125" "126" "127"
## [109] "128" "129" "130" "131" "132" "133" "134" "135" "136" "137" "138" "139"
## [121] "140" "141" "142" "143" "144" "145" "146" "147" "148" "149" "150" "151"
## [133] "152" "153" "154" "155" "156" "157" "158" "159" "160" "161" "162" "163"
## [145] "164" "165" "166" "167" "168" "169" "170" "171" "172" "173" "174" "175"
## [157] "176" "177" "178" "179" "180" "181" "182" "183" "184" "185" "186" "187"
## [169] "188" "189" "190" "191" "192" "193" "194" "195" "196" "197" "198" "199"
## [181] "200" "201" "202" "203" "204" "205" "206" "207" "208" "209" "210" "211"
## [193] "212" "213" "214" "215" "216" "217" "218" "219" "220" "221" "222" "223"
## [205] "224" "225" "226" "227" "228" "229" "230" "231" "232" "233" "234" "235"
## [217] "236" "237" "238" "239" "240" "241" "242" "243" "244" "245" "246" "247"
## [229] "248" "249" "250" "251" "252" "253" "254" "255" "256" "257" "258" "259"
## [241] "260" "261" "262" "263" "264" "265" "266" "267" "268" "269" "270" "271"
## [253] "272" "273" "274" "275" "276" "277" "278" "279" "280" "281" "282" "283"
## [265] "284" "285" "286" "287" "288" "289" "290" "291" "292" "293" "294" "295"
## [277] "296" "297" "298" "299" "300" "301" "302" "303" "304" "305" "306" "307"
## [289] "308" "309" "310" "311" "312" "313" "314" "315" "316" "317" "318" "319"
## [301] "320" "321" "322" "323" "324" "325" "326" "327" "328" "329" "330" "331"
## [313] "332" "333" "334" "335" "336" "338" "339" "340" "341" "342" "343" "344"
## [325] "345" "346" "347" "349" "350" "351" "352" "353" "354" "355" "356" "357"
## [337] "360" "361" "364" "366" "367" "369" "370" "371" "372" "373" "374" "375"
## [349] "376" "377" "382" "383" "384" "385" "386" "388" "389" "390" "391" "393"
## [361] "394" "396" "397" "399" "400" "401" "403" "404" "405" "406" "407" "408"
## [373] "410" "413" "414" "418" "425" "428" "432" "434" "437" "444" "447" "451"
## [385] "469" "473" "474"
levels(as.factor(bbbc_train$Frequency))
## [1] "2" "4" "6" "8" "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36"
levels(as.factor(bbbc_train$Last_purchase))
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
levels(as.factor(bbbc_train$First_purchase))
## [1] "2" "4" "6" "8" "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36" "38" "40" "42" "44" "46" "48" "50" "52" "54" "56" "58" "60"
## [31] "62" "64" "66" "68" "70" "72" "74" "76" "78" "80" "82" "84" "86" "96"
levels(as.factor(bbbc_train$P_Child))
## [1] "0" "1" "2" "3" "4" "5" "6" "8"
levels(as.factor(bbbc_train$P_Youth))
## [1] "0" "1" "2" "3" "4"
levels(as.factor(bbbc_train$P_Cook))
## [1] "0" "1" "2" "3" "4" "5" "6"
levels(as.factor(bbbc_train$P_DIY))
## [1] "0" "1" "2" "3" "4"
levels(as.factor(bbbc_train$P_Art))
## [1] "0" "1" "2" "3" "4" "5"
levels(as.factor(bbbc_test$Choice))
## [1] "0" "1"
levels(as.factor(bbbc_test$Gender))
## [1] "0" "1"
levels(as.factor(bbbc_test$Amount_purchased))
## [1] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
## [13] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40"
## [25] "41" "42" "43" "44" "45" "46" "47" "48" "49" "50" "51" "52"
## [37] "53" "54" "55" "56" "57" "58" "59" "60" "61" "62" "63" "64"
## [49] "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75" "76"
## [61] "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88"
## [73] "89" "90" "91" "92" "93" "94" "95" "96" "97" "98" "99" "100"
## [85] "101" "102" "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"
## [97] "113" "114" "115" "116" "117" "118" "119" "120" "121" "122" "123" "124"
## [109] "125" "126" "127" "128" "129" "130" "131" "132" "133" "134" "135" "136"
## [121] "137" "138" "139" "140" "141" "142" "143" "144" "145" "146" "147" "148"
## [133] "149" "150" "151" "152" "153" "154" "155" "156" "157" "158" "159" "160"
## [145] "161" "162" "163" "164" "165" "166" "167" "168" "169" "170" "171" "172"
## [157] "173" "174" "175" "176" "177" "178" "179" "180" "181" "182" "183" "184"
## [169] "185" "186" "187" "188" "189" "190" "191" "192" "193" "194" "195" "196"
## [181] "197" "198" "199" "200" "201" "202" "203" "204" "205" "206" "207" "208"
## [193] "209" "210" "211" "212" "213" "214" "215" "216" "217" "218" "219" "220"
## [205] "221" "222" "223" "224" "225" "226" "227" "228" "229" "230" "231" "232"
## [217] "233" "234" "235" "236" "237" "238" "239" "240" "241" "242" "243" "244"
## [229] "245" "246" "247" "248" "249" "250" "251" "252" "253" "254" "255" "256"
## [241] "257" "258" "259" "260" "261" "262" "263" "264" "265" "266" "267" "268"
## [253] "269" "270" "271" "272" "273" "274" "275" "276" "277" "278" "279" "280"
## [265] "281" "282" "283" "284" "285" "286" "287" "288" "289" "290" "291" "292"
## [277] "293" "294" "295" "296" "297" "298" "299" "300" "301" "302" "303" "304"
## [289] "305" "306" "307" "308" "309" "310" "311" "312" "313" "314" "315" "316"
## [301] "317" "318" "319" "320" "321" "322" "323" "324" "325" "326" "327" "328"
## [313] "329" "330" "331" "332" "333" "334" "335" "336" "337" "338" "339" "342"
## [325] "343" "344" "345" "346" "347" "348" "350" "351" "352" "353" "355" "356"
## [337] "357" "359" "360" "363" "364" "365" "366" "368" "369" "370" "371" "372"
## [349] "374" "377" "379" "382" "383" "384" "385" "386" "387" "388" "391" "392"
## [361] "393" "396" "397" "398" "399" "400" "401" "402" "404" "405" "406" "407"
## [373] "409" "410" "411" "414" "419" "420" "421" "423" "427" "428" "429" "431"
## [385] "433" "434" "435" "439" "442" "443" "444" "447" "450" "458" "461"
levels(as.factor(bbbc_test$Frequency))
## [1] "2" "4" "6" "8" "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36"
levels(as.factor(bbbc_test$Last_purchase))
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
levels(as.factor(bbbc_test$First_purchase))
## [1] "2" "4" "6" "8" "10" "12" "14" "16" "18" "20" "22" "24" "26" "28" "30"
## [16] "32" "34" "36" "38" "40" "42" "44" "46" "48" "50" "52" "54" "56" "58" "60"
## [31] "62" "64" "66" "68" "70" "72" "74" "76" "78" "80" "84" "88" "96"
levels(as.factor(bbbc_test$P_Child))
## [1] "0" "1" "2" "3" "4" "5" "6" "7"
levels(as.factor(bbbc_test$P_Youth))
## [1] "0" "1" "2" "3" "4" "5"
levels(as.factor(bbbc_test$P_Cook))
## [1] "0" "1" "2" "3" "4" "5" "6"
levels(as.factor(bbbc_test$P_DIY))
## [1] "0" "1" "2" "3" "4"
levels(as.factor(bbbc_test$P_Art))
## [1] "0" "1" "2" "3" "4"
bbbc_lm = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train)
summary(bbbc_lm)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + First_purchase + P_Child + P_Youth + P_Cook +
## P_DIY + P_Art, data = bbbc_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9603 -0.2462 -0.1161 0.1622 1.0588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3642284 0.0307411 11.848 < 2e-16 ***
## Gender1 -0.1309205 0.0200303 -6.536 8.48e-11 ***
## Amount_purchased 0.0002736 0.0001110 2.464 0.0138 *
## Frequency -0.0090868 0.0021791 -4.170 3.21e-05 ***
## Last_purchase 0.0970286 0.0135589 7.156 1.26e-12 ***
## First_purchase -0.0020024 0.0018160 -1.103 0.2704
## P_Child -0.1262584 0.0164011 -7.698 2.41e-14 ***
## P_Youth -0.0963563 0.0201097 -4.792 1.81e-06 ***
## P_Cook -0.1414907 0.0166064 -8.520 < 2e-16 ***
## P_DIY -0.1352313 0.0197873 -6.834 1.17e-11 ***
## P_Art 0.1178494 0.0194427 6.061 1.68e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1589 degrees of freedom
## Multiple R-squared: 0.2401, Adjusted R-squared: 0.2353
## F-statistic: 50.2 on 10 and 1589 DF, p-value: < 2.2e-16
bbbc_lm1 = lm(Choice ~ (.)^2, data = bbbc_train)
summary(bbbc_lm1)
##
## Call:
## lm(formula = Choice ~ (.)^2, data = bbbc_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03612 -0.22165 -0.10283 0.06123 0.99352
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.765e-01 6.995e-02 5.382 8.51e-08 ***
## Gender1 -1.777e-01 5.843e-02 -3.041 0.00240 **
## Amount_purchased 1.989e-04 3.084e-04 0.645 0.51902
## Frequency -7.691e-03 8.245e-03 -0.933 0.35102
## Last_purchase 1.533e-01 5.225e-02 2.933 0.00340 **
## First_purchase -1.035e-02 7.439e-03 -1.392 0.16420
## P_Child -1.630e-01 5.581e-02 -2.920 0.00356 **
## P_Youth -8.511e-02 6.878e-02 -1.237 0.21610
## P_Cook -1.305e-01 5.775e-02 -2.260 0.02398 *
## P_DIY -1.416e-01 7.104e-02 -1.994 0.04635 *
## P_Art 1.835e-01 6.735e-02 2.725 0.00651 **
## Gender1:Amount_purchased 2.387e-04 2.396e-04 0.996 0.31922
## Gender1:Frequency -3.741e-03 4.702e-03 -0.796 0.42632
## Gender1:Last_purchase -6.512e-02 2.931e-02 -2.222 0.02645 *
## Gender1:First_purchase 7.184e-03 3.981e-03 1.804 0.07135 .
## Gender1:P_Child 4.698e-02 3.502e-02 1.341 0.17996
## Gender1:P_Youth 4.745e-02 4.279e-02 1.109 0.26766
## Gender1:P_Cook 4.097e-02 3.516e-02 1.165 0.24408
## Gender1:P_DIY -2.289e-02 4.233e-02 -0.541 0.58867
## Gender1:P_Art 3.922e-02 4.205e-02 0.933 0.35116
## Amount_purchased:Frequency -3.406e-06 2.713e-05 -0.126 0.90014
## Amount_purchased:Last_purchase -1.530e-05 1.555e-04 -0.098 0.92165
## Amount_purchased:First_purchase -2.818e-06 2.277e-05 -0.124 0.90155
## Amount_purchased:P_Child 1.670e-04 1.926e-04 0.867 0.38598
## Amount_purchased:P_Youth -4.600e-04 2.276e-04 -2.021 0.04343 *
## Amount_purchased:P_Cook -6.230e-05 1.958e-04 -0.318 0.75040
## Amount_purchased:P_DIY 1.370e-04 2.336e-04 0.587 0.55747
## Amount_purchased:P_Art 2.441e-04 2.276e-04 1.072 0.28379
## Frequency:Last_purchase -1.008e-03 1.802e-03 -0.559 0.57604
## Frequency:First_purchase 2.641e-04 1.081e-04 2.444 0.01465 *
## Frequency:P_Child -2.299e-04 2.894e-03 -0.079 0.93668
## Frequency:P_Youth -1.632e-04 3.336e-03 -0.049 0.96100
## Frequency:P_Cook -1.263e-04 2.813e-03 -0.045 0.96420
## Frequency:P_DIY 2.298e-03 3.546e-03 0.648 0.51706
## Frequency:P_Art -9.074e-03 3.219e-03 -2.819 0.00488 **
## Last_purchase:First_purchase 1.856e-04 9.478e-04 0.196 0.84480
## Last_purchase:P_Child -3.015e-03 7.146e-03 -0.422 0.67315
## Last_purchase:P_Youth 1.825e-02 1.325e-02 1.377 0.16877
## Last_purchase:P_Cook -7.985e-04 9.259e-03 -0.086 0.93129
## Last_purchase:P_DIY -1.875e-02 1.355e-02 -1.383 0.16682
## Last_purchase:P_Art -4.667e-03 1.319e-02 -0.354 0.72351
## First_purchase:P_Child -1.048e-03 1.701e-03 -0.616 0.53779
## First_purchase:P_Youth 9.484e-04 2.197e-03 0.432 0.66600
## First_purchase:P_Cook -2.934e-04 1.707e-03 -0.172 0.86355
## First_purchase:P_DIY 1.131e-03 2.229e-03 0.507 0.61191
## First_purchase:P_Art 9.424e-04 2.047e-03 0.460 0.64525
## P_Child:P_Youth -1.070e-02 2.005e-02 -0.534 0.59364
## P_Child:P_Cook 2.717e-02 1.337e-02 2.032 0.04236 *
## P_Child:P_DIY 2.246e-02 2.116e-02 1.061 0.28877
## P_Child:P_Art -1.623e-02 1.933e-02 -0.840 0.40114
## P_Youth:P_Cook -3.018e-02 2.109e-02 -1.431 0.15259
## P_Youth:P_DIY -2.040e-02 2.994e-02 -0.681 0.49567
## P_Youth:P_Art -1.887e-02 2.628e-02 -0.718 0.47276
## P_Cook:P_DIY 3.197e-03 2.370e-02 0.135 0.89272
## P_Cook:P_Art -2.763e-02 1.973e-02 -1.400 0.16165
## P_DIY:P_Art 2.514e-02 2.742e-02 0.917 0.35935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3759 on 1544 degrees of freedom
## Multiple R-squared: 0.2727, Adjusted R-squared: 0.2468
## F-statistic: 10.53 on 55 and 1544 DF, p-value: < 2.2e-16
bbbc_lm2 = ols_step_backward_p(bbbc_lm1, prem = 0.05, details = F)
bbbc_lm2$model
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Coefficients:
## (Intercept) Gender1 Amount_purchased
## 0.3652837 -0.1300602 0.0004283
## Frequency Last_purchase P_Child
## -0.0159536 0.0757656 -0.1051257
## P_Youth P_Cook P_DIY
## -0.0439214 -0.1468073 -0.1319547
## P_Art Amount_purchased:P_Youth Frequency:First_purchase
## 0.2569808 -0.0004654 0.0002160
## Frequency:P_Art Last_purchase:P_Youth P_Child:First_purchase
## -0.0088380 0.0095153 -0.0012821
## P_Child:P_Cook P_Cook:P_Art
## 0.0240791 -0.0294565
bbbc_lm2 = lm(data = bbbc_train, Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art +
## Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art +
## Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook +
## P_Cook:P_Art, data = bbbc_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97590 -0.22903 -0.10373 0.07595 1.00459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.653e-01 4.103e-02 8.903 < 2e-16 ***
## Gender1 -1.301e-01 1.983e-02 -6.558 7.35e-11 ***
## Amount_purchased 4.283e-04 1.259e-04 3.402 0.000687 ***
## Frequency -1.595e-02 3.045e-03 -5.240 1.82e-07 ***
## Last_purchase 7.577e-02 1.218e-02 6.219 6.37e-10 ***
## P_Child -1.051e-01 2.364e-02 -4.447 9.31e-06 ***
## P_Youth -4.392e-02 4.269e-02 -1.029 0.303753
## P_Cook -1.468e-01 2.011e-02 -7.302 4.47e-13 ***
## P_DIY -1.320e-01 1.973e-02 -6.689 3.11e-11 ***
## P_Art 2.570e-01 3.250e-02 7.906 4.93e-15 ***
## Amount_purchased:P_Youth -4.654e-04 1.697e-04 -2.743 0.006165 **
## Frequency:First_purchase 2.160e-04 7.231e-05 2.987 0.002857 **
## Frequency:P_Art -8.838e-03 1.936e-03 -4.564 5.40e-06 ***
## Last_purchase:P_Youth 9.515e-03 4.796e-03 1.984 0.047437 *
## P_Child:First_purchase -1.282e-03 5.160e-04 -2.485 0.013060 *
## P_Child:P_Cook 2.408e-02 8.469e-03 2.843 0.004524 **
## P_Cook:P_Art -2.946e-02 1.123e-02 -2.622 0.008821 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3746 on 1583 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2523
## F-statistic: 34.71 on 16 and 1583 DF, p-value: < 2.2e-16
cook_bbbc_lm2 = cooks.distance(bbbc_lm2)
plot(cook_bbbc_lm2, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")
large_cook_distance = which(cooks.distance(bbbc_lm2)>4/1600)
bbbc_lm2_trained = bbbc_train[-large_cook_distance,]
bbbc_lm2_1 = lm(data = bbbc_lm2_trained, Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art + Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2_1)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art +
## Amount_purchased:P_Youth + Frequency:First_purchase + Frequency:P_Art +
## Last_purchase:P_Youth + P_Child:First_purchase + P_Child:P_Cook +
## P_Cook:P_Art, data = bbbc_lm2_trained)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75671 -0.21055 -0.08308 0.07178 0.99302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.937e-01 3.935e-02 10.005 < 2e-16 ***
## Gender1 -1.179e-01 1.906e-02 -6.182 8.12e-10 ***
## Amount_purchased 3.403e-04 1.205e-04 2.823 0.004816 **
## Frequency -1.886e-02 2.951e-03 -6.390 2.20e-10 ***
## Last_purchase 7.137e-02 1.295e-02 5.512 4.18e-08 ***
## P_Child -1.026e-01 2.370e-02 -4.328 1.60e-05 ***
## P_Youth -1.044e-01 4.230e-02 -2.469 0.013654 *
## P_Cook -1.467e-01 1.988e-02 -7.381 2.59e-13 ***
## P_DIY -1.292e-01 2.017e-02 -6.407 1.98e-10 ***
## P_Art 2.995e-01 3.202e-02 9.352 < 2e-16 ***
## Amount_purchased:P_Youth -4.138e-04 1.758e-04 -2.354 0.018690 *
## Frequency:First_purchase 2.759e-04 7.112e-05 3.880 0.000109 ***
## Frequency:P_Art -1.081e-02 1.920e-03 -5.631 2.14e-08 ***
## Last_purchase:P_Youth 1.227e-02 4.977e-03 2.465 0.013798 *
## P_Child:First_purchase -1.318e-03 5.734e-04 -2.299 0.021629 *
## P_Child:P_Cook 1.889e-02 9.627e-03 1.962 0.049906 *
## P_Cook:P_Art -3.943e-02 1.323e-02 -2.979 0.002934 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3506 on 1508 degrees of freedom
## Multiple R-squared: 0.3055, Adjusted R-squared: 0.2981
## F-statistic: 41.46 on 16 and 1508 DF, p-value: < 2.2e-16
vif(bbbc_lm2_1)
## Gender Amount_purchased Frequency
## 1.007185 1.571742 6.550741
## Last_purchase P_Child P_Youth
## 16.041588 6.999756 7.730242
## P_Cook P_DIY P_Art
## 4.675060 2.113232 6.263660
## Amount_purchased:P_Youth Frequency:First_purchase Frequency:P_Art
## 10.366335 9.185984 4.366306
## Last_purchase:P_Youth P_Child:First_purchase P_Child:P_Cook
## 6.608138 9.556593 4.866074
## P_Cook:P_Art
## 2.813394
bbbc_lm2_2 = lm(data = bbbc_lm2_trained, Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + Last_purchase:P_Youth + P_Child:P_Cook + P_Cook:P_Art)
summary(bbbc_lm2_2)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art +
## Last_purchase:P_Youth + P_Child:P_Cook + P_Cook:P_Art, data = bbbc_lm2_trained)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82113 -0.20684 -0.10215 0.07888 0.94534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3573304 0.0320051 11.165 < 2e-16 ***
## Gender1 -0.1176548 0.0194322 -6.055 1.77e-09 ***
## Amount_purchased 0.0003135 0.0001071 2.928 0.003466 **
## Frequency -0.0090326 0.0013688 -6.599 5.71e-11 ***
## P_Child -0.0507128 0.0138707 -3.656 0.000265 ***
## P_Youth -0.0977169 0.0311716 -3.135 0.001753 **
## P_Cook -0.0554737 0.0153856 -3.606 0.000322 ***
## P_DIY -0.0422865 0.0156734 -2.698 0.007054 **
## P_Art 0.3691289 0.0276425 13.354 < 2e-16 ***
## Frequency:P_Art -0.0089962 0.0018532 -4.854 1.33e-06 ***
## P_Youth:Last_purchase 0.0089195 0.0043355 2.057 0.039829 *
## P_Child:P_Cook 0.0130521 0.0083536 1.562 0.118391
## P_Cook:P_Art -0.0362256 0.0134283 -2.698 0.007059 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3576 on 1512 degrees of freedom
## Multiple R-squared: 0.2758, Adjusted R-squared: 0.27
## F-statistic: 47.98 on 12 and 1512 DF, p-value: < 2.2e-16
vif(bbbc_lm2_2)
## Gender Amount_purchased Frequency
## 1.006296 1.192679 1.354652
## P_Child P_Youth P_Cook
## 2.304990 4.036195 2.693090
## P_DIY P_Art Frequency:P_Art
## 1.227421 4.487716 3.912225
## P_Youth:Last_purchase P_Child:P_Cook P_Cook:P_Art
## 4.822456 3.522952 2.785372
bbbc_lm2_3 = ols_step_backward_p(bbbc_lm2_2, prem = 0.05, details = F)
bbbc_lm2_3$model
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Coefficients:
## (Intercept) Gender1 Amount_purchased
## 0.3451476 -0.1159939 0.0003101
## Frequency P_Child P_Youth
## -0.0089816 -0.0359693 -0.0962058
## P_Cook P_DIY P_Art
## -0.0418821 -0.0401135 0.3697613
## Frequency:P_Art P_Youth:Last_purchase P_Cook:P_Art
## -0.0090406 0.0087674 -0.0361205
bbbc_lm2_3 = lm(data = bbbc_lm2_trained,Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art + Last_purchase:P_Youth + P_Cook:P_Art)
summary(bbbc_lm2_3)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Youth + P_Cook + P_DIY + P_Art + Frequency:P_Art +
## Last_purchase:P_Youth + P_Cook:P_Art, data = bbbc_lm2_trained)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80923 -0.20561 -0.10483 0.08567 0.93807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3451476 0.0310555 11.114 < 2e-16 ***
## Gender1 -0.1159939 0.0194123 -5.975 2.86e-09 ***
## Amount_purchased 0.0003101 0.0001071 2.896 0.003838 **
## Frequency -0.0089816 0.0013690 -6.561 7.34e-11 ***
## P_Child -0.0359693 0.0101712 -3.536 0.000418 ***
## P_Youth -0.0962058 0.0311714 -3.086 0.002063 **
## P_Cook -0.0418821 0.0126964 -3.299 0.000994 ***
## P_DIY -0.0401135 0.0156191 -2.568 0.010317 *
## P_Art 0.3697613 0.0276527 13.372 < 2e-16 ***
## Frequency:P_Art -0.0090406 0.0018538 -4.877 1.19e-06 ***
## P_Youth:Last_purchase 0.0087674 0.0043365 2.022 0.043375 *
## P_Cook:P_Art -0.0361205 0.0134345 -2.689 0.007253 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3578 on 1513 degrees of freedom
## Multiple R-squared: 0.2746, Adjusted R-squared: 0.2693
## F-statistic: 52.07 on 11 and 1513 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(bbbc_lm2_3, which=c(1:4))
## Apply Linear regression with all predictors and backward selection
bbbc_lm3 = ols_step_backward_p(bbbc_lm, prem = 0.05, details = F)
bbbc_lm3$model
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Coefficients:
## (Intercept) Gender1 Amount_purchased Frequency
## 0.3727367 -0.1316464 0.0002742 -0.0110830
## Last_purchase P_Child P_Youth P_Cook
## 0.0894288 -0.1275991 -0.0973642 -0.1433497
## P_DIY P_Art
## -0.1365578 0.1150034
bbbc_lm3 = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train)
summary(bbbc_lm3)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art,
## data = bbbc_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9802 -0.2452 -0.1157 0.1655 1.0595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3727367 0.0297590 12.525 < 2e-16 ***
## Gender1 -0.1316464 0.0200208 -6.575 6.56e-11 ***
## Amount_purchased 0.0002742 0.0001110 2.470 0.0136 *
## Frequency -0.0110830 0.0012128 -9.138 < 2e-16 ***
## Last_purchase 0.0894288 0.0116772 7.658 3.25e-14 ***
## P_Child -0.1275991 0.0163571 -7.801 1.11e-14 ***
## P_Youth -0.0973642 0.0200903 -4.846 1.38e-06 ***
## P_Cook -0.1433497 0.0165218 -8.676 < 2e-16 ***
## P_DIY -0.1365578 0.0197520 -6.914 6.82e-12 ***
## P_Art 0.1150034 0.0192719 5.967 2.97e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 1590 degrees of freedom
## Multiple R-squared: 0.2395, Adjusted R-squared: 0.2352
## F-statistic: 55.63 on 9 and 1590 DF, p-value: < 2.2e-16
cook_bbbc_lm3 = cooks.distance(bbbc_lm3)
plot(cook_bbbc_lm3, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")
large_cook_distance = which(cooks.distance(bbbc_lm3)>4/1600)
bbbc_lm3_trained = bbbc_train[-large_cook_distance,]
bbbc_lm3_1 = lm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
summary(bbbc_lm3_1)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art,
## data = bbbc_lm3_trained)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72682 -0.22418 -0.09923 0.12558 0.98094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3589669 0.0283022 12.683 < 2e-16 ***
## Gender1 -0.1133411 0.0191199 -5.928 3.79e-09 ***
## Amount_purchased 0.0002985 0.0001063 2.808 0.00505 **
## Frequency -0.0120967 0.0011619 -10.411 < 2e-16 ***
## Last_purchase 0.0837024 0.0122794 6.816 1.34e-11 ***
## P_Child -0.1273699 0.0168130 -7.576 6.18e-14 ***
## P_Youth -0.1241418 0.0204237 -6.078 1.53e-09 ***
## P_Cook -0.1455776 0.0167062 -8.714 < 2e-16 ***
## P_DIY -0.1276728 0.0198407 -6.435 1.65e-10 ***
## P_Art 0.1490765 0.0196550 7.585 5.78e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3523 on 1517 degrees of freedom
## Multiple R-squared: 0.2935, Adjusted R-squared: 0.2893
## F-statistic: 70.03 on 9 and 1517 DF, p-value: < 2.2e-16
vif(bbbc_lm3_1)
## Gender Amount_purchased Frequency Last_purchase
## 1.003393 1.226169 1.009234 15.034206
## P_Child P_Youth P_Cook P_DIY
## 3.617257 1.820319 3.384715 2.089291
## P_Art
## 2.401423
bbbc_lm3_2 = lm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
summary(bbbc_lm3_2)
##
## Call:
## lm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_lm3_trained)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7698 -0.2261 -0.1036 0.1216 0.9464
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3584078 0.0287228 12.478 < 2e-16 ***
## Gender1 -0.1108832 0.0194007 -5.715 1.31e-08 ***
## Amount_purchased 0.0003941 0.0001069 3.685 0.000236 ***
## Frequency -0.0121444 0.0011792 -10.299 < 2e-16 ***
## P_Child -0.0336805 0.0098271 -3.427 0.000626 ***
## P_Youth -0.0360643 0.0160518 -2.247 0.024800 *
## P_Cook -0.0542681 0.0101315 -5.356 9.80e-08 ***
## P_DIY -0.0379397 0.0150652 -2.518 0.011892 *
## P_Art 0.2472133 0.0135798 18.205 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3575 on 1518 degrees of freedom
## Multiple R-squared: 0.2719, Adjusted R-squared: 0.268
## F-statistic: 70.85 on 8 and 1518 DF, p-value: < 2.2e-16
vif(bbbc_lm3_2)
## Gender Amount_purchased Frequency P_Child
## 1.003036 1.204798 1.009197 1.199848
## P_Youth P_Cook P_DIY P_Art
## 1.091722 1.208629 1.169544 1.112987
par(mfrow=c(2,2))
plot(bbbc_lm3_2, which=c(1:4))
#### Choice as factor variable
bbbc_train$Choice = as.factor(bbbc_train$Choice)
bbbc_test$Choice = as.factor(bbbc_test$Choice)
bbbc_log = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_train, family = binomial)
summary(bbbc_log)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + First_purchase + P_Child + P_Youth + P_Cook +
## P_DIY + P_Art, family = binomial, data = bbbc_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.38586 -0.66728 -0.43696 -0.02242 2.72238
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3515281 0.2143839 -1.640 0.1011
## Gender1 -0.8632319 0.1374499 -6.280 3.38e-10 ***
## Amount_purchased 0.0018641 0.0007918 2.354 0.0186 *
## Frequency -0.0755142 0.0165937 -4.551 5.35e-06 ***
## Last_purchase 0.6117713 0.0938127 6.521 6.97e-11 ***
## First_purchase -0.0147792 0.0128027 -1.154 0.2483
## P_Child -0.8112489 0.1167067 -6.951 3.62e-12 ***
## P_Youth -0.6370422 0.1433778 -4.443 8.87e-06 ***
## P_Cook -0.9230066 0.1194814 -7.725 1.12e-14 ***
## P_DIY -0.9058697 0.1437025 -6.304 2.90e-10 ***
## P_Art 0.6861124 0.1270176 5.402 6.60e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1392.2 on 1589 degrees of freedom
## AIC: 1414.2
##
## Number of Fisher Scoring iterations: 5
bbbc_log_Pred_Prob = predict.glm(bbbc_log, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1968 128
## 1 125 79
##
## Accuracy : 0.89
## 95% CI : (0.8765, 0.9025)
## No Information Rate : 0.91
## P-Value [Acc > NIR] : 0.9995
##
## Kappa : 0.324
##
## Mcnemar's Test P-Value : 0.8999
##
## Sensitivity : 0.38164
## Specificity : 0.94028
## Pos Pred Value : 0.38725
## Neg Pred Value : 0.93893
## Prevalence : 0.09000
## Detection Rate : 0.03435
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.66096
##
## 'Positive' Class : 1
##
bbbc_log2 = glm(Choice ~ (.)^2, data = bbbc_train, family = binomial)
summary(bbbc_log2)
##
## Call:
## glm(formula = Choice ~ (.)^2, family = binomial, data = bbbc_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.66574 -0.65647 -0.42036 -0.00011 2.62619
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.924e-01 4.846e-01 -1.016 0.3096
## Gender1 -9.323e-01 4.281e-01 -2.178 0.0294 *
## Amount_purchased 2.517e-04 2.169e-03 0.116 0.9076
## Frequency -3.397e-02 6.206e-02 -0.547 0.5841
## Last_purchase 6.055e-01 3.723e-01 1.627 0.1038
## First_purchase -5.060e-02 5.556e-02 -0.911 0.3625
## P_Child -7.134e-01 4.117e-01 -1.733 0.0832 .
## P_Youth -1.269e-02 5.098e-01 -0.025 0.9801
## P_Cook -2.425e-01 4.232e-01 -0.573 0.5665
## P_DIY -5.820e-01 5.164e-01 -1.127 0.2597
## P_Art 8.205e-01 4.654e-01 1.763 0.0779 .
## Gender1:Amount_purchased 2.726e-03 1.709e-03 1.596 0.1106
## Gender1:Frequency -5.859e-02 3.833e-02 -1.528 0.1264
## Gender1:Last_purchase -2.992e-01 2.165e-01 -1.382 0.1669
## Gender1:First_purchase 4.681e-02 3.148e-02 1.487 0.1370
## Gender1:P_Child 2.939e-02 2.576e-01 0.114 0.9092
## Gender1:P_Youth 6.852e-02 3.332e-01 0.206 0.8371
## Gender1:P_Cook 2.024e-02 2.654e-01 0.076 0.9392
## Gender1:P_DIY -5.411e-01 3.241e-01 -1.670 0.0950 .
## Gender1:P_Art 3.834e-01 2.830e-01 1.355 0.1755
## Amount_purchased:Frequency 3.692e-05 2.183e-04 0.169 0.8657
## Amount_purchased:Last_purchase 1.662e-04 1.162e-03 0.143 0.8862
## Amount_purchased:First_purchase -3.205e-05 1.743e-04 -0.184 0.8541
## Amount_purchased:P_Child 1.779e-03 1.448e-03 1.228 0.2193
## Amount_purchased:P_Youth -4.112e-03 1.791e-03 -2.296 0.0217 *
## Amount_purchased:P_Cook -1.025e-03 1.520e-03 -0.674 0.5001
## Amount_purchased:P_DIY 1.438e-03 1.843e-03 0.780 0.4353
## Amount_purchased:P_Art 4.243e-04 1.593e-03 0.266 0.7899
## Frequency:Last_purchase 1.202e-02 1.514e-02 0.794 0.4272
## Frequency:First_purchase 7.832e-04 9.614e-04 0.815 0.4153
## Frequency:P_Child -2.940e-02 2.484e-02 -1.183 0.2366
## Frequency:P_Youth -2.877e-02 3.080e-02 -0.934 0.3502
## Frequency:P_Cook -3.762e-02 2.471e-02 -1.523 0.1279
## Frequency:P_DIY -6.784e-03 3.043e-02 -0.223 0.8236
## Frequency:P_Art -3.816e-02 2.808e-02 -1.359 0.1742
## Last_purchase:First_purchase 1.447e-03 7.004e-03 0.207 0.8363
## Last_purchase:P_Child -2.727e-02 6.489e-02 -0.420 0.6742
## Last_purchase:P_Youth 1.975e-01 1.167e-01 1.693 0.0905 .
## Last_purchase:P_Cook -1.965e-02 7.094e-02 -0.277 0.7817
## Last_purchase:P_DIY -1.001e-01 1.043e-01 -0.959 0.3374
## Last_purchase:P_Art 3.443e-02 1.365e-01 0.252 0.8009
## First_purchase:P_Child -8.368e-03 1.401e-02 -0.597 0.5503
## First_purchase:P_Youth 3.070e-03 1.909e-02 0.161 0.8722
## First_purchase:P_Cook 3.805e-03 1.363e-02 0.279 0.7801
## First_purchase:P_DIY 4.674e-03 1.819e-02 0.257 0.7973
## First_purchase:P_Art -2.875e-03 1.800e-02 -0.160 0.8731
## P_Child:P_Youth -1.283e-01 1.824e-01 -0.703 0.4820
## P_Child:P_Cook 2.110e-01 1.200e-01 1.757 0.0788 .
## P_Child:P_DIY 3.803e-02 1.746e-01 0.218 0.8276
## P_Child:P_Art -7.824e-02 1.874e-01 -0.418 0.6762
## P_Youth:P_Cook -3.904e-01 1.808e-01 -2.159 0.0309 *
## P_Youth:P_DIY -2.996e-01 2.589e-01 -1.157 0.2472
## P_Youth:P_Art 3.997e-02 2.307e-01 0.173 0.8625
## P_Cook:P_DIY -8.013e-02 1.985e-01 -0.404 0.6864
## P_Cook:P_Art -1.709e-01 1.856e-01 -0.921 0.3571
## P_DIY:P_Art 2.937e-01 2.351e-01 1.250 0.2114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1340.0 on 1544 degrees of freedom
## AIC: 1452
##
## Number of Fisher Scoring iterations: 6
bbbc_log_Pred_Prob = predict.glm(bbbc_log2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1975 121
## 1 124 80
##
## Accuracy : 0.8935
## 95% CI : (0.8801, 0.9058)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : 0.9993
##
## Kappa : 0.3367
##
## Mcnemar's Test P-Value : 0.8983
##
## Sensitivity : 0.39801
## Specificity : 0.94092
## Pos Pred Value : 0.39216
## Neg Pred Value : 0.94227
## Prevalence : 0.08739
## Detection Rate : 0.03478
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.66947
##
## 'Positive' Class : 1
##
bbbc_log3 = step(bbbc_log2, direction = "backward", trace = FALSE)
summary(bbbc_log3)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + First_purchase + P_Child + P_Youth + P_Cook +
## P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY +
## Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth +
## Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook +
## P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, family = binomial,
## data = bbbc_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.08079 -0.65926 -0.42548 0.01452 2.70453
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1777724 0.3168490 -0.561 0.574754
## Gender1 -1.2238857 0.3354875 -3.648 0.000264 ***
## Amount_purchased 0.0005102 0.0013783 0.370 0.711236
## Frequency -0.0955830 0.0186950 -5.113 3.17e-07 ***
## Last_purchase 0.5393426 0.1044378 5.164 2.41e-07 ***
## First_purchase 0.0112353 0.0164963 0.681 0.495821
## P_Child -1.0831641 0.2356436 -4.597 4.29e-06 ***
## P_Youth -0.3039130 0.3238519 -0.938 0.348022
## P_Cook -0.8759147 0.1640133 -5.341 9.27e-08 ***
## P_DIY -0.4908395 0.2726998 -1.800 0.071872 .
## P_Art 0.7101141 0.1653700 4.294 1.75e-05 ***
## Gender1:Amount_purchased 0.0025071 0.0015265 1.642 0.100504
## Gender1:P_DIY -0.5392425 0.2355548 -2.289 0.022065 *
## Amount_purchased:P_Child 0.0019907 0.0008728 2.281 0.022559 *
## Amount_purchased:P_Youth -0.0037796 0.0013114 -2.882 0.003951 **
## Last_purchase:P_Youth 0.1459361 0.0490033 2.978 0.002901 **
## Last_purchase:P_DIY -0.0772748 0.0393397 -1.964 0.049496 *
## First_purchase:P_Child -0.0134187 0.0048060 -2.792 0.005238 **
## P_Child:P_Cook 0.1745490 0.0689410 2.532 0.011346 *
## P_Youth:P_Cook -0.2992836 0.1421260 -2.106 0.035225 *
## P_Cook:P_Art -0.1614371 0.0903445 -1.787 0.073953 .
## P_DIY:P_Art 0.3362011 0.1614869 2.082 0.037351 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1358.2 on 1578 degrees of freedom
## AIC: 1402.2
##
## Number of Fisher Scoring iterations: 5
bbbc_log_Pred_Prob = predict.glm(bbbc_log3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1973 123
## 1 120 84
##
## Accuracy : 0.8943
## 95% CI : (0.8811, 0.9066)
## No Information Rate : 0.91
## P-Value [Acc > NIR] : 0.9954
##
## Kappa : 0.3508
##
## Mcnemar's Test P-Value : 0.8979
##
## Sensitivity : 0.40580
## Specificity : 0.94267
## Pos Pred Value : 0.41176
## Neg Pred Value : 0.94132
## Prevalence : 0.09000
## Detection Rate : 0.03652
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.67423
##
## 'Positive' Class : 1
##
cook_distance_log3 = cooks.distance(bbbc_log3)
plot(cook_distance_log3, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")
#### Remove outliers
large_cook_distance = which(cooks.distance(bbbc_log3)>(4/1600))
bbbc_log3_trained = bbbc_train[-large_cook_distance,]
bbbc_log3_2 = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + First_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY + Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth + Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, data = bbbc_log3_trained, family = binomial)
summary(bbbc_log3_2)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + First_purchase + P_Child + P_Youth + P_Cook +
## P_DIY + P_Art + Gender:Amount_purchased + Gender:P_DIY +
## Amount_purchased:P_Child + Amount_purchased:P_Youth + Last_purchase:P_Youth +
## Last_purchase:P_DIY + First_purchase:P_Child + P_Child:P_Cook +
## P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, family = binomial,
## data = bbbc_log3_trained)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.08528 -0.55742 -0.28775 -0.02826 2.74734
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0045600 0.3819120 0.012 0.990474
## Gender1 -1.2870270 0.4036421 -3.189 0.001430 **
## Amount_purchased 0.0009134 0.0016268 0.561 0.574471
## Frequency -0.1372397 0.0272378 -5.039 4.69e-07 ***
## Last_purchase 0.7056159 0.1516651 4.652 3.28e-06 ***
## First_purchase 0.0090064 0.0241811 0.372 0.709553
## P_Child -1.3932946 0.3178214 -4.384 1.17e-05 ***
## P_Youth -0.8798479 0.4633697 -1.899 0.057590 .
## P_Cook -1.1053726 0.1975767 -5.595 2.21e-08 ***
## P_DIY -0.6485602 0.3615495 -1.794 0.072839 .
## P_Art 0.7290163 0.1954992 3.729 0.000192 ***
## Gender1:Amount_purchased 0.0030971 0.0018839 1.644 0.100182
## Gender1:P_DIY -1.2880630 0.3720225 -3.462 0.000536 ***
## Amount_purchased:P_Child 0.0031359 0.0013007 2.411 0.015913 *
## Amount_purchased:P_Youth -0.0063309 0.0020373 -3.107 0.001887 **
## Last_purchase:P_Youth 0.4049317 0.0967919 4.184 2.87e-05 ***
## Last_purchase:P_DIY -0.1518262 0.0756022 -2.008 0.044619 *
## First_purchase:P_Child -0.0260979 0.0077131 -3.384 0.000715 ***
## P_Child:P_Cook 0.2090575 0.1327920 1.574 0.115413
## P_Youth:P_Cook -1.1989748 0.3181952 -3.768 0.000165 ***
## P_Cook:P_Art 0.0207649 0.1465806 0.142 0.887347
## P_DIY:P_Art 0.8832159 0.3110327 2.840 0.004517 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1584.7 on 1502 degrees of freedom
## Residual deviance: 1014.2 on 1481 degrees of freedom
## AIC: 1058.2
##
## Number of Fisher Scoring iterations: 7
bbbc_log_Pred_Prob = predict.glm(bbbc_log3_2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1946 150
## 1 120 84
##
## Accuracy : 0.8826
## 95% CI : (0.8687, 0.8955)
## No Information Rate : 0.8983
## P-Value [Acc > NIR] : 0.99328
##
## Kappa : 0.319
##
## Mcnemar's Test P-Value : 0.07758
##
## Sensitivity : 0.35897
## Specificity : 0.94192
## Pos Pred Value : 0.41176
## Neg Pred Value : 0.92844
## Prevalence : 0.10174
## Detection Rate : 0.03652
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.65045
##
## 'Positive' Class : 1
##
vif(bbbc_log3_2)
## Gender Amount_purchased Frequency
## 6.327947 3.231413 3.959253
## Last_purchase First_purchase P_Child
## 18.097883 14.781570 9.903587
## P_Youth P_Cook P_DIY
## 7.031414 3.818008 4.813335
## P_Art Gender:Amount_purchased Gender:P_DIY
## 2.749984 8.253189 2.942640
## Amount_purchased:P_Child Amount_purchased:P_Youth Last_purchase:P_Youth
## 13.202736 8.600721 12.389289
## Last_purchase:P_DIY First_purchase:P_Child P_Child:P_Cook
## 10.272898 9.451090 4.470351
## P_Youth:P_Cook P_Cook:P_Art P_DIY:P_Art
## 5.266006 4.369223 6.828573
bbbc_log3_3 = glm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY + P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art, data = bbbc_log3_trained, family = binomial)
summary(bbbc_log3_3)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY +
## P_Child:P_Cook + P_Youth:P_Cook + P_Cook:P_Art + P_DIY:P_Art,
## family = binomial, data = bbbc_log3_trained)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.12475 -0.59325 -0.35155 -0.08197 2.61190
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1466637 0.2597245 -0.565 0.572285
## Gender1 -0.7279068 0.1719913 -4.232 2.31e-05 ***
## Amount_purchased 0.0033546 0.0009092 3.690 0.000225 ***
## Frequency -0.1335029 0.0136510 -9.780 < 2e-16 ***
## P_Child -0.3777533 0.1237906 -3.052 0.002277 **
## P_Youth -0.0992598 0.1901371 -0.522 0.601640
## P_Cook -0.4114009 0.1542256 -2.668 0.007641 **
## P_DIY -0.5006975 0.2532521 -1.977 0.048034 *
## P_Art 1.5406510 0.1541179 9.997 < 2e-16 ***
## Gender1:P_DIY -0.7268812 0.3192180 -2.277 0.022782 *
## P_Child:P_Cook 0.0939201 0.0924061 1.016 0.309447
## P_Youth:P_Cook -0.2982718 0.1659704 -1.797 0.072314 .
## P_Cook:P_Art -0.0919085 0.1247962 -0.736 0.461445
## P_DIY:P_Art 0.5070186 0.2025299 2.503 0.012300 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1584.7 on 1502 degrees of freedom
## Residual deviance: 1106.2 on 1489 degrees of freedom
## AIC: 1134.2
##
## Number of Fisher Scoring iterations: 6
bbbc_log_Pred_Prob = predict.glm(bbbc_log3_3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1976 120
## 1 130 74
##
## Accuracy : 0.8913
## 95% CI : (0.8779, 0.9037)
## No Information Rate : 0.9157
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.3124
##
## Mcnemar's Test P-Value : 0.5692
##
## Sensitivity : 0.38144
## Specificity : 0.93827
## Pos Pred Value : 0.36275
## Neg Pred Value : 0.94275
## Prevalence : 0.08435
## Detection Rate : 0.03217
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.65986
##
## 'Positive' Class : 1
##
vif(bbbc_log3_3)
## Gender Amount_purchased Frequency P_Child
## 1.244340 1.134827 1.052314 1.982088
## P_Youth P_Cook P_DIY P_Art
## 1.721617 2.835163 3.023992 1.965903
## Gender:P_DIY P_Child:P_Cook P_Youth:P_Cook P_Cook:P_Art
## 3.002732 3.658630 2.453211 4.223305
## P_DIY:P_Art
## 3.828999
bbbc_log3_4 = step(bbbc_log3_3, direction = "backward", trace = FALSE)
summary(bbbc_log3_4)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Youth + P_Cook + P_DIY + P_Art + Gender:P_DIY +
## P_Youth:P_Cook + P_DIY:P_Art, family = binomial, data = bbbc_log3_trained)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.09834 -0.59177 -0.35310 -0.07416 2.60709
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1748168 0.2481528 -0.704 0.481139
## Gender1 -0.7243421 0.1717539 -4.217 2.47e-05 ***
## Amount_purchased 0.0033395 0.0009082 3.677 0.000236 ***
## Frequency -0.1328249 0.0135764 -9.784 < 2e-16 ***
## P_Child -0.3105357 0.0966700 -3.212 0.001317 **
## P_Youth -0.0940805 0.1890695 -0.498 0.618768
## P_Cook -0.3929480 0.1164072 -3.376 0.000736 ***
## P_DIY -0.4764514 0.2524590 -1.887 0.059128 .
## P_Art 1.5095302 0.1366903 11.043 < 2e-16 ***
## Gender1:P_DIY -0.7203885 0.3188350 -2.259 0.023856 *
## P_Youth:P_Cook -0.3141036 0.1653946 -1.899 0.057549 .
## P_DIY:P_Art 0.4675144 0.1975315 2.367 0.017943 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1584.7 on 1502 degrees of freedom
## Residual deviance: 1107.4 on 1491 degrees of freedom
## AIC: 1131.4
##
## Number of Fisher Scoring iterations: 6
bbbc_log_Pred_Prob = predict.glm(bbbc_log3_4, newdata = bbbc_test, type = "response")
book_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .18178,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1976 120
## 1 130 74
##
## Accuracy : 0.8913
## 95% CI : (0.8779, 0.9037)
## No Information Rate : 0.9157
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.3124
##
## Mcnemar's Test P-Value : 0.5692
##
## Sensitivity : 0.38144
## Specificity : 0.93827
## Pos Pred Value : 0.36275
## Neg Pred Value : 0.94275
## Prevalence : 0.08435
## Detection Rate : 0.03217
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.65986
##
## 'Positive' Class : 1
##
par(mfrow=c(2,2))
plot(bbbc_log3_4, which=c(1:4))
pred = prediction(predict(bbbc_log3_4, newdata = bbbc_test, type = "response"), bbbc_test$Choice)
auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
bbbc_log4 = step(bbbc_log, direction = "backward", trace = FALSE)
summary(bbbc_log4)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art,
## family = binomial, data = bbbc_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.44132 -0.66647 -0.43745 -0.01855 2.72460
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2833949 0.2062721 -1.374 0.1695
## Gender1 -0.8660575 0.1373268 -6.307 2.85e-10 ***
## Amount_purchased 0.0018357 0.0007908 2.321 0.0203 *
## Frequency -0.0903261 0.0106304 -8.497 < 2e-16 ***
## Last_purchase 0.5536689 0.0784519 7.057 1.70e-12 ***
## P_Child -0.8181807 0.1163377 -7.033 2.02e-12 ***
## P_Youth -0.6424923 0.1432548 -4.485 7.29e-06 ***
## P_Cook -0.9330131 0.1190073 -7.840 4.51e-15 ***
## P_DIY -0.9101106 0.1433591 -6.348 2.17e-10 ***
## P_Art 0.6643371 0.1255243 5.292 1.21e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1799.5 on 1599 degrees of freedom
## Residual deviance: 1393.5 on 1590 degrees of freedom
## AIC: 1413.5
##
## Number of Fisher Scoring iterations: 5
bbbc_log_Pred_Prob = predict.glm(bbbc_log4, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1973 123
## 1 127 77
##
## Accuracy : 0.8913
## 95% CI : (0.8779, 0.9037)
## No Information Rate : 0.913
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.3216
##
## Mcnemar's Test P-Value : 0.8495
##
## Sensitivity : 0.38500
## Specificity : 0.93952
## Pos Pred Value : 0.37745
## Neg Pred Value : 0.94132
## Prevalence : 0.08696
## Detection Rate : 0.03348
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.66226
##
## 'Positive' Class : 1
##
cook_distance_log4 = cooks.distance(bbbc_log4)
plot(cook_distance_log4, col = "red", pch = 20, cex = 1)
abline(h = 4/1600, lty = 2, col = "black")
#### Remove outliers
large_cook_distance = which(cooks.distance(bbbc_log4)>(4/1600))
bbbc_log4_trained = bbbc_train[-large_cook_distance,]
bbbc_log4_2 = glm(Choice ~ Gender + Amount_purchased + Frequency + Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_log4_trained, family = binomial)
summary(bbbc_log4_2)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## Last_purchase + P_Child + P_Youth + P_Cook + P_DIY + P_Art,
## family = binomial, data = bbbc_log4_trained)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.12566 -0.52618 -0.29449 -0.07674 2.78007
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0359902 0.2490070 -0.145 0.88508
## Gender1 -0.9746586 0.1653465 -5.895 3.75e-09 ***
## Amount_purchased 0.0025809 0.0009611 2.685 0.00725 **
## Frequency -0.1531425 0.0151177 -10.130 < 2e-16 ***
## Last_purchase 0.8096840 0.1094971 7.395 1.42e-13 ***
## P_Child -1.2429659 0.1634518 -7.604 2.86e-14 ***
## P_Youth -1.2818878 0.2081274 -6.159 7.31e-10 ***
## P_Cook -1.3858412 0.1629903 -8.503 < 2e-16 ***
## P_DIY -1.4611262 0.1967092 -7.428 1.10e-13 ***
## P_Art 0.9484052 0.1614054 5.876 4.21e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1558.29 on 1495 degrees of freedom
## Residual deviance: 999.36 on 1486 degrees of freedom
## AIC: 1019.4
##
## Number of Fisher Scoring iterations: 6
bbbc_log_Pred_Prob = predict.glm(bbbc_log4_2, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .5,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1946 150
## 1 119 85
##
## Accuracy : 0.883
## 95% CI : (0.8692, 0.8959)
## No Information Rate : 0.8978
## P-Value [Acc > NIR] : 0.99024
##
## Kappa : 0.323
##
## Mcnemar's Test P-Value : 0.06738
##
## Sensitivity : 0.36170
## Specificity : 0.94237
## Pos Pred Value : 0.41667
## Neg Pred Value : 0.92844
## Prevalence : 0.10217
## Detection Rate : 0.03696
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.65204
##
## 'Positive' Class : 1
##
vif(bbbc_log4_2)
## Gender Amount_purchased Frequency Last_purchase
## 1.034201 1.191262 1.078824 12.954224
## P_Child P_Youth P_Cook P_DIY
## 3.147356 1.838353 3.218469 2.048614
## P_Art
## 1.984071
bbbc_log4_3 = glm(Choice ~ Gender + Amount_purchased + Frequency + P_Child + P_Youth + P_Cook + P_DIY + P_Art, data = bbbc_log4_trained, family = binomial)
summary(bbbc_log4_3)
##
## Call:
## glm(formula = Choice ~ Gender + Amount_purchased + Frequency +
## P_Child + P_Youth + P_Cook + P_DIY + P_Art, family = binomial,
## data = bbbc_log4_trained)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.03973 -0.55738 -0.32735 -0.09369 2.56494
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0934399 0.2419181 -0.386 0.699315
## Gender1 -0.9195610 0.1606789 -5.723 1.05e-08 ***
## Amount_purchased 0.0034735 0.0009316 3.729 0.000192 ***
## Frequency -0.1442607 0.0143420 -10.059 < 2e-16 ***
## P_Child -0.3050191 0.0955213 -3.193 0.001407 **
## P_Youth -0.3524054 0.1523640 -2.313 0.020727 *
## P_Cook -0.4693417 0.0979090 -4.794 1.64e-06 ***
## P_DIY -0.5691434 0.1467527 -3.878 0.000105 ***
## P_Art 1.7802192 0.1261186 14.115 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1558.3 on 1495 degrees of freedom
## Residual deviance: 1057.9 on 1487 degrees of freedom
## AIC: 1075.9
##
## Number of Fisher Scoring iterations: 6
bbbc_log_Pred_Prob = predict.glm(bbbc_log4_3, newdata = bbbc_test, type = "response")
bbbc_log_Pred_Sur = ifelse(bbbc_log_Pred_Prob >= .16438,1,0)
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_log_Pred_Sur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1473 623
## 1 61 143
##
## Accuracy : 0.7026
## 95% CI : (0.6835, 0.7212)
## No Information Rate : 0.667
## P-Value [Acc > NIR] : 0.0001376
##
## Kappa : 0.18
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.18668
## Specificity : 0.96023
## Pos Pred Value : 0.70098
## Neg Pred Value : 0.70277
## Prevalence : 0.33304
## Detection Rate : 0.06217
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.57346
##
## 'Positive' Class : 1
##
OR= exp(bbbc_log4_3$coefficients)
round(OR, 3)
## (Intercept) Gender1 Amount_purchased Frequency
## 0.911 0.399 1.003 0.866
## P_Child P_Youth P_Cook P_DIY
## 0.737 0.703 0.625 0.566
## P_Art
## 5.931
vif(bbbc_log4_3)
## Gender Amount_purchased Frequency P_Child
## 1.032861 1.169365 1.058566 1.169014
## P_Youth P_Cook P_DIY P_Art
## 1.087578 1.200231 1.192834 1.292923
par(mfrow=c(2,2))
plot(bbbc_log4_3, which=c(1:4))
probabilities <- predict(bbbc_log4_3, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Select only numeric variables
mydata = dplyr::select_if(bbbc_log4_trained, is.numeric)
predictors <- colnames(mydata)
# Binding the logit and tidying the data for plot
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities)))
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)
#making the plot
ggplot(mydata, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
pred = prediction(predict(bbbc_log4_3, newdata = bbbc_test, type = "response"), bbbc_test$Choice)
auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
## Linear SVM
set.seed(1)
form1 = Choice ~ .
tuned_svm = tune.svm(form1, data = bbbc_train,kernel = "linear", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm$best.parameters
## gamma cost
## 61 0.01 0.7
bbbc_svm = svm(Choice ~., data = bbbc_train, kernel = "linear", gamma = tuned_svm$best.parameters$gamma, cost = tuned_svm$best.parameters$cost)
summary(bbbc_svm)
##
## Call:
## svm(formula = Choice ~ ., data = bbbc_train, kernel = "linear", gamma = tuned_svm$best.parameters$gamma,
## cost = tuned_svm$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.7
##
## Number of Support Vectors: 739
##
## ( 367 372 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
bbbc_svm_predict1 = predict(bbbc_svm, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict1), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2011 85
## 1 147 57
##
## Accuracy : 0.8991
## 95% CI : (0.8861, 0.9111)
## No Information Rate : 0.9383
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2768
##
## Mcnemar's Test P-Value : 6.206e-05
##
## Sensitivity : 0.40141
## Specificity : 0.93188
## Pos Pred Value : 0.27941
## Neg Pred Value : 0.95945
## Prevalence : 0.06174
## Detection Rate : 0.02478
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.66664
##
## 'Positive' Class : 1
##
set.seed(1)
tuned_svm2 = tune.svm(form1, data = bbbc_train,kernel = "polynomial", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm2$best.parameters
## gamma cost
## 90 0.1 0.9
bbbc_svm2 = svm(formula = form1, data = bbbc_train,kernel = "polynomial", gamma = tuned_svm2$best.parameters$gamma, cost = tuned_svm2$best.parameters$cost)
summary(bbbc_svm2)
##
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "polynomial", gamma = tuned_svm2$best.parameters$gamma,
## cost = tuned_svm2$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.9
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 749
##
## ( 356 393 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
bbbc_svm_predict2 = predict(bbbc_svm2, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict2), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2050 46
## 1 168 36
##
## Accuracy : 0.907
## 95% CI : (0.8943, 0.9185)
## No Information Rate : 0.9643
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2117
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.43902
## Specificity : 0.92426
## Pos Pred Value : 0.17647
## Neg Pred Value : 0.97805
## Prevalence : 0.03565
## Detection Rate : 0.01565
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.68164
##
## 'Positive' Class : 1
##
set.seed(1)
tuned_svm3 = tune.svm(form1, data = bbbc_train,kernel = "radial", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm3$best.parameters
## gamma cost
## 75 0.05 0.8
bbbc_svm3 = svm(formula = form1, data = bbbc_train,kernel = "radial", gamma = tuned_svm3$best.parameters$gamma, cost = tuned_svm3$best.parameters$cost)
summary(bbbc_svm3)
##
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "radial", gamma = tuned_svm3$best.parameters$gamma,
## cost = tuned_svm3$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.8
##
## Number of Support Vectors: 782
##
## ( 371 411 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
bbbc_svm_predict3 = predict(bbbc_svm3, bbbc_test, type = "response")
caret::confusionMatrix(as.factor(bbbc_test$Choice), as.factor(bbbc_svm_predict3), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2054 42
## 1 166 38
##
## Accuracy : 0.9096
## 95% CI : (0.8971, 0.921)
## No Information Rate : 0.9652
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2291
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.47500
## Specificity : 0.92523
## Pos Pred Value : 0.18627
## Neg Pred Value : 0.97996
## Prevalence : 0.03478
## Detection Rate : 0.01652
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.70011
##
## 'Positive' Class : 1
##
set.seed(1)
tuned_svm4 = tune.svm(form1, data = bbbc_train,kernel = "sigmoid", gamma = seq(.01, .1, by = .01), cost = seq(.1, 1, by = .1))
tuned_svm4$best.parameters
## gamma cost
## 53 0.03 0.6
bbbc_svm4 = svm(formula = form1, data = bbbc_train,kernel = "sigmoid", gamma = tuned_svm4$best.parameters$gamma, cost = tuned_svm4$best.parameters$cost)
summary(bbbc_svm4)
##
## Call:
## svm(formula = form1, data = bbbc_train, kernel = "sigmoid", gamma = tuned_svm4$best.parameters$gamma,
## cost = tuned_svm4$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 0.6
## coef.0: 0
##
## Number of Support Vectors: 762
##
## ( 379 383 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
bbbc_svm_predict3 = predict(bbbc_svm4, bbbc_test, type = "response")
caret::confusionMatrix(bbbc_test$Choice, as.factor(bbbc_svm_predict3), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1983 113
## 1 144 60
##
## Accuracy : 0.8883
## 95% CI : (0.8747, 0.9009)
## No Information Rate : 0.9248
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.2579
##
## Mcnemar's Test P-Value : 0.0613
##
## Sensitivity : 0.34682
## Specificity : 0.93230
## Pos Pred Value : 0.29412
## Neg Pred Value : 0.94609
## Prevalence : 0.07522
## Detection Rate : 0.02609
## Detection Prevalence : 0.08870
## Balanced Accuracy : 0.63956
##
## 'Positive' Class : 1
##
proportion_00 = 1473/2300
proportion_01 = 623/2300
proportion_10 = 61/2300
proportion_11 = 143/2300
new_customer = 50000
print("Proportion of people who are not going to buy book")
## [1] "Proportion of people who are not going to buy book"
proportion_00
## [1] 0.6404348
print("Proportion of people who we predicted would not buy, but did buy")
## [1] "Proportion of people who we predicted would not buy, but did buy"
proportion_01
## [1] 0.2708696
print("Proportion of people who we predicted would buy, but did not buy")
## [1] "Proportion of people who we predicted would buy, but did not buy"
proportion_10
## [1] 0.02652174
print("Proportion of people who we predicted will buy")
## [1] "Proportion of people who we predicted will buy"
proportion_11
## [1] 0.06217391
print("Number of people who are not going to buy book")
## [1] "Number of people who are not going to buy book"
number_00 = proportion_00*new_customer
number_00
## [1] 32021.74
print("Number of people who we predicted would not buy, but did buy")
## [1] "Number of people who we predicted would not buy, but did buy"
number_01 = proportion_01*new_customer
number_01
## [1] 13543.48
print("Number of people who we predicted would buy, but did not buy")
## [1] "Number of people who we predicted would buy, but did not buy"
number_10 = proportion_10*new_customer
number_10
## [1] 1326.087
print("Number of people who we predicted will buy")
## [1] "Number of people who we predicted will buy"
number_11 = proportion_11*new_customer
number_11
## [1] 3108.696
print("Number of people to be sent mail")
## [1] "Number of people to be sent mail"
number_send = number_10 + number_11
number_send
## [1] 4434.783
print("Number of people who will buy out of the number of people to be sent mail")
## [1] "Number of people who will buy out of the number of people to be sent mail"
number_11
## [1] 3108.696
print("proportion of people who will buy from the number of people sent mail")
## [1] "proportion of people who will buy from the number of people sent mail"
proportion_sent = number_11/number_send
proportion_sent
## [1] 0.7009804
print("cost of sending mail to all in $")
## [1] "cost of sending mail to all in $"
cost_total = number_send * 0.65
cost_total
## [1] 2882.609
print("cost for sending book to total number of people who buy in $")
## [1] "cost for sending book to total number of people who buy in $"
cost_buy = number_11*15
cost_buy
## [1] 46630.43
print("cost overhead in $")
## [1] "cost overhead in $"
cost_overhead = cost_buy*.45
cost_overhead
## [1] 20983.7
print("Total Revenue generated out of books sold in $")
## [1] "Total Revenue generated out of books sold in $"
revenue_total = number_11*31.95
revenue_total
## [1] 99322.83
print("Total Profit made by selecting this logistic model in $")
## [1] "Total Profit made by selecting this logistic model in $"
profit_total = revenue_total - cost_total - cost_buy - cost_overhead
profit_total
## [1] 28826.09
proportion_00_SVM = 1983/2300
proportion_01_SVM = 113/2300
proportion_10_SVM = 144/2300
proportion_11_SVM = 60/2300
number_00_SVM = proportion_00_SVM*new_customer
number_00_SVM
## [1] 43108.7
number_01_SVM = proportion_01_SVM*new_customer
number_01_SVM
## [1] 2456.522
number_10_SVM = proportion_10_SVM*new_customer
number_10_SVM
## [1] 3130.435
number_11_SVM = proportion_11_SVM*new_customer
number_11_SVM
## [1] 1304.348
print("Number of people to be sent mail")
## [1] "Number of people to be sent mail"
number_send_SVM = number_10_SVM + number_11_SVM
number_send_SVM
## [1] 4434.783
print("Number of people who will buy out of the number of people to be sent mail")
## [1] "Number of people who will buy out of the number of people to be sent mail"
number_11_SVM
## [1] 1304.348
print("proportion of people who will buy from the number of people sent mail")
## [1] "proportion of people who will buy from the number of people sent mail"
proportion_sent_SVM = number_11_SVM/number_send_SVM
proportion_sent_SVM
## [1] 0.2941176
print("cost of sending mail to all in $")
## [1] "cost of sending mail to all in $"
cost_total_SVM = number_send_SVM * 0.65
cost_total_SVM
## [1] 2882.609
print("cost for sending book to total number of people who buy in $")
## [1] "cost for sending book to total number of people who buy in $"
cost_buy_SVM = number_11_SVM*15
cost_buy_SVM
## [1] 19565.22
print("cost overhead in $")
## [1] "cost overhead in $"
cost_overhead_SVM = cost_buy_SVM*.45
cost_overhead_SVM
## [1] 8804.348
print("Total Revenue generated out of books sold in $")
## [1] "Total Revenue generated out of books sold in $"
revenue_total_SVM = number_11_SVM*31.95
revenue_total_SVM
## [1] 41673.91
print("Total Profit made by selecting this SVM model in $")
## [1] "Total Profit made by selecting this SVM model in $"
profit_total_SVM = revenue_total_SVM - cost_total_SVM - cost_buy_SVM - cost_overhead_SVM
profit_total_SVM
## [1] 10421.74
summary(bbbc_test$Choice)
## 0 1
## 2096 204
proportion_1 = 204/2300
proportion_0 = 2096/2300
proportion_1
## [1] 0.08869565
proportion_0
## [1] 0.9113043
total_1 = proportion_1*new_customer
total_0 = proportion_0*new_customer
print("Number of people to be sent mail")
## [1] "Number of people to be sent mail"
send_total = total_1 + total_0
send_total
## [1] 50000
print("Number of people who will buy out of the number of people to be sent mail")
## [1] "Number of people who will buy out of the number of people to be sent mail"
total_1
## [1] 4434.783
print("proportion of people who will buy from the number of people sent mail")
## [1] "proportion of people who will buy from the number of people sent mail"
proportion_sent_total = total_1/send_total
proportion_sent_total
## [1] 0.08869565
print("cost of sending mail to all in $")
## [1] "cost of sending mail to all in $"
cost_total_all = send_total*.65
cost_total_all
## [1] 32500
print("cost for sending book to total number of people who buy in $")
## [1] "cost for sending book to total number of people who buy in $"
cost_buy_all = total_1*15
cost_buy_all
## [1] 66521.74
print("cost overhead in $")
## [1] "cost overhead in $"
cost_overhead_all = cost_buy_all*.45
cost_overhead_all
## [1] 29934.78
print("Total Revenue generated out of books sold in $")
## [1] "Total Revenue generated out of books sold in $"
revenue_total_all = total_1*31.95
revenue_total_all
## [1] 141691.3
print("Total Profit made by sending campaign mail to all in $")
## [1] "Total Profit made by sending campaign mail to all in $"
profit_total_all = revenue_total_all - cost_total_all - cost_buy_all - cost_overhead_all
profit_total_all
## [1] 12734.78