The goal is to identify bestbuy customers who have high propensity to buy geek squad protection plan. The data is collected only from Santa Clara Best Buy stores. The dataset, named “BestBuy.csv” includes 3,206 transactions made in March 2017. Please note this is very small dataset and provides analysis for a beginner ML enthusiast.

add libraries

#install.packages("sqldf")
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.4.2
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.2
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.4.2
## Loading required package: RSQLite
library(usdm)
## Warning: package 'usdm' was built under R version 3.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.4.2
## Loading required package: raster
## Warning: package 'raster' was built under R version 3.4.2
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
## 
## Attaching package: 'MASS'
## The following objects are masked from 'package:raster':
## 
##     area, select
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.4.2
## Loading required package: boot
## 
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
## 
##     norm
library(aod)
## Warning: package 'aod' was built under R version 3.4.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 3.4.4
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 3.4.2
library(foreign)
mydataset<-read.csv("BestBuy.csv")
summary(mydataset)
##     personid             age             hisp         PriceCategory 
##  Min.   :10004010   Min.   :52.00   Min.   :0.00000   Min.   : 0.0  
##  1st Qu.:31884768   1st Qu.:65.00   1st Qu.:0.00000   1st Qu.:10.0  
##  Median :47241515   Median :67.00   Median :0.00000   Median :12.0  
##  Mean   :48620356   Mean   :66.91   Mean   :0.07268   Mean   :11.9  
##  3rd Qu.:71483028   3rd Qu.:69.00   3rd Qu.:0.00000   3rd Qu.:14.0  
##  Max.   :99564010   Max.   :86.00   Max.   :1.00000   Max.   :17.0  
##     married        MyBestBuy         hhincome         appliances    
##  Min.   :0.000   Min.   :0.0000   Min.   :   0.00   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:  17.00   1st Qu.:0.0000  
##  Median :1.000   Median :1.0000   Median :  31.10   Median :1.0000  
##  Mean   :0.733   Mean   :0.6248   Mean   :  45.26   Mean   :0.7046  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:  52.80   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1312.12   Max.   :1.0000  
##     Warranty        familysize    productgeneration  newcustomer    
##  Min.   :0.0000   Min.   :1.000   Min.   : 1.000    Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.: 7.000    1st Qu.:0.0000  
##  Median :1.0000   Median :3.000   Median : 8.000    Median :1.0000  
##  Mean   :0.6207   Mean   :2.973   Mean   : 7.442    Mean   :0.7077  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.: 8.000    3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :4.000   Max.   :11.000    Max.   :1.0000  
##     weekend      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4039  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
hist(mydataset$Warranty)

testsql=sqldf("select age,newcustomer from mydataset where newcustomer=1")

#Summary Statistics
hist(mydataset$PriceCategory) #comparing the histograms of key independent variable which are not dummy/factor variables

hist(log(mydataset$PriceCategory))

hist(mydataset$hhincome) #comparing the histograms of key independent variable which are not dummy/factor variables

hist(log(mydataset$hhincome))

#checking for multicollinearity

df=data.frame(mydataset$newcustomer,mydataset$age,mydataset$hisp,mydataset$PriceCategory,mydataset$married,mydataset$MyBestBuy,mydataset$hhincome,mydataset$appliances,mydataset$familysize,mydataset$productgeneration,mydataset$weekend)
vif(df)
##                      Variables      VIF
## 1        mydataset.newcustomer 1.004547
## 2                mydataset.age 1.165371
## 3               mydataset.hisp 1.123186
## 4      mydataset.PriceCategory 9.503022
## 5            mydataset.married 4.314001
## 6          mydataset.MyBestBuy 1.146866
## 7           mydataset.hhincome 1.151368
## 8         mydataset.appliances 1.159733
## 9         mydataset.familysize 4.224151
## 10 mydataset.productgeneration 9.231568
## 11           mydataset.weekend 1.002920
cor(df)
##                             mydataset.newcustomer mydataset.age
## mydataset.newcustomer                 1.000000000  0.0120054043
## mydataset.age                         0.012005404  1.0000000000
## mydataset.hisp                        0.013466952 -0.0006346689
## mydataset.PriceCategory              -0.005186286  0.0399169525
## mydataset.married                     0.015226049  0.1575621439
## mydataset.MyBestBuy                  -0.047584715  0.3115567007
## mydataset.hhincome                    0.012219606  0.0652357371
## mydataset.appliances                 -0.002667570  0.1967221171
## mydataset.familysize                  0.023601740  0.1456522428
## mydataset.productgeneration          -0.011593103  0.0542402527
## mydataset.weekend                     0.010458271 -0.0083913126
##                             mydataset.hisp mydataset.PriceCategory
## mydataset.newcustomer         0.0134669523            -0.005186286
## mydataset.age                -0.0006346689             0.039916952
## mydataset.hisp                1.0000000000            -0.328141800
## mydataset.PriceCategory      -0.3281418001             1.000000000
## mydataset.married            -0.0265865550             0.074948382
## mydataset.MyBestBuy          -0.0659347592             0.135125725
## mydataset.hhincome           -0.1052860258             0.290427681
## mydataset.appliances         -0.1163422707             0.309767740
## mydataset.familysize         -0.0147721428             0.064004686
## mydataset.productgeneration  -0.3080954399             0.944045453
## mydataset.weekend            -0.0149751267            -0.011490043
##                             mydataset.married mydataset.MyBestBuy
## mydataset.newcustomer              0.01522605        -0.047584715
## mydataset.age                      0.15756214         0.311556701
## mydataset.hisp                    -0.02658656        -0.065934759
## mydataset.PriceCategory            0.07494838         0.135125725
## mydataset.married                  1.00000000         0.122028744
## mydataset.MyBestBuy                0.12202874         1.000000000
## mydataset.hhincome                 0.20854885        -0.012045000
## mydataset.appliances               0.09605320         0.128009299
## mydataset.familysize               0.87308567         0.108696288
## mydataset.productgeneration        0.07437216         0.133728765
## mydataset.weekend                  0.03270994         0.005156923
##                             mydataset.hhincome mydataset.appliances
## mydataset.newcustomer               0.01221961         -0.002667570
## mydataset.age                       0.06523574          0.196722117
## mydataset.hisp                     -0.10528603         -0.116342271
## mydataset.PriceCategory             0.29042768          0.309767740
## mydataset.married                   0.20854885          0.096053202
## mydataset.MyBestBuy                -0.01204500          0.128009299
## mydataset.hhincome                  1.00000000          0.155125760
## mydataset.appliances                0.15512576          1.000000000
## mydataset.familysize                0.17158996          0.064026525
## mydataset.productgeneration         0.28408757          0.293973195
## mydataset.weekend                   0.01489175          0.007693935
##                             mydataset.familysize
## mydataset.newcustomer                 0.02360174
## mydataset.age                         0.14565224
## mydataset.hisp                       -0.01477214
## mydataset.PriceCategory               0.06400469
## mydataset.married                     0.87308567
## mydataset.MyBestBuy                   0.10869629
## mydataset.hhincome                    0.17158996
## mydataset.appliances                  0.06402652
## mydataset.familysize                  1.00000000
## mydataset.productgeneration           0.06164029
## mydataset.weekend                     0.01707987
##                             mydataset.productgeneration mydataset.weekend
## mydataset.newcustomer                       -0.01159310       0.010458271
## mydataset.age                                0.05424025      -0.008391313
## mydataset.hisp                              -0.30809544      -0.014975127
## mydataset.PriceCategory                      0.94404545      -0.011490043
## mydataset.married                            0.07437216       0.032709938
## mydataset.MyBestBuy                          0.13372877       0.005156923
## mydataset.hhincome                           0.28408757       0.014891748
## mydataset.appliances                         0.29397319       0.007693935
## mydataset.familysize                         0.06164029       0.017079870
## mydataset.productgeneration                  1.00000000      -0.014526542
## mydataset.weekend                           -0.01452654       1.000000000
df1=data.frame(mydataset$newcustomer,mydataset$age,mydataset$hisp,mydataset$PriceCategory,mydataset$married,mydataset$MyBestBuy,mydataset$hhincome,mydataset$appliances,mydataset$familysize,mydataset$weekend)
vif(df1)
##                  Variables      VIF
## 1    mydataset.newcustomer 1.004115
## 2            mydataset.age 1.162970
## 3           mydataset.hisp 1.123154
## 4  mydataset.PriceCategory 1.319947
## 5        mydataset.married 4.313577
## 6      mydataset.MyBestBuy 1.146835
## 7       mydataset.hhincome 1.150371
## 8     mydataset.appliances 1.159672
## 9     mydataset.familysize 4.223491
## 10       mydataset.weekend 1.002794
cor(df1)
##                         mydataset.newcustomer mydataset.age mydataset.hisp
## mydataset.newcustomer             1.000000000  0.0120054043   0.0134669523
## mydataset.age                     0.012005404  1.0000000000  -0.0006346689
## mydataset.hisp                    0.013466952 -0.0006346689   1.0000000000
## mydataset.PriceCategory          -0.005186286  0.0399169525  -0.3281418001
## mydataset.married                 0.015226049  0.1575621439  -0.0265865550
## mydataset.MyBestBuy              -0.047584715  0.3115567007  -0.0659347592
## mydataset.hhincome                0.012219606  0.0652357371  -0.1052860258
## mydataset.appliances             -0.002667570  0.1967221171  -0.1163422707
## mydataset.familysize              0.023601740  0.1456522428  -0.0147721428
## mydataset.weekend                 0.010458271 -0.0083913126  -0.0149751267
##                         mydataset.PriceCategory mydataset.married
## mydataset.newcustomer              -0.005186286        0.01522605
## mydataset.age                       0.039916952        0.15756214
## mydataset.hisp                     -0.328141800       -0.02658656
## mydataset.PriceCategory             1.000000000        0.07494838
## mydataset.married                   0.074948382        1.00000000
## mydataset.MyBestBuy                 0.135125725        0.12202874
## mydataset.hhincome                  0.290427681        0.20854885
## mydataset.appliances                0.309767740        0.09605320
## mydataset.familysize                0.064004686        0.87308567
## mydataset.weekend                  -0.011490043        0.03270994
##                         mydataset.MyBestBuy mydataset.hhincome
## mydataset.newcustomer          -0.047584715         0.01221961
## mydataset.age                   0.311556701         0.06523574
## mydataset.hisp                 -0.065934759        -0.10528603
## mydataset.PriceCategory         0.135125725         0.29042768
## mydataset.married               0.122028744         0.20854885
## mydataset.MyBestBuy             1.000000000        -0.01204500
## mydataset.hhincome             -0.012045000         1.00000000
## mydataset.appliances            0.128009299         0.15512576
## mydataset.familysize            0.108696288         0.17158996
## mydataset.weekend               0.005156923         0.01489175
##                         mydataset.appliances mydataset.familysize
## mydataset.newcustomer           -0.002667570           0.02360174
## mydataset.age                    0.196722117           0.14565224
## mydataset.hisp                  -0.116342271          -0.01477214
## mydataset.PriceCategory          0.309767740           0.06400469
## mydataset.married                0.096053202           0.87308567
## mydataset.MyBestBuy              0.128009299           0.10869629
## mydataset.hhincome               0.155125760           0.17158996
## mydataset.appliances             1.000000000           0.06402652
## mydataset.familysize             0.064026525           1.00000000
## mydataset.weekend                0.007693935           0.01707987
##                         mydataset.weekend
## mydataset.newcustomer         0.010458271
## mydataset.age                -0.008391313
## mydataset.hisp               -0.014975127
## mydataset.PriceCategory      -0.011490043
## mydataset.married             0.032709938
## mydataset.MyBestBuy           0.005156923
## mydataset.hhincome            0.014891748
## mydataset.appliances          0.007693935
## mydataset.familysize          0.017079870
## mydataset.weekend             1.000000000
df2=data.frame(mydataset$newcustomer,mydataset$age,mydataset$hisp,mydataset$PriceCategory,mydataset$MyBestBuy,mydataset$hhincome,mydataset$appliances,mydataset$familysize,mydataset$weekend)
vif(df2)
##                 Variables      VIF
## 1   mydataset.newcustomer 1.004008
## 2           mydataset.age 1.161676
## 3          mydataset.hisp 1.122978
## 4 mydataset.PriceCategory 1.319399
## 5     mydataset.MyBestBuy 1.144873
## 6      mydataset.hhincome 1.135540
## 7    mydataset.appliances 1.156152
## 8    mydataset.familysize 1.056786
## 9       mydataset.weekend 1.001570
cor(df2)
##                         mydataset.newcustomer mydataset.age mydataset.hisp
## mydataset.newcustomer             1.000000000  0.0120054043   0.0134669523
## mydataset.age                     0.012005404  1.0000000000  -0.0006346689
## mydataset.hisp                    0.013466952 -0.0006346689   1.0000000000
## mydataset.PriceCategory          -0.005186286  0.0399169525  -0.3281418001
## mydataset.MyBestBuy              -0.047584715  0.3115567007  -0.0659347592
## mydataset.hhincome                0.012219606  0.0652357371  -0.1052860258
## mydataset.appliances             -0.002667570  0.1967221171  -0.1163422707
## mydataset.familysize              0.023601740  0.1456522428  -0.0147721428
## mydataset.weekend                 0.010458271 -0.0083913126  -0.0149751267
##                         mydataset.PriceCategory mydataset.MyBestBuy
## mydataset.newcustomer              -0.005186286        -0.047584715
## mydataset.age                       0.039916952         0.311556701
## mydataset.hisp                     -0.328141800        -0.065934759
## mydataset.PriceCategory             1.000000000         0.135125725
## mydataset.MyBestBuy                 0.135125725         1.000000000
## mydataset.hhincome                  0.290427681        -0.012045000
## mydataset.appliances                0.309767740         0.128009299
## mydataset.familysize                0.064004686         0.108696288
## mydataset.weekend                  -0.011490043         0.005156923
##                         mydataset.hhincome mydataset.appliances
## mydataset.newcustomer           0.01221961         -0.002667570
## mydataset.age                   0.06523574          0.196722117
## mydataset.hisp                 -0.10528603         -0.116342271
## mydataset.PriceCategory         0.29042768          0.309767740
## mydataset.MyBestBuy            -0.01204500          0.128009299
## mydataset.hhincome              1.00000000          0.155125760
## mydataset.appliances            0.15512576          1.000000000
## mydataset.familysize            0.17158996          0.064026525
## mydataset.weekend               0.01489175          0.007693935
##                         mydataset.familysize mydataset.weekend
## mydataset.newcustomer             0.02360174       0.010458271
## mydataset.age                     0.14565224      -0.008391313
## mydataset.hisp                   -0.01477214      -0.014975127
## mydataset.PriceCategory           0.06400469      -0.011490043
## mydataset.MyBestBuy               0.10869629       0.005156923
## mydataset.hhincome                0.17158996       0.014891748
## mydataset.appliances              0.06402652       0.007693935
## mydataset.familysize              1.00000000       0.017079870
## mydataset.weekend                 0.01707987       1.000000000

Let’s start modeling. First Linear probability model : why? Let’s understand why linear probability model is a bad fit for classification. Then we will do it the right way

#our dependent variable is Warranty
#our Key Independent Variables are PriceCategory*appliances, hisp,hhincome,familysize
#our control variables are age, newcustomer, weekend
names(mydataset)
##  [1] "personid"          "age"               "hisp"             
##  [4] "PriceCategory"     "married"           "MyBestBuy"        
##  [7] "hhincome"          "appliances"        "Warranty"         
## [10] "familysize"        "productgeneration" "newcustomer"      
## [13] "weekend"
model1<- lm(Warranty~PriceCategory+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset) 
summary(model1) #first attempt
## 
## Call:
## lm(formula = Warranty ~ PriceCategory + newcustomer + age + hisp + 
##     MyBestBuy + hhincome + appliances + familysize + weekend, 
##     data = mydataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8196 -0.5253  0.2592  0.3497  0.8543 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.6419974  0.1608661   3.991 6.73e-05 ***
## PriceCategory -0.0036649  0.0028642  -1.280 0.200797    
## newcustomer    0.0106986  0.0181514   0.589 0.555629    
## age           -0.0042590  0.0024161  -1.763 0.078039 .  
## hisp          -0.3353707  0.0336309  -9.972  < 2e-16 ***
## MyBestBuy      0.0405427  0.0182068   2.227 0.026031 *  
## hhincome       0.0004871  0.0001365   3.569 0.000363 ***
## appliances     0.0114716  0.0194179   0.591 0.554715    
## familysize     0.0883997  0.0082614  10.700  < 2e-16 ***
## weekend        0.0145492  0.0168036   0.866 0.386645    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4665 on 3196 degrees of freedom
## Multiple R-squared:  0.07854,    Adjusted R-squared:  0.07595 
## F-statistic: 30.27 on 9 and 3196 DF,  p-value: < 2.2e-16
model2<- lm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset)
summary(model2) #adding interaction variable
## 
## Call:
## lm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     newcustomer + age + hisp + MyBestBuy + hhincome + appliances + 
##     familysize + weekend, data = mydataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9282 -0.5202  0.2419  0.3508  0.9318 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.2665913  0.1637043   1.628   0.1035    
## PriceCategory             0.0280350  0.0044083   6.360 2.31e-10 ***
## appliances                0.5726817  0.0628874   9.106  < 2e-16 ***
## newcustomer               0.0061764  0.0179163   0.345   0.7303    
## age                      -0.0034897  0.0023854  -1.463   0.1436    
## hisp                     -0.3088329  0.0333040  -9.273  < 2e-16 ***
## MyBestBuy                 0.0356596  0.0179721   1.984   0.0473 *  
## hhincome                  0.0005737  0.0001350   4.251 2.19e-05 ***
## familysize                0.0877437  0.0081517  10.764  < 2e-16 ***
## weekend                   0.0136309  0.0165803   0.822   0.4111    
## PriceCategory:appliances -0.0503873  0.0053778  -9.369  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4603 on 3195 degrees of freedom
## Multiple R-squared:  0.1032, Adjusted R-squared:  0.1004 
## F-statistic: 36.76 on 10 and 3195 DF,  p-value: < 2.2e-16
model2wl<- lm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familysize+weekend,data =mydataset)
summary(model2wl) # this is model2 using log(hhincome) --> we get an error because there are some 0 values (which are transformed into -inf)
## 
## Call:
## lm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) + 
##     appliances + familysize + weekend, data = mydataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9192 -0.5091  0.2412  0.3482  0.9297 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.205221   0.163260   1.257    0.209    
## PriceCategory             0.025463   0.004483   5.680 1.47e-08 ***
## appliances                0.556595   0.062778   8.866  < 2e-16 ***
## newcustomer               0.005473   0.017905   0.306    0.760    
## age                      -0.003905   0.002388  -1.635    0.102    
## hisp                     -0.300539   0.033354  -9.011  < 2e-16 ***
## MyBestBuy                 0.031668   0.017901   1.769    0.077 .  
## log(hhincome + 1)         0.054561   0.011422   4.777 1.86e-06 ***
## familysize                0.077550   0.008687   8.927  < 2e-16 ***
## weekend                   0.012647   0.016572   0.763    0.445    
## PriceCategory:appliances -0.050034   0.005367  -9.322  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4599 on 3195 degrees of freedom
## Multiple R-squared:  0.1045, Adjusted R-squared:  0.1017 
## F-statistic: 37.29 on 10 and 3195 DF,  p-value: < 2.2e-16
predictedprobability_lm<-predict(model2) # let's look at the predicted probability of return for each observation in the data 
plot(mydataset$Warranty,predictedprobability_lm)

range(predictedprobability_lm) #negative values and values greater than one so this is wrong!
## [1] -0.1497612  1.4659433
predictedprobability_lm<-predict(model2wl) # let's look at the predicted probability of return for each observation in the data 
plot(mydataset$Warranty,predictedprobability_lm)

range(predictedprobability_lm)
## [1] -0.1387258  1.0154840
#Logit



#calculating ratio
sum(mydataset$Warranty==0)
## [1] 1216
sum(mydataset$Warranty==1)
## [1] 1990
a=1216/9
b=1990/9
min(a,b)
## [1] 135.1111

Now, we will use logistic regression to correctly transform the predictions into probability range between 0 and 1.

#Changing family size to dummy
mydataset$familydummy<-ifelse(mydataset$familysize>2,1,0)

#model with interaction
logit1<-glm(Warranty~PriceCategory+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset,family = "binomial")
summary(logit1)
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + newcustomer + age + 
##     hisp + MyBestBuy + hhincome + appliances + familysize + weekend, 
##     family = "binomial", data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8720  -1.2145   0.7786   0.9187   1.8727  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.7202738  0.7398837   0.973 0.330307    
## PriceCategory -0.0205217  0.0133950  -1.532 0.125512    
## newcustomer    0.0507235  0.0833395   0.609 0.542765    
## age           -0.0199280  0.0111238  -1.791 0.073217 .  
## hisp          -1.4583152  0.1599041  -9.120  < 2e-16 ***
## MyBestBuy      0.1896736  0.0835198   2.271 0.023147 *  
## hhincome       0.0032567  0.0009248   3.521 0.000429 ***
## appliances     0.0423635  0.0891714   0.475 0.634731    
## familysize     0.3789697  0.0380430   9.962  < 2e-16 ***
## weekend        0.0642373  0.0775178   0.829 0.407287    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3999.5  on 3196  degrees of freedom
## AIC: 4019.5
## 
## Number of Fisher Scoring iterations: 4
#model with interaction
logit2<-glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset,family = "binomial")
summary(logit2)
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     newcustomer + age + hisp + MyBestBuy + hhincome + appliances + 
##     familysize + weekend, family = "binomial", data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0428  -1.2006   0.7400   0.9194   2.1229  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.1873844  0.7851770  -1.512   0.1305    
## PriceCategory             0.1403763  0.0229857   6.107 1.01e-09 ***
## appliances                2.8342713  0.3175643   8.925  < 2e-16 ***
## newcustomer               0.0309996  0.0845716   0.367   0.7140    
## age                      -0.0163625  0.0113663  -1.440   0.1500    
## hisp                     -1.4200859  0.1644499  -8.635  < 2e-16 ***
## MyBestBuy                 0.1789387  0.0846593   2.114   0.0345 *  
## hhincome                  0.0040424  0.0009604   4.209 2.57e-05 ***
## familysize                0.3849451  0.0386900   9.949  < 2e-16 ***
## weekend                   0.0583670  0.0786832   0.742   0.4582    
## PriceCategory:appliances -0.2526930  0.0276414  -9.142  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3908.3  on 3195  degrees of freedom
## AIC: 3930.3
## 
## Number of Fisher Scoring iterations: 4
#testing logit1 and logit
anova(logit1,logit2)
## Analysis of Deviance Table
## 
## Model 1: Warranty ~ PriceCategory + newcustomer + age + hisp + MyBestBuy + 
##     hhincome + appliances + familysize + weekend
## Model 2: Warranty ~ PriceCategory + PriceCategory * appliances + newcustomer + 
##     age + hisp + MyBestBuy + hhincome + appliances + familysize + 
##     weekend
##   Resid. Df Resid. Dev Df Deviance
## 1      3196     3999.5            
## 2      3195     3908.3  1   91.258
exp(coef(logit2))
##              (Intercept)            PriceCategory               appliances 
##                0.3050180                1.1507067               17.0179949 
##              newcustomer                      age                     hisp 
##                1.0314851                0.9837707                0.2416933 
##                MyBestBuy                 hhincome               familysize 
##                1.1959474                1.0040506                1.4695337 
##                  weekend PriceCategory:appliances 
##                1.0601040                0.7767063
with(logit2, null.deviance - deviance)
## [1] 347.451
with(logit2, df.null - df.residual)
## [1] 10
with(logit2, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 1.384424e-68
pred = predict(logit2, data=mydataset)
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.649407361197754"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   708  508
##   1   616 1374
#model with log(hhincome+1)
logit3<-glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familysize+weekend,data =mydataset,family = "binomial") #added log hhincome
summary(logit3)
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) + 
##     appliances + familysize + weekend, family = "binomial", data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0267  -1.1825   0.7364   0.9119   2.1197  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.49267    0.78414  -1.904   0.0570 .  
## PriceCategory             0.13191    0.02327   5.669 1.43e-08 ***
## appliances                2.74350    0.31668   8.663  < 2e-16 ***
## newcustomer               0.02589    0.08460   0.306   0.7595    
## age                      -0.01835    0.01140  -1.610   0.1073    
## hisp                     -1.39227    0.16448  -8.465  < 2e-16 ***
## MyBestBuy                 0.15146    0.08435   1.796   0.0726 .  
## log(hhincome + 1)         0.25418    0.05435   4.677 2.91e-06 ***
## familysize                0.34928    0.04063   8.598  < 2e-16 ***
## weekend                   0.05718    0.07870   0.727   0.4675    
## PriceCategory:appliances -0.24871    0.02758  -9.016  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3909.1  on 3195  degrees of freedom
## AIC: 3931.1
## 
## Number of Fisher Scoring iterations: 4
exp(coef(logit3))
##              (Intercept)            PriceCategory               appliances 
##                0.2247721                1.1410045               15.5413525 
##              newcustomer                      age                     hisp 
##                1.0262328                0.9818165                0.2485105 
##                MyBestBuy        log(hhincome + 1)               familysize 
##                1.1635306                1.2894098                1.4180509 
##                  weekend PriceCategory:appliances 
##                1.0588451                0.7798062
with(logit3, null.deviance - deviance)
## [1] 346.6728
with(logit3, df.null - df.residual)
## [1] 10
with(logit3, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 2.024786e-68
pred = predict(logit3, data=mydataset)
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.653462258265752"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   706  510
##   1   601 1389
#model with interaction and log(hhincome+1) and newcustomer
logit4<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familydummy+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit4) # final model
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     age + hisp + MyBestBuy + log(hhincome + 1) + appliances + 
##     familydummy + weekend + newcustomer, family = "binomial", 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0096  -1.1403   0.7400   0.8938   2.0234  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.94784    0.78808  -1.203 0.229084    
## PriceCategory             0.14018    0.02346   5.976 2.29e-09 ***
## appliances                2.77945    0.31831   8.732  < 2e-16 ***
## age                      -0.01907    0.01143  -1.669 0.095052 .  
## hisp                     -1.40460    0.16535  -8.495  < 2e-16 ***
## MyBestBuy                 0.13661    0.08474   1.612 0.106923    
## log(hhincome + 1)         0.19227    0.05611   3.427 0.000611 ***
## familydummy               0.92265    0.09630   9.581  < 2e-16 ***
## weekend                   0.04410    0.07896   0.559 0.576455    
## newcustomer               0.03176    0.08485   0.374 0.708191    
## PriceCategory:appliances -0.25272    0.02772  -9.118  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3891.5  on 3195  degrees of freedom
## AIC: 3913.5
## 
## Number of Fisher Scoring iterations: 4
exp(coef(logit4))
##              (Intercept)            PriceCategory               appliances 
##                0.3875757                1.1504793               16.1102075 
##                      age                     hisp                MyBestBuy 
##                0.9811081                0.2454641                1.1463829 
##        log(hhincome + 1)              familydummy                  weekend 
##                1.2120008                2.5159539                1.0450899 
##              newcustomer PriceCategory:appliances 
##                1.0322693                0.7766864
with(logit4, null.deviance - deviance)
## [1] 364.2123
with(logit4, df.null - df.residual)
## [1] 10
with(logit4, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 3.828063e-72
pred = predict(logit4, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.682470368059888"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   480  736
##   1   282 1708
#testing
AIC(logit3, logit4)  
##        df      AIC
## logit3 11 3931.067
## logit4 11 3913.528
BIC(logit3, logit4)
##        df      BIC
## logit3 11 3997.868
## logit4 11 3980.328
#model with familydummy and familyfactor
mydataset$familyfactor<-factor(mydataset$familysize)
logit8<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familyfactor+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit8) # final model
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     age + hisp + MyBestBuy + log(hhincome + 1) + appliances + 
##     familyfactor + weekend + newcustomer, family = "binomial", 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0023  -1.1355   0.7428   0.8949   2.0520  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.88612    0.79137  -1.120 0.262832    
## PriceCategory             0.14048    0.02349   5.980 2.23e-09 ***
## appliances                2.78765    0.31852   8.752  < 2e-16 ***
## age                      -0.01949    0.01144  -1.704 0.088371 .  
## hisp                     -1.40331    0.16544  -8.482  < 2e-16 ***
## MyBestBuy                 0.13608    0.08475   1.606 0.108354    
## log(hhincome + 1)         0.19221    0.05613   3.424 0.000617 ***
## familyfactor2            -0.07408    0.14238  -0.520 0.602865    
## familyfactor3             0.82342    0.12620   6.525 6.82e-11 ***
## familyfactor4             0.94998    0.12614   7.531 5.02e-14 ***
## weekend                   0.04498    0.07901   0.569 0.569158    
## newcustomer               0.02799    0.08491   0.330 0.741674    
## PriceCategory:appliances -0.25329    0.02775  -9.127  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3889.4  on 3193  degrees of freedom
## AIC: 3915.4
## 
## Number of Fisher Scoring iterations: 4
exp(coef(logit8))
##              (Intercept)            PriceCategory               appliances 
##                0.4122535                1.1508313               16.2428564 
##                      age                     hisp                MyBestBuy 
##                0.9807000                0.2457824                1.1457737 
##        log(hhincome + 1)            familyfactor2            familyfactor3 
##                1.2119208                0.9285992                2.2782821 
##            familyfactor4                  weekend              newcustomer 
##                2.5856601                1.0460037                1.0283861 
## PriceCategory:appliances 
##                0.7762440
with(logit4, null.deviance - deviance)
## [1] 364.2123
with(logit4, df.null - df.residual)
## [1] 10
with(logit4, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 3.828063e-72
pred = predict(logit8, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.680598877105427"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   474  742
##   1   282 1708
#testing
AIC(logit8, logit4)  
##        df      AIC
## logit8 13 3915.370
## logit4 11 3913.528
BIC(logit8, logit4)
##        df      BIC
## logit8 13 3994.316
## logit4 11 3980.328
#trying interaction newcustomer*MyBestBuy
logit5<-glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer*MyBestBuy+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familydummy+weekend,data =mydataset,family = "binomial")
summary(logit5) # with new customer*MyBestBuy
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     newcustomer * MyBestBuy + newcustomer + age + hisp + MyBestBuy + 
##     log(hhincome + 1) + appliances + familydummy + weekend, family = "binomial", 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9888  -1.1363   0.7399   0.8922   2.0442  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.81389    0.79256  -1.027 0.304460    
## PriceCategory             0.14035    0.02347   5.981 2.22e-09 ***
## appliances                2.77816    0.31833   8.727  < 2e-16 ***
## newcustomer              -0.15646    0.14333  -1.092 0.275034    
## MyBestBuy                -0.07278    0.15373  -0.473 0.635924    
## age                      -0.01908    0.01143  -1.670 0.095016 .  
## hisp                     -1.40269    0.16531  -8.485  < 2e-16 ***
## log(hhincome + 1)         0.19302    0.05612   3.439 0.000583 ***
## familydummy               0.92311    0.09635   9.581  < 2e-16 ***
## weekend                   0.04606    0.07900   0.583 0.559916    
## PriceCategory:appliances -0.25264    0.02772  -9.114  < 2e-16 ***
## newcustomer:MyBestBuy     0.29091    0.17773   1.637 0.101674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3888.8  on 3194  degrees of freedom
## AIC: 3912.8
## 
## Number of Fisher Scoring iterations: 4
with(logit5, null.deviance - deviance)
## [1] 366.8994
with(logit5, df.null - df.residual)
## [1] 11
with(logit5, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 6.404846e-72
pred = predict(logit5, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.67935121646912"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   469  747
##   1   281 1709
anova(logit5,logit4, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Warranty ~ PriceCategory + PriceCategory * appliances + newcustomer * 
##     MyBestBuy + newcustomer + age + hisp + MyBestBuy + log(hhincome + 
##     1) + appliances + familydummy + weekend
## Model 2: Warranty ~ PriceCategory + PriceCategory * appliances + age + 
##     hisp + MyBestBuy + log(hhincome + 1) + appliances + familydummy + 
##     weekend + newcustomer
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      3194     3888.8                     
## 2      3195     3891.5 -1  -2.6871   0.1012
#trying interaction PriceCategory*familydummy
logit6<-glm(Warranty~PriceCategory+PriceCategory*familydummy+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familydummy+weekend,data =mydataset,family = "binomial")
summary(logit6) # with new customer*MyBestBuy
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * familydummy + 
##     newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) + 
##     appliances + familydummy + weekend, family = "binomial", 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7661  -1.1360   0.7925   0.8590   1.9539  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                0.9977553  0.7696954   1.296  0.19487    
## PriceCategory             -0.0217478  0.0232342  -0.936  0.34926    
## familydummy                0.9055994  0.3202999   2.827  0.00469 ** 
## newcustomer                0.0525210  0.0836252   0.628  0.52997    
## age                       -0.0222614  0.0111846  -1.990  0.04655 *  
## hisp                      -1.4451438  0.1614156  -8.953  < 2e-16 ***
## MyBestBuy                  0.1554587  0.0835863   1.860  0.06291 .  
## log(hhincome + 1)          0.1649459  0.0553491   2.980  0.00288 ** 
## appliances                -0.0098269  0.0909602  -0.108  0.91397    
## weekend                    0.0501578  0.0778058   0.645  0.51915    
## PriceCategory:familydummy -0.0005674  0.0260433  -0.022  0.98262    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3982.6  on 3195  degrees of freedom
## AIC: 4004.6
## 
## Number of Fisher Scoring iterations: 4
with(logit6, null.deviance - deviance)
## [1] 273.153
with(logit6, df.null - df.residual)
## [1] 10
with(logit6, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 7.239105e-53
pred = predict(logit6, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.665003119151591"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   509  707
##   1   367 1623
#testing
anova(logit6,logit4, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Warranty ~ PriceCategory + PriceCategory * familydummy + newcustomer + 
##     age + hisp + MyBestBuy + log(hhincome + 1) + appliances + 
##     familydummy + weekend
## Model 2: Warranty ~ PriceCategory + PriceCategory * appliances + age + 
##     hisp + MyBestBuy + log(hhincome + 1) + appliances + familydummy + 
##     weekend + newcustomer
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      3195     3982.6                     
## 2      3195     3891.5  0   91.059
#trying with quadratic
logit7<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+I(log(hhincome+1)^2)+appliances+familydummy+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit7) # final model
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     age + hisp + MyBestBuy + log(hhincome + 1) + I(log(hhincome + 
##     1)^2) + appliances + familydummy + weekend + newcustomer, 
##     family = "binomial", data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0176  -1.1388   0.7403   0.8922   2.0245  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.729218   0.818805  -0.891    0.373    
## PriceCategory             0.140675   0.023456   5.997    2e-09 ***
## appliances                2.808706   0.319849   8.781   <2e-16 ***
## age                      -0.018321   0.011450  -1.600    0.110    
## hisp                     -1.405425   0.165441  -8.495   <2e-16 ***
## MyBestBuy                 0.146545   0.085359   1.717    0.086 .  
## log(hhincome + 1)         0.007313   0.197553   0.037    0.970    
## I(log(hhincome + 1)^2)    0.027632   0.028383   0.974    0.330    
## familydummy               0.930220   0.096668   9.623   <2e-16 ***
## weekend                   0.043741   0.078970   0.554    0.580    
## newcustomer               0.032187   0.084863   0.379    0.704    
## PriceCategory:appliances -0.254866   0.027815  -9.163   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3890.6  on 3194  degrees of freedom
## AIC: 3914.6
## 
## Number of Fisher Scoring iterations: 4
exp(coef(logit7))
##              (Intercept)            PriceCategory               appliances 
##                0.4822858                1.1510506               16.5884353 
##                      age                     hisp                MyBestBuy 
##                0.9818457                0.2452628                1.1578273 
##        log(hhincome + 1)   I(log(hhincome + 1)^2)              familydummy 
##                1.0073394                1.0280174                2.5350673 
##                  weekend              newcustomer PriceCategory:appliances 
##                1.0447112                1.0327106                0.7750202
with(logit7, null.deviance - deviance)
## [1] 365.1689
with(logit7, df.null - df.residual)
## [1] 11
with(logit7, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 1.489693e-71
pred = predict(logit7, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.682470368059888"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   482  734
##   1   284 1706
#model with familydummy and familyfactor
mydataset$familyfactor<-factor(mydataset$familysize)
logit8<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familyfactor+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit8) # final model
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     age + hisp + MyBestBuy + log(hhincome + 1) + appliances + 
##     familyfactor + weekend + newcustomer, family = "binomial", 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0023  -1.1355   0.7428   0.8949   2.0520  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.88612    0.79137  -1.120 0.262832    
## PriceCategory             0.14048    0.02349   5.980 2.23e-09 ***
## appliances                2.78765    0.31852   8.752  < 2e-16 ***
## age                      -0.01949    0.01144  -1.704 0.088371 .  
## hisp                     -1.40331    0.16544  -8.482  < 2e-16 ***
## MyBestBuy                 0.13608    0.08475   1.606 0.108354    
## log(hhincome + 1)         0.19221    0.05613   3.424 0.000617 ***
## familyfactor2            -0.07408    0.14238  -0.520 0.602865    
## familyfactor3             0.82342    0.12620   6.525 6.82e-11 ***
## familyfactor4             0.94998    0.12614   7.531 5.02e-14 ***
## weekend                   0.04498    0.07901   0.569 0.569158    
## newcustomer               0.02799    0.08491   0.330 0.741674    
## PriceCategory:appliances -0.25329    0.02775  -9.127  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3889.4  on 3193  degrees of freedom
## AIC: 3915.4
## 
## Number of Fisher Scoring iterations: 4
exp(coef(logit8))
##              (Intercept)            PriceCategory               appliances 
##                0.4122535                1.1508313               16.2428564 
##                      age                     hisp                MyBestBuy 
##                0.9807000                0.2457824                1.1457737 
##        log(hhincome + 1)            familyfactor2            familyfactor3 
##                1.2119208                0.9285992                2.2782821 
##            familyfactor4                  weekend              newcustomer 
##                2.5856601                1.0460037                1.0283861 
## PriceCategory:appliances 
##                0.7762440
with(logit4, null.deviance - deviance)
## [1] 364.2123
with(logit4, df.null - df.residual)
## [1] 10
with(logit4, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 3.828063e-72
pred = predict(logit8, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.680598877105427"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   474  742
##   1   282 1708
#testing
AIC(logit8, logit4)  
##        df      AIC
## logit8 13 3915.370
## logit4 11 3913.528
BIC(logit8, logit4)
##        df      BIC
## logit8 13 3994.316
## logit4 11 3980.328
#select required rows
test<-subset(mydataset,hhincome>17 & hhincome<52,select =c("hhincome"))
nrow(test)
## [1] 1576
#heteroskedasticity

gqtest(logit4) 
## 
##  Goldfeld-Quandt test
## 
## data:  logit4
## GQ = 0.98566, df1 = 1592, df2 = 1592, p-value = 0.6133
## alternative hypothesis: variance increases from segment 1 to 2
bptest(logit4)
## 
##  studentized Breusch-Pagan test
## 
## data:  logit4
## BP = 49.935, df = 10, p-value = 2.743e-07
coeftest(logit4, vcov = vcovHC(logit4, "HC1")) # With robust standard errors
## 
## z test of coefficients:
## 
##                           Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)              -0.947844   0.794000 -1.1938 0.2325728    
## PriceCategory             0.140179   0.023000  6.0947 1.096e-09 ***
## appliances                2.779453   0.313561  8.8642 < 2.2e-16 ***
## age                      -0.019073   0.011526 -1.6548 0.0979607 .  
## hisp                     -1.404605   0.162881 -8.6235 < 2.2e-16 ***
## MyBestBuy                 0.136612   0.085281  1.6019 0.1091794    
## log(hhincome + 1)         0.192273   0.055778  3.4471 0.0005666 ***
## familydummy               0.922652   0.096269  9.5841 < 2.2e-16 ***
## weekend                   0.044103   0.079131  0.5573 0.5772929    
## newcustomer               0.031760   0.084960  0.3738 0.7085396    
## PriceCategory:appliances -0.252719   0.027158 -9.3056 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(logit4)
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     age + hisp + MyBestBuy + log(hhincome + 1) + appliances + 
##     familydummy + weekend + newcustomer, family = "binomial", 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0096  -1.1403   0.7400   0.8938   2.0234  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.94784    0.78808  -1.203 0.229084    
## PriceCategory             0.14018    0.02346   5.976 2.29e-09 ***
## appliances                2.77945    0.31831   8.732  < 2e-16 ***
## age                      -0.01907    0.01143  -1.669 0.095052 .  
## hisp                     -1.40460    0.16535  -8.495  < 2e-16 ***
## MyBestBuy                 0.13661    0.08474   1.612 0.106923    
## log(hhincome + 1)         0.19227    0.05611   3.427 0.000611 ***
## familydummy               0.92265    0.09630   9.581  < 2e-16 ***
## weekend                   0.04410    0.07896   0.559 0.576455    
## newcustomer               0.03176    0.08485   0.374 0.708191    
## PriceCategory:appliances -0.25272    0.02772  -9.118  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3891.5  on 3195  degrees of freedom
## AIC: 3913.5
## 
## Number of Fisher Scoring iterations: 4
##Predicted Probs

#Interaction interpretation


newdatatest <- with(mydataset,data.frame(PriceCategory= rep(seq(from = 1, to = 17, length.out = 17),2), newcustomer=mean(newcustomer),age=mean(age),hisp=mean(hisp),MyBestBuy=mean(MyBestBuy),hhincome=mean(hhincome),appliances=(rep(0:1, each = 17)),weekend=mean(weekend),familydummy=mean(familydummy)))

newdatatest1 <- cbind(newdatatest, predict(logit4, newdata = newdatatest, type="link", se=TRUE))
newdatatest1 <- within(newdatatest1, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))})

head(newdatatest1)
##   PriceCategory newcustomer      age       hisp MyBestBuy hhincome
## 1             1   0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 2             2   0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 3             3   0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 4             4   0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 5             5   0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 6             6   0.7077355 66.91391 0.07267623 0.6247661 45.26391
##   appliances   weekend familydummy         fit    se.fit residual.scale
## 1          0 0.4039301   0.7330006 -0.64678296 0.2416974              1
## 2          0 0.4039301   0.7330006 -0.50660430 0.2199889              1
## 3          0 0.4039301   0.7330006 -0.36642564 0.1986784              1
## 4          0 0.4039301   0.7330006 -0.22624698 0.1779088              1
## 5          0 0.4039301   0.7330006 -0.08606832 0.1578938              1
## 6          0 0.4039301   0.7330006  0.05411034 0.1389598              1
##          UL        LL PredictedProb
## 1 0.4568436 0.2459167     0.3437149
## 2 0.4811524 0.2813485     0.3759899
## 3 0.5057458 0.3195511     0.4094050
## 4 0.5305754 0.3600958     0.4436783
## 5 0.5556197 0.4023843     0.4784962
## 6 0.5809006 0.4456529     0.5135243
ggplot(newdatatest1, aes(x = PriceCategory, y = PredictedProb))+
  geom_ribbon(aes(ymin = LL, ymax = UL, fill = factor(appliances)), alpha = .2) +
  geom_line(aes(colour = factor(appliances)), size=1)

#Hispanic
newdatatest_hisp <- with(mydataset,data.frame(hisp=(rep(0:1)),PriceCategory=mean(PriceCategory), newcustomer=mean(newcustomer),age=mean(age),MyBestBuy=mean(MyBestBuy),hhincome=mean(hhincome),appliances=mean(appliances),weekend=mean(weekend),familydummy=mean(familydummy)))

newdatatest_hisp1 <- cbind(newdatatest_hisp, predict(logit4, newdata = newdatatest_hisp, type="link", se=TRUE))
newdatatest_hisp1 <- within(newdatatest_hisp1, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))})

ggplot(newdatatest_hisp1, aes(x = hisp, y = PredictedProb))+
  geom_ribbon(aes(ymin = LL, ymax = UL)) +geom_line(aes(colour = "hisp"), size=1)

# FamilyDummy 
newdatatest_familysize <- with(mydataset,data.frame(hisp=mean(hisp),PriceCategory=mean(PriceCategory), newcustomer=mean(newcustomer),age=mean(age),MyBestBuy=mean(MyBestBuy),hhincome=mean(hhincome),appliances=mean(appliances),weekend=mean(weekend),familydummy=rep(0:1)))

newdatatest_familysize1 <- cbind(newdatatest_familysize, predict(logit4, newdata = newdatatest_familysize, type="link", se=TRUE))
newdatatest_familysize1
##         hisp PriceCategory newcustomer      age MyBestBuy hhincome
## 1 0.07267623      11.89863   0.7077355 66.91391 0.6247661 45.26391
## 2 0.07267623      11.89863   0.7077355 66.91391 0.6247661 45.26391
##   appliances   weekend familydummy        fit     se.fit residual.scale
## 1  0.7046163 0.4039301           0 0.04433092 0.09152702              1
## 2  0.7046163 0.4039301           1 0.96698295 0.05119074              1
newdatatest_familysize1 <- within(newdatatest_familysize1, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))})

ggplot(newdatatest_familysize1, aes(x = familydummy, y = PredictedProb))+
  geom_ribbon(aes(ymin = LL, ymax = UL)) +
  geom_line(aes(colour = "familysize"), size=1)

##HHIncome
newdatatest_income <- with(mydataset,data.frame(PriceCategory= mean(PriceCategory), newcustomer=mean(newcustomer),age=mean(age),hisp=mean(hisp),MyBestBuy=mean(MyBestBuy),hhincome=rep(seq(from = 0, to =1312 ,length.out=10)),appliances=mean(appliances),weekend=mean(weekend),familydummy=mean(familydummy)))

hist(log(1+mydataset$hhincome))

newdatatest1 <- cbind(newdatatest_income, predict(logit4, newdata = newdatatest_income, type="link", se=TRUE))
newdatatest1 <- within(newdatatest1, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))})


ggplot(newdatatest1, aes(x = hhincome, y = PredictedProb))+
  geom_ribbon(aes(ymin = LL, ymax = UL), alpha = .2) +
  geom_line(aes(colour = "hhincome"), size=1)

Lets, try Probit model too

#Probit
probit1<- glm(Warranty~PriceCategory+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset, family=binomial(link="probit"))
summary(probit1)              
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + newcustomer + age + 
##     hisp + MyBestBuy + hhincome + appliances + familysize + weekend, 
##     family = binomial(link = "probit"), data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8819  -1.2147   0.7781   0.9211   1.8893  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.4254754  0.4506618   0.944 0.345112    
## PriceCategory -0.0120941  0.0081333  -1.487 0.137019    
## newcustomer    0.0308066  0.0507998   0.606 0.544227    
## age           -0.0120256  0.0067716  -1.776 0.075750 .  
## hisp          -0.9000405  0.0962416  -9.352  < 2e-16 ***
## MyBestBuy      0.1176283  0.0509580   2.308 0.020980 *  
## hhincome       0.0019624  0.0005317   3.690 0.000224 ***
## appliances     0.0247963  0.0543965   0.456 0.648503    
## familysize     0.2328456  0.0231980  10.037  < 2e-16 ***
## weekend        0.0385597  0.0471681   0.817 0.413646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3999.3  on 3196  degrees of freedom
## AIC: 4019.3
## 
## Number of Fisher Scoring iterations: 4
### final model
probit2<- glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familydummy+weekend,data =mydataset, family=binomial(link="probit"))
summary(probit2)   
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     newcustomer + age + hisp + MyBestBuy + hhincome + appliances + 
##     familydummy + weekend, family = binomial(link = "probit"), 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0241  -1.1415   0.7397   0.8944   2.0405  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.3992900  0.4751128  -0.840 0.400678    
## PriceCategory             0.0887201  0.0136150   6.516  7.2e-11 ***
## appliances                1.7323987  0.1894559   9.144  < 2e-16 ***
## newcustomer               0.0215749  0.0514435   0.419 0.674932    
## age                      -0.0109648  0.0068895  -1.592 0.111490    
## hisp                     -0.8676857  0.0987345  -8.788  < 2e-16 ***
## MyBestBuy                 0.0982485  0.0516341   1.903 0.057069 .  
## hhincome                  0.0020474  0.0005409   3.785 0.000154 ***
## familydummy               0.6062111  0.0545861  11.106  < 2e-16 ***
## weekend                   0.0262452  0.0477891   0.549 0.582876    
## PriceCategory:appliances -0.1559184  0.0164101  -9.501  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3885.8  on 3195  degrees of freedom
## AIC: 3907.8
## 
## Number of Fisher Scoring iterations: 5
probit3<- glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familysize+weekend,data =mydataset, family=binomial(link="probit"))
summary(probit3)
## 
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances + 
##     newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) + 
##     appliances + familysize + weekend, family = binomial(link = "probit"), 
##     data = mydataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0498  -1.1837   0.7380   0.9154   2.1598  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.91506    0.47317  -1.934   0.0531 .  
## PriceCategory             0.07998    0.01372   5.831 5.52e-09 ***
## appliances                1.65617    0.18815   8.802  < 2e-16 ***
## newcustomer               0.01496    0.05132   0.291   0.7707    
## age                      -0.01099    0.00689  -1.594   0.1108    
## hisp                     -0.85087    0.09840  -8.647  < 2e-16 ***
## MyBestBuy                 0.09269    0.05120   1.810   0.0702 .  
## log(hhincome + 1)         0.15429    0.03290   4.690 2.73e-06 ***
## familysize                0.21315    0.02473   8.618  < 2e-16 ***
## weekend                   0.03408    0.04765   0.715   0.4746    
## PriceCategory:appliances -0.15019    0.01629  -9.220  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4255.7  on 3205  degrees of freedom
## Residual deviance: 3909.1  on 3195  degrees of freedom
## AIC: 3931.1
## 
## Number of Fisher Scoring iterations: 4
with(probit2, null.deviance - deviance)
## [1] 369.9354
with(probit2, df.null - df.residual)
## [1] 10
with(probit2, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 2.328973e-73
pred = predict(probit2, data=mydataset)
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.589831565814099"
table(mydataset$Warranty, pred>=0.5)
##    
##     FALSE TRUE
##   0   849  367
##   1   948 1042

Final model- logit. Since accuracy is higher with logit model. Some ideas to extend the model accuracy would be: 1. using IV regression as myBestBuy variable is suitable to be considered endogenous. 2. Trying other classification models to improve model performace.