The goal is to identify bestbuy customers who have high propensity to buy geek squad protection plan. The data is collected only from Santa Clara Best Buy stores. The dataset, named “BestBuy.csv” includes 3,206 transactions made in March 2017. Please note this is very small dataset and provides analysis for a beginner ML enthusiast.
add libraries
#install.packages("sqldf")
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.4.2
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.2
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.4.2
## Loading required package: RSQLite
library(usdm)
## Warning: package 'usdm' was built under R version 3.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.4.2
## Loading required package: raster
## Warning: package 'raster' was built under R version 3.4.2
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:raster':
##
## area, select
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.4.2
## Loading required package: boot
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
library(aod)
## Warning: package 'aod' was built under R version 3.4.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 3.4.4
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 3.4.2
library(foreign)
mydataset<-read.csv("BestBuy.csv")
summary(mydataset)
## personid age hisp PriceCategory
## Min. :10004010 Min. :52.00 Min. :0.00000 Min. : 0.0
## 1st Qu.:31884768 1st Qu.:65.00 1st Qu.:0.00000 1st Qu.:10.0
## Median :47241515 Median :67.00 Median :0.00000 Median :12.0
## Mean :48620356 Mean :66.91 Mean :0.07268 Mean :11.9
## 3rd Qu.:71483028 3rd Qu.:69.00 3rd Qu.:0.00000 3rd Qu.:14.0
## Max. :99564010 Max. :86.00 Max. :1.00000 Max. :17.0
## married MyBestBuy hhincome appliances
## Min. :0.000 Min. :0.0000 Min. : 0.00 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 17.00 1st Qu.:0.0000
## Median :1.000 Median :1.0000 Median : 31.10 Median :1.0000
## Mean :0.733 Mean :0.6248 Mean : 45.26 Mean :0.7046
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.: 52.80 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1312.12 Max. :1.0000
## Warranty familysize productgeneration newcustomer
## Min. :0.0000 Min. :1.000 Min. : 1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.: 7.000 1st Qu.:0.0000
## Median :1.0000 Median :3.000 Median : 8.000 Median :1.0000
## Mean :0.6207 Mean :2.973 Mean : 7.442 Mean :0.7077
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :4.000 Max. :11.000 Max. :1.0000
## weekend
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4039
## 3rd Qu.:1.0000
## Max. :1.0000
hist(mydataset$Warranty)
testsql=sqldf("select age,newcustomer from mydataset where newcustomer=1")
#Summary Statistics
hist(mydataset$PriceCategory) #comparing the histograms of key independent variable which are not dummy/factor variables
hist(log(mydataset$PriceCategory))
hist(mydataset$hhincome) #comparing the histograms of key independent variable which are not dummy/factor variables
hist(log(mydataset$hhincome))
#checking for multicollinearity
df=data.frame(mydataset$newcustomer,mydataset$age,mydataset$hisp,mydataset$PriceCategory,mydataset$married,mydataset$MyBestBuy,mydataset$hhincome,mydataset$appliances,mydataset$familysize,mydataset$productgeneration,mydataset$weekend)
vif(df)
## Variables VIF
## 1 mydataset.newcustomer 1.004547
## 2 mydataset.age 1.165371
## 3 mydataset.hisp 1.123186
## 4 mydataset.PriceCategory 9.503022
## 5 mydataset.married 4.314001
## 6 mydataset.MyBestBuy 1.146866
## 7 mydataset.hhincome 1.151368
## 8 mydataset.appliances 1.159733
## 9 mydataset.familysize 4.224151
## 10 mydataset.productgeneration 9.231568
## 11 mydataset.weekend 1.002920
cor(df)
## mydataset.newcustomer mydataset.age
## mydataset.newcustomer 1.000000000 0.0120054043
## mydataset.age 0.012005404 1.0000000000
## mydataset.hisp 0.013466952 -0.0006346689
## mydataset.PriceCategory -0.005186286 0.0399169525
## mydataset.married 0.015226049 0.1575621439
## mydataset.MyBestBuy -0.047584715 0.3115567007
## mydataset.hhincome 0.012219606 0.0652357371
## mydataset.appliances -0.002667570 0.1967221171
## mydataset.familysize 0.023601740 0.1456522428
## mydataset.productgeneration -0.011593103 0.0542402527
## mydataset.weekend 0.010458271 -0.0083913126
## mydataset.hisp mydataset.PriceCategory
## mydataset.newcustomer 0.0134669523 -0.005186286
## mydataset.age -0.0006346689 0.039916952
## mydataset.hisp 1.0000000000 -0.328141800
## mydataset.PriceCategory -0.3281418001 1.000000000
## mydataset.married -0.0265865550 0.074948382
## mydataset.MyBestBuy -0.0659347592 0.135125725
## mydataset.hhincome -0.1052860258 0.290427681
## mydataset.appliances -0.1163422707 0.309767740
## mydataset.familysize -0.0147721428 0.064004686
## mydataset.productgeneration -0.3080954399 0.944045453
## mydataset.weekend -0.0149751267 -0.011490043
## mydataset.married mydataset.MyBestBuy
## mydataset.newcustomer 0.01522605 -0.047584715
## mydataset.age 0.15756214 0.311556701
## mydataset.hisp -0.02658656 -0.065934759
## mydataset.PriceCategory 0.07494838 0.135125725
## mydataset.married 1.00000000 0.122028744
## mydataset.MyBestBuy 0.12202874 1.000000000
## mydataset.hhincome 0.20854885 -0.012045000
## mydataset.appliances 0.09605320 0.128009299
## mydataset.familysize 0.87308567 0.108696288
## mydataset.productgeneration 0.07437216 0.133728765
## mydataset.weekend 0.03270994 0.005156923
## mydataset.hhincome mydataset.appliances
## mydataset.newcustomer 0.01221961 -0.002667570
## mydataset.age 0.06523574 0.196722117
## mydataset.hisp -0.10528603 -0.116342271
## mydataset.PriceCategory 0.29042768 0.309767740
## mydataset.married 0.20854885 0.096053202
## mydataset.MyBestBuy -0.01204500 0.128009299
## mydataset.hhincome 1.00000000 0.155125760
## mydataset.appliances 0.15512576 1.000000000
## mydataset.familysize 0.17158996 0.064026525
## mydataset.productgeneration 0.28408757 0.293973195
## mydataset.weekend 0.01489175 0.007693935
## mydataset.familysize
## mydataset.newcustomer 0.02360174
## mydataset.age 0.14565224
## mydataset.hisp -0.01477214
## mydataset.PriceCategory 0.06400469
## mydataset.married 0.87308567
## mydataset.MyBestBuy 0.10869629
## mydataset.hhincome 0.17158996
## mydataset.appliances 0.06402652
## mydataset.familysize 1.00000000
## mydataset.productgeneration 0.06164029
## mydataset.weekend 0.01707987
## mydataset.productgeneration mydataset.weekend
## mydataset.newcustomer -0.01159310 0.010458271
## mydataset.age 0.05424025 -0.008391313
## mydataset.hisp -0.30809544 -0.014975127
## mydataset.PriceCategory 0.94404545 -0.011490043
## mydataset.married 0.07437216 0.032709938
## mydataset.MyBestBuy 0.13372877 0.005156923
## mydataset.hhincome 0.28408757 0.014891748
## mydataset.appliances 0.29397319 0.007693935
## mydataset.familysize 0.06164029 0.017079870
## mydataset.productgeneration 1.00000000 -0.014526542
## mydataset.weekend -0.01452654 1.000000000
df1=data.frame(mydataset$newcustomer,mydataset$age,mydataset$hisp,mydataset$PriceCategory,mydataset$married,mydataset$MyBestBuy,mydataset$hhincome,mydataset$appliances,mydataset$familysize,mydataset$weekend)
vif(df1)
## Variables VIF
## 1 mydataset.newcustomer 1.004115
## 2 mydataset.age 1.162970
## 3 mydataset.hisp 1.123154
## 4 mydataset.PriceCategory 1.319947
## 5 mydataset.married 4.313577
## 6 mydataset.MyBestBuy 1.146835
## 7 mydataset.hhincome 1.150371
## 8 mydataset.appliances 1.159672
## 9 mydataset.familysize 4.223491
## 10 mydataset.weekend 1.002794
cor(df1)
## mydataset.newcustomer mydataset.age mydataset.hisp
## mydataset.newcustomer 1.000000000 0.0120054043 0.0134669523
## mydataset.age 0.012005404 1.0000000000 -0.0006346689
## mydataset.hisp 0.013466952 -0.0006346689 1.0000000000
## mydataset.PriceCategory -0.005186286 0.0399169525 -0.3281418001
## mydataset.married 0.015226049 0.1575621439 -0.0265865550
## mydataset.MyBestBuy -0.047584715 0.3115567007 -0.0659347592
## mydataset.hhincome 0.012219606 0.0652357371 -0.1052860258
## mydataset.appliances -0.002667570 0.1967221171 -0.1163422707
## mydataset.familysize 0.023601740 0.1456522428 -0.0147721428
## mydataset.weekend 0.010458271 -0.0083913126 -0.0149751267
## mydataset.PriceCategory mydataset.married
## mydataset.newcustomer -0.005186286 0.01522605
## mydataset.age 0.039916952 0.15756214
## mydataset.hisp -0.328141800 -0.02658656
## mydataset.PriceCategory 1.000000000 0.07494838
## mydataset.married 0.074948382 1.00000000
## mydataset.MyBestBuy 0.135125725 0.12202874
## mydataset.hhincome 0.290427681 0.20854885
## mydataset.appliances 0.309767740 0.09605320
## mydataset.familysize 0.064004686 0.87308567
## mydataset.weekend -0.011490043 0.03270994
## mydataset.MyBestBuy mydataset.hhincome
## mydataset.newcustomer -0.047584715 0.01221961
## mydataset.age 0.311556701 0.06523574
## mydataset.hisp -0.065934759 -0.10528603
## mydataset.PriceCategory 0.135125725 0.29042768
## mydataset.married 0.122028744 0.20854885
## mydataset.MyBestBuy 1.000000000 -0.01204500
## mydataset.hhincome -0.012045000 1.00000000
## mydataset.appliances 0.128009299 0.15512576
## mydataset.familysize 0.108696288 0.17158996
## mydataset.weekend 0.005156923 0.01489175
## mydataset.appliances mydataset.familysize
## mydataset.newcustomer -0.002667570 0.02360174
## mydataset.age 0.196722117 0.14565224
## mydataset.hisp -0.116342271 -0.01477214
## mydataset.PriceCategory 0.309767740 0.06400469
## mydataset.married 0.096053202 0.87308567
## mydataset.MyBestBuy 0.128009299 0.10869629
## mydataset.hhincome 0.155125760 0.17158996
## mydataset.appliances 1.000000000 0.06402652
## mydataset.familysize 0.064026525 1.00000000
## mydataset.weekend 0.007693935 0.01707987
## mydataset.weekend
## mydataset.newcustomer 0.010458271
## mydataset.age -0.008391313
## mydataset.hisp -0.014975127
## mydataset.PriceCategory -0.011490043
## mydataset.married 0.032709938
## mydataset.MyBestBuy 0.005156923
## mydataset.hhincome 0.014891748
## mydataset.appliances 0.007693935
## mydataset.familysize 0.017079870
## mydataset.weekend 1.000000000
df2=data.frame(mydataset$newcustomer,mydataset$age,mydataset$hisp,mydataset$PriceCategory,mydataset$MyBestBuy,mydataset$hhincome,mydataset$appliances,mydataset$familysize,mydataset$weekend)
vif(df2)
## Variables VIF
## 1 mydataset.newcustomer 1.004008
## 2 mydataset.age 1.161676
## 3 mydataset.hisp 1.122978
## 4 mydataset.PriceCategory 1.319399
## 5 mydataset.MyBestBuy 1.144873
## 6 mydataset.hhincome 1.135540
## 7 mydataset.appliances 1.156152
## 8 mydataset.familysize 1.056786
## 9 mydataset.weekend 1.001570
cor(df2)
## mydataset.newcustomer mydataset.age mydataset.hisp
## mydataset.newcustomer 1.000000000 0.0120054043 0.0134669523
## mydataset.age 0.012005404 1.0000000000 -0.0006346689
## mydataset.hisp 0.013466952 -0.0006346689 1.0000000000
## mydataset.PriceCategory -0.005186286 0.0399169525 -0.3281418001
## mydataset.MyBestBuy -0.047584715 0.3115567007 -0.0659347592
## mydataset.hhincome 0.012219606 0.0652357371 -0.1052860258
## mydataset.appliances -0.002667570 0.1967221171 -0.1163422707
## mydataset.familysize 0.023601740 0.1456522428 -0.0147721428
## mydataset.weekend 0.010458271 -0.0083913126 -0.0149751267
## mydataset.PriceCategory mydataset.MyBestBuy
## mydataset.newcustomer -0.005186286 -0.047584715
## mydataset.age 0.039916952 0.311556701
## mydataset.hisp -0.328141800 -0.065934759
## mydataset.PriceCategory 1.000000000 0.135125725
## mydataset.MyBestBuy 0.135125725 1.000000000
## mydataset.hhincome 0.290427681 -0.012045000
## mydataset.appliances 0.309767740 0.128009299
## mydataset.familysize 0.064004686 0.108696288
## mydataset.weekend -0.011490043 0.005156923
## mydataset.hhincome mydataset.appliances
## mydataset.newcustomer 0.01221961 -0.002667570
## mydataset.age 0.06523574 0.196722117
## mydataset.hisp -0.10528603 -0.116342271
## mydataset.PriceCategory 0.29042768 0.309767740
## mydataset.MyBestBuy -0.01204500 0.128009299
## mydataset.hhincome 1.00000000 0.155125760
## mydataset.appliances 0.15512576 1.000000000
## mydataset.familysize 0.17158996 0.064026525
## mydataset.weekend 0.01489175 0.007693935
## mydataset.familysize mydataset.weekend
## mydataset.newcustomer 0.02360174 0.010458271
## mydataset.age 0.14565224 -0.008391313
## mydataset.hisp -0.01477214 -0.014975127
## mydataset.PriceCategory 0.06400469 -0.011490043
## mydataset.MyBestBuy 0.10869629 0.005156923
## mydataset.hhincome 0.17158996 0.014891748
## mydataset.appliances 0.06402652 0.007693935
## mydataset.familysize 1.00000000 0.017079870
## mydataset.weekend 0.01707987 1.000000000
Let’s start modeling. First Linear probability model : why? Let’s understand why linear probability model is a bad fit for classification. Then we will do it the right way
#our dependent variable is Warranty
#our Key Independent Variables are PriceCategory*appliances, hisp,hhincome,familysize
#our control variables are age, newcustomer, weekend
names(mydataset)
## [1] "personid" "age" "hisp"
## [4] "PriceCategory" "married" "MyBestBuy"
## [7] "hhincome" "appliances" "Warranty"
## [10] "familysize" "productgeneration" "newcustomer"
## [13] "weekend"
model1<- lm(Warranty~PriceCategory+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset)
summary(model1) #first attempt
##
## Call:
## lm(formula = Warranty ~ PriceCategory + newcustomer + age + hisp +
## MyBestBuy + hhincome + appliances + familysize + weekend,
## data = mydataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8196 -0.5253 0.2592 0.3497 0.8543
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6419974 0.1608661 3.991 6.73e-05 ***
## PriceCategory -0.0036649 0.0028642 -1.280 0.200797
## newcustomer 0.0106986 0.0181514 0.589 0.555629
## age -0.0042590 0.0024161 -1.763 0.078039 .
## hisp -0.3353707 0.0336309 -9.972 < 2e-16 ***
## MyBestBuy 0.0405427 0.0182068 2.227 0.026031 *
## hhincome 0.0004871 0.0001365 3.569 0.000363 ***
## appliances 0.0114716 0.0194179 0.591 0.554715
## familysize 0.0883997 0.0082614 10.700 < 2e-16 ***
## weekend 0.0145492 0.0168036 0.866 0.386645
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4665 on 3196 degrees of freedom
## Multiple R-squared: 0.07854, Adjusted R-squared: 0.07595
## F-statistic: 30.27 on 9 and 3196 DF, p-value: < 2.2e-16
model2<- lm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset)
summary(model2) #adding interaction variable
##
## Call:
## lm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## newcustomer + age + hisp + MyBestBuy + hhincome + appliances +
## familysize + weekend, data = mydataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9282 -0.5202 0.2419 0.3508 0.9318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2665913 0.1637043 1.628 0.1035
## PriceCategory 0.0280350 0.0044083 6.360 2.31e-10 ***
## appliances 0.5726817 0.0628874 9.106 < 2e-16 ***
## newcustomer 0.0061764 0.0179163 0.345 0.7303
## age -0.0034897 0.0023854 -1.463 0.1436
## hisp -0.3088329 0.0333040 -9.273 < 2e-16 ***
## MyBestBuy 0.0356596 0.0179721 1.984 0.0473 *
## hhincome 0.0005737 0.0001350 4.251 2.19e-05 ***
## familysize 0.0877437 0.0081517 10.764 < 2e-16 ***
## weekend 0.0136309 0.0165803 0.822 0.4111
## PriceCategory:appliances -0.0503873 0.0053778 -9.369 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4603 on 3195 degrees of freedom
## Multiple R-squared: 0.1032, Adjusted R-squared: 0.1004
## F-statistic: 36.76 on 10 and 3195 DF, p-value: < 2.2e-16
model2wl<- lm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familysize+weekend,data =mydataset)
summary(model2wl) # this is model2 using log(hhincome) --> we get an error because there are some 0 values (which are transformed into -inf)
##
## Call:
## lm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) +
## appliances + familysize + weekend, data = mydataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9192 -0.5091 0.2412 0.3482 0.9297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.205221 0.163260 1.257 0.209
## PriceCategory 0.025463 0.004483 5.680 1.47e-08 ***
## appliances 0.556595 0.062778 8.866 < 2e-16 ***
## newcustomer 0.005473 0.017905 0.306 0.760
## age -0.003905 0.002388 -1.635 0.102
## hisp -0.300539 0.033354 -9.011 < 2e-16 ***
## MyBestBuy 0.031668 0.017901 1.769 0.077 .
## log(hhincome + 1) 0.054561 0.011422 4.777 1.86e-06 ***
## familysize 0.077550 0.008687 8.927 < 2e-16 ***
## weekend 0.012647 0.016572 0.763 0.445
## PriceCategory:appliances -0.050034 0.005367 -9.322 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4599 on 3195 degrees of freedom
## Multiple R-squared: 0.1045, Adjusted R-squared: 0.1017
## F-statistic: 37.29 on 10 and 3195 DF, p-value: < 2.2e-16
predictedprobability_lm<-predict(model2) # let's look at the predicted probability of return for each observation in the data
plot(mydataset$Warranty,predictedprobability_lm)
range(predictedprobability_lm) #negative values and values greater than one so this is wrong!
## [1] -0.1497612 1.4659433
predictedprobability_lm<-predict(model2wl) # let's look at the predicted probability of return for each observation in the data
plot(mydataset$Warranty,predictedprobability_lm)
range(predictedprobability_lm)
## [1] -0.1387258 1.0154840
#Logit
#calculating ratio
sum(mydataset$Warranty==0)
## [1] 1216
sum(mydataset$Warranty==1)
## [1] 1990
a=1216/9
b=1990/9
min(a,b)
## [1] 135.1111
Now, we will use logistic regression to correctly transform the predictions into probability range between 0 and 1.
#Changing family size to dummy
mydataset$familydummy<-ifelse(mydataset$familysize>2,1,0)
#model with interaction
logit1<-glm(Warranty~PriceCategory+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset,family = "binomial")
summary(logit1)
##
## Call:
## glm(formula = Warranty ~ PriceCategory + newcustomer + age +
## hisp + MyBestBuy + hhincome + appliances + familysize + weekend,
## family = "binomial", data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8720 -1.2145 0.7786 0.9187 1.8727
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.7202738 0.7398837 0.973 0.330307
## PriceCategory -0.0205217 0.0133950 -1.532 0.125512
## newcustomer 0.0507235 0.0833395 0.609 0.542765
## age -0.0199280 0.0111238 -1.791 0.073217 .
## hisp -1.4583152 0.1599041 -9.120 < 2e-16 ***
## MyBestBuy 0.1896736 0.0835198 2.271 0.023147 *
## hhincome 0.0032567 0.0009248 3.521 0.000429 ***
## appliances 0.0423635 0.0891714 0.475 0.634731
## familysize 0.3789697 0.0380430 9.962 < 2e-16 ***
## weekend 0.0642373 0.0775178 0.829 0.407287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3999.5 on 3196 degrees of freedom
## AIC: 4019.5
##
## Number of Fisher Scoring iterations: 4
#model with interaction
logit2<-glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset,family = "binomial")
summary(logit2)
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## newcustomer + age + hisp + MyBestBuy + hhincome + appliances +
## familysize + weekend, family = "binomial", data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0428 -1.2006 0.7400 0.9194 2.1229
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1873844 0.7851770 -1.512 0.1305
## PriceCategory 0.1403763 0.0229857 6.107 1.01e-09 ***
## appliances 2.8342713 0.3175643 8.925 < 2e-16 ***
## newcustomer 0.0309996 0.0845716 0.367 0.7140
## age -0.0163625 0.0113663 -1.440 0.1500
## hisp -1.4200859 0.1644499 -8.635 < 2e-16 ***
## MyBestBuy 0.1789387 0.0846593 2.114 0.0345 *
## hhincome 0.0040424 0.0009604 4.209 2.57e-05 ***
## familysize 0.3849451 0.0386900 9.949 < 2e-16 ***
## weekend 0.0583670 0.0786832 0.742 0.4582
## PriceCategory:appliances -0.2526930 0.0276414 -9.142 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3908.3 on 3195 degrees of freedom
## AIC: 3930.3
##
## Number of Fisher Scoring iterations: 4
#testing logit1 and logit
anova(logit1,logit2)
## Analysis of Deviance Table
##
## Model 1: Warranty ~ PriceCategory + newcustomer + age + hisp + MyBestBuy +
## hhincome + appliances + familysize + weekend
## Model 2: Warranty ~ PriceCategory + PriceCategory * appliances + newcustomer +
## age + hisp + MyBestBuy + hhincome + appliances + familysize +
## weekend
## Resid. Df Resid. Dev Df Deviance
## 1 3196 3999.5
## 2 3195 3908.3 1 91.258
exp(coef(logit2))
## (Intercept) PriceCategory appliances
## 0.3050180 1.1507067 17.0179949
## newcustomer age hisp
## 1.0314851 0.9837707 0.2416933
## MyBestBuy hhincome familysize
## 1.1959474 1.0040506 1.4695337
## weekend PriceCategory:appliances
## 1.0601040 0.7767063
with(logit2, null.deviance - deviance)
## [1] 347.451
with(logit2, df.null - df.residual)
## [1] 10
with(logit2, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 1.384424e-68
pred = predict(logit2, data=mydataset)
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.649407361197754"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 708 508
## 1 616 1374
#model with log(hhincome+1)
logit3<-glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familysize+weekend,data =mydataset,family = "binomial") #added log hhincome
summary(logit3)
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) +
## appliances + familysize + weekend, family = "binomial", data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0267 -1.1825 0.7364 0.9119 2.1197
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.49267 0.78414 -1.904 0.0570 .
## PriceCategory 0.13191 0.02327 5.669 1.43e-08 ***
## appliances 2.74350 0.31668 8.663 < 2e-16 ***
## newcustomer 0.02589 0.08460 0.306 0.7595
## age -0.01835 0.01140 -1.610 0.1073
## hisp -1.39227 0.16448 -8.465 < 2e-16 ***
## MyBestBuy 0.15146 0.08435 1.796 0.0726 .
## log(hhincome + 1) 0.25418 0.05435 4.677 2.91e-06 ***
## familysize 0.34928 0.04063 8.598 < 2e-16 ***
## weekend 0.05718 0.07870 0.727 0.4675
## PriceCategory:appliances -0.24871 0.02758 -9.016 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3909.1 on 3195 degrees of freedom
## AIC: 3931.1
##
## Number of Fisher Scoring iterations: 4
exp(coef(logit3))
## (Intercept) PriceCategory appliances
## 0.2247721 1.1410045 15.5413525
## newcustomer age hisp
## 1.0262328 0.9818165 0.2485105
## MyBestBuy log(hhincome + 1) familysize
## 1.1635306 1.2894098 1.4180509
## weekend PriceCategory:appliances
## 1.0588451 0.7798062
with(logit3, null.deviance - deviance)
## [1] 346.6728
with(logit3, df.null - df.residual)
## [1] 10
with(logit3, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 2.024786e-68
pred = predict(logit3, data=mydataset)
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.653462258265752"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 706 510
## 1 601 1389
#model with interaction and log(hhincome+1) and newcustomer
logit4<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familydummy+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit4) # final model
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## age + hisp + MyBestBuy + log(hhincome + 1) + appliances +
## familydummy + weekend + newcustomer, family = "binomial",
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0096 -1.1403 0.7400 0.8938 2.0234
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.94784 0.78808 -1.203 0.229084
## PriceCategory 0.14018 0.02346 5.976 2.29e-09 ***
## appliances 2.77945 0.31831 8.732 < 2e-16 ***
## age -0.01907 0.01143 -1.669 0.095052 .
## hisp -1.40460 0.16535 -8.495 < 2e-16 ***
## MyBestBuy 0.13661 0.08474 1.612 0.106923
## log(hhincome + 1) 0.19227 0.05611 3.427 0.000611 ***
## familydummy 0.92265 0.09630 9.581 < 2e-16 ***
## weekend 0.04410 0.07896 0.559 0.576455
## newcustomer 0.03176 0.08485 0.374 0.708191
## PriceCategory:appliances -0.25272 0.02772 -9.118 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3891.5 on 3195 degrees of freedom
## AIC: 3913.5
##
## Number of Fisher Scoring iterations: 4
exp(coef(logit4))
## (Intercept) PriceCategory appliances
## 0.3875757 1.1504793 16.1102075
## age hisp MyBestBuy
## 0.9811081 0.2454641 1.1463829
## log(hhincome + 1) familydummy weekend
## 1.2120008 2.5159539 1.0450899
## newcustomer PriceCategory:appliances
## 1.0322693 0.7766864
with(logit4, null.deviance - deviance)
## [1] 364.2123
with(logit4, df.null - df.residual)
## [1] 10
with(logit4, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 3.828063e-72
pred = predict(logit4, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.682470368059888"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 480 736
## 1 282 1708
#testing
AIC(logit3, logit4)
## df AIC
## logit3 11 3931.067
## logit4 11 3913.528
BIC(logit3, logit4)
## df BIC
## logit3 11 3997.868
## logit4 11 3980.328
#model with familydummy and familyfactor
mydataset$familyfactor<-factor(mydataset$familysize)
logit8<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familyfactor+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit8) # final model
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## age + hisp + MyBestBuy + log(hhincome + 1) + appliances +
## familyfactor + weekend + newcustomer, family = "binomial",
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0023 -1.1355 0.7428 0.8949 2.0520
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.88612 0.79137 -1.120 0.262832
## PriceCategory 0.14048 0.02349 5.980 2.23e-09 ***
## appliances 2.78765 0.31852 8.752 < 2e-16 ***
## age -0.01949 0.01144 -1.704 0.088371 .
## hisp -1.40331 0.16544 -8.482 < 2e-16 ***
## MyBestBuy 0.13608 0.08475 1.606 0.108354
## log(hhincome + 1) 0.19221 0.05613 3.424 0.000617 ***
## familyfactor2 -0.07408 0.14238 -0.520 0.602865
## familyfactor3 0.82342 0.12620 6.525 6.82e-11 ***
## familyfactor4 0.94998 0.12614 7.531 5.02e-14 ***
## weekend 0.04498 0.07901 0.569 0.569158
## newcustomer 0.02799 0.08491 0.330 0.741674
## PriceCategory:appliances -0.25329 0.02775 -9.127 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3889.4 on 3193 degrees of freedom
## AIC: 3915.4
##
## Number of Fisher Scoring iterations: 4
exp(coef(logit8))
## (Intercept) PriceCategory appliances
## 0.4122535 1.1508313 16.2428564
## age hisp MyBestBuy
## 0.9807000 0.2457824 1.1457737
## log(hhincome + 1) familyfactor2 familyfactor3
## 1.2119208 0.9285992 2.2782821
## familyfactor4 weekend newcustomer
## 2.5856601 1.0460037 1.0283861
## PriceCategory:appliances
## 0.7762440
with(logit4, null.deviance - deviance)
## [1] 364.2123
with(logit4, df.null - df.residual)
## [1] 10
with(logit4, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 3.828063e-72
pred = predict(logit8, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.680598877105427"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 474 742
## 1 282 1708
#testing
AIC(logit8, logit4)
## df AIC
## logit8 13 3915.370
## logit4 11 3913.528
BIC(logit8, logit4)
## df BIC
## logit8 13 3994.316
## logit4 11 3980.328
#trying interaction newcustomer*MyBestBuy
logit5<-glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer*MyBestBuy+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familydummy+weekend,data =mydataset,family = "binomial")
summary(logit5) # with new customer*MyBestBuy
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## newcustomer * MyBestBuy + newcustomer + age + hisp + MyBestBuy +
## log(hhincome + 1) + appliances + familydummy + weekend, family = "binomial",
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9888 -1.1363 0.7399 0.8922 2.0442
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.81389 0.79256 -1.027 0.304460
## PriceCategory 0.14035 0.02347 5.981 2.22e-09 ***
## appliances 2.77816 0.31833 8.727 < 2e-16 ***
## newcustomer -0.15646 0.14333 -1.092 0.275034
## MyBestBuy -0.07278 0.15373 -0.473 0.635924
## age -0.01908 0.01143 -1.670 0.095016 .
## hisp -1.40269 0.16531 -8.485 < 2e-16 ***
## log(hhincome + 1) 0.19302 0.05612 3.439 0.000583 ***
## familydummy 0.92311 0.09635 9.581 < 2e-16 ***
## weekend 0.04606 0.07900 0.583 0.559916
## PriceCategory:appliances -0.25264 0.02772 -9.114 < 2e-16 ***
## newcustomer:MyBestBuy 0.29091 0.17773 1.637 0.101674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3888.8 on 3194 degrees of freedom
## AIC: 3912.8
##
## Number of Fisher Scoring iterations: 4
with(logit5, null.deviance - deviance)
## [1] 366.8994
with(logit5, df.null - df.residual)
## [1] 11
with(logit5, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 6.404846e-72
pred = predict(logit5, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.67935121646912"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 469 747
## 1 281 1709
anova(logit5,logit4, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Warranty ~ PriceCategory + PriceCategory * appliances + newcustomer *
## MyBestBuy + newcustomer + age + hisp + MyBestBuy + log(hhincome +
## 1) + appliances + familydummy + weekend
## Model 2: Warranty ~ PriceCategory + PriceCategory * appliances + age +
## hisp + MyBestBuy + log(hhincome + 1) + appliances + familydummy +
## weekend + newcustomer
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3194 3888.8
## 2 3195 3891.5 -1 -2.6871 0.1012
#trying interaction PriceCategory*familydummy
logit6<-glm(Warranty~PriceCategory+PriceCategory*familydummy+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familydummy+weekend,data =mydataset,family = "binomial")
summary(logit6) # with new customer*MyBestBuy
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * familydummy +
## newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) +
## appliances + familydummy + weekend, family = "binomial",
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7661 -1.1360 0.7925 0.8590 1.9539
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9977553 0.7696954 1.296 0.19487
## PriceCategory -0.0217478 0.0232342 -0.936 0.34926
## familydummy 0.9055994 0.3202999 2.827 0.00469 **
## newcustomer 0.0525210 0.0836252 0.628 0.52997
## age -0.0222614 0.0111846 -1.990 0.04655 *
## hisp -1.4451438 0.1614156 -8.953 < 2e-16 ***
## MyBestBuy 0.1554587 0.0835863 1.860 0.06291 .
## log(hhincome + 1) 0.1649459 0.0553491 2.980 0.00288 **
## appliances -0.0098269 0.0909602 -0.108 0.91397
## weekend 0.0501578 0.0778058 0.645 0.51915
## PriceCategory:familydummy -0.0005674 0.0260433 -0.022 0.98262
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3982.6 on 3195 degrees of freedom
## AIC: 4004.6
##
## Number of Fisher Scoring iterations: 4
with(logit6, null.deviance - deviance)
## [1] 273.153
with(logit6, df.null - df.residual)
## [1] 10
with(logit6, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 7.239105e-53
pred = predict(logit6, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.665003119151591"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 509 707
## 1 367 1623
#testing
anova(logit6,logit4, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Warranty ~ PriceCategory + PriceCategory * familydummy + newcustomer +
## age + hisp + MyBestBuy + log(hhincome + 1) + appliances +
## familydummy + weekend
## Model 2: Warranty ~ PriceCategory + PriceCategory * appliances + age +
## hisp + MyBestBuy + log(hhincome + 1) + appliances + familydummy +
## weekend + newcustomer
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3195 3982.6
## 2 3195 3891.5 0 91.059
#trying with quadratic
logit7<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+I(log(hhincome+1)^2)+appliances+familydummy+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit7) # final model
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## age + hisp + MyBestBuy + log(hhincome + 1) + I(log(hhincome +
## 1)^2) + appliances + familydummy + weekend + newcustomer,
## family = "binomial", data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0176 -1.1388 0.7403 0.8922 2.0245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.729218 0.818805 -0.891 0.373
## PriceCategory 0.140675 0.023456 5.997 2e-09 ***
## appliances 2.808706 0.319849 8.781 <2e-16 ***
## age -0.018321 0.011450 -1.600 0.110
## hisp -1.405425 0.165441 -8.495 <2e-16 ***
## MyBestBuy 0.146545 0.085359 1.717 0.086 .
## log(hhincome + 1) 0.007313 0.197553 0.037 0.970
## I(log(hhincome + 1)^2) 0.027632 0.028383 0.974 0.330
## familydummy 0.930220 0.096668 9.623 <2e-16 ***
## weekend 0.043741 0.078970 0.554 0.580
## newcustomer 0.032187 0.084863 0.379 0.704
## PriceCategory:appliances -0.254866 0.027815 -9.163 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3890.6 on 3194 degrees of freedom
## AIC: 3914.6
##
## Number of Fisher Scoring iterations: 4
exp(coef(logit7))
## (Intercept) PriceCategory appliances
## 0.4822858 1.1510506 16.5884353
## age hisp MyBestBuy
## 0.9818457 0.2452628 1.1578273
## log(hhincome + 1) I(log(hhincome + 1)^2) familydummy
## 1.0073394 1.0280174 2.5350673
## weekend newcustomer PriceCategory:appliances
## 1.0447112 1.0327106 0.7750202
with(logit7, null.deviance - deviance)
## [1] 365.1689
with(logit7, df.null - df.residual)
## [1] 11
with(logit7, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 1.489693e-71
pred = predict(logit7, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.682470368059888"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 482 734
## 1 284 1706
#model with familydummy and familyfactor
mydataset$familyfactor<-factor(mydataset$familysize)
logit8<-glm(Warranty~PriceCategory+PriceCategory*appliances+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familyfactor+weekend+newcustomer,data =mydataset,family = "binomial")
summary(logit8) # final model
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## age + hisp + MyBestBuy + log(hhincome + 1) + appliances +
## familyfactor + weekend + newcustomer, family = "binomial",
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0023 -1.1355 0.7428 0.8949 2.0520
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.88612 0.79137 -1.120 0.262832
## PriceCategory 0.14048 0.02349 5.980 2.23e-09 ***
## appliances 2.78765 0.31852 8.752 < 2e-16 ***
## age -0.01949 0.01144 -1.704 0.088371 .
## hisp -1.40331 0.16544 -8.482 < 2e-16 ***
## MyBestBuy 0.13608 0.08475 1.606 0.108354
## log(hhincome + 1) 0.19221 0.05613 3.424 0.000617 ***
## familyfactor2 -0.07408 0.14238 -0.520 0.602865
## familyfactor3 0.82342 0.12620 6.525 6.82e-11 ***
## familyfactor4 0.94998 0.12614 7.531 5.02e-14 ***
## weekend 0.04498 0.07901 0.569 0.569158
## newcustomer 0.02799 0.08491 0.330 0.741674
## PriceCategory:appliances -0.25329 0.02775 -9.127 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3889.4 on 3193 degrees of freedom
## AIC: 3915.4
##
## Number of Fisher Scoring iterations: 4
exp(coef(logit8))
## (Intercept) PriceCategory appliances
## 0.4122535 1.1508313 16.2428564
## age hisp MyBestBuy
## 0.9807000 0.2457824 1.1457737
## log(hhincome + 1) familyfactor2 familyfactor3
## 1.2119208 0.9285992 2.2782821
## familyfactor4 weekend newcustomer
## 2.5856601 1.0460037 1.0283861
## PriceCategory:appliances
## 0.7762440
with(logit4, null.deviance - deviance)
## [1] 364.2123
with(logit4, df.null - df.residual)
## [1] 10
with(logit4, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 3.828063e-72
pred = predict(logit8, data=mydataset, type="response")
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.680598877105427"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 474 742
## 1 282 1708
#testing
AIC(logit8, logit4)
## df AIC
## logit8 13 3915.370
## logit4 11 3913.528
BIC(logit8, logit4)
## df BIC
## logit8 13 3994.316
## logit4 11 3980.328
#select required rows
test<-subset(mydataset,hhincome>17 & hhincome<52,select =c("hhincome"))
nrow(test)
## [1] 1576
#heteroskedasticity
gqtest(logit4)
##
## Goldfeld-Quandt test
##
## data: logit4
## GQ = 0.98566, df1 = 1592, df2 = 1592, p-value = 0.6133
## alternative hypothesis: variance increases from segment 1 to 2
bptest(logit4)
##
## studentized Breusch-Pagan test
##
## data: logit4
## BP = 49.935, df = 10, p-value = 2.743e-07
coeftest(logit4, vcov = vcovHC(logit4, "HC1")) # With robust standard errors
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.947844 0.794000 -1.1938 0.2325728
## PriceCategory 0.140179 0.023000 6.0947 1.096e-09 ***
## appliances 2.779453 0.313561 8.8642 < 2.2e-16 ***
## age -0.019073 0.011526 -1.6548 0.0979607 .
## hisp -1.404605 0.162881 -8.6235 < 2.2e-16 ***
## MyBestBuy 0.136612 0.085281 1.6019 0.1091794
## log(hhincome + 1) 0.192273 0.055778 3.4471 0.0005666 ***
## familydummy 0.922652 0.096269 9.5841 < 2.2e-16 ***
## weekend 0.044103 0.079131 0.5573 0.5772929
## newcustomer 0.031760 0.084960 0.3738 0.7085396
## PriceCategory:appliances -0.252719 0.027158 -9.3056 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(logit4)
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## age + hisp + MyBestBuy + log(hhincome + 1) + appliances +
## familydummy + weekend + newcustomer, family = "binomial",
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0096 -1.1403 0.7400 0.8938 2.0234
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.94784 0.78808 -1.203 0.229084
## PriceCategory 0.14018 0.02346 5.976 2.29e-09 ***
## appliances 2.77945 0.31831 8.732 < 2e-16 ***
## age -0.01907 0.01143 -1.669 0.095052 .
## hisp -1.40460 0.16535 -8.495 < 2e-16 ***
## MyBestBuy 0.13661 0.08474 1.612 0.106923
## log(hhincome + 1) 0.19227 0.05611 3.427 0.000611 ***
## familydummy 0.92265 0.09630 9.581 < 2e-16 ***
## weekend 0.04410 0.07896 0.559 0.576455
## newcustomer 0.03176 0.08485 0.374 0.708191
## PriceCategory:appliances -0.25272 0.02772 -9.118 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3891.5 on 3195 degrees of freedom
## AIC: 3913.5
##
## Number of Fisher Scoring iterations: 4
##Predicted Probs
#Interaction interpretation
newdatatest <- with(mydataset,data.frame(PriceCategory= rep(seq(from = 1, to = 17, length.out = 17),2), newcustomer=mean(newcustomer),age=mean(age),hisp=mean(hisp),MyBestBuy=mean(MyBestBuy),hhincome=mean(hhincome),appliances=(rep(0:1, each = 17)),weekend=mean(weekend),familydummy=mean(familydummy)))
newdatatest1 <- cbind(newdatatest, predict(logit4, newdata = newdatatest, type="link", se=TRUE))
newdatatest1 <- within(newdatatest1, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))})
head(newdatatest1)
## PriceCategory newcustomer age hisp MyBestBuy hhincome
## 1 1 0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 2 2 0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 3 3 0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 4 4 0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 5 5 0.7077355 66.91391 0.07267623 0.6247661 45.26391
## 6 6 0.7077355 66.91391 0.07267623 0.6247661 45.26391
## appliances weekend familydummy fit se.fit residual.scale
## 1 0 0.4039301 0.7330006 -0.64678296 0.2416974 1
## 2 0 0.4039301 0.7330006 -0.50660430 0.2199889 1
## 3 0 0.4039301 0.7330006 -0.36642564 0.1986784 1
## 4 0 0.4039301 0.7330006 -0.22624698 0.1779088 1
## 5 0 0.4039301 0.7330006 -0.08606832 0.1578938 1
## 6 0 0.4039301 0.7330006 0.05411034 0.1389598 1
## UL LL PredictedProb
## 1 0.4568436 0.2459167 0.3437149
## 2 0.4811524 0.2813485 0.3759899
## 3 0.5057458 0.3195511 0.4094050
## 4 0.5305754 0.3600958 0.4436783
## 5 0.5556197 0.4023843 0.4784962
## 6 0.5809006 0.4456529 0.5135243
ggplot(newdatatest1, aes(x = PriceCategory, y = PredictedProb))+
geom_ribbon(aes(ymin = LL, ymax = UL, fill = factor(appliances)), alpha = .2) +
geom_line(aes(colour = factor(appliances)), size=1)
#Hispanic
newdatatest_hisp <- with(mydataset,data.frame(hisp=(rep(0:1)),PriceCategory=mean(PriceCategory), newcustomer=mean(newcustomer),age=mean(age),MyBestBuy=mean(MyBestBuy),hhincome=mean(hhincome),appliances=mean(appliances),weekend=mean(weekend),familydummy=mean(familydummy)))
newdatatest_hisp1 <- cbind(newdatatest_hisp, predict(logit4, newdata = newdatatest_hisp, type="link", se=TRUE))
newdatatest_hisp1 <- within(newdatatest_hisp1, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))})
ggplot(newdatatest_hisp1, aes(x = hisp, y = PredictedProb))+
geom_ribbon(aes(ymin = LL, ymax = UL)) +geom_line(aes(colour = "hisp"), size=1)
# FamilyDummy
newdatatest_familysize <- with(mydataset,data.frame(hisp=mean(hisp),PriceCategory=mean(PriceCategory), newcustomer=mean(newcustomer),age=mean(age),MyBestBuy=mean(MyBestBuy),hhincome=mean(hhincome),appliances=mean(appliances),weekend=mean(weekend),familydummy=rep(0:1)))
newdatatest_familysize1 <- cbind(newdatatest_familysize, predict(logit4, newdata = newdatatest_familysize, type="link", se=TRUE))
newdatatest_familysize1
## hisp PriceCategory newcustomer age MyBestBuy hhincome
## 1 0.07267623 11.89863 0.7077355 66.91391 0.6247661 45.26391
## 2 0.07267623 11.89863 0.7077355 66.91391 0.6247661 45.26391
## appliances weekend familydummy fit se.fit residual.scale
## 1 0.7046163 0.4039301 0 0.04433092 0.09152702 1
## 2 0.7046163 0.4039301 1 0.96698295 0.05119074 1
newdatatest_familysize1 <- within(newdatatest_familysize1, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))})
ggplot(newdatatest_familysize1, aes(x = familydummy, y = PredictedProb))+
geom_ribbon(aes(ymin = LL, ymax = UL)) +
geom_line(aes(colour = "familysize"), size=1)
##HHIncome
newdatatest_income <- with(mydataset,data.frame(PriceCategory= mean(PriceCategory), newcustomer=mean(newcustomer),age=mean(age),hisp=mean(hisp),MyBestBuy=mean(MyBestBuy),hhincome=rep(seq(from = 0, to =1312 ,length.out=10)),appliances=mean(appliances),weekend=mean(weekend),familydummy=mean(familydummy)))
hist(log(1+mydataset$hhincome))
newdatatest1 <- cbind(newdatatest_income, predict(logit4, newdata = newdatatest_income, type="link", se=TRUE))
newdatatest1 <- within(newdatatest1, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))})
ggplot(newdatatest1, aes(x = hhincome, y = PredictedProb))+
geom_ribbon(aes(ymin = LL, ymax = UL), alpha = .2) +
geom_line(aes(colour = "hhincome"), size=1)
Lets, try Probit model too
#Probit
probit1<- glm(Warranty~PriceCategory+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familysize+weekend,data =mydataset, family=binomial(link="probit"))
summary(probit1)
##
## Call:
## glm(formula = Warranty ~ PriceCategory + newcustomer + age +
## hisp + MyBestBuy + hhincome + appliances + familysize + weekend,
## family = binomial(link = "probit"), data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8819 -1.2147 0.7781 0.9211 1.8893
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4254754 0.4506618 0.944 0.345112
## PriceCategory -0.0120941 0.0081333 -1.487 0.137019
## newcustomer 0.0308066 0.0507998 0.606 0.544227
## age -0.0120256 0.0067716 -1.776 0.075750 .
## hisp -0.9000405 0.0962416 -9.352 < 2e-16 ***
## MyBestBuy 0.1176283 0.0509580 2.308 0.020980 *
## hhincome 0.0019624 0.0005317 3.690 0.000224 ***
## appliances 0.0247963 0.0543965 0.456 0.648503
## familysize 0.2328456 0.0231980 10.037 < 2e-16 ***
## weekend 0.0385597 0.0471681 0.817 0.413646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3999.3 on 3196 degrees of freedom
## AIC: 4019.3
##
## Number of Fisher Scoring iterations: 4
### final model
probit2<- glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+hhincome+appliances+familydummy+weekend,data =mydataset, family=binomial(link="probit"))
summary(probit2)
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## newcustomer + age + hisp + MyBestBuy + hhincome + appliances +
## familydummy + weekend, family = binomial(link = "probit"),
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0241 -1.1415 0.7397 0.8944 2.0405
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3992900 0.4751128 -0.840 0.400678
## PriceCategory 0.0887201 0.0136150 6.516 7.2e-11 ***
## appliances 1.7323987 0.1894559 9.144 < 2e-16 ***
## newcustomer 0.0215749 0.0514435 0.419 0.674932
## age -0.0109648 0.0068895 -1.592 0.111490
## hisp -0.8676857 0.0987345 -8.788 < 2e-16 ***
## MyBestBuy 0.0982485 0.0516341 1.903 0.057069 .
## hhincome 0.0020474 0.0005409 3.785 0.000154 ***
## familydummy 0.6062111 0.0545861 11.106 < 2e-16 ***
## weekend 0.0262452 0.0477891 0.549 0.582876
## PriceCategory:appliances -0.1559184 0.0164101 -9.501 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3885.8 on 3195 degrees of freedom
## AIC: 3907.8
##
## Number of Fisher Scoring iterations: 5
probit3<- glm(Warranty~PriceCategory+PriceCategory*appliances+newcustomer+age+hisp+MyBestBuy+log(hhincome+1)+appliances+familysize+weekend,data =mydataset, family=binomial(link="probit"))
summary(probit3)
##
## Call:
## glm(formula = Warranty ~ PriceCategory + PriceCategory * appliances +
## newcustomer + age + hisp + MyBestBuy + log(hhincome + 1) +
## appliances + familysize + weekend, family = binomial(link = "probit"),
## data = mydataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0498 -1.1837 0.7380 0.9154 2.1598
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.91506 0.47317 -1.934 0.0531 .
## PriceCategory 0.07998 0.01372 5.831 5.52e-09 ***
## appliances 1.65617 0.18815 8.802 < 2e-16 ***
## newcustomer 0.01496 0.05132 0.291 0.7707
## age -0.01099 0.00689 -1.594 0.1108
## hisp -0.85087 0.09840 -8.647 < 2e-16 ***
## MyBestBuy 0.09269 0.05120 1.810 0.0702 .
## log(hhincome + 1) 0.15429 0.03290 4.690 2.73e-06 ***
## familysize 0.21315 0.02473 8.618 < 2e-16 ***
## weekend 0.03408 0.04765 0.715 0.4746
## PriceCategory:appliances -0.15019 0.01629 -9.220 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4255.7 on 3205 degrees of freedom
## Residual deviance: 3909.1 on 3195 degrees of freedom
## AIC: 3931.1
##
## Number of Fisher Scoring iterations: 4
with(probit2, null.deviance - deviance)
## [1] 369.9354
with(probit2, df.null - df.residual)
## [1] 10
with(probit2, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 2.328973e-73
pred = predict(probit2, data=mydataset)
warrenty_prediction <- ifelse(pred >= 0.5,1,0)
misClasificError <- mean(warrenty_prediction != mydataset$Warranty)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.589831565814099"
table(mydataset$Warranty, pred>=0.5)
##
## FALSE TRUE
## 0 849 367
## 1 948 1042
Final model- logit. Since accuracy is higher with logit model. Some ideas to extend the model accuracy would be: 1. using IV regression as myBestBuy variable is suitable to be considered endogenous. 2. Trying other classification models to improve model performace.