##Step 1. EDA I replaced (manually with ctrl+h in excel) the long strings with shorter categorical descriptions.
sum(is.na(data))
## [1] 0
str(data)
## tibble [113 x 33] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:113] 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr [1:113] "1" "1" "0" "1" ...
## $ age : chr [1:113] "1" "1" "1" "1" ...
## $ status : chr [1:113] "0" "0" "2" "0" ...
## $ income : chr [1:113] "0" "0" "0" "0" ...
## $ visitNo : chr [1:113] "3" "3" "2" "3" ...
## $ method : chr [1:113] "0" "2" "0" "2" ...
## $ timeSpend : chr [1:113] "1" "0" "1" "0" ...
## $ location : chr [1:113] "0" "1" "2" "2" ...
## $ membershipCard : chr [1:113] "0" "0" "0" "1" ...
## $ itemPurchaseCoffee : chr [1:113] "1" "1" "1" "1" ...
## $ itempurchaseCold : chr [1:113] "1" "1" "1" "1" ...
## $ itemPurchasePastries : chr [1:113] "1" "1" "1" "1" ...
## $ itemPurchaseJuices : chr [1:113] "1" "1" "1" "1" ...
## $ itemPurchaseSandwiches: chr [1:113] "1" "1" "1" "1" ...
## $ itemPurchaseOthers : chr [1:113] "1" "1" "1" "1" ...
## $ spendPurchase : chr [1:113] "1" "1" "1" "1" ...
## $ productRate : chr [1:113] "4" "4" "4" "2" ...
## $ priceRate : chr [1:113] "3" "3" "3" "1" ...
## $ promoRate : chr [1:113] "5" "4" "4" "4" ...
## $ ambianceRate : chr [1:113] "5" "4" "4" "3" ...
## $ wifiRate : chr [1:113] "4" "4" "4" "3" ...
## $ serviceRate : chr [1:113] "4" "5" "4" "3" ...
## $ chooseRate : chr [1:113] "3" "2" "3" "3" ...
## $ promoMethodApp : chr [1:113] "1" "1" "1" "1" ...
## $ promoMethodSoc : chr [1:113] "1" "1" "1" "1" ...
## $ promoMethodEmail : chr [1:113] "1" "1" "1" "1" ...
## $ promoMethodDeal : chr [1:113] "1" "1" "1" "1" ...
## $ promoMethodFriend : chr [1:113] "1" "1" "1" "1" ...
## $ promoMethodDisplay : chr [1:113] "1" "1" "1" "1" ...
## $ promoMethodBillboard : chr [1:113] "1" "1" "1" "1" ...
## $ promoMethodOthers : chr [1:113] "1" "1" "1" "1" ...
## $ loyal : chr [1:113] "0" "0" "0" "1" ...
##Step 3. Logistic Regression
names(data)
## [1] "Id" "gender" "age"
## [4] "status" "income" "visitNo"
## [7] "method" "timeSpend" "location"
## [10] "membershipCard" "itemPurchaseCoffee" "itempurchaseCold"
## [13] "itemPurchasePastries" "itemPurchaseJuices" "itemPurchaseSandwiches"
## [16] "itemPurchaseOthers" "spendPurchase" "productRate"
## [19] "priceRate" "promoRate" "ambianceRate"
## [22] "wifiRate" "serviceRate" "chooseRate"
## [25] "promoMethodApp" "promoMethodSoc" "promoMethodEmail"
## [28] "promoMethodDeal" "promoMethodFriend" "promoMethodDisplay"
## [31] "promoMethodBillboard" "promoMethodOthers" "loyal"
testData <- data[, c("gender","age","status","income", "visitNo","method", "timeSpend","location","membershipCard", "spendPurchase", "productRate", "priceRate","promoRate","ambianceRate","wifiRate","serviceRate","chooseRate", "loyal")]
str(testData)
## tibble [113 x 18] (S3: tbl_df/tbl/data.frame)
## $ gender : chr [1:113] "1" "1" "0" "1" ...
## $ age : chr [1:113] "1" "1" "1" "1" ...
## $ status : chr [1:113] "0" "0" "2" "0" ...
## $ income : chr [1:113] "0" "0" "0" "0" ...
## $ visitNo : chr [1:113] "3" "3" "2" "3" ...
## $ method : chr [1:113] "0" "2" "0" "2" ...
## $ timeSpend : chr [1:113] "1" "0" "1" "0" ...
## $ location : chr [1:113] "0" "1" "2" "2" ...
## $ membershipCard: chr [1:113] "0" "0" "0" "1" ...
## $ spendPurchase : chr [1:113] "1" "1" "1" "1" ...
## $ productRate : chr [1:113] "4" "4" "4" "2" ...
## $ priceRate : chr [1:113] "3" "3" "3" "1" ...
## $ promoRate : chr [1:113] "5" "4" "4" "4" ...
## $ ambianceRate : chr [1:113] "5" "4" "4" "3" ...
## $ wifiRate : chr [1:113] "4" "4" "4" "3" ...
## $ serviceRate : chr [1:113] "4" "5" "4" "3" ...
## $ chooseRate : chr [1:113] "3" "2" "3" "3" ...
## $ loyal : chr [1:113] "0" "0" "0" "1" ...
testData <- as.data.frame(unclass(testData))
str(testData)
## 'data.frame': 113 obs. of 18 variables:
## $ gender : chr "1" "1" "0" "1" ...
## $ age : chr "1" "1" "1" "1" ...
## $ status : chr "0" "0" "2" "0" ...
## $ income : chr "0" "0" "0" "0" ...
## $ visitNo : chr "3" "3" "2" "3" ...
## $ method : chr "0" "2" "0" "2" ...
## $ timeSpend : chr "1" "0" "1" "0" ...
## $ location : chr "0" "1" "2" "2" ...
## $ membershipCard: chr "0" "0" "0" "1" ...
## $ spendPurchase : chr "1" "1" "1" "1" ...
## $ productRate : chr "4" "4" "4" "2" ...
## $ priceRate : chr "3" "3" "3" "1" ...
## $ promoRate : chr "5" "4" "4" "4" ...
## $ ambianceRate : chr "5" "4" "4" "3" ...
## $ wifiRate : chr "4" "4" "4" "3" ...
## $ serviceRate : chr "4" "5" "4" "3" ...
## $ chooseRate : chr "3" "2" "3" "3" ...
## $ loyal : chr "0" "0" "0" "1" ...
testData$loyal <- as.numeric(testData$loyal)
mylrm <- glm(loyal ~ membershipCard + spendPurchase, data = testData, family = "binomial")
summary(mylrm)
##
## Call:
## glm(formula = loyal ~ membershipCard + spendPurchase, family = "binomial",
## data = testData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.49177 -0.56785 -0.50889 -0.00018 2.64643
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7782 1.0607 -0.734 0.46312
## membershipCard1 1.4924 0.5763 2.590 0.00961 **
## spendPurchase1 -0.9650 1.0042 -0.961 0.33655
## spendPurchase2 -2.6929 1.1325 -2.378 0.01741 *
## spendPurchase3 -17.1260 1449.4126 -0.012 0.99057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114.19 on 112 degrees of freedom
## Residual deviance: 89.93 on 108 degrees of freedom
## AIC: 99.93
##
## Number of Fisher Scoring iterations: 16
#Check for multicollinearity
car::vif(mylrm)
## GVIF Df GVIF^(1/(2*Df))
## membershipCard 1.004198 1 1.002097
## spendPurchase 1.004198 3 1.000698
mylrm <- glm(loyal ~ productRate+priceRate+promoRate+ambianceRate+wifiRate+serviceRate+chooseRate, data = testData, family = "binomial")
summary(mylrm)
##
## Call:
## glm(formula = loyal ~ productRate + priceRate + promoRate + ambianceRate +
## wifiRate + serviceRate + chooseRate, family = "binomial",
## data = testData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.45749 -0.28016 -0.04272 -0.00009 2.95754
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -18.3843 9224.4222 -0.002 0.9984
## productRate2 -12.3298 6522.6404 -0.002 0.9985
## productRate3 -18.8299 6522.6388 -0.003 0.9977
## productRate4 -19.4058 6522.6389 -0.003 0.9976
## productRate5 -18.2632 6522.6390 -0.003 0.9978
## priceRate2 2.5087 1.6777 1.495 0.1348
## priceRate3 -2.8877 1.9093 -1.512 0.1304
## priceRate4 -3.1091 1.6645 -1.868 0.0618 .
## priceRate5 -13.8879 1979.8687 -0.007 0.9944
## promoRate2 11.0402 14.2299 0.776 0.4378
## promoRate3 4.9075 13.9566 0.352 0.7251
## promoRate4 6.8613 13.9846 0.491 0.6237
## promoRate5 7.1557 14.0543 0.509 0.6107
## ambianceRate2 28.4292 6522.6420 0.004 0.9965
## ambianceRate3 23.1830 6522.6406 0.004 0.9972
## ambianceRate4 20.8519 6522.6406 0.003 0.9974
## ambianceRate5 21.8504 6522.6405 0.003 0.9973
## wifiRate2 -2.9357 2.8928 -1.015 0.3102
## wifiRate3 0.3290 1.7514 0.188 0.8510
## wifiRate4 0.3088 1.9520 0.158 0.8743
## wifiRate5 1.1112 2.9793 0.373 0.7092
## serviceRate3 6.0635 6.1931 0.979 0.3275
## serviceRate4 8.7357 6.4557 1.353 0.1760
## serviceRate5 3.0207 6.3375 0.477 0.6336
## chooseRate2 2.5504 2.6645 0.957 0.3385
## chooseRate3 -1.3799 2.2750 -0.607 0.5442
## chooseRate4 -0.2193 2.1804 -0.101 0.9199
## chooseRate5 -2.0092 2.5364 -0.792 0.4283
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114.191 on 112 degrees of freedom
## Residual deviance: 40.402 on 85 degrees of freedom
## AIC: 96.402
##
## Number of Fisher Scoring iterations: 17
#Check for multicollinearity
car::vif(mylrm)
## GVIF Df GVIF^(1/(2*Df))
## productRate 5.314527 4 1.232205
## priceRate 10.002806 4 1.333568
## promoRate 8.504721 4 1.306795
## ambianceRate 9.713870 4 1.328691
## wifiRate 14.145982 4 1.392609
## serviceRate 6.564192 3 1.368349
## chooseRate 7.342216 4 1.283005
mylrm <- glm(loyal ~ gender + age + status + income + visitNo + method + timeSpend +location +membershipCard +spendPurchase, data = testData, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mylrm)
##
## Call:
## glm(formula = loyal ~ gender + age + status + income + visitNo +
## method + timeSpend + location + membershipCard + spendPurchase,
## family = "binomial", data = testData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.80388 -0.37898 -0.00004 0.00000 2.11522
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.866e+01 1.611e+04 -0.001 0.9991
## gender1 -2.756e-01 8.418e-01 -0.327 0.7434
## age1 -4.487e-02 1.203e+00 -0.037 0.9702
## age2 2.013e-01 2.033e+00 0.099 0.9211
## age3 1.063e+00 2.979e+00 0.357 0.7213
## status1 -5.246e+00 2.706e+00 -1.938 0.0526 .
## status2 -1.618e+00 1.295e+00 -1.249 0.2117
## status3 -1.720e+01 1.782e+04 -0.001 0.9992
## income1 1.826e+00 1.347e+00 1.355 0.1753
## income2 6.227e-01 1.695e+00 0.367 0.7133
## income3 4.882e+00 2.705e+00 1.805 0.0711 .
## income4 2.021e+01 3.651e+03 0.006 0.9956
## visitNo1 1.784e+00 1.805e+04 0.000 0.9999
## visitNo2 -1.457e+01 1.700e+04 -0.001 0.9993
## visitNo3 1.871e+01 1.611e+04 0.001 0.9991
## method1 -1.053e+00 1.555e+00 -0.677 0.4984
## method2 9.585e-01 1.059e+00 0.905 0.3652
## method5 -1.626e+01 2.923e+04 -0.001 0.9996
## timeSpend1 1.512e+00 1.004e+00 1.505 0.1322
## timeSpend2 6.402e-01 1.948e+00 0.329 0.7425
## timeSpend3 8.929e-01 2.951e+04 0.000 1.0000
## timeSpend4 6.060e+01 1.673e+04 0.004 0.9971
## location1 7.094e-01 1.240e+00 0.572 0.5673
## location2 9.840e-01 1.180e+00 0.834 0.4042
## membershipCard1 1.719e+00 9.132e-01 1.883 0.0597 .
## spendPurchase1 -2.811e+00 1.990e+00 -1.413 0.1578
## spendPurchase2 -5.377e+00 2.630e+00 -2.045 0.0409 *
## spendPurchase3 -3.538e+01 8.779e+03 -0.004 0.9968
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114.191 on 112 degrees of freedom
## Residual deviance: 56.119 on 85 degrees of freedom
## AIC: 112.12
##
## Number of Fisher Scoring iterations: 20
#Check for multicollinearity
car::vif(mylrm)
## GVIF Df GVIF^(1/(2*Df))
## gender 1.567128 1 1.251850
## age 10.110098 3 1.470480
## status 10.234150 3 1.473472
## income 22.675234 4 1.477216
## visitNo 1.980286 3 1.120610
## method 3.010769 3 1.201654
## timeSpend 3.948144 4 1.187269
## location 2.306077 2 1.232306
## membershipCard 1.666113 1 1.290780
## spendPurchase 8.058733 3 1.415939
##Step 4. Propensity score We select the independent variables that have a p-value less than or equal to 0.05 to be in the PS formula. That list includes: membershipCard (1) spendPurchase (2)
We also include the independent variables that have a p-value less than or equal to be in the PS formula. That list includes: priceRate (4): 0.0618 income (3): 0.0711 membershipCard (1): 0.0597 spendPurchase (2): 0.0409
#mylrm2 <- glm(loyal~ gender + age + status + income + visitNo + method + timeSpend +location +membershipCard +spendPurchase + productRate +priceRate +promoRate +ambianceRate +wifiRate +serviceRate +chooseRate, data = testData, family = "binomial")
#summary(mylrm2)
##glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
##Check for multicollinearity
#car::vif(mylrm2)
##... every variable has a GVIF of over 100???
#Now we generate the formula:
psformula <- formula("loyal ~ membershipCard + spendPurchase + priceRate + income")
#test <- glm(loyal ~ membershipCard + spendPurchase + priceRate + income, data = testData, family = "binomial")
#summary(test)
#Now we generate:
psmodel <- glm(psformula, family = "binomial", data = testData)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
testData$ps <- fitted(psmodel)
#Plot to look for overlap (common support):
testData %>%
ggplot(aes(x = ps)) +
geom_histogram(color = "white") +
facet_wrap(~loyal)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
testData$psweights <- with(testData, ifelse(loyal == 1, mean(ps)/ps, mean(1-ps)/(1-ps)))
with(testData, by(psweights, loyal, summary))
## loyal: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7965 0.8005 0.8169 0.9932 0.9006 3.7299
## ------------------------------------------------------------
## loyal: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2125 0.2588 0.2719 0.6622 0.6130 3.9076
design <- svydesign(ids = ~0, weights = testData$psweights, data = testData)
testData$loyal <- as.numeric(testData$loyal)
testData$membershipCard <- as.numeric(testData$membershipCard)
testData$spendPurchase <- as.numeric(testData$spendPurchase)
testData$priceRate <- as.numeric(testData$priceRate)
testData$income <- as.numeric(testData$income)
#balanceiv <- bal.stat(design$variables, estimand = "ATT", w.all = testData$psweights, vars = c("membershipCard", "spendPurchase", "priceRate", "income"), sampw = 1, get.ks = FALSE, treat.var = "loyal", multinom = FALSE)
#balanceiv
#balanceiv[["results"]][["p"]]
#> balanceiv[["results"]][["p"]]
#[1] 0.6604018 0.9204247 0.6255584 0.6729542
weighteddesign <- svydesign(ids = ~0, weights = testData$psweights, data = testData)
weighteddesign <- as.svrepdesign(weighteddesign, type = c("bootstrap"), replicates = 5000)
#weightedmodel <- svyglm(loyal ~ membershipCard + spendPurchase + priceRate + income, design = weighteddesign, family = binomial())
##################
testData$loyal <- as.numeric(testData$loyal)
testData$membershipCard <- as.numeric(testData$membershipCard)
testData$spendPurchase <- as.numeric(testData$spendPurchase)
testData$priceRate <- as.numeric(testData$priceRate)
testData$income <- as.numeric(testData$income)
slr <- glm(factor(membershipCard) ~ spendPurchase + priceRate + income, data = testData, family = "binomial")
summary(slr)
##
## Call:
## glm(formula = factor(membershipCard) ~ spendPurchase + priceRate +
## income, family = "binomial", data = testData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.609 -1.041 -0.351 1.036 2.098
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.6455 0.7130 2.308 0.0210 *
## spendPurchase -0.4617 0.3261 -1.416 0.1569
## priceRate -0.2104 0.1968 -1.069 0.2850
## income -0.7118 0.2445 -2.911 0.0036 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 156.22 on 112 degrees of freedom
## Residual deviance: 138.05 on 109 degrees of freedom
## AIC: 146.05
##
## Number of Fisher Scoring iterations: 4
psformula0 <- formula("factor(membershipCard) ~ + spendPurchase + priceRate + income")
psmodel0 <- glm(psformula0, family = "binomial", data = testData)
testData$ps0 <- fitted(psmodel0)
testData %>%
ggplot(aes(x = ps)) +
geom_histogram(color = "white") +
facet_wrap(~factor(membershipCard))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
testData$psweights0 <- with(testData, ifelse(loyal == 1, mean(ps)/ps, mean(1-ps)/(1-ps)))
with(testData, by(psweights0, membershipCard, summary))
## membershipCard: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3515 0.7973 0.8040 0.8834 0.8392 3.9076
## ------------------------------------------------------------
## membershipCard: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2125 0.3905 0.8169 0.9738 0.9823 3.7299
design0 <- svydesign(ids = ~0, weights = testData$psweights0, data = testData)
balanceiv0 <- bal.stat(design0$variables, estimand = "ATT", w.all = testData$psweights0, vars = c("spendPurchase", "priceRate" , "income"), sampw = 1, get.ks = FALSE, treat.var = "membershipCard", multinom = FALSE)
balanceiv0
## $results
## tx.mn tx.sd ct.mn ct.sd std.eff.sz stat
## spendPurchase 1.2509503 0.6166588 1.630512 0.6714477 -0.6155134 -2.7570098
## priceRate 2.7303727 1.0739518 2.972481 1.0931891 -0.2254368 -0.9321782
## income 0.2636492 0.6994451 1.126205 1.1471524 -1.2332001 -4.6669326
## p
## spendPurchase 6.821837e-03
## priceRate 3.532676e-01
## income 8.601353e-06
balanceiv0[["results"]][["p"]]
## [1] 6.821837e-03 3.532676e-01 8.601353e-06
weighteddesign0 <- svydesign(ids = ~0, weights = testData$psweights0, data = testData)
weighteddesign0 <- as.svrepdesign(weighteddesign0, type = c("bootstrap"), replicates = 5000)
#weightedmodel0 <- svyglm(loyal ~ membershipCard + spendPurchase + priceRate + income, design = weighteddesign0, family = binomial())
#summary(weightedmodel0)
################
##Step 6. 505 categorical method