525 Term Project

##Step 1. EDA I replaced (manually with ctrl+h in excel) the long strings with shorter categorical descriptions.

sum(is.na(data))

## [1] 0

str(data)

## tibble [113 x 33] (S3: tbl_df/tbl/data.frame)
##  $ Id                    : num [1:113] 1 2 3 4 5 6 7 8 9 10 ...
##  $ gender                : chr [1:113] "1" "1" "0" "1" ...
##  $ age                   : chr [1:113] "1" "1" "1" "1" ...
##  $ status                : chr [1:113] "0" "0" "2" "0" ...
##  $ income                : chr [1:113] "0" "0" "0" "0" ...
##  $ visitNo               : chr [1:113] "3" "3" "2" "3" ...
##  $ method                : chr [1:113] "0" "2" "0" "2" ...
##  $ timeSpend             : chr [1:113] "1" "0" "1" "0" ...
##  $ location              : chr [1:113] "0" "1" "2" "2" ...
##  $ membershipCard        : chr [1:113] "0" "0" "0" "1" ...
##  $ itemPurchaseCoffee    : chr [1:113] "1" "1" "1" "1" ...
##  $ itempurchaseCold      : chr [1:113] "1" "1" "1" "1" ...
##  $ itemPurchasePastries  : chr [1:113] "1" "1" "1" "1" ...
##  $ itemPurchaseJuices    : chr [1:113] "1" "1" "1" "1" ...
##  $ itemPurchaseSandwiches: chr [1:113] "1" "1" "1" "1" ...
##  $ itemPurchaseOthers    : chr [1:113] "1" "1" "1" "1" ...
##  $ spendPurchase         : chr [1:113] "1" "1" "1" "1" ...
##  $ productRate           : chr [1:113] "4" "4" "4" "2" ...
##  $ priceRate             : chr [1:113] "3" "3" "3" "1" ...
##  $ promoRate             : chr [1:113] "5" "4" "4" "4" ...
##  $ ambianceRate          : chr [1:113] "5" "4" "4" "3" ...
##  $ wifiRate              : chr [1:113] "4" "4" "4" "3" ...
##  $ serviceRate           : chr [1:113] "4" "5" "4" "3" ...
##  $ chooseRate            : chr [1:113] "3" "2" "3" "3" ...
##  $ promoMethodApp        : chr [1:113] "1" "1" "1" "1" ...
##  $ promoMethodSoc        : chr [1:113] "1" "1" "1" "1" ...
##  $ promoMethodEmail      : chr [1:113] "1" "1" "1" "1" ...
##  $ promoMethodDeal       : chr [1:113] "1" "1" "1" "1" ...
##  $ promoMethodFriend     : chr [1:113] "1" "1" "1" "1" ...
##  $ promoMethodDisplay    : chr [1:113] "1" "1" "1" "1" ...
##  $ promoMethodBillboard  : chr [1:113] "1" "1" "1" "1" ...
##  $ promoMethodOthers     : chr [1:113] "1" "1" "1" "1" ...
##  $ loyal                 : chr [1:113] "0" "0" "0" "1" ...

##Step 3. Logistic Regression

names(data)

##  [1] "Id"                     "gender"                 "age"                   
##  [4] "status"                 "income"                 "visitNo"               
##  [7] "method"                 "timeSpend"              "location"              
## [10] "membershipCard"         "itemPurchaseCoffee"     "itempurchaseCold"      
## [13] "itemPurchasePastries"   "itemPurchaseJuices"     "itemPurchaseSandwiches"
## [16] "itemPurchaseOthers"     "spendPurchase"          "productRate"           
## [19] "priceRate"              "promoRate"              "ambianceRate"          
## [22] "wifiRate"               "serviceRate"            "chooseRate"            
## [25] "promoMethodApp"         "promoMethodSoc"         "promoMethodEmail"      
## [28] "promoMethodDeal"        "promoMethodFriend"      "promoMethodDisplay"    
## [31] "promoMethodBillboard"   "promoMethodOthers"      "loyal"

testData <- data[, c("gender","age","status","income", "visitNo","method", "timeSpend","location","membershipCard", "spendPurchase", "productRate", "priceRate","promoRate","ambianceRate","wifiRate","serviceRate","chooseRate", "loyal")]
str(testData)

## tibble [113 x 18] (S3: tbl_df/tbl/data.frame)
##  $ gender        : chr [1:113] "1" "1" "0" "1" ...
##  $ age           : chr [1:113] "1" "1" "1" "1" ...
##  $ status        : chr [1:113] "0" "0" "2" "0" ...
##  $ income        : chr [1:113] "0" "0" "0" "0" ...
##  $ visitNo       : chr [1:113] "3" "3" "2" "3" ...
##  $ method        : chr [1:113] "0" "2" "0" "2" ...
##  $ timeSpend     : chr [1:113] "1" "0" "1" "0" ...
##  $ location      : chr [1:113] "0" "1" "2" "2" ...
##  $ membershipCard: chr [1:113] "0" "0" "0" "1" ...
##  $ spendPurchase : chr [1:113] "1" "1" "1" "1" ...
##  $ productRate   : chr [1:113] "4" "4" "4" "2" ...
##  $ priceRate     : chr [1:113] "3" "3" "3" "1" ...
##  $ promoRate     : chr [1:113] "5" "4" "4" "4" ...
##  $ ambianceRate  : chr [1:113] "5" "4" "4" "3" ...
##  $ wifiRate      : chr [1:113] "4" "4" "4" "3" ...
##  $ serviceRate   : chr [1:113] "4" "5" "4" "3" ...
##  $ chooseRate    : chr [1:113] "3" "2" "3" "3" ...
##  $ loyal         : chr [1:113] "0" "0" "0" "1" ...

testData <- as.data.frame(unclass(testData))
str(testData)

## 'data.frame':    113 obs. of  18 variables:
##  $ gender        : chr  "1" "1" "0" "1" ...
##  $ age           : chr  "1" "1" "1" "1" ...
##  $ status        : chr  "0" "0" "2" "0" ...
##  $ income        : chr  "0" "0" "0" "0" ...
##  $ visitNo       : chr  "3" "3" "2" "3" ...
##  $ method        : chr  "0" "2" "0" "2" ...
##  $ timeSpend     : chr  "1" "0" "1" "0" ...
##  $ location      : chr  "0" "1" "2" "2" ...
##  $ membershipCard: chr  "0" "0" "0" "1" ...
##  $ spendPurchase : chr  "1" "1" "1" "1" ...
##  $ productRate   : chr  "4" "4" "4" "2" ...
##  $ priceRate     : chr  "3" "3" "3" "1" ...
##  $ promoRate     : chr  "5" "4" "4" "4" ...
##  $ ambianceRate  : chr  "5" "4" "4" "3" ...
##  $ wifiRate      : chr  "4" "4" "4" "3" ...
##  $ serviceRate   : chr  "4" "5" "4" "3" ...
##  $ chooseRate    : chr  "3" "2" "3" "3" ...
##  $ loyal         : chr  "0" "0" "0" "1" ...

testData$loyal <- as.numeric(testData$loyal)
mylrm <- glm(loyal ~ membershipCard + spendPurchase, data = testData, family = "binomial")
summary(mylrm)

## 
## Call:
## glm(formula = loyal ~ membershipCard + spendPurchase, family = "binomial", 
##     data = testData)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.49177  -0.56785  -0.50889  -0.00018   2.64643  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)       -0.7782     1.0607  -0.734  0.46312   
## membershipCard1    1.4924     0.5763   2.590  0.00961 **
## spendPurchase1    -0.9650     1.0042  -0.961  0.33655   
## spendPurchase2    -2.6929     1.1325  -2.378  0.01741 * 
## spendPurchase3   -17.1260  1449.4126  -0.012  0.99057   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114.19  on 112  degrees of freedom
## Residual deviance:  89.93  on 108  degrees of freedom
## AIC: 99.93
## 
## Number of Fisher Scoring iterations: 16

#Check for multicollinearity
car::vif(mylrm)

##                    GVIF Df GVIF^(1/(2*Df))
## membershipCard 1.004198  1        1.002097
## spendPurchase  1.004198  3        1.000698

mylrm <- glm(loyal ~ productRate+priceRate+promoRate+ambianceRate+wifiRate+serviceRate+chooseRate, data = testData, family = "binomial")

summary(mylrm)

## 
## Call:
## glm(formula = loyal ~ productRate + priceRate + promoRate + ambianceRate + 
##     wifiRate + serviceRate + chooseRate, family = "binomial", 
##     data = testData)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.45749  -0.28016  -0.04272  -0.00009   2.95754  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -18.3843  9224.4222  -0.002   0.9984  
## productRate2   -12.3298  6522.6404  -0.002   0.9985  
## productRate3   -18.8299  6522.6388  -0.003   0.9977  
## productRate4   -19.4058  6522.6389  -0.003   0.9976  
## productRate5   -18.2632  6522.6390  -0.003   0.9978  
## priceRate2       2.5087     1.6777   1.495   0.1348  
## priceRate3      -2.8877     1.9093  -1.512   0.1304  
## priceRate4      -3.1091     1.6645  -1.868   0.0618 .
## priceRate5     -13.8879  1979.8687  -0.007   0.9944  
## promoRate2      11.0402    14.2299   0.776   0.4378  
## promoRate3       4.9075    13.9566   0.352   0.7251  
## promoRate4       6.8613    13.9846   0.491   0.6237  
## promoRate5       7.1557    14.0543   0.509   0.6107  
## ambianceRate2   28.4292  6522.6420   0.004   0.9965  
## ambianceRate3   23.1830  6522.6406   0.004   0.9972  
## ambianceRate4   20.8519  6522.6406   0.003   0.9974  
## ambianceRate5   21.8504  6522.6405   0.003   0.9973  
## wifiRate2       -2.9357     2.8928  -1.015   0.3102  
## wifiRate3        0.3290     1.7514   0.188   0.8510  
## wifiRate4        0.3088     1.9520   0.158   0.8743  
## wifiRate5        1.1112     2.9793   0.373   0.7092  
## serviceRate3     6.0635     6.1931   0.979   0.3275  
## serviceRate4     8.7357     6.4557   1.353   0.1760  
## serviceRate5     3.0207     6.3375   0.477   0.6336  
## chooseRate2      2.5504     2.6645   0.957   0.3385  
## chooseRate3     -1.3799     2.2750  -0.607   0.5442  
## chooseRate4     -0.2193     2.1804  -0.101   0.9199  
## chooseRate5     -2.0092     2.5364  -0.792   0.4283  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114.191  on 112  degrees of freedom
## Residual deviance:  40.402  on  85  degrees of freedom
## AIC: 96.402
## 
## Number of Fisher Scoring iterations: 17

#Check for multicollinearity
car::vif(mylrm)

##                   GVIF Df GVIF^(1/(2*Df))
## productRate   5.314527  4        1.232205
## priceRate    10.002806  4        1.333568
## promoRate     8.504721  4        1.306795
## ambianceRate  9.713870  4        1.328691
## wifiRate     14.145982  4        1.392609
## serviceRate   6.564192  3        1.368349
## chooseRate    7.342216  4        1.283005

mylrm <- glm(loyal ~ gender + age + status +  income + visitNo + method + timeSpend +location +membershipCard +spendPurchase, data = testData, family = "binomial")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(mylrm)

## 
## Call:
## glm(formula = loyal ~ gender + age + status + income + visitNo + 
##     method + timeSpend + location + membershipCard + spendPurchase, 
##     family = "binomial", data = testData)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.80388  -0.37898  -0.00004   0.00000   2.11522  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -1.866e+01  1.611e+04  -0.001   0.9991  
## gender1         -2.756e-01  8.418e-01  -0.327   0.7434  
## age1            -4.487e-02  1.203e+00  -0.037   0.9702  
## age2             2.013e-01  2.033e+00   0.099   0.9211  
## age3             1.063e+00  2.979e+00   0.357   0.7213  
## status1         -5.246e+00  2.706e+00  -1.938   0.0526 .
## status2         -1.618e+00  1.295e+00  -1.249   0.2117  
## status3         -1.720e+01  1.782e+04  -0.001   0.9992  
## income1          1.826e+00  1.347e+00   1.355   0.1753  
## income2          6.227e-01  1.695e+00   0.367   0.7133  
## income3          4.882e+00  2.705e+00   1.805   0.0711 .
## income4          2.021e+01  3.651e+03   0.006   0.9956  
## visitNo1         1.784e+00  1.805e+04   0.000   0.9999  
## visitNo2        -1.457e+01  1.700e+04  -0.001   0.9993  
## visitNo3         1.871e+01  1.611e+04   0.001   0.9991  
## method1         -1.053e+00  1.555e+00  -0.677   0.4984  
## method2          9.585e-01  1.059e+00   0.905   0.3652  
## method5         -1.626e+01  2.923e+04  -0.001   0.9996  
## timeSpend1       1.512e+00  1.004e+00   1.505   0.1322  
## timeSpend2       6.402e-01  1.948e+00   0.329   0.7425  
## timeSpend3       8.929e-01  2.951e+04   0.000   1.0000  
## timeSpend4       6.060e+01  1.673e+04   0.004   0.9971  
## location1        7.094e-01  1.240e+00   0.572   0.5673  
## location2        9.840e-01  1.180e+00   0.834   0.4042  
## membershipCard1  1.719e+00  9.132e-01   1.883   0.0597 .
## spendPurchase1  -2.811e+00  1.990e+00  -1.413   0.1578  
## spendPurchase2  -5.377e+00  2.630e+00  -2.045   0.0409 *
## spendPurchase3  -3.538e+01  8.779e+03  -0.004   0.9968  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114.191  on 112  degrees of freedom
## Residual deviance:  56.119  on  85  degrees of freedom
## AIC: 112.12
## 
## Number of Fisher Scoring iterations: 20

#Check for multicollinearity
car::vif(mylrm)

##                     GVIF Df GVIF^(1/(2*Df))
## gender          1.567128  1        1.251850
## age            10.110098  3        1.470480
## status         10.234150  3        1.473472
## income         22.675234  4        1.477216
## visitNo         1.980286  3        1.120610
## method          3.010769  3        1.201654
## timeSpend       3.948144  4        1.187269
## location        2.306077  2        1.232306
## membershipCard  1.666113  1        1.290780
## spendPurchase   8.058733  3        1.415939

##Step 4. Propensity score We select the independent variables that have a p-value less than or equal to 0.05 to be in the PS formula. That list includes: membershipCard (1) spendPurchase (2)

We also include the independent variables that have a p-value less than or equal to be in the PS formula. That list includes: priceRate (4): 0.0618 income (3): 0.0711 membershipCard (1): 0.0597 spendPurchase (2): 0.0409

#mylrm2 <- glm(loyal~ gender + age + status +  income + visitNo + method + timeSpend +location +membershipCard +spendPurchase + productRate +priceRate +promoRate +ambianceRate +wifiRate +serviceRate +chooseRate, data = testData, family = "binomial")
#summary(mylrm2)
##glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
##Check for multicollinearity
#car::vif(mylrm2)
##... every variable has a GVIF of over 100???

#Now we generate the formula:
psformula <- formula("loyal ~ membershipCard + spendPurchase + priceRate + income")
#test <- glm(loyal ~ membershipCard + spendPurchase + priceRate + income, data = testData, family = "binomial")
#summary(test)

#Now we generate:
psmodel <- glm(psformula, family = "binomial", data = testData)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

testData$ps <- fitted(psmodel)

#Plot to look for overlap (common support):
testData %>%
ggplot(aes(x = ps)) +
geom_histogram(color = "white") +
facet_wrap(~loyal)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

testData$psweights <- with(testData, ifelse(loyal == 1, mean(ps)/ps, mean(1-ps)/(1-ps)))
with(testData, by(psweights, loyal, summary))

## loyal: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7965  0.8005  0.8169  0.9932  0.9006  3.7299 
## ------------------------------------------------------------ 
## loyal: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2125  0.2588  0.2719  0.6622  0.6130  3.9076

design <- svydesign(ids = ~0, weights = testData$psweights, data = testData)

testData$loyal <- as.numeric(testData$loyal)
testData$membershipCard <- as.numeric(testData$membershipCard)
testData$spendPurchase <- as.numeric(testData$spendPurchase)
testData$priceRate <- as.numeric(testData$priceRate)
testData$income <- as.numeric(testData$income)

#balanceiv <- bal.stat(design$variables, estimand = "ATT", w.all = testData$psweights, vars = c("membershipCard", "spendPurchase", "priceRate", "income"), sampw = 1, get.ks = FALSE, treat.var = "loyal", multinom = FALSE)
#balanceiv
#balanceiv[["results"]][["p"]]

#> balanceiv[["results"]][["p"]]
#[1] 0.6604018 0.9204247 0.6255584 0.6729542


weighteddesign <- svydesign(ids = ~0, weights = testData$psweights, data = testData)
weighteddesign <- as.svrepdesign(weighteddesign, type = c("bootstrap"), replicates = 5000)
#weightedmodel <- svyglm(loyal ~ membershipCard + spendPurchase + priceRate + income, design = weighteddesign, family = binomial())

##################

testData$loyal <- as.numeric(testData$loyal)
testData$membershipCard <- as.numeric(testData$membershipCard)
testData$spendPurchase <- as.numeric(testData$spendPurchase)
testData$priceRate <- as.numeric(testData$priceRate)
testData$income <- as.numeric(testData$income)

slr <- glm(factor(membershipCard) ~ spendPurchase + priceRate + income, data = testData, family = "binomial")
summary(slr)

## 
## Call:
## glm(formula = factor(membershipCard) ~ spendPurchase + priceRate + 
##     income, family = "binomial", data = testData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.609  -1.041  -0.351   1.036   2.098  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     1.6455     0.7130   2.308   0.0210 * 
## spendPurchase  -0.4617     0.3261  -1.416   0.1569   
## priceRate      -0.2104     0.1968  -1.069   0.2850   
## income         -0.7118     0.2445  -2.911   0.0036 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 156.22  on 112  degrees of freedom
## Residual deviance: 138.05  on 109  degrees of freedom
## AIC: 146.05
## 
## Number of Fisher Scoring iterations: 4

psformula0 <- formula("factor(membershipCard) ~ + spendPurchase + priceRate + income")
psmodel0 <- glm(psformula0, family = "binomial", data = testData)
testData$ps0 <- fitted(psmodel0)

testData %>%
ggplot(aes(x = ps)) +
geom_histogram(color = "white") +
facet_wrap(~factor(membershipCard))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

testData$psweights0 <- with(testData, ifelse(loyal == 1, mean(ps)/ps, mean(1-ps)/(1-ps)))
with(testData, by(psweights0, membershipCard, summary))

## membershipCard: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3515  0.7973  0.8040  0.8834  0.8392  3.9076 
## ------------------------------------------------------------ 
## membershipCard: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2125  0.3905  0.8169  0.9738  0.9823  3.7299

design0 <- svydesign(ids = ~0, weights = testData$psweights0, data = testData)



balanceiv0 <- bal.stat(design0$variables, estimand = "ATT", w.all = testData$psweights0, vars = c("spendPurchase", "priceRate" , "income"), sampw = 1, get.ks = FALSE, treat.var = "membershipCard", multinom = FALSE)

balanceiv0

## $results
##                   tx.mn     tx.sd    ct.mn     ct.sd std.eff.sz       stat
## spendPurchase 1.2509503 0.6166588 1.630512 0.6714477 -0.6155134 -2.7570098
## priceRate     2.7303727 1.0739518 2.972481 1.0931891 -0.2254368 -0.9321782
## income        0.2636492 0.6994451 1.126205 1.1471524 -1.2332001 -4.6669326
##                          p
## spendPurchase 6.821837e-03
## priceRate     3.532676e-01
## income        8.601353e-06

balanceiv0[["results"]][["p"]]

## [1] 6.821837e-03 3.532676e-01 8.601353e-06

weighteddesign0 <- svydesign(ids = ~0, weights = testData$psweights0, data = testData)
weighteddesign0 <- as.svrepdesign(weighteddesign0, type = c("bootstrap"), replicates = 5000)
#weightedmodel0 <- svyglm(loyal ~ membershipCard + spendPurchase + priceRate + income, design = weighteddesign0, family = binomial())
#summary(weightedmodel0)

################

##Step 6. 505 categorical method

525 Term Project

Ruoran Lin, Tanuj Babele, Shidong Li

1/28/2021