1 Setup

Library

library(tidyverse)
library(dplyr)
library(ggplot2)
library(feather)
library(gridExtra)
library(GGally) 
library(DescTools) # complete decimal

Data

raw <- read.csv('data_1216.csv', header = TRUE, encoding = 'UTF-8') 
# colnames(raw)
raw.kp <- raw[, c(1, 3:8, 10, 12, 20:28, 31, 32)]
names(raw.kp)[names(raw.kp) == "TravelDist3"] <- "TravelDist"
colnames(raw.kp)
##  [1] "ID"         "Recency"    "TMed"       "RTRatio"    "Frequency" 
##  [6] "Monetary"   "AvgPmt"     "DailyQpon"  "DailyNote"  "NightType" 
## [11] "ChildType"  "GuestNum"   "Device"     "BnbType"    "DoW"       
## [16] "OrderPre"   "Season"     "Area"       "TravelDist" "tmedlog"

2 Data Exploration

2.1 TMed (Purchasing Cycle) vs Numerical variables

p2 = raw %>% 
  filter(., Frequency > 1) %>% 
  ggpairs(c(4,  9:12)) 
p2

  1. Since TMed is calcualted by Frequency, and RTRatio is calculated by TMed so we would drop these two variables.

  2. The distribution of TMed is apparently right skewed, which means we can do log transformation.

2.2 TMed

p4.1 = raw %>% 
  filter(., Frequency > 1) %>% 
  ggplot(aes(TMed)) +
  geom_histogram(bins = 50)

p4.2 = raw %>% 
  filter(., Frequency > 1) %>% 
  ggplot(aes(x = factor(1), TMed)) +
  geom_boxplot()

grid.arrange(p4.1, p4.2)

3 Model Building

3.1 Multiple Linear Regression (MLR)

  • Goal:predict purchasing cycle of customers TMed

  • Data:the customer who make more than one purchase (old customer)

res.all = raw.kp
res.allf2 = filter(res.all, Frequency > 1)
colnames(res.allf2)
##  [1] "ID"         "Recency"    "TMed"       "RTRatio"    "Frequency" 
##  [6] "Monetary"   "AvgPmt"     "DailyQpon"  "DailyNote"  "NightType" 
## [11] "ChildType"  "GuestNum"   "Device"     "BnbType"    "DoW"       
## [16] "OrderPre"   "Season"     "Area"       "TravelDist" "tmedlog"

3.1.1 Model1

  1. R squared = 0.2863,Residual standard error = 112.8, model doesn’t fit well.
mlr.1 = lm(TMed ~ ., data = res.allf2[, c(2, 3, 6:19)])
summary(mlr.1)
## 
## Call:
## lm(formula = TMed ~ ., data = res.allf2[, c(2, 3, 6:19)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -296.82  -75.88  -19.13   45.67  500.22 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.817e+01  2.455e+01  -1.147 0.251279    
## Recency           -2.588e-01  5.882e-03 -44.000  < 2e-16 ***
## Monetary           2.168e-03  3.544e-04   6.119 9.72e-10 ***
## AvgPmt             2.101e-03  3.335e-03   0.630 0.528632    
## DailyQpon         -2.559e+01  2.806e+00  -9.117  < 2e-16 ***
## DailyNote         -1.331e+00  2.775e+00  -0.480 0.631431    
## NightTypemost_4up  3.003e+01  5.785e+00   5.190 2.13e-07 ***
## NightTypeother     1.925e+01  4.428e+00   4.347 1.39e-05 ***
## ChildTypexa_child  4.915e+01  6.416e+00   7.661 1.98e-14 ***
## GuestNumsingle    -1.486e+01  4.144e+00  -3.585 0.000338 ***
## Devicemobile       2.336e+01  3.019e+00   7.738 1.09e-14 ***
## Deviceneutral      5.211e+01  3.011e+00  17.307  < 2e-16 ***
## Devicepc           3.702e+01  2.893e+00  12.799  < 2e-16 ***
## BnbTypeelse.apt   -1.394e+01  3.567e+00  -3.908 9.34e-05 ***
## BnbTypeneutral     9.383e+00  2.612e+00   3.593 0.000328 ***
## DoWweekend         9.362e+00  2.692e+00   3.478 0.000507 ***
## DoWxspecial        1.030e+01  2.267e+00   4.545 5.54e-06 ***
## OrderPreearly+     1.690e+01  8.991e+00   1.880 0.060166 .  
## OrderPrenear       2.855e+01  7.315e+00   3.902 9.58e-05 ***
## OrderPrenormal     1.683e+01  7.845e+00   2.145 0.031936 *  
## OrderPrenormal+    9.087e+00  8.193e+00   1.109 0.267369    
## Seasonno_special   1.311e+02  3.799e+00  34.496  < 2e-16 ***
## Seasonspring       7.866e+01  4.603e+00  17.089  < 2e-16 ***
## Seasonsummer       5.593e+01  4.079e+00  13.712  < 2e-16 ***
## Seasonwinter       6.916e+01  4.726e+00  14.633  < 2e-16 ***
## AreaEastSouthAsia -8.234e+00  2.376e+01  -0.347 0.728906    
## AreaOthers        -2.697e+00  2.666e+01  -0.101 0.919447    
## TravelDistP_Far   -6.499e+00  3.063e+01  -0.212 0.831983    
## TravelDistP_Local  1.000e+01  2.199e+01   0.455 0.649278    
## TravelDistP_Short  1.689e+01  2.198e+01   0.769 0.442179    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 112.8 on 12838 degrees of freedom
## Multiple R-squared:  0.2863, Adjusted R-squared:  0.2847 
## F-statistic: 177.6 on 29 and 12838 DF,  p-value: < 2.2e-16
  1. Checking Model Assumptions According to the QQ-Plot, the model violates the assumption of normality.

  2. Outliers The cood distance of data point is almost smaller than 0.5, so we’re going keep all data.

plot(mlr.1)

3.1.2 Model2

Recall that TMed’s distriution is skewed to right so that it would be better to take log. But before that, we would like to run a suitable Box-Cox transformation of the dependent variable:

  1. Box-Cox Transformation
library(lindia)
fit1 = lm(TMed ~., data = res.allf2[, c(2, 3, 6:19)])
gg_boxcox(fit1, showlambda = TRUE, lambdaSF = 3, scale.factor = 0.5)

According to the result of box-cox transformation, the maximum likelihood is when lambda equals 0.22. Since, 0.22 is similar to 0, so we’re going to take log of TMed.

  1. R-squared = 0.3721、Residual standard error = 1.432, the model interpretability is low.
mlr.2 = lm(tmedlog ~ ., data = res.allf2[, c(2, 6:20)])
summary(mlr.2)
## 
## Call:
## lm(formula = tmedlog ~ ., data = res.allf2[, c(2, 6:20)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4958 -0.8291  0.1744  0.9268  4.5847 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.584e-01  3.116e-01   1.792 0.073156 .  
## Recency           -2.312e-03  7.467e-05 -30.965  < 2e-16 ***
## Monetary           3.505e-05  4.499e-06   7.791 7.13e-15 ***
## AvgPmt            -5.017e-05  4.233e-05  -1.185 0.235977    
## DailyQpon         -5.430e-02  3.563e-02  -1.524 0.127467    
## DailyNote         -2.310e-01  3.522e-02  -6.560 5.60e-11 ***
## NightTypemost_4up  4.791e-01  7.344e-02   6.524 7.12e-11 ***
## NightTypeother     2.461e-01  5.621e-02   4.378 1.21e-05 ***
## ChildTypexa_child  9.658e-01  8.144e-02  11.859  < 2e-16 ***
## GuestNumsingle    -1.347e-01  5.261e-02  -2.561 0.010450 *  
## Devicemobile       1.682e-01  3.833e-02   4.388 1.15e-05 ***
## Deviceneutral      4.488e-01  3.823e-02  11.742  < 2e-16 ***
## Devicepc           3.148e-01  3.672e-02   8.573  < 2e-16 ***
## BnbTypeelse.apt   -1.726e-01  4.528e-02  -3.813 0.000138 ***
## BnbTypeneutral    -7.064e-02  3.315e-02  -2.131 0.033120 *  
## DoWweekend         1.686e-01  3.417e-02   4.934 8.17e-07 ***
## DoWxspecial       -6.402e-02  2.878e-02  -2.225 0.026118 *  
## OrderPreearly+     4.962e-01  1.141e-01   4.347 1.39e-05 ***
## OrderPrenear       9.956e-01  9.286e-02  10.721  < 2e-16 ***
## OrderPrenormal     5.599e-01  9.958e-02   5.623 1.92e-08 ***
## OrderPrenormal+    3.505e-01  1.040e-01   3.370 0.000753 ***
## Seasonno_special   2.401e+00  4.823e-02  49.784  < 2e-16 ***
## Seasonspring       1.092e+00  5.843e-02  18.685  < 2e-16 ***
## Seasonsummer       9.672e-01  5.178e-02  18.678  < 2e-16 ***
## Seasonwinter       9.685e-01  6.000e-02  16.142  < 2e-16 ***
## AreaEastSouthAsia -5.242e-01  3.016e-01  -1.738 0.082216 .  
## AreaOthers        -4.610e-01  3.385e-01  -1.362 0.173190    
## TravelDistP_Far   -1.643e-02  3.888e-01  -0.042 0.966298    
## TravelDistP_Local  2.077e-01  2.791e-01   0.744 0.456888    
## TravelDistP_Short  1.001e-01  2.790e-01   0.359 0.719672    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.432 on 12838 degrees of freedom
## Multiple R-squared:  0.3721, Adjusted R-squared:  0.3707 
## F-statistic: 262.4 on 29 and 12838 DF,  p-value: < 2.2e-16
  1. Checking Model Assumption The model doesn’t violate the assumptions.
plot(mlr.2)

QQ plot looks better!

3.1.3 Model3

We’re going to add interaction terms in model3.

mlr.3 <- lm(tmedlog ~ . + Monetary:DailyNote
            + Monetary:TravelDist
            + DailyQpon:Device 
            + DailyQpon:BnbType 
            + DailyQpon:Season 
            + NightType:TravelDist
            + ChildType:Season 
            + GuestNum:OrderPre
            + Device:DoW
            + OrderPre:Season
            + OrderPre:TravelDist
            + Season:TravelDist
            + Monetary:AvgPmt 
            + Monetary:GuestNum 
            + Monetary:Area 
            + Monetary:Season 
            + AvgPmt:DailyQpon
            + AvgPmt:Season 
            + AvgPmt:Area 
            + DailyQpon:DailyNote 
            + DailyQpon:TravelDist 
            + DailyNote:GuestNum
            + DailyNote:Device 
            + DailyNote:Season 
            + DailyNote:Area
            + GuestNum:TravelDist
            + Device:OrderPre 
            + DoW:Season 
            + Area:TravelDist, data = res.allf2[, c(2, 6:20)])

summary(mlr.3)
## 
## Call:
## lm(formula = tmedlog ~ . + Monetary:DailyNote + Monetary:TravelDist + 
##     DailyQpon:Device + DailyQpon:BnbType + DailyQpon:Season + 
##     NightType:TravelDist + ChildType:Season + GuestNum:OrderPre + 
##     Device:DoW + OrderPre:Season + OrderPre:TravelDist + Season:TravelDist + 
##     Monetary:AvgPmt + Monetary:GuestNum + Monetary:Area + Monetary:Season + 
##     AvgPmt:DailyQpon + AvgPmt:Season + AvgPmt:Area + DailyQpon:DailyNote + 
##     DailyQpon:TravelDist + DailyNote:GuestNum + DailyNote:Device + 
##     DailyNote:Season + DailyNote:Area + GuestNum:TravelDist + 
##     Device:OrderPre + DoW:Season + Area:TravelDist, data = res.allf2[, 
##     c(2, 6:20)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3246 -0.7536  0.1662  0.8714  4.2739 
## 
## Coefficients: (6 not defined because of singularities)
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          2.163e+00  2.651e+00   0.816 0.414653    
## Recency                             -2.341e-03  7.347e-05 -31.871  < 2e-16 ***
## Monetary                             1.014e-04  9.318e-05   1.088 0.276406    
## AvgPmt                              -2.518e-04  1.459e-04  -1.725 0.084529 .  
## DailyQpon                            1.041e+00  1.014e+00   1.027 0.304582    
## DailyNote                           -3.691e-01  1.260e-01  -2.930 0.003395 ** 
## NightTypemost_4up                   -3.728e-01  1.212e+00  -0.308 0.758449    
## NightTypeother                      -6.598e-01  8.445e-01  -0.781 0.434618    
## ChildTypexa_child                    5.892e-01  2.154e-01   2.735 0.006241 ** 
## GuestNumsingle                      -1.986e+00  3.310e+00  -0.600 0.548501    
## Devicemobile                        -3.288e-01  2.708e-01  -1.214 0.224652    
## Deviceneutral                       -1.496e-01  3.018e-01  -0.496 0.620109    
## Devicepc                            -4.964e-01  2.437e-01  -2.037 0.041669 *  
## BnbTypeelse.apt                     -7.711e-02  5.339e-02  -1.444 0.148679    
## BnbTypeneutral                      -3.116e-02  3.934e-02  -0.792 0.428416    
## DoWweekend                           2.727e-01  1.279e-01   2.133 0.032979 *  
## DoWxspecial                         -3.359e-01  1.053e-01  -3.189 0.001429 ** 
## OrderPreearly+                       3.399e+00  3.345e+00   1.016 0.309643    
## OrderPrenear                         2.505e+00  2.155e+00   1.162 0.245263    
## OrderPrenormal                      -8.143e-01  3.643e-01  -2.235 0.025414 *  
## OrderPrenormal+                      2.358e-01  3.755e-01   0.628 0.530048    
## Seasonno_special                     1.047e+00  1.603e+00   0.653 0.513762    
## Seasonspring                        -7.174e-01  5.345e-01  -1.342 0.179523    
## Seasonsummer                        -2.820e+00  2.128e+00  -1.325 0.185092    
## Seasonwinter                        -2.050e-01  1.996e+00  -0.103 0.918219    
## AreaEastSouthAsia                   -5.930e-01  1.596e+00  -0.372 0.710211    
## AreaOthers                          -1.758e+00  6.368e-01  -2.761 0.005775 ** 
## TravelDistP_Far                      3.289e-01  2.689e+00   0.122 0.902657    
## TravelDistP_Local                   -1.342e+00  2.629e+00  -0.510 0.609790    
## TravelDistP_Short                   -3.736e-01  2.623e+00  -0.142 0.886767    
## Monetary:DailyNote                   3.312e-05  8.277e-06   4.002 6.32e-05 ***
## Monetary:TravelDistP_Far            -1.108e-04  1.066e-04  -1.039 0.298907    
## Monetary:TravelDistP_Local           3.155e-05  9.246e-05   0.341 0.732977    
## Monetary:TravelDistP_Short          -1.338e-05  9.213e-05  -0.145 0.884497    
## DailyQpon:Devicemobile               2.126e-01  1.038e-01   2.049 0.040524 *  
## DailyQpon:Deviceneutral              3.668e-01  9.589e-02   3.825 0.000132 ***
## DailyQpon:Devicepc                   5.374e-01  9.138e-02   5.881 4.17e-09 ***
## DailyQpon:BnbTypeelse.apt           -2.639e-01  1.126e-01  -2.343 0.019141 *  
## DailyQpon:BnbTypeneutral            -7.393e-02  8.389e-02  -0.881 0.378191    
## DailyQpon:Seasonno_special          -3.732e-01  1.109e-01  -3.366 0.000765 ***
## DailyQpon:Seasonspring               5.290e-01  1.443e-01   3.665 0.000248 ***
## DailyQpon:Seasonsummer               3.663e-01  1.218e-01   3.007 0.002647 ** 
## DailyQpon:Seasonwinter               3.007e-01  1.460e-01   2.059 0.039474 *  
## NightTypemost_4up:TravelDistP_Far    8.655e-01  1.243e+00   0.696 0.486229    
## NightTypeother:TravelDistP_Far       1.125e+00  8.673e-01   1.297 0.194819    
## NightTypemost_4up:TravelDistP_Local -3.382e-01  1.239e+00  -0.273 0.784959    
## NightTypeother:TravelDistP_Local     5.531e-01  8.559e-01   0.646 0.518121    
## NightTypemost_4up:TravelDistP_Short  9.565e-01  1.215e+00   0.787 0.431050    
## NightTypeother:TravelDistP_Short     1.011e+00  8.468e-01   1.194 0.232428    
## ChildTypexa_child:Seasonno_special  -1.295e-01  2.654e-01  -0.488 0.625542    
## ChildTypexa_child:Seasonspring       1.299e+00  3.480e-01   3.732 0.000191 ***
## ChildTypexa_child:Seasonsummer       6.103e-01  2.507e-01   2.434 0.014948 *  
## ChildTypexa_child:Seasonwinter       2.051e-01  3.001e-01   0.684 0.494282    
## GuestNumsingle:OrderPreearly+        6.272e-01  6.243e-01   1.005 0.315124    
## GuestNumsingle:OrderPrenear         -1.125e+00  4.037e-01  -2.787 0.005327 ** 
## GuestNumsingle:OrderPrenormal       -7.089e-01  4.358e-01  -1.627 0.103852    
## GuestNumsingle:OrderPrenormal+      -2.169e-01  4.533e-01  -0.479 0.632290    
## Devicemobile:DoWweekend              1.649e-01  9.558e-02   1.725 0.084505 .  
## Deviceneutral:DoWweekend             4.153e-02  1.020e-01   0.407 0.683814    
## Devicepc:DoWweekend                  9.442e-02  9.243e-02   1.022 0.307006    
## Devicemobile:DoWxspecial             2.070e-01  8.426e-02   2.457 0.014028 *  
## Deviceneutral:DoWxspecial            3.304e-01  8.264e-02   3.998 6.43e-05 ***
## Devicepc:DoWxspecial                 1.619e-01  7.960e-02   2.033 0.042047 *  
## OrderPreearly+:Seasonno_special     -8.486e-01  4.429e-01  -1.916 0.055364 .  
## OrderPrenear:Seasonno_special       -1.409e+00  3.762e-01  -3.745 0.000181 ***
## OrderPrenormal:Seasonno_special      1.564e-02  3.934e-01   0.040 0.968287    
## OrderPrenormal+:Seasonno_special    -7.395e-01  4.086e-01  -1.810 0.070334 .  
## OrderPreearly+:Seasonspring         -6.993e-01  4.240e-01  -1.650 0.099063 .  
## OrderPrenear:Seasonspring           -1.639e-01  3.435e-01  -0.477 0.633161    
## OrderPrenormal:Seasonspring          5.792e-01  3.670e-01   1.578 0.114588    
## OrderPrenormal+:Seasonspring        -2.157e-01  3.801e-01  -0.567 0.570505    
## OrderPreearly+:Seasonsummer         -6.866e-01  4.085e-01  -1.681 0.092820 .  
## OrderPrenear:Seasonsummer           -3.463e-01  3.416e-01  -1.014 0.310612    
## OrderPrenormal:Seasonsummer          4.465e-01  3.614e-01   1.236 0.216589    
## OrderPrenormal+:Seasonsummer        -7.680e-01  3.755e-01  -2.045 0.040866 *  
## OrderPreearly+:Seasonwinter         -1.845e-01  4.208e-01  -0.438 0.661119    
## OrderPrenear:Seasonwinter            5.211e-02  3.510e-01   0.148 0.882001    
## OrderPrenormal:Seasonwinter          8.062e-01  3.758e-01   2.145 0.031951 *  
## OrderPrenormal+:Seasonwinter        -6.282e-01  3.888e-01  -1.616 0.106185    
## OrderPreearly+:TravelDistP_Far      -3.009e+00  3.338e+00  -0.902 0.367297    
## OrderPrenear:TravelDistP_Far        -1.718e+00  2.153e+00  -0.798 0.424684    
## OrderPrenormal:TravelDistP_Far       6.105e-02  3.658e-01   0.167 0.867453    
## OrderPrenormal+:TravelDistP_Far      4.081e-02  3.509e-01   0.116 0.907424    
## OrderPreearly+:TravelDistP_Local    -2.465e+00  3.334e+00  -0.739 0.459652    
## OrderPrenear:TravelDistP_Local      -1.105e+00  2.140e+00  -0.516 0.605832    
## OrderPrenormal:TravelDistP_Local     1.050e+00  2.417e-01   4.345 1.40e-05 ***
## OrderPrenormal+:TravelDistP_Local    7.393e-01  2.415e-01   3.061 0.002214 ** 
## OrderPreearly+:TravelDistP_Short    -2.720e+00  3.320e+00  -0.819 0.412727    
## OrderPrenear:TravelDistP_Short      -1.416e+00  2.130e+00  -0.665 0.506179    
## OrderPrenormal:TravelDistP_Short            NA         NA      NA       NA    
## OrderPrenormal+:TravelDistP_Short           NA         NA      NA       NA    
## Seasonno_special:TravelDistP_Far     2.418e+00  1.558e+00   1.552 0.120630    
## Seasonspring:TravelDistP_Far        -5.846e-01  2.467e-01  -2.370 0.017827 *  
## Seasonsummer:TravelDistP_Far         2.539e+00  2.096e+00   1.211 0.225838    
## Seasonwinter:TravelDistP_Far        -5.816e-02  1.954e+00  -0.030 0.976252    
## Seasonno_special:TravelDistP_Local   2.702e+00  1.534e+00   1.762 0.078077 .  
## Seasonspring:TravelDistP_Local       5.167e-01  1.849e-01   2.794 0.005214 ** 
## Seasonsummer:TravelDistP_Local       3.294e+00  2.083e+00   1.582 0.113780    
## Seasonwinter:TravelDistP_Local       8.963e-01  1.940e+00   0.462 0.644072    
## Seasonno_special:TravelDistP_Short   2.157e+00  1.541e+00   1.399 0.161742    
## Seasonspring:TravelDistP_Short              NA         NA      NA       NA    
## Seasonsummer:TravelDistP_Short       2.573e+00  2.088e+00   1.232 0.217994    
## Seasonwinter:TravelDistP_Short       1.853e-01  1.946e+00   0.095 0.924139    
## Monetary:AvgPmt                     -1.557e-08  4.317e-09  -3.605 0.000313 ***
## Monetary:GuestNumsingle              4.858e-05  2.732e-05   1.778 0.075407 .  
## Monetary:AreaEastSouthAsia           6.811e-05  5.526e-05   1.232 0.217822    
## Monetary:AreaOthers                 -4.070e-05  6.639e-05  -0.613 0.539801    
## Monetary:Seasonno_special           -9.181e-05  1.507e-05  -6.092 1.14e-09 ***
## Monetary:Seasonspring               -2.461e-05  1.722e-05  -1.430 0.152872    
## Monetary:Seasonsummer               -2.135e-05  1.594e-05  -1.339 0.180623    
## Monetary:Seasonwinter               -2.990e-05  1.797e-05  -1.664 0.096188 .  
## AvgPmt:DailyQpon                    -3.856e-04  1.118e-04  -3.448 0.000566 ***
## AvgPmt:Seasonno_special              4.328e-04  1.519e-04   2.850 0.004383 ** 
## AvgPmt:Seasonspring                  1.711e-04  1.783e-04   0.959 0.337418    
## AvgPmt:Seasonsummer                  3.505e-04  1.652e-04   2.121 0.033960 *  
## AvgPmt:Seasonwinter                  3.135e-04  1.847e-04   1.697 0.089756 .  
## AvgPmt:AreaEastSouthAsia             3.975e-04  2.732e-04   1.455 0.145732    
## AvgPmt:AreaOthers                    1.476e-03  5.334e-04   2.767 0.005667 ** 
## DailyQpon:DailyNote                 -1.549e-01  7.155e-02  -2.166 0.030360 *  
## DailyQpon:TravelDistP_Far           -1.308e+00  1.023e+00  -1.279 0.201023    
## DailyQpon:TravelDistP_Local         -1.040e+00  1.001e+00  -1.039 0.298829    
## DailyQpon:TravelDistP_Short         -8.374e-01  1.002e+00  -0.836 0.403408    
## DailyNote:GuestNumsingle             3.693e-01  1.398e-01   2.641 0.008285 ** 
## DailyNote:Devicemobile              -1.121e-01  1.073e-01  -1.044 0.296469    
## DailyNote:Deviceneutral             -9.824e-02  1.054e-01  -0.932 0.351321    
## DailyNote:Devicepc                  -2.882e-01  9.581e-02  -3.008 0.002637 ** 
## DailyNote:Seasonno_special           3.903e-01  1.081e-01   3.613 0.000304 ***
## DailyNote:Seasonspring               1.228e-01  1.366e-01   0.899 0.368710    
## DailyNote:Seasonsummer               1.654e-01  1.206e-01   1.371 0.170448    
## DailyNote:Seasonwinter              -1.492e-01  1.452e-01  -1.028 0.304104    
## DailyNote:AreaEastSouthAsia          5.120e-01  1.865e-01   2.746 0.006049 ** 
## DailyNote:AreaOthers                 7.299e-01  4.214e-01   1.732 0.083293 .  
## GuestNumsingle:TravelDistP_Far       2.703e+00  3.296e+00   0.820 0.412134    
## GuestNumsingle:TravelDistP_Local     2.788e+00  3.286e+00   0.849 0.396111    
## GuestNumsingle:TravelDistP_Short     2.304e+00  3.288e+00   0.701 0.483497    
## Devicemobile:OrderPreearly+          3.759e-01  3.424e-01   1.098 0.272296    
## Deviceneutral:OrderPreearly+        -2.505e-01  3.644e-01  -0.687 0.491803    
## Devicepc:OrderPreearly+              4.891e-01  3.022e-01   1.618 0.105648    
## Devicemobile:OrderPrenear            4.016e-01  2.693e-01   1.491 0.135877    
## Deviceneutral:OrderPrenear           4.266e-01  2.995e-01   1.424 0.154429    
## Devicepc:OrderPrenear                6.814e-01  2.420e-01   2.815 0.004883 ** 
## Devicemobile:OrderPrenormal          2.432e-01  2.904e-01   0.837 0.402383    
## Deviceneutral:OrderPrenormal         3.391e-01  3.208e-01   1.057 0.290482    
## Devicepc:OrderPrenormal              5.533e-01  2.613e-01   2.118 0.034221 *  
## Devicemobile:OrderPrenormal+        -1.463e-01  3.083e-01  -0.475 0.635141    
## Deviceneutral:OrderPrenormal+        3.632e-01  3.380e-01   1.075 0.282583    
## Devicepc:OrderPrenormal+             6.561e-01  2.753e-01   2.383 0.017199 *  
## DoWweekend:Seasonno_special         -3.099e-01  1.252e-01  -2.475 0.013320 *  
## DoWxspecial:Seasonno_special         2.663e-01  1.000e-01   2.662 0.007780 ** 
## DoWweekend:Seasonspring             -9.637e-03  1.457e-01  -0.066 0.947254    
## DoWxspecial:Seasonspring            -1.595e-03  1.207e-01  -0.013 0.989460    
## DoWweekend:Seasonsummer             -1.867e-01  1.317e-01  -1.417 0.156366    
## DoWxspecial:Seasonsummer            -1.303e-01  1.096e-01  -1.189 0.234483    
## DoWweekend:Seasonwinter             -1.519e-01  1.503e-01  -1.011 0.311912    
## DoWxspecial:Seasonwinter            -1.190e-01  1.273e-01  -0.935 0.349841    
## AreaEastSouthAsia:TravelDistP_Far   -4.805e-01  1.603e+00  -0.300 0.764431    
## AreaOthers:TravelDistP_Far                  NA         NA      NA       NA    
## AreaEastSouthAsia:TravelDistP_Local -4.068e+00  2.102e+00  -1.935 0.053030 .  
## AreaOthers:TravelDistP_Local                NA         NA      NA       NA    
## AreaEastSouthAsia:TravelDistP_Short -1.856e-01  2.105e+00  -0.088 0.929741    
## AreaOthers:TravelDistP_Short                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.383 on 12713 degrees of freedom
## Multiple R-squared:  0.4203, Adjusted R-squared:  0.4133 
## F-statistic: 59.86 on 154 and 12713 DF,  p-value: < 2.2e-16
  1. Model assumptions are met
plot(mlr.3)
## Warning: not plotting observations with leverage one:
##   121, 784, 4239, 10776, 12222

## Warning: not plotting observations with leverage one:
##   121, 784, 4239, 10776, 12222

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

  1. Adjusted R-squared: 0.4133 、Residual standard error = 1.383

According to the results, we decide model3 as our final model for predicting customer purchasing cycle (TMed)

3.2 GLM: Binomial Logistic Regression

  • Goal:predict the probability of new customer being alive

  • Data:the customer who make more than one purchase (old customer)

  • Definition:

    • Online room booking platform is a case of non-contractual setting for customer analysis

    • Status: We are going to build binomial logistic regression model to predict Status, which means the customer is alive or not. If Status equals 1 then is alive, otherwise dead (drop out the platform). And we defined the Status variable according to the RT-Ratio.

  • RT-Ratio = Recency / TMed

    • We assumed that each customer has their purchasing cycle, so we used RT-Ratio as measurement. When the value closes to 1, the retention rate would be higher; otherwise, when the value is too large, it means the customer may leave the platform.
hist(res.allf2$RTRatio, freq = TRUE, breaks = seq(0, 1200, 1), labels = TRUE,
     xlim = c(0, 20), ylim = c(0, 12000),
     ylab = "#Members", xlab = "Recency-Purchasing Period Ratio",
     main = 'Histogram of R-T Ratio')
abline(v=8,col="red")

  • We are going to calculated the proportion of neighboring counts of RT-Ratio. When the proportion > 90%, we assumed the customer will jump to the higher RT-Ratio (jump to the right of graph).

  • We defined when RT-Ratio equals 8, the probability of the customer jump to the right is 90% (221/247), which means there is little chance they will come back. Therefore, when RT-Ratio is larger than 8, the customer is dead, otherwise alive.

Data

res.allf2$Status <- 1
# [RTRatio = 8] / [RTRatio = 7] > 90%
res.allf2[res.allf2$RTRatio > 8, ]$Status <- 0
colnames(res.allf2)
##  [1] "ID"         "Recency"    "TMed"       "RTRatio"    "Frequency" 
##  [6] "Monetary"   "AvgPmt"     "DailyQpon"  "DailyNote"  "NightType" 
## [11] "ChildType"  "GuestNum"   "Device"     "BnbType"    "DoW"       
## [16] "OrderPre"   "Season"     "Area"       "TravelDist" "tmedlog"   
## [21] "Status"

3.2.1 Model1

Y:Status

X:Monetary、DailyQpon、DailyNote、ChildType、GuestNum、Device、BnbType、DoW、OrderPre、 Season、Area、TravelDist

logit0 = glm(Status ~ . + Season:TravelDist
                 + ChildType:Season
                 + OrderPre:TravelDist
                 + DoW:Season
                 + Monetary:Season
                 + DailyQpon:Season
                 + OrderPre:Season, data = res.allf2[, c(6:19, 21)],
            family = binomial(link="logit"))
summary(logit0)
## 
## Call:
## glm(formula = Status ~ . + Season:TravelDist + ChildType:Season + 
##     OrderPre:TravelDist + DoW:Season + Monetary:Season + DailyQpon:Season + 
##     OrderPre:Season, family = binomial(link = "logit"), data = res.allf2[, 
##     c(6:19, 21)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7768  -0.8700   0.4577   0.6797   2.5160  
## 
## Coefficients: (3 not defined because of singularities)
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -1.385e+01  3.430e+02  -0.040 0.967783    
## Monetary                            5.622e-05  2.505e-05   2.244 0.024828 *  
## AvgPmt                              1.079e-04  7.662e-05   1.408 0.159160    
## DailyQpon                           5.986e-01  1.478e-01   4.049 5.14e-05 ***
## DailyNote                          -9.151e-02  6.124e-02  -1.494 0.135105    
## NightTypemost_4up                   6.929e-01  1.357e-01   5.108 3.26e-07 ***
## NightTypeother                      2.862e-01  1.050e-01   2.725 0.006432 ** 
## ChildTypexa_child                   6.552e-02  3.385e-01   0.194 0.846540    
## GuestNumsingle                     -4.829e-01  8.597e-02  -5.616 1.95e-08 ***
## Devicemobile                       -2.462e-01  6.479e-02  -3.801 0.000144 ***
## Deviceneutral                       7.455e-02  6.831e-02   1.091 0.275126    
## Devicepc                           -2.293e-01  6.268e-02  -3.659 0.000254 ***
## BnbTypeelse.apt                    -7.917e-02  7.541e-02  -1.050 0.293781    
## BnbTypeneutral                     -1.034e-01  5.872e-02  -1.761 0.078181 .  
## DoWweekend                          2.702e-01  1.810e-01   1.493 0.135399    
## DoWxspecial                        -1.321e-01  1.436e-01  -0.920 0.357725    
## OrderPreearly+                      2.517e+01  8.065e+02   0.031 0.975105    
## OrderPrenear                        2.606e+01  4.851e+02   0.054 0.957160    
## OrderPrenormal                      6.006e-01  6.414e-01   0.936 0.349118    
## OrderPrenormal+                     9.195e-01  6.518e-01   1.411 0.158318    
## Seasonno_special                   -8.987e+00  3.430e+02  -0.026 0.979098    
## Seasonspring                       -1.075e+00  9.762e-01  -1.101 0.270745    
## Seasonsummer                        4.596e-01  4.961e+02   0.001 0.999261    
## Seasonwinter                        2.125e+00  4.600e+02   0.005 0.996313    
## AreaEastSouthAsia                  -1.094e+00  5.148e-01  -2.125 0.033573 *  
## AreaOthers                         -7.599e-01  5.767e-01  -1.318 0.187609    
## TravelDistP_Far                     1.067e+01  3.430e+02   0.031 0.975185    
## TravelDistP_Local                   1.162e+01  3.430e+02   0.034 0.972985    
## TravelDistP_Short                   1.182e+01  3.430e+02   0.034 0.972514    
## Seasonno_special:TravelDistP_Far    1.312e+01  3.430e+02   0.038 0.969495    
## Seasonspring:TravelDistP_Far       -8.177e-01  4.813e-01  -1.699 0.089351 .  
## Seasonsummer:TravelDistP_Far       -8.155e-01  4.961e+02  -0.002 0.998688    
## Seasonwinter:TravelDistP_Far       -1.470e+00  4.600e+02  -0.003 0.997451    
## Seasonno_special:TravelDistP_Local  1.271e+01  3.430e+02   0.037 0.970431    
## Seasonspring:TravelDistP_Local      6.685e-02  3.028e-01   0.221 0.825247    
## Seasonsummer:TravelDistP_Local     -5.511e-01  4.961e+02  -0.001 0.999114    
## Seasonwinter:TravelDistP_Local     -1.225e+00  4.600e+02  -0.003 0.997875    
## Seasonno_special:TravelDistP_Short  1.250e+01  3.430e+02   0.036 0.970925    
## Seasonspring:TravelDistP_Short             NA         NA      NA       NA    
## Seasonsummer:TravelDistP_Short     -8.848e-01  4.961e+02  -0.002 0.998577    
## Seasonwinter:TravelDistP_Short     -1.139e+00  4.600e+02  -0.002 0.998024    
## ChildTypexa_child:Seasonno_special  6.439e-02  4.828e-01   0.133 0.893904    
## ChildTypexa_child:Seasonspring      1.343e+00  6.299e-01   2.132 0.032996 *  
## ChildTypexa_child:Seasonsummer      9.663e-01  4.001e-01   2.415 0.015724 *  
## ChildTypexa_child:Seasonwinter      5.452e-01  4.792e-01   1.138 0.255285    
## OrderPreearly+:TravelDistP_Far     -2.343e+01  8.065e+02  -0.029 0.976824    
## OrderPrenear:TravelDistP_Far       -2.235e+01  4.851e+02  -0.046 0.963255    
## OrderPrenormal:TravelDistP_Far      1.373e+00  1.184e+00   1.160 0.246083    
## OrderPrenormal+:TravelDistP_Far     1.040e+00  1.181e+00   0.881 0.378510    
## OrderPreearly+:TravelDistP_Local   -2.404e+01  8.065e+02  -0.030 0.976216    
## OrderPrenear:TravelDistP_Local     -2.343e+01  4.851e+02  -0.048 0.961480    
## OrderPrenormal:TravelDistP_Local    1.167e+00  4.277e-01   2.729 0.006351 ** 
## OrderPrenormal+:TravelDistP_Local   6.915e-01  4.236e-01   1.633 0.102547    
## OrderPreearly+:TravelDistP_Short   -2.433e+01  8.065e+02  -0.030 0.975937    
## OrderPrenear:TravelDistP_Short     -2.354e+01  4.851e+02  -0.049 0.961291    
## OrderPrenormal:TravelDistP_Short           NA         NA      NA       NA    
## OrderPrenormal+:TravelDistP_Short          NA         NA      NA       NA    
## DoWweekend:Seasonno_special        -2.253e-01  2.167e-01  -1.040 0.298428    
## DoWxspecial:Seasonno_special        1.143e-01  1.703e-01   0.671 0.502268    
## DoWweekend:Seasonspring             4.532e-02  2.305e-01   0.197 0.844113    
## DoWxspecial:Seasonspring           -4.290e-01  1.939e-01  -2.213 0.026923 *  
## DoWweekend:Seasonsummer            -2.179e-03  2.076e-01  -0.010 0.991625    
## DoWxspecial:Seasonsummer           -3.159e-01  1.706e-01  -1.852 0.064043 .  
## DoWweekend:Seasonwinter            -2.577e-01  2.339e-01  -1.102 0.270678    
## DoWxspecial:Seasonwinter           -5.584e-01  2.006e-01  -2.784 0.005373 ** 
## Monetary:Seasonno_special          -4.548e-05  2.856e-05  -1.593 0.111261    
## Monetary:Seasonspring               1.228e-05  3.022e-05   0.406 0.684542    
## Monetary:Seasonsummer               1.800e-07  2.769e-05   0.007 0.994812    
## Monetary:Seasonwinter              -1.203e-05  3.112e-05  -0.387 0.698968    
## DailyQpon:Seasonno_special          6.224e-02  1.943e-01   0.320 0.748758    
## DailyQpon:Seasonspring              4.649e-01  2.214e-01   2.100 0.035717 *  
## DailyQpon:Seasonsummer              2.859e-01  1.858e-01   1.539 0.123915    
## DailyQpon:Seasonwinter              4.589e-01  2.286e-01   2.007 0.044702 *  
## OrderPreearly+:Seasonno_special    -1.270e+00  9.080e-01  -1.399 0.161752    
## OrderPrenear:Seasonno_special      -2.357e+00  8.035e-01  -2.934 0.003349 ** 
## OrderPrenormal:Seasonno_special    -5.669e-01  8.386e-01  -0.676 0.499008    
## OrderPrenormal+:Seasonno_special   -1.047e+00  8.579e-01  -1.220 0.222337    
## OrderPreearly+:Seasonspring         9.525e-02  8.139e-01   0.117 0.906836    
## OrderPrenear:Seasonspring          -1.369e+00  6.987e-01  -1.960 0.050027 .  
## OrderPrenormal:Seasonspring        -5.554e-01  7.295e-01  -0.761 0.446489    
## OrderPrenormal+:Seasonspring       -9.142e-01  7.524e-01  -1.215 0.224385    
## OrderPreearly+:Seasonsummer        -7.781e-01  7.579e-01  -1.027 0.304606    
## OrderPrenear:Seasonsummer          -1.276e+00  6.661e-01  -1.916 0.055353 .  
## OrderPrenormal:Seasonsummer        -6.883e-01  6.903e-01  -0.997 0.318672    
## OrderPrenormal+:Seasonsummer       -1.252e+00  7.105e-01  -1.762 0.078007 .  
## OrderPreearly+:Seasonwinter        -1.030e+00  7.801e-01  -1.321 0.186597    
## OrderPrenear:Seasonwinter          -2.112e+00  6.820e-01  -3.097 0.001958 ** 
## OrderPrenormal:Seasonwinter        -1.389e+00  7.134e-01  -1.947 0.051550 .  
## OrderPrenormal+:Seasonwinter       -1.421e+00  7.304e-01  -1.946 0.051677 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16411  on 12867  degrees of freedom
## Residual deviance: 12626  on 12782  degrees of freedom
## AIC: 12798
## 
## Number of Fisher Scoring iterations: 12
(1-pchisq(deviance(logit0), df.residual(logit0)))
## [1] 0.8347535
  1. Deviance decrease from 16411 (null distance) to 12626,the range is 3785, which means the variables are meaningful.
  2. The difference between Deviance and DF is small.
  3. According to the Likelihood Ratio Test, the p-value is 0.834, so we will accept the null hypothesis, which means the model is good.

4 Prediction Result

rd.f1 <- raw[, c(1, 3, 5:7, 10, 12, 21:28, 31, 32)]
res.f1 <- subset(rd.f1, rd.f1$Frequency == 1)

colnames(res.f1)
##  [1] "ID"          "Recency"     "RTRatio"     "Frequency"   "Monetary"   
##  [6] "DailyQpon"   "DailyNote"   "ChildType"   "GuestNum"    "Device"     
## [11] "BnbType"     "DoW"         "OrderPre"    "Season"      "Area"       
## [16] "TravelDist3" "tmedlog"
head(res.f1)

4.1 Predict TMed

summary(res.all)
##        ID            Recency           TMed           RTRatio      
##  Min.   :    10   Min.   :  1.0   Min.   :  1.0    Min.   :  0.00  
##  1st Qu.:204031   1st Qu.:116.0   1st Qu.: 15.0    1st Qu.:  0.76  
##  Median :272210   Median :303.5   Median : 71.0    Median :  3.02  
##  Mean   :276197   Mean   :311.7   Mean   :120.2    Mean   : 39.37  
##  3rd Qu.:357782   3rd Qu.:497.0   3rd Qu.:182.0    3rd Qu.: 15.30  
##  Max.   :438784   Max.   :678.0   Max.   :667.0    Max.   :677.00  
##                                   NA's   :101204   NA's   :101204  
##    Frequency         Monetary          AvgPmt           DailyQpon     
##  Min.   : 1.000   Min.   :     1   Min.   :    0.25   Min.   :0.0000  
##  1st Qu.: 1.000   1st Qu.:  1580   1st Qu.:  600.00   1st Qu.:0.0000  
##  Median : 1.000   Median :  2560   Median :  800.00   Median :0.0000  
##  Mean   : 1.166   Mean   :  4736   Mean   :  892.52   Mean   :0.2291  
##  3rd Qu.: 1.000   3rd Qu.:  5140   3rd Qu.: 1064.03   3rd Qu.:0.0000  
##  Max.   :28.000   Max.   :184082   Max.   :28000.00   Max.   :6.0000  
##                                                                       
##    DailyNote         NightType        ChildType           GuestNum     
##  Min.   :0.0000   most_1~3:99046   a_child :  8001   nonsingle:103239  
##  1st Qu.:0.0000   most_4up:13703   xa_child:106071   single   : 10833  
##  Median :0.0000   other   : 1323                                       
##  Mean   :0.2611                                                        
##  3rd Qu.:0.5000                                                        
##  Max.   :6.0000                                                        
##                                                                        
##      Device          BnbType            DoW           OrderPre    
##  app    :21918   apt     :88437   weekday :66416   early  : 6233  
##  mobile :44915   else.apt:22641   weekend :42992   early+ : 7431  
##  neutral: 3380   neutral : 2994   xspecial: 4664   near   :60483  
##  pc     :43859                                     normal :24558  
##                                                    normal+:15367  
##                                                                   
##                                                                   
##         Season                 Area              TravelDist       tmedlog      
##  autumn    :21368   EastAsia     :110511   No Tendency:   27   Min.   :0.00    
##  no_special: 5958   EastSouthAsia:  2803   P_Far      : 3781   1st Qu.:2.71    
##  spring    :25204   Others       :   758   P_Local    :86262   Median :4.26    
##  summer    :38414                          P_Short    :24002   Mean   :3.79    
##  winter    :23128                                              3rd Qu.:5.20    
##                                                                Max.   :6.50    
##                                                                NA's   :101204
res.allf1 = filter(res.all, Frequency == 1)
colnames(res.allf1)
##  [1] "ID"         "Recency"    "TMed"       "RTRatio"    "Frequency" 
##  [6] "Monetary"   "AvgPmt"     "DailyQpon"  "DailyNote"  "NightType" 
## [11] "ChildType"  "GuestNum"   "Device"     "BnbType"    "DoW"       
## [16] "OrderPre"   "Season"     "Area"       "TravelDist" "tmedlog"
res.allf1$tmedlog <- predict.lm(mlr.3, res.allf1[, c(-1, -3, -4, -5)])
## Warning in predict.lm(mlr.3, res.allf1[, c(-1, -3, -4, -5)]): prediction from a
## rank-deficient fit may be misleading
res.allf1$RTRatio <- round(res.allf1$Recency / exp(res.allf1$tmedlog), 2)
res.allf1$TMed = exp(res.allf1$tmedlog)
head(res.allf1)

the distribution of purchasing cycle of new customer

p.mlr = res.allf1 %>%
  filter(TMed <720) %>%
  ggplot(aes(TMed)) +
  geom_histogram(bins = 50,color="white")

p.mlr

4.2 Predict P(Alive)

colnames(res.allf1)
##  [1] "ID"         "Recency"    "TMed"       "RTRatio"    "Frequency" 
##  [6] "Monetary"   "AvgPmt"     "DailyQpon"  "DailyNote"  "NightType" 
## [11] "ChildType"  "GuestNum"   "Device"     "BnbType"    "DoW"       
## [16] "OrderPre"   "Season"     "Area"       "TravelDist" "tmedlog"
res.allf1$p.Status = predict(logit0, res.allf1[, c(-1, -3, -4, -5, -20, -21)], type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type
## == : prediction from a rank-deficient fit may be misleading

the probability distribution of being alive

p = res.allf1 %>%
  ggplot(aes(p.Status)) +
  geom_histogram(bins = 50)

p

5 Marketing Plan

Select the target customer:

  • In order to optimize the marketing budget, we think it would be better to define our target customer first. According to the data of old customers, the maximum value of purchasing cycle TMed is 660; therefore, 660 days of purchasing cycle would be one rule to decide whether a customer is dead.
hist(res.allf1$TMed[res.allf1$TMed < 800], 
     breaks = seq(0, 800, 20), 
     labels = TRUE, 
     xlab = 'Purchasing cycle (TMed)',
     main = 'Histogram of Purchasing cycle (TMed)')
abline(v=660,col="red")

  • Also, considering customer’s different purchasing cycle, we use RT-Ratio as another rule. Recall that if RT-Ratio is bigger than 8, then the customer is dead.
hist(res.allf1$RTRatio[res.allf1$TMed <= 660 & res.allf1$RTRatio<1000],
     freq = TRUE, 
     ylab = "#Members", xlab = "Recency-Purchasing Period Ratio",
     main = 'Histogram of R-T Ratio')

  • Finally, we will consider the probability of a customer being alive to get more granular segment.