Library
library(tidyverse)
library(dplyr)
library(ggplot2)
library(feather)
library(gridExtra)
library(GGally)
library(DescTools) # complete decimal
Data
raw <- read.csv('data_1216.csv', header = TRUE, encoding = 'UTF-8')
# colnames(raw)
raw.kp <- raw[, c(1, 3:8, 10, 12, 20:28, 31, 32)]
names(raw.kp)[names(raw.kp) == "TravelDist3"] <- "TravelDist"
colnames(raw.kp)
## [1] "ID" "Recency" "TMed" "RTRatio" "Frequency"
## [6] "Monetary" "AvgPmt" "DailyQpon" "DailyNote" "NightType"
## [11] "ChildType" "GuestNum" "Device" "BnbType" "DoW"
## [16] "OrderPre" "Season" "Area" "TravelDist" "tmedlog"
p2 = raw %>%
filter(., Frequency > 1) %>%
ggpairs(c(4, 9:12))
p2
Since TMed is calcualted by Frequency, and RTRatio is calculated by TMed so we would drop these two variables.
The distribution of TMed is apparently right skewed, which means we can do log transformation.
p4.1 = raw %>%
filter(., Frequency > 1) %>%
ggplot(aes(TMed)) +
geom_histogram(bins = 50)
p4.2 = raw %>%
filter(., Frequency > 1) %>%
ggplot(aes(x = factor(1), TMed)) +
geom_boxplot()
grid.arrange(p4.1, p4.2)
Goal:predict purchasing cycle of customers TMed
Data:the customer who make more than one purchase (old customer)
res.all = raw.kp
res.allf2 = filter(res.all, Frequency > 1)
colnames(res.allf2)
## [1] "ID" "Recency" "TMed" "RTRatio" "Frequency"
## [6] "Monetary" "AvgPmt" "DailyQpon" "DailyNote" "NightType"
## [11] "ChildType" "GuestNum" "Device" "BnbType" "DoW"
## [16] "OrderPre" "Season" "Area" "TravelDist" "tmedlog"
mlr.1 = lm(TMed ~ ., data = res.allf2[, c(2, 3, 6:19)])
summary(mlr.1)
##
## Call:
## lm(formula = TMed ~ ., data = res.allf2[, c(2, 3, 6:19)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -296.82 -75.88 -19.13 45.67 500.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.817e+01 2.455e+01 -1.147 0.251279
## Recency -2.588e-01 5.882e-03 -44.000 < 2e-16 ***
## Monetary 2.168e-03 3.544e-04 6.119 9.72e-10 ***
## AvgPmt 2.101e-03 3.335e-03 0.630 0.528632
## DailyQpon -2.559e+01 2.806e+00 -9.117 < 2e-16 ***
## DailyNote -1.331e+00 2.775e+00 -0.480 0.631431
## NightTypemost_4up 3.003e+01 5.785e+00 5.190 2.13e-07 ***
## NightTypeother 1.925e+01 4.428e+00 4.347 1.39e-05 ***
## ChildTypexa_child 4.915e+01 6.416e+00 7.661 1.98e-14 ***
## GuestNumsingle -1.486e+01 4.144e+00 -3.585 0.000338 ***
## Devicemobile 2.336e+01 3.019e+00 7.738 1.09e-14 ***
## Deviceneutral 5.211e+01 3.011e+00 17.307 < 2e-16 ***
## Devicepc 3.702e+01 2.893e+00 12.799 < 2e-16 ***
## BnbTypeelse.apt -1.394e+01 3.567e+00 -3.908 9.34e-05 ***
## BnbTypeneutral 9.383e+00 2.612e+00 3.593 0.000328 ***
## DoWweekend 9.362e+00 2.692e+00 3.478 0.000507 ***
## DoWxspecial 1.030e+01 2.267e+00 4.545 5.54e-06 ***
## OrderPreearly+ 1.690e+01 8.991e+00 1.880 0.060166 .
## OrderPrenear 2.855e+01 7.315e+00 3.902 9.58e-05 ***
## OrderPrenormal 1.683e+01 7.845e+00 2.145 0.031936 *
## OrderPrenormal+ 9.087e+00 8.193e+00 1.109 0.267369
## Seasonno_special 1.311e+02 3.799e+00 34.496 < 2e-16 ***
## Seasonspring 7.866e+01 4.603e+00 17.089 < 2e-16 ***
## Seasonsummer 5.593e+01 4.079e+00 13.712 < 2e-16 ***
## Seasonwinter 6.916e+01 4.726e+00 14.633 < 2e-16 ***
## AreaEastSouthAsia -8.234e+00 2.376e+01 -0.347 0.728906
## AreaOthers -2.697e+00 2.666e+01 -0.101 0.919447
## TravelDistP_Far -6.499e+00 3.063e+01 -0.212 0.831983
## TravelDistP_Local 1.000e+01 2.199e+01 0.455 0.649278
## TravelDistP_Short 1.689e+01 2.198e+01 0.769 0.442179
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 112.8 on 12838 degrees of freedom
## Multiple R-squared: 0.2863, Adjusted R-squared: 0.2847
## F-statistic: 177.6 on 29 and 12838 DF, p-value: < 2.2e-16
Checking Model Assumptions According to the QQ-Plot, the model violates the assumption of normality.
Outliers The cood distance of data point is almost smaller than 0.5, so we’re going keep all data.
plot(mlr.1)
Recall that TMed’s distriution is skewed to right so that it would be better to take log. But before that, we would like to run a suitable Box-Cox transformation of the dependent variable:
library(lindia)
fit1 = lm(TMed ~., data = res.allf2[, c(2, 3, 6:19)])
gg_boxcox(fit1, showlambda = TRUE, lambdaSF = 3, scale.factor = 0.5)
According to the result of box-cox transformation, the maximum likelihood is when lambda equals 0.22. Since, 0.22 is similar to 0, so we’re going to take log of TMed.
mlr.2 = lm(tmedlog ~ ., data = res.allf2[, c(2, 6:20)])
summary(mlr.2)
##
## Call:
## lm(formula = tmedlog ~ ., data = res.allf2[, c(2, 6:20)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4958 -0.8291 0.1744 0.9268 4.5847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.584e-01 3.116e-01 1.792 0.073156 .
## Recency -2.312e-03 7.467e-05 -30.965 < 2e-16 ***
## Monetary 3.505e-05 4.499e-06 7.791 7.13e-15 ***
## AvgPmt -5.017e-05 4.233e-05 -1.185 0.235977
## DailyQpon -5.430e-02 3.563e-02 -1.524 0.127467
## DailyNote -2.310e-01 3.522e-02 -6.560 5.60e-11 ***
## NightTypemost_4up 4.791e-01 7.344e-02 6.524 7.12e-11 ***
## NightTypeother 2.461e-01 5.621e-02 4.378 1.21e-05 ***
## ChildTypexa_child 9.658e-01 8.144e-02 11.859 < 2e-16 ***
## GuestNumsingle -1.347e-01 5.261e-02 -2.561 0.010450 *
## Devicemobile 1.682e-01 3.833e-02 4.388 1.15e-05 ***
## Deviceneutral 4.488e-01 3.823e-02 11.742 < 2e-16 ***
## Devicepc 3.148e-01 3.672e-02 8.573 < 2e-16 ***
## BnbTypeelse.apt -1.726e-01 4.528e-02 -3.813 0.000138 ***
## BnbTypeneutral -7.064e-02 3.315e-02 -2.131 0.033120 *
## DoWweekend 1.686e-01 3.417e-02 4.934 8.17e-07 ***
## DoWxspecial -6.402e-02 2.878e-02 -2.225 0.026118 *
## OrderPreearly+ 4.962e-01 1.141e-01 4.347 1.39e-05 ***
## OrderPrenear 9.956e-01 9.286e-02 10.721 < 2e-16 ***
## OrderPrenormal 5.599e-01 9.958e-02 5.623 1.92e-08 ***
## OrderPrenormal+ 3.505e-01 1.040e-01 3.370 0.000753 ***
## Seasonno_special 2.401e+00 4.823e-02 49.784 < 2e-16 ***
## Seasonspring 1.092e+00 5.843e-02 18.685 < 2e-16 ***
## Seasonsummer 9.672e-01 5.178e-02 18.678 < 2e-16 ***
## Seasonwinter 9.685e-01 6.000e-02 16.142 < 2e-16 ***
## AreaEastSouthAsia -5.242e-01 3.016e-01 -1.738 0.082216 .
## AreaOthers -4.610e-01 3.385e-01 -1.362 0.173190
## TravelDistP_Far -1.643e-02 3.888e-01 -0.042 0.966298
## TravelDistP_Local 2.077e-01 2.791e-01 0.744 0.456888
## TravelDistP_Short 1.001e-01 2.790e-01 0.359 0.719672
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.432 on 12838 degrees of freedom
## Multiple R-squared: 0.3721, Adjusted R-squared: 0.3707
## F-statistic: 262.4 on 29 and 12838 DF, p-value: < 2.2e-16
plot(mlr.2)
QQ plot looks better!
We’re going to add interaction terms in model3.
mlr.3 <- lm(tmedlog ~ . + Monetary:DailyNote
+ Monetary:TravelDist
+ DailyQpon:Device
+ DailyQpon:BnbType
+ DailyQpon:Season
+ NightType:TravelDist
+ ChildType:Season
+ GuestNum:OrderPre
+ Device:DoW
+ OrderPre:Season
+ OrderPre:TravelDist
+ Season:TravelDist
+ Monetary:AvgPmt
+ Monetary:GuestNum
+ Monetary:Area
+ Monetary:Season
+ AvgPmt:DailyQpon
+ AvgPmt:Season
+ AvgPmt:Area
+ DailyQpon:DailyNote
+ DailyQpon:TravelDist
+ DailyNote:GuestNum
+ DailyNote:Device
+ DailyNote:Season
+ DailyNote:Area
+ GuestNum:TravelDist
+ Device:OrderPre
+ DoW:Season
+ Area:TravelDist, data = res.allf2[, c(2, 6:20)])
summary(mlr.3)
##
## Call:
## lm(formula = tmedlog ~ . + Monetary:DailyNote + Monetary:TravelDist +
## DailyQpon:Device + DailyQpon:BnbType + DailyQpon:Season +
## NightType:TravelDist + ChildType:Season + GuestNum:OrderPre +
## Device:DoW + OrderPre:Season + OrderPre:TravelDist + Season:TravelDist +
## Monetary:AvgPmt + Monetary:GuestNum + Monetary:Area + Monetary:Season +
## AvgPmt:DailyQpon + AvgPmt:Season + AvgPmt:Area + DailyQpon:DailyNote +
## DailyQpon:TravelDist + DailyNote:GuestNum + DailyNote:Device +
## DailyNote:Season + DailyNote:Area + GuestNum:TravelDist +
## Device:OrderPre + DoW:Season + Area:TravelDist, data = res.allf2[,
## c(2, 6:20)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3246 -0.7536 0.1662 0.8714 4.2739
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.163e+00 2.651e+00 0.816 0.414653
## Recency -2.341e-03 7.347e-05 -31.871 < 2e-16 ***
## Monetary 1.014e-04 9.318e-05 1.088 0.276406
## AvgPmt -2.518e-04 1.459e-04 -1.725 0.084529 .
## DailyQpon 1.041e+00 1.014e+00 1.027 0.304582
## DailyNote -3.691e-01 1.260e-01 -2.930 0.003395 **
## NightTypemost_4up -3.728e-01 1.212e+00 -0.308 0.758449
## NightTypeother -6.598e-01 8.445e-01 -0.781 0.434618
## ChildTypexa_child 5.892e-01 2.154e-01 2.735 0.006241 **
## GuestNumsingle -1.986e+00 3.310e+00 -0.600 0.548501
## Devicemobile -3.288e-01 2.708e-01 -1.214 0.224652
## Deviceneutral -1.496e-01 3.018e-01 -0.496 0.620109
## Devicepc -4.964e-01 2.437e-01 -2.037 0.041669 *
## BnbTypeelse.apt -7.711e-02 5.339e-02 -1.444 0.148679
## BnbTypeneutral -3.116e-02 3.934e-02 -0.792 0.428416
## DoWweekend 2.727e-01 1.279e-01 2.133 0.032979 *
## DoWxspecial -3.359e-01 1.053e-01 -3.189 0.001429 **
## OrderPreearly+ 3.399e+00 3.345e+00 1.016 0.309643
## OrderPrenear 2.505e+00 2.155e+00 1.162 0.245263
## OrderPrenormal -8.143e-01 3.643e-01 -2.235 0.025414 *
## OrderPrenormal+ 2.358e-01 3.755e-01 0.628 0.530048
## Seasonno_special 1.047e+00 1.603e+00 0.653 0.513762
## Seasonspring -7.174e-01 5.345e-01 -1.342 0.179523
## Seasonsummer -2.820e+00 2.128e+00 -1.325 0.185092
## Seasonwinter -2.050e-01 1.996e+00 -0.103 0.918219
## AreaEastSouthAsia -5.930e-01 1.596e+00 -0.372 0.710211
## AreaOthers -1.758e+00 6.368e-01 -2.761 0.005775 **
## TravelDistP_Far 3.289e-01 2.689e+00 0.122 0.902657
## TravelDistP_Local -1.342e+00 2.629e+00 -0.510 0.609790
## TravelDistP_Short -3.736e-01 2.623e+00 -0.142 0.886767
## Monetary:DailyNote 3.312e-05 8.277e-06 4.002 6.32e-05 ***
## Monetary:TravelDistP_Far -1.108e-04 1.066e-04 -1.039 0.298907
## Monetary:TravelDistP_Local 3.155e-05 9.246e-05 0.341 0.732977
## Monetary:TravelDistP_Short -1.338e-05 9.213e-05 -0.145 0.884497
## DailyQpon:Devicemobile 2.126e-01 1.038e-01 2.049 0.040524 *
## DailyQpon:Deviceneutral 3.668e-01 9.589e-02 3.825 0.000132 ***
## DailyQpon:Devicepc 5.374e-01 9.138e-02 5.881 4.17e-09 ***
## DailyQpon:BnbTypeelse.apt -2.639e-01 1.126e-01 -2.343 0.019141 *
## DailyQpon:BnbTypeneutral -7.393e-02 8.389e-02 -0.881 0.378191
## DailyQpon:Seasonno_special -3.732e-01 1.109e-01 -3.366 0.000765 ***
## DailyQpon:Seasonspring 5.290e-01 1.443e-01 3.665 0.000248 ***
## DailyQpon:Seasonsummer 3.663e-01 1.218e-01 3.007 0.002647 **
## DailyQpon:Seasonwinter 3.007e-01 1.460e-01 2.059 0.039474 *
## NightTypemost_4up:TravelDistP_Far 8.655e-01 1.243e+00 0.696 0.486229
## NightTypeother:TravelDistP_Far 1.125e+00 8.673e-01 1.297 0.194819
## NightTypemost_4up:TravelDistP_Local -3.382e-01 1.239e+00 -0.273 0.784959
## NightTypeother:TravelDistP_Local 5.531e-01 8.559e-01 0.646 0.518121
## NightTypemost_4up:TravelDistP_Short 9.565e-01 1.215e+00 0.787 0.431050
## NightTypeother:TravelDistP_Short 1.011e+00 8.468e-01 1.194 0.232428
## ChildTypexa_child:Seasonno_special -1.295e-01 2.654e-01 -0.488 0.625542
## ChildTypexa_child:Seasonspring 1.299e+00 3.480e-01 3.732 0.000191 ***
## ChildTypexa_child:Seasonsummer 6.103e-01 2.507e-01 2.434 0.014948 *
## ChildTypexa_child:Seasonwinter 2.051e-01 3.001e-01 0.684 0.494282
## GuestNumsingle:OrderPreearly+ 6.272e-01 6.243e-01 1.005 0.315124
## GuestNumsingle:OrderPrenear -1.125e+00 4.037e-01 -2.787 0.005327 **
## GuestNumsingle:OrderPrenormal -7.089e-01 4.358e-01 -1.627 0.103852
## GuestNumsingle:OrderPrenormal+ -2.169e-01 4.533e-01 -0.479 0.632290
## Devicemobile:DoWweekend 1.649e-01 9.558e-02 1.725 0.084505 .
## Deviceneutral:DoWweekend 4.153e-02 1.020e-01 0.407 0.683814
## Devicepc:DoWweekend 9.442e-02 9.243e-02 1.022 0.307006
## Devicemobile:DoWxspecial 2.070e-01 8.426e-02 2.457 0.014028 *
## Deviceneutral:DoWxspecial 3.304e-01 8.264e-02 3.998 6.43e-05 ***
## Devicepc:DoWxspecial 1.619e-01 7.960e-02 2.033 0.042047 *
## OrderPreearly+:Seasonno_special -8.486e-01 4.429e-01 -1.916 0.055364 .
## OrderPrenear:Seasonno_special -1.409e+00 3.762e-01 -3.745 0.000181 ***
## OrderPrenormal:Seasonno_special 1.564e-02 3.934e-01 0.040 0.968287
## OrderPrenormal+:Seasonno_special -7.395e-01 4.086e-01 -1.810 0.070334 .
## OrderPreearly+:Seasonspring -6.993e-01 4.240e-01 -1.650 0.099063 .
## OrderPrenear:Seasonspring -1.639e-01 3.435e-01 -0.477 0.633161
## OrderPrenormal:Seasonspring 5.792e-01 3.670e-01 1.578 0.114588
## OrderPrenormal+:Seasonspring -2.157e-01 3.801e-01 -0.567 0.570505
## OrderPreearly+:Seasonsummer -6.866e-01 4.085e-01 -1.681 0.092820 .
## OrderPrenear:Seasonsummer -3.463e-01 3.416e-01 -1.014 0.310612
## OrderPrenormal:Seasonsummer 4.465e-01 3.614e-01 1.236 0.216589
## OrderPrenormal+:Seasonsummer -7.680e-01 3.755e-01 -2.045 0.040866 *
## OrderPreearly+:Seasonwinter -1.845e-01 4.208e-01 -0.438 0.661119
## OrderPrenear:Seasonwinter 5.211e-02 3.510e-01 0.148 0.882001
## OrderPrenormal:Seasonwinter 8.062e-01 3.758e-01 2.145 0.031951 *
## OrderPrenormal+:Seasonwinter -6.282e-01 3.888e-01 -1.616 0.106185
## OrderPreearly+:TravelDistP_Far -3.009e+00 3.338e+00 -0.902 0.367297
## OrderPrenear:TravelDistP_Far -1.718e+00 2.153e+00 -0.798 0.424684
## OrderPrenormal:TravelDistP_Far 6.105e-02 3.658e-01 0.167 0.867453
## OrderPrenormal+:TravelDistP_Far 4.081e-02 3.509e-01 0.116 0.907424
## OrderPreearly+:TravelDistP_Local -2.465e+00 3.334e+00 -0.739 0.459652
## OrderPrenear:TravelDistP_Local -1.105e+00 2.140e+00 -0.516 0.605832
## OrderPrenormal:TravelDistP_Local 1.050e+00 2.417e-01 4.345 1.40e-05 ***
## OrderPrenormal+:TravelDistP_Local 7.393e-01 2.415e-01 3.061 0.002214 **
## OrderPreearly+:TravelDistP_Short -2.720e+00 3.320e+00 -0.819 0.412727
## OrderPrenear:TravelDistP_Short -1.416e+00 2.130e+00 -0.665 0.506179
## OrderPrenormal:TravelDistP_Short NA NA NA NA
## OrderPrenormal+:TravelDistP_Short NA NA NA NA
## Seasonno_special:TravelDistP_Far 2.418e+00 1.558e+00 1.552 0.120630
## Seasonspring:TravelDistP_Far -5.846e-01 2.467e-01 -2.370 0.017827 *
## Seasonsummer:TravelDistP_Far 2.539e+00 2.096e+00 1.211 0.225838
## Seasonwinter:TravelDistP_Far -5.816e-02 1.954e+00 -0.030 0.976252
## Seasonno_special:TravelDistP_Local 2.702e+00 1.534e+00 1.762 0.078077 .
## Seasonspring:TravelDistP_Local 5.167e-01 1.849e-01 2.794 0.005214 **
## Seasonsummer:TravelDistP_Local 3.294e+00 2.083e+00 1.582 0.113780
## Seasonwinter:TravelDistP_Local 8.963e-01 1.940e+00 0.462 0.644072
## Seasonno_special:TravelDistP_Short 2.157e+00 1.541e+00 1.399 0.161742
## Seasonspring:TravelDistP_Short NA NA NA NA
## Seasonsummer:TravelDistP_Short 2.573e+00 2.088e+00 1.232 0.217994
## Seasonwinter:TravelDistP_Short 1.853e-01 1.946e+00 0.095 0.924139
## Monetary:AvgPmt -1.557e-08 4.317e-09 -3.605 0.000313 ***
## Monetary:GuestNumsingle 4.858e-05 2.732e-05 1.778 0.075407 .
## Monetary:AreaEastSouthAsia 6.811e-05 5.526e-05 1.232 0.217822
## Monetary:AreaOthers -4.070e-05 6.639e-05 -0.613 0.539801
## Monetary:Seasonno_special -9.181e-05 1.507e-05 -6.092 1.14e-09 ***
## Monetary:Seasonspring -2.461e-05 1.722e-05 -1.430 0.152872
## Monetary:Seasonsummer -2.135e-05 1.594e-05 -1.339 0.180623
## Monetary:Seasonwinter -2.990e-05 1.797e-05 -1.664 0.096188 .
## AvgPmt:DailyQpon -3.856e-04 1.118e-04 -3.448 0.000566 ***
## AvgPmt:Seasonno_special 4.328e-04 1.519e-04 2.850 0.004383 **
## AvgPmt:Seasonspring 1.711e-04 1.783e-04 0.959 0.337418
## AvgPmt:Seasonsummer 3.505e-04 1.652e-04 2.121 0.033960 *
## AvgPmt:Seasonwinter 3.135e-04 1.847e-04 1.697 0.089756 .
## AvgPmt:AreaEastSouthAsia 3.975e-04 2.732e-04 1.455 0.145732
## AvgPmt:AreaOthers 1.476e-03 5.334e-04 2.767 0.005667 **
## DailyQpon:DailyNote -1.549e-01 7.155e-02 -2.166 0.030360 *
## DailyQpon:TravelDistP_Far -1.308e+00 1.023e+00 -1.279 0.201023
## DailyQpon:TravelDistP_Local -1.040e+00 1.001e+00 -1.039 0.298829
## DailyQpon:TravelDistP_Short -8.374e-01 1.002e+00 -0.836 0.403408
## DailyNote:GuestNumsingle 3.693e-01 1.398e-01 2.641 0.008285 **
## DailyNote:Devicemobile -1.121e-01 1.073e-01 -1.044 0.296469
## DailyNote:Deviceneutral -9.824e-02 1.054e-01 -0.932 0.351321
## DailyNote:Devicepc -2.882e-01 9.581e-02 -3.008 0.002637 **
## DailyNote:Seasonno_special 3.903e-01 1.081e-01 3.613 0.000304 ***
## DailyNote:Seasonspring 1.228e-01 1.366e-01 0.899 0.368710
## DailyNote:Seasonsummer 1.654e-01 1.206e-01 1.371 0.170448
## DailyNote:Seasonwinter -1.492e-01 1.452e-01 -1.028 0.304104
## DailyNote:AreaEastSouthAsia 5.120e-01 1.865e-01 2.746 0.006049 **
## DailyNote:AreaOthers 7.299e-01 4.214e-01 1.732 0.083293 .
## GuestNumsingle:TravelDistP_Far 2.703e+00 3.296e+00 0.820 0.412134
## GuestNumsingle:TravelDistP_Local 2.788e+00 3.286e+00 0.849 0.396111
## GuestNumsingle:TravelDistP_Short 2.304e+00 3.288e+00 0.701 0.483497
## Devicemobile:OrderPreearly+ 3.759e-01 3.424e-01 1.098 0.272296
## Deviceneutral:OrderPreearly+ -2.505e-01 3.644e-01 -0.687 0.491803
## Devicepc:OrderPreearly+ 4.891e-01 3.022e-01 1.618 0.105648
## Devicemobile:OrderPrenear 4.016e-01 2.693e-01 1.491 0.135877
## Deviceneutral:OrderPrenear 4.266e-01 2.995e-01 1.424 0.154429
## Devicepc:OrderPrenear 6.814e-01 2.420e-01 2.815 0.004883 **
## Devicemobile:OrderPrenormal 2.432e-01 2.904e-01 0.837 0.402383
## Deviceneutral:OrderPrenormal 3.391e-01 3.208e-01 1.057 0.290482
## Devicepc:OrderPrenormal 5.533e-01 2.613e-01 2.118 0.034221 *
## Devicemobile:OrderPrenormal+ -1.463e-01 3.083e-01 -0.475 0.635141
## Deviceneutral:OrderPrenormal+ 3.632e-01 3.380e-01 1.075 0.282583
## Devicepc:OrderPrenormal+ 6.561e-01 2.753e-01 2.383 0.017199 *
## DoWweekend:Seasonno_special -3.099e-01 1.252e-01 -2.475 0.013320 *
## DoWxspecial:Seasonno_special 2.663e-01 1.000e-01 2.662 0.007780 **
## DoWweekend:Seasonspring -9.637e-03 1.457e-01 -0.066 0.947254
## DoWxspecial:Seasonspring -1.595e-03 1.207e-01 -0.013 0.989460
## DoWweekend:Seasonsummer -1.867e-01 1.317e-01 -1.417 0.156366
## DoWxspecial:Seasonsummer -1.303e-01 1.096e-01 -1.189 0.234483
## DoWweekend:Seasonwinter -1.519e-01 1.503e-01 -1.011 0.311912
## DoWxspecial:Seasonwinter -1.190e-01 1.273e-01 -0.935 0.349841
## AreaEastSouthAsia:TravelDistP_Far -4.805e-01 1.603e+00 -0.300 0.764431
## AreaOthers:TravelDistP_Far NA NA NA NA
## AreaEastSouthAsia:TravelDistP_Local -4.068e+00 2.102e+00 -1.935 0.053030 .
## AreaOthers:TravelDistP_Local NA NA NA NA
## AreaEastSouthAsia:TravelDistP_Short -1.856e-01 2.105e+00 -0.088 0.929741
## AreaOthers:TravelDistP_Short NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.383 on 12713 degrees of freedom
## Multiple R-squared: 0.4203, Adjusted R-squared: 0.4133
## F-statistic: 59.86 on 154 and 12713 DF, p-value: < 2.2e-16
plot(mlr.3)
## Warning: not plotting observations with leverage one:
## 121, 784, 4239, 10776, 12222
## Warning: not plotting observations with leverage one:
## 121, 784, 4239, 10776, 12222
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
According to the results, we decide model3 as our final model for predicting customer purchasing cycle (TMed)
Goal:predict the probability of new customer being alive
Data:the customer who make more than one purchase (old customer)
Definition:
Online room booking platform is a case of non-contractual setting for customer analysis
Status: We are going to build binomial logistic regression model to predict Status, which means the customer is alive or not. If Status equals 1 then is alive, otherwise dead (drop out the platform). And we defined the Status variable according to the RT-Ratio.
RT-Ratio = Recency / TMed
hist(res.allf2$RTRatio, freq = TRUE, breaks = seq(0, 1200, 1), labels = TRUE,
xlim = c(0, 20), ylim = c(0, 12000),
ylab = "#Members", xlab = "Recency-Purchasing Period Ratio",
main = 'Histogram of R-T Ratio')
abline(v=8,col="red")
We are going to calculated the proportion of neighboring counts of RT-Ratio. When the proportion > 90%, we assumed the customer will jump to the higher RT-Ratio (jump to the right of graph).
We defined when RT-Ratio equals 8, the probability of the customer jump to the right is 90% (221/247), which means there is little chance they will come back. Therefore, when RT-Ratio is larger than 8, the customer is dead, otherwise alive.
Data
res.allf2$Status <- 1
# [RTRatio = 8] / [RTRatio = 7] > 90%
res.allf2[res.allf2$RTRatio > 8, ]$Status <- 0
colnames(res.allf2)
## [1] "ID" "Recency" "TMed" "RTRatio" "Frequency"
## [6] "Monetary" "AvgPmt" "DailyQpon" "DailyNote" "NightType"
## [11] "ChildType" "GuestNum" "Device" "BnbType" "DoW"
## [16] "OrderPre" "Season" "Area" "TravelDist" "tmedlog"
## [21] "Status"
Y:Status
X:Monetary、DailyQpon、DailyNote、ChildType、GuestNum、Device、BnbType、DoW、OrderPre、 Season、Area、TravelDist
logit0 = glm(Status ~ . + Season:TravelDist
+ ChildType:Season
+ OrderPre:TravelDist
+ DoW:Season
+ Monetary:Season
+ DailyQpon:Season
+ OrderPre:Season, data = res.allf2[, c(6:19, 21)],
family = binomial(link="logit"))
summary(logit0)
##
## Call:
## glm(formula = Status ~ . + Season:TravelDist + ChildType:Season +
## OrderPre:TravelDist + DoW:Season + Monetary:Season + DailyQpon:Season +
## OrderPre:Season, family = binomial(link = "logit"), data = res.allf2[,
## c(6:19, 21)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7768 -0.8700 0.4577 0.6797 2.5160
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.385e+01 3.430e+02 -0.040 0.967783
## Monetary 5.622e-05 2.505e-05 2.244 0.024828 *
## AvgPmt 1.079e-04 7.662e-05 1.408 0.159160
## DailyQpon 5.986e-01 1.478e-01 4.049 5.14e-05 ***
## DailyNote -9.151e-02 6.124e-02 -1.494 0.135105
## NightTypemost_4up 6.929e-01 1.357e-01 5.108 3.26e-07 ***
## NightTypeother 2.862e-01 1.050e-01 2.725 0.006432 **
## ChildTypexa_child 6.552e-02 3.385e-01 0.194 0.846540
## GuestNumsingle -4.829e-01 8.597e-02 -5.616 1.95e-08 ***
## Devicemobile -2.462e-01 6.479e-02 -3.801 0.000144 ***
## Deviceneutral 7.455e-02 6.831e-02 1.091 0.275126
## Devicepc -2.293e-01 6.268e-02 -3.659 0.000254 ***
## BnbTypeelse.apt -7.917e-02 7.541e-02 -1.050 0.293781
## BnbTypeneutral -1.034e-01 5.872e-02 -1.761 0.078181 .
## DoWweekend 2.702e-01 1.810e-01 1.493 0.135399
## DoWxspecial -1.321e-01 1.436e-01 -0.920 0.357725
## OrderPreearly+ 2.517e+01 8.065e+02 0.031 0.975105
## OrderPrenear 2.606e+01 4.851e+02 0.054 0.957160
## OrderPrenormal 6.006e-01 6.414e-01 0.936 0.349118
## OrderPrenormal+ 9.195e-01 6.518e-01 1.411 0.158318
## Seasonno_special -8.987e+00 3.430e+02 -0.026 0.979098
## Seasonspring -1.075e+00 9.762e-01 -1.101 0.270745
## Seasonsummer 4.596e-01 4.961e+02 0.001 0.999261
## Seasonwinter 2.125e+00 4.600e+02 0.005 0.996313
## AreaEastSouthAsia -1.094e+00 5.148e-01 -2.125 0.033573 *
## AreaOthers -7.599e-01 5.767e-01 -1.318 0.187609
## TravelDistP_Far 1.067e+01 3.430e+02 0.031 0.975185
## TravelDistP_Local 1.162e+01 3.430e+02 0.034 0.972985
## TravelDistP_Short 1.182e+01 3.430e+02 0.034 0.972514
## Seasonno_special:TravelDistP_Far 1.312e+01 3.430e+02 0.038 0.969495
## Seasonspring:TravelDistP_Far -8.177e-01 4.813e-01 -1.699 0.089351 .
## Seasonsummer:TravelDistP_Far -8.155e-01 4.961e+02 -0.002 0.998688
## Seasonwinter:TravelDistP_Far -1.470e+00 4.600e+02 -0.003 0.997451
## Seasonno_special:TravelDistP_Local 1.271e+01 3.430e+02 0.037 0.970431
## Seasonspring:TravelDistP_Local 6.685e-02 3.028e-01 0.221 0.825247
## Seasonsummer:TravelDistP_Local -5.511e-01 4.961e+02 -0.001 0.999114
## Seasonwinter:TravelDistP_Local -1.225e+00 4.600e+02 -0.003 0.997875
## Seasonno_special:TravelDistP_Short 1.250e+01 3.430e+02 0.036 0.970925
## Seasonspring:TravelDistP_Short NA NA NA NA
## Seasonsummer:TravelDistP_Short -8.848e-01 4.961e+02 -0.002 0.998577
## Seasonwinter:TravelDistP_Short -1.139e+00 4.600e+02 -0.002 0.998024
## ChildTypexa_child:Seasonno_special 6.439e-02 4.828e-01 0.133 0.893904
## ChildTypexa_child:Seasonspring 1.343e+00 6.299e-01 2.132 0.032996 *
## ChildTypexa_child:Seasonsummer 9.663e-01 4.001e-01 2.415 0.015724 *
## ChildTypexa_child:Seasonwinter 5.452e-01 4.792e-01 1.138 0.255285
## OrderPreearly+:TravelDistP_Far -2.343e+01 8.065e+02 -0.029 0.976824
## OrderPrenear:TravelDistP_Far -2.235e+01 4.851e+02 -0.046 0.963255
## OrderPrenormal:TravelDistP_Far 1.373e+00 1.184e+00 1.160 0.246083
## OrderPrenormal+:TravelDistP_Far 1.040e+00 1.181e+00 0.881 0.378510
## OrderPreearly+:TravelDistP_Local -2.404e+01 8.065e+02 -0.030 0.976216
## OrderPrenear:TravelDistP_Local -2.343e+01 4.851e+02 -0.048 0.961480
## OrderPrenormal:TravelDistP_Local 1.167e+00 4.277e-01 2.729 0.006351 **
## OrderPrenormal+:TravelDistP_Local 6.915e-01 4.236e-01 1.633 0.102547
## OrderPreearly+:TravelDistP_Short -2.433e+01 8.065e+02 -0.030 0.975937
## OrderPrenear:TravelDistP_Short -2.354e+01 4.851e+02 -0.049 0.961291
## OrderPrenormal:TravelDistP_Short NA NA NA NA
## OrderPrenormal+:TravelDistP_Short NA NA NA NA
## DoWweekend:Seasonno_special -2.253e-01 2.167e-01 -1.040 0.298428
## DoWxspecial:Seasonno_special 1.143e-01 1.703e-01 0.671 0.502268
## DoWweekend:Seasonspring 4.532e-02 2.305e-01 0.197 0.844113
## DoWxspecial:Seasonspring -4.290e-01 1.939e-01 -2.213 0.026923 *
## DoWweekend:Seasonsummer -2.179e-03 2.076e-01 -0.010 0.991625
## DoWxspecial:Seasonsummer -3.159e-01 1.706e-01 -1.852 0.064043 .
## DoWweekend:Seasonwinter -2.577e-01 2.339e-01 -1.102 0.270678
## DoWxspecial:Seasonwinter -5.584e-01 2.006e-01 -2.784 0.005373 **
## Monetary:Seasonno_special -4.548e-05 2.856e-05 -1.593 0.111261
## Monetary:Seasonspring 1.228e-05 3.022e-05 0.406 0.684542
## Monetary:Seasonsummer 1.800e-07 2.769e-05 0.007 0.994812
## Monetary:Seasonwinter -1.203e-05 3.112e-05 -0.387 0.698968
## DailyQpon:Seasonno_special 6.224e-02 1.943e-01 0.320 0.748758
## DailyQpon:Seasonspring 4.649e-01 2.214e-01 2.100 0.035717 *
## DailyQpon:Seasonsummer 2.859e-01 1.858e-01 1.539 0.123915
## DailyQpon:Seasonwinter 4.589e-01 2.286e-01 2.007 0.044702 *
## OrderPreearly+:Seasonno_special -1.270e+00 9.080e-01 -1.399 0.161752
## OrderPrenear:Seasonno_special -2.357e+00 8.035e-01 -2.934 0.003349 **
## OrderPrenormal:Seasonno_special -5.669e-01 8.386e-01 -0.676 0.499008
## OrderPrenormal+:Seasonno_special -1.047e+00 8.579e-01 -1.220 0.222337
## OrderPreearly+:Seasonspring 9.525e-02 8.139e-01 0.117 0.906836
## OrderPrenear:Seasonspring -1.369e+00 6.987e-01 -1.960 0.050027 .
## OrderPrenormal:Seasonspring -5.554e-01 7.295e-01 -0.761 0.446489
## OrderPrenormal+:Seasonspring -9.142e-01 7.524e-01 -1.215 0.224385
## OrderPreearly+:Seasonsummer -7.781e-01 7.579e-01 -1.027 0.304606
## OrderPrenear:Seasonsummer -1.276e+00 6.661e-01 -1.916 0.055353 .
## OrderPrenormal:Seasonsummer -6.883e-01 6.903e-01 -0.997 0.318672
## OrderPrenormal+:Seasonsummer -1.252e+00 7.105e-01 -1.762 0.078007 .
## OrderPreearly+:Seasonwinter -1.030e+00 7.801e-01 -1.321 0.186597
## OrderPrenear:Seasonwinter -2.112e+00 6.820e-01 -3.097 0.001958 **
## OrderPrenormal:Seasonwinter -1.389e+00 7.134e-01 -1.947 0.051550 .
## OrderPrenormal+:Seasonwinter -1.421e+00 7.304e-01 -1.946 0.051677 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16411 on 12867 degrees of freedom
## Residual deviance: 12626 on 12782 degrees of freedom
## AIC: 12798
##
## Number of Fisher Scoring iterations: 12
(1-pchisq(deviance(logit0), df.residual(logit0)))
## [1] 0.8347535
rd.f1 <- raw[, c(1, 3, 5:7, 10, 12, 21:28, 31, 32)]
res.f1 <- subset(rd.f1, rd.f1$Frequency == 1)
colnames(res.f1)
## [1] "ID" "Recency" "RTRatio" "Frequency" "Monetary"
## [6] "DailyQpon" "DailyNote" "ChildType" "GuestNum" "Device"
## [11] "BnbType" "DoW" "OrderPre" "Season" "Area"
## [16] "TravelDist3" "tmedlog"
head(res.f1)
summary(res.all)
## ID Recency TMed RTRatio
## Min. : 10 Min. : 1.0 Min. : 1.0 Min. : 0.00
## 1st Qu.:204031 1st Qu.:116.0 1st Qu.: 15.0 1st Qu.: 0.76
## Median :272210 Median :303.5 Median : 71.0 Median : 3.02
## Mean :276197 Mean :311.7 Mean :120.2 Mean : 39.37
## 3rd Qu.:357782 3rd Qu.:497.0 3rd Qu.:182.0 3rd Qu.: 15.30
## Max. :438784 Max. :678.0 Max. :667.0 Max. :677.00
## NA's :101204 NA's :101204
## Frequency Monetary AvgPmt DailyQpon
## Min. : 1.000 Min. : 1 Min. : 0.25 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.: 1580 1st Qu.: 600.00 1st Qu.:0.0000
## Median : 1.000 Median : 2560 Median : 800.00 Median :0.0000
## Mean : 1.166 Mean : 4736 Mean : 892.52 Mean :0.2291
## 3rd Qu.: 1.000 3rd Qu.: 5140 3rd Qu.: 1064.03 3rd Qu.:0.0000
## Max. :28.000 Max. :184082 Max. :28000.00 Max. :6.0000
##
## DailyNote NightType ChildType GuestNum
## Min. :0.0000 most_1~3:99046 a_child : 8001 nonsingle:103239
## 1st Qu.:0.0000 most_4up:13703 xa_child:106071 single : 10833
## Median :0.0000 other : 1323
## Mean :0.2611
## 3rd Qu.:0.5000
## Max. :6.0000
##
## Device BnbType DoW OrderPre
## app :21918 apt :88437 weekday :66416 early : 6233
## mobile :44915 else.apt:22641 weekend :42992 early+ : 7431
## neutral: 3380 neutral : 2994 xspecial: 4664 near :60483
## pc :43859 normal :24558
## normal+:15367
##
##
## Season Area TravelDist tmedlog
## autumn :21368 EastAsia :110511 No Tendency: 27 Min. :0.00
## no_special: 5958 EastSouthAsia: 2803 P_Far : 3781 1st Qu.:2.71
## spring :25204 Others : 758 P_Local :86262 Median :4.26
## summer :38414 P_Short :24002 Mean :3.79
## winter :23128 3rd Qu.:5.20
## Max. :6.50
## NA's :101204
res.allf1 = filter(res.all, Frequency == 1)
colnames(res.allf1)
## [1] "ID" "Recency" "TMed" "RTRatio" "Frequency"
## [6] "Monetary" "AvgPmt" "DailyQpon" "DailyNote" "NightType"
## [11] "ChildType" "GuestNum" "Device" "BnbType" "DoW"
## [16] "OrderPre" "Season" "Area" "TravelDist" "tmedlog"
res.allf1$tmedlog <- predict.lm(mlr.3, res.allf1[, c(-1, -3, -4, -5)])
## Warning in predict.lm(mlr.3, res.allf1[, c(-1, -3, -4, -5)]): prediction from a
## rank-deficient fit may be misleading
res.allf1$RTRatio <- round(res.allf1$Recency / exp(res.allf1$tmedlog), 2)
res.allf1$TMed = exp(res.allf1$tmedlog)
head(res.allf1)
the distribution of purchasing cycle of new customer
p.mlr = res.allf1 %>%
filter(TMed <720) %>%
ggplot(aes(TMed)) +
geom_histogram(bins = 50,color="white")
p.mlr
colnames(res.allf1)
## [1] "ID" "Recency" "TMed" "RTRatio" "Frequency"
## [6] "Monetary" "AvgPmt" "DailyQpon" "DailyNote" "NightType"
## [11] "ChildType" "GuestNum" "Device" "BnbType" "DoW"
## [16] "OrderPre" "Season" "Area" "TravelDist" "tmedlog"
res.allf1$p.Status = predict(logit0, res.allf1[, c(-1, -3, -4, -5, -20, -21)], type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type
## == : prediction from a rank-deficient fit may be misleading
the probability distribution of being alive
p = res.allf1 %>%
ggplot(aes(p.Status)) +
geom_histogram(bins = 50)
p
Select the target customer:
TMed is 660; therefore, 660 days of purchasing cycle would be one rule to decide whether a customer is dead.hist(res.allf1$TMed[res.allf1$TMed < 800],
breaks = seq(0, 800, 20),
labels = TRUE,
xlab = 'Purchasing cycle (TMed)',
main = 'Histogram of Purchasing cycle (TMed)')
abline(v=660,col="red")
hist(res.allf1$RTRatio[res.allf1$TMed <= 660 & res.allf1$RTRatio<1000],
freq = TRUE,
ylab = "#Members", xlab = "Recency-Purchasing Period Ratio",
main = 'Histogram of R-T Ratio')