1 Clear Workspace

rm(list = ls())
library(ggplot2)
library(dplyr)
library(pastecs)
library(fpc)
library(FactoMineR)
library(readxl)
library(xlsx)
data <- read_excel("data_cohort analysis.xlsx")
library(data.table)
mydata <- subset(data, select = -c(1,4))
df <- melt(setDT(mydata), id.vars = c("profile_uuid","Start Date"), variable.name = "active_date")

head(df)
##                            profile_uuid Start Date active_date    value
## 1: 869368ce-b1d9-47d8-a9b2-9515ebf01368   3/7/2022    3/7/2022 14.98333
## 2: 70be4661-054a-4f82-89e8-f1134ab67233  3/17/2022    3/7/2022       NA
## 3: db3e1095-eafc-489f-a0f0-b9dea9677302  3/17/2022    3/7/2022       NA
## 4: af57ccfa-7233-4ebf-96f6-1dde1c865220  3/18/2022    3/7/2022       NA
## 5: 7d1d9e63-2520-4d04-bd05-3e11d996e487  3/18/2022    3/7/2022       NA
## 6: 177f6fb5-e16d-4bfb-90d0-f18613648c90  3/18/2022    3/7/2022       NA

2 Rename multiple columns

dfx <- df
# solution 1: 
names(dfx)[1] <- "CustomerID"
names(dfx)[2] <- "Join_Date"
names(dfx)[3] <- "InvoiceDate"

# # solution:
# colnames(df) <- c("", ....)
# solution 2:
library(data.table)
setnames(df, old = c("profile_uuid", "Start Date", "active_date"), new = c("CustomerID", "Join_Date", "InvoiceDate"))
df
##                                 CustomerID Join_Date InvoiceDate     value
##    1: 869368ce-b1d9-47d8-a9b2-9515ebf01368  3/7/2022    3/7/2022 14.983333
##    2: 70be4661-054a-4f82-89e8-f1134ab67233 3/17/2022    3/7/2022        NA
##    3: db3e1095-eafc-489f-a0f0-b9dea9677302 3/17/2022    3/7/2022        NA
##    4: af57ccfa-7233-4ebf-96f6-1dde1c865220 3/18/2022    3/7/2022        NA
##    5: 7d1d9e63-2520-4d04-bd05-3e11d996e487 3/18/2022    3/7/2022        NA
##   ---                                                                     
## 5525: 2c0489ef-fd4a-467c-b57a-1237f6a3ae0d  5/7/2022    5/9/2022        NA
## 5526: 21c1fffc-3f19-4cd8-a471-38402df6c186  5/7/2022    5/9/2022        NA
## 5527: 0bbda961-4181-4169-9a7a-bb60b9be1156  5/7/2022    5/9/2022        NA
## 5528: 97395bdb-00e5-46e5-b037-b5437373953a  5/7/2022    5/9/2022  2.733333
## 5529: d7a4336d-d74c-4a38-9c55-a3e4eb757489  5/7/2022    5/9/2022        NA

3 Remove all NA value

cohort <- df[complete.cases(df), ]

head(cohort)
##                              CustomerID Join_Date InvoiceDate     value
## 1: 869368ce-b1d9-47d8-a9b2-9515ebf01368  3/7/2022    3/7/2022  14.98333
## 2: 869368ce-b1d9-47d8-a9b2-9515ebf01368  3/7/2022   3/10/2022  10.66667
## 3: 869368ce-b1d9-47d8-a9b2-9515ebf01368  3/7/2022   3/11/2022  20.16667
## 4: 70be4661-054a-4f82-89e8-f1134ab67233 3/17/2022   3/17/2022  17.65000
## 5: db3e1095-eafc-489f-a0f0-b9dea9677302 3/17/2022   3/17/2022  24.41667
## 6: af57ccfa-7233-4ebf-96f6-1dde1c865220 3/18/2022   3/18/2022 124.28333
str(cohort)
## Classes 'data.table' and 'data.frame':   550 obs. of  4 variables:
##  $ CustomerID : chr  "869368ce-b1d9-47d8-a9b2-9515ebf01368" "869368ce-b1d9-47d8-a9b2-9515ebf01368" "869368ce-b1d9-47d8-a9b2-9515ebf01368" "70be4661-054a-4f82-89e8-f1134ab67233" ...
##  $ Join_Date  : chr  "3/7/2022" "3/7/2022" "3/7/2022" "3/17/2022" ...
##  $ InvoiceDate: Factor w/ 57 levels "3/7/2022","3/10/2022",..: 1 2 3 4 4 5 5 5 5 6 ...
##  $ value      : num  15 10.7 20.2 17.6 24.4 ...
##  - attr(*, ".internal.selfref")=<externalptr>

4 Convert Date format

cohort$Join_Date <- as.Date(cohort$Join_Date, "%m/%d/%Y")
cohort$InvoiceDate <- as.Date(cohort$InvoiceDate, "%m/%d/%Y")
str(cohort)
## Classes 'data.table' and 'data.frame':   550 obs. of  4 variables:
##  $ CustomerID : chr  "869368ce-b1d9-47d8-a9b2-9515ebf01368" "869368ce-b1d9-47d8-a9b2-9515ebf01368" "869368ce-b1d9-47d8-a9b2-9515ebf01368" "70be4661-054a-4f82-89e8-f1134ab67233" ...
##  $ Join_Date  : Date, format: "2022-03-07" "2022-03-07" ...
##  $ InvoiceDate: Date, format: "2022-03-07" "2022-03-10" ...
##  $ value      : num  15 10.7 20.2 17.6 24.4 ...
##  - attr(*, ".internal.selfref")=<externalptr>

5 Rounding selected columns

cohort <- cohort %>% mutate(across(where(is.numeric), round, 1))
DT::datatable(head(cohort,500),
              filter = 'top',
              rownames = FALSE,
              options = list(
                pageLength = 10, 
                pageLength = c(10,20,30,40,50)))

6 Su dung app vao tuan thu…?

library(dplyr)
library(lubridate)

cohort <- cohort %>% 
  mutate(visit_week = cut.Date(InvoiceDate, breaks = "1 week", labels = FALSE)) %>% 
  arrange(InvoiceDate)

head(cohort)
##                              CustomerID  Join_Date InvoiceDate value visit_week
## 1: 869368ce-b1d9-47d8-a9b2-9515ebf01368 2022-03-07  2022-03-07  15.0          1
## 2: 869368ce-b1d9-47d8-a9b2-9515ebf01368 2022-03-07  2022-03-10  10.7          1
## 3: 869368ce-b1d9-47d8-a9b2-9515ebf01368 2022-03-07  2022-03-11  20.2          1
## 4: 70be4661-054a-4f82-89e8-f1134ab67233 2022-03-17  2022-03-17  17.6          2
## 5: db3e1095-eafc-489f-a0f0-b9dea9677302 2022-03-17  2022-03-17  24.4          2
## 6: af57ccfa-7233-4ebf-96f6-1dde1c865220 2022-03-18  2022-03-18 124.3          2

7 Trong tuan thu ….x, co nhieu uniqueID su dung app?

library(tidyr)
cohort_wide <- cohort %>% group_by(Join_Date, visit_week) %>% 
            summarise(visits = n_distinct(CustomerID))

head(cohort_wide)
## # A tibble: 6 × 3
## # Groups:   Join_Date [2]
##   Join_Date  visit_week visits
##   <date>          <int>  <int>
## 1 2022-03-07          1      1
## 2 2022-03-07          5      1
## 3 2022-03-07          6      1
## 4 2022-03-07          7      1
## 5 2022-03-17          2      2
## 6 2022-03-17          5      1

8 Visualise datatable

library(kableExtra)
# Visualize data with Viridis Color
vs_dt <- cohort_wide[1:20, ] # 10 row 

vs_dt[2:3] <- lapply(vs_dt[2:3], function(x) {   #column 2 & 3 
    cell_spec(x, bold = T, 
              color = spec_color(x, end = 0.9),
              font_size = spec_font_size(x))
})

vs_dt[1] <- cell_spec(vs_dt[[1]], color = "white", bold = T, # column 1 
    background = spec_color(1:20, end = 0.9, option = "A", direction = -1))
kbl(vs_dt, escape = F, align = "c") %>%
  kable_classic("striped", full_width = F)
Join_Date visit_week visits
2022-03-07 1 1
2022-03-07 5 1
2022-03-07 6 1
2022-03-07 7 1
2022-03-17 2 2
2022-03-17 5 1
2022-03-17 6 1
2022-03-17 7 1
2022-03-17 9 1
2022-03-18 2 4
2022-03-18 3 3
2022-03-18 4 1
2022-03-18 5 4
2022-03-18 6 1
2022-03-18 7 2
2022-03-18 8 2
2022-03-18 9 2
2022-03-18 10 1
2022-03-20 2 4
2022-03-20 3 4
# Cannot convert column to rowname 
cohort_wide2 <- cohort_wide[,-1]
# rownames(cohort_wide2) <- cohort_wide[,1]

# using styleColorBar
## convert to numeric value 
cohort_wide2$visit_week <- as.numeric(cohort_wide2$visit_week)
cohort_wide2$visits <- as.numeric(cohort_wide2$visits)

library(DT)
datatable(cohort_wide2) %>% formatStyle(names(cohort_wide2),
  background = styleColorBar(range(cohort_wide2), 'lightblue'),
  backgroundSize = '98% 88%',
  backgroundRepeat = 'no-repeat',
  backgroundPosition = 'center')
cohort_wide1 <- cohort_wide %>% 
            spread(visit_week, visits, fill=0)
cohort_wide1
## # A tibble: 31 × 11
## # Groups:   Join_Date [31]
##    Join_Date    `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`
##    <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2022-03-07     1     0     0     0     1     1     1     0     0     0
##  2 2022-03-17     0     2     0     0     1     1     1     0     1     0
##  3 2022-03-18     0     4     3     1     4     1     2     2     2     1
##  4 2022-03-20     0     4     4     2     1     2     2     1     0     0
##  5 2022-03-23     0     0     1     1     1     1     1     0     0     0
##  6 2022-03-24     0     0     1     1     1     1     0     0     0     0
##  7 2022-03-25     0     0     6     4     3     4     6     2     4     0
##  8 2022-03-26     0     0     2     3     3     2     1     0     1     0
##  9 2022-03-27     0     0     1     1     1     1     1     1     0     0
## 10 2022-03-28     0     0     0     4     2     3     1     2     1     0
## # … with 21 more rows
# You can also style the full table as a type of heat map by using either styleInterval or styleColorBar, e.g.,
cohort_wide3 <- cohort_wide1[, -1]
### create 19 breaks and 20 rgb color values ranging from white to red
brks <- quantile(cohort_wide3, probs = seq(.05, .95, .05), na.rm = TRUE)
clrs <- round(seq(255, 40, length.out = length(brks) + 1), 0) %>%
  {paste0("rgb(255,", ., ",", ., ")")}

datatable(cohort_wide3) %>% formatStyle(names(cohort_wide3), backgroundColor = styleInterval(brks, clrs))

9 Count so nguoi join theo ngay

distinct_count <- cohort %>%
  group_by(Join_Date) %>%
  summarize(count = n_distinct(CustomerID))

distinct_count
## # A tibble: 31 × 2
##    Join_Date  count
##    <date>     <int>
##  1 2022-03-07     1
##  2 2022-03-17     2
##  3 2022-03-18     4
##  4 2022-03-20     5
##  5 2022-03-23     1
##  6 2022-03-24     2
##  7 2022-03-25     6
##  8 2022-03-26     3
##  9 2022-03-27     1
## 10 2022-03-28     4
## # … with 21 more rows
result <- merge(distinct_count, cohort_wide1, by = "Join_Date")
result
##     Join_Date count 1 2 3 4 5 6 7 8 9 10
## 1  2022-03-07     1 1 0 0 0 1 1 1 0 0  0
## 2  2022-03-17     2 0 2 0 0 1 1 1 0 1  0
## 3  2022-03-18     4 0 4 3 1 4 1 2 2 2  1
## 4  2022-03-20     5 0 4 4 2 1 2 2 1 0  0
## 5  2022-03-23     1 0 0 1 1 1 1 1 0 0  0
## 6  2022-03-24     2 0 0 1 1 1 1 0 0 0  0
## 7  2022-03-25     6 0 0 6 4 3 4 6 2 4  0
## 8  2022-03-26     3 0 0 2 3 3 2 1 0 1  0
## 9  2022-03-27     1 0 0 1 1 1 1 1 1 0  0
## 10 2022-03-28     4 0 0 0 4 2 3 1 2 1  0
## 11 2022-03-29     2 0 0 0 1 2 1 0 1 1  0
## 12 2022-03-30     3 0 0 0 3 3 0 0 0 0  0
## 13 2022-04-03     2 0 0 0 2 2 0 1 1 1  0
## 14 2022-04-15    16 0 0 0 0 0 8 5 6 9  2
## 15 2022-04-16     6 0 0 0 0 0 6 5 4 1  0
## 16 2022-04-19     1 0 0 0 0 0 0 1 1 0  0
## 17 2022-04-20     2 0 0 0 0 0 0 2 1 0  0
## 18 2022-04-21     5 0 0 0 0 0 0 3 4 2  1
## 19 2022-04-22     1 0 0 0 0 0 0 1 0 0  0
## 20 2022-04-23     1 0 0 0 0 0 0 0 1 0  0
## 21 2022-04-25     4 0 0 0 0 0 0 1 4 3  2
## 22 2022-04-26     2 0 0 0 0 0 0 0 2 0  0
## 23 2022-04-27     1 0 0 0 0 0 0 0 1 1  0
## 24 2022-04-28     1 0 0 0 0 0 0 0 1 1  0
## 25 2022-04-29     1 0 0 0 0 0 0 0 1 1  0
## 26 2022-04-30     2 0 0 0 0 0 0 0 2 2  1
## 27 2022-05-01     5 0 0 0 0 0 0 0 2 5  3
## 28 2022-05-02     2 0 0 0 0 0 0 1 0 2  1
## 29 2022-05-03     4 0 0 0 0 0 0 0 0 4  0
## 30 2022-05-05     2 0 0 0 0 0 0 0 0 2  0
## 31 2022-05-07     5 0 0 0 0 0 0 0 0 5  1
DT::datatable(head(result,500),
              filter = 'top',
              rownames = FALSE,
              options = list(
                pageLength = 10, 
                pageLength = c(10,20,30,40,50)))

10 Calculate retention

x <- result[,c(2:12)]
y <- result[,2] # cot count 

reten.r <- apply(x, 2, function(x) x/y ) # 2: column 
reten.r <- data.frame(cohort=(result$Join_Date), reten.r)

head(reten.r,5)
##       cohort count X1  X2   X3   X4  X5   X6  X7  X8  X9  X10
## 1 2022-03-07     1  1 0.0 0.00 0.00 1.0 1.00 1.0 0.0 0.0 0.00
## 2 2022-03-17     1  0 1.0 0.00 0.00 0.5 0.50 0.5 0.0 0.5 0.00
## 3 2022-03-18     1  0 1.0 0.75 0.25 1.0 0.25 0.5 0.5 0.5 0.25
## 4 2022-03-20     1  0 0.8 0.80 0.40 0.2 0.40 0.4 0.2 0.0 0.00
## 5 2022-03-23     1  0 0.0 1.00 1.00 1.0 1.00 1.0 0.0 0.0 0.00
# Rounding & % 
library(scales)
reten.r[2:12] <- sapply(reten.r[2:12], function(x) percent(x, accuracy=1))
reten.r <- as.data.frame(reten.r)
# Table 
library(kableExtra)
kbl(reten.r) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) 
cohort count X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
2022-03-07 100% 100% 0% 0% 0% 100% 100% 100% 0% 0% 0%
2022-03-17 100% 0% 100% 0% 0% 50% 50% 50% 0% 50% 0%
2022-03-18 100% 0% 100% 75% 25% 100% 25% 50% 50% 50% 25%
2022-03-20 100% 0% 80% 80% 40% 20% 40% 40% 20% 0% 0%
2022-03-23 100% 0% 0% 100% 100% 100% 100% 100% 0% 0% 0%
2022-03-24 100% 0% 0% 50% 50% 50% 50% 0% 0% 0% 0%
2022-03-25 100% 0% 0% 100% 67% 50% 67% 100% 33% 67% 0%
2022-03-26 100% 0% 0% 67% 100% 100% 67% 33% 0% 33% 0%
2022-03-27 100% 0% 0% 100% 100% 100% 100% 100% 100% 0% 0%
2022-03-28 100% 0% 0% 0% 100% 50% 75% 25% 50% 25% 0%
2022-03-29 100% 0% 0% 0% 50% 100% 50% 0% 50% 50% 0%
2022-03-30 100% 0% 0% 0% 100% 100% 0% 0% 0% 0% 0%
2022-04-03 100% 0% 0% 0% 100% 100% 0% 50% 50% 50% 0%
2022-04-15 100% 0% 0% 0% 0% 0% 50% 31% 38% 56% 12%
2022-04-16 100% 0% 0% 0% 0% 0% 100% 83% 67% 17% 0%
2022-04-19 100% 0% 0% 0% 0% 0% 0% 100% 100% 0% 0%
2022-04-20 100% 0% 0% 0% 0% 0% 0% 100% 50% 0% 0%
2022-04-21 100% 0% 0% 0% 0% 0% 0% 60% 80% 40% 20%
2022-04-22 100% 0% 0% 0% 0% 0% 0% 100% 0% 0% 0%
2022-04-23 100% 0% 0% 0% 0% 0% 0% 0% 100% 0% 0%
2022-04-25 100% 0% 0% 0% 0% 0% 0% 25% 100% 75% 50%
2022-04-26 100% 0% 0% 0% 0% 0% 0% 0% 100% 0% 0%
2022-04-27 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 0%
2022-04-28 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 0%
2022-04-29 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 0%
2022-04-30 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 50%
2022-05-01 100% 0% 0% 0% 0% 0% 0% 0% 40% 100% 60%
2022-05-02 100% 0% 0% 0% 0% 0% 0% 50% 0% 100% 50%
2022-05-03 100% 0% 0% 0% 0% 0% 0% 0% 0% 100% 0%
2022-05-05 100% 0% 0% 0% 0% 0% 0% 0% 0% 100% 0%
2022-05-07 100% 0% 0% 0% 0% 0% 0% 0% 0% 100% 20%
reten.r %>%
  kbl() %>%
  kable_minimal()
cohort count X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
2022-03-07 100% 100% 0% 0% 0% 100% 100% 100% 0% 0% 0%
2022-03-17 100% 0% 100% 0% 0% 50% 50% 50% 0% 50% 0%
2022-03-18 100% 0% 100% 75% 25% 100% 25% 50% 50% 50% 25%
2022-03-20 100% 0% 80% 80% 40% 20% 40% 40% 20% 0% 0%
2022-03-23 100% 0% 0% 100% 100% 100% 100% 100% 0% 0% 0%
2022-03-24 100% 0% 0% 50% 50% 50% 50% 0% 0% 0% 0%
2022-03-25 100% 0% 0% 100% 67% 50% 67% 100% 33% 67% 0%
2022-03-26 100% 0% 0% 67% 100% 100% 67% 33% 0% 33% 0%
2022-03-27 100% 0% 0% 100% 100% 100% 100% 100% 100% 0% 0%
2022-03-28 100% 0% 0% 0% 100% 50% 75% 25% 50% 25% 0%
2022-03-29 100% 0% 0% 0% 50% 100% 50% 0% 50% 50% 0%
2022-03-30 100% 0% 0% 0% 100% 100% 0% 0% 0% 0% 0%
2022-04-03 100% 0% 0% 0% 100% 100% 0% 50% 50% 50% 0%
2022-04-15 100% 0% 0% 0% 0% 0% 50% 31% 38% 56% 12%
2022-04-16 100% 0% 0% 0% 0% 0% 100% 83% 67% 17% 0%
2022-04-19 100% 0% 0% 0% 0% 0% 0% 100% 100% 0% 0%
2022-04-20 100% 0% 0% 0% 0% 0% 0% 100% 50% 0% 0%
2022-04-21 100% 0% 0% 0% 0% 0% 0% 60% 80% 40% 20%
2022-04-22 100% 0% 0% 0% 0% 0% 0% 100% 0% 0% 0%
2022-04-23 100% 0% 0% 0% 0% 0% 0% 0% 100% 0% 0%
2022-04-25 100% 0% 0% 0% 0% 0% 0% 25% 100% 75% 50%
2022-04-26 100% 0% 0% 0% 0% 0% 0% 0% 100% 0% 0%
2022-04-27 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 0%
2022-04-28 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 0%
2022-04-29 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 0%
2022-04-30 100% 0% 0% 0% 0% 0% 0% 0% 100% 100% 50%
2022-05-01 100% 0% 0% 0% 0% 0% 0% 0% 40% 100% 60%
2022-05-02 100% 0% 0% 0% 0% 0% 0% 50% 0% 100% 50%
2022-05-03 100% 0% 0% 0% 0% 0% 0% 0% 0% 100% 0%
2022-05-05 100% 0% 0% 0% 0% 0% 0% 0% 0% 100% 0%
2022-05-07 100% 0% 0% 0% 0% 0% 0% 0% 0% 100% 20%