1 Library Loading

library(tidyverse)
library("readxl")
library("writexl")
library(tidyr)
library(dplyr)
library(lubridate)

2 Data Wrangling

2.1 Remove duplicated data

Active <- Aug30

Active <- Active %>% group_by(UserID,ActiveDate) %>% summarise(totalusage = sum(`Usage Time (minute)`))

Aug30_a <- Aug30 %>% select(UserID,RedeemedAt_text) 
Aug30_a <- Aug30_a[!duplicated(Aug30_a$UserID), ]

Active_m <- merge(Active,Aug30_a)

2.2 Calculate DAU

# data cleaning 
Active_m$RedeemedAt_text <- as.Date(Active_m$RedeemedAt_text)
Active_m <- Active_m %>% mutate(A0 = Active_m$RedeemedAt_text + 0)
Active_m <- Active_m %>% mutate(A1 = Active_m$RedeemedAt_text + 1)
Active_m <- Active_m %>% mutate(A7 = Active_m$RedeemedAt_text + 7)
Active_m <- Active_m %>% mutate(A14 = Active_m$RedeemedAt_text + 14)
Active_m <- Active_m %>% mutate(A30 = Active_m$RedeemedAt_text + 30)

Active_m <- Active_m %>% mutate(A0_True = if_else(A0 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A1_True = if_else(A1 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A7_True = if_else(A7 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A14_True = if_else(A14 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A30_True = if_else(A30 == ActiveDate,TRUE,FALSE))

# number of active user after 1, 7,14,30 days since mobile app registered 
Active_a0 <- Active_m %>% filter(A0_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_a0= n_distinct(UserID))
Active_a1 <- Active_m %>% filter(A1_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_a1 = n_distinct(UserID))
Active_a7 <- Active_m %>% filter(A7_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_a7 = n_distinct(UserID))
Active_a14 <- Active_m %>% filter(A14_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_14 = n_distinct(UserID))
Active_a30 <- Active_m %>% filter(A30_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_30 = n_distinct(UserID))

# fullJoint 
fullJoinDf <- full_join(Active_a0,Active_a1,by="RedeemedAt_text")
fullJoinDf <- full_join(fullJoinDf,Active_a7,by="RedeemedAt_text")
fullJoinDf <- full_join(fullJoinDf,Active_a14,by="RedeemedAt_text")
fullJoinDf <- full_join(fullJoinDf,Active_a30,by="RedeemedAt_text")

# count distinct 
Active_a <- Active_m %>% group_by(RedeemedAt_text)%>% summarise(count = n_distinct(UserID))

fullJoinDf <- full_join(fullJoinDf,Active_a,by="RedeemedAt_text")

head(fullJoinDf,5) %>% knitr::kable()
RedeemedAt_text count_a0 count_a1 count_a7 count_14 count_30 count
2021-07-01 4 3 NA 1 1 4
2021-07-02 6 3 2 2 3 9
2021-07-03 2 2 2 2 NA 3
2021-07-04 2 2 NA NA 1 2
2021-07-12 1 NA NA NA NA 1

2.2.1 Daily Retention Calculation

dat <- c(0:60)
dat <- data.frame(dat)

library(lubridate)
test <- Aug30
test$ActiveDate <- as.Date(test$ActiveDate)
test$RedeemedAt_text <- as.Date(test$RedeemedAt_text)
test1 <- test %>% mutate(diff = as.numeric(test$ActiveDate - test$RedeemedAt_text))
test2 <- test1 %>% select(RedeemedAt_text,UserID,diff)

test3 <- test2 %>% group_by(RedeemedAt_text,diff) %>% summarise(n=n_distinct(UserID))
test4 <- test3 %>% select(RedeemedAt_text)
test5 <- test4[!duplicated(test4$RedeemedAt_text),]
test6 <- merge(test5,dat)
test7 <- left_join(test6,test3, by = c("RedeemedAt_text" = "RedeemedAt_text", "dat" = "diff") )
test7$n[is.na(test7$n)] <- 0
result <- spread(test7, dat, n)

test8 <- test %>% group_by(RedeemedAt_text) %>% summarise(nru = n_distinct(UserID))

result <- merge(result, test8)
head(result,5)%>% knitr::kable()
RedeemedAt_text 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 nru
2021-07-01 4 3 3 0 1 0 1 0 0 1 3 2 2 2 1 1 1 1 2 1 0 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 4
2021-07-02 6 3 3 3 3 1 2 2 3 1 0 0 1 2 2 3 3 2 2 3 2 2 2 2 4 2 2 2 2 1 3 3 0 2 1 1 2 3 2 2 3 3 4 0 3 2 1 1 0 2 2 0 0 0 0 0 0 0 0 0 0 9
2021-07-03 2 2 3 1 1 1 0 2 0 1 0 1 0 1 2 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 1 2 1 2 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3
2021-07-04 2 2 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 2
2021-07-06 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1

3 Calculate percentage for all columns

result_pct <- data.frame(
  result$RedeemedAt_text,
  result[,2:ncol(result)]/result[["nru"]]
)

# rounded value 
result_pct4 <- result_pct %>% 
  mutate_at(vars(-result.RedeemedAt_text), list(~ round(., 2)))

# rounded value and add % 
myfun <- function(x) {
  if(is.numeric(x)){ 
    ifelse(is.na(x), x, paste0(round(x*100L, 1), "%")) 
  } else x 
}

result_pct5 <- result_pct %>% mutate_each(funs(myfun))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
result_pct6 <- result_pct5 %>% mutate(sample = result$nru)

# reorder the position of column 
result_pct7 <- result_pct6[,c(1,64,63,2,3:ncol(result_pct6))]

# rename all column
result_pct8 <- result_pct7 %>% rename(RedeemdAt = result.RedeemedAt_text,
                                      NRU = sample,
                                      pct = nru)

head(result_pct8,5) %>%knitr::kable()
RedeemdAt NRU pct X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 nru.1 sample.1
2021-07-01 4 100% 100% 75% 75% 0% 25% 0% 25% 0% 0% 25% 75% 50% 50% 50% 25% 25% 25% 25% 50% 25% 0% 25% 50% 50% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 50% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 25% 0% 0% 0% 0% 0% 0% 0% 0% 0% 100% 4
2021-07-02 9 100% 66.7% 33.3% 33.3% 33.3% 33.3% 11.1% 22.2% 22.2% 33.3% 11.1% 0% 0% 11.1% 22.2% 22.2% 33.3% 33.3% 22.2% 22.2% 33.3% 22.2% 22.2% 22.2% 22.2% 44.4% 22.2% 22.2% 22.2% 22.2% 11.1% 33.3% 33.3% 0% 22.2% 11.1% 11.1% 22.2% 33.3% 22.2% 22.2% 33.3% 33.3% 44.4% 0% 33.3% 22.2% 11.1% 11.1% 0% 22.2% 22.2% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 100% 9
2021-07-03 3 100% 66.7% 66.7% 100% 33.3% 33.3% 33.3% 0% 66.7% 0% 33.3% 0% 33.3% 0% 33.3% 66.7% 0% 0% 0% 33.3% 0% 0% 33.3% 0% 0% 0% 0% 33.3% 33.3% 0% 0% 0% 0% 33.3% 0% 0% 33.3% 33.3% 33.3% 33.3% 66.7% 33.3% 66.7% 33.3% 33.3% 33.3% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 100% 3
2021-07-04 2 100% 100% 100% 0% 0% 0% 0% 50% 0% 50% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 50% 0% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 50% 0% 50% 50% 50% 50% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 100% 2
2021-07-06 1 100% 0% 0% 100% 0% 0% 100% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 100% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 100% 100% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 100% 1