Library Loading
library(tidyverse)
library("readxl")
library("writexl")
library(tidyr)
library(dplyr)
library(lubridate)
Data Wrangling
Remove duplicated data
Active <- Aug30
Active <- Active %>% group_by(UserID,ActiveDate) %>% summarise(totalusage = sum(`Usage Time (minute)`))
Aug30_a <- Aug30 %>% select(UserID,RedeemedAt_text)
Aug30_a <- Aug30_a[!duplicated(Aug30_a$UserID), ]
Active_m <- merge(Active,Aug30_a)
Calculate DAU
# data cleaning
Active_m$RedeemedAt_text <- as.Date(Active_m$RedeemedAt_text)
Active_m <- Active_m %>% mutate(A0 = Active_m$RedeemedAt_text + 0)
Active_m <- Active_m %>% mutate(A1 = Active_m$RedeemedAt_text + 1)
Active_m <- Active_m %>% mutate(A7 = Active_m$RedeemedAt_text + 7)
Active_m <- Active_m %>% mutate(A14 = Active_m$RedeemedAt_text + 14)
Active_m <- Active_m %>% mutate(A30 = Active_m$RedeemedAt_text + 30)
Active_m <- Active_m %>% mutate(A0_True = if_else(A0 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A1_True = if_else(A1 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A7_True = if_else(A7 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A14_True = if_else(A14 == ActiveDate,TRUE,FALSE))
Active_m <- Active_m %>% mutate(A30_True = if_else(A30 == ActiveDate,TRUE,FALSE))
# number of active user after 1, 7,14,30 days since mobile app registered
Active_a0 <- Active_m %>% filter(A0_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_a0= n_distinct(UserID))
Active_a1 <- Active_m %>% filter(A1_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_a1 = n_distinct(UserID))
Active_a7 <- Active_m %>% filter(A7_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_a7 = n_distinct(UserID))
Active_a14 <- Active_m %>% filter(A14_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_14 = n_distinct(UserID))
Active_a30 <- Active_m %>% filter(A30_True == "TRUE")%>% group_by(RedeemedAt_text)%>% summarise(count_30 = n_distinct(UserID))
# fullJoint
fullJoinDf <- full_join(Active_a0,Active_a1,by="RedeemedAt_text")
fullJoinDf <- full_join(fullJoinDf,Active_a7,by="RedeemedAt_text")
fullJoinDf <- full_join(fullJoinDf,Active_a14,by="RedeemedAt_text")
fullJoinDf <- full_join(fullJoinDf,Active_a30,by="RedeemedAt_text")
# count distinct
Active_a <- Active_m %>% group_by(RedeemedAt_text)%>% summarise(count = n_distinct(UserID))
fullJoinDf <- full_join(fullJoinDf,Active_a,by="RedeemedAt_text")
head(fullJoinDf,5) %>% knitr::kable()
| 2021-07-01 |
4 |
3 |
NA |
1 |
1 |
4 |
| 2021-07-02 |
6 |
3 |
2 |
2 |
3 |
9 |
| 2021-07-03 |
2 |
2 |
2 |
2 |
NA |
3 |
| 2021-07-04 |
2 |
2 |
NA |
NA |
1 |
2 |
| 2021-07-12 |
1 |
NA |
NA |
NA |
NA |
1 |
Daily Retention Calculation
dat <- c(0:60)
dat <- data.frame(dat)
library(lubridate)
test <- Aug30
test$ActiveDate <- as.Date(test$ActiveDate)
test$RedeemedAt_text <- as.Date(test$RedeemedAt_text)
test1 <- test %>% mutate(diff = as.numeric(test$ActiveDate - test$RedeemedAt_text))
test2 <- test1 %>% select(RedeemedAt_text,UserID,diff)
test3 <- test2 %>% group_by(RedeemedAt_text,diff) %>% summarise(n=n_distinct(UserID))
test4 <- test3 %>% select(RedeemedAt_text)
test5 <- test4[!duplicated(test4$RedeemedAt_text),]
test6 <- merge(test5,dat)
test7 <- left_join(test6,test3, by = c("RedeemedAt_text" = "RedeemedAt_text", "dat" = "diff") )
test7$n[is.na(test7$n)] <- 0
result <- spread(test7, dat, n)
test8 <- test %>% group_by(RedeemedAt_text) %>% summarise(nru = n_distinct(UserID))
result <- merge(result, test8)
head(result,5)%>% knitr::kable()
| 2021-07-01 |
4 |
3 |
3 |
0 |
1 |
0 |
1 |
0 |
0 |
1 |
3 |
2 |
2 |
2 |
1 |
1 |
1 |
1 |
2 |
1 |
0 |
1 |
2 |
2 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
2 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
4 |
| 2021-07-02 |
6 |
3 |
3 |
3 |
3 |
1 |
2 |
2 |
3 |
1 |
0 |
0 |
1 |
2 |
2 |
3 |
3 |
2 |
2 |
3 |
2 |
2 |
2 |
2 |
4 |
2 |
2 |
2 |
2 |
1 |
3 |
3 |
0 |
2 |
1 |
1 |
2 |
3 |
2 |
2 |
3 |
3 |
4 |
0 |
3 |
2 |
1 |
1 |
0 |
2 |
2 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
9 |
| 2021-07-03 |
2 |
2 |
3 |
1 |
1 |
1 |
0 |
2 |
0 |
1 |
0 |
1 |
0 |
1 |
2 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
2 |
1 |
2 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
3 |
| 2021-07-04 |
2 |
2 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
| 2021-07-06 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
Calculate percentage for all columns
result_pct <- data.frame(
result$RedeemedAt_text,
result[,2:ncol(result)]/result[["nru"]]
)
# rounded value
result_pct4 <- result_pct %>%
mutate_at(vars(-result.RedeemedAt_text), list(~ round(., 2)))
# rounded value and add %
myfun <- function(x) {
if(is.numeric(x)){
ifelse(is.na(x), x, paste0(round(x*100L, 1), "%"))
} else x
}
result_pct5 <- result_pct %>% mutate_each(funs(myfun))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
result_pct6 <- result_pct5 %>% mutate(sample = result$nru)
# reorder the position of column
result_pct7 <- result_pct6[,c(1,64,63,2,3:ncol(result_pct6))]
# rename all column
result_pct8 <- result_pct7 %>% rename(RedeemdAt = result.RedeemedAt_text,
NRU = sample,
pct = nru)
head(result_pct8,5) %>%knitr::kable()
| 2021-07-01 |
4 |
100% |
100% |
75% |
75% |
0% |
25% |
0% |
25% |
0% |
0% |
25% |
75% |
50% |
50% |
50% |
25% |
25% |
25% |
25% |
50% |
25% |
0% |
25% |
50% |
50% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
50% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
25% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
100% |
4 |
| 2021-07-02 |
9 |
100% |
66.7% |
33.3% |
33.3% |
33.3% |
33.3% |
11.1% |
22.2% |
22.2% |
33.3% |
11.1% |
0% |
0% |
11.1% |
22.2% |
22.2% |
33.3% |
33.3% |
22.2% |
22.2% |
33.3% |
22.2% |
22.2% |
22.2% |
22.2% |
44.4% |
22.2% |
22.2% |
22.2% |
22.2% |
11.1% |
33.3% |
33.3% |
0% |
22.2% |
11.1% |
11.1% |
22.2% |
33.3% |
22.2% |
22.2% |
33.3% |
33.3% |
44.4% |
0% |
33.3% |
22.2% |
11.1% |
11.1% |
0% |
22.2% |
22.2% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
100% |
9 |
| 2021-07-03 |
3 |
100% |
66.7% |
66.7% |
100% |
33.3% |
33.3% |
33.3% |
0% |
66.7% |
0% |
33.3% |
0% |
33.3% |
0% |
33.3% |
66.7% |
0% |
0% |
0% |
33.3% |
0% |
0% |
33.3% |
0% |
0% |
0% |
0% |
33.3% |
33.3% |
0% |
0% |
0% |
0% |
33.3% |
0% |
0% |
33.3% |
33.3% |
33.3% |
33.3% |
66.7% |
33.3% |
66.7% |
33.3% |
33.3% |
33.3% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
100% |
3 |
| 2021-07-04 |
2 |
100% |
100% |
100% |
0% |
0% |
0% |
0% |
50% |
0% |
50% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
50% |
0% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
50% |
0% |
50% |
50% |
50% |
50% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
100% |
2 |
| 2021-07-06 |
1 |
100% |
0% |
0% |
100% |
0% |
0% |
100% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
100% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
100% |
100% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
0% |
100% |
1 |