Clear Workspace

rm(list = ls())
library(dplyr)
library(readxl)
library(xlsx)
library(tidyr)
library(data.table)

Load data

data <- read_excel("Data Track.xlsx")
data2 <- read_excel("Data Track 2.xlsx")

df <- data %>% full_join(data2, by="UserPhone")

Data cleaning

df <- df %>% mutate_if(is.numeric , replace_na, replace = 0)
df_unique <- df %>% distinct(UserPhone, .keep_all = TRUE)

ncol(df_unique)
## [1] 241

Count & sum by row

Note: function rowSum

test <- select(df_unique, -c(1:85))

test$count <- rowSums(is.na(test)) # count na value 
test$count2 <- rowSums(!is.na(test)) # count not na value 
test$sum <- rowSums(test, na.rm=TRUE) # sum without na 

rm(test)

Replace NA value with 0 for numeric column

Note: function muate_if

test <- select(data, -c(1:85))
test <- test %>% mutate_if(is.numeric , replace_na, replace = 0) # replace na by 0

test$count <- rowSums(test != 0) # count value (not count 0)
test$sum <- rowSums(test, na.rm=TRUE) # sum without na 
test$sum <- rowSums(test[, c(1:8), 1], na.rm=TRUE) # sum without na 

Calculate real dataset

ncol(df_unique) # number of col 
## [1] 241
test <- df_unique
ncol(test)
## [1] 241
test_unique <- test %>% mutate_if(is.numeric , replace_na, replace = 0) # replace na by 0

test_unique$count <- rowSums(test_unique != 0) # count value (not count 0)
test_unique$count1 <- rowSums(test_unique[, c(9:241)] != 0) # count value (not count 0) with subset coclums 

test_unique$sum <- rowSums(test_unique[, c(9:241), 1], na.rm=TRUE) # sum without na 
test_unique <- test_unique[complete.cases(test_unique),]

Export to excel

write.xlsx(test_unique, file = "data_fulljoin.xlsx",
           sheetName = "data", append = FALSE)

Processing Dates

Note: function setnames() to change multiple column names

mydata <- subset(test_unique, select = -c(2,4:7))
df <- melt(setDT(mydata), id.vars = c("UserPhone","RedeemDate.x"), variable.name = "active_date")
setnames(df, old = c("UserPhone","RedeemDate.x", "active_date"), new = c("CustomerID", "Join_Date", "InvoiceDate"))

str(df)
## Classes 'data.table' and 'data.frame':   68967 obs. of  4 variables:
##  $ CustomerID : chr  "0325288043" "0327175989" "0327245578" "0328768642" ...
##  $ Join_Date  : chr  "2021-10-01 20:59:30.874" "2021-09-30 18:57:17.764" "2021-09-26 18:59:09.619" "2021-09-28 21:01:02.539" ...
##  $ InvoiceDate: Factor w/ 237 levels "2021-09-28","2021-09-29",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value      : num  0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Converting the InvoiceDate column into a date object
df$Join_Date <- as.Date(df$Join_Date)
df$InvoiceDate <- as.Date(df$InvoiceDate)
str(df)
## Classes 'data.table' and 'data.frame':   68967 obs. of  4 variables:
##  $ CustomerID : chr  "0325288043" "0327175989" "0327245578" "0328768642" ...
##  $ Join_Date  : Date, format: "2021-10-01" "2021-09-30" ...
##  $ InvoiceDate: Date, format: "2021-09-28" "2021-09-28" ...
##  $ value      : num  0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
## solution 2 to change character column to Date 
df1 <- df 
df1 <- select(df, -c(1))
res <- df1 %>% mutate_if(is.character, as.Date)
str(res)
## Classes 'data.table' and 'data.frame':   68967 obs. of  3 variables:
##  $ Join_Date  : Date, format: "2021-10-01" "2021-09-30" ...
##  $ InvoiceDate: Date, format: "2021-09-28" "2021-09-28" ...
##  $ value      : num  0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Setting up a column for the year data to make it easier to focus on year
df$Year <- as.numeric(format(df$Join_Date, "%Y"))

# # Seperating the 2010 observations from the 2011, observations to focus strictly on the 2011 cohorts
cohorts2021 <- df[df$Year == 2021,]

# Drop unneeded variables 
cohorts2021 <- cohorts2021[,c("CustomerID", "InvoiceDate", "Year")]
str(cohorts2021)
## Classes 'data.table' and 'data.frame':   68967 obs. of  3 variables:
##  $ CustomerID : chr  "0325288043" "0327175989" "0327245578" "0328768642" ...
##  $ InvoiceDate: Date, format: "2021-09-28" "2021-09-28" ...
##  $ Year       : num  2021 2021 2021 2021 2021 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Creating Cohorts

# Geting the first transaction
join.date <- aggregate(Join_Date~CustomerID, df, min, na.rm = TRUE)

# Changing the name of the column InvoiceDate to Join_Date since this is the first transaction date for each customer
colnames(join.date)[2] <- "Join_Date"
cohorts2021 <- merge(cohorts2021, join.date, by.x = "CustomerID",by.y = "CustomerID", all.x = TRUE)

# cohorts2021 <- cohorts2021 %>% mutate(days = cohorts2021$Join_Date - cohorts2021$InvoiceDate) # lable days
cohorts2021 <- cohorts2021 %>% mutate(days = as.numeric(cohorts2021$Join_Date - cohorts2021$InvoiceDate))
cohorts2021 <- cohorts2021[!(cohorts2021$days > 0)]

# Creating the groups/Cohorts based on the join date month
cohorts2021$Cohort <- as.numeric(format(cohorts2021$Join_Date, "%m"))

# Hide original customerID and replace by new ID 
data_id1 <- transform(cohorts2021,                                 
                      ID = as.numeric(factor(cohorts2021$CustomerID))) # Create new column named ID 

cohorts2021 <- transform(cohorts2021,                                 
                      CustomerID = as.numeric(factor(cohorts2021$CustomerID))) # Replace directly, not create new column


# Data table
library(data.table)

cohorts2021 <- select(cohorts2021, -c(5)) 

DT::datatable(head(cohorts2021, 100),
              filter = 'top',
              rownames = FALSE,
              options = list(
                pageLength = 10,
                pageLength = c(10, 20, 30, 40, 50)),
)

Cohort Age

cohorts2021$Age_by_Day <- as.numeric(difftime(cohorts2021$InvoiceDate, cohorts2021$Join_Date, 
                                              units = c("days")))

# Dividing the days by 30 to get the number of months
cohorts2021$Age_by_Month <- floor(cohorts2021$Age_by_Day/30)

cohorts2021$Join_Date <- format(cohorts2021$Join_Date, "%Y-%m")
cohorts2021$InvoiceDate <- format(cohorts2021$InvoiceDate, "%Y-%m")

# Renaming levels of a factor
str(cohorts2021)
## Classes 'data.table' and 'data.frame':   66789 obs. of  7 variables:
##  $ CustomerID  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ InvoiceDate : chr  "2021-10" "2021-10" "2021-10" "2021-10" ...
##  $ Year        : num  2021 2021 2021 2021 2021 ...
##  $ Join_Date   : chr  "2021-10" "2021-10" "2021-10" "2021-10" ...
##  $ Cohort      : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Age_by_Day  : num  0 1 2 3 4 5 6 7 8 9 ...
##  $ Age_by_Month: num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
cohorts2021$Cohort <- as.factor(cohorts2021$Cohort)
library(plyr)
cohorts2021$Cohort <- revalue(cohorts2021$Cohort, c("9"="Sep Cohorts", "10"="Oct Cohorts", "11" = "Nov Cohorts"))

# Have a look 
DT::datatable(head(cohorts2021,100),
              filter = 'top',
              rownames = FALSE,
              options = list(
                pageLength = 10,
                pageLength = c(10,20,30,40,50)))

MAU Mixpanel

# Count unique ID per month

## By excluding both columns Age_by_Day and Age_by_Month, we're able to remove the extra monthly observations to avoid counting unique customer IDs multiple times in any single month.

## The day and month Age variables keep us from removing duplicates which is why we need to exclude them both
dupes <- which(duplicated(cohorts2021[,c(-7,-6)]))
cohorts2021_nodup <- cohorts2021[-dupes,]

cohorts.wide <- reshape2::dcast(cohorts2021_nodup,Cohort~Age_by_Month,
                                value.var="CustomerID",
                                fun.aggregate = length)
# Cloning the output for retention and churn mixpanels
# to be used later
cw.retention <- cohorts.wide
cw.churn <- cohorts.wide

# Creating 19 breaks and 20 rgb color values ranging from blue to white
breaks <- quantile(cohorts.wide[,3:9], probs = seq(.05, .95, .05), na.rm = TRUE)
colors <- sapply(round(seq(155, 80, length.out = length(breaks) + 1), 0),
                 function(x){ rgb(x,x,155, maxColorValue = 155) } )


# The Retention Mixpanel with counts
library(DT)
DT::datatable(cohorts.wide,
              class = 'cell-border stripe',
             rownames = FALSE,
             options = list(
               ordering=F,
               dom = 't',
               pageLength = 12) ) %>%
             formatStyle("0",
                         backgroundColor = 'lightgrey',
                         fontWeight = 'bold') %>%
  formatStyle(names(cohorts.wide[c(-1,-2)]),fontWeight = 'bold',color = 'white', backgroundColor = styleInterval(breaks,colors))

Reference

  1. as.Date()
  2. rowSum()
  3. rowSum()
  4. Create new ID
  5. Rename Levels of Factor
  6. Cohort Analysis