rm(list = ls())
library(dplyr)
library(readxl)
library(xlsx)
library(tidyr)
library(data.table)
data <- read_excel("Data Track.xlsx")
data2 <- read_excel("Data Track 2.xlsx")
df <- data %>% full_join(data2, by="UserPhone")
df <- df %>% mutate_if(is.numeric , replace_na, replace = 0)
df_unique <- df %>% distinct(UserPhone, .keep_all = TRUE)
ncol(df_unique)
## [1] 241
Note: function rowSum
test <- select(df_unique, -c(1:85))
test$count <- rowSums(is.na(test)) # count na value
test$count2 <- rowSums(!is.na(test)) # count not na value
test$sum <- rowSums(test, na.rm=TRUE) # sum without na
rm(test)
Note: function muate_if
test <- select(data, -c(1:85))
test <- test %>% mutate_if(is.numeric , replace_na, replace = 0) # replace na by 0
test$count <- rowSums(test != 0) # count value (not count 0)
test$sum <- rowSums(test, na.rm=TRUE) # sum without na
test$sum <- rowSums(test[, c(1:8), 1], na.rm=TRUE) # sum without na
ncol(df_unique) # number of col
## [1] 241
test <- df_unique
ncol(test)
## [1] 241
test_unique <- test %>% mutate_if(is.numeric , replace_na, replace = 0) # replace na by 0
test_unique$count <- rowSums(test_unique != 0) # count value (not count 0)
test_unique$count1 <- rowSums(test_unique[, c(9:241)] != 0) # count value (not count 0) with subset coclums
test_unique$sum <- rowSums(test_unique[, c(9:241), 1], na.rm=TRUE) # sum without na
test_unique <- test_unique[complete.cases(test_unique),]
write.xlsx(test_unique, file = "data_fulljoin.xlsx",
sheetName = "data", append = FALSE)
Note: function setnames() to change multiple column names
mydata <- subset(test_unique, select = -c(2,4:7))
df <- melt(setDT(mydata), id.vars = c("UserPhone","RedeemDate.x"), variable.name = "active_date")
setnames(df, old = c("UserPhone","RedeemDate.x", "active_date"), new = c("CustomerID", "Join_Date", "InvoiceDate"))
str(df)
## Classes 'data.table' and 'data.frame': 68967 obs. of 4 variables:
## $ CustomerID : chr "0325288043" "0327175989" "0327245578" "0328768642" ...
## $ Join_Date : chr "2021-10-01 20:59:30.874" "2021-09-30 18:57:17.764" "2021-09-26 18:59:09.619" "2021-09-28 21:01:02.539" ...
## $ InvoiceDate: Factor w/ 237 levels "2021-09-28","2021-09-29",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Converting the InvoiceDate column into a date object
df$Join_Date <- as.Date(df$Join_Date)
df$InvoiceDate <- as.Date(df$InvoiceDate)
str(df)
## Classes 'data.table' and 'data.frame': 68967 obs. of 4 variables:
## $ CustomerID : chr "0325288043" "0327175989" "0327245578" "0328768642" ...
## $ Join_Date : Date, format: "2021-10-01" "2021-09-30" ...
## $ InvoiceDate: Date, format: "2021-09-28" "2021-09-28" ...
## $ value : num 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
## solution 2 to change character column to Date
df1 <- df
df1 <- select(df, -c(1))
res <- df1 %>% mutate_if(is.character, as.Date)
str(res)
## Classes 'data.table' and 'data.frame': 68967 obs. of 3 variables:
## $ Join_Date : Date, format: "2021-10-01" "2021-09-30" ...
## $ InvoiceDate: Date, format: "2021-09-28" "2021-09-28" ...
## $ value : num 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Setting up a column for the year data to make it easier to focus on year
df$Year <- as.numeric(format(df$Join_Date, "%Y"))
# # Seperating the 2010 observations from the 2011, observations to focus strictly on the 2011 cohorts
cohorts2021 <- df[df$Year == 2021,]
# Drop unneeded variables
cohorts2021 <- cohorts2021[,c("CustomerID", "InvoiceDate", "Year")]
str(cohorts2021)
## Classes 'data.table' and 'data.frame': 68967 obs. of 3 variables:
## $ CustomerID : chr "0325288043" "0327175989" "0327245578" "0328768642" ...
## $ InvoiceDate: Date, format: "2021-09-28" "2021-09-28" ...
## $ Year : num 2021 2021 2021 2021 2021 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Geting the first transaction
join.date <- aggregate(Join_Date~CustomerID, df, min, na.rm = TRUE)
# Changing the name of the column InvoiceDate to Join_Date since this is the first transaction date for each customer
colnames(join.date)[2] <- "Join_Date"
cohorts2021 <- merge(cohorts2021, join.date, by.x = "CustomerID",by.y = "CustomerID", all.x = TRUE)
# cohorts2021 <- cohorts2021 %>% mutate(days = cohorts2021$Join_Date - cohorts2021$InvoiceDate) # lable days
cohorts2021 <- cohorts2021 %>% mutate(days = as.numeric(cohorts2021$Join_Date - cohorts2021$InvoiceDate))
cohorts2021 <- cohorts2021[!(cohorts2021$days > 0)]
# Creating the groups/Cohorts based on the join date month
cohorts2021$Cohort <- as.numeric(format(cohorts2021$Join_Date, "%m"))
# Hide original customerID and replace by new ID
data_id1 <- transform(cohorts2021,
ID = as.numeric(factor(cohorts2021$CustomerID))) # Create new column named ID
cohorts2021 <- transform(cohorts2021,
CustomerID = as.numeric(factor(cohorts2021$CustomerID))) # Replace directly, not create new column
# Data table
library(data.table)
cohorts2021 <- select(cohorts2021, -c(5))
DT::datatable(head(cohorts2021, 100),
filter = 'top',
rownames = FALSE,
options = list(
pageLength = 10,
pageLength = c(10, 20, 30, 40, 50)),
)
cohorts2021$Age_by_Day <- as.numeric(difftime(cohorts2021$InvoiceDate, cohorts2021$Join_Date,
units = c("days")))
# Dividing the days by 30 to get the number of months
cohorts2021$Age_by_Month <- floor(cohorts2021$Age_by_Day/30)
cohorts2021$Join_Date <- format(cohorts2021$Join_Date, "%Y-%m")
cohorts2021$InvoiceDate <- format(cohorts2021$InvoiceDate, "%Y-%m")
# Renaming levels of a factor
str(cohorts2021)
## Classes 'data.table' and 'data.frame': 66789 obs. of 7 variables:
## $ CustomerID : num 1 1 1 1 1 1 1 1 1 1 ...
## $ InvoiceDate : chr "2021-10" "2021-10" "2021-10" "2021-10" ...
## $ Year : num 2021 2021 2021 2021 2021 ...
## $ Join_Date : chr "2021-10" "2021-10" "2021-10" "2021-10" ...
## $ Cohort : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Age_by_Day : num 0 1 2 3 4 5 6 7 8 9 ...
## $ Age_by_Month: num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
cohorts2021$Cohort <- as.factor(cohorts2021$Cohort)
library(plyr)
cohorts2021$Cohort <- revalue(cohorts2021$Cohort, c("9"="Sep Cohorts", "10"="Oct Cohorts", "11" = "Nov Cohorts"))
# Have a look
DT::datatable(head(cohorts2021,100),
filter = 'top',
rownames = FALSE,
options = list(
pageLength = 10,
pageLength = c(10,20,30,40,50)))
# Count unique ID per month
## By excluding both columns Age_by_Day and Age_by_Month, we're able to remove the extra monthly observations to avoid counting unique customer IDs multiple times in any single month.
## The day and month Age variables keep us from removing duplicates which is why we need to exclude them both
dupes <- which(duplicated(cohorts2021[,c(-7,-6)]))
cohorts2021_nodup <- cohorts2021[-dupes,]
cohorts.wide <- reshape2::dcast(cohorts2021_nodup,Cohort~Age_by_Month,
value.var="CustomerID",
fun.aggregate = length)
# Cloning the output for retention and churn mixpanels
# to be used later
cw.retention <- cohorts.wide
cw.churn <- cohorts.wide
# Creating 19 breaks and 20 rgb color values ranging from blue to white
breaks <- quantile(cohorts.wide[,3:9], probs = seq(.05, .95, .05), na.rm = TRUE)
colors <- sapply(round(seq(155, 80, length.out = length(breaks) + 1), 0),
function(x){ rgb(x,x,155, maxColorValue = 155) } )
# The Retention Mixpanel with counts
library(DT)
DT::datatable(cohorts.wide,
class = 'cell-border stripe',
rownames = FALSE,
options = list(
ordering=F,
dom = 't',
pageLength = 12) ) %>%
formatStyle("0",
backgroundColor = 'lightgrey',
fontWeight = 'bold') %>%
formatStyle(names(cohorts.wide[c(-1,-2)]),fontWeight = 'bold',color = 'white', backgroundColor = styleInterval(breaks,colors))