This document contains the solution fot the Data Analysis problem given by uninstall.io. and it organized in to below sections:
# Load required libraries
library(readr)
library(tidyverse)
library(lubridate)
library(stringr)
library(ggthemes)
library(gridExtra)
library(cowplot)
# Import datasets
eventdump <- read_csv("C:/Users/welcome/Desktop/eventdump.csv") # events dataset
uicycles <- read_csv("C:/Users/welcome/Desktop/uicycles.csv") # customer app lifetime dataset
A customer life cycle is either from Install to Uninstall (or) from Re-install to uninstall.
Each lifecycle is treated seprately i.e each cycle is either install-uninstall or re-install-uninstall. Where as Install-uninstall-re-install-uninstall makes two lifecycles.
In this analysis lifecycles are numbered as 1 ,2 ,3… so on and forth, each lifecycle represents either a install-uninstall event or Re-install-Uninstall event.
cycles <- uicycles %>% arrange(uuid, creation_date) %>%
group_by(uuid) %>%
mutate(rank = order(as.POSIXct(creation_date, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) # rank customer app events base on the timestamp
count_rank <- cycles %>%
summarise(c = sum(rank)) #
one <- count_rank %>% filter(c == 1) %>% as.list() # subset customers who have installed, uninstalled and re-installed only once
one <- one[[1]] # select the uuids in the list
cycles_users <- cycles[-which(cycles$uuid %in% one),] # lifecyles i.e install-uninstall events
users <- cycles[which(cycles$uuid %in% one),] # users who have installed, reinstalled and uninstalled [users with zero lifecyles]
cycles1 <- cycles_users %>% group_by(uuid) %>%
arrange(uuid, rank) %>%
filter(rank != last(rank)) %>%
mutate(cycle = paste0("cycle", rank)) %>%
group_by(uuid, cycle) %>%
count() %>%
spread(key = cycle, value = n, fill = 0) %>% # convert long foramt to wide format
as.data.frame()
cycles2 <- cycles1 %>%
select(uuid, cycle1, cycle3,
cycle5, cycle7,
cycle9, cycle11,
cycle13) # re-arrange columns in the desired manner
cycles2 <- cycles2 %>% select(-uuid) %>% mutate(rentention_trend = rowSums(.))
cycles3 <- data.frame(cycles1[,1], cycles2)
colnames(cycles3)[1] <- "uuid"
retention <- table(cycles3$rentention_trend) %>% as.data.frame()
colnames(retention)[1] <- "Cycle_No"
retention
## Cycle_No Freq
## 1 1 39988
## 2 2 640
## 3 3 39
## 4 4 10
## 5 5 2
## 6 6 1
## 7 7 1
The number of customers having a single life cycle i.e either install-uninstall or re-install-uninstall event are 39988. The number of customers having two lifecycle i.e install-uninstall-reinstall-uninstall are 640
ggplot(retention, aes(x = Cycle_No, y = Freq, fill = Cycle_No)) +
geom_bar(stat="identity") +
geom_text(aes(label= Freq), vjust=-0.3, size=3.5, color="black") +
theme_minimal() +
ggtitle("Customers retention trends from their lifetime cycle ") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(caption = "Note: Cycle No. indicates one cycle i.e install-uninstall or reinstall-install")
events <- eventdump %>% arrange(uuid, event_timestamp) %>%
group_by(uuid) %>%
mutate(rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) # rank customer app events base on the timestamp
events$event_timestamp <- ymd_hms(events$event_timestamp) # get the hour, min and sec from timesatmp
breaks <- hour(hm("00:00", "6:00", "12:00", "18:00", "23:59")) # bucket time in to four groups base on time of the day
labels <- c("Night", "Morning", "Afternoon", "Evening")
events$event_timestamp <- cut(x=hour(events$event_timestamp), breaks = breaks, labels = labels, include.lowest=TRUE) # convert timestamps into above time buckets in a day
users_activity <- events %>% group_by(event_timestamp) %>% summarise(count = n()) # get users acitivity based on the time of the day
ggplot(users_activity, aes(x = event_timestamp, y = count, fill = event_timestamp)) +
geom_bar(stat="identity") +
geom_text(aes(label= count), vjust=-0.3, size=3.5, color="black") +
theme_economist() +
scale_color_economist()+
ggtitle("Users App Activity through the day") +
theme(plot.title = element_text(hjust = 0.5))
User activity on the app is highest in the afternoon and least in the evening
Purchase value associated with each transaction: A user can have mutiple transactions and each transaction is treated seperately
Purchase value associated with each customer: Cumilative purchase value(for multiple transactions) for each customer is computed.
purchase <- c("Checkout is completed by null",
"Checkout is completed by Credit Cards / Debit Cards / Net Banking",
"Checkout is completed by Cash On Delivery", "Checkout is completed by PG",
"Checkout is completed by Paid using zCoins") # list of purchase activites by customers
# filter the customers who have shown above purchasing activities
purchase_customers <- events %>% filter(event %in% purchase) %>% select(-rank)
# Function to extract total value from each string from properties column for "Checkout is completed by Credit Cards / Debit Cards / Net Banking", & "Checkout is completed by Cash On Delivery"
strext <- function(variable) {
str_text <- str_extract_all(variable,"[0-9]+")[[1]][19]
return(str_text)
}
strext2 <- function(variable) { # function to extract total value from each string from properties column for ""Checkout is completed by PG""
str_text <- str_extract_all(variable,"[0-9]+")[[1]][12]
return(str_text)
}
strext3 <- function(variable) { # function to extract number of items purchased
str_text <- str_extract_all(variable,"[0-9]+")[[1]][18]
return(str_text)
}
# parse the properties column to get the total value atached to each transaction
purchase_value <- purchase_customers %>% mutate(Purchase_value_per_transaction = lapply(properties, strext) %>% unlist() %>% as.numeric(), items = lapply(properties, strext3) %>% unlist() %>% as.numeric()) %>% filter(!event %in% c('Checkout is completed by null', 'Checkout is completed by PG') & !is.na(Purchase_value_per_transaction))
# remove check out by NULL and PG as they have null values
# "Checkout is completed by PG - cash on dilvery""
e <- purchase_customers %>% filter( event == "Checkout is completed by PG" & grepl("Cash On Delivery", properties))
# "Checkout is completed by PG - "Credit Cards / Debit Cards / Net Banking"
f <-purchase_customers %>% filter( event == "Checkout is completed by PG" & grepl("Credit Cards / Debit Cards / Net Banking", properties))
# total value for "Checkout is completed by PG - cash on dilvery"
e1 <- e %>% mutate(Purchase_value_per_transaction = lapply(properties, strext2) %>% unlist() %>% as.numeric(), items = lapply(properties, strext3) %>% unlist() %>% as.numeric())
# Total value for Checkout is completed by PG - "Credit Cards / Debit Cards / Net Banking
f1 <- f %>% mutate(Purchase_value_per_transaction = lapply(properties, strext2) %>% unlist() %>% as.numeric(),items = lapply(properties, strext3) %>% unlist() %>% as.numeric())
purchase_value1 <- bind_rows(e1, f1) #concatate eq and f1
purchase_value1[is.na(purchase_value1$items), 6] <- 0
purchase_value <- bind_rows(purchase_value, purchase_value1) # concatate purchase_value and purchase_value1
purchase_value_customerwise <- purchase_value %>% group_by(uuid) %>%
summarise(Purchase_value_per_customer= sum(Purchase_value_per_transaction)) # get the total value attached to each customer
p1 <- ggplot(purchase_value, aes(x = Purchase_value_per_transaction)) +
geom_histogram(bins = 10, color="black", fill="orange", position="identity") + # purchase values transactionwise
theme_economist() +
scale_color_economist()+
ggtitle("Purchase values transaction-wise") +
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot(purchase_value_customerwise, aes(x = Purchase_value_per_customer)) +
geom_histogram(bins = 10, color="black", fill="green", position="identity") + # purchase values customerwise
theme_economist() +
scale_color_economist()+
ggtitle("Purchase values customer-wise") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1, p2, ncol=2)
The revenue trend for the users who made purchases after they have installed the app.
purchasepostinstall <- purchase_value %>% group_by(uuid) %>%
left_join(purchase_value_customerwise, by = "uuid") %>% left_join(cycles3, by = "uuid") %>% filter( cycle1 == 1)
purchasepostinstall <- purchasepostinstall %>% group_by(items) %>%
summarise(purchase.value = sum(Purchase_value_per_customer))
ggplot(purchasepostinstall, aes(x = items, y = purchase.value, fill = items)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label= purchase.value), vjust= -0.3, size=3.5, color="black") +
theme_economist() +
scale_color_economist()+
ggtitle("Purchase value for items purchased post installing the app in one lifecycle") +
theme(plot.title = element_text(hjust = 0.5))
The revenue generated from purchases made my users after install the App is highest for single item purchase.
d <- cycles %>% mutate(week_no = strftime(creation_date, format = "%V") %>% unlist() %>% as.numeric()) # get week numbers from timestamp
# Get week difference, i.e if the week diff is zero, that means the customer has unistalled the app inside one week, similarly if the week diff is one, that means the customer has used the app for one week and so on and forth.
dif <- d %>%
group_by(uuid) %>%
mutate(week = week_no - lag(week_no)) %>%
filter(creation_date == max(creation_date)) %>% select(uuid, week) #
dif <- dif %>% filter(!is.na(week)) # remove missing values
purchase_after_oneweek <- left_join(dif, purchase_value, by = "uuid") %>% filter(!is.na(Purchase_value_per_transaction) & week > 0) # get purchases for retention for one week and further
purchase_after_oneweek <- purchase_after_oneweek %>% group_by(week) %>%
summarise(purchase.value = sum(Purchase_value_per_transaction))
purchase_after_oneweek$week <- as.factor(purchase_after_oneweek$week) # convert week to factor
# recode values for week
purchase_after_oneweek <- purchase_after_oneweek %>% mutate(Week = recode(week, "1" = "Week1",
"2" = "Week2",
"3" = "Week3",
"4" = "Week4"))
knitr::kable(purchase_after_oneweek)
| week | purchase.value | Week |
|---|---|---|
| 1 | 93295 | Week1 |
| 2 | 21851 | Week2 |
| 3 | 21552 | Week3 |
| 4 | 5317 | Week4 |
ggplot(purchase_after_oneweek, aes(x = Week, y = purchase.value, fill = Week)) +
geom_bar(stat="identity") +
geom_text(aes(label= purchase.value), vjust=-0.3, size=3.5, color="black") +
theme_economist() +
scale_color_economist()+
ggtitle("Purchase value for customers retained for one week and more") +
theme(plot.title = element_text(hjust = 0.5))
The purchase values drop from week to week.
prop_revenue <- prop.table(purchase_after_oneweek$purchase.value) %>% as.data.frame() # get the proportion of revenue for each each week
prop_revenue$week <- c(1,2,3,4) # week numbers
colnames(prop_revenue)[1] <- "Prop.revenue"
prop_revenue
## Prop.revenue week
## 1 0.65693765 1
## 2 0.15386403 2
## 3 0.15175862 3
## 4 0.03743971 4
ggplot(prop_revenue, aes(x = week, y = Prop.revenue, label = week)) +
geom_point() +
geom_line() +
geom_text(aes(label=week),hjust=0, vjust=0) +
ggtitle("Proportion of revenue from retained customers week wise") +
theme(plot.title = element_text(hjust = 0.5))
The revenue generated drop from week to week. There is certainly a drop in the revenue from week 3 to week 4 and but not as steep as from week1 to week2.
Purchasing customers are those who have completed check out while the Non-purchasing customers are those who haven’t completed check out. For the purpose of the analysis, the customers in the first cycle are selected as there are more number of customer falling in the first cycle than the other cycles.
purchase1 <- c("Checkout is completed by null",
"Checkout is completed by Credit Cards / Debit Cards / Net Banking",
"Checkout is completed by Cash On Delivery", "Checkout is completed by PG", "Checkout is completed by Paid using zCoins")# list of purchasing activites by users
# rank user events by timestamp
cust_events <- eventdump %>% arrange(uuid, event_timestamp) %>%
group_by(uuid) %>%
mutate(rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET')))
# filter customers who completed check out
purchase_cust_events <- cust_events %>% filter(event %in% purchase1) %>% arrange(uuid, event_timestamp) %>% group_by(uuid) %>% mutate(Rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) %>% select(-rank)
# function for adding quotation mark to customer id vector
q <- function(...) {
sapply(match.call()[-1], deparse)
}
cust_purchase <- purchase_cust_events %>% filter(Rank == 1) %>% select(uuid) %>% mutate(cust_id = lapply(., q)) %>% as.list()
# get the list of customer ids
cust_purchase <- cust_purchase[[1]]
# customers who made purchases
purchase_cust_events <- events[which(events$uuid %in% cust_purchase),]
# getting the max activity by each customer who made purchases
purchase_cust_events <- purchase_cust_events %>% group_by(uuid) %>%
summarise(activity = max(rank))
purchase_cust_events <- left_join(purchase_cust_events, cycles3, by = "uuid")
purchase_cust_events <- purchase_cust_events %>% select(uuid, activity, rentention_trend) %>%
filter(rentention_trend == 1) # ACtivites of purchasing customers in a install-unistall lifecycle
ggplot(purchase_cust_events, aes(x = activity)) +
geom_histogram(bins = 10, color="black", fill="green", position="identity") + # purchase values customerwise
theme_economist() +
scale_color_economist()+
ggtitle("Activity of purchasing customers in a install-unistall lifecyle") +
theme(plot.title = element_text(hjust = 0.5))
# filter customers who did not complete check out
nonpurchase_cust_events <- cust_events %>% filter(!event %in% purchase1) %>% arrange(uuid, event_timestamp) %>% group_by(uuid) %>% mutate(Rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) %>% select(-rank)
# customers who did not make purchases
nonpurchase_cust_events <- events[-which(events$uuid %in% cust_purchase),]
nonpurchase_cust_events <- nonpurchase_cust_events %>% group_by(uuid) %>%
summarise(activity = max(rank))
# join cust events with app lifecycle table
nonpurchase_cust_events <- left_join(nonpurchase_cust_events, cycles3, by = "uuid")
# customers activites who have not made a purchase in their first lifecyle
nonpurchase_cust_events <- nonpurchase_cust_events %>% select(uuid, activity, rentention_trend) %>% filter(rentention_trend == 1)
ggplot(nonpurchase_cust_events, aes(x = activity)) +
geom_histogram(bins = 10, color="black", fill="green", position="identity") + # purchase values customerwise
theme_economist() +
scale_color_economist()+
ggtitle("Activity of Non-purchasing customers in a install-unistall lifecyle") +
theme(plot.title = element_text(hjust = 0.0))
Active users are those who either installed the app and have not uninstalled it and also those who reinstalled the app and have not uninstalled it again. Active users are also whose last event in their lifetime cycle is re-install and have not uninstalled the app again
# Active users with zero lifecyles
users <- cycles[cycles$uuid %in% one,] # users who have installed or uninstalled or reinstalled only once
activeusers <- cycles %>% group_by(uuid) %>% filter(rank == max(rank) & event_type %in% c("install", "re-install")) # filter customers who have either installed or re-installed the app and reinstalled in the last event of their lifecyle
activeusers <- table(activeusers$event_type) %>% as.data.frame() # frequency table of active users
activeusers
## Var1 Freq
## 1 install 53493
## 2 re-install 2755
sum(activeusers$Freq) # total number of active users
## [1] 56248
Total number of active users are 56248