This document contains the solution fot the Data Analysis problem given by uninstall.io. and it organized in to below sections:

  1. Customer retention based on their app install-unistall events
  2. Time of the day customers are most active
  3. Purchase value bucket per transaction and customer
  4. week over week revenue trends for purchasing customers
    1. Purchases post installing app
    2. Purchases in 2nd, 3rd…weeks if retention is greater than one week
    3. Is there a steady inflow of revenue for customers with high retention?
  5. Purchasing customers vS Non-purchasing customers behaviour/Activity
  6. Other Actionable insights
    1. Active Users of the App
# Load required libraries 
library(readr)
library(tidyverse)
library(lubridate)
library(stringr)
library(ggthemes)
library(gridExtra)
library(cowplot)
# Import datasets
eventdump <- read_csv("C:/Users/welcome/Desktop/eventdump.csv") # events dataset

uicycles <- read_csv("C:/Users/welcome/Desktop/uicycles.csv") # customer app lifetime dataset

1.Customer retention trend based on the app install-uninstall events

A customer life cycle is either from Install to Uninstall (or) from Re-install to uninstall.

Each lifecycle is treated seprately i.e each cycle is either install-uninstall or re-install-uninstall. Where as Install-uninstall-re-install-uninstall makes two lifecycles.

In this analysis lifecycles are numbered as 1 ,2 ,3… so on and forth, each lifecycle represents either a install-uninstall event or Re-install-Uninstall event.

cycles <- uicycles %>% arrange(uuid, creation_date) %>% 
                  group_by(uuid) %>%  
                  mutate(rank = order(as.POSIXct(creation_date, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) # rank customer app events base on the timestamp
                 
count_rank <- cycles %>% 
  summarise(c = sum(rank)) # 

one <- count_rank %>% filter(c == 1) %>% as.list() # subset customers who have installed, uninstalled and re-installed only once

one <- one[[1]] # select the uuids in the list 

cycles_users <- cycles[-which(cycles$uuid %in% one),] # lifecyles i.e install-uninstall events

users <- cycles[which(cycles$uuid %in% one),]  # users who have installed, reinstalled and uninstalled [users with zero lifecyles]



cycles1 <- cycles_users %>% group_by(uuid) %>%
           arrange(uuid, rank) %>%   
           filter(rank != last(rank)) %>%   
           mutate(cycle = paste0("cycle", rank)) %>% 
           group_by(uuid, cycle) %>%
           count() %>%
           spread(key = cycle, value = n, fill = 0) %>% # convert long foramt to wide format
           as.data.frame() 


cycles2 <- cycles1 %>% 
               select(uuid, cycle1, cycle3,
                       cycle5,  cycle7,
                       cycle9, cycle11,
                       cycle13)   # re-arrange columns in the desired manner


cycles2 <- cycles2 %>% select(-uuid) %>%  mutate(rentention_trend = rowSums(.)) 

cycles3 <- data.frame(cycles1[,1], cycles2)

colnames(cycles3)[1] <- "uuid"

retention <- table(cycles3$rentention_trend) %>% as.data.frame()

colnames(retention)[1] <- "Cycle_No"


retention
##   Cycle_No  Freq
## 1        1 39988
## 2        2   640
## 3        3    39
## 4        4    10
## 5        5     2
## 6        6     1
## 7        7     1

The number of customers having a single life cycle i.e either install-uninstall or re-install-uninstall event are 39988. The number of customers having two lifecycle i.e install-uninstall-reinstall-uninstall are 640

ggplot(retention, aes(x = Cycle_No, y = Freq, fill = Cycle_No)) + 
  geom_bar(stat="identity") +
  geom_text(aes(label= Freq), vjust=-0.3, size=3.5, color="black") +
  theme_minimal() +
  ggtitle("Customers retention trends from their lifetime cycle ") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(caption = "Note: Cycle No. indicates one cycle i.e install-uninstall or reinstall-install")

2.Time of the day customers are most active

events <- eventdump %>% arrange(uuid, event_timestamp) %>% 
          group_by(uuid) %>% 
          mutate(rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) # rank customer app events base on the timestamp


events$event_timestamp <- ymd_hms(events$event_timestamp) # get the hour, min and sec from timesatmp

breaks <- hour(hm("00:00", "6:00", "12:00", "18:00", "23:59")) # bucket time in to four groups base on time of the day

labels <- c("Night", "Morning", "Afternoon", "Evening")

events$event_timestamp <- cut(x=hour(events$event_timestamp), breaks = breaks, labels = labels, include.lowest=TRUE) # convert timestamps into above time buckets in a day

users_activity <- events %>% group_by(event_timestamp) %>% summarise(count = n()) # get users acitivity based on the time of the day
ggplot(users_activity, aes(x = event_timestamp, y = count, fill = event_timestamp)) +
  geom_bar(stat="identity") +
  geom_text(aes(label= count), vjust=-0.3, size=3.5, color="black") +
  theme_economist() + 
  scale_color_economist()+
  ggtitle("Users App Activity through the day") +
  theme(plot.title = element_text(hjust = 0.5)) 

User activity on the app is highest in the afternoon and least in the evening

3.Purchase value bucket per transaction and customer

Purchase value associated with each transaction: A user can have mutiple transactions and each transaction is treated seperately

Purchase value associated with each customer: Cumilative purchase value(for multiple transactions) for each customer is computed.

purchase <- c("Checkout is completed by null", 
  "Checkout is completed by Credit Cards / Debit Cards / Net Banking",
  "Checkout is completed by Cash On Delivery", "Checkout is completed by PG", 
  "Checkout is completed by Paid using zCoins") # list of purchase activites by customers


# filter the customers who have shown above purchasing activities
purchase_customers <- events %>% filter(event %in% purchase) %>% select(-rank)  


# Function to extract total value from each string from properties column for "Checkout is completed by Credit Cards / Debit Cards / Net Banking", &  "Checkout is completed by Cash On Delivery"
strext <- function(variable) { 
   str_text <- str_extract_all(variable,"[0-9]+")[[1]][19]
  
  return(str_text)
  
}


strext2 <- function(variable) { # function to extract total value from each string from properties column for ""Checkout is completed by PG""
  str_text <- str_extract_all(variable,"[0-9]+")[[1]][12]
  
  return(str_text)
  
}



strext3 <- function(variable) { # function to extract number of items purchased
  str_text <- str_extract_all(variable,"[0-9]+")[[1]][18]
  
  return(str_text)
  
}



# parse the properties column to get the total value atached to each transaction

purchase_value <- purchase_customers %>% mutate(Purchase_value_per_transaction = lapply(properties, strext) %>% unlist() %>% as.numeric(), items =  lapply(properties, strext3) %>% unlist() %>% as.numeric()) %>% filter(!event %in% c('Checkout is completed by null', 'Checkout is completed by PG') & !is.na(Purchase_value_per_transaction))
 # remove check out by NULL and PG as they have null values


# "Checkout is completed by PG - cash on dilvery""
e <- purchase_customers %>% filter( event == "Checkout is completed by PG" & grepl("Cash On Delivery", properties)) 


# "Checkout is completed by PG - "Credit Cards / Debit Cards / Net Banking"
f <-purchase_customers %>% filter( event == "Checkout is completed by PG" & grepl("Credit Cards / Debit Cards / Net Banking", properties)) 

# total value for "Checkout is completed by PG - cash on dilvery"
e1 <- e  %>% mutate(Purchase_value_per_transaction = lapply(properties, strext2) %>% unlist() %>% as.numeric(), items = lapply(properties, strext3) %>% unlist() %>% as.numeric()) 
# Total value for Checkout is completed by PG - "Credit Cards / Debit Cards / Net Banking
f1 <- f %>% mutate(Purchase_value_per_transaction  = lapply(properties, strext2) %>% unlist() %>% as.numeric(),items = lapply(properties, strext3) %>% unlist() %>% as.numeric())

purchase_value1 <- bind_rows(e1, f1) #concatate eq and f1

purchase_value1[is.na(purchase_value1$items), 6] <- 0

purchase_value <- bind_rows(purchase_value, purchase_value1) # concatate purchase_value and purchase_value1 

purchase_value_customerwise <- purchase_value %>% group_by(uuid) %>% 
summarise(Purchase_value_per_customer= sum(Purchase_value_per_transaction)) # get the total value attached to each customer
p1 <- ggplot(purchase_value, aes(x = Purchase_value_per_transaction)) +
  geom_histogram(bins = 10, color="black", fill="orange", position="identity") + # purchase values transactionwise
  theme_economist() + 
  scale_color_economist()+
  ggtitle("Purchase values transaction-wise") +
  theme(plot.title = element_text(hjust = 0.5))


p2 <- ggplot(purchase_value_customerwise, aes(x = Purchase_value_per_customer)) +
  geom_histogram(bins = 10, color="black", fill="green", position="identity") + # purchase values customerwise
  theme_economist() + 
  scale_color_economist()+
  ggtitle("Purchase values customer-wise") +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(p1, p2, ncol=2) 

5.Purchasing customers vs Non-purchasing customers behaviour/Activity in a install-uninstall lifecyle

Purchasing customers are those who have completed check out while the Non-purchasing customers are those who haven’t completed check out. For the purpose of the analysis, the customers in the first cycle are selected as there are more number of customer falling in the first cycle than the other cycles.

purchase1 <- c("Checkout is completed by null", 
  "Checkout is completed by Credit Cards / Debit Cards / Net Banking",
  "Checkout is completed by Cash On Delivery", "Checkout is completed by PG", "Checkout is completed by Paid using zCoins")# list of purchasing activites by users
 

# rank user events by timestamp
cust_events <- eventdump %>% arrange(uuid, event_timestamp) %>% 
  group_by(uuid) %>% 
  mutate(rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) 

a.Purchasing customers Activity

# filter customers who completed check out
purchase_cust_events <- cust_events %>% filter(event %in% purchase1) %>% arrange(uuid, event_timestamp) %>% group_by(uuid) %>%  mutate(Rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) %>% select(-rank) 

# function for adding quotation mark to customer id vector
q <- function(...) {
  sapply(match.call()[-1], deparse)
} 

cust_purchase <- purchase_cust_events %>% filter(Rank == 1) %>% select(uuid) %>% mutate(cust_id = lapply(., q)) %>% as.list()

# get the list of customer ids
cust_purchase <- cust_purchase[[1]] 

# customers who made purchases
purchase_cust_events <- events[which(events$uuid %in% cust_purchase),] 

# getting the max activity by each customer who made purchases  
purchase_cust_events <- purchase_cust_events %>% group_by(uuid) %>% 
                        summarise(activity = max(rank)) 


purchase_cust_events <- left_join(purchase_cust_events, cycles3, by = "uuid")



purchase_cust_events <- purchase_cust_events %>% select(uuid, activity, rentention_trend) %>% 
                   filter(rentention_trend == 1) # ACtivites of purchasing customers in a install-unistall lifecycle
ggplot(purchase_cust_events, aes(x = activity)) +
  geom_histogram(bins = 10, color="black", fill="green", position="identity") + # purchase values customerwise
  theme_economist() + 
  scale_color_economist()+
  ggtitle("Activity of purchasing customers in a install-unistall lifecyle") +
  theme(plot.title = element_text(hjust = 0.5))

b.Non-purchasing customers Activity

# filter customers who  did not complete check out
nonpurchase_cust_events <- cust_events %>% filter(!event %in% purchase1) %>% arrange(uuid, event_timestamp) %>% group_by(uuid) %>%  mutate(Rank = order(as.POSIXct(event_timestamp, format = "%Y-%m-%d %H:%M:%S", tz = 'CET'))) %>% select(-rank) 

# customers who did not make purchases
nonpurchase_cust_events <- events[-which(events$uuid %in% cust_purchase),] 


nonpurchase_cust_events <- nonpurchase_cust_events %>% group_by(uuid) %>% 
  summarise(activity = max(rank))

# join cust events with app lifecycle table
nonpurchase_cust_events <- left_join(nonpurchase_cust_events, cycles3, by = "uuid") 

# customers activites who have not made a purchase in their first lifecyle
nonpurchase_cust_events <- nonpurchase_cust_events %>% select(uuid, activity, rentention_trend) %>% filter(rentention_trend == 1) 
ggplot(nonpurchase_cust_events, aes(x = activity)) +
  geom_histogram(bins = 10, color="black", fill="green", position="identity") + # purchase values customerwise
  theme_economist() + 
  scale_color_economist()+
  ggtitle("Activity of Non-purchasing customers in a install-unistall lifecyle") +
  theme(plot.title = element_text(hjust = 0.0))

6.Other actionable insights

Active Users

Active users are those who either installed the app and have not uninstalled it and also those who reinstalled the app and have not uninstalled it again. Active users are also whose last event in their lifetime cycle is re-install and have not uninstalled the app again

# Active users with zero lifecyles

users <- cycles[cycles$uuid %in% one,] #  users who have installed or uninstalled or reinstalled only once

activeusers <- cycles %>% group_by(uuid) %>% filter(rank == max(rank) & event_type %in% c("install", "re-install")) # filter customers who have either installed or re-installed the app and reinstalled in the last event of their lifecyle

activeusers <- table(activeusers$event_type) %>% as.data.frame() # frequency table of active users

activeusers
##         Var1  Freq
## 1    install 53493
## 2 re-install  2755
sum(activeusers$Freq)  # total number of active users
## [1] 56248

Total number of active users are 56248