#Running all the libraries needed in the analysis
library("readxl")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/
## Resources/library/tcltk/libs//tcltk.so'' had status 1
## Loading required package: RSQLite
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(forcats)
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
#install.packages("anytime")
library(anytime)
library(ggthemes)
#Reading excel
data <- read_excel("Data Science Take-home Dataset (2).xlsx")
head(data)
#Changing purchase_Date to Date format
data$purchase_date <- as.Date(data$purchase_date)
#Created data1 table with calculated field for total_sales= restaurant_total+(-promo_amount)
data1<- data %>% mutate(total_sales= case_when (is.na (promo_amount)~ restaurant_total,
TRUE~ restaurant_total+promo_amount))
head(data1)
str(data1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 16132 obs. of 9 variables:
## $ order_id : num 12765530 12423191 11265914 15625461 12658684 ...
## $ source : chr "web" "google" "web" "web" ...
## $ purchase_date : Date, format: "2018-10-27" "2018-10-10" ...
## $ shipping_type : chr "Delivery" "Delivery" "Delivery" "Delivery" ...
## $ payment_method : chr "credit" "credit" "cash" "cash" ...
## $ promo_amount : num NA NA NA NA NA NA NA NA NA NA ...
## $ restaurant_total: num 29 16.1 14.8 13.8 42.3 ...
## $ user_id : num 10000 10001 10001 10001 10002 ...
## $ total_sales : num 29 16.1 14.8 13.8 42.3 ...
summary(data1)
## order_id source purchase_date
## Min. :-7006713 Length:16132 Min. :2018-06-01
## 1st Qu.:10867818 Class :character 1st Qu.:2018-08-27
## Median :12491759 Mode :character Median :2018-11-04
## Mean : 9879452 Mean :2018-10-28
## 3rd Qu.:13982318 3rd Qu.:2019-01-01
## Max. :15681340 Max. :2019-03-07
##
## shipping_type payment_method promo_amount restaurant_total
## Length:16132 Length:16132 Min. :-8.000 Min. : 0.00
## Class :character Class :character 1st Qu.:-3.000 1st Qu.: 20.89
## Mode :character Mode :character Median :-3.000 Median : 29.00
## Mean :-3.323 Mean : 31.04
## 3rd Qu.:-3.000 3rd Qu.: 34.62
## Max. : 0.000 Max. :1116.57
## NA's :15272
## user_id total_sales
## Min. :10000 Min. : 0.00
## 1st Qu.:12417 1st Qu.: 20.65
## Median :15380 Median : 29.00
## Mean :16274 Mean : 30.86
## 3rd Qu.:19695 3rd Qu.: 34.45
## Max. :26130 Max. :1116.57
##
#Cohort1 by source
c1<- sqldf('select source, sum(total_sales) as total_sales
from data1
group by source')
head(c1)
#Plot of Source Cohort
f1<- c1 %>% ggplot(aes(reorder(source,-total_sales), total_sales, fill = source))+
geom_bar(stat = "identity", width = 0.3)+
geom_text(label = paste("$",format(round(c1$total_sales,2),big.mark = ',')),vjust =-0.2, color = "black")+ theme_solarized()+
labs(title = "Cohort by Source",
caption = "source: Slice",
x = "Source",
y="Total Sales",
fill = "Source"
)
f1

#Created data2 table to get user_id's joindate, last date and number of weeks the user stayed
data2<- data1 %>%
select(order_id:total_sales)%>%
group_by (user_id,source) %>%
summarise(Join_date = min(purchase_date),
Last_date = max(purchase_date))%>%
mutate(diff_in_weeks = difftime(Last_date,Join_date, units = 'weeks' ))
head(data2)
summary(data2)
## user_id source Join_date
## Min. :10000 Length:10882 Min. :2018-06-01
## 1st Qu.:13278 Class :character 1st Qu.:2018-08-11
## Median :16768 Mode :character Median :2018-10-17
## Mean :17280 Mean :2018-10-13
## 3rd Qu.:21110 3rd Qu.:2018-12-16
## Max. :26130 Max. :2019-03-07
## Last_date diff_in_weeks
## Min. :2018-06-01 Length:10882
## 1st Qu.:2018-08-23 Class :difftime
## Median :2018-11-03 Mode :numeric
## Mean :2018-10-27
## 3rd Qu.:2019-01-02
## Max. :2019-03-07
#Joined data1 table with data2 table using user_id and source
#Calculated LTV= AOV*purchase frequency* user lifetime in years
sql1 <- sqldf('select t1.user_id,
t1.source,
t2.Join_date,
t2.last_date,
round((t2.last_date- t2.Join_date)/365,3) as Years,
sum(t1.total_sales) Total_Sales,
count(t1.order_id) Orders,
round(sum(t1.total_sales)/count(t1.order_id),2) AOV,
(sum(t1.total_sales)/count(t1.order_id))*(count(t1.order_id))*((t2.last_date- t2.Join_date)/365) as LTV
from data1 t1
left join data2 t2 on t2.user_id = t1.user_id and t2.source = t1.source
group by t1.user_id,t1.source')
head(sql1,10)
#Calculate average LTV
summary(sql1$LTV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 5.245 0.000 2532.548
#Average LTV=5.245 (average revenue the customer will generate before they churn)
means = aggregate(LTV~source,sql1,mean)
summary(means)
## source LTV
## Length:6 Min. : 0.000
## Class :character 1st Qu.: 1.502
## Mode :character Median : 3.657
## Mean : 6.341
## 3rd Qu.:11.106
## Max. :16.357
ms <- ggplot(means,aes(reorder(source,-LTV),LTV))+
geom_bar(stat = "identity",width = 0.3,fill = "#69b3a2",color = "black")+
geom_smooth(method = "lm")+
geom_text(label = round(means$LTV,2),vjust = -0.2,color = "black")+ theme_economist()+
labs(title = "Average Lifetime Value by Source",
x= "Source",
y= "LTV",
fill = "Source")
ms
## `geom_smooth()` using formula 'y ~ x'

#KeyInsights
#People purchasing through iOs app have highest LTV and thus should be allocated more resources towards both acquisitiona and maintenance.
#Recommendations
#Customers with higher LTV should be allocated more resources depending on what stage the customer lifecycle they are in.
#The other data that can be used is profit margin. Marketing Source of the user acquisition and marketing channel that user came through.