#Loading all the libraries needed for the analysis
library("readxl")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyr)
library(forcats)
library("gridExtra")
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
#install.packages("anytime")
library(anytime)
library(ggthemes)
#Reading the Excel file
data <- read_excel("Data Science Take-home Dataset (2).xlsx")
head(data)
#Converting purchase_date to Date format
data$purchase_date <- as.Date(data$purchase_date)
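#Note (sketch): the anytime package loaded above is not otherwise used; if purchase_date
#ever arrives as text or in mixed formats, anytime::anydate() is a more forgiving parser.
#Kept commented out because as.Date() already handles this column.
#data$purchase_date <- anytime::anydate(data$purchase_date)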

#Created data1 table with calculated field total_sales = restaurant_total + promo_amount
#(promo_amount is stored as a negative discount; NA means no promo was applied)
data1 <- data %>%
  mutate(total_sales = case_when(is.na(promo_amount) ~ restaurant_total,
                                 TRUE ~ restaurant_total + promo_amount))
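#Equivalent cross-check (sketch): coalesce() treats NA promos as zero, which should match
#the case_when() above since promo_amount is stored as a negative discount.
check <- data %>% mutate(total_sales = restaurant_total + coalesce(promo_amount, 0))
all.equal(check$total_sales, data1$total_sales)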
head(data1)
str(data1)
## Classes 'tbl_df', 'tbl' and 'data.frame':    16132 obs. of  9 variables:
##  $ order_id        : num  12765530 12423191 11265914 15625461 12658684 ...
##  $ source          : chr  "web" "google" "web" "web" ...
##  $ purchase_date   : Date, format: "2018-10-27" "2018-10-10" ...
##  $ shipping_type   : chr  "Delivery" "Delivery" "Delivery" "Delivery" ...
##  $ payment_method  : chr  "credit" "credit" "cash" "cash" ...
##  $ promo_amount    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ restaurant_total: num  29 16.1 14.8 13.8 42.3 ...
##  $ user_id         : num  10000 10001 10001 10001 10002 ...
##  $ total_sales     : num  29 16.1 14.8 13.8 42.3 ...
summary(data1)
##     order_id           source          purchase_date       
##  Min.   :-7006713   Length:16132       Min.   :2018-06-01  
##  1st Qu.:10867818   Class :character   1st Qu.:2018-08-27  
##  Median :12491759   Mode  :character   Median :2018-11-04  
##  Mean   : 9879452                      Mean   :2018-10-28  
##  3rd Qu.:13982318                      3rd Qu.:2019-01-01  
##  Max.   :15681340                      Max.   :2019-03-07  
##                                                            
##  shipping_type      payment_method      promo_amount    restaurant_total 
##  Length:16132       Length:16132       Min.   :-8.000   Min.   :   0.00  
##  Class :character   Class :character   1st Qu.:-3.000   1st Qu.:  20.89  
##  Mode  :character   Mode  :character   Median :-3.000   Median :  29.00  
##                                        Mean   :-3.323   Mean   :  31.04  
##                                        3rd Qu.:-3.000   3rd Qu.:  34.62  
##                                        Max.   : 0.000   Max.   :1116.57  
##                                        NA's   :15272                     
##     user_id       total_sales     
##  Min.   :10000   Min.   :   0.00  
##  1st Qu.:12417   1st Qu.:  20.65  
##  Median :15380   Median :  29.00  
##  Mean   :16274   Mean   :  30.86  
##  3rd Qu.:19695   3rd Qu.:  34.45  
##  Max.   :26130   Max.   :1116.57  
## 
#Cohort 1: total sales by source

c1<- sqldf('select source, sum(total_sales) as total_sales
            from data1
            group by source')
head(c1)
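#The same aggregation as a dplyr sketch; row order may differ from the SQL result.
c1_dplyr <- data1 %>%
  group_by(source) %>%
  summarise(total_sales = sum(total_sales)) %>%
  arrange(desc(total_sales))
head(c1_dplyr)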
#Plot of Source Cohort
f1 <- c1 %>% ggplot(aes(reorder(source, -total_sales), total_sales, fill = source)) +
  geom_col(width = 0.3) +
  geom_text(aes(label = paste("$", format(round(total_sales, 2), big.mark = ','))),
            vjust = -0.2, color = "black") +
  theme_solarized() +
  labs(title = "Cohort by Source",
       caption = "source: Slice",
       x = "Source",
       y = "Total Sales",
       fill = "Source")
f1

#Created data2 table with each user's join date, last purchase date, and the number of weeks between them
data2 <- data1 %>%
  select(order_id:total_sales) %>%
  group_by(user_id, source) %>%
  summarise(Join_date = min(purchase_date),
            Last_date = max(purchase_date)) %>%
  mutate(diff_in_weeks = difftime(Last_date, Join_date, units = 'weeks'))
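#diff_in_weeks is a difftime object (see summary() below); if plain numeric weeks are
#needed for later arithmetic, a sketch (kept commented out so the summary below is unchanged):
#data2 <- data2 %>% mutate(weeks_active = as.numeric(diff_in_weeks))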

head(data2)
summary(data2)
##     user_id         source            Join_date         
##  Min.   :10000   Length:10882       Min.   :2018-06-01  
##  1st Qu.:13278   Class :character   1st Qu.:2018-08-11  
##  Median :16768   Mode  :character   Median :2018-10-17  
##  Mean   :17280                      Mean   :2018-10-13  
##  3rd Qu.:21110                      3rd Qu.:2018-12-16  
##  Max.   :26130                      Max.   :2019-03-07  
##    Last_date          diff_in_weeks    
##  Min.   :2018-06-01   Length:10882     
##  1st Qu.:2018-08-23   Class :difftime  
##  Median :2018-11-03   Mode  :numeric   
##  Mean   :2018-10-27                    
##  3rd Qu.:2019-01-02                    
##  Max.   :2019-03-07
#Joined data1 with data2 on user_id and source
#Calculated LTV = AOV * purchase frequency (order count) * user lifetime in years

sql1 <- sqldf('select t1.user_id,
               t1.source,
               t2.Join_date,
               t2.last_date,
               round((t2.last_date- t2.Join_date)/365,3) as Years,
               sum(t1.total_sales) Total_Sales,
               count(t1.order_id) Orders,
               round(sum(t1.total_sales)/count(t1.order_id),2) AOV,
               (sum(t1.total_sales)/count(t1.order_id))*(count(t1.order_id))*((t2.last_date- t2.Join_date)/365) as LTV
               from data1 t1
               left join data2 t2 on t2.user_id = t1.user_id and t2.source = t1.source
               group by t1.user_id,t1.source')

head(sql1,10)
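#Sanity check (sketch): since AOV * Orders cancels back to Total_Sales, the LTV above
#reduces to Total_Sales * years active. Small deviations come only from Years being
#rounded to 3 decimals in the query.
sql1 %>%
  mutate(LTV_check = Total_Sales * Years) %>%
  summarise(max_abs_diff = max(abs(LTV - LTV_check)))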
#Calculate average LTV
summary(sql1$LTV)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.000    0.000    0.000    5.245    0.000 2532.548
#Average LTV = 5.245 (the average revenue a customer generates before churning)
means <- aggregate(LTV ~ source, sql1, mean)
summary(means)
##     source               LTV        
##  Length:6           Min.   : 0.000  
##  Class :character   1st Qu.: 1.502  
##  Mode  :character   Median : 3.657  
##                     Mean   : 6.341  
##                     3rd Qu.:11.106  
##                     Max.   :16.357
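#Cross-check of aggregate() with a dplyr sketch; should reproduce the per-source means.
sql1 %>%
  group_by(source) %>%
  summarise(LTV = mean(LTV)) %>%
  arrange(desc(LTV))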
ms <- ggplot(means, aes(reorder(source, -LTV), LTV)) +
  geom_col(width = 0.3, fill = "#69b3a2", color = "black") +
  geom_text(aes(label = round(LTV, 2)), vjust = -0.2, color = "black") +
  theme_economist() +
  labs(title = "Average Lifetime Value by Source",
       x = "Source",
       y = "LTV")
ms

#Key Insights
#Customers purchasing through the iOS app have the highest LTV and should be allocated more resources toward both acquisition and retention.

#Recommendations
#Customers with higher LTV should be allocated more resources, depending on what stage of the customer lifecycle they are in.

#Other data that could strengthen the analysis: profit margin, the marketing source of user acquisition, and the marketing channel each user came through.