Loading libraries

library(dplyr)
library(lubridate)
library(ggplot2)

Importing the data

setwd('C:/Users/anany/Desktop/BANA/Applications/Mock Project XYZ.com')
people <- read.csv('people_person.csv')
people_pets <- read.csv('pets_pet.csv')
services <- read.csv('services_service.csv')
conversation <- read.csv('conversations_conversation.csv')
conversation_review <- read.csv('conversations_review.csv')
people_test_seg <-  read.csv('people_testsegmentation.csv')
# str(people_test_seg)
# str(people)
# str(people_pets)
# str(services)
# str(conversation)
# str(conversation_review)
people$date_joined <- as.Date(people$date_joined)
conversation$end_date <-  as.Date(conversation$end_date)
conversation$start_date <- as.Date(conversation$start_date)
conversation$booked_at<-  as.Date(conversation$booked_at)
conversation$cancelled_at<-  as.Date(substr(conversation$cancelled_at,1,10), format = "%Y-%m-%d")
conversation$added <- as.Date(conversation$added)
people_test_seg$added_date <- as.Date(people_test_seg$added)

Exploring the data

1. How many users have signed up? 64416

2. How many users signed up prior to 2017-01-12? 35500

3. What percentage of users have added pets? 80.44%

4. Of those users, how many pets have they added on average? 1.495

5. What percentage of pets play well with cats? 24.78%

#1
total_users <- people %>%
  summarise(count_users = n_distinct(email))
total_users
##   count_users
## 1       64416
#2
people %>%
  filter(date_joined < '2017-01-12') %>%
  summarise(count = n_distinct(email))
##   count
## 1 35500
#3
people %>% 
  inner_join(people_pets, by = c("id"="owner_id")) %>%
  summarise(perc_w_pets = n_distinct(email))*100/total_users
##   perc_w_pets
## 1    80.43964
#4
people %>%
  inner_join(people_pets, by = c("id"="owner_id")) %>%
  group_by(id) %>%
  count() %>%
  ungroup() %>%
  summarise( mean = mean(n))
## # A tibble: 1 x 1
##       mean
##      <dbl>
## 1 1.495909
#5
people_pets %>% summarise(perc_play_cats = sum(plays_cats)/n())
##   perc_play_cats
## 1      0.2477552

Conversation and Booking

1. What are the possible services and what is the average price per unit for each service type?

#1
services %>% group_by(service_type)%>% summarise(avg_price = mean(price, na.rm = TRUE))
## # A tibble: 3 x 2
##   service_type avg_price
##         <fctr>     <dbl>
## 1     boarding  34.50608
## 2     day-care  27.01369
## 3  dog-walking  21.98219

2. How many requests have there been for each service type and what percentage of those have booked? The percentage of those that have booked is called booking rate.

#2
conversation %>%
  inner_join(services,by = c("service_id"="id")) %>%
  group_by(service_type) %>%
  summarise(count_req = n(), count_booked = sum(!is.na(booked_at)) , perc_booked = count_booked*100/count_req)
## # A tibble: 3 x 4
##   service_type count_req count_booked perc_booked
##         <fctr>     <int>        <int>       <dbl>
## 1     boarding     26665        10862    40.73505
## 2     day-care     26362         9417    35.72187
## 3  dog-walking     26716        12811    47.95254

3. What are the cancellation rates for each service?

#3
conversation %>%
  inner_join(services,by = c("service_id"="id")) %>%
  group_by(service_type) %>%
  summarise(count_req = n(), count_cancelled = sum(!is.na(cancelled_at)) , perc_cancelled = count_cancelled*100/count_req)
## # A tibble: 3 x 4
##   service_type count_req count_cancelled perc_cancelled
##         <fctr>     <int>           <int>          <dbl>
## 1     boarding     26665            1051       3.941496
## 2     day-care     26362             919       3.486078
## 3  dog-walking     26716            1218       4.559066

4. For uncancelled bookings, is the owner or the provider more likely to leave a review, and which tends to leave better reviews? How would you narrate this finding to a business partner?

#4
head(conversation,5)
##       id start_date   end_date units      added booking_total
## 1 208294 2017-06-06 2017-06-09     3 2017-05-25           102
## 2 208295 2017-07-16 2017-07-25     9 2017-07-07           324
## 3 208296 2017-06-06 2017-06-12     6 2017-05-26           102
## 4 208297 2017-06-20 2017-06-25     5 2017-06-10           420
## 5 208298 2017-05-25 2017-05-28     3 2017-05-15            66
##   cancellation_fault requester_id service_id  booked_at cancelled_at
## 1                           64416      81642 2017-05-26         <NA>
## 2                           64415      79559       <NA>         <NA>
## 3                           64414      91348 2017-05-29         <NA>
## 4           provider        64413      87158 2017-06-15   2017-06-18
## 5                           64411      82832 2017-05-16         <NA>
conversation %>%
  filter(is.na(cancelled_at)) %>%
  left_join(conversation_review, by = c("id" = "conversation_id"))%>%
  summarise( count_total = n(), count_review = sum(!is.na(reviewer_id)), perc_review = count_review*100/count_total, stars = mean(stars, na.rm = TRUE))
##   count_total count_review perc_review    stars
## 1       81087        28536    35.19183 4.295697

Average price per unit is highest for boarding (34.51), followed by day-care (27.01) and dog-walking (21.98).

The number of requests per service type is more or less the same, with dog-walking slightly higher than the rest. Dog-walking also has the highest booking rate, with 47.95% of its requests booked, followed by boarding at 40.74% and day-care at 35.72%.

Dog-walking has the highest cancellation rate as well, at 4.56%, followed by boarding at 3.94% and day-care at 3.49%.

For uncancelled bookings, the owner leaves a review only about 35% of the time, so a review is far from guaranteed. The people who do leave reviews give an average of 4.30 stars out of 5, which is quite good.
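
The question also asks whether the owner or the provider is more likely to leave a review and who leaves better reviews. Below is a minimal sketch of that split; it assumes that reviewer_id in conversations_review holds the same person ids as requester_id (the owner) and provider_id (the provider), which the data dictionary would need to confirm.

conversation %>%
  filter(is.na(cancelled_at)) %>%
  inner_join(services, by = c("service_id" = "id")) %>%
  inner_join(conversation_review, by = c("id" = "conversation_id")) %>%
  mutate(reviewer_type = ifelse(reviewer_id == requester_id, "owner",
                                ifelse(reviewer_id == provider_id, "provider", NA))) %>%
  filter(!is.na(reviewer_type)) %>%
  group_by(reviewer_type) %>%
  summarise(count_reviews = n(), avg_stars = mean(stars, na.rm = TRUE))

Comparing each count with the 81,087 uncancelled conversations above would give the review rate for each side.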

Recent Daily Booking Rate

1. First, let’s reproduce their results. They tell you that the daily booking rate is defined as the percentage of conversations created each day that eventually book. What is the daily booking rate for each of the 90 days prior to the snapshot? Is there a downward trend?

conversation %>%
  filter(added > max(added) - 90) %>%
  group_by(added) %>%
  summarise(count = n(),
            count_booked = sum(!is.na(booked_at)),
            perc_booking = count_booked / count) %>%
  arrange(added) %>%
  ggplot(aes(x = added, y = perc_booking)) + geom_line()

2. Can you narrate a reason why this trend exists? Is there a reason to be concerned?

conversation %>%
  filter(added > max(added) - 90) %>%
  group_by(added) %>%
  summarise(count = n(),
            count_booked = sum(!is.na(booked_at)),
            count_cancelled = sum(!is.na(cancelled_at)),
            perc_booking = count_booked / count,
            time_to_book = mean(booked_at - added, na.rm = TRUE)) %>%
  arrange(added)
## # A tibble: 90 x 6
##         added count count_booked count_cancelled perc_booking
##        <date> <int>        <int>           <int>        <dbl>
##  1 2017-04-12   213          108              12    0.5070423
##  2 2017-04-13   226          103              10    0.4557522
##  3 2017-04-14   213           93               7    0.4366197
##  4 2017-04-15   213           87              10    0.4084507
##  5 2017-04-16   206           94               9    0.4563107
##  6 2017-04-17   207           84              11    0.4057971
##  7 2017-04-18   239          100              10    0.4184100
##  8 2017-04-19   220          104              12    0.4727273
##  9 2017-04-20   212           95              14    0.4481132
## 10 2017-04-21   229          118              12    0.5152838
## # ... with 80 more rows, and 1 more variables: time_to_book <time>

Why this trend exists: the 90-day data shows that a customer takes about 3 days on average to book a request. The last few days therefore look low simply because those conversations have not yet passed through this waiting period, so the downward trend the analyst is seeing is not an actual drop in the booking rate. At this point there is no reason to be concerned. We could wait another 2-3 days and check whether the booking rate for those recent days returns to normal.
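
One way to check this explanation (a sketch; treating the last 3 days as "immature" is an assumption based on the ~3-day average time to book) is to drop the most recent days that are still inside the typical booking window and see whether the downward tail disappears.

conversation %>%
  filter(added > max(added) - 90,
         added <= max(added) - 3) %>%   # exclude conversations still inside the booking window
  group_by(added) %>%
  summarise(count = n(),
            count_booked = sum(!is.na(booked_at)),
            perc_booking = count_booked / count) %>%
  ggplot(aes(x = added, y = perc_booking)) + geom_line()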

Analyzing take rate

1. In each month, what were the gross billings and net revenue?

2. Define take rate to be the percentage of gross billings that is net revenue. In the previous example, the take rate is slightly more than 19% since $20/$105 is approximately 0.1905. In each month, what was the aggregate take rate?
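
Before computing the monthly figures, here is a minimal sketch of these definitions on a single illustrative booking. The $100 total and the 5%/15% fee split are assumptions chosen only to reproduce the $20/$105 example, not values taken from the data; the $50 cap on the owner fee mirrors the pipeline below.

ex_booking_total <- 100
ex_owner_fee     <- min(ex_booking_total * 0.05, 50)  # owner (booking) fee, capped at $50
ex_service_fee   <- ex_booking_total * 0.15           # provider (service) fee
ex_gross_billing <- ex_booking_total + ex_owner_fee   # 105: what the owner is billed
ex_net_revenue   <- ex_owner_fee + ex_service_fee     # 20: what the company keeps
ex_net_revenue / ex_gross_billing                     # take rate, roughly 0.1905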

take_rate1 <- people %>%
  inner_join(services, by = c("id" = "provider_id")) %>%
  mutate(service_fee = fee.y,       # fee rate from the services table
         booking_fee = fee.x) %>%   # fee rate from the people table
  inner_join(conversation, by = c("id.y" = "service_id")) %>%
  filter(!is.na(booked_at), is.na(cancelled_at)) %>%
  mutate(booking_month = month(booked_at),
         booking_year  = year(booked_at),
         owner_fee     = ifelse(booking_total * booking_fee > 50, 50, booking_total * booking_fee),
         service_fee1  = booking_total * service_fee,
         gross_billing = booking_total + owner_fee,
         net_revenue   = service_fee1 + owner_fee,
         take_rate     = net_revenue * 100 / gross_billing) %>%
  group_by(booking_year, booking_month) %>%
  summarise(gross_billin_m = sum(gross_billing, na.rm = TRUE),
            net_revenue_m  = sum(net_revenue, na.rm = TRUE),
            take_rate_m    = mean(take_rate, na.rm = TRUE),
            service_fee1   = mean(service_fee),
            owner_fee1     = mean(booking_fee))

take_rate1
## # A tibble: 25 x 7
## # Groups:   booking_year [?]
##    booking_year booking_month gross_billin_m net_revenue_m take_rate_m
##           <dbl>         <dbl>          <dbl>         <dbl>       <dbl>
##  1         2015             7        1035.00        155.25    15.00000
##  2         2015             8       11677.46       1799.61    15.32680
##  3         2015             9       19105.54       3006.54    15.79511
##  4         2015            10       36054.74       5825.34    16.13126
##  5         2015            11       37320.56       6083.91    16.28261
##  6         2015            12       57197.38       9470.73    16.58921
##  7         2016             1       71517.89      12603.54    17.47574
##  8         2016             2       80035.20      14154.25    17.70134
##  9         2016             3      128357.28      23159.58    18.04801
## 10         2016             4      139910.58      25560.08    18.18658
## # ... with 15 more rows, and 2 more variables: service_fee1 <dbl>,
## #   owner_fee1 <dbl>

3. Did take rate trend up or trend down or remain unchanged over time?

4. If it did change, investigate why and provide an explanation. Be sure to provide additional data/charts/evidence that justify your explanation. Any claims should be backed by data.

take_rate_plot_df <- take_rate1 %>%
  mutate(booking_month1 = ifelse(booking_month < 10, paste0('0', booking_month), booking_month),
         booking_month_year = paste0(booking_year, "-", booking_month1))

take_rate_plot_df %>%
  ggplot(aes(x = booking_month_year, y = take_rate_m)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Change of Take Rate over time")

take_rate_plot_df %>%
  ggplot(aes(x = booking_month_year, y = owner_fee1)) +
  geom_bar(stat = "identity", fill = "darkcyan") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Change of % Owner fee over time")

take_rate_plot_df %>%
  ggplot(aes(x = booking_month_year, y = service_fee1)) +
  geom_bar(stat = "identity", fill = "brown") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Change of % Service fee over time")

We can clearly see an increasing trend in the take rate over time in the graph above.

But Why?

This is most likely due to the increase in the % service fee and the % booking fee over time, which the two charts above show.
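
A quick way to back this claim with data (a sketch using the take_rate1 table computed above): the correlation between the monthly take rate and the average fee percentages should be strongly positive if the fee increases are driving the trend.

take_rate1 %>%
  ungroup() %>%
  summarise(cor_service_fee = cor(take_rate_m, service_fee1),
            cor_owner_fee   = cor(take_rate_m, owner_fee1))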

5. New Conversation Flow

1. Did conversations with the new conversation page book at a higher rate?

head(people_test_seg,10)
##       id  test_name test_group                      added person_id
## 1  61425 Email Test    holdout 2015-07-12 15:13:16.673652         1
## 2  61426 Email Test    holdout 2015-07-12 13:44:21.949482         2
## 3  61427 Email Test    variant 2015-07-12 11:36:06.992714         3
## 4  61428 Email Test    holdout 2015-07-12 12:23:43.627071         4
## 5  61429 Email Test    holdout 2015-07-12 07:40:36.724086         5
## 6  61430 Email Test    holdout 2015-07-12 11:18:40.410251         6
## 7  61431 Email Test    holdout 2015-07-12 19:48:21.073046         7
## 8  61432 Email Test    holdout 2015-07-12 12:01:51.756294         8
## 9  61433 Email Test    holdout 2015-07-12 16:30:27.866647         9
## 10 61434 Email Test    variant 2015-07-12 15:32:31.386474        10
##    added_date
## 1  2015-07-12
## 2  2015-07-12
## 3  2015-07-12
## 4  2015-07-12
## 5  2015-07-12
## 6  2015-07-12
## 7  2015-07-12
## 8  2015-07-12
## 9  2015-07-12
## 10 2015-07-12
people_test_seg %>%
  filter(test_name == "New Conversation Flow") %>%
  left_join(conversation, by = c("person_id"="requester_id")) %>%
  filter(is.na(booked_at)|booked_at >= "2017-03-13") %>%
  group_by(person_id,test_group) %>%
  summarise(booked = max(as.numeric(!is.na(booked_at))))  %>%
  group_by(test_group) %>%
  summarise(count_total = n(),count_booked = sum(booked), perc_booked = count_booked*100/count_total)
## # A tibble: 2 x 4
##   test_group count_total count_booked perc_booked
##       <fctr>       <int>        <dbl>       <dbl>
## 1    holdout       11751         5273    44.87278
## 2    variant       11749         6099    51.91080

The booking conversion for the variant group is about 51.9%, roughly 7 percentage points higher than the 44.9% for the holdout group, who weren’t exposed to the new conversation flow.

2. Is it statistically significant?

holdout_seg <-people_test_seg %>%
              filter(test_name == "New Conversation Flow") %>%
              left_join(conversation, by = c("person_id"="requester_id")) %>%
              filter(test_group == "holdout", is.na(booked_at)|booked_at >= "2017-03-13") %>%
              group_by(person_id,test_group) %>%
              summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
              select(booked)

variant_seg <-people_test_seg %>%
              filter(test_name == "New Conversation Flow") %>%
              left_join(conversation, by = c("person_id"="requester_id")) %>%
              filter(test_group == "variant", is.na(booked_at)|booked_at >= "2017-03-13" ) %>%
              group_by(person_id,test_group) %>%
              summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
              select(booked)

t.test(holdout_seg$booked,variant_seg$booked)
## 
##  Welch Two Sample t-test
## 
## data:  holdout_seg$booked and variant_seg$booked
## t = -10.821, df = 23497, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08312850 -0.05763198
## sample estimates:
## mean of x mean of y 
## 0.4487278 0.5191080

I ran a two-sample t-test to check whether the ~7 percentage point difference in booking rate between the variant and holdout groups is statistically significant rather than due to chance. The p-value is far below 0.05, so we can conclude that the variant performed better than the holdout.
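
Since the outcome here is a binary booked/not-booked flag, a two-sample proportion test is arguably the more natural choice; a quick sketch using the counts from the table above reaches the same conclusion.

# holdout: 5273 of 11751 booked; variant: 6099 of 11749 booked
prop.test(x = c(5273, 6099), n = c(11751, 11749))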

3. Do you have any reservations about the experiment design? What would you recommend as next steps?

holdout_seg_pre <-people_test_seg %>%
              filter(test_name == "New Conversation Flow") %>%
              left_join(conversation, by = c("person_id"="requester_id")) %>%
              filter(test_group == "holdout", is.na(booked_at)|booked_at < "2017-03-13") %>%
              group_by(person_id,test_group) %>%
              summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
              select(booked)

variant_seg_pre <-people_test_seg %>%
              filter(test_name == "New Conversation Flow") %>%
              left_join(conversation, by = c("person_id"="requester_id")) %>%
              filter(test_group == "variant", is.na(booked_at)|booked_at < "2017-03-13") %>%
              group_by(person_id,test_group) %>%
              summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
              select(booked)

t.test(holdout_seg_pre$booked,variant_seg_pre$booked)
## 
##  Welch Two Sample t-test
## 
## data:  holdout_seg_pre$booked and variant_seg_pre$booked
## t = -1.7342, df = 17675, p-value = 0.08289
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02496372  0.00152613
## sample estimates:
## mean of x mean of y 
## 0.2765841 0.2883029

If we check the booking rates for the holdout and variant segments prior to the A/B test, the variant group already shows a slightly higher booking rate (28.8% vs 27.7%). The difference is not statistically significant (p = 0.08), but it is not negligible either. Ideally, we should match the holdout and variant segments on their pre-test performance so that the two groups behave similarly before the test; any change observed after the test could then be attributed solely to the new conversation flow rather than to a pre-existing bias.
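
One way to account for this pre-existing difference (a sketch, mirroring the person-level aggregation used above) is a logistic regression of post-launch booking on test group with the pre-launch booking flag as a covariate; the test_group coefficient should stay significant if the new flow has a genuine effect.

nc_flow <- people_test_seg %>%
  filter(test_name == "New Conversation Flow") %>%
  left_join(conversation, by = c("person_id" = "requester_id")) %>%
  group_by(person_id, test_group) %>%
  summarise(booked_pre  = max(as.numeric(!is.na(booked_at) & booked_at <  "2017-03-13")),
            booked_post = max(as.numeric(!is.na(booked_at) & booked_at >= "2017-03-13"))) %>%
  ungroup()

# variant effect on post-launch booking, controlling for pre-launch booking
summary(glm(booked_post ~ test_group + booked_pre, data = nc_flow, family = binomial))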

6. Search Engine Marketing

1. For each day, determine the count of users that joined and were acquired through Google. Plot this and confirm there is an inflection point on or near 2017-04-12.

cust_google_data <- people %>% 
                    filter(channel == "Google") %>%
                    group_by(date_joined) %>%
                    summarise(count = n_distinct(id)) 

cust_google_data %>%
  ggplot(aes(x = date_joined, y = count)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(as.Date("2017-04-12")),
             linetype = "dotted", color = "blue", size = 1.5)

We can see an inflection point at 2017-04-12 (marked by the dotted blue line), after which the count of customers joining through Google increases.
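
To put a number on the inflection rather than relying on the plot alone, a quick sketch comparing average daily sign-ups before and after 2017-04-12:

cust_google_data %>%
  mutate(period = ifelse(date_joined >= "2017-04-12", "on/after 2017-04-12", "before 2017-04-12")) %>%
  group_by(period) %>%
  summarise(days = n(), avg_daily_signups = mean(count))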

2. How many users were acquired via Google advertising since 2017-04-12 and what was the average cost per account?

people %>% 
  filter(channel == "Google", date_joined >="2017-04-12") %>%
  summarise(count = n_distinct(id), avg_cost = 210285/count)
##   count avg_cost
## 1  4673       45
# $45 per account

4,673 customers have been acquired via Google advertising since 2017-04-12. The average cost per account is $45 ($210,285 of spend divided by 4,673 accounts).

3. Estimate how many users would have been acquired had the company not changed its bidding strategy. What would have been the marketing spend in that case?

a <- cust_google_data %>%
  mutate(year1 = year(date_joined), month1 = month(date_joined)) %>%
  group_by(year1, month1) %>%
  summarise(sum_c = sum(count))

# no seasonality is apparent in the monthly sign-up counts
ggplot(a, aes(x = month1, y = sum_c)) + geom_line() + facet_wrap(~year1, nrow = 3)

First, I checked whether there is any seasonality in the count of customers joining. The graph above shows no obvious seasonality, so I decided to extrapolate the acquired-customer count using a linear regression fitted on the pre-change data.
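
For a slightly more formal check than eyeballing the plot (a sketch), a month-of-year factor can be added on top of the overall time trend and tested with a nested-model F-test; a large p-value is consistent with the no-seasonality conclusion.

seasonality_check <- cust_google_data %>%
  mutate(month1 = factor(month(date_joined)))

# does adding a month-of-year effect improve on the plain time trend?
anova(lm(count ~ date_joined, data = seasonality_check),
      lm(count ~ date_joined + month1, data = seasonality_check))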

x<- people %>% 
  filter(channel == "Google",date_joined < "2017-04-12") %>%
  group_by(date_joined) %>%
  summarise(count = n_distinct(id))

x1 <- people %>% 
  filter(channel == "Google",date_joined >= "2017-04-12") %>%
  group_by(date_joined) %>%
  summarise(count = n_distinct(id))


#predicting
d.fit <- lm(count~date_joined, x)
summary(d.fit)
## 
## Call:
## lm(formula = count ~ date_joined, data = x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.2637  -2.2172  -0.0932   2.3408  12.6450 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.584e+02  1.469e+01  -51.61   <2e-16 ***
## date_joined  4.565e-02  8.664e-04   52.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.835 on 613 degrees of freedom
## Multiple R-squared:  0.8191, Adjusted R-squared:  0.8188 
## F-statistic:  2776 on 1 and 613 DF,  p-value: < 2.2e-16
pred <- predict(d.fit, newdata = x1, interval = "prediction")
x1$pred <- pred[,1]

google_pred_df <- cust_google_data %>% left_join(x1, by = "date_joined")

google_pred_df %>%
  ggplot(aes(x = date_joined, y = count.x)) +
  geom_point() +
  geom_smooth(data = subset(google_pred_df, date_joined < "2017-04-12"), method = 'lm', se = FALSE) +
  geom_vline(xintercept = as.numeric(as.Date("2017-04-12")), linetype = "dotted", color = "blue", size = 1.5) +
  geom_point(aes(x = date_joined, y = pred), color = 'red', size = 0.5)

The red points show the extrapolated daily customer counts: what we would expect acquisition to look like had the bidding strategy not changed. Summing these daily predictions gives roughly 2,862 customers acquired without the change; multiplying by $30 per account (presumably the cost per account under the old bidding strategy) gives the marketing spend in that case, about $85,852:

sum(x1$pred)*30 # ~2862 predicted accounts x $30 per account, ~85852 of spend
## [1] 85852.05

4. How many additional accounts were created? What was the marginal cost per account for these additional accounts?

#additional accounts created 
sum(x1$count-x1$pred) #1811
## [1] 1811.265
# cost of the additional accounts at the $45 average cost per account
45*1811
## [1] 81495

About 1,811 additional accounts were created because of the change in strategy. At the $45 average cost per account, these additional accounts cost roughly $81,495 in total.
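
An alternative reading of "marginal cost per account" (a sketch reusing the figures computed above) treats the extra spend, i.e. the actual $210,285 minus the estimated counterfactual spend, as the cost of the additional accounts and divides it by their number:

actual_spend     <- 210285                    # Google spend since 2017-04-12 (from question 2)
counterfactual   <- sum(x1$pred) * 30         # estimated spend under the old strategy, ~85852
additional_accts <- sum(x1$count - x1$pred)   # ~1811 additional accounts
(actual_spend - counterfactual) / additional_accts   # roughly $69 per additional account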

Thanks. Author: Ananya