library(dplyr)
library(lubridate)
library(ggplot2)
setwd('C:/Users/anany/Desktop/BANA/Applications/Mock Project XYZ.com')
people <- read.csv('people_person.csv')
people_pets <- read.csv('pets_pet.csv')
services <- read.csv('services_service.csv')
conversation <- read.csv('conversations_conversation.csv')
conversation_review <- read.csv('conversations_review.csv')
people_test_seg <- read.csv('people_testsegmentation.csv')
# str(people_test_seg)
# str(people)
# str(people_pets)
# str(services)
# str(conversation)
# str(conversation_review)
people$date_joined <- as.Date(people$date_joined)
conversation$end_date <- as.Date(conversation$end_date)
conversation$start_date <- as.Date(conversation$start_date)
conversation$booked_at <- as.Date(conversation$booked_at)
conversation$cancelled_at <- as.Date(substr(conversation$cancelled_at, 1, 10), format = "%Y-%m-%d")
conversation$added <- as.Date(conversation$added)
people_test_seg$added_date <- as.Date(people_test_seg$added)
1. How many users have signed up? 64416
2. How many users signed up prior to 2017-01-12? 35500
3. What percentage of users have added pets? 80.44%
4. Of those users, how many pets have they added on average? 1.495
5. What percentage of pets play well with cats? 24.78%
#1
total_users <- people %>%
summarise(count_users = n_distinct(email))
total_users
## count_users
## 1 64416
#2
people %>%
filter(date_joined < '2017-01-12') %>%
summarise(count = n_distinct(email))
## count
## 1 35500
#3
people %>%
inner_join(people_pets, by = c("id"="owner_id")) %>%
summarise(perc_w_pets = n_distinct(email)*100/total_users$count_users)
## perc_w_pets
## 1 80.43964
#4
people %>%
inner_join(people_pets, by = c("id"="owner_id")) %>%
group_by(id) %>%
count() %>%
ungroup() %>%
summarise( mean = mean(n))
## # A tibble: 1 x 1
## mean
## <dbl>
## 1 1.495909
#5
people_pets %>% summarise(perc_play_cats = sum(plays_cats)/n())
## perc_play_cats
## 1 0.2477552
1. What are the possible services and what is the average price per unit for each service type?
#1
services %>% group_by(service_type)%>% summarise(avg_price = mean(price, na.rm = TRUE))
## # A tibble: 3 x 2
## service_type avg_price
## <fctr> <dbl>
## 1 boarding 34.50608
## 2 day-care 27.01369
## 3 dog-walking 21.98219
2. How many requests have there been for each service type and what percentage of those have booked? The percentage of those that have booked is called booking rate.
#2
conversation %>%
inner_join(services,by = c("service_id"="id")) %>%
group_by(service_type) %>%
summarise(count_req = n(), count_booked = sum(!is.na(booked_at)) , perc_booked = count_booked*100/count_req)
## # A tibble: 3 x 4
## service_type count_req count_booked perc_booked
## <fctr> <int> <int> <dbl>
## 1 boarding 26665 10862 40.73505
## 2 day-care 26362 9417 35.72187
## 3 dog-walking 26716 12811 47.95254
3. What are the cancellation rates for each service?
#3
conversation %>%
inner_join(services,by = c("service_id"="id")) %>%
group_by(service_type) %>%
summarise(count_req = n(), count_cancelled = sum(!is.na(cancelled_at)) , perc_cancelled = count_cancelled*100/count_req)
## # A tibble: 3 x 4
## service_type count_req count_cancelled perc_cancelled
## <fctr> <int> <int> <dbl>
## 1 boarding 26665 1051 3.941496
## 2 day-care 26362 919 3.486078
## 3 dog-walking 26716 1218 4.559066
4. For uncancelled bookings, is the owner or the provider more likely to leave a review, and which tends to leave better reviews? How would you narrate this finding to a business partner?
#4
head(conversation,5)
## id start_date end_date units added booking_total
## 1 208294 2017-06-06 2017-06-09 3 2017-05-25 102
## 2 208295 2017-07-16 2017-07-25 9 2017-07-07 324
## 3 208296 2017-06-06 2017-06-12 6 2017-05-26 102
## 4 208297 2017-06-20 2017-06-25 5 2017-06-10 420
## 5 208298 2017-05-25 2017-05-28 3 2017-05-15 66
## cancellation_fault requester_id service_id booked_at cancelled_at
## 1 64416 81642 2017-05-26 <NA>
## 2 64415 79559 <NA> <NA>
## 3 64414 91348 2017-05-29 <NA>
## 4 provider 64413 87158 2017-06-15 2017-06-18
## 5 64411 82832 2017-05-16 <NA>
conversation %>%
filter(is.na(cancelled_at)) %>%
left_join(conversation_review, by = c("id" = "conversation_id"))%>%
summarise( count_total = n(), count_review = sum(!is.na(reviewer_id)), perc_review = count_review*100/count_total, stars = mean(stars, na.rm = TRUE))
## count_total count_review perc_review stars
## 1 81087 28536 35.19183 4.295697
The average price is highest for boarding, followed by day-care and then dog-walking.
The number of requests per service type is more or less the same, with dog-walking slightly higher than the rest. Dog-walking also has the highest booking rate, with 47.95% of its requests booking, followed by boarding at 40.74%.
Dog-walking has the highest cancellation rate as well, at 4.56%, followed by boarding at 3.94%.
For uncancelled bookings, a review is left only about 35% of the time, so reviews are not very likely overall; however, conversations that do get reviewed average 4.29 stars out of 5, which is quite good. The summary above does not split reviews by who wrote them, so it cannot yet say whether the owner or the provider reviews more often; a sketch for that split follows.
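A minimal sketch for splitting reviews by reviewer role, assuming a review whose reviewer_id matches the conversation's requester_id was written by the owner and one matching the service's provider_id was written by the provider (this role mapping is an assumption, not something stated in the data):
# Classify each review on an uncancelled conversation as owner- or provider-written,
# then compare review likelihood and average stars for the two roles.
uncancelled <- conversation %>% filter(is.na(cancelled_at))
n_conv <- nrow(uncancelled)
uncancelled %>%
inner_join(services, by = c("service_id" = "id")) %>%
left_join(conversation_review, by = c("id" = "conversation_id")) %>%
mutate(reviewer_role = ifelse(reviewer_id == requester_id, "owner",
ifelse(reviewer_id == provider_id, "provider", NA))) %>%
filter(!is.na(reviewer_role)) %>%
group_by(reviewer_role) %>%
summarise(count_reviews = n(),
perc_review = count_reviews*100/n_conv,
avg_stars = mean(stars, na.rm = TRUE))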
1. First, let's reproduce their results. They tell you that daily booking rate is defined to be the percentage of conversations created each day that eventually book. What is the daily booking rate for each of the 90 days prior to the snapshot? Is there a downward trend?
conversation %>% filter(added > max(added)-90) %>%
group_by(added) %>% summarise( count = n(), count_booked = sum(!is.na(booked_at)),perc_booking = count_booked/count) %>%
arrange(added) %>%
ggplot(aes( x= added, y = perc_booking))+geom_line()
2. Can you narrate a reason why this trend exists? Is there a reason to be concerned?
conversation %>% filter(added > max(added)-90) %>%
group_by(added) %>% summarise( count = n(), count_booked = sum(!is.na(booked_at)),
count_cancelled = sum(!is.na(cancelled_at)),perc_booking = count_booked/count, time_to_book = mean(booked_at - added, na.rm = TRUE)) %>%
arrange(added)
## # A tibble: 90 x 6
## added count count_booked count_cancelled perc_booking
## <date> <int> <int> <int> <dbl>
## 1 2017-04-12 213 108 12 0.5070423
## 2 2017-04-13 226 103 10 0.4557522
## 3 2017-04-14 213 93 7 0.4366197
## 4 2017-04-15 213 87 10 0.4084507
## 5 2017-04-16 206 94 9 0.4563107
## 6 2017-04-17 207 84 11 0.4057971
## 7 2017-04-18 239 100 10 0.4184100
## 8 2017-04-19 220 104 12 0.4727273
## 9 2017-04-20 212 95 14 0.4481132
## 10 2017-04-21 229 118 12 0.5152838
## # ... with 80 more rows, and 1 more variables: time_to_book <time>
Why this trend exists: the 90-day window shows that customers take on average about 3 days to book after a conversation is created. The most recent few days show a downward trend simply because those conversations have not yet had time to complete this booking window, so the drop the analyst is seeing is not an actual decline in the booking rate. At this point there is no reason to be concerned; we could wait another 2-3 days and check whether the rate for those recent days returns to normal. A quick re-check that excludes the most recent days is sketched below.
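A sketch of that re-check: drop the most recent few days, which are still inside the typical booking window, and see whether a downward trend remains. The 7-day cutoff is chosen arbitrarily for illustration:
snapshot <- max(conversation$added)
conversation %>%
filter(added > snapshot - 90, added <= snapshot - 7) %>%
group_by(added) %>%
summarise(perc_booking = sum(!is.na(booked_at))/n()) %>%
ggplot(aes(x = added, y = perc_booking)) + geom_line() +
geom_smooth(method = "lm", se = FALSE)
If the fitted line is roughly flat once those days are excluded, the apparent decline is an artifact of the booking lag rather than a real drop.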
1. In each month, what were the gross billings and net revenue?
2. Define take rate to be the percentage of gross billings that is net revenue. In the previous example, the take rate is slightly more than 19% since $20/$105 is approximately 0.1905. In each month, what was the aggregate take rate?
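Before building the monthly aggregation, here is a small worked example of these definitions. The fee percentages are assumed purely to reproduce the $20/$105 figure from the prompt; they are not read from the data:
booking_total <- 100
owner_fee <- 0.05*booking_total # fee charged to the owner on top of the booking (assumed 5%)
service_fee <- 0.15*booking_total # fee withheld from the provider's payout (assumed 15%)
gross_billings <- booking_total + owner_fee # $105 collected from the owner
net_revenue <- owner_fee + service_fee # $20 kept by the company
net_revenue/gross_billings # take rate, ~0.1905, i.e. slightly over 19%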
take_rate1 <- people %>%
inner_join(services, c("id"= "provider_id")) %>%
mutate(service_fee = fee.y, booking_fee = fee.x) %>%
inner_join(conversation, c("id.y"= "service_id")) %>%
filter(!is.na(booked_at), is.na(cancelled_at)) %>%
mutate(booking_month = month(booked_at),booking_year = year(booked_at),
owner_fee = ifelse(booking_total*booking_fee>50,50,booking_total*booking_fee),
service_fee1 = booking_total*service_fee,
gross_billing =booking_total+owner_fee,
net_revenue = service_fee1 +owner_fee,
take_rate = net_revenue*100/gross_billing)%>%
group_by(booking_year,booking_month) %>%
summarise(gross_billin_m = sum(gross_billing,na.rm = TRUE),net_revenue_m = sum(net_revenue, na.rm = TRUE), take_rate_m = mean(take_rate, na.rm = TRUE), service_fee1 = mean(service_fee), owner_fee1 = mean(booking_fee))
# note: take_rate_m is the mean of the per-booking take rates; the aggregate
# take rate in the strict sense would be net_revenue_m*100/gross_billin_m
take_rate1
## # A tibble: 25 x 7
## # Groups: booking_year [?]
## booking_year booking_month gross_billin_m net_revenue_m take_rate_m
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2015 7 1035.00 155.25 15.00000
## 2 2015 8 11677.46 1799.61 15.32680
## 3 2015 9 19105.54 3006.54 15.79511
## 4 2015 10 36054.74 5825.34 16.13126
## 5 2015 11 37320.56 6083.91 16.28261
## 6 2015 12 57197.38 9470.73 16.58921
## 7 2016 1 71517.89 12603.54 17.47574
## 8 2016 2 80035.20 14154.25 17.70134
## 9 2016 3 128357.28 23159.58 18.04801
## 10 2016 4 139910.58 25560.08 18.18658
## # ... with 15 more rows, and 2 more variables: service_fee1 <dbl>,
## # owner_fee1 <dbl>
3. Did take rate trend up or trend down or remain unchanged over time?
4. If it did change, investigate why and provide an explanation. Be sure to provide additional data/charts/evidence that justify your explanation. Any claims should be backed by data.
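Before plotting, a quick numeric check of the trend direction (a sketch; month_index simply numbers the months in chronological order):
take_rate1 %>%
ungroup() %>%
mutate(month_index = row_number()) %>%
lm(take_rate_m ~ month_index, data = .) %>%
summary()
A positive, statistically significant coefficient on month_index would confirm the upward trend that the chart below also shows.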
take_rate1 %>% mutate(booking_month1 = ifelse(booking_month <10, paste0('0',booking_month), booking_month),booking_month_year = paste0(booking_year,"-", booking_month1)) %>%
ggplot(aes( x = booking_month_year, y = take_rate_m)) +geom_bar(stat= "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Change of Take Rate over time")
take_rate1 %>% mutate(booking_month1 = ifelse(booking_month <10, paste0('0',booking_month), booking_month),booking_month_year = paste0(booking_year,"-", booking_month1)) %>%
ggplot(aes( x = booking_month_year, y = owner_fee1)) +geom_bar(stat= "identity",fill = "dark cyan") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggtitle("Change of %Owner fee over time")
take_rate1 %>% mutate(booking_month1 = ifelse(booking_month <10, paste0('0',booking_month), booking_month),booking_month_year = paste0(booking_year,"-", booking_month1)) %>%
ggplot(aes( x = booking_month_year, y = service_fee1)) +geom_bar(stat= "identity",fill = "brown") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggtitle("Change of %Service fee over time")
We can clearly see an increasing trend in the take rate over time in the first chart above.
But why?
This is most likely driven by the increases in the average service fee percentage and the average owner (booking) fee percentage visible in the two charts that follow it; a quick correlation check is sketched below.
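To put a number behind that claim, a sketch correlating the monthly take rate with the monthly average fee percentages already present in take_rate1 (exploratory only, not a causal test):
take_rate1 %>%
ungroup() %>%
summarise(cor_take_vs_service_fee = cor(take_rate_m, service_fee1),
cor_take_vs_owner_fee = cor(take_rate_m, owner_fee1))
Correlations close to 1 would support the explanation that the rising take rate tracks the rising fee percentages.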
1. Did conversations with the new conversation page book at a higher rate?
head(people_test_seg,10)
## id test_name test_group added person_id
## 1 61425 Email Test holdout 2015-07-12 15:13:16.673652 1
## 2 61426 Email Test holdout 2015-07-12 13:44:21.949482 2
## 3 61427 Email Test variant 2015-07-12 11:36:06.992714 3
## 4 61428 Email Test holdout 2015-07-12 12:23:43.627071 4
## 5 61429 Email Test holdout 2015-07-12 07:40:36.724086 5
## 6 61430 Email Test holdout 2015-07-12 11:18:40.410251 6
## 7 61431 Email Test holdout 2015-07-12 19:48:21.073046 7
## 8 61432 Email Test holdout 2015-07-12 12:01:51.756294 8
## 9 61433 Email Test holdout 2015-07-12 16:30:27.866647 9
## 10 61434 Email Test variant 2015-07-12 15:32:31.386474 10
## added_date
## 1 2015-07-12
## 2 2015-07-12
## 3 2015-07-12
## 4 2015-07-12
## 5 2015-07-12
## 6 2015-07-12
## 7 2015-07-12
## 8 2015-07-12
## 9 2015-07-12
## 10 2015-07-12
people_test_seg %>%
filter(test_name == "New Conversation Flow") %>%
left_join(conversation, by = c("person_id"="requester_id")) %>%
filter(is.na(booked_at)|booked_at >= "2017-03-13") %>%
group_by(person_id,test_group) %>%
summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
group_by(test_group) %>%
summarise(count_total = n(),count_booked = sum(booked), perc_booked = count_booked*100/count_total)
## # A tibble: 2 x 4
## test_group count_total count_booked perc_booked
## <fctr> <int> <dbl> <dbl>
## 1 holdout 11751 5273 44.87278
## 2 variant 11749 6099 51.91080
The booking rate for the variant group is about 51.9%, roughly 7 percentage points higher than the 44.9% for the holdout group, which was not exposed to the new conversation flow.
2. Is it statistically significant?
holdout_seg <-people_test_seg %>%
filter(test_name == "New Conversation Flow") %>%
left_join(conversation, by = c("person_id"="requester_id")) %>%
filter(test_group == "holdout", is.na(booked_at)|booked_at >= "2017-03-13") %>%
group_by(person_id,test_group) %>%
summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
select(booked)
variant_seg <-people_test_seg %>%
filter(test_name == "New Conversation Flow") %>%
left_join(conversation, by = c("person_id"="requester_id")) %>%
filter(test_group == "variant", is.na(booked_at)|booked_at >= "2017-03-13" ) %>%
group_by(person_id,test_group) %>%
summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
select(booked)
t.test(holdout_seg$booked,variant_seg$booked)
##
## Welch Two Sample t-test
##
## data: holdout_seg$booked and variant_seg$booked
## t = -10.821, df = 23497, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08312850 -0.05763198
## sample estimates:
## mean of x mean of y
## 0.4487278 0.5191080
I ran a two-sample t-test to check whether the roughly 7-percentage-point difference in booking rate between the variant and the holdout groups is statistically significant rather than due to chance. The p-value is far below 0.05, so the variant performed significantly better than the holdout.
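Since booked is a binary outcome, a two-proportion test is a natural cross-check of the t-test. A sketch using the booked/total counts from the group summary above:
# 5273 of 11751 holdout users booked; 6099 of 11749 variant users booked.
prop.test(x = c(5273, 6099), n = c(11751, 11749))
This tests the same difference in booking rates and should lead to the same conclusion.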
3. Do you have any reservations about the experiment design? What would you recommend as next steps?
holdout_seg_pre <-people_test_seg %>%
filter(test_name == "New Conversation Flow") %>%
left_join(conversation, by = c("person_id"="requester_id")) %>%
filter(test_group == "holdout", is.na(booked_at)|booked_at < "2017-03-13") %>%
group_by(person_id,test_group) %>%
summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
select(booked)
variant_seg_pre <-people_test_seg %>%
filter(test_name == "New Conversation Flow") %>%
left_join(conversation, by = c("person_id"="requester_id")) %>%
filter(test_group == "variant", is.na(booked_at)|booked_at < "2017-03-13") %>%
group_by(person_id,test_group) %>%
summarise(booked = max(as.numeric(!is.na(booked_at)))) %>%
select(booked)
t.test(holdout_seg_pre$booked,variant_seg_pre$booked)
##
## Welch Two Sample t-test
##
## data: holdout_seg_pre$booked and variant_seg_pre$booked
## t = -1.7342, df = 17675, p-value = 0.08289
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02496372 0.00152613
## sample estimates:
## mean of x mean of y
## 0.2765841 0.2883029
If we check the booking rates of the holdout and variant segments prior to the A/B test, the variant group is slightly higher; the difference is not statistically significant (p = 0.08), but a gap exists. Ideally the holdout and variant segments should be matched on pre-test performance so that any change observed after the test can be attributed solely to the new conversation flow rather than to a pre-existing bias; a crude adjustment for the gap is sketched below.
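One crude way to net out that pre-existing gap is a back-of-the-envelope difference-in-differences on the group means reported above (a sketch, not a substitute for a properly matched design):
pre <- c(holdout = 0.2765841, variant = 0.2883029) # pre-test booking rates from the second t-test
post <- c(holdout = 0.4487278, variant = 0.5191080) # post-test booking rates from the first t-test
(post["variant"] - pre["variant"]) - (post["holdout"] - pre["holdout"]) # ~0.059, i.e. ~5.9 points of lift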
1. For each day, determine the count of users that joined and were acquired through Google. Plot this and confirm there is an inflection point on or near 2017-04-12.
cust_google_data <- people %>%
filter(channel == "Google") %>%
group_by(date_joined) %>%
summarise(count = n_distinct(id))
cust_google_data %>%
ggplot(aes( x= date_joined, y =count ))+geom_line()+
geom_vline(xintercept = as.numeric(as.Date("2017-04-12")), linetype="dotted", color = "blue", size=1.5)
We can see an inflection point at 2017-04-12, marked by the dotted blue line, after which the daily count of customers joining through Google increases sharply.
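To put a number on the jump, a sketch comparing the mean daily Google sign-up count in the 30 days before and after 2017-04-12 (the 30-day window is an arbitrary choice for illustration):
cust_google_data %>%
filter(date_joined >= as.Date("2017-04-12") - 30,
date_joined < as.Date("2017-04-12") + 30) %>%
mutate(period = ifelse(date_joined < as.Date("2017-04-12"), "before", "after")) %>%
group_by(period) %>%
summarise(mean_daily_signups = mean(count))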
2. How many users were acquired via Google advertising since 2017-04-12 and what was the average cost per account?
people %>%
filter(channel == "Google", date_joined >="2017-04-12") %>%
summarise(count = n_distinct(id), avg_cost = 210285/count)
## count avg_cost
## 1 4673 45
# $45 per account
4,673 customers have been acquired via Google advertising since 2017-04-12, at an average cost of $45 per account (the $210,285 spend used above divided by the number of accounts).
3. Estimate how many users would have been acquired had the company not changed its bidding strategy. What would have been the marketing spend in that case?
a<- cust_google_data %>%
mutate( year1 = year(date_joined),month1 = month(date_joined))%>%
group_by(year1,month1) %>% summarise(sum_c = sum(count))
#No seasonality
ggplot(a,aes(x= month1, y = sum_c))+ geom_line() +facet_wrap(~year1, nrow = 3)
First, I checked whether there is any seasonality in the count of customers joining. The facet plot above shows no clear seasonality, so I decided to extrapolate the counterfactual number of customers acquired using a simple linear regression fitted to the pre-change data.
x<- people %>%
filter(channel == "Google",date_joined < "2017-04-12") %>%
group_by(date_joined) %>%
summarise(count = n_distinct(id))
x1 <- people %>%
filter(channel == "Google",date_joined >= "2017-04-12") %>%
group_by(date_joined) %>%
summarise(count = n_distinct(id))
#predicting
d.fit <- lm(count~date_joined, x)
summary(d.fit)
##
## Call:
## lm(formula = count ~ date_joined, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.2637 -2.2172 -0.0932 2.3408 12.6450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.584e+02 1.469e+01 -51.61 <2e-16 ***
## date_joined 4.565e-02 8.664e-04 52.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.835 on 613 degrees of freedom
## Multiple R-squared: 0.8191, Adjusted R-squared: 0.8188
## F-statistic: 2776 on 1 and 613 DF, p-value: < 2.2e-16
pred <- predict(d.fit, newdata = x1, interval = "prediction")
x1$pred <- pred[,1]
cust_google_data %>% left_join(x1, by = "date_joined")%>%
ggplot(aes( x= date_joined, y =count.x )) +geom_point()+
geom_smooth(data=subset(cust_google_data %>% left_join(x1, by = "date_joined"),date_joined <"2017-04-12"), method='lm',se=F)+
geom_vline(xintercept = as.numeric(as.Date("2017-04-12")), linetype="dotted", color = "blue", size=1.5)+
geom_point(aes( x = date_joined, y = pred),color='red', size =0.5)
The red points show the extrapolated daily customer counts, i.e. what acquisition would have looked like had the bidding strategy not changed. Summing the predictions gives roughly 2,862 customers for the period since 2017-04-12; at an assumed cost of $30 per account, the corresponding marketing spend would have been about:
sum(x1$pred)*30 # ~2,862 predicted accounts at $30 per account
## [1] 85852.05
4. How many additional accounts were created? What was the marginal cost per account for these additional accounts?
#additional accounts created
sum(x1$count-x1$pred) #1811
## [1] 1811.265
#the 1,811 additional accounts valued at the post-change average of $45 per account
45*1811
## [1] 81495
Roughly 1,811 additional accounts were created because of the change in strategy. Note that the $81,495 above is simply those 1,811 extra accounts valued at the $45 average cost, not a per-account figure; a sketch of the marginal cost per additional account follows.
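A hedged way to answer the per-account part of the question: subtract the estimated counterfactual spend (the ~$85,852 from above, which itself assumes ~$30 per account) from the actual $210,285 spend, and divide by the additional accounts:
actual_spend <- 210285 # Google spend since 2017-04-12, as used earlier
counterfactual_spend <- sum(x1$pred)*30 # ~85,852, estimated above
additional_accounts <- sum(x1$count - x1$pred) # ~1,811
(actual_spend - counterfactual_spend)/additional_accounts # roughly $69 per additional account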
Thanks. Author: Ananya