library(tidyverse)
library(dplyr)
library(sqldf)
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
## Reason: image not found
library(lubridate)
library(DT)
library(chron)
library(ggplot2)
library(plyr)
library(ggpubr)
quotes <- read.csv('quotes.csv', header = TRUE)
visitors <- read.csv('visitors.csv',header = TRUE)
head(quotes);summary(quotes);str(quotes)
## request_id quote_id quote_price hired
## Min. : 1 Min. :37719563 Min. : 0.0 Min. :0.0000
## 1st Qu.: 8112 1st Qu.:38226478 1st Qu.: 120.0 1st Qu.:0.0000
## Median :16674 Median :38740194 Median : 195.0 Median :0.0000
## Mean :16780 Mean :38735303 Mean : 242.5 Mean :0.1717
## 3rd Qu.:25331 3rd Qu.:39246298 3rd Qu.: 350.0 3rd Qu.:0.0000
## Max. :34146 Max. :39659761 Max. :25504.0 Max. :1.0000
## NA's :32776
## pro_id
## Min. : 2557
## 1st Qu.: 5328238
## Median : 9287742
## Mean : 9368664
## 3rd Qu.:14287697
## Max. :16771673
##
## 'data.frame': 64330 obs. of 5 variables:
## $ request_id : int 1 1 1 1 1 2 2 3 3 4 ...
## $ quote_id : int 38912310 38912311 38913628 38914071 38912344 39189751 39189531 38146708 38178389 38192457 ...
## $ quote_price: num NA 210 NA NA NA 175 100 NA NA 100 ...
## $ hired : int 0 0 0 0 0 0 1 0 0 0 ...
## $ pro_id : int 851539 10113954 13498826 15921289 14506387 76329 10995392 5836949 5918823 15728846 ...
head(visitors);summary(visitors);str(visitors)
## session_date device category_name
## 8/22/16: 2825 desktop:32295 House Cleaning (One Time) :17347
## 8/15/16: 2791 mobile :27701 Local Moving (under 50 miles):42649
## 8/1/16 : 2718
## 8/8/16 : 2661
## 8/23/16: 2518
## 8/24/16: 2495
## (Other):43988
## sent_request request_id how_far
## Min. :0.0000 Min. : 1 :38545
## 1st Qu.:0.0000 1st Qu.: 8537 11 - 20 miles : 4312
## Median :1.0000 Median :17074 21 - 30 miles : 2069
## Mean :0.5691 Mean :17074 31 - 50 miles : 1683
## 3rd Qu.:1.0000 3rd Qu.:25610 5 - 10 miles : 5703
## Max. :1.0000 Max. :34147 Less than 5 miles : 7007
## NA's :25850 Within the same building: 677
## num_bedrooms num_bathrooms
## :51358 :51358
## 1 bedroom : 1001 1 bathroom : 2081
## 2 bedrooms : 2044 1.5 bathrooms: 822
## 3 bedrooms : 3261 2 bathrooms : 3369
## 4 bedrooms : 1695 3 bathrooms : 1928
## 5+ bedrooms: 460 4+ bathrooms : 438
## Studio : 177
## 'data.frame': 59996 obs. of 8 variables:
## $ session_date : Factor w/ 28 levels "8/1/16","8/10/16",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ device : Factor w/ 2 levels "desktop","mobile": 1 1 1 2 2 1 2 2 2 2 ...
## $ category_name: Factor w/ 2 levels "House Cleaning (One Time)",..: 1 2 2 2 2 1 2 1 2 2 ...
## $ sent_request : int 1 1 1 1 1 1 1 1 1 1 ...
## $ request_id : int 9067 12707 19561 31010 6887 30673 32253 22560 21478 2110 ...
## $ how_far : Factor w/ 7 levels "","11 - 20 miles",..: 1 6 6 6 2 1 3 1 2 3 ...
## $ num_bedrooms : Factor w/ 7 levels "","1 bedroom",..: 2 1 1 1 1 2 1 4 1 1 ...
## $ num_bathrooms: Factor w/ 6 levels "","1 bathroom",..: 2 1 1 1 1 2 1 3 1 1 ...
summary(is.na(quotes))
## request_id quote_id quote_price hired
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:64330 FALSE:64330 FALSE:31554 FALSE:64330
## TRUE :32776
## pro_id
## Mode :logical
## FALSE:64330
##
summary(is.na(visitors))
## session_date device category_name sent_request
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:59996 FALSE:59996 FALSE:59996 FALSE:59996
##
## request_id how_far num_bedrooms num_bathrooms
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:34146 FALSE:59996 FALSE:59996 FALSE:59996
## TRUE :25850
Transform Date into Date format:
class(visitors$session_date)
## [1] "factor"
visitors$session_date <- as.Date(visitors$session_date, format = "%m/%d/%y")
Convert dayofweek into factor with levels + labels Cut 8/1/16 ~ 8/28/16 which is Monday to Sunday into four full weeks:
visitors$session_dayofweek <- wday(visitors$session_date)
visitors$session_dayofweek <- factor(visitors$session_dayofweek, labels = c('Sun','Mon','Tue','Wed','Thur','Fri','Sat'))
visitors$session_week <- cut(visitors$session_date,4,labels = c('wk1','wk2','wk3','wk4'))
table(visitors$session_date);class(visitors$session_date)
##
## 2016-08-01 2016-08-02 2016-08-03 2016-08-04 2016-08-05 2016-08-06
## 2718 2428 2347 2249 1893 1530
## 2016-08-07 2016-08-08 2016-08-09 2016-08-10 2016-08-11 2016-08-12
## 1503 2661 2491 2329 2174 1910
## 2016-08-13 2016-08-14 2016-08-15 2016-08-16 2016-08-17 2016-08-18
## 1562 1649 2791 2489 2435 2198
## 2016-08-19 2016-08-20 2016-08-21 2016-08-22 2016-08-23 2016-08-24
## 1887 1535 1603 2825 2518 2495
## 2016-08-25 2016-08-26 2016-08-27 2016-08-28
## 2378 2070 1680 1648
## [1] "Date"
By date we can see the clear seasonality in a week with high # of visitors on Mondays, low in weekends; Also there’s a week over week increasing trend in the 4 full weeks
By week view proves that week over week, there’s a obvious increase in terms of # of total visitors
By day of week we can prove the seasonality within a week that Monday is the most visited day, and it decrease gradually by day and reach the weekly low on weekends in terms of # of total visitors
visitors_by_date <- sqldf('SELECT session_date, COUNT(*) as CNT FROM visitors GROUP BY session_date')
visitors_by_week <- sqldf('SELECT session_week, COUNT(*) as CNT FROM visitors GROUP BY session_week')
visitors_by_dayofweek <- sqldf('SELECT session_dayofweek, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek')
datatable(visitors_by_date)
a <- ggplot(data = visitors_by_date, aes(x = session_date, y = CNT)) +
geom_line(linetype = "dashed") +
geom_point() +
labs(y="Total # of Distinct Visitors",
x="Date",
title="Total # of Distinct Visitors by Date",
caption = "Source: Thumbtack")
plot(a)
datatable(visitors_by_week)
b <- ggplot(visitors_by_week, aes(x=session_week, y=CNT)) +
geom_point(col="tomato2", size=3) +
geom_segment(aes(x=session_week,
xend=session_week,
y=min(CNT),
yend=max(CNT)),
linetype="dashed",
size=0.1) +
labs(y="Total # of Distinct Visitors",
x="Week",
title="Total # of Distinct Visitors by Week",
caption = "Source: Thumbtack") +
coord_flip()
plot(b)
datatable(visitors_by_dayofweek)
c<-ggplot(data=visitors_by_dayofweek, aes(x=session_dayofweek, y=CNT)) +
geom_bar(stat="identity",fill="grey") +
theme(legend.position="none") +
labs(y="Total # of Distinct Visitors",
x="Day of Week",
title="Total # of Distinct Visitors by Day of Week",
caption = "Source: Thumbtack")
plot(c)
request_ratio <- sqldf('SELECT sum(sent_request) reqeust_cnt, count(*) as total_cnt FROM visitors')
request_ratio$reqeust_ratio <- request_ratio$reqeust_cnt/request_ratio$total_cnt
request_ratio$no_reqeust_ratio <- 1-request_ratio$reqeust_ratio
datatable(request_ratio)
If we view by Date*Category we see that although it shows a similar seasonality within a week across two categories, the Local Moving has far more visitors averagely than House Cleaning
If we view by Day of week*Category, we still see the similar weekday trend across two categories, but Local Moving is having a better standard deviation across days of week, which means it has more fluctuations across different day of a week comparing to House Cleaning
If we view by Date*Device, it’s interesting that Desktop averagely has more visitors, but also has much bigger standard deviation (fluctuations) across comparing to Mobile in terms of # of visitors
If we view by day of week *Device, we see the similar story - Desktop has bigger average # of visitors but also more flutuations
# 1. Date * category
visitors_by_date_cate <- sqldf('SELECT session_date, category_name, COUNT(*) as CNT FROM visitors GROUP BY session_date, category_name ORDER BY 2,1')
datatable(visitors_by_date_cate)
visitors_by_date_cate %>%
dplyr::group_by(category_name) %>%
dplyr::summarise(
total_count = sum(CNT),
mean_count = round(mean(CNT),2),
standard_dev_count = round(sd(CNT),2)
) %>% datatable()
d <- ggplot(data = visitors_by_date_cate, aes(x = session_date, y = CNT, color = category_name)) +
geom_line(linetype = "dashed") +
geom_point() +
labs(y="Total # of Distinct Visitors",
x="Date",
title="Total # of Distinct Visitors by Date * Category",
caption = "Source: Thumbtack")
plot(d)
# 2. Day of week * category
visitors_by_dow_cate <- sqldf('SELECT session_dayofweek, category_name, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek, category_name ORDER BY 2,1')
datatable(visitors_by_dow_cate)
visitors_by_dow_cate %>%
dplyr::group_by(category_name) %>%
dplyr::summarise(
total_count = sum(CNT),
mean_count = round(mean(CNT),2),
standard_dev_count = round(sd(CNT),2)
) %>% datatable()
e <- ggplot(data = visitors_by_dow_cate, aes(x = session_dayofweek, y = CNT, color = category_name)) +
geom_line(linetype = "dashed") +
geom_point() +
labs(y="Total # of Distinct Visitors",
x="Date",
title="Total # of Distinct Visitors by day of week * Category",
caption = "Source: Thumbtack")
plot(e)
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
# 3.Date * Device
visitors_by_date_dev <- sqldf('SELECT session_date, device, COUNT(*) as CNT FROM visitors GROUP BY session_date, device ORDER BY 2,1')
datatable(visitors_by_date_dev)
visitors_by_date_dev %>%
dplyr::group_by(device) %>%
dplyr::summarise(
total_count = sum(CNT),
mean_count = round(mean(CNT),2),
standard_dev_count = round(sd(CNT),2)
) %>% datatable()
f <- ggplot(data = visitors_by_date_dev, aes(x = session_date, y = CNT, color = device)) +
geom_line(linetype = "dashed") +
geom_point() +
labs(y="Total # of Distinct Visitors",
x="Date",
title="Total # of Distinct Visitors by Date * Device",
caption = "Source: Thumbtack")
plot(f)
# 4. Day of week * device
visitors_by_dow_dev <- sqldf('SELECT session_dayofweek, device, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek, device ORDER BY 2,1')
datatable(visitors_by_dow_dev)
visitors_by_dow_dev %>%
dplyr::group_by(device) %>%
dplyr::summarise(
total_count = sum(CNT),
mean_count = round(mean(CNT),2),
standard_dev_count = round(sd(CNT),2)
) %>% datatable()
g <- ggplot(data = visitors_by_dow_dev, aes(x = session_dayofweek, y = CNT, color = device)) +
geom_line(linetype = "dashed") +
geom_point() +
labs(y="Total # of Distinct Visitors",
x="Date",
title="Total # of Distinct Visitors by day of week * device",
caption = "Source: Thumbtack")
plot(g)
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
# Request by category * device
request_dev_cate <- sqldf('SELECT device, category_name, sum(sent_request) as Request_total FROM visitors GROUP BY device, category_name ORDER BY 1,2')
datatable(request_dev_cate)
quotes_per_request <- sqldf('SELECT request_id, count(*) as CNT FROM quotes GROUP BY 1')
quotes_per_request_V2 <-
quotes_per_request %>%
dplyr::group_by(CNT) %>%
dplyr::summarise (n = n()) %>%
mutate(freq = round(n / sum(n),6))
datatable(quotes_per_request_V2)
h<-ggplot(data=quotes_per_request_V2, aes(x=CNT, y=n)) +
geom_bar(stat="identity",fill="grey") +
theme(legend.position="none") +
labs(y="Total # request",
x="Quotes per Request",
title="Quotes per Request Distribution",
caption = "Source: Thumbtack")
plot(h)
*It may shows that professional doesn’t like quoting on too few / too many bedrooms
*It may also shows that professional doesn’t like quoting on too few / too many bathrooms, but 1.5 bathroom is an interesting exception (maybe half bathroom created some problem in cleanning in the past)
*It may shows when the moving requests a vehicle, professionals prefer shorter distance moving because it’s easier to manage and saves time; But when it doesn’t request a vehicle (within a building moving), professionals don’t like it may due to the small profit made from there
# Join two tables to consider all factors
visitor_quotes <- sqldf('SELECT * FROM visitors a JOIN quotes b on a.request_id = b.request_id')
visitor_quotes$request_id <- as.character(visitor_quotes$request_id)
visitor_quotes_cnt_request <- sqldf('SELECT request_id,category_name,device, how_far, num_bedrooms,num_bathrooms, count(*) as CNT FROM visitor_quotes group by 1,2,3,4,5,6')
visitor_quotes_cnt_HC <- filter(visitor_quotes_cnt_request,category_name == 'House Cleaning (One Time)')
visitor_quotes_cnt_LM<- filter(visitor_quotes_cnt_request,category_name == 'Local Moving (under 50 miles)')
cate_freq <- as.data.frame(table(visitor_quotes_cnt_request$category_name,visitor_quotes_cnt_request$CNT))
dev_freq <- as.data.frame(table(visitor_quotes_cnt_request$device,visitor_quotes_cnt_request$CNT))
HC_bd_freq <- as.data.frame(table(visitor_quotes_cnt_HC$num_bedrooms,visitor_quotes_cnt_HC$CNT))
HC_bt_freq <- as.data.frame(table(visitor_quotes_cnt_HC$num_bathrooms,visitor_quotes_cnt_HC$CNT))
LM_hf_freq <- as.data.frame(table(visitor_quotes_cnt_LM$how_far,visitor_quotes_cnt_LM$CNT))
HC_bd_freq_plot <- ggballoonplot(HC_bd_freq, fill = "value")+
scale_fill_viridis_c(option = "C") +
labs(y="# of quotes / request",
x="# of bedroom",
title="# of Bedroom's Impact for House Cleaning Request on the # of Quote Acquired ",
caption = "Source: Thumbtack")
plot(HC_bd_freq_plot)
HC_bt_freq_plot <- ggballoonplot(HC_bt_freq, fill = "value")+
scale_fill_viridis_c(option = "C") +
labs(y="# of quotes / request",
x="# of bathroom",
title="# of Bathroom's Impact for House Cleaning Request on the # of Quote Acquired ",
caption = "Source: Thumbtack")
plot(HC_bt_freq_plot)
LM_hf_freq_plot <- ggballoonplot(LM_hf_freq, fill = "value")+
scale_fill_viridis_c(option = "C") +
labs(y="# of quotes / request",
x="How Far",
title="How Far Impact for Local Moving Request on the # of Quote Acquired ",
caption = "Source: Thumbtack")
plot(LM_hf_freq_plot)
*Outliers and NA values excluded to get a better understanding of the general distribution
visitor_quotes_price <- filter(visitor_quotes,quote_price >= 0 & quote_price <= 850)
# quantile(visitor_quotes_price$quote_price, c(.99)) --- filter out 99% outlier to better represent
mu <- ddply(visitor_quotes_price, "category_name", summarise, grp.mean=mean(quote_price))
median <- ddply(visitor_quotes_price, "category_name", summarise, grp.median=median(quote_price))
std <- ddply(visitor_quotes_price, "category_name", summarise, grp.std=sd(quote_price))
mu
median
std
j <- ggplot(visitor_quotes_price, aes(x=quote_price, color=category_name)) +
geom_histogram(fill="white", alpha=0.5, position="identity",bins = 25) +
geom_vline(data=mu, aes(xintercept=grp.mean, color=category_name),
linetype="dashed")+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2") +
labs(y="Total # of request",
x="Quote Price",
title="Quote Price Distribution by Category",
caption = "Source: Thumbtack")
plot(j)
Answer: Around mean value of each category to score higher quote sucess rate (mean is close to median due to the cleaning of outliers)
Question to explore 1: - For house Cleaning request, how do the num_bedroom, num_bathroom factors impact how many quotes available for the sight request? - Similarly for Local Moving request, how does the how_far factor impact the how many quotes available for the sight request?
Question to explore 2: - What are the driving factors for a higher hired possibilities (modeling on hire / not hire based on available information) - based on the hire rate, who are the star professionals and how do we profile them based on their quote history?