Library load

library(tidyverse)
library(dplyr)
library(sqldf)
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
##   Reason: image not found
library(lubridate)
library(DT)
library(chron)
library(ggplot2)
library(plyr)
library(ggpubr)

Data cleaning

Import data

quotes <- read.csv('quotes.csv', header = TRUE)
visitors <- read.csv('visitors.csv',header = TRUE)

Data summary

  • There are NA values in visitors data nad quotes data, due to the discrepency of visit but not request and request but no quotes / hire.
head(quotes);summary(quotes);str(quotes)
##    request_id       quote_id         quote_price          hired       
##  Min.   :    1   Min.   :37719563   Min.   :    0.0   Min.   :0.0000  
##  1st Qu.: 8112   1st Qu.:38226478   1st Qu.:  120.0   1st Qu.:0.0000  
##  Median :16674   Median :38740194   Median :  195.0   Median :0.0000  
##  Mean   :16780   Mean   :38735303   Mean   :  242.5   Mean   :0.1717  
##  3rd Qu.:25331   3rd Qu.:39246298   3rd Qu.:  350.0   3rd Qu.:0.0000  
##  Max.   :34146   Max.   :39659761   Max.   :25504.0   Max.   :1.0000  
##                                     NA's   :32776                     
##      pro_id        
##  Min.   :    2557  
##  1st Qu.: 5328238  
##  Median : 9287742  
##  Mean   : 9368664  
##  3rd Qu.:14287697  
##  Max.   :16771673  
## 
## 'data.frame':    64330 obs. of  5 variables:
##  $ request_id : int  1 1 1 1 1 2 2 3 3 4 ...
##  $ quote_id   : int  38912310 38912311 38913628 38914071 38912344 39189751 39189531 38146708 38178389 38192457 ...
##  $ quote_price: num  NA 210 NA NA NA 175 100 NA NA 100 ...
##  $ hired      : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ pro_id     : int  851539 10113954 13498826 15921289 14506387 76329 10995392 5836949 5918823 15728846 ...
head(visitors);summary(visitors);str(visitors)
##   session_date       device                            category_name  
##  8/22/16: 2825   desktop:32295   House Cleaning (One Time)    :17347  
##  8/15/16: 2791   mobile :27701   Local Moving (under 50 miles):42649  
##  8/1/16 : 2718                                                        
##  8/8/16 : 2661                                                        
##  8/23/16: 2518                                                        
##  8/24/16: 2495                                                        
##  (Other):43988                                                        
##   sent_request      request_id                        how_far     
##  Min.   :0.0000   Min.   :    1                           :38545  
##  1st Qu.:0.0000   1st Qu.: 8537   11 - 20 miles           : 4312  
##  Median :1.0000   Median :17074   21 - 30 miles           : 2069  
##  Mean   :0.5691   Mean   :17074   31 - 50 miles           : 1683  
##  3rd Qu.:1.0000   3rd Qu.:25610   5 - 10 miles            : 5703  
##  Max.   :1.0000   Max.   :34147   Less than 5 miles       : 7007  
##                   NA's   :25850   Within the same building:  677  
##       num_bedrooms         num_bathrooms  
##             :51358                :51358  
##  1 bedroom  : 1001   1 bathroom   : 2081  
##  2 bedrooms : 2044   1.5 bathrooms:  822  
##  3 bedrooms : 3261   2 bathrooms  : 3369  
##  4 bedrooms : 1695   3 bathrooms  : 1928  
##  5+ bedrooms:  460   4+ bathrooms :  438  
##  Studio     :  177
## 'data.frame':    59996 obs. of  8 variables:
##  $ session_date : Factor w/ 28 levels "8/1/16","8/10/16",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ device       : Factor w/ 2 levels "desktop","mobile": 1 1 1 2 2 1 2 2 2 2 ...
##  $ category_name: Factor w/ 2 levels "House Cleaning (One Time)",..: 1 2 2 2 2 1 2 1 2 2 ...
##  $ sent_request : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ request_id   : int  9067 12707 19561 31010 6887 30673 32253 22560 21478 2110 ...
##  $ how_far      : Factor w/ 7 levels "","11 - 20 miles",..: 1 6 6 6 2 1 3 1 2 3 ...
##  $ num_bedrooms : Factor w/ 7 levels "","1 bedroom",..: 2 1 1 1 1 2 1 4 1 1 ...
##  $ num_bathrooms: Factor w/ 6 levels "","1 bathroom",..: 2 1 1 1 1 2 1 3 1 1 ...
summary(is.na(quotes))
##  request_id       quote_id       quote_price       hired        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:64330     FALSE:64330     FALSE:31554     FALSE:64330    
##                                  TRUE :32776                    
##    pro_id       
##  Mode :logical  
##  FALSE:64330    
## 
summary(is.na(visitors))
##  session_date      device        category_name   sent_request   
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:59996     FALSE:59996     FALSE:59996     FALSE:59996    
##                                                                 
##  request_id       how_far        num_bedrooms    num_bathrooms  
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:34146     FALSE:59996     FALSE:59996     FALSE:59996    
##  TRUE :25850

Data transformation

Transform Date into Date format:

class(visitors$session_date)
## [1] "factor"
visitors$session_date <- as.Date(visitors$session_date, format = "%m/%d/%y")

Convert dayofweek into factor with levels + labels Cut 8/1/16 ~ 8/28/16 which is Monday to Sunday into four full weeks:

visitors$session_dayofweek <- wday(visitors$session_date)
visitors$session_dayofweek <- factor(visitors$session_dayofweek, labels = c('Sun','Mon','Tue','Wed','Thur','Fri','Sat'))

visitors$session_week <- cut(visitors$session_date,4,labels = c('wk1','wk2','wk3','wk4'))
table(visitors$session_date);class(visitors$session_date)
## 
## 2016-08-01 2016-08-02 2016-08-03 2016-08-04 2016-08-05 2016-08-06 
##       2718       2428       2347       2249       1893       1530 
## 2016-08-07 2016-08-08 2016-08-09 2016-08-10 2016-08-11 2016-08-12 
##       1503       2661       2491       2329       2174       1910 
## 2016-08-13 2016-08-14 2016-08-15 2016-08-16 2016-08-17 2016-08-18 
##       1562       1649       2791       2489       2435       2198 
## 2016-08-19 2016-08-20 2016-08-21 2016-08-22 2016-08-23 2016-08-24 
##       1887       1535       1603       2825       2518       2495 
## 2016-08-25 2016-08-26 2016-08-27 2016-08-28 
##       2378       2070       1680       1648
## [1] "Date"

PART I visitors and coversition rate

Q1: how does the number of visitors change over time?

  • By date we can see the clear seasonality in a week with high # of visitors on Mondays, low in weekends; Also there’s a week over week increasing trend in the 4 full weeks

  • By week view proves that week over week, there’s a obvious increase in terms of # of total visitors

  • By day of week we can prove the seasonality within a week that Monday is the most visited day, and it decrease gradually by day and reach the weekly low on weekends in terms of # of total visitors

visitors_by_date <- sqldf('SELECT session_date, COUNT(*) as CNT FROM visitors GROUP BY session_date')
visitors_by_week <- sqldf('SELECT session_week, COUNT(*) as CNT FROM visitors GROUP BY session_week')
visitors_by_dayofweek <- sqldf('SELECT session_dayofweek, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek')

datatable(visitors_by_date)
a <- ggplot(data = visitors_by_date, aes(x = session_date, y = CNT))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by Date", 
       caption = "Source: Thumbtack")
plot(a)

datatable(visitors_by_week)
b <- ggplot(visitors_by_week, aes(x=session_week, y=CNT)) + 
  geom_point(col="tomato2", size=3) + 
  geom_segment(aes(x=session_week, 
                   xend=session_week, 
                   y=min(CNT), 
                   yend=max(CNT)), 
               linetype="dashed", 
               size=0.1) +
  labs(y="Total # of Distinct Visitors", 
       x="Week", 
       title="Total # of Distinct Visitors by Week", 
       caption = "Source: Thumbtack") + 
  coord_flip()
plot(b)

datatable(visitors_by_dayofweek)
c<-ggplot(data=visitors_by_dayofweek, aes(x=session_dayofweek, y=CNT)) +
  geom_bar(stat="identity",fill="grey") + 
  theme(legend.position="none") +
  labs(y="Total # of Distinct Visitors", 
     x="Day of Week", 
     title="Total # of Distinct Visitors by Day of Week", 
     caption = "Source: Thumbtack")
plot(c)

Q2: What percent of visitors submit a project request?

  • About 57% of visitors submitted a project requst in the 4 full weeks available
request_ratio <- sqldf('SELECT sum(sent_request) reqeust_cnt, count(*) as total_cnt FROM visitors')
request_ratio$reqeust_ratio <- request_ratio$reqeust_cnt/request_ratio$total_cnt
request_ratio$no_reqeust_ratio <- 1-request_ratio$reqeust_ratio
datatable(request_ratio)

Q3: How do category and device influence these metrics?

  • If we view by Date*Category we see that although it shows a similar seasonality within a week across two categories, the Local Moving has far more visitors averagely than House Cleaning

  • If we view by Day of week*Category, we still see the similar weekday trend across two categories, but Local Moving is having a better standard deviation across days of week, which means it has more fluctuations across different day of a week comparing to House Cleaning

  • If we view by Date*Device, it’s interesting that Desktop averagely has more visitors, but also has much bigger standard deviation (fluctuations) across comparing to Mobile in terms of # of visitors

  • If we view by day of week *Device, we see the similar story - Desktop has bigger average # of visitors but also more flutuations

# 1. Date * category

visitors_by_date_cate <- sqldf('SELECT session_date, category_name, COUNT(*) as CNT FROM visitors GROUP BY session_date, category_name ORDER BY 2,1')


datatable(visitors_by_date_cate)
visitors_by_date_cate %>% 
  dplyr::group_by(category_name) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
  ) %>% datatable()
d <- ggplot(data = visitors_by_date_cate, aes(x = session_date, y = CNT, color = category_name))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by Date * Category", 
       caption = "Source: Thumbtack")
plot(d)

# 2. Day of week * category
visitors_by_dow_cate <- sqldf('SELECT session_dayofweek, category_name, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek, category_name ORDER BY 2,1')

datatable(visitors_by_dow_cate)
visitors_by_dow_cate %>% 
  dplyr::group_by(category_name) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
  ) %>% datatable()
e <- ggplot(data = visitors_by_dow_cate, aes(x = session_dayofweek, y = CNT, color = category_name))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by day of week * Category", 
       caption = "Source: Thumbtack")
plot(e)
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

# 3.Date * Device
visitors_by_date_dev <- sqldf('SELECT session_date, device, COUNT(*) as CNT FROM visitors GROUP BY session_date, device ORDER BY 2,1')

datatable(visitors_by_date_dev)
visitors_by_date_dev %>% 
  dplyr::group_by(device) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
) %>% datatable()
f <- ggplot(data = visitors_by_date_dev, aes(x = session_date, y = CNT, color = device))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by Date * Device", 
       caption = "Source: Thumbtack")
plot(f)

# 4. Day of week * device
visitors_by_dow_dev <- sqldf('SELECT session_dayofweek, device, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek, device ORDER BY 2,1')

datatable(visitors_by_dow_dev)
visitors_by_dow_dev %>% 
  dplyr::group_by(device) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
  ) %>% datatable()
g <- ggplot(data = visitors_by_dow_dev, aes(x = session_dayofweek, y = CNT, color = device))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by day of week * device", 
       caption = "Source: Thumbtack")
plot(g)
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

# Request by category * device

request_dev_cate <- sqldf('SELECT device, category_name, sum(sent_request) as Request_total FROM visitors GROUP BY device, category_name ORDER BY 1,2')
datatable(request_dev_cate)

PART II Quotes per reqeust

Q1: What’s the distribution of number of quotes per request?

  • We see most request has 2 quotes (29.6%), second is 1 quote (28.4%), then is 3 quotes (23.0%), 4th is 4 quotes (11.6%), 5th is 5 quotes (7.1%), and the last one is 6 quotes (0.004%)
quotes_per_request <- sqldf('SELECT request_id, count(*) as CNT FROM quotes GROUP BY 1')
quotes_per_request_V2 <- 
  quotes_per_request %>%
  dplyr::group_by(CNT) %>%
  dplyr::summarise (n = n()) %>%
  mutate(freq = round(n / sum(n),6))

datatable(quotes_per_request_V2)
h<-ggplot(data=quotes_per_request_V2, aes(x=CNT, y=n)) +
  geom_bar(stat="identity",fill="grey") + 
  theme(legend.position="none") +
  labs(y="Total # request", 
       x="Quotes per Request", 
       title="Quotes per Request Distribution", 
       caption = "Source: Thumbtack")
plot(h)

Q2: What factors contribute to some requests getting more quotes than others?

  • Looking at the 1st graph which shows in the House Cleaning task, the frequency relationship the # of bed rooms and # of quotes per request: It’s cleary that 3 bedrooms always acquire most quotes across a # of quotes groups ,then 2/4 bedrooms, then 1/5+ bedrooms and the least one is studio

*It may shows that professional doesn’t like quoting on too few / too many bedrooms

  • Looking at the 2nd graph which shows in the House Cleaning task, the frequency relationship between # of bedrooms and the # of quotes per request: it’s clear that 2 bathroom is the most popular option, then 1/3, then 1.5, the least one is 4+

*It may also shows that professional doesn’t like quoting on too few / too many bathrooms, but 1.5 bathroom is an interesting exception (maybe half bathroom created some problem in cleanning in the past)

  • Looking at 3rd graph which shows in Local Moving task, the frequency relationship between the distance and the # of quotes per request. It shows professionals love shorter distance moving (Less than 5 miles) most, then gradually lose interests when the distance grows. But what interesting here is that the least popular one is within a building moving

*It may shows when the moving requests a vehicle, professionals prefer shorter distance moving because it’s easier to manage and saves time; But when it doesn’t request a vehicle (within a building moving), professionals don’t like it may due to the small profit made from there

# Join two tables to consider all factors
visitor_quotes <- sqldf('SELECT * FROM visitors a JOIN quotes b on a.request_id = b.request_id')
visitor_quotes$request_id <- as.character(visitor_quotes$request_id)
visitor_quotes_cnt_request <- sqldf('SELECT request_id,category_name,device, how_far, num_bedrooms,num_bathrooms, count(*) as CNT  FROM visitor_quotes group by 1,2,3,4,5,6')
visitor_quotes_cnt_HC <- filter(visitor_quotes_cnt_request,category_name == 'House Cleaning (One Time)')
visitor_quotes_cnt_LM<- filter(visitor_quotes_cnt_request,category_name == 'Local Moving (under 50 miles)')

cate_freq <- as.data.frame(table(visitor_quotes_cnt_request$category_name,visitor_quotes_cnt_request$CNT))
dev_freq <- as.data.frame(table(visitor_quotes_cnt_request$device,visitor_quotes_cnt_request$CNT))
HC_bd_freq <- as.data.frame(table(visitor_quotes_cnt_HC$num_bedrooms,visitor_quotes_cnt_HC$CNT))
HC_bt_freq <- as.data.frame(table(visitor_quotes_cnt_HC$num_bathrooms,visitor_quotes_cnt_HC$CNT))
LM_hf_freq <- as.data.frame(table(visitor_quotes_cnt_LM$how_far,visitor_quotes_cnt_LM$CNT))


HC_bd_freq_plot <- ggballoonplot(HC_bd_freq, fill = "value")+
  scale_fill_viridis_c(option = "C") +
  labs(y="# of quotes / request", 
       x="# of bedroom", 
       title="# of Bedroom's Impact for House Cleaning Request on the # of Quote Acquired ", 
       caption = "Source: Thumbtack")
  
plot(HC_bd_freq_plot)

HC_bt_freq_plot <- ggballoonplot(HC_bt_freq, fill = "value")+
  scale_fill_viridis_c(option = "C") +
  labs(y="# of quotes / request", 
       x="# of bathroom", 
       title="# of Bathroom's Impact for House Cleaning Request on the # of Quote Acquired ", 
       caption = "Source: Thumbtack")

plot(HC_bt_freq_plot)

LM_hf_freq_plot <- ggballoonplot(LM_hf_freq, fill = "value")+
  scale_fill_viridis_c(option = "C") +
  labs(y="# of quotes / request", 
       x="How Far", 
       title="How Far Impact for Local Moving Request on the # of Quote Acquired ", 
       caption = "Source: Thumbtack")

plot(LM_hf_freq_plot)

PART III Job Value

Q1: By category, what is the distribution of quote prices for a project?

*Outliers and NA values excluded to get a better understanding of the general distribution

  • The mean quote price is 149.95 dollars for House Cleaning and 328.24 dollars for Local Moving
  • The median quote price is 140 dollars for House Cleaning and 350 dollars for Local Moving
  • The standard deviation is 87.64 dollars for House Cleaning and 145.62 dollars for Local Moving
visitor_quotes_price <- filter(visitor_quotes,quote_price >= 0 & quote_price <= 850)
# quantile(visitor_quotes_price$quote_price, c(.99)) --- filter out 99% outlier to better represent
mu <- ddply(visitor_quotes_price, "category_name", summarise, grp.mean=mean(quote_price))
median <- ddply(visitor_quotes_price, "category_name", summarise, grp.median=median(quote_price))
std <- ddply(visitor_quotes_price, "category_name", summarise, grp.std=sd(quote_price))

mu
median
std
j <- ggplot(visitor_quotes_price, aes(x=quote_price, color=category_name)) +
  geom_histogram(fill="white", alpha=0.5, position="identity",bins = 25) + 
  geom_vline(data=mu, aes(xintercept=grp.mean, color=category_name),
             linetype="dashed")+
  scale_color_brewer(palette="Dark2")+
  scale_fill_brewer(palette="Dark2") +
  labs(y="Total # of request", 
       x="Quote Price", 
       title="Quote Price Distribution by Category", 
       caption = "Source: Thumbtack")
plot(j)

Q2: By category, what price do you think Thumbtack should charge pros to quote? Why? Explain your reasoning for any assumptions you make.

Answer: Around mean value of each category to score higher quote sucess rate (mean is close to median due to the cleaning of outliers)

PART IV Further Exploration

Q1: Briefly describe two other questions or analyses you would be interested in pursuing if you had more time to explore the data

Question to explore 1: - For house Cleaning request, how do the num_bedroom, num_bathroom factors impact how many quotes available for the sight request? - Similarly for Local Moving request, how does the how_far factor impact the how many quotes available for the sight request?

Question to explore 2: - What are the driving factors for a higher hired possibilities (modeling on hire / not hire based on available information) - based on the hire rate, who are the star professionals and how do we profile them based on their quote history?