Library load

library(tidyverse)
library(dplyr)
library(sqldf)

## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
##   Reason: image not found

library(lubridate)
library(DT)
library(chron)
library(ggplot2)
library(plyr)
library(ggpubr)

Data cleaning

Import data

quotes <- read.csv('quotes.csv', header = TRUE)
visitors <- read.csv('visitors.csv',header = TRUE)

Data summary

There are NA values in visitors data nad quotes data, due to the discrepency of visit but not request and request but no quotes / hire.

head(quotes);summary(quotes);str(quotes)

##    request_id       quote_id         quote_price          hired       
##  Min.   :    1   Min.   :37719563   Min.   :    0.0   Min.   :0.0000  
##  1st Qu.: 8112   1st Qu.:38226478   1st Qu.:  120.0   1st Qu.:0.0000  
##  Median :16674   Median :38740194   Median :  195.0   Median :0.0000  
##  Mean   :16780   Mean   :38735303   Mean   :  242.5   Mean   :0.1717  
##  3rd Qu.:25331   3rd Qu.:39246298   3rd Qu.:  350.0   3rd Qu.:0.0000  
##  Max.   :34146   Max.   :39659761   Max.   :25504.0   Max.   :1.0000  
##                                     NA's   :32776                     
##      pro_id        
##  Min.   :    2557  
##  1st Qu.: 5328238  
##  Median : 9287742  
##  Mean   : 9368664  
##  3rd Qu.:14287697  
##  Max.   :16771673  
##

## 'data.frame':    64330 obs. of  5 variables:
##  $ request_id : int  1 1 1 1 1 2 2 3 3 4 ...
##  $ quote_id   : int  38912310 38912311 38913628 38914071 38912344 39189751 39189531 38146708 38178389 38192457 ...
##  $ quote_price: num  NA 210 NA NA NA 175 100 NA NA 100 ...
##  $ hired      : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ pro_id     : int  851539 10113954 13498826 15921289 14506387 76329 10995392 5836949 5918823 15728846 ...

head(visitors);summary(visitors);str(visitors)

##   session_date       device                            category_name  
##  8/22/16: 2825   desktop:32295   House Cleaning (One Time)    :17347  
##  8/15/16: 2791   mobile :27701   Local Moving (under 50 miles):42649  
##  8/1/16 : 2718                                                        
##  8/8/16 : 2661                                                        
##  8/23/16: 2518                                                        
##  8/24/16: 2495                                                        
##  (Other):43988                                                        
##   sent_request      request_id                        how_far     
##  Min.   :0.0000   Min.   :    1                           :38545  
##  1st Qu.:0.0000   1st Qu.: 8537   11 - 20 miles           : 4312  
##  Median :1.0000   Median :17074   21 - 30 miles           : 2069  
##  Mean   :0.5691   Mean   :17074   31 - 50 miles           : 1683  
##  3rd Qu.:1.0000   3rd Qu.:25610   5 - 10 miles            : 5703  
##  Max.   :1.0000   Max.   :34147   Less than 5 miles       : 7007  
##                   NA's   :25850   Within the same building:  677  
##       num_bedrooms         num_bathrooms  
##             :51358                :51358  
##  1 bedroom  : 1001   1 bathroom   : 2081  
##  2 bedrooms : 2044   1.5 bathrooms:  822  
##  3 bedrooms : 3261   2 bathrooms  : 3369  
##  4 bedrooms : 1695   3 bathrooms  : 1928  
##  5+ bedrooms:  460   4+ bathrooms :  438  
##  Studio     :  177

## 'data.frame':    59996 obs. of  8 variables:
##  $ session_date : Factor w/ 28 levels "8/1/16","8/10/16",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ device       : Factor w/ 2 levels "desktop","mobile": 1 1 1 2 2 1 2 2 2 2 ...
##  $ category_name: Factor w/ 2 levels "House Cleaning (One Time)",..: 1 2 2 2 2 1 2 1 2 2 ...
##  $ sent_request : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ request_id   : int  9067 12707 19561 31010 6887 30673 32253 22560 21478 2110 ...
##  $ how_far      : Factor w/ 7 levels "","11 - 20 miles",..: 1 6 6 6 2 1 3 1 2 3 ...
##  $ num_bedrooms : Factor w/ 7 levels "","1 bedroom",..: 2 1 1 1 1 2 1 4 1 1 ...
##  $ num_bathrooms: Factor w/ 6 levels "","1 bathroom",..: 2 1 1 1 1 2 1 3 1 1 ...

summary(is.na(quotes))

##  request_id       quote_id       quote_price       hired        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:64330     FALSE:64330     FALSE:31554     FALSE:64330    
##                                  TRUE :32776                    
##    pro_id       
##  Mode :logical  
##  FALSE:64330    
##

summary(is.na(visitors))

##  session_date      device        category_name   sent_request   
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:59996     FALSE:59996     FALSE:59996     FALSE:59996    
##                                                                 
##  request_id       how_far        num_bedrooms    num_bathrooms  
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:34146     FALSE:59996     FALSE:59996     FALSE:59996    
##  TRUE :25850

Data transformation

Transform Date into Date format:

class(visitors$session_date)

## [1] "factor"

visitors$session_date <- as.Date(visitors$session_date, format = "%m/%d/%y")

Convert dayofweek into factor with levels + labels Cut 8/1/16 ~ 8/28/16 which is Monday to Sunday into four full weeks:

visitors$session_dayofweek <- wday(visitors$session_date)
visitors$session_dayofweek <- factor(visitors$session_dayofweek, labels = c('Sun','Mon','Tue','Wed','Thur','Fri','Sat'))

visitors$session_week <- cut(visitors$session_date,4,labels = c('wk1','wk2','wk3','wk4'))
table(visitors$session_date);class(visitors$session_date)

## 
## 2016-08-01 2016-08-02 2016-08-03 2016-08-04 2016-08-05 2016-08-06 
##       2718       2428       2347       2249       1893       1530 
## 2016-08-07 2016-08-08 2016-08-09 2016-08-10 2016-08-11 2016-08-12 
##       1503       2661       2491       2329       2174       1910 
## 2016-08-13 2016-08-14 2016-08-15 2016-08-16 2016-08-17 2016-08-18 
##       1562       1649       2791       2489       2435       2198 
## 2016-08-19 2016-08-20 2016-08-21 2016-08-22 2016-08-23 2016-08-24 
##       1887       1535       1603       2825       2518       2495 
## 2016-08-25 2016-08-26 2016-08-27 2016-08-28 
##       2378       2070       1680       1648

## [1] "Date"

PART I visitors and coversition rate

Q1: how does the number of visitors change over time?

By date we can see the clear seasonality in a week with high # of visitors on Mondays, low in weekends; Also there’s a week over week increasing trend in the 4 full weeks
By week view proves that week over week, there’s a obvious increase in terms of # of total visitors
By day of week we can prove the seasonality within a week that Monday is the most visited day, and it decrease gradually by day and reach the weekly low on weekends in terms of # of total visitors

visitors_by_date <- sqldf('SELECT session_date, COUNT(*) as CNT FROM visitors GROUP BY session_date')
visitors_by_week <- sqldf('SELECT session_week, COUNT(*) as CNT FROM visitors GROUP BY session_week')
visitors_by_dayofweek <- sqldf('SELECT session_dayofweek, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek')

datatable(visitors_by_date)

a <- ggplot(data = visitors_by_date, aes(x = session_date, y = CNT))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by Date", 
       caption = "Source: Thumbtack")
plot(a)

datatable(visitors_by_week)

b <- ggplot(visitors_by_week, aes(x=session_week, y=CNT)) + 
  geom_point(col="tomato2", size=3) + 
  geom_segment(aes(x=session_week, 
                   xend=session_week, 
                   y=min(CNT), 
                   yend=max(CNT)), 
               linetype="dashed", 
               size=0.1) +
  labs(y="Total # of Distinct Visitors", 
       x="Week", 
       title="Total # of Distinct Visitors by Week", 
       caption = "Source: Thumbtack") + 
  coord_flip()
plot(b)

datatable(visitors_by_dayofweek)

c<-ggplot(data=visitors_by_dayofweek, aes(x=session_dayofweek, y=CNT)) +
  geom_bar(stat="identity",fill="grey") + 
  theme(legend.position="none") +
  labs(y="Total # of Distinct Visitors", 
     x="Day of Week", 
     title="Total # of Distinct Visitors by Day of Week", 
     caption = "Source: Thumbtack")
plot(c)

Q2: What percent of visitors submit a project request?

About 57% of visitors submitted a project requst in the 4 full weeks available

request_ratio <- sqldf('SELECT sum(sent_request) reqeust_cnt, count(*) as total_cnt FROM visitors')
request_ratio$reqeust_ratio <- request_ratio$reqeust_cnt/request_ratio$total_cnt
request_ratio$no_reqeust_ratio <- 1-request_ratio$reqeust_ratio
datatable(request_ratio)

Q3: How do category and device influence these metrics?

If we view by Date*Category we see that although it shows a similar seasonality within a week across two categories, the Local Moving has far more visitors averagely than House Cleaning
If we view by Day of week*Category, we still see the similar weekday trend across two categories, but Local Moving is having a better standard deviation across days of week, which means it has more fluctuations across different day of a week comparing to House Cleaning
If we view by Date*Device, it’s interesting that Desktop averagely has more visitors, but also has much bigger standard deviation (fluctuations) across comparing to Mobile in terms of # of visitors
If we view by day of week *Device, we see the similar story - Desktop has bigger average # of visitors but also more flutuations

# 1. Date * category

visitors_by_date_cate <- sqldf('SELECT session_date, category_name, COUNT(*) as CNT FROM visitors GROUP BY session_date, category_name ORDER BY 2,1')


datatable(visitors_by_date_cate)

visitors_by_date_cate %>% 
  dplyr::group_by(category_name) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
  ) %>% datatable()

d <- ggplot(data = visitors_by_date_cate, aes(x = session_date, y = CNT, color = category_name))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by Date * Category", 
       caption = "Source: Thumbtack")
plot(d)

# 2. Day of week * category
visitors_by_dow_cate <- sqldf('SELECT session_dayofweek, category_name, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek, category_name ORDER BY 2,1')

datatable(visitors_by_dow_cate)

visitors_by_dow_cate %>% 
  dplyr::group_by(category_name) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
  ) %>% datatable()

e <- ggplot(data = visitors_by_dow_cate, aes(x = session_dayofweek, y = CNT, color = category_name))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by day of week * Category", 
       caption = "Source: Thumbtack")
plot(e)

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

# 3.Date * Device
visitors_by_date_dev <- sqldf('SELECT session_date, device, COUNT(*) as CNT FROM visitors GROUP BY session_date, device ORDER BY 2,1')

datatable(visitors_by_date_dev)

visitors_by_date_dev %>% 
  dplyr::group_by(device) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
) %>% datatable()

f <- ggplot(data = visitors_by_date_dev, aes(x = session_date, y = CNT, color = device))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by Date * Device", 
       caption = "Source: Thumbtack")
plot(f)

# 4. Day of week * device
visitors_by_dow_dev <- sqldf('SELECT session_dayofweek, device, COUNT(*) as CNT FROM visitors GROUP BY session_dayofweek, device ORDER BY 2,1')

datatable(visitors_by_dow_dev)

visitors_by_dow_dev %>% 
  dplyr::group_by(device) %>% 
  dplyr::summarise(
    total_count = sum(CNT),
    mean_count = round(mean(CNT),2),
    standard_dev_count = round(sd(CNT),2)
  ) %>% datatable()

g <- ggplot(data = visitors_by_dow_dev, aes(x = session_dayofweek, y = CNT, color = device))  + 
  geom_line(linetype = "dashed") + 
  geom_point() + 
  labs(y="Total # of Distinct Visitors", 
       x="Date", 
       title="Total # of Distinct Visitors by day of week * device", 
       caption = "Source: Thumbtack")
plot(g)

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

# Request by category * device

request_dev_cate <- sqldf('SELECT device, category_name, sum(sent_request) as Request_total FROM visitors GROUP BY device, category_name ORDER BY 1,2')
datatable(request_dev_cate)

PART II Quotes per reqeust

Q1: What’s the distribution of number of quotes per request?

We see most request has 2 quotes (29.6%), second is 1 quote (28.4%), then is 3 quotes (23.0%), 4th is 4 quotes (11.6%), 5th is 5 quotes (7.1%), and the last one is 6 quotes (0.004%)

quotes_per_request <- sqldf('SELECT request_id, count(*) as CNT FROM quotes GROUP BY 1')
quotes_per_request_V2 <- 
  quotes_per_request %>%
  dplyr::group_by(CNT) %>%
  dplyr::summarise (n = n()) %>%
  mutate(freq = round(n / sum(n),6))

datatable(quotes_per_request_V2)

h<-ggplot(data=quotes_per_request_V2, aes(x=CNT, y=n)) +
  geom_bar(stat="identity",fill="grey") + 
  theme(legend.position="none") +
  labs(y="Total # request", 
       x="Quotes per Request", 
       title="Quotes per Request Distribution", 
       caption = "Source: Thumbtack")
plot(h)

Q2: What factors contribute to some requests getting more quotes than others?

Looking at the 1st graph which shows in the House Cleaning task, the frequency relationship the # of bed rooms and # of quotes per request: It’s cleary that 3 bedrooms always acquire most quotes across a # of quotes groups ,then 2/4 bedrooms, then 1/5+ bedrooms and the least one is studio

*It may shows that professional doesn’t like quoting on too few / too many bedrooms

Looking at the 2nd graph which shows in the House Cleaning task, the frequency relationship between # of bedrooms and the # of quotes per request: it’s clear that 2 bathroom is the most popular option, then 1/3, then 1.5, the least one is 4+

*It may also shows that professional doesn’t like quoting on too few / too many bathrooms, but 1.5 bathroom is an interesting exception (maybe half bathroom created some problem in cleanning in the past)

Looking at 3rd graph which shows in Local Moving task, the frequency relationship between the distance and the # of quotes per request. It shows professionals love shorter distance moving (Less than 5 miles) most, then gradually lose interests when the distance grows. But what interesting here is that the least popular one is within a building moving

*It may shows when the moving requests a vehicle, professionals prefer shorter distance moving because it’s easier to manage and saves time; But when it doesn’t request a vehicle (within a building moving), professionals don’t like it may due to the small profit made from there

# Join two tables to consider all factors
visitor_quotes <- sqldf('SELECT * FROM visitors a JOIN quotes b on a.request_id = b.request_id')
visitor_quotes$request_id <- as.character(visitor_quotes$request_id)
visitor_quotes_cnt_request <- sqldf('SELECT request_id,category_name,device, how_far, num_bedrooms,num_bathrooms, count(*) as CNT  FROM visitor_quotes group by 1,2,3,4,5,6')
visitor_quotes_cnt_HC <- filter(visitor_quotes_cnt_request,category_name == 'House Cleaning (One Time)')
visitor_quotes_cnt_LM<- filter(visitor_quotes_cnt_request,category_name == 'Local Moving (under 50 miles)')

cate_freq <- as.data.frame(table(visitor_quotes_cnt_request$category_name,visitor_quotes_cnt_request$CNT))
dev_freq <- as.data.frame(table(visitor_quotes_cnt_request$device,visitor_quotes_cnt_request$CNT))
HC_bd_freq <- as.data.frame(table(visitor_quotes_cnt_HC$num_bedrooms,visitor_quotes_cnt_HC$CNT))
HC_bt_freq <- as.data.frame(table(visitor_quotes_cnt_HC$num_bathrooms,visitor_quotes_cnt_HC$CNT))
LM_hf_freq <- as.data.frame(table(visitor_quotes_cnt_LM$how_far,visitor_quotes_cnt_LM$CNT))


HC_bd_freq_plot <- ggballoonplot(HC_bd_freq, fill = "value")+
  scale_fill_viridis_c(option = "C") +
  labs(y="# of quotes / request", 
       x="# of bedroom", 
       title="# of Bedroom's Impact for House Cleaning Request on the # of Quote Acquired ", 
       caption = "Source: Thumbtack")
  
plot(HC_bd_freq_plot)

HC_bt_freq_plot <- ggballoonplot(HC_bt_freq, fill = "value")+
  scale_fill_viridis_c(option = "C") +
  labs(y="# of quotes / request", 
       x="# of bathroom", 
       title="# of Bathroom's Impact for House Cleaning Request on the # of Quote Acquired ", 
       caption = "Source: Thumbtack")

plot(HC_bt_freq_plot)

LM_hf_freq_plot <- ggballoonplot(LM_hf_freq, fill = "value")+
  scale_fill_viridis_c(option = "C") +
  labs(y="# of quotes / request", 
       x="How Far", 
       title="How Far Impact for Local Moving Request on the # of Quote Acquired ", 
       caption = "Source: Thumbtack")

plot(LM_hf_freq_plot)

PART III Job Value

Q1: By category, what is the distribution of quote prices for a project?

*Outliers and NA values excluded to get a better understanding of the general distribution

The mean quote price is 149.95 dollars for House Cleaning and 328.24 dollars for Local Moving
The median quote price is 140 dollars for House Cleaning and 350 dollars for Local Moving
The standard deviation is 87.64 dollars for House Cleaning and 145.62 dollars for Local Moving

visitor_quotes_price <- filter(visitor_quotes,quote_price >= 0 & quote_price <= 850)
# quantile(visitor_quotes_price$quote_price, c(.99)) --- filter out 99% outlier to better represent
mu <- ddply(visitor_quotes_price, "category_name", summarise, grp.mean=mean(quote_price))
median <- ddply(visitor_quotes_price, "category_name", summarise, grp.median=median(quote_price))
std <- ddply(visitor_quotes_price, "category_name", summarise, grp.std=sd(quote_price))

mu

median

std

j <- ggplot(visitor_quotes_price, aes(x=quote_price, color=category_name)) +
  geom_histogram(fill="white", alpha=0.5, position="identity",bins = 25) + 
  geom_vline(data=mu, aes(xintercept=grp.mean, color=category_name),
             linetype="dashed")+
  scale_color_brewer(palette="Dark2")+
  scale_fill_brewer(palette="Dark2") +
  labs(y="Total # of request", 
       x="Quote Price", 
       title="Quote Price Distribution by Category", 
       caption = "Source: Thumbtack")
plot(j)

Q2: By category, what price do you think Thumbtack should charge pros to quote? Why? Explain your reasoning for any assumptions you make.

Answer: Around mean value of each category to score higher quote sucess rate (mean is close to median due to the cleaning of outliers)

PART IV Further Exploration

Q1: Briefly describe two other questions or analyses you would be interested in pursuing if you had more time to explore the data

Question to explore 1: - For house Cleaning request, how do the num_bedroom, num_bathroom factors impact how many quotes available for the sight request? - Similarly for Local Moving request, how does the how_far factor impact the how many quotes available for the sight request?

Question to explore 2: - What are the driving factors for a higher hired possibilities (modeling on hire / not hire based on available information) - based on the hire rate, who are the star professionals and how do we profile them based on their quote history?

TT Data Challenge

Wen he

9/16/2018