Analytics Challenge by Sushanth Chintalapati

library("tidyverse")
library("lubridate")
library("plotly")
library("caret")
library("e1071")
library("DMwR")
library("formatR")

# Contacts: drop the surrogate key and rename the three remaining columns
# (by position) to the names used throughout the analysis.
c <- read.csv("Contacts.csv") %>%
  select(-("contact_id")) %>%
  setNames(c("visitor_id", "pro_id", "hired"))

# Visitors: keep only the analysed columns, convert the cost estimate from
# cents to dollars, then rename the columns (by position).
v <- read.csv("Visitors.csv") %>%
  select(
    "visitor_id", "pro_user_id", "category", "result_position",
    "num_reviews", "avg_rating", "cost_estimate_cents", "search_timestamp",
    "pro_last_active_time_before_search", "service_page_viewed"
  ) %>%
  mutate(cost_estimate_cents = cost_estimate_cents / 100) %>%
  setNames(c(
    "visitor_id", "pro_id", "category", "search_rank", "num_reviews",
    "avg_rating", "cost_est($)", "search_timestamp",
    "pro_last_active_time_before_search", "page_viewed"
  ))

Fixing Date and Time stamps

# Parse the search timestamp ("Y-m-d HMS" text) and derive:
#   s_date - calendar date of the search
#   s_day  - weekday of the search, factor with levels Monday..Sunday
#   s_hr   - hour of the search (0-23, integer)
#
# Weekday fix: the previous code built the factor with
# as.factor(weekdays(...)), which orders the levels alphabetically
# (Friday, Monday, Saturday, ...), and then overwrote levels() with
# Monday..Sunday. Assigning levels() renames the levels by position, so every
# row ended up with the wrong day (real Fridays were labelled "Monday", real
# Thursdays "Friday", ...). The day is now taken from its weekday number
# (1 = Monday with week_start = 1), which maps each date to the correct label
# and does not depend on the session locale.
v <- v %>%
  mutate(
    search_timestamp = parse_date_time(search_timestamp, orders = "Y-m-d HMS"),
    s_date = as_date(search_timestamp),
    s_day = factor(
      wday(s_date, week_start = 1),
      levels = 1:7,
      labels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday"
      )
    ),
    s_hr = as.integer(hour(search_timestamp))
  ) %>%
  select(-("search_timestamp"))

# Same treatment for the pro's last-active timestamp: p.l_date is its calendar
# date and p.l_hr its hour (kept as a double, as before).
v <- v %>%
  mutate(
    pro_last_active_time_before_search = parse_date_time(
      pro_last_active_time_before_search,
      orders = "Y-m-d HMS"
    ),
    p.l_date = as_date(pro_last_active_time_before_search),
    p.l_hr = as.numeric(hour(pro_last_active_time_before_search))
  ) %>%
  select(-("pro_last_active_time_before_search"))

# Whole days between the pro's last activity and the search.
v <- v %>% mutate(diff = as.integer(s_date - p.l_date))

Handling and Joining data files

# Keep an untouched copy of the visitor-level features.
explo <- v

# Attach the hiring outcome from the contacts table to each search result.
# Rows without a matching contact have no outcome (hired is NA) and are
# dropped.
data <- v %>% left_join(c, by = c("visitor_id", "pro_id"))
data <- data[!is.na(data$hired), ]
rm(c)
rm(v)

# Modelling table: identifiers and raw date helpers are not predictors.
dt_total <- select(data, -c("visitor_id", "pro_id", "s_date", "p.l_date", "p.l_hr"))

rm(data)

# Encode the response as a two-level factor. The levels are stated explicitly
# rather than renamed by position afterwards, so the labels stay correct even
# if one of the classes is absent from the data.
# NOTE(review): with "Hired" as the first level, glm(family = binomial) treats
# "Hired" as the reference ("failure") class, so models fitted on this column
# estimate the probability of "Not Hired" - confirm this is intended when
# reading coefficient signs.
dt_total$hired <- factor(
  ifelse(dt_total$hired == TRUE, "Hired", "Not Hired"),
  levels = c("Hired", "Not Hired")
)

Checking the Variable of concern

# Class counts of the response before any resampling.
table(dt_total$hired)

## 
##     Hired Not Hired 
##       363      1141

This is an unbalanced data set, so the Hired class needs to be oversampled to get better results.

SMOTE for equalizing the data

# Fix the RNG seed so the resampling below is reproducible.
set.seed(1)
# Rebalance the classes with SMOTE (DMwR):
#   perc.over = 500  -> five synthetic minority cases per original minority case
#   k = 5            -> each synthetic case is built from 5 nearest neighbours
#   perc.under = 100 -> one majority case is kept per synthetic case
# NOTE(review): the resampling is applied to the full data set, so everything
# below (models and plots) is computed on partly synthetic rows.
dt_total <- SMOTE(hired ~ ., dt_total, perc.over = 500, k = 5, perc.under = 100)
# Class counts of the response after resampling.
table(dt_total$hired)

## 
##     Hired Not Hired 
##      2178      1815

Splitting the data into the two different categories

# Row counts per service category.
# Naming used for the per-category data frames: HC = House Cleaning,
# LM = Local Moving.
table(dt_total$category)

## 
##                House Cleaning Local Moving (under 50 miles) 
##                          2286                          1707

# One data frame per category; the category column is constant within each
# subset, so it is dropped.
dt_hc <- select(filter(dt_total, category == "House Cleaning"), -"category")
dt_lm <- select(filter(dt_total, category != "House Cleaning"), -"category")

Checking for parameters that actually affect hiring, for the total data as a whole

# Logistic regression of the hiring outcome on every other column of the
# combined data set, followed by its coefficient summary.
fit <- glm(formula = hired ~ ., family = binomial, data = dt_total)
summary(fit)

## 
## Call:
## glm(formula = hired ~ ., family = binomial, data = dt_total)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2878  -0.7983   0.5761   0.7558   2.4378  
## 
## Coefficients:
##                                         Estimate Std. Error z value
## (Intercept)                            6.5540557  1.2577495   5.211
## categoryLocal Moving (under 50 miles) -0.0146466  0.1048605  -0.140
## search_rank                           -0.0543648  0.0276542  -1.966
## num_reviews                           -0.0042378  0.0002450 -17.295
## avg_rating                            -0.7022593  0.2252558  -3.118
## `cost_est($)`                          0.0002081  0.0016430   0.127
## page_viewed                           -0.7641991  0.5560763  -1.374
## s_dayTuesday                          -0.5651910  0.1844218  -3.065
## s_dayWednesday                        -0.6117261  0.2140275  -2.858
## s_dayThursday                         -0.1894014  0.2477488  -0.764
## s_dayFriday                           -1.2999216  0.1686804  -7.706
## s_daySaturday                         -0.6260782  0.1813363  -3.453
## s_daySunday                           -0.0946996  0.1922067  -0.493
## s_hr                                  -0.0226209  0.0071656  -3.157
## diff                                  -0.0010046  0.0017612  -0.570
##                                       Pr(>|z|)    
## (Intercept)                           1.88e-07 ***
## categoryLocal Moving (under 50 miles) 0.888916    
## search_rank                           0.049312 *  
## num_reviews                            < 2e-16 ***
## avg_rating                            0.001823 ** 
## `cost_est($)`                         0.899226    
## page_viewed                           0.169358    
## s_dayTuesday                          0.002179 ** 
## s_dayWednesday                        0.004261 ** 
## s_dayThursday                         0.444575    
## s_dayFriday                           1.29e-14 ***
## s_daySaturday                         0.000555 ***
## s_daySunday                           0.622227    
## s_hr                                  0.001595 ** 
## diff                                  0.568407    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3577.5  on 2714  degrees of freedom
## Residual deviance: 2852.6  on 2700  degrees of freedom
##   (1278 observations deleted due to missingness)
## AIC: 2882.6
## 
## Number of Fisher Scoring iterations: 4

# p-values of the coefficients that are significant at the 5% level.
# The coefficient table is fetched once through coef() (no partial matching
# of `$coef`), the p-value column is addressed by name, and the p-values are
# kept as a named vector so the term names survive even when only one term
# is significant.
local({
  p_values <- coef(summary(fit))[, "Pr(>|z|)"]
  data.frame(p_value = p_values[p_values <= 0.05])
})

For Local Moving

# Logistic regression of the hiring outcome on every other column of the
# Local Moving subset, followed by its coefficient summary.
fit <- glm(formula = hired ~ ., family = binomial, data = dt_lm)
summary(fit)

## 
## Call:
## glm(formula = hired ~ ., family = binomial, data = dt_lm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2653  -0.8755   0.5409   0.8424   2.3610  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    11.7586485  4.8371105   2.431  0.01506 *  
## search_rank    -0.2812539  0.0587455  -4.788 1.69e-06 ***
## num_reviews    -0.0034549  0.0002794 -12.367  < 2e-16 ***
## avg_rating     -0.9162927  0.3009073  -3.045  0.00233 ** 
## `cost_est($)`  -0.0012903  0.0029081  -0.444  0.65726    
## page_viewed    -4.5400878  4.5846723  -0.990  0.32204    
## s_dayTuesday   -0.5109651  0.2521428  -2.026  0.04271 *  
## s_dayWednesday -0.6891419  0.2917250  -2.362  0.01816 *  
## s_dayThursday  -0.4366263  0.3461627  -1.261  0.20719    
## s_dayFriday    -1.4678240  0.2161723  -6.790 1.12e-11 ***
## s_daySaturday  -0.6540578  0.2301218  -2.842  0.00448 ** 
## s_daySunday    -0.1990824  0.2485992  -0.801  0.42324    
## s_hr           -0.0278300  0.0100639  -2.765  0.00569 ** 
## diff            0.0050347  0.0051969   0.969  0.33265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1913.2  on 1390  degrees of freedom
## Residual deviance: 1539.6  on 1377  degrees of freedom
##   (316 observations deleted due to missingness)
## AIC: 1567.6
## 
## Number of Fisher Scoring iterations: 6

# p-values of the coefficients that are significant at the 5% level.
# The coefficient table is fetched once through coef() (no partial matching
# of `$coef`), the p-value column is addressed by name, and the p-values are
# kept as a named vector so the term names survive even when only one term
# is significant.
local({
  p_values <- coef(summary(fit))[, "Pr(>|z|)"]
  data.frame(p_value = p_values[p_values <= 0.05])
})

For Home Cleaning

# Logistic regression of the hiring outcome on every other column of the
# House Cleaning subset, followed by its coefficient summary.
fit <- glm(formula = hired ~ ., family = binomial, data = dt_hc)
summary(fit)

## 
## Call:
## glm(formula = hired ~ ., family = binomial, data = dt_hc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4096  -0.2894   0.5629   0.6927   2.3813  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     6.9832995  1.9093771   3.657 0.000255 ***
## search_rank    -0.0105953  0.0355316  -0.298 0.765554    
## num_reviews    -0.0075657  0.0006489 -11.659  < 2e-16 ***
## avg_rating     -0.9185581  0.3618509  -2.538 0.011133 *  
## `cost_est($)`   0.0002341  0.0020678   0.113 0.909854    
## page_viewed    -0.4243029  0.6009237  -0.706 0.480136    
## s_dayTuesday   -0.2868072  0.2742609  -1.046 0.295678    
## s_dayWednesday -0.4205124  0.3178982  -1.323 0.185905    
## s_dayThursday   0.2731393  0.3775870   0.723 0.469446    
## s_dayFriday    -0.7929773  0.2662614  -2.978 0.002900 ** 
## s_daySaturday  -0.4073990  0.2869532  -1.420 0.155683    
## s_daySunday     0.2414265  0.3007759   0.803 0.422160    
## s_hr           -0.0098497  0.0104427  -0.943 0.345573    
## diff           -0.0024589  0.0018383  -1.338 0.181017    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1589.2  on 1323  degrees of freedom
## Residual deviance: 1235.5  on 1310  degrees of freedom
##   (962 observations deleted due to missingness)
## AIC: 1263.5
## 
## Number of Fisher Scoring iterations: 5

# p-values of the coefficients that are significant at the 5% level.
# The coefficient table is fetched once through coef() (no partial matching
# of `$coef`), the p-value column is addressed by name, and the p-values are
# kept as a named vector so the term names survive even when only one term
# is significant.
local({
  p_values <- coef(summary(fit))[, "Pr(>|z|)"]
  data.frame(p_value = p_values[p_values <= 0.05])
})

# The model object is not needed past this point.
rm(fit)

EXPLORATORY ANALYSIS

num_reviews VS hiring

# Boxplot of the (truncated) review count for hired vs. not-hired pros.
#
# data:  data frame with `hired` and `num_reviews` columns.
# title: plot title.
#
# Rows are dropped only when `hired` or `num_reviews` is missing. The previous
# na.omit(c("hired", "num_reviews")) ignored the column names and removed
# every row that had an NA in any column.
# Outliers are hidden (outlier.shape = NA).
plot_reviews_by_outcome <- function(data, title) {
  data %>%
    drop_na(hired, num_reviews) %>%
    ggplot(aes(x = hired, y = as.integer(num_reviews), fill = hired)) +
    geom_boxplot(color = "black", outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Number of reviews", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_reviews_by_outcome(
  dt_total, "Number of Reviews Vs Hiring (For the total data)"
)
p

p <- plot_reviews_by_outcome(
  dt_lm, "Number of Reviews Vs Hiring (For Local Moving)"
)
p

p <- plot_reviews_by_outcome(
  dt_hc, "Number of Reviews Vs Hiring (For Home Cleaning)"
)
p

# Leave the workspace as it was: drop the plot object and the local helper.
rm(p, plot_reviews_by_outcome)

It is very clear that the number of reviews has a significant effect on hiring. As the number of reviews increases, the probability of being hired also increases.

Search Rank Vs Hiring

# Boxplot of search rank for hired vs. not-hired pros, restricted to results
# ranked below the top five (search_rank > 5).
#
# data:  data frame with `hired` and `search_rank` columns.
# title: plot title.
#
# Rows are dropped only when `search_rank` or `hired` is missing. The previous
# na.omit(c("search_rank", "hired")) ignored the column names and removed
# every row that had an NA in any column.
# Outliers are hidden (outlier.shape = NA).
plot_rank_by_outcome <- function(data, title) {
  data %>%
    drop_na(search_rank, hired) %>%
    filter(search_rank > 5) %>%
    ggplot(aes(x = hired, y = search_rank, fill = hired)) +
    geom_boxplot(color = "black", outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Search Rank", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_rank_by_outcome(
  dt_total, "Effect of Search Ranks beyond 5 on Hiring"
)
p

p <- plot_rank_by_outcome(
  dt_lm, "Effect of Search Ranks beyond 5 on Hiring (Local Movers)"
)
p

p <- plot_rank_by_outcome(
  dt_hc, "Effect of Search Ranks beyond 5 on Hiring (Home Cleaners)"
)
p

# Leave the workspace as it was: drop the plot object and the local helper.
rm(p, plot_rank_by_outcome)

There is a significant effect of search ranks beyond 5 on Home Cleaners and a mild effect on Local Movers. As the search rank increases, the possibility of being hired decreases.

Number of Reviews, Avg Reviews Vs Hiring

# Scatter plot of review count against average rating, with the hiring
# outcome shown by point shape and colour.
#
# data:  data frame with `num_reviews`, `avg_rating` and `hired` columns.
# title: plot title.
#
# Fixes relative to the previous version:
#   * rows are dropped only when one of the plotted columns is missing;
#     na.omit(c(...)) ignored the names (one of which, "avg_reviews", is not
#     a column at all) and removed every row with an NA in any column;
#   * the legend title is attached to the aesthetics that are actually mapped
#     (colour and shape) instead of the unused `fill`.
plot_rating_reviews <- function(data, title) {
  data %>%
    drop_na(num_reviews, avg_rating, hired) %>%
    ggplot(aes(x = avg_rating, y = num_reviews, shape = hired, colour = hired)) +
    geom_point() +
    scale_shape_manual(values = c(3, 1)) +
    scale_colour_manual(values = c("#E69F00", "#56B4E9")) +
    ggtitle(title) +
    labs(
      y = "Number of Reviews", x = "Avg Rating",
      colour = "Hired or Not", shape = "Hired or Not"
    )
}

p <- plot_rating_reviews(
  dt_total, "Hiring for higher Rating and higher number of reviews"
)
p

p <- plot_rating_reviews(
  dt_lm, "Hiring for higher Rating and higher number of reviews (Local Movers)"
)
p

p <- plot_rating_reviews(
  dt_hc, "Hiring for higher Rating and higher number of reviews (Home Cleaners)"
)
p

# Leave the workspace as it was: drop the plot object and the local helper.
rm(p, plot_rating_reviews)

In all of the plots, it is evident that the chance of being hired is high for pros with a higher number of reviews and a higher rating.

Effect of Search time on hiring

# Boxplot of the search hour (24-hour clock) for hired vs. not-hired pros.
#
# data:  data frame with `s_hr` and `hired` columns.
# title: plot title.
#
# Rows are dropped only when `s_hr` or `hired` is missing. The previous
# na.omit(c("s_hr", "hired")) ignored the column names and removed every row
# that had an NA in any column.
# Outliers are hidden (outlier.shape = NA).
plot_hour_by_outcome <- function(data, title) {
  data %>%
    drop_na(s_hr, hired) %>%
    ggplot(aes(x = hired, y = s_hr, fill = hired)) +
    geom_boxplot(color = "black", outlier.shape = NA) +
    ggtitle(title) +
    labs(
      y = "Search time (24 hr format)", x = "Hired or Not",
      fill = "Hired or Not"
    )
}

p <- plot_hour_by_outcome(dt_total, "Search Time vs Hiring")
p

p <- plot_hour_by_outcome(dt_lm, "Search Time vs Hiring (Local Moving)")
p

p <- plot_hour_by_outcome(dt_hc, "Search Time vs Hiring (Home Cleaning)")
p

# Leave the workspace as it was: drop the plot object and the local helper.
rm(p, plot_hour_by_outcome)

All plots indicate that most of the hiring happens in the second half of the day, sometime between 2 and 8 pm.

Effect of Day on the hiring

# Bar chart of the number of hired / not-hired rows, one panel per weekday of
# the search.
#
# data:  data frame with `s_day` and `hired` columns.
# title: plot title.
#
# Rows are dropped only when `s_day` or `hired` is missing. The previous
# na.omit(c("s_day", "hired")) ignored the column names and removed every row
# that had an NA in any column.
plot_outcome_by_day <- function(data, title) {
  data %>%
    drop_na(s_day, hired) %>%
    ggplot(aes(x = hired, fill = hired)) +
    geom_bar(color = "black", position = "identity") +
    facet_wrap(~s_day) +
    ggtitle(title) +
    labs(x = "", y = "Frequency of Hiring")
}

p <- plot_outcome_by_day(dt_total, "Day of the week Vs Hiring")
p

p <- plot_outcome_by_day(dt_lm, "Day of the week Vs Hiring (Local Moving)")
p

p <- plot_outcome_by_day(dt_hc, "Day of the week Vs Hiring (Home Cleaning)")
p

# Leave the workspace as it was: drop the plot object and the local helper.
rm(p, plot_outcome_by_day)

Friday seems to be “The Day” when traffic is high and a lot of hiring actually happens, for both categories.

Number of Reviews, Cost est Vs Hiring

# Scatter plot of review count against the cost estimate (in dollars), with
# the hiring outcome shown by point shape and colour.
#
# data:  data frame with `num_reviews`, `cost_est($)` and `hired` columns.
# title: plot title.
#
# Fixes relative to the previous version:
#   * rows are dropped only when one of the plotted columns is missing;
#     na.omit(c(...)) ignored the names and removed every row with an NA in
#     any column;
#   * the legend title is attached to the aesthetics that are actually mapped
#     (colour and shape) instead of the unused `fill`;
#   * the stray backtick at the end of the x-axis label is removed.
plot_cost_reviews <- function(data, title) {
  data %>%
    drop_na(num_reviews, `cost_est($)`, hired) %>%
    ggplot(aes(
      x = `cost_est($)`, y = num_reviews, shape = hired, colour = hired
    )) +
    geom_point() +
    scale_shape_manual(values = c(3, 1)) +
    scale_colour_manual(values = c("#E69F00", "#56B4E9")) +
    ggtitle(title) +
    labs(
      y = "Number of Reviews", x = "Cost estimate($)",
      colour = "Hired or Not", shape = "Hired or Not"
    )
}

p <- plot_cost_reviews(
  dt_total, "Hiring for Number of reviews and Cost estimate"
)
p

p <- plot_cost_reviews(
  dt_lm, "Hiring for Number of reviews and Cost estimate (Local Moving)"
)
p

p <- plot_cost_reviews(
  dt_hc, "Hiring for Number of reviews and Cost estimate (Home Cleaning)"
)
p

# Leave the workspace as it was: drop the plot object and the local helper.
rm(p, plot_cost_reviews)

Almost all of the hires are concentrated between $75 and $150 of cost estimate and involve pros with a high number of reviews (beyond 200).

With this, I can conclude that hiring is impacted by Search Rank (lower ranks preferred), Number of Reviews (the more reviews, the better), Average Rating, Cost Estimate ($75 - $150), Day of the Week (more hiring happens on Friday), and Time of Day (2-8 pm is favorable).

Using these factors, I would conclude that (i) posting ads on the website on all days except Fridays between 2 and 10 pm, and (ii) charging pros to be promoted into the top 5 search results could increase the company's revenue.

When bookings are not being made, the ads would generate money during the low-hiring slots (every time except Fridays between 2 and 10 pm). When a booking is made, the top 5 pros would pay a premium to Thumbtack for driving more eyeballs to them.

So in both cases it’s a win-win for the company.