library("tidyverse")
library("lubridate")
library("plotly")
library("caret")
library("e1071")
library("DMwR")
library("formatR")
# Contacts: drop the contact id, then name the remaining three columns
# positionally so they line up with the visitor table's join keys.
c <- read.csv("Contacts.csv") %>%
  select(-"contact_id") %>%
  setNames(c("visitor_id", "pro_id", "hired"))

# Visitors: keep only the columns used in the analysis, renaming them on the
# way in, and convert the cost estimate from cents to dollars.
v <- read.csv("Visitors.csv") %>%
  select(
    visitor_id,
    pro_id = pro_user_id,
    category,
    search_rank = result_position,
    num_reviews,
    avg_rating,
    `cost_est($)` = cost_estimate_cents,
    search_timestamp,
    pro_last_active_time_before_search,
    page_viewed = service_page_viewed
  ) %>%
  mutate(`cost_est($)` = `cost_est($)` / 100)
Fixing Date and Time stamps
# Derive date/weekday/hour features from the two timestamp columns, then drop
# the raw timestamps.
#
# Columns added (in this order): s_date, s_day, s_hr, p.l_date, p.l_hr, diff.
#
# s_day is built with factor(levels = ...) so that each row keeps its real
# weekday and only the level ORDER is Monday..Sunday. The earlier
# as.factor() + `levels<-` combination renamed the alphabetically sorted
# levels in place (Friday -> "Monday", Monday -> "Tuesday", ...), which
# mislabelled every day. Any day-of-week output produced with the old labels
# has to be regenerated.
# NOTE(review): weekdays() returns names in the current locale; the levels
# below assume an English locale -- confirm for the machine running this.
v <- v %>%
  mutate(
    search_timestamp = parse_date_time(search_timestamp, orders = "Y-m-d HMS"),
    s_date = as_date(search_timestamp),
    s_day = factor(
      weekdays(s_date),
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday"
      )
    ),
    # Hour of day (0-23) of the search, as an integer.
    s_hr = as.integer(hour(search_timestamp)),
    pro_last_active_time_before_search = parse_date_time(
      pro_last_active_time_before_search,
      orders = "Y-m-d HMS"
    ),
    p.l_date = as_date(pro_last_active_time_before_search),
    # Kept as a double, matching the type this column had before.
    p.l_hr = as.numeric(hour(pro_last_active_time_before_search)),
    # Whole days between the pro's last activity and the search.
    diff = as.integer(s_date - p.l_date)
  ) %>%
  select(-"search_timestamp", -"pro_last_active_time_before_search")
Handling and Joining data files
# Snapshot of the cleaned visitor table, taken before the join.
explo <- v

# Attach the hiring outcome to each visitor/pro pair, keep only pairs that
# have an outcome, drop identifier and helper columns, and turn `hired` into
# a two-level factor.
#
# The levels are declared explicitly ("Hired" first, as before) instead of
# relying on as.factor()'s alphabetical order followed by `levels<-`, which
# relabels rather than reorders and fails outright when only one class is
# present. `TRUE` replaces the reassignable shorthand `T`.
dt_total <- v %>%
  left_join(c, by = c("visitor_id", "pro_id")) %>%
  filter(!is.na(hired)) %>%
  select(-c("visitor_id", "pro_id", "s_date", "p.l_date", "p.l_hr")) %>%
  mutate(
    hired = factor(
      ifelse(hired == TRUE, "Hired", "Not Hired"),
      levels = c("Hired", "Not Hired")
    )
  )

# The source tables are no longer needed.
rm(c, v)
Checking the Variable of concern
# Class balance of the outcome before any resampling.
table(dt_total[["hired"]])
##
## Hired Not Hired
## 363 1141
This is an unbalanced data set and requires oversampling of the Hired class to get better results.
SMOTE for equalizing the data
# Fix the RNG seed so the resampling below is reproducible.
set.seed(1)

# Rebalance the outcome with SMOTE. perc.over = 500 adds five synthetic
# minority cases (built from k = 5 nearest neighbours) per original minority
# case; perc.under = 100 keeps one majority case per synthetic case.
# NOTE(review): the models further down are fitted on this resampled table,
# so their standard errors and p-values are computed on partly synthetic
# rows -- treat them as descriptive.
dt_total <- SMOTE(
  form = hired ~ .,
  data = dt_total,
  perc.over = 500,
  k = 5,
  perc.under = 100
)

# Class balance after resampling.
table(dt_total[["hired"]])
##
## Hired Not Hired
## 2178 1815
Splitting the data into the two categories
# Row counts per service category. In the object names below, HC stands for
# House Cleaning and LM for Local Moving.
table(dt_total[["category"]])
##
## House Cleaning Local Moving (under 50 miles)
## 2286 1707
# One table per category; `category` is constant within each table, so it is
# dropped. Rows with a missing category end up in neither table.
dt_hc <- dt_total %>%
  filter(category == "House Cleaning") %>%
  select(-category)

dt_lm <- dt_total %>%
  filter(category != "House Cleaning") %>%
  select(-category)
Checking for parameters that actually affect the hiring: for the total data as a whole
# Logistic regression of the hiring outcome on every other column of the
# combined table. With a factor response, glm() treats the first level as
# "failure"; given the level order set earlier ("Hired" first), the
# coefficients are log-odds of "Not Hired", so a negative estimate means the
# predictor is associated with MORE hiring.
fit <- glm(formula = hired ~ ., family = binomial, data = dt_total)
fit %>% summary()
##
## Call:
## glm(formula = hired ~ ., family = binomial, data = dt_total)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2878 -0.7983 0.5761 0.7558 2.4378
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 6.5540557 1.2577495 5.211
## categoryLocal Moving (under 50 miles) -0.0146466 0.1048605 -0.140
## search_rank -0.0543648 0.0276542 -1.966
## num_reviews -0.0042378 0.0002450 -17.295
## avg_rating -0.7022593 0.2252558 -3.118
## `cost_est($)` 0.0002081 0.0016430 0.127
## page_viewed -0.7641991 0.5560763 -1.374
## s_dayTuesday -0.5651910 0.1844218 -3.065
## s_dayWednesday -0.6117261 0.2140275 -2.858
## s_dayThursday -0.1894014 0.2477488 -0.764
## s_dayFriday -1.2999216 0.1686804 -7.706
## s_daySaturday -0.6260782 0.1813363 -3.453
## s_daySunday -0.0946996 0.1922067 -0.493
## s_hr -0.0226209 0.0071656 -3.157
## diff -0.0010046 0.0017612 -0.570
## Pr(>|z|)
## (Intercept) 1.88e-07 ***
## categoryLocal Moving (under 50 miles) 0.888916
## search_rank 0.049312 *
## num_reviews < 2e-16 ***
## avg_rating 0.001823 **
## `cost_est($)` 0.899226
## page_viewed 0.169358
## s_dayTuesday 0.002179 **
## s_dayWednesday 0.004261 **
## s_dayThursday 0.444575
## s_dayFriday 1.29e-14 ***
## s_daySaturday 0.000555 ***
## s_daySunday 0.622227
## s_hr 0.001595 **
## diff 0.568407
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3577.5 on 2714 degrees of freedom
## Residual deviance: 2852.6 on 2700 degrees of freedom
## (1278 observations deleted due to missingness)
## AIC: 2882.6
##
## Number of Fisher Scoring iterations: 4
# P-values of the coefficients significant at the 5% level, one row per term.
# Uses coef() and the column name rather than `$coef` partial matching and a
# magic column index; drop = FALSE keeps the term name even when only one
# coefficient qualifies.
local({
  coefs <- coef(summary(fit))
  keep <- which(coefs[, "Pr(>|z|)"] <= 0.05)
  as.data.frame(coefs[keep, "Pr(>|z|)", drop = FALSE])
})
For Local Moving
# Same logistic regression, restricted to the Local Moving rows. As the first
# factor level ("Hired") is glm()'s "failure" class, the coefficients are
# log-odds of "Not Hired".
fit <- glm(formula = hired ~ ., family = binomial, data = dt_lm)
fit %>% summary()
##
## Call:
## glm(formula = hired ~ ., family = binomial, data = dt_lm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2653 -0.8755 0.5409 0.8424 2.3610
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.7586485 4.8371105 2.431 0.01506 *
## search_rank -0.2812539 0.0587455 -4.788 1.69e-06 ***
## num_reviews -0.0034549 0.0002794 -12.367 < 2e-16 ***
## avg_rating -0.9162927 0.3009073 -3.045 0.00233 **
## `cost_est($)` -0.0012903 0.0029081 -0.444 0.65726
## page_viewed -4.5400878 4.5846723 -0.990 0.32204
## s_dayTuesday -0.5109651 0.2521428 -2.026 0.04271 *
## s_dayWednesday -0.6891419 0.2917250 -2.362 0.01816 *
## s_dayThursday -0.4366263 0.3461627 -1.261 0.20719
## s_dayFriday -1.4678240 0.2161723 -6.790 1.12e-11 ***
## s_daySaturday -0.6540578 0.2301218 -2.842 0.00448 **
## s_daySunday -0.1990824 0.2485992 -0.801 0.42324
## s_hr -0.0278300 0.0100639 -2.765 0.00569 **
## diff 0.0050347 0.0051969 0.969 0.33265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1913.2 on 1390 degrees of freedom
## Residual deviance: 1539.6 on 1377 degrees of freedom
## (316 observations deleted due to missingness)
## AIC: 1567.6
##
## Number of Fisher Scoring iterations: 6
# P-values of the coefficients significant at the 5% level, one row per term.
# Uses coef() and the column name rather than `$coef` partial matching and a
# magic column index; drop = FALSE keeps the term name even when only one
# coefficient qualifies.
local({
  coefs <- coef(summary(fit))
  keep <- which(coefs[, "Pr(>|z|)"] <= 0.05)
  as.data.frame(coefs[keep, "Pr(>|z|)", drop = FALSE])
})
For Home Cleaning
# Same logistic regression, restricted to the House Cleaning rows. As the
# first factor level ("Hired") is glm()'s "failure" class, the coefficients
# are log-odds of "Not Hired".
fit <- glm(formula = hired ~ ., family = binomial, data = dt_hc)
fit %>% summary()
##
## Call:
## glm(formula = hired ~ ., family = binomial, data = dt_hc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4096 -0.2894 0.5629 0.6927 2.3813
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.9832995 1.9093771 3.657 0.000255 ***
## search_rank -0.0105953 0.0355316 -0.298 0.765554
## num_reviews -0.0075657 0.0006489 -11.659 < 2e-16 ***
## avg_rating -0.9185581 0.3618509 -2.538 0.011133 *
## `cost_est($)` 0.0002341 0.0020678 0.113 0.909854
## page_viewed -0.4243029 0.6009237 -0.706 0.480136
## s_dayTuesday -0.2868072 0.2742609 -1.046 0.295678
## s_dayWednesday -0.4205124 0.3178982 -1.323 0.185905
## s_dayThursday 0.2731393 0.3775870 0.723 0.469446
## s_dayFriday -0.7929773 0.2662614 -2.978 0.002900 **
## s_daySaturday -0.4073990 0.2869532 -1.420 0.155683
## s_daySunday 0.2414265 0.3007759 0.803 0.422160
## s_hr -0.0098497 0.0104427 -0.943 0.345573
## diff -0.0024589 0.0018383 -1.338 0.181017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1589.2 on 1323 degrees of freedom
## Residual deviance: 1235.5 on 1310 degrees of freedom
## (962 observations deleted due to missingness)
## AIC: 1263.5
##
## Number of Fisher Scoring iterations: 5
# P-values of the coefficients significant at the 5% level, one row per term.
# Uses coef() and the column name rather than `$coef` partial matching and a
# magic column index; drop = FALSE keeps the term name even when only one
# coefficient qualifies.
local({
  coefs <- coef(summary(fit))
  keep <- which(coefs[, "Pr(>|z|)"] <= 0.05)
  as.data.frame(coefs[keep, "Pr(>|z|)", drop = FALSE])
})

# The model object is not needed past this point.
rm(fit)
EXPLORATORY ANALYSIS
num_reviews VS hiring
# Box plot of review counts (truncated to integers) by hiring outcome, with
# outliers hidden.
#
# df:    table with `hired` and `num_reviews` columns.
# title: plot title.
#
# Rows are dropped only when `hired` or `num_reviews` is missing. The earlier
# na.omit(c("hired", "num_reviews")) ignored the column names and removed
# every row that had an NA in ANY column.
plot_reviews_by_hired <- function(df, title) {
  df %>%
    drop_na(hired, num_reviews) %>%
    ggplot(aes(x = hired, y = as.integer(num_reviews), fill = hired)) +
    geom_boxplot(color = "black", outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Number of reviews", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_reviews_by_hired(
  dt_total, "Number of Reviews Vs Hiring (For the total data)"
)
p
p <- plot_reviews_by_hired(
  dt_lm, "Number of Reviews Vs Hiring (For Local Moving)"
)
p
p <- plot_reviews_by_hired(
  dt_hc, "Number of Reviews Vs Hiring (For Home Cleaning)"
)
p
rm(p, plot_reviews_by_hired)
It is very clear that the number of reviews has a significant effect on hiring. As the number of reviews increases, the probability of being hired also increases.
Search Rank Vs Hiring
# Box plot of search rank by hiring outcome, restricted to results ranked
# below the top 5 (search_rank > 5), with outliers hidden.
#
# df:    table with `search_rank` and `hired` columns.
# title: plot title.
#
# Rows are dropped only when `search_rank` or `hired` is missing. The earlier
# na.omit(c("search_rank", "hired")) ignored the column names and removed
# every row that had an NA in ANY column.
plot_rank_by_hired <- function(df, title) {
  df %>%
    drop_na(search_rank, hired) %>%
    filter(search_rank > 5) %>%
    ggplot(aes(x = hired, y = search_rank, fill = hired)) +
    geom_boxplot(color = "black", outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Search Rank", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_rank_by_hired(
  dt_total, "Effect of Search Ranks beyond 5 on Hiring"
)
p
p <- plot_rank_by_hired(
  dt_lm, "Effect of Search Ranks beyond 5 on Hiring (Local Movers)"
)
p
p <- plot_rank_by_hired(
  dt_hc, "Effect of Search Ranks beyond 5 on Hiring (Home Cleaners)"
)
p
rm(p, plot_rank_by_hired)
There is a significant effect of search ranks beyond 5 on Home Cleaners and a mild effect on Local Movers. As the search rank increases, the possibility of being hired decreases.
Number of Reviews, Avg Reviews Vs Hiring
# Scatter plot of review count against average rating, with hiring outcome
# shown by both point shape and colour.
#
# df:    table with `num_reviews`, `avg_rating` and `hired` columns.
# title: plot title.
#
# Fixes over the earlier version:
#  * na.omit(c(...)) ignored the column names and removed every row with an
#    NA in ANY column; rows are now dropped only for the plotted columns.
#  * The name list referred to a non-existent "avg_reviews" column; the
#    plotted column is `avg_rating`.
#  * The legend title was set on the unused `fill` aesthetic, so it never
#    appeared; it is now set on colour and shape.
plot_rating_reviews <- function(df, title) {
  df %>%
    drop_na(num_reviews, avg_rating, hired) %>%
    ggplot(aes(x = avg_rating, y = num_reviews, shape = hired, col = hired)) +
    scale_shape_manual(values = c(3, 1)) +
    scale_color_manual(values = c("#E69F00", "#56B4E9")) +
    geom_point() +
    ggtitle(title) +
    labs(
      y = "Number of Reviews", x = "Avg Rating",
      colour = "Hired or Not", shape = "Hired or Not"
    )
}

p <- plot_rating_reviews(
  dt_total, "Hiring for higher Rating and higher number of reviews"
)
p
p <- plot_rating_reviews(
  dt_lm,
  "Hiring for higher Rating and higher number of reviews (Local Movers)"
)
p
p <- plot_rating_reviews(
  dt_hc,
  "Hiring for higher Rating and higher number of reviews (Home Cleaners)"
)
p
rm(p, plot_rating_reviews)
In all of the plots, it is pretty evident that for a higher number of reviews and a higher rating the chance of being hired is high.
Effect of Search time on hiring
# Box plot of search hour (0-23) by hiring outcome, with outliers hidden.
#
# df:    table with `s_hr` and `hired` columns.
# title: plot title.
#
# Rows are dropped only when `s_hr` or `hired` is missing. The earlier
# na.omit(c("s_hr", "hired")) ignored the column names and removed every row
# that had an NA in ANY column.
plot_hour_by_hired <- function(df, title) {
  df %>%
    drop_na(s_hr, hired) %>%
    ggplot(aes(x = hired, y = s_hr, fill = hired)) +
    geom_boxplot(color = "black", outlier.shape = NA) +
    ggtitle(title) +
    labs(
      y = "Search time (24 hr format)", x = "Hired or Not",
      fill = "Hired or Not"
    )
}

p <- plot_hour_by_hired(dt_total, "Search Time vs Hiring")
p
p <- plot_hour_by_hired(dt_lm, "Search Time vs Hiring (Local Moving)")
p
p <- plot_hour_by_hired(dt_hc, "Search Time vs Hiring (Home Cleaning)")
p
rm(p, plot_hour_by_hired)
All plots indicate that most of the hiring happens in the second half of the day, sometime between 2 pm and 8 pm.
Effect of Day on the hiring
# Bar chart of hired / not-hired counts, one facet per day of the week.
#
# df:    table with `s_day` and `hired` columns.
# title: plot title.
#
# Rows are dropped only when `s_day` or `hired` is missing. The earlier
# na.omit(c("s_day", "hired")) ignored the column names and removed every row
# that had an NA in ANY column.
plot_day_counts <- function(df, title) {
  df %>%
    drop_na(s_day, hired) %>%
    ggplot(aes(x = hired, fill = hired)) +
    geom_bar(color = "black", position = "identity") +
    facet_wrap(s_day ~ .) +
    ggtitle(title) +
    labs(x = "", y = "Frequency of Hiring")
}

p <- plot_day_counts(dt_total, "Day of the week Vs Hiring")
p
p <- plot_day_counts(dt_lm, "Day of the week Vs Hiring (Local Moving)")
p
p <- plot_day_counts(dt_hc, "Day of the week Vs Hiring (Home Cleaning)")
p
rm(p, plot_day_counts)
Friday seems to be “The Day” when the traffic is high and a lot of hirings actually happen for both categories.
Number of Reviews, Cost est Vs Hiring
# Scatter plot of review count against cost estimate (dollars), with hiring
# outcome shown by both point shape and colour.
#
# df:    table with `num_reviews`, `cost_est($)` and `hired` columns.
# title: plot title.
#
# Fixes over the earlier version:
#  * na.omit(c(...)) ignored the column names and removed every row with an
#    NA in ANY column; rows are now dropped only for the plotted columns.
#  * The x-axis label carried a stray backtick.
#  * The legend title was set on the unused `fill` aesthetic, so it never
#    appeared; it is now set on colour and shape.
plot_cost_reviews <- function(df, title) {
  df %>%
    drop_na(num_reviews, `cost_est($)`, hired) %>%
    ggplot(
      aes(x = `cost_est($)`, y = num_reviews, shape = hired, col = hired)
    ) +
    scale_shape_manual(values = c(3, 1)) +
    scale_color_manual(values = c("#E69F00", "#56B4E9")) +
    geom_point() +
    ggtitle(title) +
    labs(
      y = "Number of Reviews", x = "Cost estimate ($)",
      colour = "Hired or Not", shape = "Hired or Not"
    )
}

p <- plot_cost_reviews(
  dt_total, "Hiring for Number of reviews and Cost estimate"
)
p
p <- plot_cost_reviews(
  dt_lm, "Hiring for Number of reviews and Cost estimate (Local Moving)"
)
p
p <- plot_cost_reviews(
  dt_hc, "Hiring for Number of reviews and Cost estimate (Home Cleaning)"
)
p
rm(p, plot_cost_reviews)
Almost all of the hirings are concentrated between $75 and $150 of cost estimate and have a high number of reviews (beyond 200).
With this, I can conclude that hirings are impacted by Search Rank (lower ranks preferred), Number of Reviews (the more reviews, the better), Average Rating, Cost Estimate ($75 - $150), Day of the Week (more hirings happen on Fridays), and Time of Day (2-8 pm is favorable).
Using these factors, I would conclude that (i) posting ads on the website at all times except Fridays between 2-10 pm, and (ii) charging pros to be promoted into the top 5 search results could increase the company's revenue.
When bookings are not being made, the ads would generate money during the low-hiring periods (every day except Fridays between 2-10 pm). When a booking is made, the top 5 pros would pay a premium to Thumbtack for driving more eyeballs to them.
So in both cases it’s a win-win for the company.