# Contact records: drop the contact_id column and rename the three remaining
# columns by position.
c <- read.csv("Contacts.csv") %>%
  select(-"contact_id") %>%
  setNames(c("visitor_id", "pro_id", "hired"))

# Visitor search records: keep only the columns used below, renaming several
# on the way, and convert the cost estimate from cents to dollars.
v <- read.csv("Visitors.csv") %>%
  select(
    visitor_id,
    pro_id = pro_user_id,
    category,
    search_rank = result_position,
    num_reviews,
    avg_rating,
    `cost_est($)` = cost_estimate_cents,
    search_timestamp,
    pro_last_active_time_before_search,
    page_viewed = service_page_viewed
  ) %>%
  mutate(`cost_est($)` = `cost_est($)` / 100)
Fixing Date and Time stamps
# Derive calendar features from the two timestamp columns, then drop the raw
# timestamps.

# --- Search timestamp: date, weekday and hour of day ---
v <- v %>%
  mutate(search_timestamp = parse_date_time(search_timestamp, orders = "Y-m-d HMS"))
v$s_date <- as.Date(format(v$search_timestamp, "%Y-%m-%d"))
# Build the weekday factor from the ISO weekday number (%u: 1 = Monday ...
# 7 = Sunday) so each label belongs to its own date. Assigning levels() to an
# alphabetically sorted factor renames the days positionally (Friday would
# become "Monday"), and the text returned by weekdays() depends on the locale.
v$s_day <- factor(
  as.integer(format(v$s_date, "%u")),
  levels = 1:7,
  labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
             "Saturday", "Sunday")
)
# Hour of day (0-23) of the search.
v$s_hr <- as.integer(hour(v$search_timestamp))
v <- select(v, -search_timestamp)

# --- Pro's last-active timestamp: date and hour of day ---
v <- v %>%
  mutate(pro_last_active_time_before_search = parse_date_time(pro_last_active_time_before_search,
    orders = "Y-m-d HMS"))
v$p.l_date <- as.Date(format(v$pro_last_active_time_before_search, "%Y-%m-%d"))
# Stored as a double, as before.
v$p.l_hr <- as.numeric(hour(v$pro_last_active_time_before_search))
v <- select(v, -pro_last_active_time_before_search)

# Whole days between the search date and the pro's last-active date.
v <- v %>% mutate(diff = as.integer(s_date - p.l_date))
Handling and Joining data files
# Keep an untouched copy of the visitor features.
explo <- v

# Attach the hiring outcome to each visitor/pro pair and keep only the pairs
# that have a contact record (non-missing `hired`).
data <- v %>%
  left_join(c, by = c("visitor_id", "pro_id")) %>%
  filter(!is.na(hired))
rm(c)
rm(v)

# Drop identifiers and helper date columns that should not enter the models.
dt_total <- select(data, -c("visitor_id", "pro_id", "s_date", "p.l_date", "p.l_hr"))
rm(data)

# Recode the outcome as a two-level factor. The levels are given explicitly
# instead of relabelling afterwards with levels(), which depends on the
# alphabetical order of the labels. TRUE is spelled out because T can be
# reassigned.
dt_total$hired <- factor(
  ifelse(dt_total$hired == TRUE, "Hired", "Not Hired"),
  levels = c("Hired", "Not Hired")
)
Checking the Variable of concern
# Class counts of the response before any resampling.
table(dt_total[["hired"]])
This is an unbalanced data set, so the Hired class requires oversampling to get better results.
SMOTE for equalizing the data
# Rebalance the classes with SMOTE. The seed fixes the random sampling so the
# resampled data set is reproducible; perc.over / perc.under set the amount of
# over- and under-sampling (see the DMwR::SMOTE help page).
set.seed(1)
dt_total <- SMOTE(
  form = hired ~ .,
  data = dt_total,
  perc.over = 500,
  k = 5,
  perc.under = 100
)
# Class counts after resampling.
table(dt_total[["hired"]])
Splitting the data into the two categories
# One data set per service category; `category` is constant within each
# subset, so it is dropped.
#   dt_hc: rows whose category is "House Cleaning"
#   dt_lm: every other row (treated as Local Moving in the analysis)
table(dt_total$category)
dt_hc <- dt_total %>%
  filter(category == "House Cleaning") %>%
  select(-category)
dt_lm <- dt_total %>%
  filter(category != "House Cleaning") %>%
  select(-category)
Checking for parameters that actually affect hiring. For the total data as a whole
# Logistic regression of the hiring outcome on all remaining columns.
# With a factor response, glm() treats the first level as failure, so the
# coefficients describe the log-odds of the second level of `hired`.
fit <- glm(hired ~ ., data = dt_total, family = binomial)
summary(fit)
# Terms significant at the 5% level with their p-values. coef() replaces the
# partially matched `$coef`, and drop = FALSE keeps the term name even when
# only one term qualifies.
local({
  p_values <- coef(summary(fit))[, 4, drop = FALSE]
  as.data.frame(p_values[p_values[, 1] <= 0.05, , drop = FALSE])
})
For Local Moving
# Same logistic regression, restricted to the Local Moving subset.
# With a factor response, glm() treats the first level as failure, so the
# coefficients describe the log-odds of the second level of `hired`.
fit <- glm(hired ~ ., data = dt_lm, family = binomial)
summary(fit)
# Terms significant at the 5% level with their p-values. coef() replaces the
# partially matched `$coef`, and drop = FALSE keeps the term name even when
# only one term qualifies.
local({
  p_values <- coef(summary(fit))[, 4, drop = FALSE]
  as.data.frame(p_values[p_values[, 1] <= 0.05, , drop = FALSE])
})
For Home Cleaning
# Same logistic regression, restricted to the House Cleaning subset.
# With a factor response, glm() treats the first level as failure, so the
# coefficients describe the log-odds of the second level of `hired`.
fit <- glm(hired ~ ., data = dt_hc, family = binomial)
summary(fit)
# Terms significant at the 5% level with their p-values. coef() replaces the
# partially matched `$coef`, and drop = FALSE keeps the term name even when
# only one term qualifies.
local({
  p_values <- coef(summary(fit))[, 4, drop = FALSE]
  as.data.frame(p_values[p_values[, 1] <= 0.05, , drop = FALSE])
})
rm(fit)
EXPLORATORY ANALYSIS
num_reviews VS hiring
# Box plot of review counts by hiring outcome (outliers hidden).
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_reviews_by_hired <- function(df, title) {
  df %>%
    drop_na(hired, num_reviews) %>%
    ggplot(aes(x = hired, y = as.integer(num_reviews), fill = hired)) +
    geom_boxplot(color = "black", outlier.size = 0, outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Number of reviews", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_reviews_by_hired(dt_total, "Number of Reviews Vs Hiring (For the total data)")
p
p <- plot_reviews_by_hired(dt_lm, "Number of Reviews Vs Hiring (For Local Moving)")
p
p <- plot_reviews_by_hired(dt_hc, "Number of Reviews Vs Hiring (For Home Cleaning)")
p
rm(p, plot_reviews_by_hired)
It is very clear that the number of reviews has a significant effect on hiring. As the number of reviews increases, the probability of being hired also increases.
Search Rank Vs Hiring
# Box plot of search rank by hiring outcome, restricted to ranks beyond 5
# (outliers hidden).
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_rank_by_hired <- function(df, title) {
  df %>%
    drop_na(search_rank, hired) %>%
    filter(search_rank > 5) %>%
    ggplot(aes(x = hired, y = search_rank, fill = hired)) +
    geom_boxplot(color = "black", outlier.size = 0, outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Search Rank", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_rank_by_hired(dt_total, "Effect of Search Ranks beyond 5 on Hiring")
p
p <- plot_rank_by_hired(dt_lm, "Effect of Search Ranks beyond 5 on Hiring (Local Movers)")
p
p <- plot_rank_by_hired(dt_hc, "Effect of Search Ranks beyond 5 on Hiring (Home Cleaners)")
p
rm(p, plot_rank_by_hired)
There is a significant effect of search ranks beyond 5 on Home Cleaners and a mild effect on Local Movers. As the search rank increases, the possibility of being hired decreases.
Number of Reviews, Avg Rating Vs Hiring
# Scatter plot of review count against average rating, with colour and shape
# marking the hiring outcome.
# drop_na() removes rows only when one of the plotted columns is missing; the
# earlier na.omit(df, c(...)) ignored its column vector (which also named a
# non-existent "avg_reviews" column) and dropped rows with an NA anywhere.
# The legend title is set on colour and shape, the aesthetics actually mapped;
# labs(fill = ) has no effect on this plot.
plot_rating_reviews <- function(df, title) {
  df %>%
    drop_na(num_reviews, avg_rating, hired) %>%
    ggplot(aes(x = avg_rating, y = num_reviews, shape = hired, col = hired)) +
    scale_shape_manual(values = c(3, 1)) +
    scale_color_manual(values = c("#E69F00", "#56B4E9")) +
    geom_point() +
    ggtitle(title) +
    labs(y = "Number of Reviews", x = "Avg Rating",
         colour = "Hired or Not", shape = "Hired or Not")
}

p <- plot_rating_reviews(dt_total, "Hiring for higher Rating and higher number of reviews")
p
p <- plot_rating_reviews(dt_lm, "Hiring for higher Rating and higher number of reviews (Local Movers)")
p
p <- plot_rating_reviews(dt_hc, "Hiring for higher Rating and higher number of reviews (Home Cleaners)")
p
rm(p, plot_rating_reviews)
In all of the plots, it is evident that the chance of being hired is high for pros with a higher number of reviews and a higher rating.
Effect of Search time on hiring
# Box plot of the search hour (0-23) by hiring outcome (outliers hidden).
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_hour_by_hired <- function(df, title) {
  df %>%
    drop_na(s_hr, hired) %>%
    ggplot(aes(x = hired, y = s_hr, fill = hired)) +
    geom_boxplot(color = "black", outlier.size = 0, outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Search time (24 hr format)", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_hour_by_hired(dt_total, "Search Time vs Hiring")
p
p <- plot_hour_by_hired(dt_lm, "Search Time vs Hiring (Local Moving)")
p
p <- plot_hour_by_hired(dt_hc, "Search Time vs Hiring (Home Cleaning)")
p
rm(p, plot_hour_by_hired)
All plots indicate that most of the hiring happens in the second half of the day, sometime between 2 and 8 pm.
Effect of Day on the hiring
# Bar chart of hiring outcome counts, one facet per weekday of the search.
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_hiring_by_day <- function(df, title) {
  df %>%
    drop_na(s_day, hired) %>%
    ggplot(aes(x = hired, fill = hired)) +
    geom_bar(color = "black", position = "identity") +
    facet_wrap(s_day ~ .) +
    ggtitle(title) +
    labs(x = "", y = "Frequency of Hiring")
}

p <- plot_hiring_by_day(dt_total, "Day of the week Vs Hiring")
p
p <- plot_hiring_by_day(dt_lm, "Day of the week Vs Hiring (Local Moving)")
p
p <- plot_hiring_by_day(dt_hc, "Day of the week Vs Hiring (Home Cleaning)")
p
rm(p, plot_hiring_by_day)
Friday seems to be “The Day” when traffic is high and a lot of hirings actually happen, for both categories.
Number of Reviews, Cost est Vs Hiring
# Scatter plot of review count against cost estimate (dollars), with colour
# and shape marking the hiring outcome.
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
# The stray backtick is removed from the x-axis label, and the legend title is
# set on colour and shape, the aesthetics actually mapped.
plot_cost_reviews <- function(df, title) {
  df %>%
    drop_na(num_reviews, `cost_est($)`, hired) %>%
    ggplot(aes(x = `cost_est($)`, y = num_reviews, shape = hired, col = hired)) +
    scale_shape_manual(values = c(3, 1)) +
    scale_color_manual(values = c("#E69F00", "#56B4E9")) +
    geom_point() +
    ggtitle(title) +
    labs(y = "Number of Reviews", x = "Cost estimate($)",
         colour = "Hired or Not", shape = "Hired or Not")
}

p <- plot_cost_reviews(dt_total, "Hiring for Number of reviews and Cost estimate")
p
p <- plot_cost_reviews(dt_lm, "Hiring for Number of reviews and Cost estimate (Local Moving)")
p
p <- plot_cost_reviews(dt_hc, "Hiring for Number of reviews and Cost estimate (Home Cleaning)")
p
rm(p, plot_cost_reviews)
Almost all of the hiring is concentrated between $75 and $150 of cost estimate and among pros with a high number of reviews (beyond 200).
With this, I can conclude that hiring is impacted by Search Rank (lower ranks preferred), Number of Reviews (the more reviews the better),
Average Rating, Cost Estimate ($75 - $150), Day of the Week (more hirings on Friday), and Time of Day (2-8 pm favorable).
Using these factors I would conclude that
posting ads on the website on all days except on Fridays between 2-10 pm, and
charging pros to promote in the top 5 of search results could increase the revenues to the company.
When bookings are not made the ads would generate money during less hiring spots (Every day except on Fridays between 2-10 pm).
When a booking is made, the top 5 pros would pay a premium to Thumbtack for driving more eyeballs onto them.
So in both cases it’s a win-win to the company.
---
title: "Analytics Challenge Notebook"
author: "Sushanth Chintalapati"
output: html_notebook
---

```{r setup, include=FALSE}
# Default chunk option: show each chunk's source code in the rendered output
# unless a chunk overrides `echo`.
knitr::opts_chunk$set(echo = TRUE)
```


```{r, warning=FALSE, message=FALSE, results='hide'}
library("tidyverse")
library("lubridate")
library("plotly")
library("caret")
library("e1071")
library("DMwR")
library("formatR")
```

```{r loading data, tidy = TRUE}
# Contact records: drop the contact_id column and rename the three remaining
# columns by position.
c <- read.csv("Contacts.csv") %>%
  select(-"contact_id") %>%
  setNames(c("visitor_id", "pro_id", "hired"))

# Visitor search records: keep only the columns used below, renaming several
# on the way, and convert the cost estimate from cents to dollars.
v <- read.csv("Visitors.csv") %>%
  select(
    visitor_id,
    pro_id = pro_user_id,
    category,
    search_rank = result_position,
    num_reviews,
    avg_rating,
    `cost_est($)` = cost_estimate_cents,
    search_timestamp,
    pro_last_active_time_before_search,
    page_viewed = service_page_viewed
  ) %>%
  mutate(`cost_est($)` = `cost_est($)` / 100)
```

Fixing Date and Time stamps

```{r spliting date and time, tidy = TRUE}
# Derive calendar features from the two timestamp columns, then drop the raw
# timestamps.

# --- Search timestamp: date, weekday and hour of day ---
v <- v %>%
  mutate(search_timestamp = parse_date_time(search_timestamp, orders = "Y-m-d HMS"))
v$s_date <- as.Date(format(v$search_timestamp, "%Y-%m-%d"))
# Build the weekday factor from the ISO weekday number (%u: 1 = Monday ...
# 7 = Sunday) so each label belongs to its own date. Assigning levels() to an
# alphabetically sorted factor renames the days positionally (Friday would
# become "Monday"), and the text returned by weekdays() depends on the locale.
v$s_day <- factor(
  as.integer(format(v$s_date, "%u")),
  levels = 1:7,
  labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
             "Saturday", "Sunday")
)
# Hour of day (0-23) of the search.
v$s_hr <- as.integer(hour(v$search_timestamp))
v <- select(v, -search_timestamp)

# --- Pro's last-active timestamp: date and hour of day ---
v <- v %>%
  mutate(pro_last_active_time_before_search = parse_date_time(pro_last_active_time_before_search,
    orders = "Y-m-d HMS"))
v$p.l_date <- as.Date(format(v$pro_last_active_time_before_search, "%Y-%m-%d"))
# Stored as a double, as before.
v$p.l_hr <- as.numeric(hour(v$pro_last_active_time_before_search))
v <- select(v, -pro_last_active_time_before_search)

# Whole days between the search date and the pro's last-active date.
v <- v %>% mutate(diff = as.integer(s_date - p.l_date))
```

Handling and Joining data files

```{r Joining data, tidy = TRUE}
# Keep an untouched copy of the visitor features.
explo <- v

# Attach the hiring outcome to each visitor/pro pair and keep only the pairs
# that have a contact record (non-missing `hired`).
data <- v %>%
  left_join(c, by = c("visitor_id", "pro_id")) %>%
  filter(!is.na(hired))
rm(c)
rm(v)

# Drop identifiers and helper date columns that should not enter the models.
dt_total <- select(data, -c("visitor_id", "pro_id", "s_date", "p.l_date", "p.l_hr"))
rm(data)

# Recode the outcome as a two-level factor. The levels are given explicitly
# instead of relabelling afterwards with levels(), which depends on the
# alphabetical order of the labels. TRUE is spelled out because T can be
# reassigned.
dt_total$hired <- factor(
  ifelse(dt_total$hired == TRUE, "Hired", "Not Hired"),
  levels = c("Hired", "Not Hired")
)
```

Checking the Variable of concern

```{r Inspecting the variable of concern, tidy = TRUE}
# Class counts of the response before any resampling.
table(dt_total[["hired"]])
```

This is an unbalanced data set, so the Hired class requires oversampling to get better results.


SMOTE for equalizing the data

```{r Smote technique, tidy = TRUE}
# Rebalance the classes with SMOTE. The seed fixes the random sampling so the
# resampled data set is reproducible; perc.over / perc.under set the amount of
# over- and under-sampling (see the DMwR::SMOTE help page).
set.seed(1)
dt_total <- SMOTE(
  form = hired ~ .,
  data = dt_total,
  perc.over = 500,
  k = 5,
  perc.under = 100
)
# Class counts after resampling.
table(dt_total[["hired"]])
```

Splitting the data into the two categories

```{r Spliting data, tidy = TRUE}
# One data set per service category; `category` is constant within each
# subset, so it is dropped.
#   dt_hc: rows whose category is "House Cleaning"
#   dt_lm: every other row (treated as Local Moving in the analysis)
table(dt_total$category)
dt_hc <- dt_total %>%
  filter(category == "House Cleaning") %>%
  select(-category)
dt_lm <- dt_total %>%
  filter(category != "House Cleaning") %>%
  select(-category)
```

Checking for parameters that actually affect hiring.
For the total data as a whole

```{r Modeling, tidy = TRUE}
# Logistic regression of the hiring outcome on all remaining columns.
# With a factor response, glm() treats the first level as failure, so the
# coefficients describe the log-odds of the second level of `hired`.
fit <- glm(hired ~ ., data = dt_total, family = binomial)
summary(fit)
# Terms significant at the 5% level with their p-values. coef() replaces the
# partially matched `$coef`, and drop = FALSE keeps the term name even when
# only one term qualifies.
local({
  p_values <- coef(summary(fit))[, 4, drop = FALSE]
  as.data.frame(p_values[p_values[, 1] <= 0.05, , drop = FALSE])
})
```

For Local Moving

```{r Modeling LM, tidy = TRUE}
# Same logistic regression, restricted to the Local Moving subset.
# With a factor response, glm() treats the first level as failure, so the
# coefficients describe the log-odds of the second level of `hired`.
fit <- glm(hired ~ ., data = dt_lm, family = binomial)
summary(fit)
# Terms significant at the 5% level with their p-values. coef() replaces the
# partially matched `$coef`, and drop = FALSE keeps the term name even when
# only one term qualifies.
local({
  p_values <- coef(summary(fit))[, 4, drop = FALSE]
  as.data.frame(p_values[p_values[, 1] <= 0.05, , drop = FALSE])
})
```

For Home Cleaning

```{r Modeling HC, tidy = TRUE}
# Same logistic regression, restricted to the House Cleaning subset.
# With a factor response, glm() treats the first level as failure, so the
# coefficients describe the log-odds of the second level of `hired`.
fit <- glm(hired ~ ., data = dt_hc, family = binomial)
summary(fit)
# Terms significant at the 5% level with their p-values. coef() replaces the
# partially matched `$coef`, and drop = FALSE keeps the term name even when
# only one term qualifies.
local({
  p_values <- coef(summary(fit))[, 4, drop = FALSE]
  as.data.frame(p_values[p_values[, 1] <= 0.05, , drop = FALSE])
})
rm(fit)
```


EXPLORATORY ANALYSIS


num_reviews VS hiring
```{r nr vs hiring, tidy = TRUE}
# Box plot of review counts by hiring outcome (outliers hidden).
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_reviews_by_hired <- function(df, title) {
  df %>%
    drop_na(hired, num_reviews) %>%
    ggplot(aes(x = hired, y = as.integer(num_reviews), fill = hired)) +
    geom_boxplot(color = "black", outlier.size = 0, outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Number of reviews", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_reviews_by_hired(dt_total, "Number of Reviews Vs Hiring (For the total data)")
p

p <- plot_reviews_by_hired(dt_lm, "Number of Reviews Vs Hiring (For Local Moving)")
p

p <- plot_reviews_by_hired(dt_hc, "Number of Reviews Vs Hiring (For Home Cleaning)")
p
rm(p, plot_reviews_by_hired)
```

It is very clear that the number of reviews has a significant effect on hiring.
As the number of reviews increases, the probability of being hired also increases.


Search Rank Vs Hiring  
  
```{r sr vs hiring}
# Box plot of search rank by hiring outcome, restricted to ranks beyond 5
# (outliers hidden).
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_rank_by_hired <- function(df, title) {
  df %>%
    drop_na(search_rank, hired) %>%
    filter(search_rank > 5) %>%
    ggplot(aes(x = hired, y = search_rank, fill = hired)) +
    geom_boxplot(color = "black", outlier.size = 0, outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Search Rank", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_rank_by_hired(dt_total, "Effect of Search Ranks beyond 5 on Hiring")
p

p <- plot_rank_by_hired(dt_lm, "Effect of Search Ranks beyond 5 on Hiring (Local Movers)")
p

p <- plot_rank_by_hired(dt_hc, "Effect of Search Ranks beyond 5 on Hiring (Home Cleaners)")
p
rm(p, plot_rank_by_hired)
```


There is a significant effect of search ranks beyond 5 on Home Cleaners and a mild effect on Local Movers. As the search rank increases, the possibility of being hired decreases.


Number of Reviews, Avg Rating Vs Hiring
```{r nr & ar vs hiring}
# Scatter plot of review count against average rating, with colour and shape
# marking the hiring outcome.
# drop_na() removes rows only when one of the plotted columns is missing; the
# earlier na.omit(df, c(...)) ignored its column vector (which also named a
# non-existent "avg_reviews" column) and dropped rows with an NA anywhere.
# The legend title is set on colour and shape, the aesthetics actually mapped;
# labs(fill = ) has no effect on this plot.
plot_rating_reviews <- function(df, title) {
  df %>%
    drop_na(num_reviews, avg_rating, hired) %>%
    ggplot(aes(x = avg_rating, y = num_reviews, shape = hired, col = hired)) +
    scale_shape_manual(values = c(3, 1)) +
    scale_color_manual(values = c("#E69F00", "#56B4E9")) +
    geom_point() +
    ggtitle(title) +
    labs(y = "Number of Reviews", x = "Avg Rating",
         colour = "Hired or Not", shape = "Hired or Not")
}

p <- plot_rating_reviews(dt_total, "Hiring for higher Rating and higher number of reviews")
p

p <- plot_rating_reviews(dt_lm, "Hiring for higher Rating and higher number of reviews (Local Movers)")
p

p <- plot_rating_reviews(dt_hc, "Hiring for higher Rating and higher number of reviews (Home Cleaners)")
p

rm(p, plot_rating_reviews)
```

In all of the plots, it is evident that the chance of being hired is high for pros with a higher number of reviews and a higher rating.



Effect of Search time on hiring

```{r st vs hiring}
# Box plot of the search hour (0-23) by hiring outcome (outliers hidden).
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_hour_by_hired <- function(df, title) {
  df %>%
    drop_na(s_hr, hired) %>%
    ggplot(aes(x = hired, y = s_hr, fill = hired)) +
    geom_boxplot(color = "black", outlier.size = 0, outlier.shape = NA) +
    ggtitle(title) +
    labs(y = "Search time (24 hr format)", x = "Hired or Not", fill = "Hired or Not")
}

p <- plot_hour_by_hired(dt_total, "Search Time vs Hiring")
p

p <- plot_hour_by_hired(dt_lm, "Search Time vs Hiring (Local Moving)")
p

p <- plot_hour_by_hired(dt_hc, "Search Time vs Hiring (Home Cleaning)")
p
rm(p, plot_hour_by_hired)
```

All plots indicate that most of the hiring happens in the second half of the day, sometime between 2 and 8 pm.



Effect of Day on the hiring
```{r day vs hiring}
# Bar chart of hiring outcome counts, one facet per weekday of the search.
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
plot_hiring_by_day <- function(df, title) {
  df %>%
    drop_na(s_day, hired) %>%
    ggplot(aes(x = hired, fill = hired)) +
    geom_bar(color = "black", position = "identity") +
    facet_wrap(s_day ~ .) +
    ggtitle(title) +
    labs(x = "", y = "Frequency of Hiring")
}

p <- plot_hiring_by_day(dt_total, "Day of the week Vs Hiring")
p

p <- plot_hiring_by_day(dt_lm, "Day of the week Vs Hiring (Local Moving)")
p

p <- plot_hiring_by_day(dt_hc, "Day of the week Vs Hiring (Home Cleaning)")
p
rm(p, plot_hiring_by_day)
```


Friday seems to be "The Day" when traffic is high and a lot of hirings actually happen, for both categories.




Number of Reviews, Cost est Vs Hiring
```{r nr & ce vs hiring}
# Scatter plot of review count against cost estimate (dollars), with colour
# and shape marking the hiring outcome.
# drop_na() removes rows only when one of the plotted columns is missing;
# na.omit(df, c(...)) ignores the column vector and drops every row that has
# an NA in any column.
# The stray backtick is removed from the x-axis label, and the legend title is
# set on colour and shape, the aesthetics actually mapped.
plot_cost_reviews <- function(df, title) {
  df %>%
    drop_na(num_reviews, `cost_est($)`, hired) %>%
    ggplot(aes(x = `cost_est($)`, y = num_reviews, shape = hired, col = hired)) +
    scale_shape_manual(values = c(3, 1)) +
    scale_color_manual(values = c("#E69F00", "#56B4E9")) +
    geom_point() +
    ggtitle(title) +
    labs(y = "Number of Reviews", x = "Cost estimate($)",
         colour = "Hired or Not", shape = "Hired or Not")
}

p <- plot_cost_reviews(dt_total, "Hiring for Number of reviews and Cost estimate")
p

p <- plot_cost_reviews(dt_lm, "Hiring for Number of reviews and Cost estimate (Local Moving)")
p

p <- plot_cost_reviews(dt_hc, "Hiring for Number of reviews and Cost estimate (Home Cleaning)")
p

rm(p, plot_cost_reviews)
```


Almost all of the hiring is concentrated between $75 and $150 of cost estimate and among pros with a high number of reviews (beyond 200).



With this, I can conclude that hiring is impacted by Search Rank (lower ranks preferred), Number of Reviews (the more reviews the better),
Average Rating, Cost Estimate ($75 - $150), Day of the Week (more hirings on Friday), and Time of Day (2-8 pm favorable).

Using these factors I would conclude that

(i) posting ads on the website on all days except on Fridays between 2-10 pm, and

(ii) charging pros to promote in the top 5 of search results could increase the revenues to the company.

When bookings are not made the ads would generate money during less hiring spots (Every day except on Fridays between 2-10 pm).

When a booking is made, the top 5 pros would pay a premium to Thumbtack for driving more eyeballs onto them.


So in both cases it's a win-win to the company.