My goal here is to perform some exploratory data analysis and then build two models that predict real_cost from a few features in the dataset.
## loading libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## setting global options: no scientific notation, two significant digits
options(scipen = 999, digits = 2)
df <- read_csv("transit_cost.txt")
## Parsed with column specification:
## cols(
## .default = col_character(),
## e = col_double(),
## rr = col_double(),
## length = col_double(),
## tunnel = col_double(),
## stations = col_double(),
## cost = col_double(),
## year = col_double(),
## ppp_rate = col_double(),
## cost_km_millions = col_double()
## )
## See spec(...) for full column specifications.
df %>% head()
## # A tibble: 6 x 20
## e country city line start_year end_year rr length tunnel_per tunnel
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 7136 CA Vanc~ Broa~ 2020 2025 0 5.7 87.72% 5
## 2 7137 CA Toro~ Vaug~ 2009 2017 0 8.6 100.00% 8.6
## 3 7138 CA Toro~ Scar~ 2020 2030 0 7.8 100.00% 7.8
## 4 7139 CA Toro~ Onta~ 2020 2030 0 15.5 57.00% 8.8
## 5 7144 CA Toro~ Yong~ 2020 2030 0 7.4 100.00% 7.4
## 6 7145 NL Amst~ Nort~ 2003 2018 0 9.7 73.00% 7.1
## # ... with 10 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## # currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <chr>,
## # cost_km_millions <dbl>, source2 <chr>, reference <chr>
## Basic Cleaning
df <-
df %>%
mutate(start_year = as.integer(start_year),
end_year = as.integer(end_year),
tunnel_per = parse_number(tunnel_per)/100,
real_cost = as.numeric(real_cost))
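Since as.integer() and as.numeric() quietly coerce anything non-numeric to NA, it is worth counting how many values failed to parse. A small sanity-check sketch, not part of the original output:
## number of NA values per cleaned column (original NAs plus coercion failures)
df %>%
  summarise(across(c(start_year, end_year, real_cost), ~ sum(is.na(.))))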
I want to know which countries complete their projects more quickly.
## Which countries have the shortest completion times, and how many projects has each completed?
df %>%
  filter(end_year <= 2020) %>% # keep only projects that have already been completed
  mutate(time_diff = end_year - start_year) %>%
  count(country, sort = T, name = "No_Projects") %>%
  left_join(df %>%
              filter(end_year <= 2020) %>%
              mutate(time_diff = end_year - start_year) %>%
              count(country, wt = round(mean(time_diff, na.rm = T), 2), sort = T, name = "Average_time")) %>%
  left_join(df %>%
              filter(end_year <= 2020) %>%
              count(country, wt = round(mean(real_cost, na.rm = T), 2), sort = T, name = "Average_cost")) %>%
  top_n(10, No_Projects)
## Joining, by = "country"
## Joining, by = "country"
## # A tibble: 10 x 4
## country No_Projects Average_time Average_cost
## <chr> <int> <dbl> <dbl>
## 1 CN 108 3.79 4055.
## 2 ES 13 5.85 895.
## 3 TR 13 8.92 1523.
## 4 IN 10 6.9 6273.
## 5 DE 9 7.44 701.
## 6 JP 9 9.56 2440.
## 7 FR 7 5.29 399.
## 8 IT 7 9 982.
## 9 SA 6 3.67 6753.
## 10 TW 5 9.4 6963.
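As an aside, the same table can be built with a single group_by()/summarise() instead of the two left_join()s; a rough equivalent, not evaluated here:
df %>%
  filter(end_year <= 2020) %>%
  mutate(time_diff = end_year - start_year) %>%
  group_by(country) %>%
  summarise(No_Projects  = n(),
            Average_time = round(mean(time_diff, na.rm = TRUE), 2),
            Average_cost = round(mean(real_cost, na.rm = TRUE), 2)) %>%
  slice_max(No_Projects, n = 10)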
## pull the countries from the previous table
country_list <- df %>%
  filter(end_year <= 2020) %>% # keep only projects that have already been completed
  mutate(time_diff = end_year - start_year) %>%
  count(country, sort = T, name = "No_Projects") %>%
  top_n(10) %>%
  pull(country)
## Selecting by No_Projects
## Create a plot that describes their project cost
df %>%
filter(country %in% country_list) %>%
mutate(country = reorder(country, -real_cost,FUN = median)) %>%
ggplot(aes(country, real_cost, fill = country)) +
geom_boxplot(show.legend = F)+
scale_y_log10(labels = scales::dollar_format()) +
coord_flip() +
theme_minimal() +
labs(title = "A boxplot of project costs for the top 10 countries by number of projects",
     x = NULL,
     y = "Real cost (millions of USD)") +
theme(plot.title.position = "plot",
plot.title = element_text(hjust = 0.5))
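The ordering of the boxplot is driven by the median real_cost per country; if the underlying numbers are wanted, a small summary table does the job (a sketch, not part of the original output):
df %>%
  filter(country %in% country_list) %>%
  group_by(country) %>%
  summarise(median_cost = median(real_cost, na.rm = TRUE)) %>%
  arrange(desc(median_cost))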
At first glance, one might expect the length of the tunnel to be a strong driver of cost; let's verify that.
df %>%
select_if(is.numeric) %>%
mutate(year_diff = end_year - start_year) %>%
select(-e,-end_year,-start_year) %>%
drop_na() %>%
cor() %>%
as_tibble() %>%
mutate(vars = colnames(.)) %>%
select(vars, everything()) %>%
gather(key = "key", value = "value", - vars) %>%
mutate(vars = reorder(vars,-value),
key = reorder(key, -value)) %>%
ggplot(aes(vars,key, fill = value))+
geom_tile()+
geom_text(aes(label =round(value,2)))+
theme_minimal()+
scale_fill_gradientn(colours =c("gray","midnightblue"))+
theme(
axis.title = element_blank(),
axis.text = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.key.height = unit(0.5,"cm"),
legend.key.width = unit(3,"cm"),
legend.position = "top",
legend.direction = "horizontal",
plot.title = element_text(hjust = 0.5))+
labs(fill = NULL,
title = "Correlation Plot")
We can see that tunnel, stations, and length are all positively correlated with real_cost.
Let's look at a simple scatter plot.
df %>%
drop_na() %>%
select(real_cost,tunnel,length,stations) %>%
gather(key = "key",value = "value",-real_cost) %>%
mutate(key= str_c(key," against real cost") %>% str_to_sentence()) %>%
ggplot(aes(real_cost,value))+
geom_point(alpha = 0.6)+
geom_smooth(method = "lm", se = FALSE, col = "blue", alpha = 0.7)+
scale_y_log10(labels = scales::dollar_format()) +
scale_x_log10(labels = scales::dollar_format()) +
facet_wrap(~key, scales = "free")+
theme_light()+
labs(y = NULL,
x = NULL)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 48 rows containing non-finite values (stat_smooth).
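The warnings above are a side effect of the log10 scales: any zero values (most likely tunnel = 0 for lines with no tunneled section) become -Inf after the transformation and are dropped before the smooth is fitted. A quick check, not part of the original output:
## count non-positive values; log10() turns these into -Inf and they get dropped
df %>%
  drop_na() %>%
  summarise(across(c(real_cost, tunnel, length, stations), ~ sum(. <= 0)))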
## Modeling
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.2 v recipes 0.1.15
## v dials 0.0.9 v rsample 0.0.8
## v infer 0.5.3 v tune 0.1.2
## v modeldata 0.1.0 v workflows 0.2.1
## v parsnip 0.1.4 v yardstick 0.0.7
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## Preparing the data for modeling
model_df <-
  df %>%
  select(real_cost, country, start_year, end_year, rr, length, tunnel, stations, ppp_rate) %>%
  mutate(rr = as.factor(rr),
         country = as.factor(country),
         ppp_rate = round(ppp_rate, 3)) %>%
  ## drop observations with a missing country
  filter(!is.na(country)) %>%
  ## drop observations with a non-positive real_cost
  filter(real_cost > 0)
model_df %>% head()
## # A tibble: 6 x 9
## real_cost country start_year end_year rr length tunnel stations ppp_rate
## <dbl> <fct> <int> <int> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2377. CA 2020 2025 0 5.7 5 6 0.84
## 2 2592 CA 2009 2017 0 8.6 8.6 6 0.81
## 3 4620 CA 2020 2030 0 7.8 7.8 3 0.84
## 4 7201. CA 2020 2030 0 15.5 8.8 15 0.84
## 5 4704 CA 2020 2030 0 7.4 7.4 6 0.84
## 6 4030 NL 2003 2018 0 9.7 7.1 8 1.3
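It is also worth knowing how many observations the country and real_cost filters removed; a one-line check (not shown in the original output):
## number of rows dropped between the raw data and the modeling data
nrow(df) - nrow(model_df)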
set.seed(342)
df_split <- initial_split(model_df)
train <- training(df_split)
df_folds <- bootstraps(train)
train %>% head()
## # A tibble: 6 x 9
## real_cost country start_year end_year rr length tunnel stations ppp_rate
## <dbl> <fct> <int> <int> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2377. CA 2020 2025 0 5.7 5 6 0.84
## 2 2592 CA 2009 2017 0 8.6 8.6 6 0.81
## 3 4620 CA 2020 2030 0 7.8 7.8 3 0.84
## 4 7201. CA 2020 2030 0 15.5 8.8 15 0.84
## 5 4704 CA 2020 2030 0 7.4 7.4 6 0.84
## 6 4030 NL 2003 2018 0 9.7 7.1 8 1.3
## recipe
df_rec <- recipe(real_cost ~ ., data = train) %>%
  step_bagimpute(all_predictors()) %>%
  # assign factor levels not seen in training (e.g. new countries) to a "new" level
  step_novel(country) %>%
  step_mutate(year_diff = end_year - start_year,
              # was the tunneled length at least as long as the total line length?
              t_GE_l = ifelse(tunnel >= length, "yes", "no"),
              t_GE_l = as.factor(t_GE_l)) %>%
  ## remove variables that are no longer needed
  step_rm(length, end_year, start_year) %>%
  ## lump infrequently occurring countries into an "other" level
  step_other(country, threshold = 0.2) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal())
df_rec %>% prep() %>% juice()
## # A tibble: 402 x 8
## tunnel stations ppp_rate real_cost year_diff country_other rr_X1 t_GE_l_yes
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.666 -0.561 0.227 2377. -0.254 1 0 0
## 2 -0.401 -0.561 0.190 2592 0.831 1 0 1
## 3 -0.460 -0.772 0.227 4620 1.56 1 0 1
## 4 -0.386 0.0726 0.227 7201. 1.56 1 0 0
## 5 -0.489 -0.561 0.227 4704 1.56 1 0 1
## 6 -0.511 -0.420 0.792 4030 3.36 1 0 0
## 7 -0.607 -0.631 0.227 3780 0.108 1 0 1
## 8 -0.659 -0.842 0.423 1756 0.470 1 0 1
## 9 -0.725 -0.842 0.423 3600 0.470 1 0 1
## 10 -0.725 -0.842 0.423 2400 0.831 1 0 1
## # ... with 392 more rows
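Because step_other() lumps every country below the 20% threshold into a single "other" level, only the most frequent country (CN, judging by the earlier table) keeps its own level, which is why the juiced data contains a single country_other dummy. To double-check which levels were retained, recipes' tidy() method can be pointed at that step; a sketch, assuming step_other() is the fifth step as written above:
df_rec %>%
  prep() %>%
  tidy(number = 5)  # lists the country levels retained by step_other()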
## holdout (baseline) levels: country - China, rr - 0, t_GE_l - no
# Spec
linear_spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
# Building Workflow
linear_wf <- workflow() %>%
add_recipe(df_rec) %>%
add_model(linear_spec)
## fitting model on resampled data
linear_tune <- fit_resamples(
linear_wf,
resamples = df_folds,
metrics = metric_set(rmse,rsq,mae),
control = control_grid(save_pred = T)
)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
##
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
##
## data_frame
## The following object is masked from 'package:tibble':
##
## data_frame
## Exploring results
linear_tune %>% collect_metrics()
## # A tibble: 3 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 mae standard 1939. 25 40.9 Preprocessor1_Model1
## 2 rmse standard 3225. 25 90.9 Preprocessor1_Model1
## 3 rsq standard 0.603 25 0.0160 Preprocessor1_Model1
linear_tune %>%
collect_predictions() %>%
select(pred = .pred, real_cost) %>%
cor()
## pred real_cost
## pred 1.00 0.76
## real_cost 0.76 1.00
## Applying the model to the test data and evaluating the results
last_fit(linear_wf, df_split) %>%
collect_predictions() %>%
select(pred = .pred, real_cost) %>%
mutate(diff = real_cost - pred) %>%
ggplot(aes(diff))+
geom_boxplot()+
labs(title = "A Boxplot showing the difference between predicted costs and actual costs",
x = "difference",
caption = "there was a big outlier (> $60,000)")+
scale_x_continuous(labels = scales::dollar_format(), limits = c(-15000,20000))+
theme_minimal()+
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# There is a good correlation between the predicted and actual costs on the test data.
# There was only one large outlier (> $60,000).
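Besides eyeballing the residuals, last_fit() also stores test-set metrics, which give a more direct read on out-of-sample performance; a sketch (the numbers themselves are not reproduced here):
## RMSE and R-squared on the held-out test set
last_fit(linear_wf, df_split) %>%
  collect_metrics()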
linear_fit <- fit(linear_wf, model_df)
linear_fit %>% tidy() %>% arrange(p.value)
## # A tibble: 8 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 tunnel 3239. 238. 13.6 2.68e-36
## 2 (Intercept) 3847. 368. 10.5 2.33e-23
## 3 stations 2121. 233. 9.09 1.96e-18
## 4 country_other 2280. 473. 4.82 1.87e- 6
## 5 t_GE_l_yes -1100. 370. -2.97 3.09e- 3
## 6 year_diff 61.5 203. 0.303 7.62e- 1
## 7 rr_X1 219. 737. 0.297 7.66e- 1
## 8 ppp_rate 7.50 192. 0.0391 9.69e- 1
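The coefficient table describes individual terms; for an overall summary (R-squared, residual standard error, AIC) we can reach into the fitted workflow and call broom's glance() on the underlying lm object. A sketch, not part of the original output:
linear_fit %>%
  pull_workflow_fit() %>%
  purrr::pluck("fit") %>%  # the underlying lm object
  glance()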
A graphical representation of these results will help.
linear_fit %>%
tidy() %>%
slice(-1) %>%
mutate(term = reorder(term,-estimate)) %>%
ggplot(aes(estimate,term,col = term))+
geom_vline(xintercept = 0, linetype = 2, col = "midnightblue")+
geom_point(size = 1.8)+
geom_errorbar(aes(xmin = estimate - std.error, xmax = estimate + std.error))+
scale_x_continuous(labels = scales::dollar_format())+
theme_minimal()+
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, size = 19),
plot.subtitle = element_text(hjust = 0.5, size = 13, face = "italic"),
axis.text = element_text(size = 10, face = "italic"))+
labs(title = "Features of the Linear model and their estimates",
subtitle = "given that all other features remain the same",
y = NULL)
## holdout (baseline) levels: country - China, rr - 0, t_GE_l - no
## Important Variables
linear_fit %>%
  pull_workflow_fit() %>%
  vip::vip(aesthetics = list(alpha = 0.8)) +
  theme_minimal()
Now that we have a good understanding of the data, let's see if we can get a better model.
## ranger model
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("regression") %>%
set_engine("ranger")
## Creating a Workflow
ranger_workflow <-
workflow() %>%
add_recipe(df_rec) %>%
add_model(ranger_spec)
## Tuning model
set.seed(51556)
ranger_tune <-
tune_grid(ranger_workflow,
resamples = df_folds,
grid = 10,
metrics = metric_set(rmse,rsq,mae),
control = control_grid(save_pred = T))
## i Creating pre-processing data to finalize unknown parameter: mtry
## Exploring results
ranger_tune %>%
autoplot()+
geom_line()+
theme_light()
It appears that the linear model performed better than the tuned ranger model.
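To back that up with numbers rather than the plot alone, the resampled metrics of the two models can be compared directly; a sketch (the values are not reproduced here):
## best ranger configuration vs the linear model, by RMSE on the bootstrap resamples
ranger_tune %>% show_best(metric = "rmse", n = 1)
linear_tune %>% collect_metrics() %>% filter(.metric == "rmse")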