This week’s #TidyTuesday dataset comes from the Transit Costs Project.

My goal here is to perform some exploratory data analysis and build two models to predict the real_cost using some features from the dataset.

## loading libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## setting global options: avoid scientific notation, print two significant digits
options(scipen = 999, digits = 2)
df <- read_csv("transit_cost.txt")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   e = col_double(),
##   rr = col_double(),
##   length = col_double(),
##   tunnel = col_double(),
##   stations = col_double(),
##   cost = col_double(),
##   year = col_double(),
##   ppp_rate = col_double(),
##   cost_km_millions = col_double()
## )
## See spec(...) for full column specifications.
df %>% head() 
## # A tibble: 6 x 20
##       e country city  line  start_year end_year    rr length tunnel_per tunnel
##   <dbl> <chr>   <chr> <chr> <chr>      <chr>    <dbl>  <dbl> <chr>       <dbl>
## 1  7136 CA      Vanc~ Broa~ 2020       2025         0    5.7 87.72%        5  
## 2  7137 CA      Toro~ Vaug~ 2009       2017         0    8.6 100.00%       8.6
## 3  7138 CA      Toro~ Scar~ 2020       2030         0    7.8 100.00%       7.8
## 4  7139 CA      Toro~ Onta~ 2020       2030         0   15.5 57.00%        8.8
## 5  7144 CA      Toro~ Yong~ 2020       2030         0    7.4 100.00%       7.4
## 6  7145 NL      Amst~ Nort~ 2003       2018         0    9.7 73.00%        7.1
## # ... with 10 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## #   currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <chr>,
## #   cost_km_millions <dbl>, source2 <chr>, reference <chr>
## Basic Cleaning
df <-
  df %>%
  mutate(start_year = as.integer(start_year),
         end_year = as.integer(end_year),
         tunnel_per = parse_number(tunnel_per)/100,
         real_cost = as.numeric(real_cost))
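
As a quick sanity check (a sketch, not in the original analysis), we can count how many values in the converted columns are missing, since any start_year or end_year entries that are not plain numbers become NA after coercion.

## count missing values in the converted columns (a sketch)
df %>%
  summarise(across(c(start_year, end_year, tunnel_per, real_cost),
                   ~ sum(is.na(.x))))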

I want to know which countries complete their projects more quickly.

## Average completion time and average cost per country, for the countries with the most completed projects
df %>%
  filter(end_year <= 2020) %>% # keep only projects that have already been completed
  mutate(time_diff = end_year - start_year) %>%
  group_by(country) %>%
  summarise(No_Projects = n(),
            Average_time = round(mean(time_diff, na.rm = TRUE), 2),
            Average_cost = round(mean(real_cost, na.rm = TRUE), 2),
            .groups = "drop") %>%
  arrange(desc(No_Projects)) %>%
  top_n(10, No_Projects)
## # A tibble: 10 x 4
##    country No_Projects Average_time Average_cost
##    <chr>         <int>        <dbl>        <dbl>
##  1 CN              108         3.79        4055.
##  2 ES               13         5.85         895.
##  3 TR               13         8.92        1523.
##  4 IN               10         6.9         6273.
##  5 DE                9         7.44         701.
##  6 JP                9         9.56        2440.
##  7 FR                7         5.29         399.
##  8 IT                7         9            982.
##  9 SA                6         3.67        6753.
## 10 TW                5         9.4         6963.
## pull countries from previous table
country_list <-  df %>%
  filter(end_year <= 2020) %>% # keep only projects that have already been completed
  mutate(time_diff = end_year - start_year) %>%
  count(country, sort = T, name = "No_Projects")%>%
  top_n(10) %>%
  pull(country)
## Selecting by No_Projects
## Create a plot that describes their project cost
df %>%
  filter(country %in% country_list) %>%
  mutate(country = reorder(country, -real_cost,FUN = median)) %>%
  ggplot(aes(country, real_cost, fill = country)) +
  geom_boxplot(show.legend = F)+
  scale_y_log10(label = scales::dollar_format())+
  coord_flip()+
  theme_minimal()+
  labs(title = "A Boxplot showing project costs for the top 10 countries with high number of projects.",
       x = NULL,
       y = "Real cost in Millions of USD")+
  theme(plot.title.position = "plot",
        plot.title = element_text(hjust = 0.5))

At first glance, one might expect the length of the tunnel to be a strong factor in determining the cost; let's verify that.

df %>%
  select_if(is.numeric) %>%
  mutate(year_diff = end_year - start_year) %>%
  select(-e,-end_year,-start_year) %>%
  drop_na() %>%
  cor() %>%
  as_tibble() %>%
  mutate(vars = colnames(.)) %>%
  select(vars, everything()) %>%
  gather(key = "key", value = "value", - vars) %>% 
  mutate(vars = reorder(vars,-value),
         key = reorder(key, -value)) %>%
  ggplot(aes(vars,key, fill = value))+
  geom_tile()+
  geom_text(aes(label =round(value,2)))+
  theme_minimal()+
  scale_fill_gradientn(colours =c("gray","midnightblue"))+
  theme(
    axis.title = element_blank(),
    axis.text = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.key.height = unit(0.5,"cm"),
    legend.key.width = unit(3,"cm"),
    legend.position = "top",
    legend.direction = "horizontal",
    plot.title = element_text(hjust = 0.5))+
  labs(fill = NULL,
       title = "Correlation Plot")

We can see that tunnel, stations, and length are all positively correlated with the real_cost variable.
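
To back the visual impression with numbers, here is a small check (a sketch, output not shown) of the correlations between real_cost and these three features:

## correlations of real_cost with tunnel, stations and length (a sketch)
df %>%
  select(real_cost, tunnel, stations, length) %>%
  drop_na() %>%
  cor() %>%
  round(2)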

Let's show a simple scatter plot of each of these features against real_cost.

df %>%
  drop_na() %>%
  select(real_cost,tunnel,length,stations) %>%
  gather(key = "key",value = "value",-real_cost) %>%
  mutate(key= str_c(key," against real cost") %>% str_to_sentence()) %>%
  ggplot(aes(real_cost,value))+
  geom_point(alpha = 0.6)+
  geom_smooth(method = "lm", se = FALSE, col = "blue", alpha = 0.7)+
  scale_y_log10(label = scales::dollar_format())+
  scale_x_log10(label = scales::dollar_format())+
  facet_wrap(~key, scales = "free")+
  theme_light()+
  labs(y = NULL,
       x = NULL)
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 48 rows containing non-finite values (stat_smooth).
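
The warnings above are expected: a log10 scale cannot represent zero, so observations where real_cost or the plotted feature equals 0 become infinite after the transformation and are dropped. A quick sketch (not in the original analysis) to count the affected rows:

## count the zero values that the log scales drop (a sketch)
df %>%
  drop_na() %>%
  summarise(zero_cost = sum(real_cost == 0),
            zero_tunnel = sum(tunnel == 0),
            zero_stations = sum(stations == 0))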

## Modeling

library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom     0.7.2      v recipes   0.1.15
## v dials     0.0.9      v rsample   0.0.8 
## v infer     0.5.3      v tune      0.1.2 
## v modeldata 0.1.0      v workflows 0.2.1 
## v parsnip   0.1.4      v yardstick 0.0.7
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## Preparing the data for modeling
model_df <-
df %>%
  select(real_cost,country,start_year,end_year,rr,length,tunnel,stations,ppp_rate) %>%
  mutate(rr = as.factor(rr),
         country = as.factor(country),
         ppp_rate = round(ppp_rate,3)) %>%
  ## drop observations with a missing country
  filter(!is.na(country)) %>%
  ## drop projects with a non-positive real cost
  filter(real_cost > 0)

model_df %>% head()
## # A tibble: 6 x 9
##   real_cost country start_year end_year rr    length tunnel stations ppp_rate
##       <dbl> <fct>        <int>    <int> <fct>  <dbl>  <dbl>    <dbl>    <dbl>
## 1     2377. CA            2020     2025 0        5.7    5          6     0.84
## 2     2592  CA            2009     2017 0        8.6    8.6        6     0.81
## 3     4620  CA            2020     2030 0        7.8    7.8        3     0.84
## 4     7201. CA            2020     2030 0       15.5    8.8       15     0.84
## 5     4704  CA            2020     2030 0        7.4    7.4        6     0.84
## 6     4030  NL            2003     2018 0        9.7    7.1        8     1.3

Data Partitioning and Bootstraps

set.seed(342)
df_split <- initial_split(model_df)

train <- training(df_split)

df_folds <- bootstraps(train)

train %>% head() 
## # A tibble: 6 x 9
##   real_cost country start_year end_year rr    length tunnel stations ppp_rate
##       <dbl> <fct>        <int>    <int> <fct>  <dbl>  <dbl>    <dbl>    <dbl>
## 1     2377. CA            2020     2025 0        5.7    5          6     0.84
## 2     2592  CA            2009     2017 0        8.6    8.6        6     0.81
## 3     4620  CA            2020     2030 0        7.8    7.8        3     0.84
## 4     7201. CA            2020     2030 0       15.5    8.8       15     0.84
## 5     4704  CA            2020     2030 0        7.4    7.4        6     0.84
## 6     4030  NL            2003     2018 0        9.7    7.1        8     1.3

Recipe

## recipe
df_rec <- recipe(real_cost ~ . , train) %>%
  step_bagimpute(all_predictors()) %>%
  
  # assign a new level to countries not seen in the training data
  step_novel(country) %>%
  step_mutate(year_diff = end_year - start_year,
  # was the tunneled length greater than or equal to the total line length?
              t_GE_l = ifelse(tunnel >= length,"yes","no"), 
              t_GE_l = as.factor(t_GE_l)) %>%
  ## removing variables
  step_rm(length,end_year,start_year) %>%
  ## lump infrequently occurring countries into an "other" level
  step_other(country, threshold = 0.2) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal()) 

  

df_rec %>% prep() %>% juice() 
## # A tibble: 402 x 8
##    tunnel stations ppp_rate real_cost year_diff country_other rr_X1 t_GE_l_yes
##     <dbl>    <dbl>    <dbl>     <dbl>     <dbl>         <dbl> <dbl>      <dbl>
##  1 -0.666  -0.561     0.227     2377.    -0.254             1     0          0
##  2 -0.401  -0.561     0.190     2592      0.831             1     0          1
##  3 -0.460  -0.772     0.227     4620      1.56              1     0          1
##  4 -0.386   0.0726    0.227     7201.     1.56              1     0          0
##  5 -0.489  -0.561     0.227     4704      1.56              1     0          1
##  6 -0.511  -0.420     0.792     4030      3.36              1     0          0
##  7 -0.607  -0.631     0.227     3780      0.108             1     0          1
##  8 -0.659  -0.842     0.423     1756      0.470             1     0          1
##  9 -0.725  -0.842     0.423     3600      0.470             1     0          1
## 10 -0.725  -0.842     0.423     2400      0.831             1     0          1
## # ... with 392 more rows
## reference (holdout) levels after the dummy step: country = CN, rr = 0, t_GE_l = no
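
The same recipe can also be applied to new data. As a sketch (output not shown), we can prep it on the training set and bake the held-out test split to check that the preprocessing carries over:

## apply the trained recipe to the test split (a sketch)
df_rec %>%
  prep() %>%
  bake(new_data = testing(df_split))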

Building a Linear Model

# Spec
linear_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")

# Building Workflow
linear_wf <- workflow() %>%
  add_recipe(df_rec) %>%
  add_model(linear_spec)

## fitting model on resampled data
linear_tune <- fit_resamples(
  linear_wf,
  resamples = df_folds,
  metrics = metric_set(rmse,rsq,mae),
  control = control_grid(save_pred = T)
)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
## 
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
## 
##     data_frame
## The following object is masked from 'package:tibble':
## 
##     data_frame
## Exploring results
linear_tune %>% collect_metrics()
## # A tibble: 3 x 6
##   .metric .estimator     mean     n std_err .config             
##   <chr>   <chr>         <dbl> <int>   <dbl> <fct>               
## 1 mae     standard   1939.       25 40.9    Preprocessor1_Model1
## 2 rmse    standard   3225.       25 90.9    Preprocessor1_Model1
## 3 rsq     standard      0.603    25  0.0160 Preprocessor1_Model1
linear_tune %>%
  collect_predictions() %>%
  select(pred = .pred, real_cost) %>%
  cor()
##           pred real_cost
## pred      1.00      0.76
## real_cost 0.76      1.00
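
A quick visual check of the same relationship (a sketch, not in the original post) is to plot the out-of-sample predictions from the resamples against the actual costs:

## predicted vs actual cost across the bootstrap resamples (a sketch)
linear_tune %>%
  collect_predictions() %>%
  ggplot(aes(real_cost, .pred)) +
  geom_point(alpha = 0.3) +
  geom_abline(linetype = 2, col = "midnightblue") +
  theme_minimal() +
  labs(x = "Actual cost (millions of USD)",
       y = "Predicted cost (millions of USD)")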
## Applying model on test data and evaluating results
last_fit(linear_wf, df_split) %>%
  collect_predictions() %>% 
  select(pred = .pred, real_cost) %>%
  mutate(diff = real_cost - pred) %>%
  ggplot(aes(diff))+
  geom_boxplot()+
  labs(title = "A Boxplot showing the difference between predicted costs and actual costs",
       x = "difference",
       caption  = "there was a big outlier (> $60,000)")+
  scale_x_continuous(labels = scales::dollar_format(), limits = c(-15000,20000))+
  theme_minimal()+
  theme(plot.title =  element_text(hjust = 0.5))
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
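
To attach numbers to the test-set performance, a short sketch (output not shown) collects the holdout metrics from the same last_fit() object:

## test-set metrics for the linear workflow (a sketch)
last_fit(linear_wf, df_split) %>%
  collect_metrics()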

# There is a good correlation between predicted and actual costs on the test data,
# with only one large outlier (around $60,000M)
linear_fit <- fit(linear_wf, model_df)
linear_fit %>% tidy() %>% arrange(p.value)
## # A tibble: 8 x 5
##   term          estimate std.error statistic  p.value
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>
## 1 tunnel         3239.        238.   13.6    2.68e-36
## 2 (Intercept)    3847.        368.   10.5    2.33e-23
## 3 stations       2121.        233.    9.09   1.96e-18
## 4 country_other  2280.        473.    4.82   1.87e- 6
## 5 t_GE_l_yes    -1100.        370.   -2.97   3.09e- 3
## 6 year_diff        61.5       203.    0.303  7.62e- 1
## 7 rr_X1           219.        737.    0.297  7.66e- 1
## 8 ppp_rate          7.50      192.    0.0391 9.69e- 1

Explaining the model

A graphical representation of the coefficients helps here. Because the numeric predictors were normalized in the recipe, each estimate is the expected change in real_cost (in millions of USD) for a one-standard-deviation increase in that predictor, holding everything else constant.

linear_fit %>% 
  tidy() %>%
  slice(-1) %>%
  mutate(term = reorder(term,-estimate)) %>%
  ggplot(aes(estimate,term,col = term))+
  geom_vline(xintercept = 0, linetype = 2, col = "midnightblue")+
  geom_point(size = 1.8)+
  geom_errorbar(aes(xmin = estimate - std.error, xmax = estimate + std.error))+
  scale_x_continuous(labels = scales::dollar_format())+
  theme_minimal()+
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 19),
        plot.subtitle = element_text(hjust = 0.5, size = 13, face = "italic"),
        axis.text = element_text(size = 10, face = "italic"))+
  labs(title = "Features of the Linear model and their estimates",
       subtitle = "given that all other features remain the same",
       y = NULL)

## reference (holdout) levels: country = CN, rr = 0, t_GE_l = no
## Important Variables
linear_fit %>% 
  pull_workflow_fit() %>%
  vip::vip(aes(alpha = 0.8)) +
  theme_minimal()

Building a Ranger model

Now that we have a good understanding of the data, let's see if we can get a better model.

## ranger model
ranger_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_mode("regression") %>% 
  set_engine("ranger") 

## Creating a Workflow
ranger_workflow <- 
  workflow() %>% 
  add_recipe(df_rec) %>% 
  add_model(ranger_spec) 

## Tuning model
set.seed(51556)
ranger_tune <-
  tune_grid(ranger_workflow, 
            resamples = df_folds, 
            grid = 10, 
            metrics = metric_set(rmse,rsq,mae),
            control = control_grid(save_pred = T))
## i Creating pre-processing data to finalize unknown parameter: mtry
## Exploring results
ranger_tune %>% 
  autoplot()+
  geom_line()+
  theme_light()
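
To inspect the best-performing hyperparameter combinations numerically, a short sketch (output not shown):

## best ranger configurations ranked by RMSE (a sketch)
ranger_tune %>%
  show_best(metric = "rmse")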

It appears that the linear model performed better than the ranger model.
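
One way to confirm this on the held-out data (a sketch, assuming we pick the configuration with the lowest RMSE) is to finalize the ranger workflow and compare its test-set metrics with those of the linear model above:

## finalize the tuned ranger workflow and evaluate it on the test split (a sketch)
best_rf <- select_best(ranger_tune, metric = "rmse")

ranger_workflow %>%
  finalize_workflow(best_rf) %>%
  last_fit(df_split) %>%
  collect_metrics()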