My goal here is to perform some exploratory data analysis and then build two models that predict real_cost from a few features in the dataset.
## loading libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## setting global options: no scientific notation, two significant digits
options(scipen = 999, digits = 2)
df <- read_csv("transit_cost.txt")
## Parsed with column specification:
## cols(
## .default = col_character(),
## e = col_double(),
## rr = col_double(),
## length = col_double(),
## tunnel = col_double(),
## stations = col_double(),
## cost = col_double(),
## year = col_double(),
## ppp_rate = col_double(),
## cost_km_millions = col_double()
## )
## See spec(...) for full column specifications.
df %>% head()
## # A tibble: 6 x 20
## e country city line start_year end_year rr length tunnel_per tunnel
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 7136 CA Vanc~ Broa~ 2020 2025 0 5.7 87.72% 5
## 2 7137 CA Toro~ Vaug~ 2009 2017 0 8.6 100.00% 8.6
## 3 7138 CA Toro~ Scar~ 2020 2030 0 7.8 100.00% 7.8
## 4 7139 CA Toro~ Onta~ 2020 2030 0 15.5 57.00% 8.8
## 5 7144 CA Toro~ Yong~ 2020 2030 0 7.4 100.00% 7.4
## 6 7145 NL Amst~ Nort~ 2003 2018 0 9.7 73.00% 7.1
## # ... with 10 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## # currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <chr>,
## # cost_km_millions <dbl>, source2 <chr>, reference <chr>
## Basic Cleaning
df <-
df %>%
mutate(start_year = as.integer(start_year),
end_year = as.integer(end_year),
tunnel_per = parse_number(tunnel_per)/100,
real_cost = as.numeric(real_cost))
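Since as.integer() and as.numeric() quietly coerce anything non-numeric to NA, it is worth counting how many values failed to parse. A small sanity-check sketch, not part of the original output:
## number of NA values per cleaned column (original NAs plus coercion failures)
df %>%
  summarise(across(c(start_year, end_year, real_cost), ~ sum(is.na(.))))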
I want to know which countries complete their projects more quickly.
## Which countries have the shortest completion times, and how many projects has each completed?
df %>%
  filter(end_year <= 2020) %>% # keep only projects that have already been completed
  mutate(time_diff = end_year - start_year) %>%
  count(country, sort = T, name = "No_Projects") %>%
  left_join(df %>%
              filter(end_year <= 2020) %>%
              mutate(time_diff = end_year - start_year) %>%
              count(country, wt = round(mean(time_diff, na.rm = T), 2), sort = T, name = "Average_time")) %>%
  left_join(df %>%
              filter(end_year <= 2020) %>%
              count(country, wt = round(mean(real_cost, na.rm = T), 2), sort = T, name = "Average_cost")) %>%
  top_n(10, No_Projects)
## Joining, by = "country"
## Joining, by = "country"
## # A tibble: 10 x 4
## country No_Projects Average_time Average_cost
## <chr> <int> <dbl> <dbl>
## 1 CN 108 3.79 4055.
## 2 ES 13 5.85 895.
## 3 TR 13 8.92 1523.
## 4 IN 10 6.9 6273.
## 5 DE 9 7.44 701.
## 6 JP 9 9.56 2440.
## 7 FR 7 5.29 399.
## 8 IT 7 9 982.
## 9 SA 6 3.67 6753.
## 10 TW 5 9.4 6963.
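As an aside, the same table can be built with a single group_by()/summarise() instead of the two left_join()s; a rough equivalent, not evaluated here:
df %>%
  filter(end_year <= 2020) %>%
  mutate(time_diff = end_year - start_year) %>%
  group_by(country) %>%
  summarise(No_Projects  = n(),
            Average_time = round(mean(time_diff, na.rm = TRUE), 2),
            Average_cost = round(mean(real_cost, na.rm = TRUE), 2)) %>%
  slice_max(No_Projects, n = 10)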
## pull the countries from the previous table
country_list <- df %>%
  filter(end_year <= 2020) %>% # keep only projects that have already been completed
  mutate(time_diff = end_year - start_year) %>%
  count(country, sort = T, name = "No_Projects") %>%
  top_n(10) %>%
  pull(country)
## Selecting by No_Projects
## Create a plot that describes their project cost
df %>%
filter(country %in% country_list) %>%
mutate(country = reorder(country, -real_cost,FUN = median)) %>%
ggplot(aes(country, real_cost, fill = country)) +
geom_boxplot(show.legend = F)+
scale_y_log10(labels = scales::dollar_format()) +
coord_flip() +
theme_minimal() +
labs(title = "A boxplot of project costs for the top 10 countries by number of projects",
     x = NULL,
     y = "Real cost (millions of USD)") +
theme(plot.title.position = "plot",
plot.title = element_text(hjust = 0.5))
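The ordering of the boxplot is driven by the median real_cost per country; if the underlying numbers are wanted, a small summary table does the job (a sketch, not part of the original output):
df %>%
  filter(country %in% country_list) %>%
  group_by(country) %>%
  summarise(median_cost = median(real_cost, na.rm = TRUE)) %>%
  arrange(desc(median_cost))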
At first glance, one might expect the length of the tunnel to be a strong driver of cost; let's verify that.
df %>%
select_if(is.numeric) %>%
mutate(year_diff = end_year - start_year) %>%
select(-e,-end_year,-start_year) %>%
drop_na() %>%
cor() %>%
as_tibble() %>%
mutate(vars = colnames(.)) %>%
select(vars, everything()) %>%
gather(key = "key", value = "value", - vars) %>%
mutate(vars = reorder(vars,-value),
key = reorder(key, -value)) %>%
ggplot(aes(vars,key, fill = value))+
geom_tile()+
geom_text(aes(label =round(value,2)))+
theme_minimal()+
scale_fill_gradientn(colours =c("gray","midnightblue"))+
theme(
axis.title = element_blank(),
axis.text = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.key.height = unit(0.5,"cm"),
legend.key.width = unit(3,"cm"),
legend.position = "top",
legend.direction = "horizontal",
plot.title = element_text(hjust = 0.5))+
labs(fill = NULL,
title = "Correlation Plot")
We can see that tunnel, stations, and length are all positively correlated with real_cost.
Let's look at a simple scatter plot.
df %>%
drop_na() %>%
select(real_cost,tunnel,length,stations) %>%
gather(key = "key",value = "value",-real_cost) %>%
mutate(key= str_c(key," against real cost") %>% str_to_sentence()) %>%
ggplot(aes(real_cost,value))+
geom_point(alpha = 0.6)+
geom_smooth(method = "lm", se = FALSE, col = "blue", alpha = 0.7)+
scale_y_log10(labels = scales::dollar_format()) +
scale_x_log10(labels = scales::dollar_format()) +
facet_wrap(~key, scales = "free")+
theme_light()+
labs(y = NULL,
x = NULL)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 48 rows containing non-finite values (stat_smooth).
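The warnings above are a side effect of the log10 scales: any zero values (most likely tunnel = 0 for lines with no tunneled section) become -Inf after the transformation and are dropped before the smooth is fitted. A quick check, not part of the original output:
## count non-positive values; log10() turns these into -Inf and they get dropped
df %>%
  drop_na() %>%
  summarise(across(c(real_cost, tunnel, length, stations), ~ sum(. <= 0)))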
## Modeling
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.2 v recipes 0.1.15
## v dials 0.0.9 v rsample 0.0.8
## v infer 0.5.3 v tune 0.1.2
## v modeldata 0.1.0 v workflows 0.2.1
## v parsnip 0.1.4 v yardstick 0.0.7
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## Preparing the data for modeling
model_df <-
  df %>%
  select(real_cost, country, start_year, end_year, rr, length, tunnel, stations, ppp_rate) %>%
  mutate(rr = as.factor(rr),
         country = as.factor(country),
         ppp_rate = round(ppp_rate, 3)) %>%
  ## drop observations with a missing country
  filter(!is.na(country)) %>%
  ## drop observations with a non-positive real_cost
  filter(real_cost > 0)
model_df %>% head()
## # A tibble: 6 x 9
## real_cost country start_year end_year rr length tunnel stations ppp_rate
## <dbl> <fct> <int> <int> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2377. CA 2020 2025 0 5.7 5 6 0.84
## 2 2592 CA 2009 2017 0 8.6 8.6 6 0.81
## 3 4620 CA 2020 2030 0 7.8 7.8 3 0.84
## 4 7201. CA 2020 2030 0 15.5 8.8 15 0.84
## 5 4704 CA 2020 2030 0 7.4 7.4 6 0.84
## 6 4030 NL 2003 2018 0 9.7 7.1 8 1.3
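It is also worth knowing how many observations the country and real_cost filters removed; a one-line check (not shown in the original output):
## number of rows dropped between the raw data and the modeling data
nrow(df) - nrow(model_df)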
set.seed(342)
df_split <- initial_split(model_df)
train <- training(df_split)
df_folds <- bootstraps(train)
train %>% head()
## # A tibble: 6 x 9
## real_cost country start_year end_year rr length tunnel stations ppp_rate
## <dbl> <fct> <int> <int> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2377. CA 2020 2025 0 5.7 5 6 0.84
## 2 2592 CA 2009 2017 0 8.6 8.6 6 0.81
## 3 4620 CA 2020 2030 0 7.8 7.8 3 0.84
## 4 7201. CA 2020 2030 0 15.5 8.8 15 0.84
## 5 4704 CA 2020 2030 0 7.4 7.4 6 0.84
## 6 4030 NL 2003 2018 0 9.7 7.1 8 1.3
## recipe
df_rec <- recipe(real_cost ~ ., data = train) %>%
  step_bagimpute(all_predictors()) %>%
  # assign factor levels not seen in training (e.g. new countries) to a "new" level
  step_novel(country) %>%
  step_mutate(year_diff = end_year - start_year,
              # was the tunneled length at least as long as the total line length?
              t_GE_l = ifelse(tunnel >= length, "yes", "no"),
              t_GE_l = as.factor(t_GE_l)) %>%
  ## remove variables that are no longer needed
  step_rm(length, end_year, start_year) %>%
  ## lump infrequently occurring countries into an "other" level
  step_other(country, threshold = 0.2) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal())
df_rec %>% prep() %>% juice()
## # A tibble: 402 x 8
## tunnel stations ppp_rate real_cost year_diff country_other rr_X1 t_GE_l_yes
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.666 -0.561 0.227 2377. -0.254 1 0 0
## 2 -0.401 -0.561 0.190 2592 0.831 1 0 1
## 3 -0.460 -0.772 0.227 4620 1.56 1 0 1
## 4 -0.386 0.0726 0.227 7201. 1.56 1 0 0
## 5 -0.489 -0.561 0.227 4704 1.56 1 0 1
## 6 -0.511 -0.420 0.792 4030 3.36 1 0 0
## 7 -0.607 -0.631 0.227 3780 0.108 1 0 1
## 8 -0.659 -0.842 0.423 1756 0.470 1 0 1
## 9 -0.725 -0.842 0.423 3600 0.470 1 0 1
## 10 -0.725 -0.842 0.423 2400 0.831 1 0 1
## # ... with 392 more rows
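Because step_other() lumps every country below the 20% threshold into a single "other" level, only the most frequent country (CN, judging by the earlier table) keeps its own level, which is why the juiced data contains a single country_other dummy. To double-check which levels were retained, recipes' tidy() method can be pointed at that step; a sketch, assuming step_other() is the fifth step as written above:
df_rec %>%
  prep() %>%
  tidy(number = 5)  # lists the country levels retained by step_other()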
## holdout (baseline) levels: country - China, rr - 0, t_GE_l - no
# Spec
linear_spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
# Building Workflow
linear_wf <- workflow() %>%
add_recipe(df_rec) %>%
add_model(linear_spec)
## fitting model on resampled data
linear_tune <- fit_resamples(
linear_wf,
resamples = df_folds,
metrics = metric_set(rmse,rsq,mae),
control = control_grid(save_pred = T)
)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
##
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
##
## data_frame
## The following object is masked from 'package:tibble':
##
## data_frame
## Exploring results
linear_tune %>% collect_metrics()
## # A tibble: 3 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 mae standard 1939. 25 40.9 Preprocessor1_Model1
## 2 rmse standard 3225. 25 90.9 Preprocessor1_Model1
## 3 rsq standard 0.603 25 0.0160 Preprocessor1_Model1
linear_tune %>%
collect_predictions() %>%
select(pred = .pred, real_cost) %>%
cor()
## pred real_cost
## pred 1.00 0.76
## real_cost 0.76 1.00
## Applying the model to the test data and evaluating the results
last_fit(linear_wf, df_split) %>%
collect_predictions() %>%
select(pred = .pred, real_cost) %>%
mutate(diff = real_cost - pred) %>%
ggplot(aes(diff))+
geom_boxplot()+
labs(title = "A Boxplot showing the difference between predicted costs and actual costs",
x = "difference",
caption = "there was a big outlier (> $60,000)")+
scale_x_continuous(labels = scales::dollar_format(), limits = c(-15000,20000))+
theme_minimal()+
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# There is a good correlation between the predicted and actual costs on the test data.
# There was only one large outlier (> $60,000).
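Besides eyeballing the residuals, last_fit() also stores test-set metrics, which give a more direct read on out-of-sample performance; a sketch (the numbers themselves are not reproduced here):
## RMSE and R-squared on the held-out test set
last_fit(linear_wf, df_split) %>%
  collect_metrics()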
linear_fit <- fit(linear_wf, model_df)
linear_fit %>% tidy() %>% arrange(p.value)
## # A tibble: 8 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 tunnel 3239. 238. 13.6 2.68e-36
## 2 (Intercept) 3847. 368. 10.5 2.33e-23
## 3 stations 2121. 233. 9.09 1.96e-18
## 4 country_other 2280. 473. 4.82 1.87e- 6
## 5 t_GE_l_yes -1100. 370. -2.97 3.09e- 3
## 6 year_diff 61.5 203. 0.303 7.62e- 1
## 7 rr_X1 219. 737. 0.297 7.66e- 1
## 8 ppp_rate 7.50 192. 0.0391 9.69e- 1
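The coefficient table describes individual terms; for an overall summary (R-squared, residual standard error, AIC) we can reach into the fitted workflow and call broom's glance() on the underlying lm object. A sketch, not part of the original output:
linear_fit %>%
  pull_workflow_fit() %>%
  purrr::pluck("fit") %>%  # the underlying lm object
  glance()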
A graphical representation of these results will help.
linear_fit %>%
tidy() %>%
slice(-1) %>%
mutate(term = reorder(term,-estimate)) %>%
ggplot(aes(estimate,term,col = term))+
geom_vline(xintercept = 0, linetype = 2, col = "midnightblue")+
geom_point(size = 1.8)+
geom_errorbar(aes(xmin = estimate - std.error, xmax = estimate + std.error))+
scale_x_continuous(labels = scales::dollar_format())+
theme_minimal()+
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, size = 19),
plot.subtitle = element_text(hjust = 0.5, size = 13, face = "italic"),
axis.text = element_text(size = 10, face = "italic"))+
labs(title = "Features of the Linear model and their estimates",
subtitle = "given that all other features remain the same",
y = NULL)
## holdout (baseline) levels: country - China, rr - 0, t_GE_l - no
## Important Variables
linear_fit %>%
  pull_workflow_fit() %>%
  vip::vip(aesthetics = list(alpha = 0.8)) +
  theme_minimal()
Now that we have a good understanding of the data, let's see if we can get a better model.
## ranger model
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("regression") %>%
set_engine("ranger")
## Creating a Workflow
ranger_workflow <-
workflow() %>%
add_recipe(df_rec) %>%
add_model(ranger_spec)
## Tuning model
set.seed(51556)
ranger_tune <-
tune_grid(ranger_workflow,
resamples = df_folds,
grid = 10,
metrics = metric_set(rmse,rsq,mae),
control = control_grid(save_pred = T))
## i Creating pre-processing data to finalize unknown parameter: mtry
## Exploring results
ranger_tune %>%
autoplot()+
geom_line()+
theme_light()
It appears that the linear model performed better than the tuned ranger model.
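To back that up with numbers rather than the plot alone, the resampled metrics of the two models can be compared directly; a sketch (the values are not reproduced here):
## best ranger configuration vs the linear model, by RMSE on the bootstrap resamples
ranger_tune %>% show_best(metric = "rmse", n = 1)
linear_tune %>% collect_metrics() %>% filter(.metric == "rmse")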