Goal: Build a regression model to predict the cost (cost_km_millions) of the transit project. Click here for the data.

Import Data

transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-01-05/transit_cost.csv')
## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl  (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(transit_cost)
Data summary
Name transit_cost
Number of rows 544
Number of columns 20
_______________________
Column type frequency:
character 11
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 7 0.99 2 2 0 56 0
city 7 0.99 4 16 0 140 0
line 7 0.99 2 46 0 366 0
start_year 53 0.90 4 9 0 40 0
end_year 71 0.87 1 4 0 36 0
tunnel_per 32 0.94 5 7 0 134 0
source1 12 0.98 4 54 0 17 0
currency 7 0.99 2 3 0 39 0
real_cost 0 1.00 1 10 0 534 0
source2 10 0.98 3 16 0 12 0
reference 19 0.97 3 302 0 350 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e 7 0.99 7738.76 463.23 7136.00 7403.00 7705.00 7977.00 9510.00 ▇▇▂▁▁
rr 8 0.99 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
length 5 0.99 58.34 621.20 0.60 6.50 15.77 29.08 12256.98 ▇▁▁▁▁
tunnel 32 0.94 29.38 344.04 0.00 3.40 8.91 21.52 7790.78 ▇▁▁▁▁
stations 15 0.97 13.81 13.70 0.00 4.00 10.00 20.00 128.00 ▇▁▁▁▁
cost 7 0.99 805438.12 6708033.07 0.00 2289.00 11000.00 27000.00 90000000.00 ▇▁▁▁▁
year 7 0.99 2014.91 5.64 1987.00 2012.00 2016.00 2019.00 2027.00 ▁▁▂▇▂
ppp_rate 9 0.98 0.66 0.87 0.00 0.24 0.26 1.00 5.00 ▇▂▁▁▁
cost_km_millions 2 1.00 232.98 257.22 7.79 134.86 181.25 241.43 3928.57 ▇▁▁▁▁
data <- transit_cost %>%
    
    # Treat missing values
    select(-cost, -currency, -source2, -source1, -reference, -line) %>%
    mutate(across(c(start_year, end_year, real_cost), as.numeric)) %>%
    na.omit() %>%
    
     # Convert character columns to factors
    mutate(across(where(is.character), as.factor)) %>%
    
    # Convert rr to a factor
    mutate(rr = factor(rr)) %>%
    
    # log transform variables with pos-skewed distributions
    mutate(cost_km_millions = log(cost_km_millions))
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(c(start_year, end_year, real_cost), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
skimr::skim(data)
Data summary
Name data
Number of rows 436
Number of columns 14
_______________________
Column type frequency:
factor 4
numeric 10
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
country 0 1 FALSE 55 CN: 169, IN: 27, TR: 20, ES: 15
city 0 1 FALSE 135 Bei: 21, Sha: 18, Ist: 15, Mum: 13
rr 0 1 FALSE 2 0: 404, 1: 32
tunnel_per 0 1 FALSE 118 100: 245, 0.0: 51, 20.: 3, 35.: 3

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e 0 1 7694.72 482.20 7136.00 7354.75 7589.50 7923.50 9510.00 ▇▅▂▁▁
start_year 0 1 2012.99 6.89 1982.00 2009.00 2015.00 2018.00 2025.00 ▁▁▂▇▇
end_year 0 1 2018.88 6.49 1987.00 2016.00 2020.00 2023.00 2030.00 ▁▁▂▇▆
length 0 1 20.59 21.93 0.60 6.10 14.75 28.20 200.00 ▇▁▁▁▁
tunnel 0 1 13.64 15.50 0.00 3.27 8.40 20.00 160.00 ▇▁▁▁▁
stations 0 1 13.77 14.03 0.00 4.00 10.00 20.00 128.00 ▇▁▁▁▁
year 0 1 2014.51 5.91 1987.00 2012.00 2016.00 2018.00 2027.00 ▁▁▂▇▂
ppp_rate 0 1 0.71 0.88 0.00 0.24 0.27 1.25 5.00 ▇▂▁▁▁
real_cost 0 1 4208.47 4866.52 66.25 1168.71 2977.92 5540.55 45604.00 ▇▁▁▁▁
cost_km_millions 0 1 5.23 0.63 2.05 4.89 5.22 5.49 8.28 ▁▁▇▁▁

Explore Data

Identify good predictors.

Length

data %>%
    ggplot(aes(cost_km_millions, length)) +
    scale_y_log10() +
    geom_point()

Tunnel

data %>%
    ggplot(aes(cost_km_millions, tunnel)) +
    scale_y_log10() +
    geom_point()
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

Stations

data %>%
    ggplot(aes(cost_km_millions, as.factor(year))) +
    geom_boxplot()

city

data %>%
    
    group_by(city) %>%
    filter(n() > 5) %>%
    ungroup() %>%
    
    # Plot
    ggplot(aes(cost_km_millions, fct_reorder(city, cost_km_millions))) +
    geom_point() +
    labs(y = "City of Transit Project")

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-e) %>%
    binarize()

data_binarized_tbl %>% glimpse()
## Rows: 436
## Columns: 88
## $ country__BG                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__CA                                         <dbl> 1, 1, 1, 1, 1, 0, …
## $ country__CN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__DE                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__ES                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__FR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__IN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__IT                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__JP                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__KR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__RU                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__SA                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__SE                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TH                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TW                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__US                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__VN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER`                                   <dbl> 0, 0, 0, 0, 0, 1, …
## $ city__Bangkok                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__New_York                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Paris                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                                       <dbl> 0, 1, 1, 1, 1, 0, …
## $ city__Wuhan                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                                      <dbl> 1, 0, 0, 0, 0, 1, …
## $ `start_year__-Inf_2009`                             <dbl> 0, 1, 0, 0, 0, 1, …
## $ start_year__2009_2015                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2015_2018                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2018_Inf                                <dbl> 1, 0, 1, 1, 1, 0, …
## $ `end_year__-Inf_2016`                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2016_2020                                 <dbl> 0, 1, 0, 0, 0, 1, …
## $ end_year__2020_2023                                 <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2023_Inf                                  <dbl> 1, 0, 1, 1, 1, 0, …
## $ rr__0                                               <dbl> 1, 1, 1, 1, 1, 1, …
## $ rr__1                                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1`                                  <dbl> 1, 0, 0, 0, 0, 0, …
## $ length__6.1_14.75                                   <dbl> 0, 1, 1, 0, 1, 1, …
## $ length__14.75_28.2                                  <dbl> 0, 0, 0, 1, 0, 0, …
## $ length__28.2_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%`                                 <dbl> 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%`                               <dbl> 0, 1, 1, 0, 1, 0, …
## $ `tunnel_per__-OTHER`                                <dbl> 1, 0, 0, 1, 0, 1, …
## $ `tunnel__-Inf_3.275`                                <dbl> 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.275_8.4                                   <dbl> 1, 0, 1, 0, 1, 1, …
## $ tunnel__8.4_20                                      <dbl> 0, 1, 0, 1, 0, 0, …
## $ tunnel__20_Inf                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                                  <dbl> 0, 0, 1, 0, 0, 0, …
## $ stations__4_10                                      <dbl> 1, 1, 0, 0, 1, 1, …
## $ stations__10_20                                     <dbl> 0, 0, 0, 1, 0, 0, …
## $ stations__20_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                                   <dbl> 0, 0, 0, 0, 0, 1, …
## $ year__2012_2016                                     <dbl> 0, 1, 0, 0, 0, 0, …
## $ year__2016_2018                                     <dbl> 1, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                                      <dbl> 0, 0, 0, 1, 1, 0, …
## $ `ppp_rate__-Inf_0.2379`                             <dbl> 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266                              <dbl> 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25                                <dbl> 1, 1, 1, 1, 1, 0, …
## $ ppp_rate__1.25_Inf                                  <dbl> 0, 0, 0, 0, 0, 1, …
## $ `real_cost__-Inf_1168.71`                           <dbl> 0, 0, 0, 0, 0, 0, …
## $ real_cost__1168.71_2977.92                          <dbl> 1, 1, 0, 0, 0, 0, …
## $ real_cost__2977.92_5540.5525                        <dbl> 0, 0, 1, 0, 1, 1, …
## $ real_cost__5540.5525_Inf                            <dbl> 0, 0, 0, 1, 0, 0, …
## $ `cost_km_millions__-Inf_4.89022561868544`           <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__4.89022561868544_5.21539741232945 <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__5.21539741232945_5.49422035213027 <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__5.49422035213027_Inf              <dbl> 1, 1, 1, 1, 1, 1, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(cost_km_millions__5.49422035213027_Inf)

data_corr_tbl
## # A tibble: 88 × 3
##    feature          bin                               correlation
##    <fct>            <chr>                                   <dbl>
##  1 cost_km_millions 5.49422035213027_Inf                    1    
##  2 cost_km_millions -Inf_4.89022561868544                  -0.333
##  3 cost_km_millions 4.89022561868544_5.21539741232945      -0.333
##  4 cost_km_millions 5.21539741232945_5.49422035213027      -0.333
##  5 country          US                                      0.304
##  6 country          CN                                     -0.264
##  7 real_cost        -Inf_1168.71                           -0.211
##  8 country          -OTHER                                  0.196
##  9 city             New_York                                0.187
## 10 real_cost        5540.5525_Inf                           0.180
## # ℹ 78 more rows
# Step 3: Plot 
data_corr_tbl %>%
    plot_correlation_funnel()
## Warning: ggrepel: 44 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps