Goal: Build a regression model to predict the cost (cost_km_millions) of the transit project. Click here for the data.

Import Data

transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-01-05/transit_cost.csv')

## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl  (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(transit_cost)

Data summary
Name	transit_cost
Number of rows	544
Number of columns	20
_______________________
Column type frequency:
character	11
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
country	7	0.99	2	2	56
city	7	0.99	4	16	140
line	7	0.99	2	46	366
start_year	53	0.90	4	9	40
end_year	71	0.87	1	4	36
tunnel_per	32	0.94	5	7	134
source1	12	0.98	4	54	17
currency	7	0.99	2	3	39
real_cost	0	1.00	1	10	534
source2	10	0.98	3	16	12
reference	19	0.97	3	302	350

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
e	7	0.99	7738.76	463.23	7136.00	7403.00	7705.00	7977.00	9510.00	▇▇▂▁▁
rr	8	0.99	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
length	5	0.99	58.34	621.20	0.60	6.50	15.77	29.08	12256.98	▇▁▁▁▁
tunnel	32	0.94	29.38	344.04	0.00	3.40	8.91	21.52	7790.78	▇▁▁▁▁
stations	15	0.97	13.81	13.70	0.00	4.00	10.00	20.00	128.00	▇▁▁▁▁
cost	7	0.99	805438.12	6708033.07	0.00	2289.00	11000.00	27000.00	90000000.00	▇▁▁▁▁
year	7	0.99	2014.91	5.64	1987.00	2012.00	2016.00	2019.00	2027.00	▁▁▂▇▂
ppp_rate	9	0.98	0.66	0.87	0.00	0.24	0.26	1.00	5.00	▇▂▁▁▁
cost_km_millions	2	1.00	232.98	257.22	7.79	134.86	181.25	241.43	3928.57	▇▁▁▁▁

data <- transit_cost %>%
    
    # Treat missing values
    select(-cost, -currency, -source2, -source1, -reference, -line) %>%
    mutate(across(c(start_year, end_year, real_cost), as.numeric)) %>%
    na.omit() %>%
    
     # Convert character columns to factors
    mutate(across(where(is.character), as.factor)) %>%
    
    # Convert rr to a factor
    mutate(rr = factor(rr)) %>%
    
    # log transform variables with pos-skewed distributions
    mutate(cost_km_millions = log(cost_km_millions))

## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(c(start_year, end_year, real_cost), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.

skimr::skim(data)

Data summary
Name	data
Number of rows	436
Number of columns	14
_______________________
Column type frequency:
factor	4
numeric	10
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
country	1	FALSE	55	CN: 169, IN: 27, TR: 20, ES: 15
city	1	FALSE	135	Bei: 21, Sha: 18, Ist: 15, Mum: 13
rr	1	FALSE	2	0: 404, 1: 32
tunnel_per	1	FALSE	118	100: 245, 0.0: 51, 20.: 3, 35.: 3

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
e	1	7694.72	482.20	7136.00	7354.75	7589.50	7923.50	9510.00	▇▅▂▁▁
start_year	1	2012.99	6.89	1982.00	2009.00	2015.00	2018.00	2025.00	▁▁▂▇▇
end_year	1	2018.88	6.49	1987.00	2016.00	2020.00	2023.00	2030.00	▁▁▂▇▆
length	1	20.59	21.93	0.60	6.10	14.75	28.20	200.00	▇▁▁▁▁
tunnel	1	13.64	15.50	0.00	3.27	8.40	20.00	160.00	▇▁▁▁▁
stations	1	13.77	14.03	0.00	4.00	10.00	20.00	128.00	▇▁▁▁▁
year	1	2014.51	5.91	1987.00	2012.00	2016.00	2018.00	2027.00	▁▁▂▇▂
ppp_rate	1	0.71	0.88	0.00	0.24	0.27	1.25	5.00	▇▂▁▁▁
real_cost	1	4208.47	4866.52	66.25	1168.71	2977.92	5540.55	45604.00	▇▁▁▁▁
cost_km_millions	1	5.23	0.63	2.05	4.89	5.22	5.49	8.28	▁▁▇▁▁

Explore Data

Identify good predictors.

Length

data %>%
    ggplot(aes(cost_km_millions, length)) +
    scale_y_log10() +
    geom_point()

Tunnel

data %>%
    ggplot(aes(cost_km_millions, tunnel)) +
    scale_y_log10() +
    geom_point()

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

Stations

data %>%
    ggplot(aes(cost_km_millions, as.factor(year))) +
    geom_boxplot()

city

data %>%
    
    group_by(city) %>%
    filter(n() > 5) %>%
    ungroup() %>%
    
    # Plot
    ggplot(aes(cost_km_millions, fct_reorder(city, cost_km_millions))) +
    geom_point() +
    labs(y = "City of Transit Project")

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-e) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 436
## Columns: 88
## $ country__BG                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__CA                                         <dbl> 1, 1, 1, 1, 1, 0, …
## $ country__CN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__DE                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__ES                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__FR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__IN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__IT                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__JP                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__KR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__RU                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__SA                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__SE                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TH                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TW                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__US                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__VN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER`                                   <dbl> 0, 0, 0, 0, 0, 1, …
## $ city__Bangkok                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__New_York                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Paris                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                                       <dbl> 0, 1, 1, 1, 1, 0, …
## $ city__Wuhan                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                                      <dbl> 1, 0, 0, 0, 0, 1, …
## $ `start_year__-Inf_2009`                             <dbl> 0, 1, 0, 0, 0, 1, …
## $ start_year__2009_2015                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2015_2018                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2018_Inf                                <dbl> 1, 0, 1, 1, 1, 0, …
## $ `end_year__-Inf_2016`                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2016_2020                                 <dbl> 0, 1, 0, 0, 0, 1, …
## $ end_year__2020_2023                                 <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2023_Inf                                  <dbl> 1, 0, 1, 1, 1, 0, …
## $ rr__0                                               <dbl> 1, 1, 1, 1, 1, 1, …
## $ rr__1                                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1`                                  <dbl> 1, 0, 0, 0, 0, 0, …
## $ length__6.1_14.75                                   <dbl> 0, 1, 1, 0, 1, 1, …
## $ length__14.75_28.2                                  <dbl> 0, 0, 0, 1, 0, 0, …
## $ length__28.2_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%`                                 <dbl> 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%`                               <dbl> 0, 1, 1, 0, 1, 0, …
## $ `tunnel_per__-OTHER`                                <dbl> 1, 0, 0, 1, 0, 1, …
## $ `tunnel__-Inf_3.275`                                <dbl> 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.275_8.4                                   <dbl> 1, 0, 1, 0, 1, 1, …
## $ tunnel__8.4_20                                      <dbl> 0, 1, 0, 1, 0, 0, …
## $ tunnel__20_Inf                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                                  <dbl> 0, 0, 1, 0, 0, 0, …
## $ stations__4_10                                      <dbl> 1, 1, 0, 0, 1, 1, …
## $ stations__10_20                                     <dbl> 0, 0, 0, 1, 0, 0, …
## $ stations__20_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                                   <dbl> 0, 0, 0, 0, 0, 1, …
## $ year__2012_2016                                     <dbl> 0, 1, 0, 0, 0, 0, …
## $ year__2016_2018                                     <dbl> 1, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                                      <dbl> 0, 0, 0, 1, 1, 0, …
## $ `ppp_rate__-Inf_0.2379`                             <dbl> 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266                              <dbl> 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25                                <dbl> 1, 1, 1, 1, 1, 0, …
## $ ppp_rate__1.25_Inf                                  <dbl> 0, 0, 0, 0, 0, 1, …
## $ `real_cost__-Inf_1168.71`                           <dbl> 0, 0, 0, 0, 0, 0, …
## $ real_cost__1168.71_2977.92                          <dbl> 1, 1, 0, 0, 0, 0, …
## $ real_cost__2977.92_5540.5525                        <dbl> 0, 0, 1, 0, 1, 1, …
## $ real_cost__5540.5525_Inf                            <dbl> 0, 0, 0, 1, 0, 0, …
## $ `cost_km_millions__-Inf_4.89022561868544`           <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__4.89022561868544_5.21539741232945 <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__5.21539741232945_5.49422035213027 <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__5.49422035213027_Inf              <dbl> 1, 1, 1, 1, 1, 1, …

# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(cost_km_millions__5.49422035213027_Inf)

data_corr_tbl

## # A tibble: 88 × 3
##    feature          bin                               correlation
##    <fct>            <chr>                                   <dbl>
##  1 cost_km_millions 5.49422035213027_Inf                    1    
##  2 cost_km_millions -Inf_4.89022561868544                  -0.333
##  3 cost_km_millions 4.89022561868544_5.21539741232945      -0.333
##  4 cost_km_millions 5.21539741232945_5.49422035213027      -0.333
##  5 country          US                                      0.304
##  6 country          CN                                     -0.264
##  7 real_cost        -Inf_1168.71                           -0.211
##  8 country          -OTHER                                  0.196
##  9 city             New_York                                0.187
## 10 real_cost        5540.5525_Inf                           0.180
## # ℹ 78 more rows

# Step 3: Plot 
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: ggrepel: 44 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

CodeAlong1

Brady Martin

2025-02-13

Import Data

Explore Data