Goal: To figure out how to deliver more high-capacity transit projects for a fraction of the cost in countries like the United States. click here for the data.

Import Data

transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')

## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl  (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(transit_cost)

Data summary
Name	transit_cost
Number of rows	544
Number of columns	20
_______________________
Column type frequency:
character	11
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
country	7	0.99	2	2	56
city	7	0.99	4	16	140
line	7	0.99	2	46	366
start_year	53	0.90	4	9	40
end_year	71	0.87	1	4	36
tunnel_per	32	0.94	5	7	134
source1	12	0.98	4	54	17
currency	7	0.99	2	3	39
real_cost	0	1.00	1	10	534
source2	10	0.98	3	16	12
reference	19	0.97	3	302	350

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
e	7	0.99	7738.76	463.23	7136.00	7403.00	7705.00	7977.00	9510.00	▇▇▂▁▁
rr	8	0.99	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
length	5	0.99	58.34	621.20	0.60	6.50	15.77	29.08	12256.98	▇▁▁▁▁
tunnel	32	0.94	29.38	344.04	0.00	3.40	8.91	21.52	7790.78	▇▁▁▁▁
stations	15	0.97	13.81	13.70	0.00	4.00	10.00	20.00	128.00	▇▁▁▁▁
cost	7	0.99	805438.12	6708033.07	0.00	2289.00	11000.00	27000.00	90000000.00	▇▁▁▁▁
year	7	0.99	2014.91	5.64	1987.00	2012.00	2016.00	2019.00	2027.00	▁▁▂▇▂
ppp_rate	9	0.98	0.66	0.87	0.00	0.24	0.26	1.00	5.00	▇▂▁▁▁
cost_km_millions	2	1.00	232.98	257.22	7.79	134.86	181.25	241.43	3928.57	▇▁▁▁▁

subway <- transit_cost %>%
    mutate(realcost = as.numeric(real_cost)) %>%
        select(-reference, -source1, -source2, -real_cost) %>%
            na.omit() %>%
                mutate(cost = log(realcost))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `realcost = as.numeric(real_cost)`.
## Caused by warning:
## ! NAs introduced by coercion

glimpse(subway)

## Rows: 437
## Columns: 17
## $ e                <dbl> 7136, 7137, 7138, 7139, 7144, 7145, 7146, 7147, 7152,…
## $ country          <chr> "CA", "CA", "CA", "CA", "CA", "NL", "CA", "US", "US",…
## $ city             <chr> "Vancouver", "Toronto", "Toronto", "Toronto", "Toront…
## $ line             <chr> "Broadway", "Vaughan", "Scarborough", "Ontario", "Yon…
## $ start_year       <chr> "2020", "2009", "2020", "2020", "2020", "2003", "2020…
## $ end_year         <chr> "2025", "2017", "2030", "2030", "2030", "2018", "2026…
## $ rr               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ length           <dbl> 5.7, 8.6, 7.8, 15.5, 7.4, 9.7, 5.8, 5.1, 4.2, 4.2, 6.…
## $ tunnel_per       <chr> "87.72%", "100.00%", "100.00%", "57.00%", "100.00%", …
## $ tunnel           <dbl> 5.0, 8.6, 7.8, 8.8, 7.4, 7.1, 5.8, 5.1, 4.2, 4.2, 6.3…
## $ stations         <dbl> 6, 6, 3, 15, 6, 8, 5, 2, 2, 2, 3, 3, 4, 7, 13, 4, 4, …
## $ cost             <dbl> 7.773679, 7.860185, 8.438150, 8.882020, 8.456168, 8.3…
## $ currency         <chr> "CAD", "CAD", "CAD", "CAD", "CAD", "EUR", "CAD", "USD…
## $ year             <dbl> 2018, 2013, 2018, 2019, 2020, 2009, 2018, 2012, 2023,…
## $ ppp_rate         <dbl> 0.840, 0.810, 0.840, 0.840, 0.840, 1.300, 0.840, 1.00…
## $ cost_km_millions <dbl> 417.05263, 301.39535, 592.30769, 464.60129, 635.67568…
## $ realcost         <dbl> 2377.200, 2592.000, 4620.000, 7201.320, 4704.000, 403…

Explore Data

Identify good predictors.

EDA Shortcut

# Step 1 Prepare

data_binarized_tbl <- subway %>%
    select(-e) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 437
## Columns: 157
## $ country__BG                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__CA                             <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ country__CN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__DE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__ES                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__FR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IT                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__JP                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__KR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__RU                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SA                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TH                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TW                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__US                             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ country__VN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Bangkok                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                           <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                          <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ line__Line_1                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_1_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_12                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_6                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_7                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Phase_1                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `line__-OTHER`                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ start_year__2000                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2001                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2003                        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ start_year__2005                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2006                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2007                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2008                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2009                        <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ start_year__2010                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2011                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2012                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2013                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2014                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2016                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2017                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2018                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2019                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2020                        <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ start_year__2021                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `start_year__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2000                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2008                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2009                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2010                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2011                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2012                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2013                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2014                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2015                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2016                          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2017                          <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2018                          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2019                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2020                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2021                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2022                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2024                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2025                          <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2026                          <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ end_year__2027                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ end_year__2028                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2029                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2030                          <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `end_year__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rr__0                                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1`                      <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__6.1_15                          <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__15_28.2                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__28.2_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%`                   <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, …
## $ `tunnel_per__-OTHER`                    <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `tunnel__-Inf_3.3`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.3_8.4                         <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.4_20                          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                      <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10                          <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.0647590277918`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__7.0647590277918_7.99962488678285  <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ cost__7.99962488678285_8.63625718407456 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ cost__8.63625718407456_Inf              <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ currency__CAD                           <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ currency__CNY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__EUR                           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ currency__INR                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__JPY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__KRW                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__RUB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__SEK                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__THB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__TWD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__USD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ currency__VND                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `currency__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016                         <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018                         <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                          <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25                    <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `cost_km_millions__-Inf_133.1804348`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__133.1804348_184.2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__184.2_244.31          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__244.31_Inf            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `realcost__-Inf_1170`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ realcost__1170_2979.84                  <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ realcost__2979.84_5632.21               <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ realcost__5632.21_Inf                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …

# Step 2 Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(realcost__5632.21_Inf)

data_corr_tbl %>% glimpse()

## Rows: 157
## Columns: 3
## $ feature     <fct> cost, realcost, length, stations, tunnel, cost, realcost, …
## $ bin         <chr> "8.63625718407456_Inf", "5632.21_Inf", "28.2_Inf", "20_Inf…
## $ correlation <dbl> 1.0000000, 1.0000000, 0.6628447, 0.5829395, 0.5204007, -0.…

# Step 3 Plot 
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Preprocess Data

Build Models

split data

# data <- sample_n(data, 100)

# Split into train and test data set
set.seed(1234)
data_split <- rsample::initial_split(subway)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training data set for cross validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [294/33]> Fold01
##  2 <split [294/33]> Fold02
##  3 <split [294/33]> Fold03
##  4 <split [294/33]> Fold04
##  5 <split [294/33]> Fold05
##  6 <split [294/33]> Fold06
##  7 <split [294/33]> Fold07
##  8 <split [295/32]> Fold08
##  9 <split [295/32]> Fold09
## 10 <split [295/32]> Fold10

library(usemodels)

## Warning: package 'usemodels' was built under R version 4.3.3

usemodels::use_xgboost(realcost ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = realcost ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(24468)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify recipe

xgboost_recipe <- 
  recipe(formula = realcost ~ ., data = data_train) %>% 
 recipes::update_role(e, new_role = "id variable") %>%
    step_other(country, city, line, currency, tunnel_per, start_year, end_year) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_log(cost_km_millions) %>%
    step_YeoJohnson(length, tunnel, cost)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 327
## Columns: 42
## $ e                   <dbl> 7808, 7945, 8177, 7338, 7360, 8139, 7408, 9461, 81…
## $ rr                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ length              <dbl> 2.476769, 4.430932, 4.663634, 4.521415, 1.816155, …
## $ tunnel              <dbl> 2.6027078, 4.8189022, 2.2209108, 4.9247159, 0.0000…
## $ stations            <dbl> 7, 17, 20, 28, 3, 4, 1, 30, 10, 10, 3, 20, 38, 13,…
## $ cost                <dbl> 72.64753, 101.40845, 111.52777, 96.63319, 45.97803…
## $ year                <dbl> 2019, 2015, 2013, 2016, 2012, 2005, 2011, 2014, 20…
## $ ppp_rate            <dbl> 0.2382, 0.2583, 1.8200, 1.7000, 1.3000, 0.3517, 1.…
## $ cost_km_millions    <dbl> 5.423672, 5.178520, 5.410202, 4.926078, 4.595601, …
## $ realcost            <dbl> 1775.13, 6174.12, 9103.64, 5100.00, 416.00, 218.89…
## $ country_CN          <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,…
## $ country_IN          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ country_other       <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,…
## $ city_Beijing        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_other          <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ line_Line.3         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ line_other          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ start_year_X2013    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2014    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2015    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2016    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2017    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,…
## $ start_year_X2018    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ start_year_X2019    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2020    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ start_year_other    <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
## $ end_year_X2019      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_X2020      <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ end_year_X2021      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,…
## $ end_year_X2022      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ end_year_X2023      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_X2024      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_X2025      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_other      <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ tunnel_per_X0.00.   <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ tunnel_per_X100.00. <dbl> 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,…
## $ tunnel_per_other    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,…
## $ currency_CNY        <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,…
## $ currency_EUR        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,…
## $ currency_INR        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ currency_USD        <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ currency_other      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,…

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), learn_rate = tune(),mtry = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(24468)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_cv, grid = 7)

## i Creating pre-processing data to finalize unknown parameter: mtry

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 10
##    mtry trees min_n learn_rate .metric .estimator  mean     n std_err .config   
##   <int> <int> <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>     
## 1    34   510    10    0.00849 rmse    standard    769.    10    245. Preproces…
## 2    20  1590     4    0.00246 rmse    standard    823.    10    187. Preproces…
## 3    12  1100    15    0.0250  rmse    standard   1385.    10    277. Preproces…
## 4    25   285    24    0.0943  rmse    standard   1800.    10    306. Preproces…
## 5    14   733    33    0.163   rmse    standard   2129.    10    345. Preproces…

# Update the model by selecting the best hyper parameter
xgboost_fw <- tune::finalize_workflow(xgboost_workflow, tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)

tune::collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard    2156.    Preprocessor1_Model1
## 2 rsq     standard       0.856 Preprocessor1_Model1

tune::collect_predictions(data_fit) %>%
    ggplot(aes(realcost, .pred)) +
    geom_point(alpha = 0.4, fill = "lavenderblush") +
    geom_abline(lty = 2, color = "red3") +
    coord_fixed()

# Make predictions

Apply : Public Transit Cost of Construction

Kenton Quaglieri

2024-09-26

Import Data

Explore Data

Preprocess Data

Build Models

Evaluate Models