Goal: To figure out how to deliver more high-capacity transit projects for a fraction of the cost in countries like the United States. click here for the data.
transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')
## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(transit_cost)
Name | transit_cost |
Number of rows | 544 |
Number of columns | 20 |
_______________________ | |
Column type frequency: | |
character | 11 |
numeric | 9 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
country | 7 | 0.99 | 2 | 2 | 0 | 56 | 0 |
city | 7 | 0.99 | 4 | 16 | 0 | 140 | 0 |
line | 7 | 0.99 | 2 | 46 | 0 | 366 | 0 |
start_year | 53 | 0.90 | 4 | 9 | 0 | 40 | 0 |
end_year | 71 | 0.87 | 1 | 4 | 0 | 36 | 0 |
tunnel_per | 32 | 0.94 | 5 | 7 | 0 | 134 | 0 |
source1 | 12 | 0.98 | 4 | 54 | 0 | 17 | 0 |
currency | 7 | 0.99 | 2 | 3 | 0 | 39 | 0 |
real_cost | 0 | 1.00 | 1 | 10 | 0 | 534 | 0 |
source2 | 10 | 0.98 | 3 | 16 | 0 | 12 | 0 |
reference | 19 | 0.97 | 3 | 302 | 0 | 350 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
e | 7 | 0.99 | 7738.76 | 463.23 | 7136.00 | 7403.00 | 7705.00 | 7977.00 | 9510.00 | ▇▇▂▁▁ |
rr | 8 | 0.99 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
length | 5 | 0.99 | 58.34 | 621.20 | 0.60 | 6.50 | 15.77 | 29.08 | 12256.98 | ▇▁▁▁▁ |
tunnel | 32 | 0.94 | 29.38 | 344.04 | 0.00 | 3.40 | 8.91 | 21.52 | 7790.78 | ▇▁▁▁▁ |
stations | 15 | 0.97 | 13.81 | 13.70 | 0.00 | 4.00 | 10.00 | 20.00 | 128.00 | ▇▁▁▁▁ |
cost | 7 | 0.99 | 805438.12 | 6708033.07 | 0.00 | 2289.00 | 11000.00 | 27000.00 | 90000000.00 | ▇▁▁▁▁ |
year | 7 | 0.99 | 2014.91 | 5.64 | 1987.00 | 2012.00 | 2016.00 | 2019.00 | 2027.00 | ▁▁▂▇▂ |
ppp_rate | 9 | 0.98 | 0.66 | 0.87 | 0.00 | 0.24 | 0.26 | 1.00 | 5.00 | ▇▂▁▁▁ |
cost_km_millions | 2 | 1.00 | 232.98 | 257.22 | 7.79 | 134.86 | 181.25 | 241.43 | 3928.57 | ▇▁▁▁▁ |
subway <- transit_cost %>%
mutate(realcost = as.numeric(real_cost)) %>%
select(-reference, -source1, -source2, -real_cost) %>%
na.omit() %>%
mutate(cost = log(realcost))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `realcost = as.numeric(real_cost)`.
## Caused by warning:
## ! NAs introduced by coercion
glimpse(subway)
## Rows: 437
## Columns: 17
## $ e <dbl> 7136, 7137, 7138, 7139, 7144, 7145, 7146, 7147, 7152,…
## $ country <chr> "CA", "CA", "CA", "CA", "CA", "NL", "CA", "US", "US",…
## $ city <chr> "Vancouver", "Toronto", "Toronto", "Toronto", "Toront…
## $ line <chr> "Broadway", "Vaughan", "Scarborough", "Ontario", "Yon…
## $ start_year <chr> "2020", "2009", "2020", "2020", "2020", "2003", "2020…
## $ end_year <chr> "2025", "2017", "2030", "2030", "2030", "2018", "2026…
## $ rr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ length <dbl> 5.7, 8.6, 7.8, 15.5, 7.4, 9.7, 5.8, 5.1, 4.2, 4.2, 6.…
## $ tunnel_per <chr> "87.72%", "100.00%", "100.00%", "57.00%", "100.00%", …
## $ tunnel <dbl> 5.0, 8.6, 7.8, 8.8, 7.4, 7.1, 5.8, 5.1, 4.2, 4.2, 6.3…
## $ stations <dbl> 6, 6, 3, 15, 6, 8, 5, 2, 2, 2, 3, 3, 4, 7, 13, 4, 4, …
## $ cost <dbl> 7.773679, 7.860185, 8.438150, 8.882020, 8.456168, 8.3…
## $ currency <chr> "CAD", "CAD", "CAD", "CAD", "CAD", "EUR", "CAD", "USD…
## $ year <dbl> 2018, 2013, 2018, 2019, 2020, 2009, 2018, 2012, 2023,…
## $ ppp_rate <dbl> 0.840, 0.810, 0.840, 0.840, 0.840, 1.300, 0.840, 1.00…
## $ cost_km_millions <dbl> 417.05263, 301.39535, 592.30769, 464.60129, 635.67568…
## $ realcost <dbl> 2377.200, 2592.000, 4620.000, 7201.320, 4704.000, 403…
Identify good predictors.
EDA Shortcut
# Step 1 Prepare
data_binarized_tbl <- subway %>%
select(-e) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 437
## Columns: 157
## $ country__BG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__CA <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ country__CN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__DE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__ES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__FR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__JP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__KR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__RU <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TW <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__US <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ country__VN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER` <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Bangkok <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER` <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ line__Line_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_1_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `line__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ start_year__2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2003 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ start_year__2005 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2006 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2009 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ start_year__2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2020 <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ start_year__2021 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `start_year__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2009 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2017 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2018 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2021 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2022 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2024 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2025 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2026 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ end_year__2027 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ end_year__2028 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2029 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2030 <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `end_year__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rr__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1` <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__6.1_15 <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__15_28.2 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__28.2_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%` <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, …
## $ `tunnel_per__-OTHER` <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `tunnel__-Inf_3.3` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.3_8.4 <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.4_20 <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4` <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10 <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.0647590277918` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__7.0647590277918_7.99962488678285 <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ cost__7.99962488678285_8.63625718407456 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ cost__8.63625718407456_Inf <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ currency__CAD <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ currency__CNY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__EUR <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ currency__INR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__JPY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__KRW <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__RUB <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__SEK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__THB <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__TWD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__USD <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ currency__VND <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `currency__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012` <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018 <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25 <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `cost_km_millions__-Inf_133.1804348` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__133.1804348_184.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__184.2_244.31 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__244.31_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `realcost__-Inf_1170` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ realcost__1170_2979.84 <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ realcost__2979.84_5632.21 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ realcost__5632.21_Inf <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
# Step 2 Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(realcost__5632.21_Inf)
data_corr_tbl %>% glimpse()
## Rows: 157
## Columns: 3
## $ feature <fct> cost, realcost, length, stations, tunnel, cost, realcost, …
## $ bin <chr> "8.63625718407456_Inf", "5632.21_Inf", "28.2_Inf", "20_Inf…
## $ correlation <dbl> 1.0000000, 1.0000000, 0.6628447, 0.5829395, 0.5204007, -0.…
# Step 3 Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
split data
# data <- sample_n(data, 100)
# Split into train and test data set
set.seed(1234)
data_split <- rsample::initial_split(subway)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data set for cross validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [294/33]> Fold01
## 2 <split [294/33]> Fold02
## 3 <split [294/33]> Fold03
## 4 <split [294/33]> Fold04
## 5 <split [294/33]> Fold05
## 6 <split [294/33]> Fold06
## 7 <split [294/33]> Fold07
## 8 <split [295/32]> Fold08
## 9 <split [295/32]> Fold09
## 10 <split [295/32]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.3.3
usemodels::use_xgboost(realcost ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = realcost ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(24468)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
xgboost_recipe <-
recipe(formula = realcost ~ ., data = data_train) %>%
recipes::update_role(e, new_role = "id variable") %>%
step_other(country, city, line, currency, tunnel_per, start_year, end_year) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_log(cost_km_millions) %>%
step_YeoJohnson(length, tunnel, cost)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 327
## Columns: 42
## $ e <dbl> 7808, 7945, 8177, 7338, 7360, 8139, 7408, 9461, 81…
## $ rr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ length <dbl> 2.476769, 4.430932, 4.663634, 4.521415, 1.816155, …
## $ tunnel <dbl> 2.6027078, 4.8189022, 2.2209108, 4.9247159, 0.0000…
## $ stations <dbl> 7, 17, 20, 28, 3, 4, 1, 30, 10, 10, 3, 20, 38, 13,…
## $ cost <dbl> 72.64753, 101.40845, 111.52777, 96.63319, 45.97803…
## $ year <dbl> 2019, 2015, 2013, 2016, 2012, 2005, 2011, 2014, 20…
## $ ppp_rate <dbl> 0.2382, 0.2583, 1.8200, 1.7000, 1.3000, 0.3517, 1.…
## $ cost_km_millions <dbl> 5.423672, 5.178520, 5.410202, 4.926078, 4.595601, …
## $ realcost <dbl> 1775.13, 6174.12, 9103.64, 5100.00, 416.00, 218.89…
## $ country_CN <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,…
## $ country_IN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ country_other <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,…
## $ city_Beijing <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_other <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ line_Line.3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ line_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ start_year_X2013 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2014 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2016 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,…
## $ start_year_X2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ start_year_X2019 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ start_year_X2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ start_year_other <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
## $ end_year_X2019 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_X2020 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ end_year_X2021 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,…
## $ end_year_X2022 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ end_year_X2023 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_X2024 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_X2025 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ end_year_other <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ tunnel_per_X0.00. <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ tunnel_per_X100.00. <dbl> 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,…
## $ tunnel_per_other <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,…
## $ currency_CNY <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,…
## $ currency_EUR <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,…
## $ currency_INR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ currency_USD <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ currency_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,…
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), learn_rate = tune(),mtry = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(24468)
xgboost_tune <-
tune_grid(xgboost_workflow, resamples = data_cv, grid = 7)
## i Creating pre-processing data to finalize unknown parameter: mtry
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 34 510 10 0.00849 rmse standard 769. 10 245. Preproces…
## 2 20 1590 4 0.00246 rmse standard 823. 10 187. Preproces…
## 3 12 1100 15 0.0250 rmse standard 1385. 10 277. Preproces…
## 4 25 285 24 0.0943 rmse standard 1800. 10 306. Preproces…
## 5 14 733 33 0.163 rmse standard 2129. 10 345. Preproces…
# Update the model by selecting the best hyper parameter
xgboost_fw <- tune::finalize_workflow(xgboost_workflow, tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2156. Preprocessor1_Model1
## 2 rsq standard 0.856 Preprocessor1_Model1
tune::collect_predictions(data_fit) %>%
ggplot(aes(realcost, .pred)) +
geom_point(alpha = 0.4, fill = "lavenderblush") +
geom_abline(lty = 2, color = "red3") +
coord_fixed()
# Make predictions