Goal: Figure out how to deliver more high-capacity transit projects at a fraction of the cost in countries like the United States. Click here for the data.

Import Data

# Download the TidyTuesday (2021-01-05) transit cost data set.
transit_cost <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv"
)
## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl  (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column (type, missing values, distribution) to see what
# needs cleaning before modelling.
skimr::skim(transit_cost) 
Data summary
Name transit_cost
Number of rows 544
Number of columns 20
_______________________
Column type frequency:
character 11
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 7 0.99 2 2 0 56 0
city 7 0.99 4 16 0 140 0
line 7 0.99 2 46 0 366 0
start_year 53 0.90 4 9 0 40 0
end_year 71 0.87 1 4 0 36 0
tunnel_per 32 0.94 5 7 0 134 0
source1 12 0.98 4 54 0 17 0
currency 7 0.99 2 3 0 39 0
real_cost 0 1.00 1 10 0 534 0
source2 10 0.98 3 16 0 12 0
reference 19 0.97 3 302 0 350 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e 7 0.99 7738.76 463.23 7136.00 7403.00 7705.00 7977.00 9510.00 ▇▇▂▁▁
rr 8 0.99 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
length 5 0.99 58.34 621.20 0.60 6.50 15.77 29.08 12256.98 ▇▁▁▁▁
tunnel 32 0.94 29.38 344.04 0.00 3.40 8.91 21.52 7790.78 ▇▁▁▁▁
stations 15 0.97 13.81 13.70 0.00 4.00 10.00 20.00 128.00 ▇▁▁▁▁
cost 7 0.99 805438.12 6708033.07 0.00 2289.00 11000.00 27000.00 90000000.00 ▇▁▁▁▁
year 7 0.99 2014.91 5.64 1987.00 2012.00 2016.00 2019.00 2027.00 ▁▁▂▇▂
ppp_rate 9 0.98 0.66 0.87 0.00 0.24 0.26 1.00 5.00 ▇▂▁▁▁
cost_km_millions 2 1.00 232.98 257.22 7.79 134.86 181.25 241.43 3928.57 ▇▁▁▁▁
# Add a numeric copy of real_cost (read in as text). Entries that are not
# numbers become NA, which is why as.numeric() warns about coercion here.
transit_cost1 <- mutate(transit_cost, realcost = as.numeric(real_cost))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `realcost = as.numeric(real_cost)`.
## Caused by warning:
## ! NAs introduced by coercion
# Modelling data set: drop the free-text source/reference columns, keep only
# complete rows, and put the real cost on the log scale.
# NOTE(review): this overwrites the original `cost` column with
# log(realcost), so `cost` is now a function of the outcome - confirm intended.
subway <- transit_cost1 %>%
  select(-c(reference, source1, source2)) %>%
  na.omit() %>%
  mutate(cost = log(realcost))

glimpse(subway)
## Rows: 437
## Columns: 18
## $ e                <dbl> 7136, 7137, 7138, 7139, 7144, 7145, 7146, 7147, 7152,…
## $ country          <chr> "CA", "CA", "CA", "CA", "CA", "NL", "CA", "US", "US",…
## $ city             <chr> "Vancouver", "Toronto", "Toronto", "Toronto", "Toront…
## $ line             <chr> "Broadway", "Vaughan", "Scarborough", "Ontario", "Yon…
## $ start_year       <chr> "2020", "2009", "2020", "2020", "2020", "2003", "2020…
## $ end_year         <chr> "2025", "2017", "2030", "2030", "2030", "2018", "2026…
## $ rr               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ length           <dbl> 5.7, 8.6, 7.8, 15.5, 7.4, 9.7, 5.8, 5.1, 4.2, 4.2, 6.…
## $ tunnel_per       <chr> "87.72%", "100.00%", "100.00%", "57.00%", "100.00%", …
## $ tunnel           <dbl> 5.0, 8.6, 7.8, 8.8, 7.4, 7.1, 5.8, 5.1, 4.2, 4.2, 6.3…
## $ stations         <dbl> 6, 6, 3, 15, 6, 8, 5, 2, 2, 2, 3, 3, 4, 7, 13, 4, 4, …
## $ cost             <dbl> 7.773679, 7.860185, 8.438150, 8.882020, 8.456168, 8.3…
## $ currency         <chr> "CAD", "CAD", "CAD", "CAD", "CAD", "EUR", "CAD", "USD…
## $ year             <dbl> 2018, 2013, 2018, 2019, 2020, 2009, 2018, 2012, 2023,…
## $ ppp_rate         <dbl> 0.840, 0.810, 0.840, 0.840, 0.840, 1.300, 0.840, 1.00…
## $ real_cost        <chr> "2377.2", "2592", "4620", "7201.32", "4704", "4030", …
## $ cost_km_millions <dbl> 417.05263, 301.39535, 592.30769, 464.60129, 635.67568…
## $ realcost         <dbl> 2377.200, 2592.000, 4620.000, 7201.320, 4704.000, 403…

Explore Data

Identify good predictors.

EDA Shortcut

# Step 1: Prepare - convert every column except the id `e` into binary
# (0/1) indicator features

data_binarized_tbl <- binarize(select(subway, -e))

glimpse(data_binarized_tbl)
## Rows: 437
## Columns: 159
## $ country__BG                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__CA                             <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ country__CN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__DE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__ES                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__FR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IT                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__JP                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__KR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__RU                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SA                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TH                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TW                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__US                             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ country__VN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Bangkok                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                           <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                          <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ line__Line_1                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_1_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_12                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_6                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_7                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Phase_1                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `line__-OTHER`                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ start_year__2000                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2001                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2003                        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ start_year__2005                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2006                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2007                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2008                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2009                        <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ start_year__2010                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2011                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2012                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2013                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2014                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2016                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2017                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2018                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2019                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2020                        <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ start_year__2021                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `start_year__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2000                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2008                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2009                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2010                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2011                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2012                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2013                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2014                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2015                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2016                          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2017                          <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2018                          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2019                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2020                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2021                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2022                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2024                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2025                          <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2026                          <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ end_year__2027                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ end_year__2028                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2029                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2030                          <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `end_year__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rr__0                                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1`                      <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__6.1_15                          <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__15_28.2                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__28.2_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%`                   <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, …
## $ `tunnel_per__-OTHER`                    <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `tunnel__-Inf_3.3`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.3_8.4                         <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.4_20                          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                      <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10                          <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.0647590277918`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__7.0647590277918_7.99962488678285  <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ cost__7.99962488678285_8.63625718407456 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ cost__8.63625718407456_Inf              <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ currency__CAD                           <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ currency__CNY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__EUR                           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ currency__INR                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__JPY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__KRW                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__RUB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__SEK                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__THB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__TWD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__USD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ currency__VND                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `currency__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016                         <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018                         <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                          <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25                    <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ real_cost__2400                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ `real_cost__-OTHER`                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ `cost_km_millions__-Inf_133.1804348`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__133.1804348_184.2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__184.2_244.31          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__244.31_Inf            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `realcost__-Inf_1170`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ realcost__1170_2979.84                  <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ realcost__2979.84_5632.21               <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ realcost__5632.21_Inf                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate - correlation of every binary feature with the
# highest real-cost bin
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = realcost__5632.21_Inf
)

glimpse(data_corr_tbl)
## Rows: 159
## Columns: 3
## $ feature     <fct> cost, realcost, length, stations, tunnel, cost, realcost, …
## $ bin         <chr> "8.63625718407456_Inf", "5632.21_Inf", "28.2_Inf", "20_Inf…
## $ correlation <dbl> 1.0000000, 1.0000000, 0.6628447, 0.5829395, 0.5204007, -0.…
# Step 3: Plot - draw the correlation funnel
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Preprocess Data

Build Models

Split data

# data <- sample_n(data, 100)

# Hold out a test set (initial_split() default: 3/4 training, 1/4 testing)
set.seed(1234)
data_split <- rsample::initial_split(subway, prop = 3 / 4)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

# Resample the training set into 10 folds for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train, v = 10)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [294/33]> Fold01
##  2 <split [294/33]> Fold02
##  3 <split [294/33]> Fold03
##  4 <split [294/33]> Fold04
##  5 <split [294/33]> Fold05
##  6 <split [294/33]> Fold06
##  7 <split [294/33]> Fold07
##  8 <split [295/32]> Fold08
##  9 <split [295/32]> Fold09
## 10 <split [295/32]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.3.3
# Print boilerplate recipe / model spec / workflow / tuning code for this
# formula. The printed template is only a starting point; it is adapted by
# hand in the code that follows.
usemodels::use_xgboost(realcost ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = realcost ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(24468)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe

# Preprocessing recipe for predicting realcost.
# xgboost needs an all-numeric predictor matrix, so every text column is
# dropped, parsed to a number, or dummy-encoded below. Leaving start_year,
# end_year, tunnel_per, currency and real_cost as text is what made every
# model fail during tuning.
xgboost_recipe <-
  recipe(formula = realcost ~ ., data = data_train) %>%
  # Keep the row identifier in the data but out of the predictor set.
  recipes::update_role(e, new_role = "id variable") %>%
  # real_cost is the outcome stored as text and cost is log(realcost); both
  # leak the outcome, so neither may be used as a predictor.
  step_rm(real_cost, cost) %>%
  # Parse numbers that are stored as text. as.character() protects against
  # the columns having been converted to factors before this step runs
  # (as.numeric() on a factor returns level codes, not values). Year entries
  # that are not plain numbers become NA, hence the narrowly scoped
  # suppressWarnings(); xgboost accepts missing predictor values.
  step_mutate(
    start_year = suppressWarnings(as.numeric(as.character(start_year))),
    end_year = suppressWarnings(as.numeric(as.character(end_year))),
    tunnel_per = as.numeric(
      sub("%", "", as.character(tunnel_per), fixed = TRUE)
    )
  ) %>%
  # Pool rare levels, then one-hot encode every remaining categorical
  # predictor (currency was previously left un-encoded).
  step_other(country, city, line, currency) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # NOTE(review): cost_km_millions looks like a per-km version of the
  # outcome; together with length it may reveal realcost - confirm it is a
  # legitimate predictor.
  step_log(cost_km_millions) %>%
  step_YeoJohnson(length, tunnel)

 
# Boosted-tree regression model fitted with xgboost; the number of trees,
# the minimum node size and the learning rate are left for tuning.
xgboost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Bundle the model specification and the preprocessing recipe so they are
# fitted and tuned together.
xgboost_workflow <-
  workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)

# Evaluate 5 candidate parameter sets on the cross-validation folds
# (seeded so the candidate grid is reproducible).
set.seed(24468)
xgboost_tune <- tune_grid(
  object = xgboost_workflow,
  resamples = data_cv,
  grid = 5
)
## → A | error:   Some columns are non-numeric. The data cannot be converted to numeric matrix: 'start_year', 'end_year', 'tunnel_per', 'currency', 'real_cost'.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x4

There were issues with some computations   A: x6

There were issues with some computations   A: x8

There were issues with some computations   A: x10

There were issues with some computations   A: x12

There were issues with some computations   A: x16

There were issues with some computations   A: x17

There were issues with some computations   A: x20

There were issues with some computations   A: x21

There were issues with some computations   A: x26

There were issues with some computations   A: x29

There were issues with some computations   A: x31

There were issues with some computations   A: x35

There were issues with some computations   A: x38

There were issues with some computations   A: x42

There were issues with some computations   A: x46

There were issues with some computations   A: x47

There were issues with some computations   A: x48

There were issues with some computations   A: x50
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.

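I'm not sure why the models failed; I might need help. (The error message above points at the non-numeric columns start_year, end_year, tunnel_per, currency, and real_cost, which cannot be converted to the numeric matrix xgboost requires.)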

Evaluate Models

Make predictions