Goal: To figure out how to deliver more high-capacity transit projects for a fraction of the cost in countries like the United States. Click here for the data.

Import Data

# Read the TidyTuesday transit-cost CSV straight from GitHub into a tibble.
transit_cost <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv"
)

## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl  (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print a per-column summary (missing counts, completeness, ranges and
# distributions) to check data quality before modelling.
skimr::skim(transit_cost)

Data summary
Name	transit_cost
Number of rows	544
Number of columns	20
_______________________
Column type frequency:
character	11
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
country	7	0.99	2	2	56
city	7	0.99	4	16	140
line	7	0.99	2	46	366
start_year	53	0.90	4	9	40
end_year	71	0.87	1	4	36
tunnel_per	32	0.94	5	7	134
source1	12	0.98	4	54	17
currency	7	0.99	2	3	39
real_cost	0	1.00	1	10	534
source2	10	0.98	3	16	12
reference	19	0.97	3	302	350

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
e	7	0.99	7738.76	463.23	7136.00	7403.00	7705.00	7977.00	9510.00	▇▇▂▁▁
rr	8	0.99	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
length	5	0.99	58.34	621.20	0.60	6.50	15.77	29.08	12256.98	▇▁▁▁▁
tunnel	32	0.94	29.38	344.04	0.00	3.40	8.91	21.52	7790.78	▇▁▁▁▁
stations	15	0.97	13.81	13.70	0.00	4.00	10.00	20.00	128.00	▇▁▁▁▁
cost	7	0.99	805438.12	6708033.07	0.00	2289.00	11000.00	27000.00	90000000.00	▇▁▁▁▁
year	7	0.99	2014.91	5.64	1987.00	2012.00	2016.00	2019.00	2027.00	▁▁▂▇▂
ppp_rate	9	0.98	0.66	0.87	0.00	0.24	0.26	1.00	5.00	▇▂▁▁▁
cost_km_millions	2	1.00	232.98	257.22	7.79	134.86	181.25	241.43	3928.57	▇▁▁▁▁

# `real_cost` was read as text; add a numeric copy called `realcost`.
# Entries that are not valid numbers become NA, which is what triggers the
# "NAs introduced by coercion" warning.
transit_cost1 <- mutate(transit_cost, realcost = as.numeric(real_cost))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `realcost = as.numeric(real_cost)`.
## Caused by warning:
## ! NAs introduced by coercion

# Build the modelling table: drop the citation columns, keep only rows with
# no missing values, then replace `cost` with the natural log of `realcost`.
# NOTE(review): this overwrites the original `cost` column, so `cost` becomes
# a deterministic function of `realcost` -- confirm this is intended.
subway <- transit_cost1 %>%
  select(-c(reference, source1, source2)) %>%
  na.omit() %>%
  mutate(cost = log(realcost))

subway %>% glimpse()

## Rows: 437
## Columns: 18
## $ e                <dbl> 7136, 7137, 7138, 7139, 7144, 7145, 7146, 7147, 7152,…
## $ country          <chr> "CA", "CA", "CA", "CA", "CA", "NL", "CA", "US", "US",…
## $ city             <chr> "Vancouver", "Toronto", "Toronto", "Toronto", "Toront…
## $ line             <chr> "Broadway", "Vaughan", "Scarborough", "Ontario", "Yon…
## $ start_year       <chr> "2020", "2009", "2020", "2020", "2020", "2003", "2020…
## $ end_year         <chr> "2025", "2017", "2030", "2030", "2030", "2018", "2026…
## $ rr               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ length           <dbl> 5.7, 8.6, 7.8, 15.5, 7.4, 9.7, 5.8, 5.1, 4.2, 4.2, 6.…
## $ tunnel_per       <chr> "87.72%", "100.00%", "100.00%", "57.00%", "100.00%", …
## $ tunnel           <dbl> 5.0, 8.6, 7.8, 8.8, 7.4, 7.1, 5.8, 5.1, 4.2, 4.2, 6.3…
## $ stations         <dbl> 6, 6, 3, 15, 6, 8, 5, 2, 2, 2, 3, 3, 4, 7, 13, 4, 4, …
## $ cost             <dbl> 7.773679, 7.860185, 8.438150, 8.882020, 8.456168, 8.3…
## $ currency         <chr> "CAD", "CAD", "CAD", "CAD", "CAD", "EUR", "CAD", "USD…
## $ year             <dbl> 2018, 2013, 2018, 2019, 2020, 2009, 2018, 2012, 2023,…
## $ ppp_rate         <dbl> 0.840, 0.810, 0.840, 0.840, 0.840, 1.300, 0.840, 1.00…
## $ real_cost        <chr> "2377.2", "2592", "4620", "7201.32", "4704", "4030", …
## $ cost_km_millions <dbl> 417.05263, 301.39535, 592.30769, 464.60129, 635.67568…
## $ realcost         <dbl> 2377.200, 2592.000, 4620.000, 7201.320, 4704.000, 403…

Explore Data

Identify good predictors.

EDA Shortcut

# Step 1 Prepare
# Turn every column except `e` into 0/1 indicator columns (one per category
# or numeric bin) so each feature can be correlated against a target.
data_binarized_tbl <- binarize(select(subway, -e))

glimpse(data_binarized_tbl)

## Rows: 437
## Columns: 159
## $ country__BG                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__CA                             <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ country__CN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__DE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__ES                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__FR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IT                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__JP                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__KR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__RU                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SA                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TH                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TW                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__US                             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ country__VN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Bangkok                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                           <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                          <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ line__Line_1                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_1_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_12                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_6                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_7                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Phase_1                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `line__-OTHER`                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ start_year__2000                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2001                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2003                        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ start_year__2005                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2006                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2007                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2008                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2009                        <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ start_year__2010                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2011                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2012                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2013                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2014                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2016                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2017                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2018                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2019                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2020                        <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ start_year__2021                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `start_year__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2000                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2008                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2009                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2010                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2011                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2012                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2013                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2014                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2015                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2016                          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2017                          <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2018                          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2019                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2020                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2021                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2022                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2024                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2025                          <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2026                          <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ end_year__2027                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ end_year__2028                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2029                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2030                          <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `end_year__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rr__0                                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1`                      <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__6.1_15                          <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__15_28.2                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__28.2_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%`                   <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, …
## $ `tunnel_per__-OTHER`                    <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `tunnel__-Inf_3.3`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.3_8.4                         <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.4_20                          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                      <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10                          <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.0647590277918`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__7.0647590277918_7.99962488678285  <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ cost__7.99962488678285_8.63625718407456 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ cost__8.63625718407456_Inf              <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ currency__CAD                           <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ currency__CNY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__EUR                           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ currency__INR                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__JPY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__KRW                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__RUB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__SEK                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__THB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__TWD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__USD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ currency__VND                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `currency__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016                         <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018                         <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                          <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25                    <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ real_cost__2400                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ `real_cost__-OTHER`                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ `cost_km_millions__-Inf_133.1804348`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__133.1804348_184.2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__184.2_244.31          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__244.31_Inf            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `realcost__-Inf_1170`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ realcost__1170_2979.84                  <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ realcost__2979.84_5632.21               <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ realcost__5632.21_Inf                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …

# Step 2 Correlate
# Correlate each indicator column with the top `realcost` bin.
data_corr_tbl <- correlate(data_binarized_tbl, realcost__5632.21_Inf)

glimpse(data_corr_tbl)

## Rows: 159
## Columns: 3
## $ feature     <fct> cost, realcost, length, stations, tunnel, cost, realcost, …
## $ bin         <chr> "8.63625718407456_Inf", "5632.21_Inf", "28.2_Inf", "20_Inf…
## $ correlation <dbl> 1.0000000, 1.0000000, 0.6628447, 0.5829395, 0.5204007, -0.…

# Step 3 Plot
# Draw the correlation funnel for the table computed in step 2.
plot_correlation_funnel(data_corr_tbl)

## Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Preprocess Data

Build Models

Split data

# data <- sample_n(data, 100)

# Hold out a test set; the seed is fixed so the split is reproducible.
set.seed(1234)
data_split <- rsample::initial_split(subway)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

# Build cross-validation folds from the training rows only, so the test set
# stays untouched during tuning.
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [294/33]> Fold01
##  2 <split [294/33]> Fold02
##  3 <split [294/33]> Fold03
##  4 <split [294/33]> Fold04
##  5 <split [294/33]> Fold05
##  6 <split [294/33]> Fold06
##  7 <split [294/33]> Fold07
##  8 <split [295/32]> Fold08
##  9 <split [295/32]> Fold09
## 10 <split [295/32]> Fold10

library(usemodels)

## Warning: package 'usemodels' was built under R version 4.3.3

# Print starter code (recipe, model spec, workflow, tuning call) for an
# xgboost model of `realcost` on all other columns. The printed template
# contains `stop()` placeholders and must be adapted before it can run.
usemodels::use_xgboost(realcost ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = realcost ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(24468)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify recipe
#
# xgboost needs an all-numeric predictor matrix. The earlier version only
# encoded country/city/line, so the remaining character columns
# (start_year, end_year, tunnel_per, currency, real_cost) reached the engine
# unchanged and every tuning fit failed. Each character column is now
# dropped, parsed or dummy-encoded before the model sees it.
#
# NOTE(review): `cost` is log(realcost), and `cost_km_millions` is presumably
# realcost / length. Both are derived from the outcome, so they will leak the
# answer into the model -- confirm whether they should stay as predictors.
xgboost_recipe <-
  recipe(formula = realcost ~ ., data = data_train) %>%
  # Keep the row id in the data but out of the predictor set.
  recipes::update_role(e, new_role = "id variable") %>%
  # `real_cost` is the outcome stored as text; it must not be a predictor.
  step_rm(real_cost) %>%
  # "87.72%" -> 87.72, so the tunnel share is used as a number.
  step_mutate(
    tunnel_per = as.numeric(
      sub("%", "", as.character(tunnel_per), fixed = TRUE)
    )
  ) %>%
  # Pool rare levels, then one-hot encode every remaining nominal predictor
  # (country, city, line, start_year, end_year, currency).
  step_other(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Drop indicator columns that are constant within a resample.
  step_zv(all_predictors()) %>%
  step_log(cost_km_millions) %>%
  step_YeoJohnson(length, tunnel, cost)

 
# Boosted-tree regression fitted with the xgboost engine; the number of
# trees, minimum node size and learning rate are left for tuning.
xgboost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Bundle the model spec and the preprocessing recipe so they are fitted
# together on each resample.
xgboost_workflow <-
  workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)

# Evaluate 5 candidate hyperparameter combinations on the cross-validation
# folds; the seed fixes the generated grid.
set.seed(24468)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)

## → A | error:   Some columns are non-numeric. The data cannot be converted to numeric matrix: 'start_year', 'end_year', 'tunnel_per', 'currency', 'real_cost'.

## 
There were issues with some computations   A: x1

There were issues with some computations   A: x4

There were issues with some computations   A: x6

There were issues with some computations   A: x8

There were issues with some computations   A: x10

There were issues with some computations   A: x12

There were issues with some computations   A: x16

There were issues with some computations   A: x17

There were issues with some computations   A: x20

There were issues with some computations   A: x21

There were issues with some computations   A: x26

There were issues with some computations   A: x29

There were issues with some computations   A: x31

There were issues with some computations   A: x35

There were issues with some computations   A: x38

There were issues with some computations   A: x42

There were issues with some computations   A: x46

There were issues with some computations   A: x47

There were issues with some computations   A: x48

There were issues with some computations   A: x50

## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.

Apply : Public Transit Cost of Construction

Kenton Quaglieri

2024-09-19

Import Data

Explore Data

Preprocess Data

Build Models

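I am not sure why the models failed, and I might need help. The tuning error above reports that some columns are non-numeric (`start_year`, `end_year`, `tunnel_per`, `currency`, `real_cost`), so the data could not be converted to the numeric matrix that xgboost requires.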

Evaluate Models

Make predictions