Goal: Build a regression model to predict the cost (cost_km_millions) of the transit project. Click here for the data.

Import Data

transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-01-05/transit_cost.csv')

## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl  (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(transit_cost)

Data summary
Name	transit_cost
Number of rows	544
Number of columns	20
_______________________
Column type frequency:
character	11
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
country	7	0.99	2	2	56
city	7	0.99	4	16	140
line	7	0.99	2	46	366
start_year	53	0.90	4	9	40
end_year	71	0.87	1	4	36
tunnel_per	32	0.94	5	7	134
source1	12	0.98	4	54	17
currency	7	0.99	2	3	39
real_cost	0	1.00	1	10	534
source2	10	0.98	3	16	12
reference	19	0.97	3	302	350

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
e	7	0.99	7738.76	463.23	7136.00	7403.00	7705.00	7977.00	9510.00	▇▇▂▁▁
rr	8	0.99	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
length	5	0.99	58.34	621.20	0.60	6.50	15.77	29.08	12256.98	▇▁▁▁▁
tunnel	32	0.94	29.38	344.04	0.00	3.40	8.91	21.52	7790.78	▇▁▁▁▁
stations	15	0.97	13.81	13.70	0.00	4.00	10.00	20.00	128.00	▇▁▁▁▁
cost	7	0.99	805438.12	6708033.07	0.00	2289.00	11000.00	27000.00	90000000.00	▇▁▁▁▁
year	7	0.99	2014.91	5.64	1987.00	2012.00	2016.00	2019.00	2027.00	▁▁▂▇▂
ppp_rate	9	0.98	0.66	0.87	0.00	0.24	0.26	1.00	5.00	▇▂▁▁▁
cost_km_millions	2	1.00	232.98	257.22	7.79	134.86	181.25	241.43	3928.57	▇▁▁▁▁

data <- transit_cost %>%
    
    # Treat missing values
    select(-cost, -currency, -source2, -source1, -reference) %>%
    na.omit() %>%
    
     # Convert character columns to factors
    mutate(across(where(is.character), as.factor)) %>%
    
    # Convert rr to a factor
    mutate(rr = factor(rr)) %>%
    
    # log transform variables with pos-skewed distributions
    mutate(cost_km_millions = log(cost_km_millions))

skimr::skim(data)

Data summary
Name	data
Number of rows	437
Number of columns	15
_______________________
Column type frequency:
factor	8
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
country	1	FALSE	55	CN: 169, IN: 27, TR: 20, ES: 15
city	1	FALSE	135	Bei: 21, Sha: 18, Ist: 15, Mum: 13
line	1	FALSE	322	Lin: 9, Lin: 8, Lin: 8, Lin: 6
start_year	1	FALSE	37	201: 51, 201: 37, 202: 36, 201: 35
end_year	1	FALSE	36	202: 58, 202: 44, 202: 42, 201: 27
rr	1	FALSE	2	0: 405, 1: 32
tunnel_per	1	FALSE	119	100: 245, 0.0: 51, 20.: 3, 35.: 3
real_cost	1	FALSE	429	240: 3, 175: 2, 300: 2, 416: 2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
e	1	7695.93	482.31	7136.00	7355.00	7592.00	7928.00	9510.00	▇▅▂▁▁
length	1	20.60	21.90	0.60	6.10	15.00	28.20	200.00	▇▁▁▁▁
tunnel	1	13.64	15.49	0.00	3.30	8.40	20.00	160.00	▇▁▁▁▁
stations	1	13.79	14.02	0.00	4.00	10.00	20.00	128.00	▇▁▁▁▁
year	1	2014.51	5.91	1987.00	2012.00	2016.00	2018.00	2027.00	▁▁▂▇▂
ppp_rate	1	0.72	0.89	0.00	0.24	0.27	1.25	5.00	▇▂▁▁▁
cost_km_millions	1	5.23	0.63	2.05	4.89	5.22	5.50	8.28	▁▁▇▁▁

Explore Data

Identify good predictors.

Length

data %>%
    ggplot(aes(cost_km_millions, length)) +
    scale_y_log10() +
    geom_point()

Tunnel

data %>%
    ggplot(aes(cost_km_millions, tunnel)) +
    scale_y_log10() +
    geom_point()

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

Stations

data %>%
    ggplot(aes(cost_km_millions, as.factor(year))) +
    geom_boxplot()

city

data %>%
    
    group_by(city) %>%
    filter(n() > 5) %>%
    ungroup() %>%
    
    # Plot
    ggplot(aes(cost_km_millions, fct_reorder(city, cost_km_millions))) +
    geom_point() +
    labs(y = "City of Transit Project")

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-e, -line) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 437
## Columns: 124
## $ country__BG                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__CA                                         <dbl> 1, 1, 1, 1, 1, 0, …
## $ country__CN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__DE                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__ES                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__FR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__IN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__IT                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__JP                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__KR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__RU                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__SA                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__SE                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TH                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TR                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__TW                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__US                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ country__VN                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER`                                   <dbl> 0, 0, 0, 0, 0, 1, …
## $ city__Bangkok                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__New_York                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Paris                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                                        <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                                       <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                                       <dbl> 0, 1, 1, 1, 1, 0, …
## $ city__Wuhan                                         <dbl> 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                                      <dbl> 1, 0, 0, 0, 0, 1, …
## $ start_year__2000                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2001                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2003                                    <dbl> 0, 0, 0, 0, 0, 1, …
## $ start_year__2005                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2006                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2007                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2008                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2009                                    <dbl> 0, 1, 0, 0, 0, 0, …
## $ start_year__2010                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2011                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2012                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2013                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2014                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2015                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2016                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2017                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2018                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2019                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ start_year__2020                                    <dbl> 1, 0, 1, 1, 1, 0, …
## $ start_year__2021                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ `start_year__-OTHER`                                <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2000                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2008                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2009                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2010                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2011                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2012                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2013                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2014                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2015                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2016                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2017                                      <dbl> 0, 1, 0, 0, 0, 0, …
## $ end_year__2018                                      <dbl> 0, 0, 0, 0, 0, 1, …
## $ end_year__2019                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2020                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2021                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2022                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2023                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2024                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2025                                      <dbl> 1, 0, 0, 0, 0, 0, …
## $ end_year__2026                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2027                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2028                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2029                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ end_year__2030                                      <dbl> 0, 0, 1, 1, 1, 0, …
## $ `end_year__-OTHER`                                  <dbl> 0, 0, 0, 0, 0, 0, …
## $ rr__0                                               <dbl> 1, 1, 1, 1, 1, 1, …
## $ rr__1                                               <dbl> 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1`                                  <dbl> 1, 0, 0, 0, 0, 0, …
## $ length__6.1_15                                      <dbl> 0, 1, 1, 0, 1, 1, …
## $ length__15_28.2                                     <dbl> 0, 0, 0, 1, 0, 0, …
## $ length__28.2_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%`                                 <dbl> 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%`                               <dbl> 0, 1, 1, 0, 1, 0, …
## $ `tunnel_per__-OTHER`                                <dbl> 1, 0, 0, 1, 0, 1, …
## $ `tunnel__-Inf_3.3`                                  <dbl> 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.3_8.4                                     <dbl> 1, 0, 1, 0, 1, 1, …
## $ tunnel__8.4_20                                      <dbl> 0, 1, 0, 1, 0, 0, …
## $ tunnel__20_Inf                                      <dbl> 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                                  <dbl> 0, 0, 1, 0, 0, 0, …
## $ stations__4_10                                      <dbl> 1, 1, 0, 0, 1, 1, …
## $ stations__10_20                                     <dbl> 0, 0, 0, 1, 0, 0, …
## $ stations__20_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                                   <dbl> 0, 0, 0, 0, 0, 1, …
## $ year__2012_2016                                     <dbl> 0, 1, 0, 0, 0, 0, …
## $ year__2016_2018                                     <dbl> 1, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                                      <dbl> 0, 0, 0, 1, 1, 0, …
## $ `ppp_rate__-Inf_0.2379`                             <dbl> 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266                              <dbl> 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25                                <dbl> 1, 1, 1, 1, 1, 0, …
## $ ppp_rate__1.25_Inf                                  <dbl> 0, 0, 0, 0, 0, 1, …
## $ real_cost__2400                                     <dbl> 0, 0, 0, 0, 0, 0, …
## $ `real_cost__-OTHER`                                 <dbl> 1, 1, 1, 1, 1, 1, …
## $ `cost_km_millions__-Inf_4.89170486143161`           <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__4.89170486143161_5.21602212382121 <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__5.21602212382121_5.49843791070471 <dbl> 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__5.49843791070471_Inf              <dbl> 1, 1, 1, 1, 1, 1, …

# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(cost_km_millions__5.49843791070471_Inf)

data_corr_tbl

## # A tibble: 124 × 3
##    feature          bin                               correlation
##    <fct>            <chr>                                   <dbl>
##  1 cost_km_millions 5.49843791070471_Inf                    1    
##  2 cost_km_millions -Inf_4.89170486143161                  -0.334
##  3 cost_km_millions 5.21602212382121_5.49843791070471      -0.332
##  4 cost_km_millions 4.89170486143161_5.21602212382121      -0.332
##  5 country          US                                      0.304
##  6 country          CN                                     -0.273
##  7 country          -OTHER                                  0.206
##  8 city             New_York                                0.187
##  9 start_year       2010                                    0.169
## 10 city             -OTHER                                  0.169
## # ℹ 114 more rows

# Step 3: Plot 
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: ggrepel: 89 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split data

#data <- sample_n(data, 100)

# Split into training and testing dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test  <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [294/33]> Fold01
##  2 <split [294/33]> Fold02
##  3 <split [294/33]> Fold03
##  4 <split [294/33]> Fold04
##  5 <split [294/33]> Fold05
##  6 <split [294/33]> Fold06
##  7 <split [294/33]> Fold07
##  8 <split [295/32]> Fold08
##  9 <split [295/32]> Fold09
## 10 <split [295/32]> Fold10

Evaluate Models

usemodels::use_xgboost(cost_km_millions ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = cost_km_millions ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(24468)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify recipe
xgboost_recipe <- 
    recipe(formula = cost_km_millions ~ ., data = data_train) %>% 
    recipes::update_role(e, new_role = "id variable") %>%
    step_tokenize(line) %>%
    step_tokenfilter(line, max_tokens = 100) %>%
    step_tfidf(line) %>%
    step_other(city) %>%
    step_dummy(country, city, one_hot = TRUE) %>%
    step_log(ppp_rate, stations, tunnel, length)
    
    
xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 327
## Columns: 169
## $ e                        <dbl> 7808, 7945, 8177, 7338, 7360, 8139, 7408, 946…
## $ start_year               <fct> 2019, 2016, 2014, 2013, 2010, 2005, 2007, 201…
## $ end_year                 <fct> 2024, 2020, 2017, 2019, 2013, 2008, 2014, 202…
## $ rr                       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ length                   <dbl> 2.0579625, 3.5496174, 3.7062281, 3.6109179, 1…
## $ tunnel_per               <fct> 100.00%, 100.00%, 14.00%, 100.00%, 0.00%, 0.0…
## $ tunnel                   <dbl> 2.0579625, 3.5496174, 1.7404662, 3.6109179, -…
## $ stations                 <dbl> 1.9459101, 2.8332133, 2.9957323, 3.3322045, 1…
## $ year                     <dbl> 2019, 2015, 2013, 2016, 2012, 2005, 2011, 201…
## $ ppp_rate                 <dbl> -1.4346446, -1.3536336, 0.5988365, 0.5306283,…
## $ real_cost                <fct> 1775.13, 6174.12, 9103.64, 5100, 416, 218.89,…
## $ cost_km_millions         <dbl> 5.423672, 5.178520, 5.410202, 4.926078, 4.595…
## $ tfidf_line_1             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_10            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_11            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_12            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_13            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_13a           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_13b           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_14            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_15            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_16            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_17            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_18            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_19            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_2             <dbl> 0.5139313, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_2020          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_2020s         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_22            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_27            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_28            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_2a            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_2b            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_3             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_4             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_4a            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_5             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_5a            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_6             <dbl> 0.000000, 1.670547, 0.000000, 0.000000, 0.000…
## $ tfidf_line_63            <dbl> 0.000000, 0.000000, 0.000000, 2.896507, 0.000…
## $ tfidf_line_7             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_7a            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_8             <dbl> 0.9336722, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_9             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_a             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_access        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_airport       <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_line_and           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_arenastaden   <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_aubervilliers <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_avenue        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_b             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_b1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_bahn          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_barkarby      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_bart          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_bc1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_blue          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_branch        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_c             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_circle        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_circular      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_city          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_east          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_eastern       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_express       <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_line_extension     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_feeder        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_first         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_green         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_half          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_line          <dbl> 0.2256866, 0.4513733, 0.0000000, 0.0000000, 0…
## $ tfidf_line_lines         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_link          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_m1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_m11           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_m3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_m4            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_m7            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_m8            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_metro         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_monorail      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_mrt           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_north         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_northern      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_orange        <dbl> 0.00000, 0.00000, 4.70048, 0.00000, 0.00000, …
## $ tfidf_line_phase         <dbl> 0.3648299, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_phase1        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_pink          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_project       <dbl> 0.000000, 0.000000, 0.000000, 2.551455, 0.000…
## $ tfidf_line_purple        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_rail          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_railway       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_red           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_line_ring          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_s1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_second        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_section       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_sergeli       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_south         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_southern      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_sukhumvit     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_taoyuan       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_to            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_tunnel        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_u             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_line_u55           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_underground   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_urban         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_west          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_western       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_line_yellow        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_AE               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_AR               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_AT               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_AU               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_BD               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_BE               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_BG               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_BH               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_BR               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ country_CA               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_CH               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_CL               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_CN               <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, …
## $ country_CZ               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_DE               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_DK               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_EC               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_EG               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_ES               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_FI               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_FR               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ country_GR               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ country_HU               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_ID               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_IL               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_IN               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_IR               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_IT               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_JP               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_KR               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_KW               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_MX               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_MY               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_NL               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_NO               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_NZ               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_PA               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_PE               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_PH               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_PK               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_PL               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_PT               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_RO               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_RU               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_SA               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_SE               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_SG               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_TH               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_TR               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_TW               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ country_UA               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_UK               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_US               <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_UZ               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_VN               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city_Beijing             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city_other               <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, …

# Specify model
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 


# Tune hyperparmeters
set.seed(234)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv,
            grid = 5)

## i Creating pre-processing data to finalize unknown parameter: mtry

## → A | error:   Some columns are non-numeric. The data cannot be converted to numeric matrix: 'start_year', 'end_year', 'rr', 'tunnel_per', 'real_cost'.

## There were issues with some computations   A: x1There were issues with some computations   A: x7There were issues with some computations   A: x16There were issues with some computations   A: x23There were issues with some computations   A: x31There were issues with some computations   A: x40There were issues with some computations   A: x47There were issues with some computations   A: x50

## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.

CodeAlong2

Brady Martin

2025-02-13

Import Data

Explore Data

Build Models

Evaluate Models

Make Predictions