library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Import data
transit_cost_new <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')
## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data Cleaning
library(dplyr)
transit_cost <- transit_cost_new %>%
filter_all(all_vars(. != "x" & . != "X")) %>%
na.omit()
transit_cost %>% na.omit() %>% skimr::skim()
Data summary
| Name |
Piped data |
| Number of rows |
427 |
| Number of columns |
20 |
| _______________________ |
|
| Column type frequency: |
|
| character |
11 |
| numeric |
9 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| country |
0 |
1 |
2 |
2 |
0 |
54 |
0 |
| city |
0 |
1 |
4 |
16 |
0 |
134 |
0 |
| line |
0 |
1 |
2 |
39 |
0 |
315 |
0 |
| start_year |
0 |
1 |
4 |
4 |
0 |
36 |
0 |
| end_year |
0 |
1 |
4 |
4 |
0 |
34 |
0 |
| tunnel_per |
0 |
1 |
5 |
7 |
0 |
114 |
0 |
| source1 |
0 |
1 |
4 |
54 |
0 |
13 |
0 |
| currency |
0 |
1 |
3 |
3 |
0 |
36 |
0 |
| real_cost |
0 |
1 |
3 |
10 |
0 |
420 |
0 |
| source2 |
0 |
1 |
4 |
16 |
0 |
7 |
0 |
| reference |
0 |
1 |
3 |
302 |
0 |
313 |
0 |
Variable type: numeric
| e |
0 |
1 |
7688.65 |
483.35 |
7136.00 |
7352.50 |
7584.00 |
7913.50 |
9510.00 |
▇▅▁▁▁ |
| rr |
0 |
1 |
0.06 |
0.24 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| length |
0 |
1 |
20.38 |
22.02 |
0.60 |
5.95 |
14.29 |
27.95 |
200.00 |
▇▁▁▁▁ |
| tunnel |
0 |
1 |
13.78 |
15.61 |
0.00 |
3.35 |
8.43 |
20.00 |
160.00 |
▇▁▁▁▁ |
| stations |
0 |
1 |
13.62 |
14.06 |
0.00 |
4.00 |
10.00 |
19.50 |
128.00 |
▇▁▁▁▁ |
| cost |
0 |
1 |
587090.86 |
4365866.10 |
21.00 |
1830.00 |
9600.00 |
28958.50 |
48000000.00 |
▇▁▁▁▁ |
| year |
0 |
1 |
2014.64 |
5.76 |
1991.00 |
2012.00 |
2016.00 |
2018.00 |
2027.00 |
▁▁▂▇▂ |
| ppp_rate |
0 |
1 |
0.68 |
0.85 |
0.00 |
0.24 |
0.26 |
1.25 |
5.00 |
▇▂▁▁▁ |
| cost_km_millions |
0 |
1 |
237.60 |
280.16 |
7.79 |
133.65 |
183.72 |
242.18 |
3928.57 |
▇▁▁▁▁ |
transit_cost$start_year <- as.numeric(transit_cost$start_year)
transit_cost$end_year <- as.numeric(transit_cost$end_year)
transit_cost$real_cost <- as.numeric(as.character(transit_cost$real_cost))
transit_cost
## # A tibble: 427 × 20
## e country city line start_year end_year rr length tunnel_per tunnel
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 7136 CA Vanco… Broa… 2020 2025 0 5.7 87.72% 5
## 2 7137 CA Toron… Vaug… 2009 2017 0 8.6 100.00% 8.6
## 3 7138 CA Toron… Scar… 2020 2030 0 7.8 100.00% 7.8
## 4 7139 CA Toron… Onta… 2020 2030 0 15.5 57.00% 8.8
## 5 7144 CA Toron… Yong… 2020 2030 0 7.4 100.00% 7.4
## 6 7145 NL Amste… Nort… 2003 2018 0 9.7 73.00% 7.1
## 7 7146 CA Montr… Blue… 2020 2026 0 5.8 100.00% 5.8
## 8 7147 US Seatt… U-Li… 2009 2016 0 5.1 100.00% 5.1
## 9 7152 US Los A… Purp… 2020 2027 0 4.2 100.00% 4.2
## 10 7153 US Los A… Purp… 2018 2026 0 4.2 100.00% 4.2
## # ℹ 417 more rows
## # ℹ 10 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## # currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <dbl>,
## # cost_km_millions <dbl>, source2 <chr>, reference <chr>
clean_data <- transit_cost %>% select(start_year, end_year, length, tunnel, stations, real_cost, cost, cost_km_millions) %>% na.omit()
Data Exploration
clean_data %>% glimpse()
## Rows: 427
## Columns: 8
## $ start_year <dbl> 2020, 2009, 2020, 2020, 2020, 2003, 2020, 2009, 2020,…
## $ end_year <dbl> 2025, 2017, 2030, 2030, 2030, 2018, 2026, 2016, 2027,…
## $ length <dbl> 5.7, 8.6, 7.8, 15.5, 7.4, 9.7, 5.8, 5.1, 4.2, 4.2, 6.…
## $ tunnel <dbl> 5.0, 8.6, 7.8, 8.8, 7.4, 7.1, 5.8, 5.1, 4.2, 4.2, 6.3…
## $ stations <dbl> 6, 6, 3, 15, 6, 8, 5, 2, 2, 2, 3, 3, 4, 7, 13, 4, 4, …
## $ real_cost <dbl> 2377.200, 2592.000, 4620.000, 7201.320, 4704.000, 403…
## $ cost <dbl> 2830, 3200, 5500, 8573, 5600, 3100, 4500, 1756, 3600,…
## $ cost_km_millions <dbl> 417.05263, 301.39535, 592.30769, 464.60129, 635.67568…
clean_data %>% skimr::skim()
Data summary
| Name |
Piped data |
| Number of rows |
427 |
| Number of columns |
8 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
8 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| start_year |
0 |
1 |
2013.16 |
6.69 |
1984.00 |
2010.00 |
2015.00 |
2018.00 |
2025.00 |
▁▁▃▇▇ |
| end_year |
0 |
1 |
2019.03 |
6.30 |
1994.00 |
2016.00 |
2020.00 |
2023.00 |
2030.00 |
▁▁▂▇▅ |
| length |
0 |
1 |
20.38 |
22.02 |
0.60 |
5.95 |
14.29 |
27.95 |
200.00 |
▇▁▁▁▁ |
| tunnel |
0 |
1 |
13.78 |
15.61 |
0.00 |
3.35 |
8.43 |
20.00 |
160.00 |
▇▁▁▁▁ |
| stations |
0 |
1 |
13.62 |
14.06 |
0.00 |
4.00 |
10.00 |
19.50 |
128.00 |
▇▁▁▁▁ |
| real_cost |
0 |
1 |
4160.20 |
4852.89 |
66.25 |
1136.41 |
2970.76 |
5487.05 |
45604.00 |
▇▁▁▁▁ |
| cost |
0 |
1 |
587090.86 |
4365866.10 |
21.00 |
1830.00 |
9600.00 |
28958.50 |
48000000.00 |
▇▁▁▁▁ |
| cost_km_millions |
0 |
1 |
237.60 |
280.16 |
7.79 |
133.65 |
183.72 |
242.18 |
3928.57 |
▇▁▁▁▁ |
Model Building
library(rsample)
set.seed(123)
data_split <- initial_split(clean_data)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(234)
data_folds <- bootstraps(data_train, strata = real_cost)
data_folds
## # Bootstrap sampling using stratification
## # A tibble: 25 × 2
## splits id
## <list> <chr>
## 1 <split [320/124]> Bootstrap01
## 2 <split [320/109]> Bootstrap02
## 3 <split [320/119]> Bootstrap03
## 4 <split [320/126]> Bootstrap04
## 5 <split [320/118]> Bootstrap05
## 6 <split [320/111]> Bootstrap06
## 7 <split [320/118]> Bootstrap07
## 8 <split [320/121]> Bootstrap08
## 9 <split [320/113]> Bootstrap09
## 10 <split [320/120]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_xgboost(real_cost ~ start_year + end_year + length + tunnel + stations + cost + cost_km_millions, data = data_train)
## xgboost_recipe <-
## recipe(formula = real_cost ~ start_year + end_year + length + tunnel + stations +
## cost + cost_km_millions, data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(17029)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
xgboost_recipe <-
recipe(formula = real_cost ~ ., data = data_train) %>%
#recipes::update_role(country, new_role = "id") %>%
#step_other(favorite_count) %>%
# step_dummy(all_nominal_predictors()) %>% # for factors
step_log(length, tunnel, cost_km_millions) %>%
step_nzv(all_predictors())
xgboost_recipe %>% prep() %>% bake(new_data = NULL)
## # A tibble: 320 × 8
## start_year end_year length tunnel stations cost cost_km_millions real_cost
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 2017 3.54 3.37 25 1.94e4 5.13 5819.
## 2 2018 2029 2.19 -Inf 4 1.05e5 4.77 1050
## 3 2012 2021 2.03 -Inf 7 2.29e3 5.71 2289
## 4 2009 2013 1.70 -Inf 6 1.09e4 4.84 696.
## 5 2016 2020 2.89 2.89 15 1.41e4 5.31 3645.
## 6 2012 2014 2.48 2.48 11 2 e7 5.30 2400
## 7 2019 2024 3.36 3.27 19 2.36e4 5.27 5632.
## 8 2006 2013 2.77 2.77 11 1.36e3 4.92 2170.
## 9 1997 2000 3.22 -Inf 19 9.38e3 4.93 3470.
## 10 2019 2022 3.47 3.47 7 9.16e2 4.66 3389.
## # ℹ 310 more rows
library(textrecipes)
library(parsnip)
library(workflows)
library(tune)
ranger_recipe <-
recipe(formula = real_cost ~ ., data = data_train) %>%
step_impute_knn(tunnel)
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("regression") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
tune_grid(ranger_workflow,
resamples = data_folds,
grid = 11
)
## i Creating pre-processing data to finalize unknown parameter: mtry
Exploring The Results
show_best(ranger_tune, metric = "rmse")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 6 6 rmse standard 2213. 25 192. Preprocessor1_Model06
## 2 5 10 rmse standard 2300. 25 186. Preprocessor1_Model05
## 3 3 4 rmse standard 2329. 25 185. Preprocessor1_Model10
## 4 4 18 rmse standard 2454. 25 180. Preprocessor1_Model01
## 5 5 24 rmse standard 2466. 25 182. Preprocessor1_Model03
show_best(ranger_tune, metric = "rsq")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 6 6 rsq standard 0.823 25 0.0161 Preprocessor1_Model06
## 2 5 10 rsq standard 0.808 25 0.0149 Preprocessor1_Model05
## 3 3 4 rsq standard 0.808 25 0.0143 Preprocessor1_Model10
## 4 4 18 rsq standard 0.782 25 0.0145 Preprocessor1_Model01
## 5 5 24 rsq standard 0.778 25 0.0153 Preprocessor1_Model03
autoplot(ranger_tune)

final_rf <- ranger_workflow %>%
finalize_workflow(select_best(ranger_tune))
## Warning: No value of `metric` was given; metric 'rmse' will be used.
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_impute_knn()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 6
## trees = 1000
## min_n = 6
##
## Computational engine: ranger
transit_fit <- last_fit(final_rf, data_split)
transit_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [320/107]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(transit_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1386. Preprocessor1_Model1
## 2 rsq standard 0.915 Preprocessor1_Model1
collect_predictions(transit_fit) %>%
ggplot(aes(real_cost, .pred)) +
geom_abline(lty = 2, color = "gray50") +
geom_point(alpha = 0.5, color = "midnightblue") +
coord_fixed()

predict(transit_fit$.workflow[[1]], data_test[15, ])
## # A tibble: 1 × 1
## .pred
## <dbl>
## 1 3847.
Model Evaluation
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_spec <- ranger_spec %>%
finalize_model(select_best(ranger_tune)) %>%
set_engine("ranger", importance = "permutation")
## Warning: No value of `metric` was given; metric 'rmse' will be used.
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(imp_spec) %>%
fit(data_train) %>%
pull_workflow_fit() %>%
vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## ℹ Please use `extract_fit_parsnip()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

- Question and Data:
- What is the research question? Clearly state the research question
you aim to address using the new dataset. Why do transit-infrastructure
projects in New York cost 20 times more on a per kilometer basis than in
Seoul?
- Describe the data briefly: Provide an overview of the new dataset,
highlighting its key characteristics and dimensions. The dataset is
about transit costs and contains data about various different transit
costs in different cities and countries. There are 427 rows and 20
columns after cleaning it. There are a broad range of countries and
cities in the data set. There are also many different transit types
used. The dataset has data that provides details on project timelines
and cost factors which are useful in the analysis.
- What are the characteristics of the key variables used in the
analysis? Describe the primary variables of interest in the dataset and
their characteristics. The key variables used in the analysis are
stations, cost_km_millions, tunnel, cost, start year, end year, and
length. The dataset is focusing on transit lines and their variables
consit of construction dates, lengths, number of stations, and most
importantly cost.
- Data Exploration and Transformation:
- Describe the differences between the original data and the data
transformed for modeling. Why? Explain any preprocessing or
transformations performed on the new dataset compared to the original
data. Discuss why these changes were necessary or beneficial. There are
a few differences between the original data and the data transformed for
modeling. First, I converted start_year, end_year, and real_cost from
character to numerical data. I could not get the code to run without an
error before doing this so it was necassary. I also removed all of the x
and na values from the start_year and end_year rows. This also made the
code able to run as it was giving errors before with the missing values
in those rows.
- Data Preparation and Modeling:
- What are the names of data preparation steps mentioned in the video?
List and describe any data preparation steps or techniques mentioned in
the CA video that you applied to the new dataset. The names of the data
preparation steps mentioned in the video are dealing with missing values
in the dataset, transforming data, splitting the data, resampling and
cleaning the data. I used the na.omit() to remove the na values from the
start_year and end_year rows. I also used data splitting to make a data
training set and a data testing set. I also used bootstrap to create
data folds.
- What is the name of the machine learning model(s) used in the
analysis? Specify the machine learning model(s) you employed for your
analysis and briefly explain their relevance to the research question.
The machine learning model used in the analysis is the random forest
model. In the analysis I used random forest because it is a good model
to use as it handles big data sets well. The random forest model can
also automatically see how different variables are related within a
dataset. It is also good because it is less likely to over fit to the
training data compared to individual decsion trees. It provides feature
importance scores which helps me understand which variables are most
influential in prediciting the outcome.
- Model Evaluation:
- What metrics are used in the model evaluation? Detail the evaluation
metrics you used to assess the performance of your machine learning
model(s) on the new dataset. Discuss the significance of these metrics
in the context of your research question. I used root mean squared error
and r-squared. These metrics are helpful as root mean squared error
finds the average of the errors and I can see that if the root mean
square error value is low that the model has close predictions.
R-squared finds the variance in the dependent variable that is
predictable from the independent variables. Using this, I know I can
look at the value and see that the closer the r-squared value is to 1
the better the accuracy of the prediction is. This is important in
seeing the precision performance of the model.
- Conclusion:
- What are the major findings? Summarize the key findings and insights
obtained from your analysis of the new dataset. Relate these findings
back to the research question and any similarities or differences
compared to the CA assignment. Based off the graph at the end of the
apply 1, length of the transit line had the most impact on the cost. The
length of transit-infrastructure projects in New York appear to be much
longer than those in Seoul, which is why they cost a lot more. In a way
this is similar to the CA because the dimensions of the
transit-infrastructure projects explain the higher costs, which is
similar to how the dimensions of an IKEA product would explain the
higher price tag.