library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Import data

transit_cost_new <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')
## Rows: 544 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): country, city, line, start_year, end_year, tunnel_per, source1, cu...
## dbl  (9): e, rr, length, tunnel, stations, cost, year, ppp_rate, cost_km_mil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Cleaning

library(dplyr)

transit_cost <- transit_cost_new %>%
  filter_all(all_vars(. != "x" & . != "X")) %>%
  na.omit()


transit_cost %>% na.omit() %>% skimr::skim()
Data summary
Name Piped data
Number of rows 427
Number of columns 20
_______________________
Column type frequency:
character 11
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 0 1 2 2 0 54 0
city 0 1 4 16 0 134 0
line 0 1 2 39 0 315 0
start_year 0 1 4 4 0 36 0
end_year 0 1 4 4 0 34 0
tunnel_per 0 1 5 7 0 114 0
source1 0 1 4 54 0 13 0
currency 0 1 3 3 0 36 0
real_cost 0 1 3 10 0 420 0
source2 0 1 4 16 0 7 0
reference 0 1 3 302 0 313 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e 0 1 7688.65 483.35 7136.00 7352.50 7584.00 7913.50 9510.00 ▇▅▁▁▁
rr 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
length 0 1 20.38 22.02 0.60 5.95 14.29 27.95 200.00 ▇▁▁▁▁
tunnel 0 1 13.78 15.61 0.00 3.35 8.43 20.00 160.00 ▇▁▁▁▁
stations 0 1 13.62 14.06 0.00 4.00 10.00 19.50 128.00 ▇▁▁▁▁
cost 0 1 587090.86 4365866.10 21.00 1830.00 9600.00 28958.50 48000000.00 ▇▁▁▁▁
year 0 1 2014.64 5.76 1991.00 2012.00 2016.00 2018.00 2027.00 ▁▁▂▇▂
ppp_rate 0 1 0.68 0.85 0.00 0.24 0.26 1.25 5.00 ▇▂▁▁▁
cost_km_millions 0 1 237.60 280.16 7.79 133.65 183.72 242.18 3928.57 ▇▁▁▁▁
transit_cost$start_year <- as.numeric(transit_cost$start_year)
transit_cost$end_year <- as.numeric(transit_cost$end_year)



transit_cost$real_cost <- as.numeric(as.character(transit_cost$real_cost))
transit_cost
## # A tibble: 427 × 20
##        e country city   line  start_year end_year    rr length tunnel_per tunnel
##    <dbl> <chr>   <chr>  <chr>      <dbl>    <dbl> <dbl>  <dbl> <chr>       <dbl>
##  1  7136 CA      Vanco… Broa…       2020     2025     0    5.7 87.72%        5  
##  2  7137 CA      Toron… Vaug…       2009     2017     0    8.6 100.00%       8.6
##  3  7138 CA      Toron… Scar…       2020     2030     0    7.8 100.00%       7.8
##  4  7139 CA      Toron… Onta…       2020     2030     0   15.5 57.00%        8.8
##  5  7144 CA      Toron… Yong…       2020     2030     0    7.4 100.00%       7.4
##  6  7145 NL      Amste… Nort…       2003     2018     0    9.7 73.00%        7.1
##  7  7146 CA      Montr… Blue…       2020     2026     0    5.8 100.00%       5.8
##  8  7147 US      Seatt… U-Li…       2009     2016     0    5.1 100.00%       5.1
##  9  7152 US      Los A… Purp…       2020     2027     0    4.2 100.00%       4.2
## 10  7153 US      Los A… Purp…       2018     2026     0    4.2 100.00%       4.2
## # ℹ 417 more rows
## # ℹ 10 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## #   currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <dbl>,
## #   cost_km_millions <dbl>, source2 <chr>, reference <chr>
clean_data <- transit_cost %>% select(start_year, end_year, length, tunnel, stations, real_cost, cost, cost_km_millions) %>% na.omit()

Data Exploration

clean_data %>% glimpse()
## Rows: 427
## Columns: 8
## $ start_year       <dbl> 2020, 2009, 2020, 2020, 2020, 2003, 2020, 2009, 2020,…
## $ end_year         <dbl> 2025, 2017, 2030, 2030, 2030, 2018, 2026, 2016, 2027,…
## $ length           <dbl> 5.7, 8.6, 7.8, 15.5, 7.4, 9.7, 5.8, 5.1, 4.2, 4.2, 6.…
## $ tunnel           <dbl> 5.0, 8.6, 7.8, 8.8, 7.4, 7.1, 5.8, 5.1, 4.2, 4.2, 6.3…
## $ stations         <dbl> 6, 6, 3, 15, 6, 8, 5, 2, 2, 2, 3, 3, 4, 7, 13, 4, 4, …
## $ real_cost        <dbl> 2377.200, 2592.000, 4620.000, 7201.320, 4704.000, 403…
## $ cost             <dbl> 2830, 3200, 5500, 8573, 5600, 3100, 4500, 1756, 3600,…
## $ cost_km_millions <dbl> 417.05263, 301.39535, 592.30769, 464.60129, 635.67568…
clean_data %>% skimr::skim()
Data summary
Name Piped data
Number of rows 427
Number of columns 8
_______________________
Column type frequency:
numeric 8
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
start_year 0 1 2013.16 6.69 1984.00 2010.00 2015.00 2018.00 2025.00 ▁▁▃▇▇
end_year 0 1 2019.03 6.30 1994.00 2016.00 2020.00 2023.00 2030.00 ▁▁▂▇▅
length 0 1 20.38 22.02 0.60 5.95 14.29 27.95 200.00 ▇▁▁▁▁
tunnel 0 1 13.78 15.61 0.00 3.35 8.43 20.00 160.00 ▇▁▁▁▁
stations 0 1 13.62 14.06 0.00 4.00 10.00 19.50 128.00 ▇▁▁▁▁
real_cost 0 1 4160.20 4852.89 66.25 1136.41 2970.76 5487.05 45604.00 ▇▁▁▁▁
cost 0 1 587090.86 4365866.10 21.00 1830.00 9600.00 28958.50 48000000.00 ▇▁▁▁▁
cost_km_millions 0 1 237.60 280.16 7.79 133.65 183.72 242.18 3928.57 ▇▁▁▁▁

Model Building

library(rsample)

set.seed(123)
data_split <- initial_split(clean_data)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- bootstraps(data_train, strata = real_cost)
data_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits            id         
##    <list>            <chr>      
##  1 <split [320/124]> Bootstrap01
##  2 <split [320/109]> Bootstrap02
##  3 <split [320/119]> Bootstrap03
##  4 <split [320/126]> Bootstrap04
##  5 <split [320/118]> Bootstrap05
##  6 <split [320/111]> Bootstrap06
##  7 <split [320/118]> Bootstrap07
##  8 <split [320/121]> Bootstrap08
##  9 <split [320/113]> Bootstrap09
## 10 <split [320/120]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_xgboost(real_cost ~ start_year + end_year + length + tunnel + stations + cost + cost_km_millions, data = data_train)
## xgboost_recipe <- 
##   recipe(formula = real_cost ~ start_year + end_year + length + tunnel + stations + 
##     cost + cost_km_millions, data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(17029)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
xgboost_recipe <- 
  recipe(formula = real_cost ~ ., data = data_train) %>%
    #recipes::update_role(country, new_role = "id") %>%
    #step_other(favorite_count) %>%
    # step_dummy(all_nominal_predictors()) %>%   # for factors
    step_log(length, tunnel, cost_km_millions) %>%
    step_nzv(all_predictors()) 

xgboost_recipe %>% prep() %>% bake(new_data = NULL)
## # A tibble: 320 × 8
##    start_year end_year length  tunnel stations   cost cost_km_millions real_cost
##         <dbl>    <dbl>  <dbl>   <dbl>    <dbl>  <dbl>            <dbl>     <dbl>
##  1       2013     2017   3.54    3.37       25 1.94e4             5.13     5819.
##  2       2018     2029   2.19 -Inf           4 1.05e5             4.77     1050 
##  3       2012     2021   2.03 -Inf           7 2.29e3             5.71     2289 
##  4       2009     2013   1.70 -Inf           6 1.09e4             4.84      696.
##  5       2016     2020   2.89    2.89       15 1.41e4             5.31     3645.
##  6       2012     2014   2.48    2.48       11 2   e7             5.30     2400 
##  7       2019     2024   3.36    3.27       19 2.36e4             5.27     5632.
##  8       2006     2013   2.77    2.77       11 1.36e3             4.92     2170.
##  9       1997     2000   3.22 -Inf          19 9.38e3             4.93     3470.
## 10       2019     2022   3.47    3.47        7 9.16e2             4.66     3389.
## # ℹ 310 more rows
library(textrecipes)
library(parsnip)
library(workflows)
library(tune)


ranger_recipe <-
  recipe(formula = real_cost ~ ., data = data_train) %>%
  step_impute_knn(tunnel)

ranger_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_mode("regression") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
  tune_grid(ranger_workflow,
    resamples = data_folds,
    grid = 11
  )
## i Creating pre-processing data to finalize unknown parameter: mtry

Exploring The Results

show_best(ranger_tune, metric = "rmse")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     6     6 rmse    standard   2213.    25    192. Preprocessor1_Model06
## 2     5    10 rmse    standard   2300.    25    186. Preprocessor1_Model05
## 3     3     4 rmse    standard   2329.    25    185. Preprocessor1_Model10
## 4     4    18 rmse    standard   2454.    25    180. Preprocessor1_Model01
## 5     5    24 rmse    standard   2466.    25    182. Preprocessor1_Model03
show_best(ranger_tune, metric = "rsq")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     6     6 rsq     standard   0.823    25  0.0161 Preprocessor1_Model06
## 2     5    10 rsq     standard   0.808    25  0.0149 Preprocessor1_Model05
## 3     3     4 rsq     standard   0.808    25  0.0143 Preprocessor1_Model10
## 4     4    18 rsq     standard   0.782    25  0.0145 Preprocessor1_Model01
## 5     5    24 rsq     standard   0.778    25  0.0153 Preprocessor1_Model03
autoplot(ranger_tune)

final_rf <- ranger_workflow %>%
  finalize_workflow(select_best(ranger_tune))
## Warning: No value of `metric` was given; metric 'rmse' will be used.
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_impute_knn()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 6
##   trees = 1000
##   min_n = 6
## 
## Computational engine: ranger
transit_fit <- last_fit(final_rf, data_split)
transit_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits            id               .metrics .notes   .predictions .workflow 
##   <list>            <chr>            <list>   <list>   <list>       <list>    
## 1 <split [320/107]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(transit_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard    1386.    Preprocessor1_Model1
## 2 rsq     standard       0.915 Preprocessor1_Model1
collect_predictions(transit_fit) %>%
  ggplot(aes(real_cost, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()

predict(transit_fit$.workflow[[1]], data_test[15, ])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1 3847.

Model Evaluation

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_spec <- ranger_spec %>%
  finalize_model(select_best(ranger_tune)) %>%
  set_engine("ranger", importance = "permutation")
## Warning: No value of `metric` was given; metric 'rmse' will be used.
workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(imp_spec) %>%
  fit(data_train) %>%
  pull_workflow_fit() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## ℹ Please use `extract_fit_parsnip()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. Why do transit-infrastructure projects in New York cost 20 times more on a per kilometer basis than in Seoul?
    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The dataset is about transit costs and contains data about various different transit costs in different cities and countries. There are 427 rows and 20 columns after cleaning it. There are a broad range of countries and cities in the data set. There are also many different transit types used. The dataset has data that provides details on project timelines and cost factors which are useful in the analysis.
    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The key variables used in the analysis are stations, cost_km_millions, tunnel, cost, start year, end year, and length. The dataset is focusing on transit lines and their variables consit of construction dates, lengths, number of stations, and most importantly cost.
  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. There are a few differences between the original data and the data transformed for modeling. First, I converted start_year, end_year, and real_cost from character to numerical data. I could not get the code to run without an error before doing this so it was necassary. I also removed all of the x and na values from the start_year and end_year rows. This also made the code able to run as it was giving errors before with the missing values in those rows.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The names of the data preparation steps mentioned in the video are dealing with missing values in the dataset, transforming data, splitting the data, resampling and cleaning the data. I used the na.omit() to remove the na values from the start_year and end_year rows. I also used data splitting to make a data training set and a data testing set. I also used bootstrap to create data folds.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model used in the analysis is the random forest model. In the analysis I used random forest because it is a good model to use as it handles big data sets well. The random forest model can also automatically see how different variables are related within a dataset. It is also good because it is less likely to over fit to the training data compared to individual decsion trees. It provides feature importance scores which helps me understand which variables are most influential in prediciting the outcome.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. I used root mean squared error and r-squared. These metrics are helpful as root mean squared error finds the average of the errors and I can see that if the root mean square error value is low that the model has close predictions. R-squared finds the variance in the dependent variable that is predictable from the independent variables. Using this, I know I can look at the value and see that the closer the r-squared value is to 1 the better the accuracy of the prediction is. This is important in seeing the precision performance of the model.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. Relate these findings back to the research question and any similarities or differences compared to the CA assignment. Based off the graph at the end of the apply 1, length of the transit line had the most impact on the cost. The length of transit-infrastructure projects in New York appear to be much longer than those in Seoul, which is why they cost a lot more. In a way this is similar to the CA because the dimensions of the transit-infrastructure projects explain the higher costs, which is similar to how the dimensions of an IKEA product would explain the higher price tag.