Goal: To predict transit costs. The data come from the TidyTuesday 2021-01-05 transit cost release, read directly from GitHub below.
transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')
skimr::skim(transit_cost)
|                        |              |
|:-----------------------|:-------------|
| Name                   | transit_cost |
| Number of rows         | 544          |
| Number of columns      | 20           |
| Column type frequency: |              |
| character              | 11           |
| numeric                | 9            |
| Group variables        | None         |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 7 | 0.99 | 2 | 2 | 0 | 56 | 0 |
| city | 7 | 0.99 | 4 | 16 | 0 | 140 | 0 |
| line | 7 | 0.99 | 2 | 46 | 0 | 366 | 0 |
| start_year | 53 | 0.90 | 4 | 9 | 0 | 40 | 0 |
| end_year | 71 | 0.87 | 1 | 4 | 0 | 36 | 0 |
| tunnel_per | 32 | 0.94 | 5 | 7 | 0 | 134 | 0 |
| source1 | 12 | 0.98 | 4 | 54 | 0 | 17 | 0 |
| currency | 7 | 0.99 | 2 | 3 | 0 | 39 | 0 |
| real_cost | 0 | 1.00 | 1 | 10 | 0 | 534 | 0 |
| source2 | 10 | 0.98 | 3 | 16 | 0 | 12 | 0 |
| reference | 19 | 0.97 | 3 | 302 | 0 | 350 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| e | 7 | 0.99 | 7738.76 | 463.23 | 7136.00 | 7403.00 | 7705.00 | 7977.00 | 9510.00 | ▇▇▂▁▁ |
| rr | 8 | 0.99 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| length | 5 | 0.99 | 58.34 | 621.20 | 0.60 | 6.50 | 15.77 | 29.08 | 12256.98 | ▇▁▁▁▁ |
| tunnel | 32 | 0.94 | 29.38 | 344.04 | 0.00 | 3.40 | 8.91 | 21.52 | 7790.78 | ▇▁▁▁▁ |
| stations | 15 | 0.97 | 13.81 | 13.70 | 0.00 | 4.00 | 10.00 | 20.00 | 128.00 | ▇▁▁▁▁ |
| cost | 7 | 0.99 | 805438.12 | 6708033.07 | 0.00 | 2289.00 | 11000.00 | 27000.00 | 90000000.00 | ▇▁▁▁▁ |
| year | 7 | 0.99 | 2014.91 | 5.64 | 1987.00 | 2012.00 | 2016.00 | 2019.00 | 2027.00 | ▁▁▂▇▂ |
| ppp_rate | 9 | 0.98 | 0.66 | 0.87 | 0.00 | 0.24 | 0.26 | 1.00 | 5.00 | ▇▂▁▁▁ |
| cost_km_millions | 2 | 1.00 | 232.98 | 257.22 | 7.79 | 134.86 | 181.25 | 241.43 | 3928.57 | ▇▁▁▁▁ |
data1 <- transit_cost %>%
  # Drop identifier and free-text columns that won't be used as predictors
  select(-line, -source1, -currency, -source2, -reference, -country) %>%
  # Log-transform cost, which has a positively skewed distribution
  mutate(cost = log(cost + 1)) %>%
  # Coerce character columns to numeric; non-numeric entries become NA
  mutate(real_cost = as.numeric(real_cost)) %>%
  mutate(tunnel_per = tunnel_per %>% str_remove("%") %>% as.numeric()) %>%
  mutate(start_year = as.numeric(start_year)) %>%
  mutate(end_year = as.numeric(end_year)) %>%
  # Drop rows with missing values
  na.omit()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `real_cost = as.numeric(real_cost)`.
## Caused by warning:
## ! NAs introduced by coercion
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `start_year = as.numeric(start_year)`.
## Caused by warning:
## ! NAs introduced by coercion
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `end_year = as.numeric(end_year)`.
## Caused by warning:
## ! NAs introduced by coercion
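The coercion warnings come from entries in start_year, end_year, and real_cost that are not plain numbers; as.numeric() turns them into NA, and na.omit() then drops those rows. A quick sketch (assuming the raw transit_cost tibble from the first chunk is still in memory) to see which raw values are affected:
# Sketch: show the raw values that fail numeric coercion
transit_cost %>%
  select(start_year, end_year, real_cost) %>%
  filter(if_any(everything(), ~ !is.na(.x) & is.na(suppressWarnings(as.numeric(.x))))) %>%
  distinct()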
Year
data1 %>%
ggplot(aes(cost, year)) +
scale_y_log10() +
geom_point()
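Note that cost was already log-transformed in the cleaning step, while the log scale above is applied to year on the y-axis. A minimal alternative sketch with year on the x-axis and the log cost on the y-axis:
data1 %>%
  ggplot(aes(year, cost)) +
  geom_jitter(width = 0.3, alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  labs(y = "log(cost + 1)")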
Relationship between city and cost
# The 20 most frequently occurring values of e (the project ID column)
top_e <- data1 %>%
  count(e, sort = TRUE) %>%
  head(20) %>%
  pull(e)
top_e
## [1] 9459 9460 7136 7137 7138 7139 7144 7145 7146 7147 7152 7153 7154 7155 7160
## [16] 7161 7162 7163 7168 7169
data1 %>%
  # The filter below is commented out, so all cities are plotted;
  # it would restrict to rows whose ID is among the 20 most frequent
  # filter(e %in% top_e) %>%
  ggplot(aes(cost, fct_reorder(city, cost))) +
  geom_boxplot()
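As written, the filter is commented out (and top_e holds project IDs, not city names), so the boxplot covers every city. A sketch of a city-level filter instead, with top_cities as an assumed helper name:
# Sketch: limit the boxplot to the 20 cities with the most rows
top_cities <- data1 %>%
  count(city, sort = TRUE) %>%
  slice_head(n = 20) %>%
  pull(city)
data1 %>%
  filter(city %in% top_cities) %>%
  ggplot(aes(cost, fct_reorder(city, cost))) +
  geom_boxplot()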
EDA Shortcut
# Step 1: Prepare the data by binarizing every column (correlationfunnel package)
data_binarized_tbl1 <- data1 %>%
  binarize()
data_binarized_tbl1 %>% glimpse()
## Rows: 436
## Columns: 76
## $ `e__-Inf_7354.75` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ e__7354.75_7589.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7589.5_7923.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7923.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Bangkok <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER` <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ `start_year__-Inf_2009` <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ start_year__2009_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015_2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2018_Inf <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ `end_year__-Inf_2016` <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2016_2020 <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2020_2023 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023_Inf <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ rr__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1` <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__6.1_14.75 <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__14.75_28.2 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__28.2_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__-Inf_49.09` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel_per__49.09_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `tunnel__-Inf_3.275` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.275_8.4 <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.4_20 <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4` <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10 <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.4713630881871` <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cost__7.4713630881871_9.15883647922657 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ cost__9.15883647922657_10.2633708195977 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__10.2633708195977_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012` <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018 <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25 <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `real_cost__-Inf_1168.71` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ real_cost__1168.71_2977.92 <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ real_cost__2977.92_5540.5525 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ real_cost__5540.5525_Inf <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `cost_km_millions__-Inf_132.9840103` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__132.9840103_184.085 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__184.085_243.2825 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__243.2825_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
# Step 2: Correlate all binarized features against the highest-cost bin
data_corr_tbl <- data_binarized_tbl1 %>%
correlate(cost__10.2633708195977_Inf)
data_corr_tbl
## # A tibble: 76 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 cost 10.2633708195977_Inf 1
## 2 ppp_rate -Inf_0.2379 0.684
## 3 real_cost 5540.5525_Inf 0.376
## 4 cost -Inf_7.4713630881871 -0.335
## 5 cost 9.15883647922657_10.2633708195977 -0.333
## 6 cost 7.4713630881871_9.15883647922657 -0.331
## 7 ppp_rate 1.25_Inf -0.315
## 8 tunnel_per -Inf_49.09 0.291
## 9 tunnel_per 49.09_Inf -0.291
## 10 ppp_rate 0.266_1.25 -0.287
## # ℹ 66 more rows
# Step 3: Plot the correlation funnel
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
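The warning only means ggrepel dropped some labels to avoid overplotting. If every bin should be labelled, one option is raising the global ggrepel limit before plotting (a sketch; it assumes plot_correlation_funnel() respects the ggrepel.max.overlaps option):
# Sketch: let ggrepel attempt every label, accepting some overlap
options(ggrepel.max.overlaps = Inf)
data_corr_tbl %>%
  plot_correlation_funnel()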
Split Data
# Split into training and test sets
set.seed(1234)
data_split <- rsample::initial_split(data1)
data_train <- training(data_split)
data_test <- testing(data_split)
# Resample the training set for 10-fold cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [294/33]> Fold01
## 2 <split [294/33]> Fold02
## 3 <split [294/33]> Fold03
## 4 <split [294/33]> Fold04
## 5 <split [294/33]> Fold05
## 6 <split [294/33]> Fold06
## 7 <split [294/33]> Fold07
## 8 <split [295/32]> Fold08
## 9 <split [295/32]> Fold09
## 10 <split [295/32]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.3.2
# Generate a tidymodels scaffold for an xgboost model
usemodels::use_xgboost(cost ~ ., data1)
## xgboost_recipe <-
## recipe(formula = cost ~ ., data = data1) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(24468)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify Recipe
xgboost_recipe <-
  recipe(formula = cost ~ ., data = data_train) %>%
  # Lump infrequent cities into an "other" level, then create dummy variables
  step_other(city, threshold = 0.01) %>%
  step_dummy(city) %>%
  # Yeo-Johnson transform to reduce skew
  step_YeoJohnson(year, cost, stations)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 327
## Columns: 39
## $ e <dbl> 7808, 7945, 8177, 7338, 7360, 8139, 7408, 9462, 8163,…
## $ start_year <dbl> 2019, 2016, 2014, 2013, 2010, 2005, 2007, 2016, 2022,…
## $ end_year <dbl> 2024, 2020, 2017, 2019, 2013, 2008, 2014, 2020, 2027,…
## $ rr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ length <dbl> 7.830, 34.800, 40.700, 37.000, 4.200, 28.100, 1.600, …
## $ tunnel_per <dbl> 100.00, 100.00, 14.00, 100.00, 0.00, 0.00, 100.00, 10…
## $ tunnel <dbl> 7.830, 34.800, 5.700, 37.000, 0.000, 0.000, 1.600, 38…
## $ stations <dbl> 2.0794415, 2.8903718, 3.0445224, 3.3672958, 1.3862944…
## $ year <dbl> 2019, 2015, 2013, 2016, 2012, 2005, 2011, 2013, 2019,…
## $ ppp_rate <dbl> 0.2382, 0.2583, 1.8200, 1.7000, 1.3000, 0.3517, 1.000…
## $ real_cost <dbl> 1775.13, 6174.12, 9103.64, 5100.00, 416.00, 218.89, 2…
## $ cost_km_millions <dbl> 226.71000, 177.42000, 223.67666, 137.83784, 99.04762,…
## $ cost <dbl> 3.632871, 3.900502, 3.536830, 3.409920, 2.794208, 2.9…
## $ city_Barcelona <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Beijing <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ city_Berlin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Changchun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Changsha <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ city_Chengdu <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Chongqing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Dongguan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Guangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Guiyang <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Hangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ city_Istanbul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Mumbai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ city_Nanjing <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_New.York <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Paris <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Riyadh <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Rome <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Shanghai <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Shenzhen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Stockholm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Taipei <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Tianjin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Tokyo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_Wuhan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city_other <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,…
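One thing the glimpse highlights: step_YeoJohnson(year, cost, stations) also transforms the outcome cost (its values above are no longer on the log scale from the cleaning step). If that is unintended, a variation would transform only the predictors; xgboost_recipe_alt below is just an assumed name for this sketch:
xgboost_recipe_alt <-
  recipe(formula = cost ~ ., data = data_train) %>%
  step_other(city, threshold = 0.01) %>%
  step_dummy(city) %>%
  # Transform only the skewed numeric predictors, leaving the outcome untouched
  step_YeoJohnson(year, stations)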
# Specify Model
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters across the cross-validation folds
set.seed(5218)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 10)
## Warning: package 'xgboost' was built under R version 4.3.2
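Once the grid search finishes, a natural next step is to pick the best hyperparameters by RMSE, finalize the workflow, and evaluate it once on the held-out test split. A sketch (object names such as best_params are assumptions, not output shown here):
collect_metrics(xgboost_tune)
best_params <- select_best(xgboost_tune, metric = "rmse")
final_xgb_fit <-
  xgboost_workflow %>%
  finalize_workflow(best_params) %>%
  last_fit(data_split)
collect_metrics(final_xgb_fit)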