Goal: To predict transit costs Click here for the data
transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')
skimr::skim(transit_cost)
| Name | transit_cost |
| Number of rows | 544 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 7 | 0.99 | 2 | 2 | 0 | 56 | 0 |
| city | 7 | 0.99 | 4 | 16 | 0 | 140 | 0 |
| line | 7 | 0.99 | 2 | 46 | 0 | 366 | 0 |
| start_year | 53 | 0.90 | 4 | 9 | 0 | 40 | 0 |
| end_year | 71 | 0.87 | 1 | 4 | 0 | 36 | 0 |
| tunnel_per | 32 | 0.94 | 5 | 7 | 0 | 134 | 0 |
| source1 | 12 | 0.98 | 4 | 54 | 0 | 17 | 0 |
| currency | 7 | 0.99 | 2 | 3 | 0 | 39 | 0 |
| real_cost | 0 | 1.00 | 1 | 10 | 0 | 534 | 0 |
| source2 | 10 | 0.98 | 3 | 16 | 0 | 12 | 0 |
| reference | 19 | 0.97 | 3 | 302 | 0 | 350 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| e | 7 | 0.99 | 7738.76 | 463.23 | 7136.00 | 7403.00 | 7705.00 | 7977.00 | 9510.00 | ▇▇▂▁▁ |
| rr | 8 | 0.99 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| length | 5 | 0.99 | 58.34 | 621.20 | 0.60 | 6.50 | 15.77 | 29.08 | 12256.98 | ▇▁▁▁▁ |
| tunnel | 32 | 0.94 | 29.38 | 344.04 | 0.00 | 3.40 | 8.91 | 21.52 | 7790.78 | ▇▁▁▁▁ |
| stations | 15 | 0.97 | 13.81 | 13.70 | 0.00 | 4.00 | 10.00 | 20.00 | 128.00 | ▇▁▁▁▁ |
| cost | 7 | 0.99 | 805438.12 | 6708033.07 | 0.00 | 2289.00 | 11000.00 | 27000.00 | 90000000.00 | ▇▁▁▁▁ |
| year | 7 | 0.99 | 2014.91 | 5.64 | 1987.00 | 2012.00 | 2016.00 | 2019.00 | 2027.00 | ▁▁▂▇▂ |
| ppp_rate | 9 | 0.98 | 0.66 | 0.87 | 0.00 | 0.24 | 0.26 | 1.00 | 5.00 | ▇▂▁▁▁ |
| cost_km_millions | 2 | 1.00 | 232.98 | 257.22 | 7.79 | 134.86 | 181.25 | 241.43 | 3928.57 | ▇▁▁▁▁ |
data1 <- transit_cost %>%
# Treat missing values
na.omit() %>%
# log transform variables with pos-skewed distribution
mutate(cost = log(cost))
Correlation between city and Cost
transit_cost <- data1 %>%
count(e, sort = TRUE) %>%
head(20) %>%
pull(e)
transit_cost
## [1] 9459 9460 7136 7137 7138 7139 7144 7145 7146 7147 7152 7153 7154 7155 7160
## [16] 7161 7162 7163 7168 7169
data1 %>%
#Filter for top 20 transit costs by city
filter(city %in% transit_cost) %>%
ggplot(aes(cost, fct_reorder(city, country))) +
geom_boxplot()
Year
data1 %>%
ggplot(aes(cost, year)) +
scale_y_log10() +
geom_point()
EDA Shortcut
# Step 1: Prepare the data
data_binarized_tbl1 <- data1 %>%
select( - reference, - source2, source1) %>%
binarize()
data_binarized_tbl1 %>% glimpse(.)
## Rows: 428
## Columns: 164
## $ `e__-Inf_7352.75` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ e__7352.75_7584.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7584.5_7914.25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7914.25_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__BG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__CA <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ country__CN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__DE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__ES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__FR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__JP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__KR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__RU <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TW <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__US <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ country__VN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER` <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Bangkok <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER` <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ line__Line_1_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5_Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Phase_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `line__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ start_year__2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2003 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ start_year__2005 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2006 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2009 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ start_year__2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2020 <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ start_year__2021 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `start_year__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2009 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2017 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2018 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2021 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2022 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2024 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2025 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2026 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ end_year__2027 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ end_year__2028 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2029 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2030 <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `end_year__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rr__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_5.975` <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__5.975_14.295 <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__14.295_27.875 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__27.875_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%` <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, …
## $ `tunnel_per__-OTHER` <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `tunnel__-Inf_3.375` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.375_8.43 <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.43_20 <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4` <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10 <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ source1__Measured <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ source1__Media <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ source1__Plan <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ source1__Trade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ source1__Wiki <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `source1__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.5201343110015` <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cost__7.5201343110015_9.16951837745593 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ cost__9.16951837745593_10.2729534210399 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__10.2729534210399_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__CAD <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ currency__CNY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__EUR <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ currency__INR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__JPY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__KRW <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__RUB <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__SEK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__THB <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__TWD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__USD <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ currency__VND <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `currency__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012` <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018 <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.26215 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.26215_1.25 <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ real_cost__2400 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ `real_cost__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ `cost_km_millions__-Inf_133.8851087` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__133.8851087_183.845 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__183.845_243.2825 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__243.2825_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
# Step 2 Correlate
data_corr_tbl <- data_binarized_tbl1 %>%
correlate(e__7352.75_7584.5)
data_corr_tbl
## # A tibble: 164 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 e 7352.75_7584.5 1
## 2 country CN -0.466
## 3 currency CNY -0.466
## 4 e -Inf_7352.75 -0.333
## 5 e 7584.5_7914.25 -0.333
## 6 e 7914.25_Inf -0.333
## 7 country ES 0.330
## 8 ppp_rate 0.2379_0.26215 -0.325
## 9 ppp_rate -Inf_0.2379 0.323
## 10 country JP 0.288
## # ℹ 154 more rows
# Step 3 Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 97 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps