R Markdown

Goal: To predict transit costs Click here for the data

transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')

skimr::skim(transit_cost)
Data summary
Name transit_cost
Number of rows 544
Number of columns 20
_______________________
Column type frequency:
character 11
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 7 0.99 2 2 0 56 0
city 7 0.99 4 16 0 140 0
line 7 0.99 2 46 0 366 0
start_year 53 0.90 4 9 0 40 0
end_year 71 0.87 1 4 0 36 0
tunnel_per 32 0.94 5 7 0 134 0
source1 12 0.98 4 54 0 17 0
currency 7 0.99 2 3 0 39 0
real_cost 0 1.00 1 10 0 534 0
source2 10 0.98 3 16 0 12 0
reference 19 0.97 3 302 0 350 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e 7 0.99 7738.76 463.23 7136.00 7403.00 7705.00 7977.00 9510.00 ▇▇▂▁▁
rr 8 0.99 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
length 5 0.99 58.34 621.20 0.60 6.50 15.77 29.08 12256.98 ▇▁▁▁▁
tunnel 32 0.94 29.38 344.04 0.00 3.40 8.91 21.52 7790.78 ▇▁▁▁▁
stations 15 0.97 13.81 13.70 0.00 4.00 10.00 20.00 128.00 ▇▁▁▁▁
cost 7 0.99 805438.12 6708033.07 0.00 2289.00 11000.00 27000.00 90000000.00 ▇▁▁▁▁
year 7 0.99 2014.91 5.64 1987.00 2012.00 2016.00 2019.00 2027.00 ▁▁▂▇▂
ppp_rate 9 0.98 0.66 0.87 0.00 0.24 0.26 1.00 5.00 ▇▂▁▁▁
cost_km_millions 2 1.00 232.98 257.22 7.79 134.86 181.25 241.43 3928.57 ▇▁▁▁▁
data1 <- transit_cost %>%
  
  # Treat missing values
  na.omit() %>%
  
  # log transform variables with pos-skewed distribution
  mutate(cost = log(cost))

Explore Data

Correlation between city and Cost

 transit_cost <- data1 %>%
  count(e, sort = TRUE) %>%
  head(20) %>%
  pull(e)

transit_cost
##  [1] 9459 9460 7136 7137 7138 7139 7144 7145 7146 7147 7152 7153 7154 7155 7160
## [16] 7161 7162 7163 7168 7169
data1 %>%
  
  #Filter for top 20 transit costs by city
  filter(city %in% transit_cost) %>%
  ggplot(aes(cost, fct_reorder(city, country))) +
  geom_boxplot()

Identify good predictors

Year

data1 %>% 
  ggplot(aes(cost, year)) +
  scale_y_log10() +
  geom_point()

EDA Shortcut

EDA Shortcut

# Step 1: Prepare the data

data_binarized_tbl1 <- data1 %>%
  select( - reference, - source2, source1) %>%
  binarize()

data_binarized_tbl1 %>% glimpse(.)
## Rows: 428
## Columns: 164
## $ `e__-Inf_7352.75`                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ e__7352.75_7584.5                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7584.5_7914.25                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7914.25_Inf                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__BG                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__CA                             <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ country__CN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__DE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__ES                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__FR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__IT                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__JP                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__KR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__RU                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SA                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__SE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TH                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TR                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__TW                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__US                             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ country__VN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Bangkok                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                           <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                          <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ line__Line_1_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_12                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_3_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_4_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_5_Phase_1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_6                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Line_7                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ line__Phase_1                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `line__-OTHER`                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ start_year__2000                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2001                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2003                        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ start_year__2005                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2006                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2007                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2008                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2009                        <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ start_year__2010                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2011                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2012                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2013                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2014                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2016                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2017                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2018                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2019                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2020                        <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ start_year__2021                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `start_year__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2000                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2008                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2009                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2010                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2011                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2012                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2013                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2014                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2015                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2016                          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2017                          <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2018                          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2019                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2020                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2021                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2022                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2024                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2025                          <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2026                          <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ end_year__2027                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ end_year__2028                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2029                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2030                          <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `end_year__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rr__0                                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_5.975`                    <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__5.975_14.295                    <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__14.295_27.875                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__27.875_Inf                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__0.00%`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__100.00%`                   <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, …
## $ `tunnel_per__-OTHER`                    <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `tunnel__-Inf_3.375`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.375_8.43                      <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.43_20                         <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                      <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10                          <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ source1__Measured                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ source1__Media                          <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ source1__Plan                           <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ source1__Trade                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ source1__Wiki                           <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `source1__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.5201343110015`            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cost__7.5201343110015_9.16951837745593  <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ cost__9.16951837745593_10.2729534210399 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__10.2729534210399_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__CAD                           <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
## $ currency__CNY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__EUR                           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ currency__INR                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__JPY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__KRW                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__RUB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__SEK                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__THB                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__TWD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ currency__USD                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ currency__VND                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `currency__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016                         <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018                         <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                          <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.26215                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.26215_1.25                  <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ real_cost__2400                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ `real_cost__-OTHER`                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ `cost_km_millions__-Inf_133.8851087`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__133.8851087_183.845   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__183.845_243.2825      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__243.2825_Inf          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
# Step 2 Correlate
data_corr_tbl <- data_binarized_tbl1 %>%
  correlate(e__7352.75_7584.5)

data_corr_tbl
## # A tibble: 164 × 3
##    feature  bin            correlation
##    <fct>    <chr>                <dbl>
##  1 e        7352.75_7584.5       1    
##  2 country  CN                  -0.466
##  3 currency CNY                 -0.466
##  4 e        -Inf_7352.75        -0.333
##  5 e        7584.5_7914.25      -0.333
##  6 e        7914.25_Inf         -0.333
##  7 country  ES                   0.330
##  8 ppp_rate 0.2379_0.26215      -0.325
##  9 ppp_rate -Inf_0.2379          0.323
## 10 country  JP                   0.288
## # ℹ 154 more rows
# Step 3 Plot
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: ggrepel: 97 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps