R Markdown

Goal: To predict transit costs Click here for the data

transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')

skimr::skim(transit_cost)
Data summary
Name transit_cost
Number of rows 544
Number of columns 20
_______________________
Column type frequency:
character 11
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 7 0.99 2 2 0 56 0
city 7 0.99 4 16 0 140 0
line 7 0.99 2 46 0 366 0
start_year 53 0.90 4 9 0 40 0
end_year 71 0.87 1 4 0 36 0
tunnel_per 32 0.94 5 7 0 134 0
source1 12 0.98 4 54 0 17 0
currency 7 0.99 2 3 0 39 0
real_cost 0 1.00 1 10 0 534 0
source2 10 0.98 3 16 0 12 0
reference 19 0.97 3 302 0 350 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e 7 0.99 7738.76 463.23 7136.00 7403.00 7705.00 7977.00 9510.00 ▇▇▂▁▁
rr 8 0.99 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
length 5 0.99 58.34 621.20 0.60 6.50 15.77 29.08 12256.98 ▇▁▁▁▁
tunnel 32 0.94 29.38 344.04 0.00 3.40 8.91 21.52 7790.78 ▇▁▁▁▁
stations 15 0.97 13.81 13.70 0.00 4.00 10.00 20.00 128.00 ▇▁▁▁▁
cost 7 0.99 805438.12 6708033.07 0.00 2289.00 11000.00 27000.00 90000000.00 ▇▁▁▁▁
year 7 0.99 2014.91 5.64 1987.00 2012.00 2016.00 2019.00 2027.00 ▁▁▂▇▂
ppp_rate 9 0.98 0.66 0.87 0.00 0.24 0.26 1.00 5.00 ▇▂▁▁▁
cost_km_millions 2 1.00 232.98 257.22 7.79 134.86 181.25 241.43 3928.57 ▇▁▁▁▁
data1 <- transit_cost %>%
  
  # Treat missing values
  select(-line, -source1, -currency, -source2, -reference, -country) %>%

  
  # log transform variables with pos-skewed distribution
  mutate(cost = log(cost + 1)) %>%
  mutate(real_cost = as.numeric(real_cost)) %>%
  mutate(tunnel_per = tunnel_per %>% str_remove("%") %>% as.numeric()) %>%
  mutate(start_year = as.numeric(start_year)) %>%
  mutate(end_year = as.numeric(end_year)) %>%
  na.omit()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `real_cost = as.numeric(real_cost)`.
## Caused by warning:
## ! NAs introduced by coercion
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `start_year = as.numeric(start_year)`.
## Caused by warning:
## ! NAs introduced by coercion
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `end_year = as.numeric(end_year)`.
## Caused by warning:
## ! NAs introduced by coercion

Identify good predictors

Year

data1 %>% 
  ggplot(aes(cost, year)) +
  scale_y_log10() +
  geom_point()

Explore Data

Correlation between city and Cost

 transit_cost <- data1 %>%
  count(e, sort = TRUE) %>%
  head(20) %>%
  pull(e)

transit_cost
##  [1] 9459 9460 7136 7137 7138 7139 7144 7145 7146 7147 7152 7153 7154 7155 7160
## [16] 7161 7162 7163 7168 7169
data1 %>%
  
  #Filter for top 20 transit costs by city
  # filter(city %in% transit_cost) %>%
  ggplot(aes(cost, fct_reorder(city, cost))) +
  geom_boxplot() 

EDA Shortcut

EDA Shortcut

# Step 1: Prepare the data

data_binarized_tbl1 <- data1 %>%
  binarize() 

data_binarized_tbl1 %>% glimpse(.)
## Rows: 436
## Columns: 76
## $ `e__-Inf_7354.75`                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ e__7354.75_7589.5                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7589.5_7923.5                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ e__7923.5_Inf                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Bangkok                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Barcelona                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Beijing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changchun                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Changsha                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chengdu                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Chongqing                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Dongguan                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guangzhou                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Guiyang                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hangzhou                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Istanbul                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Madrid                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mumbai                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Nanjing                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__New_York                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Paris                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Riyadh                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Seoul                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shanghai                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Shenzhen                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sofia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Taipei                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tianjin                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Tokyo                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Toronto                           <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ city__Wuhan                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `city__-OTHER`                          <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ `start_year__-Inf_2009`                 <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ start_year__2009_2015                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ start_year__2015_2018                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ start_year__2018_Inf                    <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ `end_year__-Inf_2016`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ end_year__2016_2020                     <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ end_year__2020_2023                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ end_year__2023_Inf                      <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ rr__0                                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rr__1                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `length__-Inf_6.1`                      <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ length__6.1_14.75                       <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, …
## $ length__14.75_28.2                      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ length__28.2_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tunnel_per__-Inf_49.09`                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel_per__49.09_Inf                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `tunnel__-Inf_3.275`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tunnel__3.275_8.4                       <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
## $ tunnel__8.4_20                          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tunnel__20_Inf                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `stations__-Inf_4`                      <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
## $ stations__4_10                          <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ stations__10_20                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ stations__20_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cost__-Inf_7.4713630881871`            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cost__7.4713630881871_9.15883647922657  <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ cost__9.15883647922657_10.2633708195977 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost__10.2633708195977_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_2012`                       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ year__2012_2016                         <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2016_2018                         <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2018_Inf                          <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `ppp_rate__-Inf_0.2379`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.2379_0.266                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ppp_rate__0.266_1.25                    <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ ppp_rate__1.25_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `real_cost__-Inf_1168.71`               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ real_cost__1168.71_2977.92              <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ real_cost__2977.92_5540.5525            <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ real_cost__5540.5525_Inf                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `cost_km_millions__-Inf_132.9840103`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__132.9840103_184.085   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__184.085_243.2825      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cost_km_millions__243.2825_Inf          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
# Step 2 Correlate
 data_corr_tbl <- data_binarized_tbl1 %>%
  correlate(cost__10.2633708195977_Inf)

data_corr_tbl
## # A tibble: 76 × 3
##    feature    bin                               correlation
##    <fct>      <chr>                                   <dbl>
##  1 cost       10.2633708195977_Inf                    1    
##  2 ppp_rate   -Inf_0.2379                             0.684
##  3 real_cost  5540.5525_Inf                           0.376
##  4 cost       -Inf_7.4713630881871                   -0.335
##  5 cost       9.15883647922657_10.2633708195977      -0.333
##  6 cost       7.4713630881871_9.15883647922657       -0.331
##  7 ppp_rate   1.25_Inf                               -0.315
##  8 tunnel_per -Inf_49.09                              0.291
##  9 tunnel_per 49.09_Inf                              -0.291
## 10 ppp_rate   0.266_1.25                             -0.287
## # ℹ 66 more rows
# Step 3 Plot 
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps