Goal: To predict transit costs Click here for the data
transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')
skimr::skim(transit_cost)
| Name | transit_cost |
| Number of rows | 544 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 7 | 0.99 | 2 | 2 | 0 | 56 | 0 |
| city | 7 | 0.99 | 4 | 16 | 0 | 140 | 0 |
| line | 7 | 0.99 | 2 | 46 | 0 | 366 | 0 |
| start_year | 53 | 0.90 | 4 | 9 | 0 | 40 | 0 |
| end_year | 71 | 0.87 | 1 | 4 | 0 | 36 | 0 |
| tunnel_per | 32 | 0.94 | 5 | 7 | 0 | 134 | 0 |
| source1 | 12 | 0.98 | 4 | 54 | 0 | 17 | 0 |
| currency | 7 | 0.99 | 2 | 3 | 0 | 39 | 0 |
| real_cost | 0 | 1.00 | 1 | 10 | 0 | 534 | 0 |
| source2 | 10 | 0.98 | 3 | 16 | 0 | 12 | 0 |
| reference | 19 | 0.97 | 3 | 302 | 0 | 350 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| e | 7 | 0.99 | 7738.76 | 463.23 | 7136.00 | 7403.00 | 7705.00 | 7977.00 | 9510.00 | ▇▇▂▁▁ |
| rr | 8 | 0.99 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| length | 5 | 0.99 | 58.34 | 621.20 | 0.60 | 6.50 | 15.77 | 29.08 | 12256.98 | ▇▁▁▁▁ |
| tunnel | 32 | 0.94 | 29.38 | 344.04 | 0.00 | 3.40 | 8.91 | 21.52 | 7790.78 | ▇▁▁▁▁ |
| stations | 15 | 0.97 | 13.81 | 13.70 | 0.00 | 4.00 | 10.00 | 20.00 | 128.00 | ▇▁▁▁▁ |
| cost | 7 | 0.99 | 805438.12 | 6708033.07 | 0.00 | 2289.00 | 11000.00 | 27000.00 | 90000000.00 | ▇▁▁▁▁ |
| year | 7 | 0.99 | 2014.91 | 5.64 | 1987.00 | 2012.00 | 2016.00 | 2019.00 | 2027.00 | ▁▁▂▇▂ |
| ppp_rate | 9 | 0.98 | 0.66 | 0.87 | 0.00 | 0.24 | 0.26 | 1.00 | 5.00 | ▇▂▁▁▁ |
| cost_km_millions | 2 | 1.00 | 232.98 | 257.22 | 7.79 | 134.86 | 181.25 | 241.43 | 3928.57 | ▇▁▁▁▁ |
data <- transit_cost %>%
# Treat missing values
na.omit() %>%
# log transform variables with pos-skewed distribution
mutate(cost = log(cost))
Correlation between city and Cost
top20_cities_vec <- data %>%
count(city, sort = TRUE) %>%
head(20) %>%
pull(city)
top20_cities_vec
## [1] "Beijing" "Shanghai" "Istanbul" "Mumbai" "Changsha" "Nanjing"
## [7] "Chengdu" "Paris" "Guangzhou" "Tokyo" "Wuhan" "Shenzhen"
## [13] "Chongqing" "Dongguan" "Taipei" "Tianjin" "Bangkok" "Barcelona"
## [19] "Hangzhou" "Madrid"
data %>%
#Filter for top 20 cities worldwide
filter(city %in% top20_cities_vec) %>%
ggplot(aes(cost, fct_reorder(city, city))) +
geom_boxplot()
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
Year
data %>%
ggplot(aes(cost, year)) +
scale_y_log10() +
geom_point()
EDA Shortcut
# Step 1: Prepare the data
data_binarized_tbl <- data %>%
select(year, -city) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 428
## Columns: 4
## $ `year__-Inf_2012` <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ year__2012_2016 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1…
## $ year__2016_2018 <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ year__2018_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0…
# Step 2 Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(year__2018_Inf)
data_corr_tbl
## # A tibble: 4 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 year 2018_Inf 1
## 2 year 2012_2016 -0.377
## 3 year -Inf_2012 -0.339
## 4 year 2016_2018 -0.258
# Step 3 Plot
data_corr_tbl %>%
plot_correlation_funnel()