R Markdown

Goal: To predict transit costs Click here for the data

transit_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')

skimr::skim(transit_cost)

Data summary
Name	transit_cost
Number of rows	544
Number of columns	20
_______________________
Column type frequency:
character	11
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
country	7	0.99	2	2	56
city	7	0.99	4	16	140
line	7	0.99	2	46	366
start_year	53	0.90	4	9	40
end_year	71	0.87	1	4	36
tunnel_per	32	0.94	5	7	134
source1	12	0.98	4	54	17
currency	7	0.99	2	3	39
real_cost	0	1.00	1	10	534
source2	10	0.98	3	16	12
reference	19	0.97	3	302	350

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
e	7	0.99	7738.76	463.23	7136.00	7403.00	7705.00	7977.00	9510.00	▇▇▂▁▁
rr	8	0.99	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
length	5	0.99	58.34	621.20	0.60	6.50	15.77	29.08	12256.98	▇▁▁▁▁
tunnel	32	0.94	29.38	344.04	0.00	3.40	8.91	21.52	7790.78	▇▁▁▁▁
stations	15	0.97	13.81	13.70	0.00	4.00	10.00	20.00	128.00	▇▁▁▁▁
cost	7	0.99	805438.12	6708033.07	0.00	2289.00	11000.00	27000.00	90000000.00	▇▁▁▁▁
year	7	0.99	2014.91	5.64	1987.00	2012.00	2016.00	2019.00	2027.00	▁▁▂▇▂
ppp_rate	9	0.98	0.66	0.87	0.00	0.24	0.26	1.00	5.00	▇▂▁▁▁
cost_km_millions	2	1.00	232.98	257.22	7.79	134.86	181.25	241.43	3928.57	▇▁▁▁▁

data <- transit_cost %>%
  
  # Treat missing values
  na.omit() %>%
  
  # log transform variables with pos-skewed distribution
  mutate(cost = log(cost))

Explore Data

Correlation between city and Cost

top20_cities_vec <- data %>%
  count(city, sort = TRUE) %>%
  head(20) %>%
  pull(city)

top20_cities_vec

##  [1] "Beijing"   "Shanghai"  "Istanbul"  "Mumbai"    "Changsha"  "Nanjing"  
##  [7] "Chengdu"   "Paris"     "Guangzhou" "Tokyo"     "Wuhan"     "Shenzhen" 
## [13] "Chongqing" "Dongguan"  "Taipei"    "Tianjin"   "Bangkok"   "Barcelona"
## [19] "Hangzhou"  "Madrid"

data %>%
  
  #Filter for top 20 cities worldwide
  filter(city %in% top20_cities_vec) %>%
  ggplot(aes(cost, fct_reorder(city, city))) +
  geom_boxplot()

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA

Identify good predictors

Year

data %>% 
  ggplot(aes(cost, year)) +
  scale_y_log10() +
  geom_point()

EDA Shortcut

# Step 1: Prepare the data

data_binarized_tbl <- data %>%
  select(year, -city) %>%
  binarize()

data_binarized_tbl %>% glimpse()

## Rows: 428
## Columns: 4
## $ `year__-Inf_2012` <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ year__2012_2016   <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1…
## $ year__2016_2018   <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ year__2018_Inf    <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0…

# Step 2 Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(year__2018_Inf)

data_corr_tbl

## # A tibble: 4 × 3
##   feature bin       correlation
##   <fct>   <chr>           <dbl>
## 1 year    2018_Inf        1    
## 2 year    2012_2016      -0.377
## 3 year    -Inf_2012      -0.339
## 4 year    2016_2018      -0.258

# Step 3 Plot
data_corr_tbl %>%
  plot_correlation_funnel()

Apply 1: Transit Costs

Olivia Pendergast

2024-02-13

R Markdown

Explore Data

Identify good predictors