Dataset from: https://www.kaggle.com/datasets/timmofeyy/-world-metro
city chr The city where Metro is located
country chr The country where Metro is located
name chr The name of metro (underground)
year int A year when it was constructed
year_last_expansion int A year of last expansion
stations int A number of stations
length_km int Length of lines in km
annual_ridership_mill dbl Annual ridership of the metro in million
region chr Region of metro (was added for visualization)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.1.3
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(dplyr)
library(viridis)
## Warning: package 'viridis' was built under R version 4.1.3
## Loading required package: viridisLite
library(ggcharts)
## Warning: package 'ggcharts' was built under R version 4.1.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#install.packages('ggcharts')
# Load the World Metro dataset.
# The CSV is UTF-8 encoded; without declaring that, accented names are read in
# the platform's native encoding and print as mojibake (e.g. "BrasÃlia").
# `encoding = "UTF-8"` marks the strings as UTF-8 without re-encoding them.
df <- read.csv("metro_countries_cities.csv", encoding = "UTF-8")
# Print as a tibble for a compact, typed preview of the first rows.
as_tibble(df)
## # A tibble: 198 x 9
## city country name year year_last_expan~ stations length_km
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 Algiers Algeria Algiers ~ 2011 2018 19 185
## 2 Buenos Aires Argentina Buenos A~ 1913 2019 90 567
## 3 Yerevan Armenia Yerevan ~ 1981 1996 10 134
## 4 Sydney Australia Sydney M~ 2019 2019 13 36
## 5 Vienna Austria Vienna U~ 1976 2017 98 833
## 6 Baku Azerbaijan Baku Met~ 1967 2021 26 3803
## 7 Minsk Belarus Minsk Me~ 1984 2020 33 408
## 8 Brussels Belgium Brussels~ 1976 2009 59 399
## 9 Belo Horizonte Brazil Belo Hor~ 1986 2002 19 281
## 10 Brasília Brazil Brasíli~ 2001 2020 27 424
## # ... with 188 more rows, and 2 more variables: annual_ridership_mill <dbl>,
## # region <chr>
# Total number of missing cells across the whole data frame.
df %>%
  is.na() %>%
  sum()
## [1] 0
#' Most frequent value (statistical mode) of a vector
#'
#' @param v A vector (numeric, character, ...).
#' @return The value of `v` that occurs most often. When several values tie,
#'   the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Map every element to the index of its distinct value, then count per index.
  occurrences <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(occurrences)]
}
# Number of stations by region: descriptive statistics for each group.
summarise(
  group_by(df, region),
  mean = mean(stations),
  median = median(stations),
  mode = getmode(stations),
  min = min(stations),
  max = max(stations),
  sd = sd(stations),
  count = n()
)
## # A tibble: 6 x 8
## region mean median mode min max sd count
## <chr> <dbl> <dbl> <int> <int> <int> <dbl> <int>
## 1 africa 45 45 19 19 71 36.8 2
## 2 asia 72.7 41 10 6 396 80.4 117
## 3 australia 13 13 13 13 13 NA 1
## 4 europe 69.2 48 33 6 306 68.2 43
## 5 latin_america 50.6 29 19 18 163 43.5 17
## 6 north_america 66.7 42.5 75 13 424 95.9 18
# Length of metro lines (km) by region: descriptive statistics for each group.
summarise(
  group_by(df, region),
  mean = mean(length_km),
  median = median(length_km),
  mode = getmode(length_km),
  min = min(length_km),
  max = max(length_km),
  sd = sd(length_km),
  count = n()
)
## # A tibble: 6 x 8
## region mean median mode min max sd count
## <chr> <dbl> <dbl> <int> <int> <int> <dbl> <int>
## 1 africa 540. 540. 185 185 894 501. 2
## 2 asia 2176. 435 1425 12 49039 6274. 117
## 3 australia 36 36 36 36 36 NA 1
## 4 europe 589. 381 382 34 4052 806. 43
## 5 latin_america 489. 368 567 58 2009 449. 17
## 6 north_america 498. 264. 71 31 1868 520. 18
# Annual ridership (millions) by country: descriptive statistics for each group.
# Countries with a single metro have no spread, so their `sd` is NA.
summarise(
  group_by(df, country),
  mean = mean(annual_ridership_mill),
  median = median(annual_ridership_mill),
  mode = getmode(annual_ridership_mill),
  min = min(annual_ridership_mill),
  max = max(annual_ridership_mill),
  sd = sd(annual_ridership_mill),
  count = n()
)
## # A tibble: 59 x 8
## country mean median mode min max sd count
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Algeria 45.3 45.3 45.3 45.3 45.3 NA 1
## 2 Argentina 74 74 74 74 74 NA 1
## 3 Armenia 10.8 10.8 10.8 10.8 10.8 NA 1
## 4 Australia 12.9 12.9 12.9 12.9 12.9 NA 1
## 5 Austria 460. 460. 460. 460. 460. NA 1
## 6 Azerbaijan 72.1 72.1 72.1 72.1 72.1 NA 1
## 7 Belarus 219. 219. 219. 219. 219. NA 1
## 8 Belgium 87.6 87.6 87.6 87.6 87.6 NA 1
## 9 Brazil 169. 62 54.4 42.8 764. 264. 7
## 10 Bulgaria 92.4 92.4 92.4 92.4 92.4 NA 1
## # ... with 49 more rows
# Frequency distribution: number of metro systems in each region.
df_region <- df[["region"]]
# Turn the one-way table into a two-column data frame (df_region, Freq).
data.frame(table(df_region))
## df_region Freq
## 1 africa 2
## 2 asia 117
## 3 australia 1
## 4 europe 43
## 5 latin_america 17
## 6 north_america 18
# Relative frequency distribution: each region's share of all metro systems.
data.frame(table(df_region) / length(df_region))
## df_region Freq
## 1 africa 0.010101010
## 2 asia 0.590909091
## 3 australia 0.005050505
## 4 europe 0.217171717
## 5 latin_america 0.085858586
## 6 north_america 0.090909091
# Metros constructed in 2000 or later, cross-tabulated by region and year.
# `year` is an integer column, so compare it with the number 2000. Comparing
# with the string "2000" coerces every year to character and compares text,
# which only happens to work while all years have four digits.
new_df <- filter(df, year >= 2000)
tab1 <- table(new_df$region, new_df$year)
tab1
##
## 2000 2001 2002 2003 2004 2005 2006 2008 2009 2010 2011 2012
## africa 0 0 0 0 0 0 0 0 0 0 1 0
## asia 2 0 3 1 5 3 1 1 2 3 5 3
## australia 0 0 0 0 0 0 0 0 0 0 0 0
## europe 0 0 2 0 0 0 1 1 0 0 0 0
## latin_america 0 1 0 0 0 0 0 0 1 0 1 0
## north_america 0 0 0 0 1 0 0 0 0 0 0 0
##
## 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
## africa 0 0 0 0 0 0 0 0 0 0
## asia 3 5 6 4 7 1 11 2 6 1
## australia 0 0 0 0 0 0 1 0 0 0
## europe 1 0 0 0 0 0 0 0 0 0
## latin_america 0 2 0 0 0 0 0 0 0 0
## north_america 0 0 0 0 0 0 0 0 0 0
# Stacked bar chart: metro systems per construction year, filled by region.
p.bar <- ggplot(df, aes(x = year)) +
  geom_bar(mapping = aes(fill = region))
p.bar +
  labs(
    title = "World Metro from 1890 to 2022",
    x = "Year",
    y = "Number of Metro"
  ) +
  theme_light()
# US metro systems: each city's share of stations, drawn as a pie chart.
us <- filter(df, country == "United States")
us$stations <- as.numeric(us$stations)
# Keep only the city and its station count. No group_by() is needed here:
# aggregate() below performs the per-city summation.
df_us <- select(us, city, stations)
##### Sum of the column by group
# aggregate() names its output columns Group.1 (the city) and x (the total).
data <- aggregate(
  x = df_us$stations,
  by = list(df_us$city),
  FUN = sum
)
# Slice mid-points for text labels. Kept for future use: the plot below does
# not draw labels yet.
df2 <- data %>%
  mutate(
    csum = rev(cumsum(rev(x))),
    pos = x / 2 + lead(csum, 1),
    pos = if_else(is.na(pos), x / 2, pos)
  )
#df2
# The slices are cities (Group.1 comes from `city`), so the legend and title
# say "City" rather than "State".
ggplot(data, aes(x = "", y = x, fill = fct_inorder(Group.1))) +
  geom_col(width = 1, color = 1) +
  coord_polar(theta = "y") +
  guides(fill = guide_legend(title = "City")) +
  theme_void() +
  ggtitle("US Metro Stations by City")
# Box plot 1: distribution of station counts per region.
ggplot(df, aes(x = region, y = stations, fill = region)) +
  geom_boxplot() +
  xlab("Region") +
  ylab("Number of Stations") +
  theme_minimal() +
  # The x axis already names each region, so the fill legend is redundant.
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14)
  ) +
  ggtitle("Number of Stations World Metro Development by Region") +
  scale_fill_brewer(palette = "Dark2") +
  # Stagger the axis labels over two rows so long region names do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 2))
# Histogram of station counts for all metros, outlined by region.
# Note: despite its name, `p.asia` is built from the full `df`, not only Asia.
p.asia <- ggplot(df, aes(x = stations, color = region)) +
  ggtitle("Number of Stations World Metro Development by Region") +
  geom_histogram(fill = "white") +
  theme_classic()
# Interactive version; the hover tooltip shows the region and the bin count.
ggplotly(p.asia, tooltip = c("region", "count"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Restrict the data to Asian metros.
asian_df <- filter(df, region == "asia")
# Box plot: spread of annual ridership within each Asian country.
p.box <- ggplot(
  asian_df,
  aes(x = country, y = annual_ridership_mill, fill = country)
) +
  geom_boxplot() +
  labs(
    x = "Country",
    y = "Riderships in Million",
    title = "Annual Ridership in Asian Countries"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14)
  )
# Rotate and shrink the country labels so they all fit on the x axis.
p.box + theme(axis.text.x = element_text(angle = 90, size = 8))
# Histogram of annual ridership for Asian metros, outlined by country.
ggplot(asian_df, aes(x = annual_ridership_mill, color = country)) +
  labs(
    x = "Annual Ridership in million",
    y = "Count",
    title = "Metro Annual Riderships in Asia"
  ) +
  geom_histogram(fill = "white") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The dataset includes information about metro systems all around the world. It covers underground projects from all over the world: their annual ridership, number of stations, metros currently in progress and their planned opening, and date of construction. This dataset only includes heavy rail rapid transit systems. It does not include bus or light rail systems.
I learned that Europe and North America were the first two regions to develop metro systems, in the 19th century. However, Asia has become the region where metro systems have grown the most over the last decade. As a result, Asia has the highest number of metro stations in the world, driven by the growing metro development in China. There is no doubt that China has the highest annual ridership in the region. Australia seems to be the region and country with the smallest metro development.
Since the year 2000, at least one region of the world has constructed a new metro in almost every year (2007 is the only year with no new metro in this dataset).
In North America, on the other hand, a closer look at the US metro systems shows that New York City has the most metro stations in the country, followed by Chicago and Philadelphia.