date column
This notebook contains visualizations of a few World Bank indicators for the top 6 economies of the world. It also contains a few plots for the Global emission data (\(CO_2\) & Greenhouse Gases).
library(tidyverse)
library(ggthemes)
library(lubridate)
library(plotly)
library(patchwork)
library(gghighlight)#read csv file
# Read the World Bank development indicators. The relative path is resolved
# against the current working directory, which is what the former
# paste0(getwd(), "/...") construction spelled out by hand.
all_data <- read_csv(
  file.path("data", "world_bank_data", "world_bank_development_indicators.csv")
)

# selecting only relevant columns
# (the back-ticked names contain "%", so they are not syntactic R names)
all_data <- all_data %>%
  select(
    country, date, population, population_density, GDP_current_US,
    `forest_land%`,
    `renewvable_energy_consumption%`,
    CO2_emisions,
    other_greenhouse_emisions,
    `research_and_development_expenditure%`,
    `military_expenditure%`,
    `government_expenditure_on_education%`,
    `government_health_expenditure%`
  )
date column
#add year column
# Derive an integer calendar-year column from the date column
all_data <- all_data %>%
  mutate(year = as.integer(lubridate::year(date)))
# Summary statistics of the new year column
summary(all_data$year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1960 1975 1991 1991 2007 2022
## Rows: 16,780
## Columns: 14
## $ country <chr> "Afghanistan", "Afghanistan", …
## $ date <date> 1960-01-01, 1961-01-01, 1962-…
## $ population <dbl> 8622466, 8790140, 8969047, 915…
## $ population_density <dbl> NA, 13.47706, 13.75136, 14.040…
## $ GDP_current_US <dbl> 537777811, 548888896, 54666667…
## $ `forest_land%` <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ `renewvable_energy_consumption%` <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ CO2_emisions <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ other_greenhouse_emisions <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ `research_and_development_expenditure%` <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ `military_expenditure%` <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ `government_expenditure_on_education%` <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ `government_health_expenditure%` <dbl> NA, NA, NA, NA, NA, NA, NA, NA…
## $ year <int> 1960, 1961, 1962, 1963, 1964, …
# Give the "%"-suffixed indicator columns short, syntactic names.
# Each entry of the lookup reads new_name = "old_name"; two renames for
# columns that are not selected above stay commented out.
all_data <- rename(
  all_data,
  all_of(c(
    # inflation_perc = "inflation_annual%",
    # agri_land_perc = "agricultural_land%",
    forest_land_perc = "forest_land%",
    renewable_energy_consump_perc = "renewvable_energy_consumption%",
    r_and_d_exp_perc = "research_and_development_expenditure%",
    military_exp_perc = "military_expenditure%",
    education_exp_perc = "government_expenditure_on_education%",
    health_exp_perc = "government_health_expenditure%"
  ))
)
The country column not only contains names of countries, but also names of continents, regions like “Western Africa”, “Sub-Saharan region”, “conflict states”, “OECD” etc.
All rows that have “World” as the country are extracted.
I’ve created a list of keywords found in the
country column. All rows with these
keywords will be removed, leaving behind proper country names.
# Filter out aggregate / regional rows so that only country names remain.
#
# Recurring keywords found in the country column; any row whose country
# contains one of them (case-insensitively) is dropped.
# The pattern is assembled with paste(collapse = "|") rather than written as
# one string literal broken across lines: a line break inside the literal
# becomes part of the regex, so the alternatives that followed a break
# ("Fragile and conflict affected situations", "Not classified") could never
# match.
# NOTE(review): this is a substring match, so genuine countries whose names
# contain a keyword (e.g. "Island", "Central") are dropped too - verify
# against the data if more than the largest economies are needed.
keywords <- paste(
  c(
    "Eastern", "Southern", "Western", "Central", "World", "Island", "Euro",
    "dividend", "Asia", "income",
    "Fragile and conflict affected situations", "IDA", "countries", "IBRD",
    "Latin", "Middle East",
    "Not classified", "OECD members", "small states", "Sahara"
  ),
  collapse = "|"
)

# Keep the rows that do NOT match. A logical mask is used instead of
# all_data[-grep(...), ], because negating an empty grep() result would
# select zero rows instead of all rows.
is_aggregate <- grepl(pattern = keywords, all_data$country, ignore.case = TRUE)
country_data <- all_data[!is_aggregate, ]

# Remove the remaining aggregates by exact name ("North America" is not
# covered by any keyword above).
country_data <- country_data %>%
  filter(!country %in% c(
    "Fragile and conflict affected situations",
    "North America",
    "Not classified"
  ))
The names have been abbreviated to make cleaner labels for X-axis.
The R&D expenditure (as % of GDP) is missing for India for the year 2019. I’ve replaced this NA with 0.7%. This estimate has been taken from the Research and Development Statistics Report, 2019-2020.
2019 has been taken as reference year because most countries have no data for years 2020, 2021 and 2022.
Graphs with different World Bank indicators will be created for these 6 countries.
# Six countries with the highest GDP (current US$) in the reference year 2019
top6_countries <- country_data %>%
  filter(year == 2019) %>%
  slice_max(order_by = GDP_current_US, n = 6) %>%
  pull(country)
# Turn the names into a factor whose level order follows the GDP ranking
top6_countries <- factor(top6_countries, levels = top6_countries)
I have used theme_economist_white() as
the base theme for all plots. This theme is present in the
ggthemes package. It resembles the graph format used by
news articles on The Economist.
common_theme contains some layout components common to
all plots in this notebook.
# Common theme elements for all plots: the white Economist theme plus a
# centred 12pt title and a caption aligned to the whole plot area.
# FALSE is spelled out because the shorthand F is an ordinary, reassignable
# variable.
common_theme <- theme_economist_white(gray_bg = FALSE) +
  theme(
    # plot.title.position = "plot",
    plot.caption.position = "plot",
    plot.title = element_text(size = 12, hjust = 0.5)
  )
Following series of plots is an attempt to make the linechart
interactive.
First comes the linechart in its basic form. In the subsequent steps, I have
noted down some of my observations while playing around with the object
returned by plotly_build() of the
plotly package.
## data prep
# R&D expenditure (% of GDP) of the six selected countries for 2000-2019
r_and_d_data <- country_data %>%
  filter(
    country %in% top6_countries,
    year %in% 2000:2019
  ) %>%
  select(country, date, r_and_d_exp_perc, year)

# R&D value in 2019, rounded to 2 decimals; drawn as the end point of each line
exp_2019 <- r_and_d_data %>%
  filter(year == 2019) %>%
  mutate(r_and_d_exp_perc = round(r_and_d_exp_perc, 2))

# Static line chart. This is a separate statement from the data prep above;
# the outer parentheses make the assignment print the plot as well.
(research_lineplot <- r_and_d_data %>%
  ggplot(aes(x = year, y = r_and_d_exp_perc)) +
  geom_line(aes(color = country), na.rm = TRUE, linewidth = 1) +
  geom_point(
    data = exp_2019,
    aes(x = year, y = r_and_d_exp_perc, color = country),
    shape = 16, na.rm = TRUE
  ) +
  # Hand-placed label boxes replace the legend (the colour guide is
  # switched off below); each box carries a hard-coded colour.
  # label box for India
  annotate(geom = "label", x = 2002, y = 0.7, label = "India", color = "#7ad2f6", size = 2) +
  # label box for UK
  annotate(geom = "label", x = 2002, y = 1.7, label = "UK", color = "#76c0c1", size = 2) +
  # label box for China
  annotate(geom = "label", x = 2002, y = 1.2, label = "China", color = "#6794a7", size = 2) +
  # label box for Germany
  annotate(geom = "label", x = 2002, y = 2.3, label = "Germany", color = "#014d64", size = 2) +
  # label box for Japan
  annotate(geom = "label", x = 2002, y = 3.1, label = "Japan", color = "#01a2d9", size = 2) +
  # label box for USA
  annotate(geom = "label", x = 2002, y = 2.6, label = "USA", color = "#00887d", size = 2) +
  coord_cartesian(clip = "off") +
  scale_x_continuous(
    breaks = seq(2000, 2019, 2),
    labels = seq(2000, 2019, 2)
  ) +
  scale_color_economist(guide = "none") +
  labs(
    x = "", y = "R&D expenditure as % of GDP",
    title = "Comparison of emphasis given to R&D by the 6 countries (2000-2019)",
    subtitle = "Data for year 2019 for India is unknown"
  ) +
  common_theme +
  theme(
    plot.title = element_text(size = 9),
    plot.subtitle = element_text(margin = margin(t = 5, b = 25), size = 8),
    axis.title.y = element_text(margin = margin(r = 7), size = 7),
    axis.text.x = element_text(size = 6, face = "bold"),
    axis.text.y = element_text(size = 6, face = "bold")
  )
)
The label boxes for each country have been removed below to prevent
cluttering in the final interactive plot.
research_lineplot2 is the same linechart
as above except for the label boxes.
# Same line chart as research_lineplot, but without the hand-placed label
# boxes. Layer order is line first, then the 2019 end points.
research_lineplot2 <- r_and_d_data %>%
  ggplot(aes(x = year, y = r_and_d_exp_perc)) +
  geom_line(aes(color = country), na.rm = TRUE, linewidth = 1) +
  geom_point(
    data = exp_2019,
    aes(x = year, y = r_and_d_exp_perc, color = country),
    shape = 16, na.rm = TRUE
  ) +
  coord_cartesian(clip = "off") +
  scale_x_continuous(
    breaks = seq(2000, 2019, 2),
    labels = seq(2000, 2019, 2)
  ) +
  # the colour legend is deliberately switched off
  scale_color_economist(guide = "none") +
  labs(
    x = "", y = "R&D expenditure as % of GDP",
    title = "Comparison of emphasis given to R&D by the 6 countries (2000-2019)",
    subtitle = "Data for year 2019 for India is unknown"
  ) +
  common_theme +
  theme(
    plot.title = element_text(size = 9),
    plot.subtitle = element_text(margin = margin(t = 5, b = 25), size = 8),
    axis.title.y = element_text(margin = margin(r = 7), size = 7),
    axis.text.x = element_text(size = 6, face = "bold"),
    axis.text.y = element_text(size = 6, face = "bold")
  )
Applying plotly_build() renders the
above graph interactive. But it adds a legend which was explicitly
omitted in the code above!
In order to remove the legend, we need to get down to each component
of the plotly object and set showlegend as false.
research_lineplot_i is a super-nested list
internally. The plot above contains a handful of geoms which can be
customized using the data[[i]] part.
For example, geom_line() is the first geom used,
followed by geom_point(). So, data[[1]] to data[[6]]
represent the line component for each of the 6 countries.
data[[1]]$name will return “China”,
data[[2]]$name will return “Germany”.
## [1] "China"
## $width
## [1] 3.779528
##
## $color
## [1] "rgba(103,148,167,1)"
##
## $dash
## [1] "solid"
## [1] "China"
## $autocolorscale
## [1] FALSE
##
## $color
## [1] "rgba(103,148,167,1)"
##
## $opacity
## [1] 1
##
## $size
## [1] 5.669291
##
## $symbol
## [1] "circle"
##
## $line
## $line$width
## [1] 1.889764
##
## $line$color
## [1] "rgba(103,148,167,1)"
Similarly, each of these objects, data[[1]] to data[[12]], contains a
component called showlegend which can be turned on or off
depending on requirement.
Below, I’ve removed the legend created by
plotly_build().
Continuing with the explanation above, another component of
data[[i]] objects is text. What it
contains gets reflected in the tooltip when you hover over the
lines.
Here, I’ve made minor changes to the column names for better readability.
# tooltip content modification
# Rewrite the hover text of every trace in the plotly object from that
# trace's own x values, y values and name.
# seq_along() is used instead of 1:length(), which would iterate over
# c(1, 0) if the trace list were empty.
for (i in seq_along(research_lineplot_i$x$data)) {
  # trace_* names avoid leaving globals called `year` / `country`, which
  # would shadow the lubridate function and the column names used elsewhere
  trace_year <- research_lineplot_i$x$data[[i]]$x
  trace_expend <- research_lineplot_i$x$data[[i]]$y
  trace_country <- research_lineplot_i$x$data[[i]]$name
  # str_glue() is vectorised: one tooltip string per point of the trace
  research_lineplot_i$x$data[[i]]$text <- str_glue(
    "Year: {trace_year}",
    "<br>",
    "R&D expenditure: {round(trace_expend,2)}%",
    "<br>",
    "Country: {trace_country}"
  )
}
And the final plot now looks like…
Hover over the graph above and this one to see the difference in the
tooltip.
I would suggest before trying out these features on Kaggle, it will be better to play around with them in RStudio. I learnt a lot while working on this part in RStudio. :)
# data prep
# Renewable-energy share of the six selected countries at three snapshot
# years; year becomes a factor so each year gets its own discrete fill.
energy_data <- country_data %>%
  filter(
    country %in% top6_countries,
    year %in% c(2000, 2010, 2019)
  ) %>%
  # select() renames the indicator column while picking it
  select(country, year, energy_consump = renewable_energy_consump_perc) %>%
  mutate(year = factor(as.integer(year), levels = c(2000, 2010, 2019)))
#plot
# Dodged bar chart: one group of bars per country, one bar per snapshot year.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
# The outer parentheses print the plot as well as assigning it.
(energy_barplot <- energy_data %>%
  ggplot(aes(x = country, y = energy_consump)) +
  geom_col(aes(fill = year), position = "dodge", width = 0.7) +
  scale_fill_manual(values = c("2000" = "#B0E0E6", "2010" = "#3EBCD2", "2019" = "#006BA2")) +
  # y axis on the right-hand side, starting exactly at 0
  scale_y_continuous(limits = c(0, 50), position = "right", expand = c(0, 0)) +
  labs(
    title = "Renewable energy consumption (% of total energy)",
    subtitle = "Change in green energy consumption over the decades (2000-2019)",
    caption = "Source: World Bank",
    x = "",
    y = ""
  ) +
  common_theme +
  theme(
    # plot.subtitle = element_text(margin = margin(0,0,5,0)),
    # legend.text = element_text(size = 5),
    legend.key.size = unit(.5, "cm"),
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 8),
    legend.direction = "horizontal",
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 7, face = "bold", margin = margin(t = 1)),
    axis.text.y = element_text(size = 7, face = "bold"),
    plot.subtitle = element_text(margin = margin(t = 3, b = 10), size = 8),
    plot.title = element_text(size = 9),
    plot.caption = element_text(hjust = 0, size = 7)
  )
)

# Global emission series after 1990.
# NOTE(review): world_data is not defined in the visible part of the
# notebook; presumably it holds the rows whose country is "World" - confirm.
global_climate_data <- world_data %>%
  filter(year > 1990) %>%
  select(
    country, date, year,
    # picks every column whose name contains "emision" (sic)
    contains(match = "emision", ignore.case = TRUE)
  ) %>%
  # scale both emission columns down by a factor of 1e6
  mutate(
    CO2_emisions = CO2_emisions / 1e6,
    other_greenhouse_emisions = other_greenhouse_emisions / 1e6
  ) %>%
  # drop every row that has an NA in any of the selected columns
  drop_na()
In the plots below, I’ve added markers for Kyoto Protocol signed by countries all over the world in Dec 1997, followed by the 4th report by IPCC on Climate change urging the countries to build policy that control average earth temperature at 2\(^\circ\)C. Then we signed the Paris Agreement in December 2015 but the global emissions have kept on rising.
The decline after 2019 is because the world stood still during Coronavirus pandemic in 2020.
# Draw one global-emissions time series as a line over a shaded area, with
# dashed vertical markers and label boxes for three climate-policy
# milestones (1997, 2007, 2015).
#   data      data frame with a numeric `year` column and the column `y_col`
#   y_col     name (string) of the emissions column to plot
#   label_y   y position of the milestone label boxes
#   y_breaks  y-axis breaks (also used as the tick labels)
#   title     plot title
#   y_title   y-axis title
# Returns the ggplot object.
plot_global_emissions <- function(data, y_col, label_y, y_breaks, title, y_title) {
  ggplot(data, aes(x = year, y = .data[[y_col]])) +
    geom_line(na.rm = TRUE, color = "#006BA2", linewidth = 1.5) +
    geom_area(na.rm = TRUE, fill = "#7ad2f6", alpha = 0.4) +
    # geom_hline(yintercept = 21.39, color = "#014d64", linetype = "dashed")+
    # Kyoto Protocol 1997
    geom_vline(xintercept = 1997, linetype = "dashed", color = "#014d64") +
    annotate("label", x = 1997, y = label_y, label = "Kyoto Protocol\nDec 1997", size = 3) +
    # IPCC 4th Report 2007
    geom_vline(xintercept = 2007, linetype = "dashed", color = "#014d64") +
    annotate(geom = "label", x = 2007, y = label_y, label = "IPCC 4th Report", size = 3) +
    # Paris Agreement 2015
    geom_vline(xintercept = 2015, linetype = "dashed", color = "#014d64") +
    annotate(geom = "label", x = 2015, y = label_y, label = "Paris Agreement\nDec 2015", size = 3) +
    scale_x_continuous(breaks = seq(1991, 2020, 1), labels = seq(1991, 2020, 1)) +
    scale_y_continuous(expand = c(0, 0), breaks = y_breaks, labels = y_breaks) +
    labs(
      title = title,
      caption = "Source: World Bank",
      x = "",
      y = y_title
    ) +
    # clip = "off" lets the label boxes extend beyond the panel area
    coord_cartesian(clip = "off") +
    common_theme +
    theme(
      axis.ticks.length.x.bottom = unit(0.1, units = "cm"),
      axis.text.x = element_text(angle = 90, size = 6, margin = margin(t = 1.5), face = "bold"),
      axis.text.y = element_text(size = 6, face = "bold"),
      axis.title.y = element_text(margin = margin(r = 10), size = 7),
      panel.grid.major.y = element_line(linewidth = 0.3),
      plot.caption = element_text(hjust = 0, size = 7),
      plot.title = element_text(margin = margin(b = 30), hjust = 0.5)
    )
}

# The emission columns of global_climate_data were divided by 1e6 when the
# data frame was built, so the plotted values are in millions of the source
# unit (kt according to the original labels), not in plain kt.
(co2_plot <- plot_global_emissions(
  global_climate_data,
  y_col = "CO2_emisions",
  label_y = 36.5,
  y_breaks = seq(0, 40, 5),
  title = "Carbon dioxide emissions in million kt (1991-2020)",
  y_title = "mass in million kt"
))

# Title period corrected to 1991-2020 to match the x axis (it read 1991-2000).
(ghg_plot <- plot_global_emissions(
  global_climate_data,
  y_col = "other_greenhouse_emisions",
  label_y = 51,
  y_breaks = seq(0, 55, 5),
  title = "Greenhouse gas emission in million kt (1991-2020)",
  y_title = "mass in million kt (kiloton)"
))