It’s Saturday night and I think it’s time to analyze some data…
I downloaded some data from Our World in Data (https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions) and thought it would be a good exercise for myself to outline what I would normally do when presented with new data.
#first let's load some libraries
library(data.table)
library(ggpubr)
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ dplyr 1.0.1
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
# Read the emissions data file downloaded from Our World in Data.
# `file =` is given explicitly so fread() treats the string strictly as a path
# rather than guessing whether it is a file name, a URL, or a shell command.
co2_data <- fread(file = "owid-co2-data.csv")
# Quick check of the data: print the first rows.
head(co2_data)
# Some quick data cleaning: drop every row whose `country` is one of the
# entries listed below, so that we are only looking at individual countries.
# NOTE(review): "Reunion" is removed together with the aggregates; presumably
# deliberate, but worth confirming.
co2_data <- co2_data %>%
  filter(
    !country %in% c(
      "Africa", "Asia", "Asia (excl. China & India)",
      "EU-27", "EU-28", "Europe", "Europe (excl. EU-27)",
      "Europe (excl. EU-28)", "International transport",
      "KP Annex B", "Non KP Annex B", "Non-OECD", "North America",
      "North America (excl. USA)", "OECD", "Oceania", "Reunion",
      "South America", "World"
    )
  )
# Doing that makes me think a country-versus-region column would have been a
# useful addition to the dataset.
The full variable descriptions are available in the codebook: https://github.com/owid/co2-data/blob/master/owid-co2-codebook.csv

Now that we have some data, I am interested to see which countries produced the most CO2 emissions in the last ten years.
# Most recent year present in the data.
max(co2_data[["year"]])
## [1] 2018
# Keep only rows from 2008 onwards; with 2018 as the latest year this is the
# inclusive window 2008 to 2018.
co2_data <- co2_data %>%
  filter(year >= 2008)
Now, there are several things we can ask about the data, but first let’s take another look at the available variables that may be of interest.
# Variables (column names) that we can look at.
names(co2_data)
## [1] "iso_code" "country"
## [3] "year" "co2"
## [5] "co2_growth_prct" "co2_growth_abs"
## [7] "consumption_co2" "trade_co2"
## [9] "trade_co2_share" "co2_per_capita"
## [11] "consumption_co2_per_capita" "share_global_co2"
## [13] "cumulative_co2" "share_global_cumulative_co2"
## [15] "co2_per_gdp" "consumption_co2_per_gdp"
## [17] "co2_per_unit_energy" "cement_co2"
## [19] "coal_co2" "flaring_co2"
## [21] "gas_co2" "oil_co2"
## [23] "cement_co2_per_capita" "coal_co2_per_capita"
## [25] "flaring_co2_per_capita" "gas_co2_per_capita"
## [27] "oil_co2_per_capita" "total_ghg"
## [29] "ghg_per_capita" "methane"
## [31] "methane_per_capita" "nitrous_oxide"
## [33] "nitrous_oxide_per_capita" "primary_energy_consumption"
## [35] "energy_per_capita" "energy_per_gdp"
## [37] "population" "gdp"
# Strip chart of `co2` by `year`:
#   x-axis: year
#   y-axis: annual CO2 emissions, one point per row of co2_data
# The median is added on top of the points for each year.
ggstripchart(
  co2_data,
  x = "year",
  y = "co2",
  color = "steelblue",
  size = 1,
  add = "median",
  ggtheme = theme_minimal()
) +
  labs(y = "Annual CO2 emissions")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
## Warning: `fun.ymax` is deprecated. Use `fun.max` instead.
# Who are the two countries that are consistently higher than everyone else?
# Every row whose `co2` exceeds the threshold is drawn in red and labelled
# with its country name; all other points are black.
library(ggrepel)
# Single source of truth for the cut-off used by both the labels and the
# point colours.
emission_threshold <- 5000
ggplot(co2_data, aes(year, co2, label = country)) +
  geom_text_repel(data = subset(co2_data, co2 > emission_threshold)) +
  geom_point(
    color = ifelse(co2_data$co2 > emission_threshold, "red", "black")
  ) +
  theme_minimal() +
  ylab("Annual CO2 emissions") +
  # One tick per whole year inside the plotted range, derived from the axis
  # limits instead of a hard-coded 0:2100 vector.
  scale_x_continuous(
    breaks = function(lims) seq(ceiling(lims[1]), floor(lims[2]))
  )
# What has been happening over the years?
# Line plot of `co2` against `year` with the median added as the summary.
# NOTE(review): scale_x_continuous() below needs ggline() to keep `year` as a
# numeric axis; if it is ever converted to a factor, consider passing
# numeric.x.axis = TRUE (confirm against the ggpubr documentation).
ggline(
  co2_data,
  x = "year",
  y = "co2",
  add = "median",
  color = "steelblue",
  ggtheme = theme_minimal()
) +
  # One tick per whole year inside the plotted range, derived from the axis
  # limits instead of a hard-coded 0:2100 vector.
  scale_x_continuous(
    breaks = function(lims) seq(ceiling(lims[1]), floor(lims[2]))
  ) +
  ylab("Median global annual CO2 emissions")