It’s Saturday night and I think it’s time to analyze some data…

I downloaded some data from Our World in Data (https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions) and thought it would be a good exercise for myself to outline what I would normally do when presented with new data.

#first let's load some libraries 
library(data.table)
library(ggpubr)
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.3     ✓ dplyr   1.0.1
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
# Read the CO2 emissions dataset downloaded from Our World in Data
co2_data <- fread("owid-co2-data.csv")

# Peek at the first few rows to confirm the file was parsed as expected
head(co2_data)
# Quick data cleaning: drop the regional / aggregate entries listed below so
# that the remaining rows describe individual countries only
co2_data <- co2_data %>%
  filter(
    !country %in% c(
      "Africa", "Asia", "Asia (excl. China & India)",
      "EU-27", "EU-28", "Europe", "Europe (excl. EU-27)",
      "Europe (excl. EU-28)", "International transport",
      "KP Annex B", "Non KP Annex B", "Non-OECD", "North America",
      "North America (excl. USA)", "OECD", "Oceania", "Reunion",
      "South America", "World"
    )
  )

#doing that makes me think that this would have been a useful column to include in the dataset (country versus region)

Full variable descriptions are available in the codebook (https://github.com/owid/co2-data/blob/master/owid-co2-codebook.csv).

Now that we have some data, I am interested to see which countries produced the most CO2 emissions in the last ten years.

# Latest year covered by the dataset
max(co2_data[["year"]])
## [1] 2018
# Keep 2008 onwards, i.e. 2008 through 2018 inclusive (11 calendar years)
co2_data <- co2_data %>%
  filter(year >= 2008)

Now, there are several things we can ask about the data, but first let’s take another look at the available variables that may be of interest.

# List the columns that are available for analysis
names(co2_data)
##  [1] "iso_code"                    "country"                    
##  [3] "year"                        "co2"                        
##  [5] "co2_growth_prct"             "co2_growth_abs"             
##  [7] "consumption_co2"             "trade_co2"                  
##  [9] "trade_co2_share"             "co2_per_capita"             
## [11] "consumption_co2_per_capita"  "share_global_co2"           
## [13] "cumulative_co2"              "share_global_cumulative_co2"
## [15] "co2_per_gdp"                 "consumption_co2_per_gdp"    
## [17] "co2_per_unit_energy"         "cement_co2"                 
## [19] "coal_co2"                    "flaring_co2"                
## [21] "gas_co2"                     "oil_co2"                    
## [23] "cement_co2_per_capita"       "coal_co2_per_capita"        
## [25] "flaring_co2_per_capita"      "gas_co2_per_capita"         
## [27] "oil_co2_per_capita"          "total_ghg"                  
## [29] "ghg_per_capita"              "methane"                    
## [31] "methane_per_capita"          "nitrous_oxide"              
## [33] "nitrous_oxide_per_capita"    "primary_energy_consumption" 
## [35] "energy_per_capita"           "energy_per_gdp"             
## [37] "population"                  "gdp"
# Strip chart of annual CO2 emissions:
#   x-axis: year
#   y-axis: annual CO2 emissions, one point per row of co2_data,
#           with the median of each year overlaid
ggstripchart(
  co2_data,
  x = "year",
  y = "co2",
  color = "steelblue",
  size = 1,
  add = "median",
  ggtheme = theme_minimal()
) +
  ylab("Annual CO2 emissions")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
## Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

# Who are the two countries that are consistently higher than everyone else?
library(ggrepel)

# Single cut-off shared by the labels and the point colours, so the two
# layers cannot drift apart
high_emitter_threshold <- 5000

# One tick per year present in the data, rather than requesting 0:2100 and
# relying on ggplot2 to discard the breaks that fall outside the axis range
year_breaks <- seq(min(co2_data$year), max(co2_data$year))

ggplot(co2_data, aes(year, co2, label = country)) +
  # Label only the observations above the cut-off
  geom_text_repel(data = subset(co2_data, co2 > high_emitter_threshold)) +
  # Colour is computed from the layer's own data, so it always lines up with
  # the rows being drawn; scale_colour_identity() uses the colour names as-is
  geom_point(
    aes(colour = ifelse(co2 > high_emitter_threshold, "red", "black"))
  ) +
  scale_colour_identity() +
  scale_x_continuous(breaks = year_breaks) +
  theme_minimal() +
  ylab("Annual CO2 emissions")

# What has been happening over the years?
# Line plot of the median of co2 for each year.
# ggline() treats x as a discrete variable unless numeric.x.axis = TRUE; the
# continuous x scale added below needs a numeric axis, so request one here.
ggline(
  co2_data,
  x = "year",
  y = "co2",
  numeric.x.axis = TRUE,
  add = "median",
  color = "steelblue",
  ggtheme = theme_minimal()
) +
  # One tick per year present in the data instead of the hard-coded 0:2100
  scale_x_continuous(breaks = seq(min(co2_data$year), max(co2_data$year))) +
  ylab("Median global annual CO2 emissions")