Project_2_DATA_607 Emissions dataset tidying

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyr)
library(dplyr)
library(readr)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(maps)

## Warning: package 'maps' was built under R version 4.4.3

## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map

library(sf)

## Warning: package 'sf' was built under R version 4.4.3

## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE

Tidying and cleaning data

Why do I think this dataset is not tidy:

Multiple variables in separate columns. Each row should represent a single observation (e.g., emissions for a specific region, item, and year), but this is not the case here.
Years as columns.

This code loads dataset, also it cleans and transform data:

emissions_data <- read.csv("https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/refs/heads/main/DATA607/PROJECT2/Total%20Emissions%20Per%20Country%20(2000-2020)%20(1).csv")

head(emissions_data)

##          Area             Item                          Element       Unit
## 1 Afghanistan    Crop Residues           Direct emissions (N2O) kilotonnes
## 2 Afghanistan    Crop Residues         Indirect emissions (N2O) kilotonnes
## 3 Afghanistan    Crop Residues                  Emissions (N2O) kilotonnes
## 4 Afghanistan    Crop Residues Emissions (CO2eq) from N2O (AR5) kilotonnes
## 5 Afghanistan    Crop Residues          Emissions (CO2eq) (AR5) kilotonnes
## 6 Afghanistan Rice Cultivation                  Emissions (CH4) kilotonnes
##     X2000    X2001    X2002    X2003    X2004    X2005    X2006    X2007
## 1   0.520   0.5267   0.8200   0.9988   0.8225   1.1821   1.0277   1.2426
## 2   0.117   0.1185   0.1845   0.2247   0.1851   0.2660   0.2312   0.2796
## 3   0.637   0.6452   1.0045   1.2235   1.0075   1.4481   1.2589   1.5222
## 4 168.807 170.9884 266.1975 324.2195 266.9995 383.7498 333.6093 403.3749
## 5 168.807 170.9884 266.1975 324.2195 266.9995 383.7498 333.6093 403.3749
## 6  18.200  16.9400  18.9000  20.3000  27.3000  22.4000  22.4000  23.8000
##      X2008    X2009    X2010    X2011    X2012    X2013    X2014    X2015
## 1   0.8869   1.3920   1.2742   1.0321   1.3726   1.4018   1.4584   1.2424
## 2   0.1996   0.3132   0.2867   0.2322   0.3088   0.3154   0.3281   0.2795
## 3   1.0865   1.7051   1.5609   1.2643   1.6815   1.7173   1.7865   1.5220
## 4 287.9099 451.8647 413.6467 335.0379 445.5958 455.0727 473.4174 403.3181
## 5 287.9099 451.8647 413.6467 335.0379 445.5958 455.0727 473.4174 403.3181
## 6  26.6000  28.0000  29.1200  29.4000  28.7000  28.7000  30.8000  22.9600
##      X2016    X2017    X2018    X2019    X2020
## 1   1.1940   1.0617   0.8988   1.2176   1.3170
## 2   0.2687   0.2389   0.2022   0.2740   0.2963
## 3   1.4627   1.3005   1.1011   1.4916   1.6133
## 4 387.6130 344.6447 291.7838 395.2689 427.5284
## 5 387.6130 344.6447 291.7838 395.2689 427.5284
## 6  16.6600  15.3233  16.4555  17.8542  20.6577

year_cols <- grep("^X2", colnames(emissions_data), value = TRUE)
emissions_data <- emissions_data |>
  distinct(across(all_of(starts_with("X2"))), .keep_all = TRUE)


format_element <- function(x) {
  letters_match <- str_extract(x, "(?i)(CO2eq|N2O|CH4|CO2)")
  clean_txt <- str_remove(x, "(?i)(CO2eq|N2O|CH4|CO2)\\s*") |>
    str_remove_all("(?i)\\bemissions\\b")
  
  clean_txt <- str_to_lower(clean_txt) |>
    str_replace_all("\\s+", "_") |>
    str_replace_all("[^a-z0-9_]", "")|>
    str_replace_all("_$", "")
  
  result <- if(!is.na(letters_match) &letters_match !="") {
    str_c(letters_match, clean_txt, sep = "_")
  } else {
    clean_txt
  }
  
  result <- str_replace_all(result, "(?i)\\bn2o\\b", "N2O")
  result <- str_replace_all(result, "(?i)\\bch4\\b", "CH4")
  result <- str_replace_all(result, "(?i)\\bar5\\b", "AR5")
  result <- str_replace_all(result, "(?i)\\bco2\\b", "CO2")
  result <- str_replace_all(result, "_$", "")
  
  return(result)
}

colnames(emissions_data) <- str_remove_all(colnames(emissions_data), "(?i)^X")

emissions_data <- emissions_data |>
  mutate(Element = map_chr(Element, format_element)) |>
  drop_na()
head(emissions_data)

##          Area             Item              Element       Unit    2000     2001
## 1 Afghanistan    Crop Residues           N2O_direct kilotonnes   0.520   0.5267
## 2 Afghanistan    Crop Residues         N2O_indirect kilotonnes   0.117   0.1185
## 3 Afghanistan    Crop Residues                  N2O kilotonnes   0.637   0.6452
## 4 Afghanistan    Crop Residues CO2eq___from_n2o_ar5 kilotonnes 168.807 170.9884
## 5 Afghanistan Rice Cultivation                  CH4 kilotonnes  18.200  16.9400
## 6 Afghanistan Rice Cultivation CO2eq___from_ch4_ar5 kilotonnes 509.600 474.3200
##       2002     2003     2004     2005     2006     2007     2008     2009
## 1   0.8200   0.9988   0.8225   1.1821   1.0277   1.2426   0.8869   1.3920
## 2   0.1845   0.2247   0.1851   0.2660   0.2312   0.2796   0.1996   0.3132
## 3   1.0045   1.2235   1.0075   1.4481   1.2589   1.5222   1.0865   1.7051
## 4 266.1975 324.2195 266.9995 383.7498 333.6093 403.3749 287.9099 451.8647
## 5  18.9000  20.3000  27.3000  22.4000  22.4000  23.8000  26.6000  28.0000
## 6 529.2000 568.4000 764.4000 627.2000 627.2000 666.4000 744.8000 784.0000
##       2010     2011     2012     2013     2014     2015     2016     2017
## 1   1.2742   1.0321   1.3726   1.4018   1.4584   1.2424   1.1940   1.0617
## 2   0.2867   0.2322   0.3088   0.3154   0.3281   0.2795   0.2687   0.2389
## 3   1.5609   1.2643   1.6815   1.7173   1.7865   1.5220   1.4627   1.3005
## 4 413.6467 335.0379 445.5958 455.0727 473.4174 403.3181 387.6130 344.6447
## 5  29.1200  29.4000  28.7000  28.7000  30.8000  22.9600  16.6600  15.3233
## 6 815.3600 823.2000 803.6000 803.6000 862.4000 642.8800 466.4800 429.0518
##       2018     2019     2020
## 1   0.8988   1.2176   1.3170
## 2   0.2022   0.2740   0.2963
## 3   1.1011   1.4916   1.6133
## 4 291.7838 395.2689 427.5284
## 5  16.4555  17.8542  20.6577
## 6 460.7529 499.9176 578.4156

split_row <- 31406

emissions_data_by_countries <- emissions_data[1:split_row, ]
emissions_data_by_continents <- emissions_data[(split_row +1):nrow(emissions_data), ]

Here I am creating tidy data table:

emissions_data_long <- emissions_data_by_countries |>
  
  pivot_longer(
    cols = starts_with("2"),
    names_to = "Year",
    values_to = "Emission"
  ) |>
  drop_na()
colnames(emissions_data_long) <- str_to_lower(colnames(emissions_data_long))
emissions_data_long$year <- as.numeric(emissions_data_long$year)
emissions_data_long$emission <- round(emissions_data_long$emission, 2)

unique_areas <- unique(emissions_data_long$area)
#print(unique_areas)
#write.csv(emissions_data_long, "emissions_data_long.csv", row.names = FALSE)
head(emissions_data_long)

## # A tibble: 6 × 6
##   area        item          element    unit        year emission
##   <chr>       <chr>         <chr>      <chr>      <dbl>    <dbl>
## 1 Afghanistan Crop Residues N2O_direct kilotonnes  2000     0.52
## 2 Afghanistan Crop Residues N2O_direct kilotonnes  2001     0.53
## 3 Afghanistan Crop Residues N2O_direct kilotonnes  2002     0.82
## 4 Afghanistan Crop Residues N2O_direct kilotonnes  2003     1   
## 5 Afghanistan Crop Residues N2O_direct kilotonnes  2004     0.82
## 6 Afghanistan Crop Residues N2O_direct kilotonnes  2005     1.18

This dataset is tidy because:

Each variable has its own column. The columns represent individual variables: area, item, element, unit, year, and emission. Each column holds one type of data.
Each observation has its own row. This makes it easier for analysis.

Analysis

Here I do analysis for top 10 countries for volumes of emission they produce:

#total emissions by country
total_emissions_by_country <- emissions_data_long |>
  group_by(area) |>
  summarise(total_emission = sum(emission, na.rm = TRUE)) |>
  arrange(desc(total_emission))

top10_emissions <- total_emissions_by_country |>
  top_n(10, total_emission)

ggplot(top10_emissions, aes(x = reorder(area, total_emission), 
                                       y = total_emission)) +
  geom_bar(stat = "identity", fill = "steelblue")+
  scale_y_continuous(trans = "log10")+
  coord_flip() +
  labs(title = "Top Total Emissions by Country 2000 - 2020",
       x= "Country",
       y = "Total Emissions (Kilotones)")+
  theme_minimal()+
  theme(axis.text.y = element_text(angle = 45, hjust = 1))

In this plot we see top 10 countries with highest amounts of emissions produced in years 2000 - 2020.

Project_2_DATA_607 Emissions dataset tidying

Farhod Ibragimov

2025-03-03

Tidying and cleaning data

Analysis