library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(readr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(maps)
## Warning: package 'maps' was built under R version 4.4.3
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
Why do I think this dataset is not tidy:
Multiple variables in separate columns. Each row should represent a single observation (e.g., emissions for a specific region, item, and year), but this is not the case here.
Years as columns.
This code loads dataset, also it cleans and transform data:
emissions_data <- read.csv("https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/refs/heads/main/DATA607/PROJECT2/Total%20Emissions%20Per%20Country%20(2000-2020)%20(1).csv")
head(emissions_data)
## Area Item Element Unit
## 1 Afghanistan Crop Residues Direct emissions (N2O) kilotonnes
## 2 Afghanistan Crop Residues Indirect emissions (N2O) kilotonnes
## 3 Afghanistan Crop Residues Emissions (N2O) kilotonnes
## 4 Afghanistan Crop Residues Emissions (CO2eq) from N2O (AR5) kilotonnes
## 5 Afghanistan Crop Residues Emissions (CO2eq) (AR5) kilotonnes
## 6 Afghanistan Rice Cultivation Emissions (CH4) kilotonnes
## X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007
## 1 0.520 0.5267 0.8200 0.9988 0.8225 1.1821 1.0277 1.2426
## 2 0.117 0.1185 0.1845 0.2247 0.1851 0.2660 0.2312 0.2796
## 3 0.637 0.6452 1.0045 1.2235 1.0075 1.4481 1.2589 1.5222
## 4 168.807 170.9884 266.1975 324.2195 266.9995 383.7498 333.6093 403.3749
## 5 168.807 170.9884 266.1975 324.2195 266.9995 383.7498 333.6093 403.3749
## 6 18.200 16.9400 18.9000 20.3000 27.3000 22.4000 22.4000 23.8000
## X2008 X2009 X2010 X2011 X2012 X2013 X2014 X2015
## 1 0.8869 1.3920 1.2742 1.0321 1.3726 1.4018 1.4584 1.2424
## 2 0.1996 0.3132 0.2867 0.2322 0.3088 0.3154 0.3281 0.2795
## 3 1.0865 1.7051 1.5609 1.2643 1.6815 1.7173 1.7865 1.5220
## 4 287.9099 451.8647 413.6467 335.0379 445.5958 455.0727 473.4174 403.3181
## 5 287.9099 451.8647 413.6467 335.0379 445.5958 455.0727 473.4174 403.3181
## 6 26.6000 28.0000 29.1200 29.4000 28.7000 28.7000 30.8000 22.9600
## X2016 X2017 X2018 X2019 X2020
## 1 1.1940 1.0617 0.8988 1.2176 1.3170
## 2 0.2687 0.2389 0.2022 0.2740 0.2963
## 3 1.4627 1.3005 1.1011 1.4916 1.6133
## 4 387.6130 344.6447 291.7838 395.2689 427.5284
## 5 387.6130 344.6447 291.7838 395.2689 427.5284
## 6 16.6600 15.3233 16.4555 17.8542 20.6577
year_cols <- grep("^X2", colnames(emissions_data), value = TRUE)
emissions_data <- emissions_data |>
distinct(across(all_of(starts_with("X2"))), .keep_all = TRUE)
format_element <- function(x) {
letters_match <- str_extract(x, "(?i)(CO2eq|N2O|CH4|CO2)")
clean_txt <- str_remove(x, "(?i)(CO2eq|N2O|CH4|CO2)\\s*") |>
str_remove_all("(?i)\\bemissions\\b")
clean_txt <- str_to_lower(clean_txt) |>
str_replace_all("\\s+", "_") |>
str_replace_all("[^a-z0-9_]", "")|>
str_replace_all("_$", "")
result <- if(!is.na(letters_match) &letters_match !="") {
str_c(letters_match, clean_txt, sep = "_")
} else {
clean_txt
}
result <- str_replace_all(result, "(?i)\\bn2o\\b", "N2O")
result <- str_replace_all(result, "(?i)\\bch4\\b", "CH4")
result <- str_replace_all(result, "(?i)\\bar5\\b", "AR5")
result <- str_replace_all(result, "(?i)\\bco2\\b", "CO2")
result <- str_replace_all(result, "_$", "")
return(result)
}
colnames(emissions_data) <- str_remove_all(colnames(emissions_data), "(?i)^X")
emissions_data <- emissions_data |>
mutate(Element = map_chr(Element, format_element)) |>
drop_na()
head(emissions_data)
## Area Item Element Unit 2000 2001
## 1 Afghanistan Crop Residues N2O_direct kilotonnes 0.520 0.5267
## 2 Afghanistan Crop Residues N2O_indirect kilotonnes 0.117 0.1185
## 3 Afghanistan Crop Residues N2O kilotonnes 0.637 0.6452
## 4 Afghanistan Crop Residues CO2eq___from_n2o_ar5 kilotonnes 168.807 170.9884
## 5 Afghanistan Rice Cultivation CH4 kilotonnes 18.200 16.9400
## 6 Afghanistan Rice Cultivation CO2eq___from_ch4_ar5 kilotonnes 509.600 474.3200
## 2002 2003 2004 2005 2006 2007 2008 2009
## 1 0.8200 0.9988 0.8225 1.1821 1.0277 1.2426 0.8869 1.3920
## 2 0.1845 0.2247 0.1851 0.2660 0.2312 0.2796 0.1996 0.3132
## 3 1.0045 1.2235 1.0075 1.4481 1.2589 1.5222 1.0865 1.7051
## 4 266.1975 324.2195 266.9995 383.7498 333.6093 403.3749 287.9099 451.8647
## 5 18.9000 20.3000 27.3000 22.4000 22.4000 23.8000 26.6000 28.0000
## 6 529.2000 568.4000 764.4000 627.2000 627.2000 666.4000 744.8000 784.0000
## 2010 2011 2012 2013 2014 2015 2016 2017
## 1 1.2742 1.0321 1.3726 1.4018 1.4584 1.2424 1.1940 1.0617
## 2 0.2867 0.2322 0.3088 0.3154 0.3281 0.2795 0.2687 0.2389
## 3 1.5609 1.2643 1.6815 1.7173 1.7865 1.5220 1.4627 1.3005
## 4 413.6467 335.0379 445.5958 455.0727 473.4174 403.3181 387.6130 344.6447
## 5 29.1200 29.4000 28.7000 28.7000 30.8000 22.9600 16.6600 15.3233
## 6 815.3600 823.2000 803.6000 803.6000 862.4000 642.8800 466.4800 429.0518
## 2018 2019 2020
## 1 0.8988 1.2176 1.3170
## 2 0.2022 0.2740 0.2963
## 3 1.1011 1.4916 1.6133
## 4 291.7838 395.2689 427.5284
## 5 16.4555 17.8542 20.6577
## 6 460.7529 499.9176 578.4156
split_row <- 31406
emissions_data_by_countries <- emissions_data[1:split_row, ]
emissions_data_by_continents <- emissions_data[(split_row +1):nrow(emissions_data), ]
Here I am creating tidy data table:
emissions_data_long <- emissions_data_by_countries |>
pivot_longer(
cols = starts_with("2"),
names_to = "Year",
values_to = "Emission"
) |>
drop_na()
colnames(emissions_data_long) <- str_to_lower(colnames(emissions_data_long))
emissions_data_long$year <- as.numeric(emissions_data_long$year)
emissions_data_long$emission <- round(emissions_data_long$emission, 2)
unique_areas <- unique(emissions_data_long$area)
#print(unique_areas)
#write.csv(emissions_data_long, "emissions_data_long.csv", row.names = FALSE)
head(emissions_data_long)
## # A tibble: 6 × 6
## area item element unit year emission
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Afghanistan Crop Residues N2O_direct kilotonnes 2000 0.52
## 2 Afghanistan Crop Residues N2O_direct kilotonnes 2001 0.53
## 3 Afghanistan Crop Residues N2O_direct kilotonnes 2002 0.82
## 4 Afghanistan Crop Residues N2O_direct kilotonnes 2003 1
## 5 Afghanistan Crop Residues N2O_direct kilotonnes 2004 0.82
## 6 Afghanistan Crop Residues N2O_direct kilotonnes 2005 1.18
This dataset is tidy because:
Each variable has its own column. The columns represent
individual variables: area, item,
element, unit, year, and
emission. Each column holds one type of data.
Each observation has its own row. This makes it easier for analysis.
Here I do analysis for top 10 countries for volumes of emission they produce:
#total emissions by country
total_emissions_by_country <- emissions_data_long |>
group_by(area) |>
summarise(total_emission = sum(emission, na.rm = TRUE)) |>
arrange(desc(total_emission))
top10_emissions <- total_emissions_by_country |>
top_n(10, total_emission)
ggplot(top10_emissions, aes(x = reorder(area, total_emission),
y = total_emission)) +
geom_bar(stat = "identity", fill = "steelblue")+
scale_y_continuous(trans = "log10")+
coord_flip() +
labs(title = "Top Total Emissions by Country 2000 - 2020",
x= "Country",
y = "Total Emissions (Kilotones)")+
theme_minimal()+
theme(axis.text.y = element_text(angle = 45, hjust = 1))
In this plot we see top 10 countries with highest amounts of emissions produced in years 2000 - 2020.