library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the climate CSV and give its first ten columns short, syntactic names.
# `check.names = FALSE` leaves the raw headers untouched on read; the columns
# are then renamed by position rather than by their original header text.
my_data <- "C:/Users/IU Student/Downloads/climate_data.csv" |>
  read.csv(check.names = FALSE) |>
  as_tibble() |>
  rename(
    Year = 1,
    Country = 2,
    Temp = 3,
    CO2 = 4,
    Sea_Level = 5,
    Rainfall = 6,
    Population = 7,
    Renewables = 8,
    Weather_Events = 9,
    Forest = 10
  )

# Preview the first rows to confirm the import and the new column names.
head(my_data)

## # A tibble: 6 × 10
##    Year Country    Temp   CO2 Sea_Level Rainfall Population Renewables
##   <int> <chr>     <dbl> <dbl>     <dbl>    <int>      <int>      <dbl>
## 1  2006 UK          8.9   9.3       3.1     1441  530911230       20.4
## 2  2019 USA        31     4.8       4.2     2407  107364344       49.2
## 3  2014 France     33.9   2.8       2.2     1241  441101758       33.3
## 4  2010 Argentina   5.9   1.8       3.2     1892 1069669579       23.7
## 5  2007 Germany    26.9   5.6       2.4     1743  124079175       12.5
## 6  2020 China      32.3   1.4       2.7     2100 1202028857       49.4
## # ℹ 2 more variables: Weather_Events <int>, Forest <dbl>

The dataset is well balanced across countries. Indonesia and Russia represent the largest portions of the data (approx. 7.5% each), but no single country dominates. This balance is important because it allows me to conclude that the global averages computed below are not biased by a single superpower’s climate policy.

# Categorical summary: number of rows per country and each country's share of
# all rows (in percent), listed from the most to the least frequent country.
my_data |>
  count(Country, name = "count") |>
  mutate(percentage = count / sum(count) * 100) |>
  arrange(desc(count))

## # A tibble: 15 × 3
##    Country      count percentage
##    <chr>        <int>      <dbl>
##  1 Indonesia       75        7.5
##  2 Russia          74        7.4
##  3 South Africa    73        7.3
##  4 USA             73        7.3
##  5 India           70        7  
##  6 Argentina       67        6.7
##  7 Brazil          67        6.7
##  8 Canada          67        6.7
##  9 China           67        6.7
## 10 France          66        6.6
## 11 UK              65        6.5
## 12 Japan           63        6.3
## 13 Germany         61        6.1
## 14 Australia       57        5.7
## 15 Mexico          55        5.5

# Numeric summary: centre, range and quartiles of Temp and CO2.
# The two columns are stacked into long form so that a single grouped
# summarise() yields one row of statistics per variable.
my_data |>
  pivot_longer(
    cols = c(Temp, CO2),
    names_to = "Variable",
    values_to = "Value"
  ) |>
  group_by(Variable) |>
  summarise(
    Mean = mean(Value, na.rm = TRUE),
    Median = median(Value, na.rm = TRUE),
    Min = min(Value, na.rm = TRUE),
    Max = max(Value, na.rm = TRUE),
    Lower_Quantile = quantile(Value, probs = 0.25, na.rm = TRUE),
    Upper_Quantile = quantile(Value, probs = 0.75, na.rm = TRUE)
  )

## # A tibble: 2 × 7
##   Variable  Mean Median   Min   Max Lower_Quantile Upper_Quantile
##   <chr>    <dbl>  <dbl> <dbl> <dbl>          <dbl>          <dbl>
## 1 CO2       10.4   10.7   0.5  20             5.57           15.4
## 2 Temp      19.9   20.1   5    34.9          12.2            27.2

RESEARCH QUESTIONS

Have CO2 emissions actually decreased in the modern era (2012–2023) compared to the early 2000s (2000–2011)?
How do CO2 emissions per capita vary across different countries, and what does the distribution of these emissions reveal about the consistency of environmental impact within each nation?
Do nations with larger populations struggle more to increase their renewable energy percentage?

# Split the observations into two eras at the 2011/2012 boundary and compare
# the average CO2, extreme-weather count and renewable share of each era.
my_data |>
  mutate(Era = if_else(Year > 2011, "Modern Era", "Early 2000s")) |>
  group_by(Era) |>
  summarise(
    across(
      c(CO2, Weather_Events, Renewables),
      \(x) mean(x, na.rm = TRUE),
      .names = "Avg_{.col}"
    )
  )

## # A tibble: 2 × 4
##   Era         Avg_CO2 Avg_Weather_Events Avg_Renewables
##   <chr>         <dbl>              <dbl>          <dbl>
## 1 Early 2000s    10.2               7.32           26.8
## 2 Modern Era     10.7               7.26           27.8

Insight Have CO2 emissions actually decreased in the modern era (2012–2023) compared to the early 2000s (2000–2011)?

Surprisingly, average CO2 emissions rose from 10.19 in the early 2000s to 10.65 in the modern era, despite a slight increase in renewable energy adoption (26.8% to 27.8%). One plausible conclusion is that current renewable energy growth is failing to keep pace with increasing total energy demand. Policy must shift from simply adding green energy to actively retiring fossil fuel assets.

library(plotly)
library(tidyverse)

# Interactive box plot of extreme-weather event counts per country, with the
# countries ordered by their median count.
#
# ggplotly() does not carry a ggplot2 subtitle over to the widget, so the
# title and subtitle are kept in variables and re-applied through layout().
# NOTE(review): the ggplot2 caption is likely not rendered by ggplotly()
# either — confirm, and add it as a layout annotation if it must be visible.
# NOTE(review): the insight text after this chunk discusses CO2 emissions,
# while this plot shows Weather_Events — confirm the intended variable.
# NOTE(review): the subtitle says 2000-2022 but the research questions refer
# to 2000-2023 — confirm the actual year range of the data.
weather_title <- "Variance in Extreme Weather Frequency by Nation"
weather_subtitle <-
  "Comparing the median and interquartile range of events (2000-2022)"

(my_data %>%
  ggplot(aes(x = reorder(Country, Weather_Events, FUN = median),
             y = Weather_Events,
             fill = Country)) +
  geom_boxplot(alpha = 0.7, outlier.colour = "red") +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d(option = "mako") +
  labs(
    title = weather_title,
    subtitle = weather_subtitle,
    x = "Nation",
    y = "Number of Extreme Weather Events",
    caption = "Source: Climate Dataset Analysis"
  ) +
  # The fill colour duplicates the axis labels, so the legend is hidden.
  theme(legend.position = "none")) %>%
  ggplotly() %>%
  layout(
    # Plotly titles accept a small HTML subset; <sup> renders the subtitle
    # as a smaller second line under the main title.
    title = list(
      text = paste0(weather_title, "<br><sup>", weather_subtitle, "</sup>")
    ),
    hoverlabel = list(bgcolor = "white"),
    hovermode = "y"
  ) %>%
  # NOTE(review): this sets the same static text on every trace; it describes
  # the box statistics generically rather than showing per-country values —
  # confirm it renders as intended in the hover label.
  style(
    text = paste0(
      "Max: High risk peak\n",
      "Q3: Heavy impact zone\n",
      "Median: Typical yearly count\n",
      "Q1: Lower bound baseline\n",
      "Min: Minimum recorded shift"
    )
  )

Insight How do CO2 emissions per capita vary across different countries and what does the distribution of the emissions show about consistency of environmental impact within each nation?

This shows that most major countries produce a similar amount of CO2, pointing to a shared global reliance on carbon-heavy industries. While the UK has high but steady emission levels, countries like Germany show much more fluctuation and unpredictability from year to year. No single nation really stands out as a true low-carbon leader yet, regardless of whether it is a wealthy or a developing economy. This suggests that current environmental efforts haven’t yet succeeded in growing economies.

library(plotly)
library(tidyverse)

# Interactive scatter plot of renewable share against CO2 per capita, with a
# single pooled linear trend line and per-point hover text.

# Pearson correlation across all rows that have both values present.
cor_val <- cor(my_data$Renewables, my_data$CO2, use = "complete.obs")

# Title and subtitle are stored once so the same text can be used for the
# static ggplot and re-applied to the plotly widget below.
scatter_title <- "Interactive Analysis: Renewable Saturation vs. Carbon Output"
scatter_subtitle <- paste0(
  "Global Correlation Coefficient: ", round(cor_val, 3)
)

p <- my_data |>
  ggplot(aes(x = Renewables, y = CO2, color = Country,
             text = paste("Country:", Country,
                          "<br>Year:", Year,
                          "<br>Renewables:", Renewables, "%",
                          "<br>CO2 Emissions:", CO2, "Tons/Capita"))) +
  geom_point(size = 2, alpha = 0.6) +
  # `group = 1` fits one line through all countries. `text = NULL` removes the
  # inherited hover-text mapping from this layer; otherwise the smoothing stat
  # drops it and emits an "aesthetics were dropped" warning.
  geom_smooth(
    aes(group = 1, text = NULL),
    method = "lm",
    se = FALSE,
    color = "black",
    linetype = "dashed"
  ) +
  theme_minimal() +
  labs(
    title = scatter_title,
    subtitle = scatter_subtitle,
    x = "Renewable Energy (%)",
    y = "CO2 Emissions (Tons/Capita)"
  )

# ggplotly() does not render the ggplot2 subtitle, so the correlation
# coefficient is folded into the plotly title as a smaller second line.
ggplotly(p, tooltip = "text") |>
  layout(
    title = list(
      text = paste0(scatter_title, "<br><sup>", scatter_subtitle, "</sup>")
    )
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation: text.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Insight Do nations with larger populations struggle more to increase their renewable energy percentage?

The nearly flat, slightly upward trend line (black dashed line) across the 1,000 data points is consistent with the earlier aggregated result: increasing the renewable energy percentage hasn’t yet led to a clear, universal reduction in CO2 per capita. The conclusion is that factors like industrial manufacturing volume (not captured in this specific plot) likely exert more pressure on emissions than renewable energy adoption does at its current levels.

Week 2 Data Dive

Tiyasha Banerjee

2026-01-27

RESEARCH QUESTIONS