library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the climate CSV into a tibble. check.names = FALSE stops read.csv()
# from rewriting the file's headers, so the columns are renamed by position.
my_data <- as_tibble(
  read.csv(
    "C:/Users/IU Student/Downloads/climate_data.csv",
    check.names = FALSE
  )
) |>
  rename(
    Year           = 1,
    Country        = 2,
    Temp           = 3,
    CO2            = 4,
    Sea_Level      = 5,
    Rainfall       = 6,
    Population     = 7,
    Renewables     = 8,
    Weather_Events = 9,
    Forest         = 10
  )

# Preview the first rows of the renamed table.
head(my_data)
## # A tibble: 6 × 10
## Year Country Temp CO2 Sea_Level Rainfall Population Renewables
## <int> <chr> <dbl> <dbl> <dbl> <int> <int> <dbl>
## 1 2006 UK 8.9 9.3 3.1 1441 530911230 20.4
## 2 2019 USA 31 4.8 4.2 2407 107364344 49.2
## 3 2014 France 33.9 2.8 2.2 1241 441101758 33.3
## 4 2010 Argentina 5.9 1.8 3.2 1892 1069669579 23.7
## 5 2007 Germany 26.9 5.6 2.4 1743 124079175 12.5
## 6 2020 China 32.3 1.4 2.7 2100 1202028857 49.4
## # ℹ 2 more variables: Weather_Events <int>, Forest <dbl>
The dataset is well balanced. Indonesia and Russia account for the largest shares of the data (approx. 7.5% each), but no single country dominates. This balance is important because it allows me to conclude that the subsequent global averages are not biased by a single superpower’s climate policy.
# Categorical summary: number of rows contributed by each country and that
# count as a percentage of all rows, sorted from most to least represented.
my_data |>
  count(Country, name = "count", sort = TRUE) |>
  mutate(percentage = count / sum(count) * 100)
## # A tibble: 15 × 3
## Country count percentage
## <chr> <int> <dbl>
## 1 Indonesia 75 7.5
## 2 Russia 74 7.4
## 3 South Africa 73 7.3
## 4 USA 73 7.3
## 5 India 70 7
## 6 Argentina 67 6.7
## 7 Brazil 67 6.7
## 8 Canada 67 6.7
## 9 China 67 6.7
## 10 France 66 6.6
## 11 UK 65 6.5
## 12 Japan 63 6.3
## 13 Germany 61 6.1
## 14 Australia 57 5.7
## 15 Mexico 55 5.5
# Numeric summary: stack Temp and CO2 into one long Value column, then report
# centre, range and quartiles for each variable (missing values ignored).
my_data |>
  pivot_longer(
    cols = c(Temp, CO2),
    names_to = "Variable",
    values_to = "Value"
  ) |>
  group_by(Variable) |>
  summarise(
    Mean           = mean(Value, na.rm = TRUE),
    Median         = median(Value, na.rm = TRUE),
    Min            = min(Value, na.rm = TRUE),
    Max            = max(Value, na.rm = TRUE),
    Lower_Quantile = quantile(Value, probs = 0.25, na.rm = TRUE),
    Upper_Quantile = quantile(Value, probs = 0.75, na.rm = TRUE)
  )
## # A tibble: 2 × 7
## Variable Mean Median Min Max Lower_Quantile Upper_Quantile
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CO2 10.4 10.7 0.5 20 5.57 15.4
## 2 Temp 19.9 20.1 5 34.9 12.2 27.2
Have CO2 emissions actually decreased in the modern era (2012–2023) compared to the early 2000s (2000–2011)?
How do CO2 emissions per capita vary across different countries and what does the distribution of these emissions reveal about consistency of environmental impact within each nation?
Do nations with larger populations struggle more to increase their renewable energy percentage?
# Split the rows into two eras at the 2011/2012 boundary and average the
# three indicators within each era (missing values ignored).
my_data |>
  mutate(Era = if_else(Year > 2011, "Modern Era", "Early 2000s")) |>
  group_by(Era) |>
  summarise(
    across(
      c(CO2, Weather_Events, Renewables),
      \(x) mean(x, na.rm = TRUE),
      .names = "Avg_{.col}"
    )
  )
## # A tibble: 2 × 4
## Era Avg_CO2 Avg_Weather_Events Avg_Renewables
## <chr> <dbl> <dbl> <dbl>
## 1 Early 2000s 10.2 7.32 26.8
## 2 Modern Era 10.7 7.26 27.8
Insight Have CO2 emissions actually decreased in the modern era (2012–2023) compared to the early 2000s (2000–2011)?
Surprisingly, average CO2 emissions rose from 10.19 in the early 2000s to 10.65 in the modern era, despite a slight increase in renewable energy adoption (26.8% to 27.8%). I think one conclusion is that current renewable energy growth is failing to keep pace with increasing total energy demand. Policy must shift from simply adding green energy to actively retiring fossil fuel assets.
library(plotly)
library(tidyverse)
# Interactive box plot of Weather_Events per country, with countries ordered
# by their median event count.
# NOTE(review): the question and insight text around this chunk discuss CO2
# emissions per capita, while this chart plots Weather_Events -- confirm which
# variable is intended.

weather_title <- "Variance in Extreme Weather Frequency by Nation"

# Take the year span from the data rather than hard-coding it, so the label
# cannot drift out of sync with the file that was loaded.
weather_years <- range(my_data$Year, na.rm = TRUE)
weather_subtitle <- paste0(
  "Comparing the median and interquartile range of events (",
  weather_years[1], "-", weather_years[2], ")"
)

weather_plot <- my_data |>
  ggplot(aes(
    # na.rm = TRUE is forwarded to median() so a missing value cannot turn a
    # country's ordering score into NA (matches the other summaries here).
    x = reorder(Country, Weather_Events, FUN = median, na.rm = TRUE),
    y = Weather_Events,
    fill = Country
  )) +
  geom_boxplot(alpha = 0.7, outlier.colour = "red") +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d(option = "mako") +
  labs(
    title = weather_title,
    subtitle = weather_subtitle,
    x = "Nation",
    y = "Number of Extreme Weather Events",
    caption = "Source: Climate Dataset Analysis"
  ) +
  # Fill colour duplicates the axis labels, so the legend is hidden.
  theme(legend.position = "none")

# ggplotly() does not carry labs(subtitle) or labs(caption) into the widget.
# The subtitle is therefore appended to the plotly title as a second, smaller
# line; the caption is still only visible in the static ggplot.
ggplotly(weather_plot) |>
  layout(
    title = list(
      text = paste0(weather_title, "<br><sup>", weather_subtitle, "</sup>")
    ),
    margin = list(t = 80), # head-room for the two-line title
    hoverlabel = list(bgcolor = "white"),
    hovermode = "y"
  ) |>
  # NOTE(review): this hover text is one fixed glossary string applied to
  # every trace; it does not contain values computed from the data.
  style(
    text = paste0(
      "Max: High risk peak\n",
      "Q3: Heavy impact zone\n",
      "Median: Typical yearly count\n",
      "Q1: Lower bound baseline\n",
      "Min: Minimum recorded shift"
    )
  )
Insight How do CO2 emissions per capita vary across different countries and what does the distribution of the emissions show about consistency of environmental impact within each nation?
This shows that most major countries produce a similar amount of CO2, indicating a shared global reliance on carbon-heavy industries. While the UK has high but steady emission levels, countries like Germany show much more fluctuation and unpredictability from year to year. No single nation really stands out as a true low-carbon leader yet, regardless of whether it is a wealthy or a developing economy. This suggests that current environmental efforts haven’t yet succeeded in growing economies.
library(plotly)
library(tidyverse)
# Correlation between Renewables and CO2 over rows where both are present
# (cor() default method), shown in the chart subtitle.
cor_val <- cor(my_data$Renewables, my_data$CO2, use = "complete.obs")

scatter_title <- "Interactive Analysis: Renewable Saturation vs. Carbon Output"
scatter_subtitle <- paste("Global Correlation Coefficient: ", round(cor_val, 3))

# Scatter of CO2 against Renewables, one point per row, coloured by country.
# The `text` aesthetic only feeds the plotly tooltip.
p <- my_data |>
  ggplot(aes(
    x = Renewables,
    y = CO2,
    color = Country,
    text = paste(
      "Country:", Country,
      "<br>Year:", Year,
      "<br>Renewables:", Renewables, "%",
      "<br>CO2 Emissions:", CO2, "Tons/Capita"
    )
  )) +
  geom_point(size = 2, alpha = 0.6) +
  # One overall linear fit. inherit.aes = FALSE keeps the plot-level `text`
  # and `color` mappings out of the smoother, which removes the
  # "aesthetics were dropped during statistical transformation: text" warning.
  # An explicit formula silences the "using formula" message.
  geom_smooth(
    aes(x = Renewables, y = CO2),
    inherit.aes = FALSE,
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    color = "black",
    linetype = "dashed"
  ) +
  theme_minimal() +
  labs(
    title = scatter_title,
    subtitle = scatter_subtitle,
    x = "Renewable Energy (%)",
    y = "CO2 Emissions (Tons/Capita)"
  )

# ggplotly() does not carry labs(subtitle) into the widget, so the correlation
# line is appended to the plotly title as a second, smaller line.
ggplotly(p, tooltip = "text") |>
  layout(
    title = list(
      text = paste0(scatter_title, "<br><sup>", scatter_subtitle, "</sup>")
    ),
    margin = list(t = 80) # head-room for the two-line title
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: text.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Insight Do nations with larger populations struggle more to increase their renewable energy percentage?
The flat, slightly upward trend line (black dashed line) across the 1000 data points confirms the earlier aggregate result: increasing the renewable energy percentage hasn’t yet led to a clear, universal reduction in CO2 per capita. The conclusion is that factors like industrial manufacturing volume (not captured in this specific plot) likely exert more pressure on emissions than renewable energy adoption does at its current levels.