library(tidyverse)
library(data.table)
library(janitor)
library(scales)
library(patchwork)
The original data represents temperature measured as difference from the average temperature for the baseline period of 1951 -1980; this baseline temperature, according to NASA, is 14 degrees Celsius. I chose to show the yearly average temperature as their actual values in Fahrenheit, so I added 14 to all values and applied the Fahrenheit conversion.
Source: NASA’s Goddard Institute for Space Studies
temp_data <- read_table("earth_temp.txt") |>
select(year = Year, avg_temp = No_Smoothing) |>
mutate(year = as.numeric(year),
avg_temp = ((avg_temp + 14) * 9/5) + 32) |>
filter(year %in% 1980:2024)
The storm data contains wind and pressure telemetry measures for cyclonic storms from several agencies across the globe. I selected only the highest wind and pressure measures for each storm. To do this, I found the max reading from each agency per storm, eliminating the time-stamp granularity. Then, for each storm I kept only the agency measurement that was highest.
Source: National Oceanic and Atmospheric Administration (NOAA)
data <- read_csv("storm_data.csv", na = c("", "NaN", "null", "NULL")) |>
clean_names() |>
select(sid, season, basin, name, contains("wind"), contains("pres")) |>
mutate(across(where(~ is.character(.x) && any(grepl("^[+-]?[0-9.]+$", .x))), ~ as.numeric(.x))) |>
filter(season %in% 1980:2024) |>
filter(!if_all(contains("wind"), is.na),
!if_all(contains("pres"), is.na))
DT <- as.data.table(data)
# Some storms were in >1 basin so we are selecting the basin where they spent the most time
basin_mode <- DT[
, .N, by = .(sid, basin)
][
order(sid, -N)
][
, .SD[1], by = sid
][
, .(sid, basin)
]
key_cols <- "sid"
value_cols <- grep("wind|pres", names(DT), value = TRUE)
# Selecting the max wind and pressure values from each agency for each storm
df <- DT[, lapply(.SD, max, na.rm = TRUE),
by = key_cols,
.SDcols = value_cols]
df2 <- merge(
basin_mode,
df,
by = "sid",
all.x = TRUE
)
meta <- unique(DT[, .(sid, season, name)])
df3 <- merge(meta, df2, by = "sid", all.x = TRUE)
wind_cols <- grep("wind", names(DT), value = TRUE)
pres_cols <- grep("pres", names(DT), value = TRUE)
# Selecting the max wind and pressure among all agency measures
storm_extremes <- DT[
,
{
list(
max_wind = max(as.matrix(.SD[, ..wind_cols]), na.rm = TRUE),
max_pres = max(as.matrix(.SD[, ..pres_cols]), na.rm = TRUE)
)
},
by = sid
]
meta <- df3[, .(sid, season, name, basin)]
storm_data <- merge(meta, storm_extremes, by = "sid", all.x = TRUE)
basin_lookup <- tibble(
basin_abbv = c("NA", "EP", "WP", "NI", "SI", "SP", "SA"),
basin_full = c(
"North Atlantic",
"Eastern North Pacific",
"Western North Pacific",
"North Indian",
"South Indian",
"Southern Pacific",
"South Atlantic"
)
)
storm_data <- as.tibble(storm_data) |>
rename(year = season) |>
left_join(basin_lookup, by = join_by(basin == basin_abbv)) |>
mutate(basin = basin_full,
max_pres = as.numeric(max_pres)) |>
select(-basin_full)
storms <- storm_data |>
left_join(temp_data, by = "year")
print(storms, n=5)
## # A tibble: 4,260 Ă— 7
## sid year name basin max_wind max_pres avg_temp
## <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1979238S08085 1980 TONY South Indian 45 1005 57.7
## 2 1979343S08093 1980 CLAUDETTE:VIOLA South Indian 115 1006 57.7
## 3 1979344S12170 1980 OFA Southern Pacif… 55 997 57.7
## 4 1979357S10095 1980 DANITZA:WILF South Indian 70 1005 57.7
## 5 1980001S13173 1980 PENI Southern Pacif… 65 997 57.7
## # ℹ 4,255 more rows
yearly <- storms |>
group_by(year) |>
summarise(
avg_temp = mean(avg_temp, na.rm = TRUE),
.groups = "drop"
)
ggplot(yearly, aes(x = year, y = avg_temp)) +
geom_line(linewidth = 1) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "tomato") +
scale_x_continuous(breaks = breaks_pretty(5)) +
labs(
title = "Average Global Temperature Has Risen Over Time (1980–2024)",
subtitle = "Over the past 4 decades, the average temperature of the entire world has increased
rapidly and consistently. The overall change amounts to 2 degrees.",
x = NULL, y = "Degrees Fahrenheit"
) +
theme_minimal(base_size = 12)
storms_cat <- storms |>
mutate(
# Convert knots to mph
max_wind_mph = max_wind * 1.15078,
# Categorize storms by wind speed
category_1_to_5 = case_when(
max_wind_mph >= 74 & max_wind_mph <= 95 ~ "Category 1",
max_wind_mph >= 96 & max_wind_mph <= 110 ~ "Category 2",
max_wind_mph >= 111 & max_wind_mph <= 129 ~ "Category 3 (Major)",
max_wind_mph >= 130 & max_wind_mph <= 156 ~ "Category 4 (Major)",
max_wind_mph >= 157 ~ "Category 5 (Major)",
)
) |>
mutate(category = case_when(category_1_to_5 == "Category 1" ~ "Minor storms (Cat 1–2)",
category_1_to_5 == "Category 2" ~ "Minor storms (Cat 1–2)",
.default = "Major storms (Cat 3–5)"))
storms_by_year_cat <- storms_cat |>
group_by(year, category) |>
summarise(
storm_count = n(),
.groups = "drop"
)
ggplot(storms_by_year_cat,
aes(x = year, y = storm_count, color = category)) +
geom_line() +
geom_smooth(data = storms_by_year_cat |> filter(str_detect(category, "Major")),
aes(x = year, y = storm_count),
method = "lm", se = FALSE, linetype = "dashed") +
scale_color_manual(
values = c(
"Major storms (Cat 3–5)" = "tomato", # red/orange (stands out)
"Minor storms (Cat 1–2)" = NULL # neutral grey
)
) +
scale_x_continuous(breaks = pretty_breaks(6)) +
labs(
title = "Number of Major Storms Has Also Increased",
subtitle = "Storms are categorized by maximum wind speed (mph), and any storm with
wind speeds in categories 3-5 (111+ mph) are considered MAJOR storms. In
the same period of time that temperature has been rapidly increasing, major
storms have also noticeably increased.",
x = NULL,
y = NULL,
color = ""
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "top"
)
max_wind_by_year <- storms |>
mutate(
max_wind_mph = max_wind * 1.15078 # knots to mph
) |>
group_by(year) |>
summarise(
yearly_max_wind = max(max_wind_mph, na.rm = TRUE),
.groups = "drop"
)
ggplot(max_wind_by_year, aes(x = year, y = yearly_max_wind)) +
geom_line(linewidth = .5) +
geom_point(size = 1) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "tomato") +
scale_x_continuous(breaks = pretty_breaks(8)) +
labs(
title = "Major Storms are Getting Stronger",
subtitle = "Each point shows the top speed of the single strongest storm recorded in
that year. In addition to more major storms, we have also seen these storms
getting stronger.",
x = NULL,
y = "Wind Speed (mph)"
) +
theme_minimal(base_size = 14)
storms_with_type <- storms |>
mutate(
max_wind_mph = max_wind * 1.15078,
storm_type = case_when(
max_wind_mph >= 111 ~ "Major storms (Cat 3–5)",
max_wind_mph >= 74 ~ "Minor storms (Cat 1–2)",
)
) |>
filter(!is.na(storm_type))
storms_with_type <- storms_with_type |>
group_by(year) |>
mutate(avg_temp_year = mean(avg_temp, na.rm = TRUE)) |>
ungroup() |>
mutate(
temp_group = ntile(avg_temp_year, 3),
temp_group = case_when(
temp_group == 1 ~ "Cooler years",
temp_group == 2 ~ "Middle years",
temp_group == 3 ~ "Warmer years"
)
)
storms_grouped <- storms_with_type |>
filter(temp_group != "Middle years") |>
group_by(year, temp_group, storm_type) |>
summarise(storms_n = n(), .groups = "drop") |>
group_by(temp_group, storm_type) |>
summarise(
avg_storms = mean(storms_n),
.groups = "drop"
)
ggplot(storms_grouped,
aes(x = temp_group, y = avg_storms, fill = storm_type)) +
geom_col() +
scale_fill_manual(
values = c(
"Major storms (Cat 3–5)" = "tomato", # red/orange (stands out)
"Minor storms (Cat 1–2)" = NULL # neutral grey
)
) +
labs(
title = "Global Warming is Causing an Increase in Major Storms",
subtitle = "Warmer temperatures are directly linked to more frequent major storms,
which explains the increase in major storms over the past 4 decades.",
x = "",
y = "Average Storms per Year",
fill = ""
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "top"
)