Packages Used

Show Packages Used

library(tidyverse)
library(data.table)
library(janitor)
library(scales)
library(patchwork)

Load/Tidy Data

Show code for loading data

Earth’s Average Temperature

The original data represents temperature measured as difference from the average temperature for the baseline period of 1951 -1980; this baseline temperature, according to NASA, is 14 degrees Celsius. I chose to show the yearly average temperature as their actual values in Fahrenheit, so I added 14 to all values and applied the Fahrenheit conversion.

Source: NASA’s Goddard Institute for Space Studies

temp_data <- read_table("earth_temp.txt") |> 
  select(year = Year, avg_temp = No_Smoothing) |> 
  mutate(year = as.numeric(year),
         avg_temp = ((avg_temp + 14) * 9/5) + 32) |> 
  filter(year %in% 1980:2024)

Storm Intensity

The storm data contains wind and pressure telemetry measures for cyclonic storms from several agencies across the globe. I selected only the highest wind and pressure measures for each storm. To do this, I found the max reading from each agency per storm, eliminating the time-stamp granularity. Then, for each storm I kept only the agency measurement that was highest.

Source: National Oceanic and Atmospheric Administration (NOAA)

data <- read_csv("storm_data.csv", na = c("", "NaN", "null", "NULL")) |> 
  clean_names() |>  
  select(sid, season, basin, name, contains("wind"), contains("pres")) |> 
  mutate(across(where(~ is.character(.x) && any(grepl("^[+-]?[0-9.]+$", .x))), ~ as.numeric(.x))) |>  
  filter(season %in% 1980:2024) |> 
  filter(!if_all(contains("wind"), is.na),
         !if_all(contains("pres"), is.na))

DT <- as.data.table(data)

# Some storms were in >1 basin so we are selecting the basin where they spent the most time
basin_mode <- DT[
  , .N, by = .(sid, basin)
][
  order(sid, -N)
][
  , .SD[1], by = sid
][
  , .(sid, basin)
]

key_cols <- "sid"
value_cols <- grep("wind|pres", names(DT), value = TRUE)

# Selecting the max wind and pressure values from each agency for each storm
df <- DT[, lapply(.SD, max, na.rm = TRUE),
         by = key_cols,
         .SDcols = value_cols]

df2 <- merge(
  basin_mode,
  df,
  by = "sid",
  all.x = TRUE
)

meta <- unique(DT[, .(sid, season, name)])
df3 <- merge(meta, df2, by = "sid", all.x = TRUE)

wind_cols <- grep("wind", names(DT), value = TRUE)
pres_cols <- grep("pres", names(DT), value = TRUE)


# Selecting the max wind and pressure among all agency measures
storm_extremes <- DT[
  ,
  {
    list(
      max_wind = max(as.matrix(.SD[, ..wind_cols]), na.rm = TRUE),
      max_pres = max(as.matrix(.SD[, ..pres_cols]), na.rm = TRUE)
    )
  },
  by = sid
]

meta <- df3[, .(sid, season, name, basin)]
storm_data <- merge(meta, storm_extremes, by = "sid", all.x = TRUE)

basin_lookup <- tibble(
  basin_abbv = c("NA", "EP", "WP", "NI", "SI", "SP", "SA"),
  basin_full = c(
    "North Atlantic",
    "Eastern North Pacific",
    "Western North Pacific",
    "North Indian",
    "South Indian",
    "Southern Pacific",
    "South Atlantic"
  )
)

storm_data <- as.tibble(storm_data) |> 
  rename(year = season) |> 
  left_join(basin_lookup, by = join_by(basin == basin_abbv)) |> 
  mutate(basin = basin_full,
         max_pres = as.numeric(max_pres)) |> 
  select(-basin_full)

Combine Data

storms <- storm_data |> 
  left_join(temp_data, by = "year")

print(storms, n=5)

## # A tibble: 4,260 × 7
##   sid            year name            basin           max_wind max_pres avg_temp
##   <chr>         <dbl> <chr>           <chr>              <dbl>    <dbl>    <dbl>
## 1 1979238S08085  1980 TONY            South Indian          45     1005     57.7
## 2 1979343S08093  1980 CLAUDETTE:VIOLA South Indian         115     1006     57.7
## 3 1979344S12170  1980 OFA             Southern Pacif…       55      997     57.7
## 4 1979357S10095  1980 DANITZA:WILF    South Indian          70     1005     57.7
## 5 1980001S13173  1980 PENI            Southern Pacif…       65      997     57.7
## # ℹ 4,255 more rows

Visualizations

yearly <- storms |> 
  group_by(year) |> 
  summarise(
    avg_temp = mean(avg_temp, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(yearly, aes(x = year, y = avg_temp)) +
  geom_line(linewidth = 1) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "tomato") +
  scale_x_continuous(breaks = breaks_pretty(5)) +
  labs(
    title = "Average Global Temperature Has Risen Over Time (1980–2024)",
    subtitle = "Over the past 4 decades, the average temperature of the entire world has increased
rapidly and consistently. The overall change amounts to 2 degrees.",
    x = NULL, y = "Degrees Fahrenheit"
  ) +
  theme_minimal(base_size = 12)

storms_cat <- storms |> 
  mutate(
    # Convert knots to mph
    max_wind_mph = max_wind * 1.15078,

    # Categorize storms by wind speed
    category_1_to_5 = case_when(
      max_wind_mph >= 74  & max_wind_mph <= 95  ~ "Category 1",
      max_wind_mph >= 96  & max_wind_mph <= 110 ~ "Category 2",
      max_wind_mph >= 111 & max_wind_mph <= 129 ~ "Category 3 (Major)",
      max_wind_mph >= 130 & max_wind_mph <= 156 ~ "Category 4 (Major)",
      max_wind_mph >= 157                      ~ "Category 5 (Major)",
    )
  ) |> 
  mutate(category = case_when(category_1_to_5 == "Category 1" ~ "Minor storms (Cat 1–2)",
                              category_1_to_5 == "Category 2" ~ "Minor storms (Cat 1–2)",
                              .default = "Major storms (Cat 3–5)"))


storms_by_year_cat <- storms_cat |> 
  group_by(year, category) |> 
  summarise(
    storm_count = n(),
    .groups = "drop"
  )


ggplot(storms_by_year_cat,
       aes(x = year, y = storm_count, color = category)) +
  geom_line() +
  geom_smooth(data = storms_by_year_cat |> filter(str_detect(category, "Major")),
              aes(x = year, y = storm_count),
              method = "lm", se = FALSE, linetype = "dashed") +
  scale_color_manual(
    values = c(
      "Major storms (Cat 3–5)" = "tomato",  # red/orange (stands out)
      "Minor storms (Cat 1–2)" = NULL    # neutral grey
    )
  ) +
  scale_x_continuous(breaks = pretty_breaks(6)) +
  labs(
    title = "Number of Major Storms Has Also Increased",
    subtitle = "Storms are categorized by maximum wind speed (mph), and any storm with
wind speeds in categories 3-5 (111+ mph) are considered MAJOR storms. In
the same period of time that temperature has been rapidly increasing, major
storms have also noticeably increased.",
    x = NULL,
    y = NULL,
    color = ""
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "top"
  )

Example of damage from Category 5 Storm (Source: wfla.com)

max_wind_by_year <- storms |> 
  mutate(
    max_wind_mph = max_wind * 1.15078  # knots to mph
  )  |> 
  group_by(year) |> 
  summarise(
    yearly_max_wind = max(max_wind_mph, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(max_wind_by_year, aes(x = year, y = yearly_max_wind)) +
  geom_line(linewidth = .5) +
  geom_point(size = 1) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "tomato") +
  scale_x_continuous(breaks = pretty_breaks(8)) +
  labs(
    title = "Major Storms are Getting Stronger",
    subtitle = "Each point shows the top speed of the single strongest storm recorded in
that year. In addition to more major storms, we have also seen these storms
getting stronger.",
    x = NULL,
    y = "Wind Speed (mph)"
  ) +
  theme_minimal(base_size = 14)

storms_with_type <- storms |> 
  mutate(
    max_wind_mph = max_wind * 1.15078,
    storm_type = case_when(
      max_wind_mph >= 111 ~ "Major storms (Cat 3–5)",
      max_wind_mph >= 74  ~ "Minor storms (Cat 1–2)",
    )
  ) |> 
  filter(!is.na(storm_type))


storms_with_type <- storms_with_type |> 
  group_by(year)  |> 
  mutate(avg_temp_year = mean(avg_temp, na.rm = TRUE)) |> 
  ungroup()  |> 
  mutate(
    temp_group = ntile(avg_temp_year, 3),
    temp_group = case_when(
      temp_group == 1 ~ "Cooler years",
      temp_group == 2 ~ "Middle years",
      temp_group == 3 ~ "Warmer years"
    )
  )


storms_grouped <- storms_with_type |> 
  filter(temp_group != "Middle years") |> 
  group_by(year, temp_group, storm_type)  |> 
  summarise(storms_n = n(), .groups = "drop") |> 
  group_by(temp_group, storm_type) |> 
  summarise(
    avg_storms = mean(storms_n),
    .groups = "drop"
  )


ggplot(storms_grouped,
       aes(x = temp_group, y = avg_storms, fill = storm_type)) +
  geom_col() +
  scale_fill_manual(
    values = c(
      "Major storms (Cat 3–5)" = "tomato",  # red/orange (stands out)
      "Minor storms (Cat 1–2)" = NULL    # neutral grey
    )
  ) +
  labs(
    title = "Global Warming is Causing an Increase in Major Storms",
    subtitle = "Warmer temperatures are directly linked to more frequent major storms,
which explains the increase in major storms over the past 4 decades.",
    x = "",
    y = "Average Storms per Year",
    fill = ""
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "top"
  )