# Load ALL packages required for the analysis
library(tidyverse)  # Includes dplyr, ggplot2, tidyr, readr
library(data.table)
library(cowplot)
library(gridExtra)
library(knitr)
library(lubridate)
library(reshape2)
library(rmarkdown)
library(scales)

Synopsis

This report explores the NOAA Storm Events Database to identify which weather events have the greatest impact on population health and economic loss across the United States. Using data collected by the National Weather Service from 1950 through 2011, the analysis aggregates and visualizes storm types to compare their relative effects. The results may help emergency planners and agencies prepare for severe weather and prioritize resources more effectively.

Data Source and Documentation

The data come from the NOAA Storm Events Database, which is maintained by the National Climatic Data Center (NCDC), part of the National Oceanic and Atmospheric Administration (NOAA). The database contains U.S. storm and weather event records collected by the National Weather Service (NWS) from 1950 through 2011.

Data Verification

# Read the bzip2-compressed NOAA storm CSV straight from disk (no manual unzip)
storm_data <- read.csv(file = "repdata_data_StormData.csv.bz2")

# Sanity-check the import: row/column counts first, then per-column types
dim(storm_data)
## [1] 902297     37
str(storm_data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

DATA PROCESSING

Apply exponent columns to damage values

#' Translate damage-exponent codes into numeric multipliers
#'
#' @param e Vector of exponent codes (character or factor), e.g. the
#'   PROPDMGEXP / CROPDMGEXP columns.
#' @return A numeric vector the same length as `e`: 1e2 for "H"/"h", 1e3 for
#'   "K"/"k", 1e6 for "M"/"m", 1e9 for "B"/"b", and 1 for every other value
#'   (including "" and NA).
exp_values <- function(e) {
  codes       <- c("H", "h", "K", "k", "M", "m", "B", "b")
  multipliers <- c(1e2, 1e2, 1e3, 1e3, 1e6, 1e6, 1e9, 1e9)

  # match() returns the position of each code in `codes` (NA when the code is
  # not listed), so one indexed lookup replaces the nested ifelse() chain. The
  # result is always numeric -- including numeric(0) for empty input, where
  # ifelse() returned logical(0).
  out <- multipliers[match(e, codes)]

  # NOTE(review): any unlisted code falls back to a multiplier of 1, i.e. the
  # raw damage figure is used unscaled -- confirm this is the intended
  # treatment for codes other than H/K/M/B.
  out[is.na(out)] <- 1
  out
}

# Attach the numeric multiplier for each exponent code, then scale the raw
# PROPDMG / CROPDMG figures by it. Later entries in mutate() can see the
# multiplier columns created just above them.
storm_data <- storm_data %>%
  mutate(
    PROPDMGEXP2 = exp_values(PROPDMGEXP),
    CROPDMGEXP2 = exp_values(CROPDMGEXP),
    PROPDMGVAL  = PROPDMG * PROPDMGEXP2,
    CROPDMGVAL  = CROPDMG * CROPDMGEXP2
  )

Clean and standardize event type names

# Normalise event-type labels in one pass: upper-case, strip surrounding
# whitespace, then collapse common spelling variants. The replacements run in
# order, so "TSTM WINDS" first becomes "THUNDERSTORM WINDS" and is then
# reduced to "THUNDERSTORM WIND" by the following step. Every pattern is a
# plain literal, hence fixed = TRUE.
storm_data$EVTYPE <- storm_data$EVTYPE %>%
  toupper() %>%
  trimws() %>%
  gsub("TSTM WIND", "THUNDERSTORM WIND", x = ., fixed = TRUE) %>%
  gsub("THUNDERSTORM WINDS", "THUNDERSTORM WIND", x = ., fixed = TRUE) %>%
  gsub("HURRICANE/TYPHOON", "HURRICANE", x = ., fixed = TRUE) %>%
  gsub("RIP CURRENTS", "RIP CURRENT", x = ., fixed = TRUE) %>%
  gsub("WILD/FOREST FIRE", "WILDFIRE", x = ., fixed = TRUE) %>%
  gsub("URBAN/SML STREAM FLD", "FLOOD", x = ., fixed = TRUE)

Table: Standardizations List

| Raw Value | Standardized To |
|---|---|
| “TSTM WIND” | “THUNDERSTORM WIND” |
| “THUNDERSTORM WINDS” | “THUNDERSTORM WIND” |
| “HURRICANE/TYPHOON” | “HURRICANE” |
| “RIP CURRENTS” | “RIP CURRENT” |
| “WILD/FOREST FIRE” | “WILDFIRE” |
| “URBAN/SML STREAM FLD” | “FLOOD” |

Aggregate total fatalities and injuries by event type

# Per event type: sum fatalities and injuries (ignoring NAs), add their
# combined total, and rank event types from most to least harmful.
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(across(c(FATALITIES, INJURIES), ~ sum(.x, na.rm = TRUE))) %>%
  mutate(TOTAL_HEALTH_IMPACT = FATALITIES + INJURIES) %>%
  arrange(desc(TOTAL_HEALTH_IMPACT))

Aggregate total property and crop damage by event type

# Per event type: sum the scaled property and crop damage (ignoring NAs),
# add them into one total, and rank event types from costliest to cheapest.
economic_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    PROPERTY_DAMAGE       = sum(PROPDMGVAL, na.rm = TRUE),
    CROP_DAMAGE           = sum(CROPDMGVAL, na.rm = TRUE),
    # The two summaries defined just above are already visible here
    TOTAL_ECONOMIC_IMPACT = PROPERTY_DAMAGE + CROP_DAMAGE
  ) %>%
  arrange(desc(TOTAL_ECONOMIC_IMPACT))

RESULTS

Identify top 5 events causing the most fatalities and injuries

# health_impact is already sorted in descending order, so the first five rows
# are the five most harmful event types.
top5_health <- health_impact %>% slice_head(n = 5)
knitr::kable(
  x = top5_health,
  caption = "Table 1: Top 5 storm events by total fatalities and injuries"
)
Table 1: Top 5 storm events by total fatalities and injuries
EVTYPE FATALITIES INJURIES TOTAL_HEALTH_IMPACT
TORNADO 5633 91346 96979
THUNDERSTORM WIND 701 9353 10054
EXCESSIVE HEAT 1903 6525 8428
FLOOD 498 6868 7366
LIGHTNING 816 5230 6046

Identify top 5 events causing the greatest economic losses

# economic_impact is already sorted in descending order, so the first five
# rows are the five costliest event types.
top5_economic <- economic_impact %>% slice_head(n = 5)
knitr::kable(
  x = top5_economic,
  caption = "Table 2: Top 5 storm events by total economic losses"
)
Table 2: Top 5 storm events by total economic losses
EVTYPE PROPERTY_DAMAGE CROP_DAMAGE TOTAL_ECONOMIC_IMPACT
FLOOD 144716019457 5670456550 150386476007
HURRICANE 81174159010 5349782800 86523941810
TORNADO 56937160779 414953270 57352114049
STORM SURGE 43323536000 5000 43323541000
HAIL 15732267543 3025954473 18758222016

Create bar-chart panels for top 5 health and economic impact events

library(ggplot2)
library(cowplot)

# Chart A: combined fatalities + injuries for the top 5 event types.
# reorder() sorts the bars by impact and coord_flip() draws them horizontally.
# geom_col() plots the supplied values directly (replaces
# geom_bar(stat = "identity")); label_comma() adds thousands separators.
p1 <- ggplot(top5_health, aes(x = reorder(EVTYPE, TOTAL_HEALTH_IMPACT), y = TOTAL_HEALTH_IMPACT)) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(labels = scales::label_comma()) +
  coord_flip() +
  labs(title = "Top 5 Events: Fatalities and Injuries",
       x = "Event Type",
       y = "Total Health Impact") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# Chart B: combined property + crop damage for the top 5 event types.
# The totals run into the hundreds of billions, so the axis is labelled in
# billions of dollars (e.g. "$150B") instead of scientific notation.
p2 <- ggplot(top5_economic, aes(x = reorder(EVTYPE, TOTAL_ECONOMIC_IMPACT), y = TOTAL_ECONOMIC_IMPACT)) +
  geom_col(fill = "darkgreen") +
  scale_y_continuous(labels = scales::label_dollar(scale = 1e-9, suffix = "B")) +
  coord_flip() +
  labs(title = "Top 5 Events: Economic Damage",
       x = "Event Type",
       y = "Total Economic Impact ($)") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# Empty placeholder panel, used only to add vertical space between the charts
spacer <- ggdraw() + theme_void()

# Stack Chart A, the spacer, and Chart B in one column; rel_heights gives the
# spacer 20% of the height of each chart, so the gap can be tuned there.
plot_grid(p1, spacer, p2, labels = c("CHART A", "", "CHART B"), ncol = 1, rel_heights = c(1, 0.2, 1))
Figure 1: Top 5 events causing fatalities and injuries, and top 5 events causing economic damage

Figure 1: Top 5 events causing fatalities and injuries, and top 5 events causing economic damage

Normalized Comparison

Health vs Economic Impact by Event Type

library(dplyr)
library(ggplot2)
library(tidyr)

# Event types that appear in either top-5 list; union() drops the duplicates
combined_events <- union(top5_health$EVTYPE, top5_economic$EVTYPE)

# Build a long table with one row per (event type, impact measure):
#   1. put the health and economic totals side by side, keyed on EVTYPE
#   2. keep only the top events and treat a missing total as 0
#   3. divide each measure by its largest value among the kept events, so both
#      fall in the 0-1 range and can share one axis
#   4. stack the two normalized columns into Impact_Type / Value
data_lolli_v <- health_impact %>%
  select(EVTYPE, Health = TOTAL_HEALTH_IMPACT) %>%
  full_join(
    economic_impact %>% select(EVTYPE, Economic = TOTAL_ECONOMIC_IMPACT),
    by = "EVTYPE"
  ) %>%
  filter(EVTYPE %in% combined_events) %>%
  mutate(across(c(Health, Economic), ~ replace_na(.x, 0))) %>%
  mutate(
    across(
      c(Health, Economic),
      ~ .x / max(.x, na.rm = TRUE),
      .names = "{.col}_Norm"
    )
  ) %>%
  pivot_longer(
    cols      = c(Health_Norm, Economic_Norm),
    names_to  = "Impact_Type",
    values_to = "Value"
  )

# Vertical lollipop plot: one stem + dot per event type and impact measure,
# with event types ordered along the x axis by reorder(EVTYPE, Value).
ggplot(data_lolli_v, aes(x = reorder(EVTYPE, Value), y = Value, color = Impact_Type)) +
  geom_segment(aes(xend = EVTYPE, y = 0, yend = Value), linewidth = 1.1, alpha = 0.8) +
  geom_point(size = 4) +
  # `breaks` is given explicitly so each label is paired with the intended
  # level. Without it the breaks follow sorted order (Economic_Norm first) and
  # an unnamed `labels` vector would caption the economic series as
  # "Health Impact" and vice versa.
  # Colours follow the Figure 1 bar charts: health = steelblue,
  # economic = darkgreen.
  scale_color_manual(
    values = c("Health_Norm" = "steelblue",
               "Economic_Norm" = "darkgreen"),
    breaks = c("Health_Norm", "Economic_Norm"),
    labels = c("Health Impact", "Economic Impact")
  ) +
  labs(
    title = "Normalized Comparison\nHealth vs Economic Impact",
    x = "Event Type",
    y = "Relative Impact (0–1)",
    color = "Impact Type"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    # Tilt the event-type labels so long names do not overlap
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top",
    panel.grid.minor = element_blank()
  )
Figure 2: Normalized comparison of health and economic impacts by event type

Figure 2: Normalized comparison of health and economic impacts by event type

TECH SPECS

Operating System: Linux Mint 22.2 (x86_64)

Host Machine: Mac mini (Model 7,1)

Kernel: 6.14.0-33-generic

Desktop Environment: Cinnamon 6.4.8 (Mutter / Muffin WM)

CPU: Intel Core i5-4278U (4 cores, 3.1 GHz)

GPU: Intel Haswell-ULT Integrated Graphics

Memory: 7.8 GB (≈3.8 GB used at runtime)

Display Resolution: 1920×1080 × 2 (monitors)

PROGRAM

RStudio 2025.09.1 Build 401

PACKAGES

Core Data & Analysis

tidyverse 2.0.0 (includes dplyr 1.1.4, ggplot2 4.0.0, tidyr 1.3.1, readr 2.1.5, tibble 3.3.0, purrr 1.1.0, stringr 1.5.2, forcats 1.0.1)

data.table 1.17.8 (fast data loading and aggregation)

lubridate 1.9.4 (date handling and extraction)

reshape2 1.4.4 (data reshaping for grouped summaries)

Visualization & Plot Arrangement

ggplot2 (within tidyverse)

cowplot 1.2.0 (plot layout and labeling)

gridExtra 2.3 (grid-based plot arrangement)

scales 1.4.0 (numeric labeling, currency formatting)

Reporting & Output

knitr 1.50 (tables and chunk rendering)

rmarkdown 2.30 (document compilation and knitting)