# Load ALL packages required for the analysis
library(tidyverse)  # Includes dplyr, ggplot2, tidyr, readr
library(data.table)
library(cowplot)
library(gridExtra)
library(knitr)
library(lubridate)
library(reshape2)
library(rmarkdown)
library(scales)

Synopsis

This report explores the NOAA Storm Events Database to identify which weather events have the greatest impact on population health and economic loss across the United States. Using data collected by the National Weather Service from 1950 through 2011, the analysis aggregates and visualizes storm types to compare their relative effects. The results may help emergency planners and agencies prepare for severe weather and prioritize resources more effectively.

Data Source and Documentation

The data come from the NOAA Storm Events Database, which is maintained by the National Climatic Data Center (NCDC), part of the National Oceanic and Atmospheric Administration (NOAA). The database contains U.S. storm and weather event records collected by the National Weather Service (NWS) from 1950 through 2011.

Data Verification

# Read the NOAA storm CSV straight from its bzip2 archive
storm_data <- read.csv(file = "repdata_data_StormData.csv.bz2")

# Report the number of rows and columns that were loaded
dim(x = storm_data)

## [1] 902297     37

# Show each column's type together with its first few values
str(object = storm_data)

## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

DATA PROCESSING

Apply exponent columns to damage values

# Translate damage-exponent codes into numeric multipliers.
#
# Args:
#   e: vector of exponent codes (character or factor), e.g. the PROPDMGEXP or
#      CROPDMGEXP column.
#
# Returns:
#   A double vector with one multiplier per element of `e`:
#   "H"/"h" -> 1e2, "K"/"k" -> 1e3, "M"/"m" -> 1e6, "B"/"b" -> 1e9.
#   Every other value (empty string, digits, symbols, NA) maps to 1, so the
#   damage figure it is multiplied with is left unchanged.
#
# A single match() lookup replaces the nested ifelse() chain. Besides making
# one pass instead of four, it always returns a double vector; the ifelse()
# form returned logical(0) for zero-length input.
exp_values <- function(e) {
  codes       <- c("H", "h", "K", "k", "M", "m", "B", "b")
  multipliers <- c(1e2, 1e2, 1e3, 1e3, 1e6, 1e6, 1e9, 1e9)

  # match() yields NA for any code that is not in the table (including NA)
  result <- multipliers[match(e, codes)]
  result[is.na(result)] <- 1
  result
}

# Attach the numeric multipliers, then scale the raw damage figures by them
storm_data <- storm_data %>%
  mutate(
    PROPDMGEXP2 = exp_values(PROPDMGEXP),
    CROPDMGEXP2 = exp_values(CROPDMGEXP),
    PROPDMGVAL  = PROPDMG * PROPDMGEXP2,
    CROPDMGVAL  = CROPDMG * CROPDMGEXP2
  )

Clean and standardize event type names

# Normalise event labels: upper-case them, strip surrounding whitespace, then
# collapse common spelling variants. The substitutions are applied one after
# another in the order listed (name = text to find, value = replacement).
storm_data$EVTYPE <- local({
  variants <- c(
    "TSTM WIND"            = "THUNDERSTORM WIND",
    "THUNDERSTORM WINDS"   = "THUNDERSTORM WIND",
    "HURRICANE/TYPHOON"    = "HURRICANE",
    "RIP CURRENTS"         = "RIP CURRENT",
    "WILD/FOREST FIRE"     = "WILDFIRE",
    "URBAN/SML STREAM FLD" = "FLOOD"
  )

  labels <- trimws(toupper(storm_data$EVTYPE))
  for (variant in names(variants)) {
    labels <- gsub(variant, variants[[variant]], labels)
  }
  labels
})

Table: Standardizations List

Raw Value	Standardized To
“TSTM WIND”	“THUNDERSTORM WIND”
“THUNDERSTORM WINDS”	“THUNDERSTORM WIND”
“HURRICANE/TYPHOON”	“HURRICANE”
“RIP CURRENTS”	“RIP CURRENT”
“WILD/FOREST FIRE”	“WILDFIRE”
“URBAN/SML STREAM FLD”	“FLOOD”

Aggregate total fatalities and injuries by event type

# Per-event-type totals of fatalities and injuries, largest combined toll first
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    across(c(FATALITIES, INJURIES), ~ sum(.x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  mutate(TOTAL_HEALTH_IMPACT = FATALITIES + INJURIES) %>%
  arrange(desc(TOTAL_HEALTH_IMPACT))

Aggregate total property and crop damage by event type

# Per-event-type totals of property and crop damage, largest combined loss first
economic_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    PROPERTY_DAMAGE       = sum(PROPDMGVAL, na.rm = TRUE),
    CROP_DAMAGE           = sum(CROPDMGVAL, na.rm = TRUE),
    # summarise() can refer to the totals it has just created
    TOTAL_ECONOMIC_IMPACT = PROPERTY_DAMAGE + CROP_DAMAGE,
    .groups = "drop"
  ) %>%
  arrange(desc(TOTAL_ECONOMIC_IMPACT))

RESULTS

Identify top 5 events causing the most fatalities and injuries

# health_impact is already sorted in descending order, so the first five rows
# are the five most harmful event types
top5_health <- slice_head(health_impact, n = 5)
knitr::kable(
  x = top5_health,
  caption = "Table 1: Top 5 storm events by total fatalities and injuries"
)

Table 1: Top 5 storm events by total fatalities and injuries
EVTYPE	FATALITIES	INJURIES	TOTAL_HEALTH_IMPACT
TORNADO	5633	91346	96979
THUNDERSTORM WIND	701	9353	10054
EXCESSIVE HEAT	1903	6525	8428
FLOOD	498	6868	7366
LIGHTNING	816	5230	6046

Identify top 5 events causing the greatest economic losses

# economic_impact is already sorted in descending order, so the first five
# rows are the five costliest event types
top5_economic <- slice_head(economic_impact, n = 5)
knitr::kable(
  x = top5_economic,
  caption = "Table 2: Top 5 storm events by total economic losses"
)

Table 2: Top 5 storm events by total economic losses
EVTYPE	PROPERTY_DAMAGE	CROP_DAMAGE	TOTAL_ECONOMIC_IMPACT
FLOOD	144716019457	5670456550	150386476007
HURRICANE	81174159010	5349782800	86523941810
TORNADO	56937160779	414953270	57352114049
STORM SURGE	43323536000	5000	43323541000
HAIL	15732267543	3025954473	18758222016

Create bar-chart panels for top 5 health and economic impact events

library(ggplot2)
library(cowplot)

# Chart A: horizontal bars of combined fatalities and injuries for the five
# most harmful event types, longest bar at the top.
p1 <- ggplot(top5_health, aes(x = reorder(EVTYPE, TOTAL_HEALTH_IMPACT), y = TOTAL_HEALTH_IMPACT)) +
  geom_col(fill = "steelblue") +
  # Thousands separators keep large counts out of scientific notation
  scale_y_continuous(labels = scales::label_comma()) +
  coord_flip() +
  labs(title = "Top 5 Events: Fatalities and Injuries",
       x = "Event Type",
       y = "Total Health Impact") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# Chart B: horizontal bars of combined property and crop damage for the five
# costliest event types, longest bar at the top.
p2 <- ggplot(top5_economic, aes(x = reorder(EVTYPE, TOTAL_ECONOMIC_IMPACT), y = TOTAL_ECONOMIC_IMPACT)) +
  geom_col(fill = "darkgreen") +
  # Show tick labels in billions of dollars (e.g. "$50B") rather than as raw
  # values in scientific notation
  scale_y_continuous(labels = scales::label_dollar(scale = 1e-9, suffix = "B")) +
  coord_flip() +
  labs(title = "Top 5 Events: Economic Damage",
       x = "Event Type",
       y = "Total Economic Impact ($)") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# Empty panel used purely as vertical padding between the two charts
spacer <- ggdraw() + theme_void()

# Stack chart A, the spacer and chart B in one column; rel_heights controls
# how much vertical room the spacer takes relative to each chart
plot_grid(p1, spacer, p2, labels = c("CHART A", "", "CHART B"), ncol = 1, rel_heights = c(1, 0.2, 1))

Figure 1: Top 5 events causing fatalities and injuries, and top 5 events causing economic damage

Normalized Comparison

Health vs Economic Impact by Event Type

library(dplyr)
library(ggplot2)
library(tidyr)

# Event types that appear in either top-5 list, each listed once
combined_events <- unique(c(top5_health$EVTYPE, top5_economic$EVTYPE))

# One row per (event type, impact measure). Each measure is divided by its
# largest value among the selected events so both share a 0-1 scale.
data_lolli_v <- health_impact %>%
  select(EVTYPE, Health = TOTAL_HEALTH_IMPACT) %>%
  full_join(
    economic_impact %>% select(EVTYPE, Economic = TOTAL_ECONOMIC_IMPACT),
    by = "EVTYPE"
  ) %>%
  filter(EVTYPE %in% combined_events) %>%
  mutate(
    # An event missing from one of the tables counts as zero impact
    Health        = coalesce(Health, 0),
    Economic      = coalesce(Economic, 0),
    Health_Norm   = Health / max(Health, na.rm = TRUE),
    Economic_Norm = Economic / max(Economic, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols      = c(Health_Norm, Economic_Norm),
    names_to  = "Impact_Type",
    values_to = "Value"
  )

# Vertical lollipop plot comparing the normalized health and economic impact
# of each selected event type.
#
# `breaks` is given explicitly so that each legend label is paired with its
# own series. Without it the labels are matched to the breaks in sorted order
# ("Economic_Norm" before "Health_Norm"), which swapped the two legend
# entries. Colours follow the bar charts: health = steelblue,
# economic = darkgreen.
ggplot(data_lolli_v, aes(x = reorder(EVTYPE, Value), y = Value, color = Impact_Type)) +
  geom_segment(aes(xend = EVTYPE, y = 0, yend = Value), linewidth = 1.1, alpha = 0.8) +
  geom_point(size = 4) +
  scale_color_manual(
    breaks = c("Health_Norm", "Economic_Norm"),
    values = c("Health_Norm" = "steelblue",
               "Economic_Norm" = "darkgreen"),
    labels = c("Health Impact", "Economic Impact")
  ) +
  labs(
    title = "Normalized Comparison\nHealth vs Economic Impact",
    x = "Event Type",
    y = "Relative Impact (0–1)",
    color = "Impact Type"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    # Slant the event names so long labels do not overlap
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top",
    panel.grid.minor = element_blank()
  )

Figure 2: Normalized comparison of health and economic impacts by event type

TECH SPECS

Operating System: Linux Mint 22.2 (x86_64)

Host Machine: Mac mini (Model 7,1)

Kernel: 6.14.0-33-generic

Desktop Environment: Cinnamon 6.4.8 (Mutter / Muffin WM)

CPU: Intel Core i5-4278U (4 cores, 3.1 GHz)

GPU: Intel Haswell-ULT Integrated Graphics

Memory: 7.8 GB (≈3.8 GB used at runtime)

Display Resolution: 1920×1080 × 2 (monitors)

PROGRAM

RStudio 2025.09.1 Build 401

PACKAGES

Core Data & Analysis

tidyverse 2.0.0 (includes dplyr 1.1.4, ggplot2 4.0.0, tidyr 1.3.1, readr 2.1.5, tibble 3.3.0, purrr 1.1.0, stringr 1.5.2, forcats 1.0.1)

data.table 1.17.8 (fast data loading and aggregation)

lubridate 1.9.4 (date handling and extraction)

reshape2 1.4.4 (data reshaping for grouped summaries)

Visualization & Plot Arrangement

ggplot2 (within tidyverse)

cowplot 1.2.0 (plot layout and labeling)

gridExtra 2.3 (grid-based plot arrangement)

scales 1.4.0 (numeric labeling, currency formatting)

Reporting & Output

knitr 1.50 (tables and chunk rendering)

rmarkdown 2.30 (document compilation and knitting)

NOAA Storm Data: U.S. Impact Analysis

MIKEB

Date 2025-11-04