# Load ALL packages required for the analysis
library(tidyverse) # Includes dplyr, ggplot2, tidyr, readr
library(data.table)
library(cowplot)
library(gridExtra)
library(knitr)
library(lubridate)
library(reshape2)
library(rmarkdown)
library(scales)
This report explores the NOAA Storm Events Database to identify which types of weather events cause the greatest harm to population health and the largest economic losses across the United States. Using data collected by the National Weather Service from 1950 through 2011, the analysis aggregates casualties and damage by storm type and visualizes the results to compare their relative effects. The results may help emergency planners and agencies prepare for severe weather and prioritize resources more effectively.
The data come from the NOAA Storm Events Database, maintained by the National Climatic Data Center (NCDC), part of the National Oceanic and Atmospheric Administration (NOAA). The database includes U.S. storm and weather event data collected by the National Weather Service (NWS) from 1950 through 2011.
# Read the raw NOAA storm records straight from the bzip2-compressed CSV
storm_data <- read.csv(file = "repdata_data_StormData.csv.bz2")
# Inspect the size of the loaded table (rows, columns)
dim(storm_data)
## [1] 902297 37
# Inspect column names, types and the first few values of each column
str(storm_data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
#' Convert damage-exponent codes into numeric multipliers
#'
#' Maps the letter codes used in the `PROPDMGEXP` / `CROPDMGEXP` columns to
#' the multiplier they stand for: H = 1e2, K = 1e3, M = 1e6, B = 1e9
#' (case-insensitive). Every other value -- empty strings, digits, symbols,
#' `NA` -- maps to 1, so the accompanying damage figure is left unscaled.
#'
#' @param e A vector of exponent codes (character, or coercible to
#'   character).
#' @return An unnamed double vector with one multiplier per element of `e`.
#'   The result is type-stable: zero-length input yields `numeric(0)`.
exp_values <- function(e) {
  multipliers <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  # match() gives NA for unrecognised codes and for NA input alike
  idx <- match(toupper(e), names(multipliers))
  out <- unname(multipliers[idx])
  # Anything that is not a known letter code leaves the value unscaled
  out[is.na(idx)] <- 1
  out
}
# Translate each exponent code into its multiplier, then scale the raw
# damage figures into absolute amounts
storm_data[["PROPDMGEXP2"]] <- exp_values(storm_data[["PROPDMGEXP"]])
storm_data[["CROPDMGEXP2"]] <- exp_values(storm_data[["CROPDMGEXP"]])
storm_data[["PROPDMGVAL"]] <-
  storm_data[["PROPDMG"]] * storm_data[["PROPDMGEXP2"]]
storm_data[["CROPDMGVAL"]] <-
  storm_data[["CROPDMG"]] * storm_data[["CROPDMGEXP2"]]

# Upper-case the event labels, then strip leading/trailing whitespace
storm_data[["EVTYPE"]] <- trimws(toupper(storm_data[["EVTYPE"]]))

# Common spelling variants and the label each one is rewritten to.
# The substitutions run in the order listed; order matters because
# "TSTM WIND" must be expanded before the plural "WINDS" is collapsed.
evtype_variants <- c(
  "TSTM WIND" = "THUNDERSTORM WIND",
  "THUNDERSTORM WINDS" = "THUNDERSTORM WIND",
  "HURRICANE/TYPHOON" = "HURRICANE",
  "RIP CURRENTS" = "RIP CURRENT",
  "WILD/FOREST FIRE" = "WILDFIRE",
  "URBAN/SML STREAM FLD" = "FLOOD"
)
storm_data[["EVTYPE"]] <- Reduce(
  function(labels, i) {
    gsub(names(evtype_variants)[i], evtype_variants[[i]], labels)
  },
  seq_along(evtype_variants),
  storm_data[["EVTYPE"]]
)
Table: Standardizations List
| Raw Value | Standardized To |
|---|---|
| “TSTM WIND” | “THUNDERSTORM WIND” |
| “THUNDERSTORM WINDS” | “THUNDERSTORM WIND” |
| “HURRICANE/TYPHOON” | “HURRICANE” |
| “RIP CURRENTS” | “RIP CURRENT” |
| “WILD/FOREST FIRE” | “WILDFIRE” |
| “URBAN/SML STREAM FLD” | “FLOOD” |
# Total fatalities and injuries per event type, ranked by their combined
# count (largest first)
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    across(c(FATALITIES, INJURIES), function(col) sum(col, na.rm = TRUE))
  ) %>%
  mutate(TOTAL_HEALTH_IMPACT = FATALITIES + INJURIES) %>%
  arrange(desc(TOTAL_HEALTH_IMPACT))
# Total property and crop damage per event type, ranked by their combined
# value (largest first)
economic_impact <- storm_data %>%
  select(EVTYPE, PROPERTY_DAMAGE = PROPDMGVAL, CROP_DAMAGE = CROPDMGVAL) %>%
  group_by(EVTYPE) %>%
  summarise(
    across(
      c(PROPERTY_DAMAGE, CROP_DAMAGE),
      function(col) sum(col, na.rm = TRUE)
    )
  ) %>%
  mutate(TOTAL_ECONOMIC_IMPACT = PROPERTY_DAMAGE + CROP_DAMAGE) %>%
  arrange(desc(TOTAL_ECONOMIC_IMPACT))
# Keep the five highest-ranked rows (health_impact is already sorted) and
# render them as a captioned table
top5_health <- health_impact %>%
  slice_head(n = 5)
knitr::kable(
  top5_health,
  caption = "Table 1: Top 5 storm events by total fatalities and injuries"
)
| EVTYPE | FATALITIES | INJURIES | TOTAL_HEALTH_IMPACT |
|---|---|---|---|
| TORNADO | 5633 | 91346 | 96979 |
| THUNDERSTORM WIND | 701 | 9353 | 10054 |
| EXCESSIVE HEAT | 1903 | 6525 | 8428 |
| FLOOD | 498 | 6868 | 7366 |
| LIGHTNING | 816 | 5230 | 6046 |
# Keep the five highest-ranked rows (economic_impact is already sorted) and
# render them as a captioned table
top5_economic <- economic_impact %>%
  slice_head(n = 5)
knitr::kable(
  top5_economic,
  caption = "Table 2: Top 5 storm events by total economic losses"
)
| EVTYPE | PROPERTY_DAMAGE | CROP_DAMAGE | TOTAL_ECONOMIC_IMPACT |
|---|---|---|---|
| FLOOD | 144716019457 | 5670456550 | 150386476007 |
| HURRICANE | 81174159010 | 5349782800 | 86523941810 |
| TORNADO | 56937160779 | 414953270 | 57352114049 |
| STORM SURGE | 43323536000 | 5000 | 43323541000 |
| HAIL | 15732267543 | 3025954473 | 18758222016 |
library(ggplot2)
library(cowplot)
# Figure 1: two horizontal bar charts stacked vertically.
# Both charts use geom_col() (bars whose height is the value itself) and an
# explicit label formatter so large values are not shown in scientific
# notation.

# Chart A: combined fatalities + injuries for the top 5 event types,
# with bars ordered by that total
p1 <- ggplot(
  top5_health,
  aes(x = reorder(EVTYPE, TOTAL_HEALTH_IMPACT), y = TOTAL_HEALTH_IMPACT)
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(labels = scales::label_comma()) +
  coord_flip() +
  labs(
    title = "Top 5 Events: Fatalities and Injuries",
    x = "Event Type",
    y = "Total Health Impact"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# Chart B: combined property + crop damage for the top 5 event types,
# with bars ordered by that total; axis labels are shown in billions of
# dollars (e.g. "$50B")
p2 <- ggplot(
  top5_economic,
  aes(x = reorder(EVTYPE, TOTAL_ECONOMIC_IMPACT), y = TOTAL_ECONOMIC_IMPACT)
) +
  geom_col(fill = "darkgreen") +
  scale_y_continuous(
    labels = scales::label_dollar(scale = 1e-9, suffix = "B")
  ) +
  coord_flip() +
  labs(
    title = "Top 5 Events: Economic Damage",
    x = "Event Type",
    y = "Total Economic Impact ($)"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# Empty placeholder plot used purely as vertical padding between the charts
spacer <- ggdraw() + theme_void()

# Stack chart A, the spacer and chart B in one column; rel_heights controls
# how much vertical room the spacer takes relative to the charts
plot_grid(
  p1, spacer, p2,
  labels = c("CHART A", "", "CHART B"),
  ncol = 1,
  rel_heights = c(1, 0.2, 1)
)
Figure 1: Top 5 events causing fatalities and injuries, and top 5 events causing economic damage
library(dplyr)
library(ggplot2)
library(tidyr)
# Figure 2: compare health and economic impact on a common 0-1 scale.

# Event types appearing in either top-5 list, with duplicates removed
combined_events <- union(top5_health$EVTYPE, top5_economic$EVTYPE)

# One row per (event type, impact type) with the normalized value
data_lolli_v <- full_join(
  select(health_impact, EVTYPE, Health = TOTAL_HEALTH_IMPACT),
  select(economic_impact, EVTYPE, Economic = TOTAL_ECONOMIC_IMPACT),
  by = "EVTYPE"
) %>%
  filter(EVTYPE %in% combined_events) %>%
  # An event type missing from one side of the join contributes zero
  replace_na(list(Health = 0, Economic = 0)) %>%
  # Divide each measure by its maximum among the selected events so both
  # fall in the 0-1 range and can share one axis
  mutate(
    Health_Norm = Health / max(Health, na.rm = TRUE),
    Economic_Norm = Economic / max(Economic, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = c(Health_Norm, Economic_Norm),
    names_to = "Impact_Type",
    values_to = "Value"
  )

# Vertical lollipop plot: a stem from 0 up to the value, capped by a point
ggplot(
  data_lolli_v,
  aes(x = reorder(EVTYPE, Value), y = Value, color = Impact_Type)
) +
  geom_segment(
    aes(xend = EVTYPE, y = 0, yend = Value),
    linewidth = 1.1,
    alpha = 0.8
  ) +
  geom_point(size = 4) +
  # breaks and labels are given in the same explicit order. Without breaks,
  # an unnamed labels vector is paired with the alphabetically sorted levels
  # (Economic_Norm, Health_Norm), which swaps the two legend labels.
  scale_color_manual(
    values = c(
      "Health_Norm" = "darkgreen",
      "Economic_Norm" = "steelblue"
    ),
    breaks = c("Health_Norm", "Economic_Norm"),
    labels = c("Health Impact", "Economic Impact")
  ) +
  labs(
    title = "Normalized Comparison\nHealth vs Economic Impact",
    x = "Event Type",
    y = "Relative Impact (0–1)",
    color = "Impact Type"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top",
    panel.grid.minor = element_blank()
  )
Figure 2: Normalized comparison of health and economic impacts by event type
Operating System: Linux Mint 22.2 (x86_64)
Host Machine: Mac mini (Model 7,1)
Kernel: 6.14.0-33-generic
Desktop Environment: Cinnamon 6.4.8 (Mutter / Muffin WM)
CPU: Intel Core i5-4278U (4 cores, 3.1 GHz)
GPU: Intel Haswell-ULT Integrated Graphics
Memory: 7.8 GB (≈3.8 GB used at runtime)
Display Resolution: 1920×1080 × 2 (monitors)
RStudio 2025.09.1 Build 401
Core Data & Analysis
tidyverse 2.0.0 (includes dplyr 1.1.4, ggplot2 4.0.0, tidyr 1.3.1, readr 2.1.5, tibble 3.3.0, purrr 1.1.0, stringr 1.5.2, forcats 1.0.1)
data.table 1.17.8 (fast data loading and aggregation)
lubridate 1.9.4 (date handling and extraction)
reshape2 1.4.4 (data reshaping for grouped summaries)
Visualization & Plot Arrangement
ggplot2 (within tidyverse)
cowplot 1.2.0 (plot layout and labeling)
gridExtra 2.3 (grid-based plot arrangement)
scales 1.4.0 (numeric labeling, currency formatting)
Reporting & Output
knitr 1.50 (tables and chunk rendering)
rmarkdown 2.30 (document compilation and knitting)