This report analyzes the U.S. NOAA Storm Database to identify which
event types are most harmful to population health and which have the
greatest economic consequences. The analysis starts from the original
compressed raw data file (StormData.csv.bz2) and performs
all preprocessing in this document. Population health impact is defined
as the sum of fatalities and injuries. Economic impact is defined as the
sum of property damage and crop damage after converting exponent codes
to numeric multipliers. Event-level impacts are aggregated by
EVTYPE across the United States. The results are presented
using ranked summaries and plots of the top event types. In this
dataset, tornado-related events tend to dominate health impacts.
Economic impact is dominated by a few high-magnitude event types (floods,
hurricanes/typhoons, tornadoes, and storm surge), and the exact ranking
depends on how EVTYPE labels are normalized. Because reporting quality
varies over time (especially earlier years), findings should be
interpreted as patterns in recorded data rather than absolute truth.
# Install any packages this report needs that are not yet available, so the
# document can be rendered on a machine that has never run it before.
required_packages <- c("dplyr", "ggplot2", "readr", "stringr", "tibble")
# vapply() always yields a logical vector (sapply() does not guarantee its
# return type), and the result is stored under a name that does not shadow
# base::missing().
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cloud.r-project.org")
}
library(dplyr)
library(ggplot2)
library(readr)
library(stringr)
library(tibble)
# Fetch the compressed storm database once and load it into memory.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_file <- "StormData.csv.bz2"
if (!file.exists(data_file)) {
  # Download to a temporary name and only promote it to `data_file` once the
  # transfer has succeeded. Otherwise an interrupted download would leave a
  # truncated archive that passes the file.exists() check on the next run.
  partial_file <- paste0(data_file, ".part")
  tryCatch(
    {
      status <- download.file(data_url, destfile = partial_file, mode = "wb")
      if (status != 0 || !file.rename(partial_file, data_file)) {
        stop("Failed to download ", data_url, call. = FALSE)
      }
    },
    # Remove any leftover partial file; a no-op after a successful rename.
    finally = unlink(partial_file)
  )
}
# read.csv() is handed the .bz2 path directly; no separate unpacking step.
storm <- read.csv(data_file, stringsAsFactors = FALSE)
dim(storm)
## [1] 902297 37
#' Convert damage exponent codes to numeric multipliers
#'
#' Codes are matched after trimming whitespace and upper-casing. The digits
#' "0" to "8" map to 10^digit, "H" to 1e2, "K" to 1e3, "M" to 1e6 and "B" to
#' 1e9. Every other value (empty string, symbols, other digits, NA) maps to 1,
#' so the damage figure is left unscaled.
#'
#' @param exp_code A vector of exponent codes; coerced with as.character().
#' @return An unnamed numeric vector the same length as `exp_code`.
exp_to_mult <- function(exp_code) {
  exp_code <- toupper(trimws(as.character(exp_code)))
  # A lookup table replaces the earlier case_when() branch that called
  # as.numeric() on the whole vector: that coerced non-digit codes such as
  # "K" or "" as well and raised "NAs introduced by coercion" warnings.
  multipliers <- c(
    stats::setNames(10^(0:8), as.character(0:8)),
    H = 1e2,
    K = 1e3,
    M = 1e6,
    B = 1e9
  )
  mult <- unname(multipliers[exp_code])
  # Codes absent from the table (including "" and NA) index to NA.
  mult[is.na(mult)] <- 1
  mult
}
# Build the analysis table in two passes: first the cleaned event label and
# the per-record multipliers, then the dollar amounts and combined impact
# measures that depend on them.
storm2 <- storm %>%
  mutate(
    EVTYPE_CLEAN = EVTYPE %>% toupper() %>% str_squish(),
    PROP_MULT = exp_to_mult(PROPDMGEXP),
    CROP_MULT = exp_to_mult(CROPDMGEXP)
  ) %>%
  mutate(
    PROP_DAMAGE_USD = PROPDMG * PROP_MULT,
    CROP_DAMAGE_USD = CROPDMG * CROP_MULT,
    HEALTH_IMPACT = FATALITIES + INJURIES,
    ECON_IMPACT = PROP_DAMAGE_USD + CROP_DAMAGE_USD
  )
# Distribution of the per-record health measure.
summary(storm2[["HEALTH_IMPACT"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1725 0.0000 1742.0000
# Distribution of the per-record economic measure (property + crop damage).
summary(storm2[["ECON_IMPACT"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00e+00 0.00e+00 0.00e+00 5.29e+05 1.00e+03 1.15e+11
# Total fatalities, injuries and combined health impact per cleaned event
# type, largest combined impact first.
health_by_event <- storm2 %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(
    across(
      c(FATALITIES, INJURIES, HEALTH_IMPACT),
      \(v) sum(v, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  arrange(desc(HEALTH_IMPACT))
# Total property damage, crop damage and combined economic impact per cleaned
# event type, largest combined impact first.
econ_by_event <- storm2 %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(
    across(
      c(PROP_DAMAGE_USD, CROP_DAMAGE_USD, ECON_IMPACT),
      \(v) sum(v, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  arrange(desc(ECON_IMPACT))
# Show the ten event types with the largest combined health impact.
head(health_by_event, n = 10)
## # A tibble: 10 × 4
## EVTYPE_CLEAN FATALITIES INJURIES HEALTH_IMPACT
## <chr> <dbl> <dbl> <dbl>
## 1 TORNADO 5633 91346 96979
## 2 EXCESSIVE HEAT 1903 6525 8428
## 3 TSTM WIND 504 6957 7461
## 4 FLOOD 470 6789 7259
## 5 LIGHTNING 816 5230 6046
## 6 HEAT 937 2100 3037
## 7 FLASH FLOOD 978 1777 2755
## 8 ICE STORM 89 1975 2064
## 9 THUNDERSTORM WIND 133 1488 1621
## 10 WINTER STORM 206 1321 1527
# Show the ten event types with the largest combined economic impact.
head(econ_by_event, n = 10)
## # A tibble: 10 × 4
## EVTYPE_CLEAN PROP_DAMAGE_USD CROP_DAMAGE_USD ECON_IMPACT
## <chr> <dbl> <dbl> <dbl>
## 1 FLOOD 144657709807 5661968450 150319678257
## 2 HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3 TORNADO 56947380676. 414953270 57362333946.
## 4 STORM SURGE 43323536000 5000 43323541000
## 5 HAIL 15735267513. 3025954473 18761221986.
## 6 FLASH FLOOD 16822723978. 1421317100 18244041078.
## 7 DROUGHT 1046106000 13972566000 15018672000
## 8 HURRICANE 11868319010 2741910000 14610229010
## 9 RIVER FLOOD 5118945500 5029459000 10148404500
## 10 ICE STORM 3944927860 5022113500 8967041360
# Keep the first ten rows of the ranked health table for plotting, then
# print them.
health_top10 <- slice_head(health_by_event, n = 10)
health_top10
## # A tibble: 10 × 4
## EVTYPE_CLEAN FATALITIES INJURIES HEALTH_IMPACT
## <chr> <dbl> <dbl> <dbl>
## 1 TORNADO 5633 91346 96979
## 2 EXCESSIVE HEAT 1903 6525 8428
## 3 TSTM WIND 504 6957 7461
## 4 FLOOD 470 6789 7259
## 5 LIGHTNING 816 5230 6046
## 6 HEAT 937 2100 3037
## 7 FLASH FLOOD 978 1777 2755
## 8 ICE STORM 89 1975 2064
## 9 THUNDERSTORM WIND 133 1488 1621
## 10 WINTER STORM 206 1321 1527
# Horizontal bar chart of the top ten event types by health impact. The event
# labels are reordered by impact so the bars appear sorted by impact.
ggplot(
  data = mutate(
    health_top10,
    EVTYPE_CLEAN = reorder(EVTYPE_CLEAN, HEALTH_IMPACT)
  ),
  mapping = aes(x = EVTYPE_CLEAN, y = HEALTH_IMPACT)
) +
  geom_col(fill = "#D55E00") +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Population Health Impact",
    subtitle = "Health impact = Fatalities + Injuries",
    x = "Event Type",
    y = "Total Health Impact"
  ) +
  theme_minimal(base_size = 12)
The highest health impacts are concentrated in a small set of event types. TORNADO ranks first by a wide margin, with 96,979 combined fatalities and injuries, more than eleven times the next category (EXCESSIVE HEAT, 8,428).
# Keep the first ten rows of the ranked economic table and add the total
# expressed in billions of dollars for plotting, then print the key columns.
econ_top10 <- mutate(
  slice_head(econ_by_event, n = 10),
  ECON_IMPACT_BILLION = ECON_IMPACT / 1e9
)
select(
  econ_top10,
  EVTYPE_CLEAN,
  PROP_DAMAGE_USD,
  CROP_DAMAGE_USD,
  ECON_IMPACT,
  ECON_IMPACT_BILLION
)
## # A tibble: 10 × 5
## EVTYPE_CLEAN PROP_DAMAGE_USD CROP_DAMAGE_USD ECON_IMPACT ECON_IMPACT_BILLION
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 FLOOD 144657709807 5661968450 1.50e11 150.
## 2 HURRICANE/TY… 69305840000 2607872800 7.19e10 71.9
## 3 TORNADO 56947380676. 414953270 5.74e10 57.4
## 4 STORM SURGE 43323536000 5000 4.33e10 43.3
## 5 HAIL 15735267513. 3025954473 1.88e10 18.8
## 6 FLASH FLOOD 16822723978. 1421317100 1.82e10 18.2
## 7 DROUGHT 1046106000 13972566000 1.50e10 15.0
## 8 HURRICANE 11868319010 2741910000 1.46e10 14.6
## 9 RIVER FLOOD 5118945500 5029459000 1.01e10 10.1
## 10 ICE STORM 3944927860 5022113500 8.97e 9 8.97
# Horizontal bar chart of the top ten event types by economic impact, in
# billions of dollars. The event labels are reordered by impact so the bars
# appear sorted by impact.
ggplot(
  data = mutate(
    econ_top10,
    EVTYPE_CLEAN = reorder(EVTYPE_CLEAN, ECON_IMPACT_BILLION)
  ),
  mapping = aes(x = EVTYPE_CLEAN, y = ECON_IMPACT_BILLION)
) +
  geom_col(fill = "#0072B2") +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Economic Consequences",
    subtitle = "Economic impact = Property Damage + Crop Damage",
    x = "Event Type",
    y = "Total Economic Impact (Billion USD)"
  ) +
  theme_minimal(base_size = 12)
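Economic losses are dominated by a few high-impact categories: FLOOD (about $150 billion), HURRICANE/TYPHOON (about $72 billion), TORNADO (about $57 billion), and STORM SURGE (about $43 billion). The ranking is sensitive to event labeling conventions in the source data; for example, HURRICANE and HURRICANE/TYPHOON, and FLOOD, FLASH FLOOD, and RIVER FLOOD, are reported as separate event types.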
# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tibble_3.3.1 stringr_1.6.0 readr_2.2.0 ggplot2_4.0.2 dplyr_1.2.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.2 tidyselect_1.2.1
## [5] jquerylib_0.1.4 scales_1.4.0 yaml_2.3.12 fastmap_1.2.0
## [9] R6_2.6.1 labeling_0.4.3 generics_0.1.4 knitr_1.51
## [13] bslib_0.10.0 pillar_1.11.1 RColorBrewer_1.1-3 tzdb_0.5.0
## [17] rlang_1.1.7 utf8_1.2.6 cachem_1.1.0 stringi_1.8.7
## [21] xfun_0.56 sass_0.4.10 S7_0.2.1 cli_3.6.5
## [25] withr_3.0.2 magrittr_2.0.4 digest_0.6.39 grid_4.5.2
## [29] hms_1.1.4 lifecycle_1.0.5 vctrs_0.7.1 evaluate_1.0.5
## [33] glue_1.8.0 farver_2.1.2 rmarkdown_2.30 tools_4.5.2
## [37] pkgconfig_2.0.3 htmltools_0.5.9