Synopsis
This report uses the US National Oceanic and Atmospheric
Administration (NOAA) storm database (1950–2011) to identify which
severe weather event types (1) are most harmful to the health of the
population and (2) have the greatest economic consequences. We load the raw
data file (csv.bz2) directly, standardize event labels, and
compute total fatalities, injuries, and inflation-unadjusted property and
crop damages. We find that a small number of event classes dominate
impacts, with tornadoes leading human harm and floods driving economic
losses. Methods and code are below.
Data Processing Loading the raw data
We start from the original compressed CSV.
# Global chunk defaults for the report: echo the code, and keep package
# messages and warnings out of the rendered output.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# install.packages(c("tidyverse","lubridate","readr","stringr","scales"))
# run once if needed
library(tidyverse)
library(lubridate)
library(readr)
library(stringr)
library(scales)
library(conflicted)
# The compressed NOAA extract is expected in the working directory.
f <- "repdata_data_StormData.csv.bz2"

# Read the file as Latin-1 so extended characters in the free-text fields are
# tolerated; the column-specification message is switched off.
raw <- read_csv(
  file = f,
  locale = locale(encoding = "latin1"),
  show_col_types = FALSE
)
# Narrow the raw table to the eight columns the analysis uses, then coerce the
# begin date to a date-time and the event type / casualty counts to the types
# the later steps expect. The damage columns are carried over unchanged.
dat <- raw %>%
  dplyr::select(
    BGN_DATE, EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  ) %>%
  mutate(
    BGN_DATE = mdy_hms(BGN_DATE, quiet = TRUE),
    EVTYPE = as.character(EVTYPE),
    FATALITIES = as.numeric(FATALITIES),
    INJURIES = as.numeric(INJURIES)
  )

# Sanity check on the parsed date range.
summary(dat$BGN_DATE)
## Min. 1st Qu.
## "1950-01-03 00:00:00.0000" "1995-04-20 00:00:00.0000"
## Median Mean
## "2002-03-18 00:00:00.0000" "1998-12-27 23:37:48.9970"
## 3rd Qu. Max.
## "2007-07-28 00:00:00.0000" "2011-11-30 00:00:00.0000"
Event type standardization (justification)
EVTYPE contains many near-duplicates (e.g., “TSTM WIND”, “THUNDERSTORM WIND”, extra spaces). We apply lightweight normalization that preserves meaning while merging obvious variants:
- Uppercase, trim leading/trailing spaces, and collapse repeated whitespace
- Alias common synonyms (e.g., TSTM → THUNDERSTORM)
- Replace separator punctuation ("/" and "-") with spaces, since it doesn’t convey type
- Map a few high-frequency prefixes to canonical names
This reduces fragmentation and yields more interpretable totals without over-engineering a full taxonomy.
# Map raw EVTYPE labels onto a small set of canonical event names.
#
# x: vector of raw event-type labels.
# Returns a character vector of the same length. A label that matches none of
# the canonical prefixes is returned in its cleaned form (upper-cased,
# separators replaced by spaces, whitespace squished).
#
# NOTE(review): "WILD/FOREST FIRE" is cleaned to "WILD FOREST FIRE", which
# starts with none of "WILDFIRE", "WILD FIRE" or "FOREST FIRE", so it is not
# folded into WILDFIRE. Confirm whether that is intended.
canon_evtype <- function(x) {
  label <- toupper(str_trim(x))
  label <- str_replace_all(label, "[[:space:]]+", " ")
  # "/" and "-" act as separators in the raw labels; treat them as spaces.
  label <- str_replace_all(label, "[/-]", " ")

  # Literal (non-regex) aliases, applied in this order.
  aliases <- c(
    "TSTM" = "THUNDERSTORM",
    "THUNDERSTORM WINDS" = "THUNDERSTORM WIND",
    "HURRICANE TYPHOON" = "HURRICANE",
    "RIP CURRENTS" = "RIP CURRENT",
    "EXTREME COLD WIND CHILL" = "EXTREME COLD"
  )
  for (from in names(aliases)) {
    label <- str_replace_all(label, fixed(from), aliases[[from]])
  }
  label <- str_squish(label)

  # TRUE where the cleaned label begins with any of the given prefixes.
  has_prefix <- function(...) {
    Reduce(`|`, lapply(c(...), function(p) startsWith(label, p)))
  }

  # First matching rule wins; anything unmatched keeps its cleaned label.
  dplyr::case_when(
    has_prefix("TORNADO") ~ "TORNADO",
    has_prefix("HURRICANE", "TYPHOON") ~ "HURRICANE",
    has_prefix("THUNDERSTORM") ~ "THUNDERSTORM WIND",
    has_prefix("FLASH FLOOD") ~ "FLASH FLOOD",
    has_prefix("FLOOD") ~ "FLOOD",
    has_prefix("WILDFIRE", "WILD FIRE", "FOREST FIRE") ~ "WILDFIRE",
    has_prefix("HEAT", "EXCESSIVE HEAT") ~ "HEAT",
    has_prefix("COLD", "EXTREME COLD", "RECORD COLD") ~ "COLD",
    has_prefix("WINTER STORM") ~ "WINTER STORM",
    has_prefix("HIGH WIND", "STRONG WIND") ~ "HIGH WIND",
    has_prefix("HAIL") ~ "HAIL",
    has_prefix("DROUGHT") ~ "DROUGHT",
    has_prefix("ICE STORM") ~ "ICE STORM",
    has_prefix("STORM SURGE") ~ "STORM SURGE/TIDE",
    TRUE ~ label
  )
}
# Add EVTYPE_C, the canonicalised event type used for all grouping below.
dat <- mutate(dat, EVTYPE_C = canon_evtype(EVTYPE))
Damage exponents (justification)
Property/crop damage units use exponents in PROPDMGEXP / CROPDMGEXP. Following common practice, we interpret H=10², K=10³, M=10⁶, B=10⁹; digits 0–9 as 10^digit; blanks and other symbols as 1.
# Translate damage-exponent codes into numeric multipliers.
#
# x: vector of exponent codes (coerced to character; case and surrounding
#    whitespace are ignored).
# Returns an unnamed numeric vector the same length as x: H -> 1e2, K -> 1e3,
# M -> 1e6, B -> 1e9, a single digit d -> 10^d, and 1 for anything else
# (including NA and the empty string).
exp_to_mult <- function(x) {
  code <- toupper(trimws(as.character(x)))

  # One lookup table for both the letter codes and the single digits 0-9.
  lookup <- c(
    H = 1e2, K = 1e3, M = 1e6, B = 1e9,
    setNames(10 ^ (0:9), as.character(0:9))
  )

  mult <- unname(lookup[match(code, names(lookup))])
  # Unrecognised codes fall back to a neutral multiplier.
  mult[is.na(mult)] <- 1
  mult
}
# Step 1: turn the exponent codes into numeric multipliers.
# Step 2: scale the reported damage figures to dollars and derive the combined
# economic and human-impact columns. Missing casualty counts are treated as 0
# in HUMAN_IMPACT only.
dat <- dat %>%
  mutate(
    PROP_MULT = exp_to_mult(PROPDMGEXP),
    CROP_MULT = exp_to_mult(CROPDMGEXP)
  ) %>%
  mutate(
    PROP_DMG_USD = PROPDMG * PROP_MULT,
    CROP_DMG_USD = CROPDMG * CROP_MULT,
    ECON_DAMAGE = PROP_DMG_USD + CROP_DMG_USD,
    HUMAN_IMPACT = coalesce(FATALITIES, 0) + coalesce(INJURIES, 0)
  )
Results Which event types are most harmful to population health?
# Fatalities and injuries summed per canonical event type, ranked by their
# combined total; only the ten highest-ranked types are kept.
health_top <- dat %>%
  group_by(EVTYPE_C) %>%
  summarise(
    Fatalities = sum(FATALITIES, na.rm = TRUE),
    Injuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  mutate(HumanImpact = Fatalities + Injuries) %>%
  arrange(desc(HumanImpact)) %>%
  slice_head(n = 10)

health_top
## # A tibble: 10 × 4
## EVTYPE_C Fatalities Injuries HumanImpact
## <chr> <dbl> <dbl> <dbl>
## 1 TORNADO 5658 91364 97022
## 2 HEAT 3021 9019 12040
## 3 THUNDERSTORM WIND 710 9508 10218
## 4 FLOOD 495 6806 7301
## 5 LIGHTNING 816 5230 6046
## 6 FLASH FLOOD 1018 1785 2803
## 7 HIGH WIND 404 1772 2176
## 8 ICE STORM 89 1977 2066
## 9 WINTER STORM 217 1353 1570
## 10 HURRICANE 135 1333 1468
# Horizontal bar chart of the top event types, ordered by combined
# fatalities and injuries.
ggplot(
  data = health_top,
  mapping = aes(x = reorder(EVTYPE_C, HumanImpact), y = HumanImpact)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Human Impact by Event Type",
    x = NULL,
    y = "Fatalities + Injuries"
  )
# Format a vector as an English list for inline text.
#
# x: vector of items (coerced to character).
# Returns a single string: "" for empty input, the item itself for one item,
# "A and B" for two, and "A, B and C" for three or more.
fmt_list <- function(x) {
  x <- as.character(x)
  # Without this guard an empty vector falls through to the final paste()
  # and comes back as " and ".
  if (length(x) == 0) return("")
  if (length(x) == 1) return(x)
  if (length(x) == 2) return(paste(x, collapse = " and "))
  paste(paste(x[-length(x)], collapse = ", "), "and", x[length(x)])
}
Interpretation (Health). The top three event types are TORNADO, HEAT and THUNDERSTORM WIND.
Which event types have the greatest economic consequences?
# Property and crop damage (nominal USD) summed per canonical event type,
# ranked by their combined total; only the ten highest-ranked types are kept.
econ_top <- dat %>%
  group_by(EVTYPE_C) %>%
  summarise(
    Property = sum(PROP_DMG_USD, na.rm = TRUE),
    Crops = sum(CROP_DMG_USD, na.rm = TRUE)
  ) %>%
  mutate(Economic = Property + Crops) %>%
  arrange(desc(Economic)) %>%
  slice_head(n = 10)

econ_top
## # A tibble: 10 × 4
## EVTYPE_C Property Crops Economic
## <chr> <dbl> <dbl> <dbl>
## 1 FLOOD 144958136816 5878707950 150836844766
## 2 HURRICANE 85356410010 5516117800 90872527810
## 3 TORNADO 58552151876. 417461470 58969613346.
## 4 STORM SURGE/TIDE 47964724000 855000 47965579000
## 5 HAIL 15977470013. 3026094623 19003564636.
## 6 FLASH FLOOD 17414731089. 1437163150 18851894239.
## 7 DROUGHT 1046106000 13972571780 15018677780
## 8 THUNDERSTORM WIND 9970811300. 1225458988 11196270288.
## 9 RIVER FLOOD 5118945500 5029459000 10148404500
## 10 ICE STORM 3944927860 5022113500 8967041360
# Horizontal bar chart of the top event types by combined property and crop
# damage, with the value axis labelled in dollars.
ggplot(
  data = econ_top,
  mapping = aes(x = reorder(EVTYPE_C, Economic), y = Economic)
) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_dollar()) +
  labs(
    title = "Economic Damage by Event Type",
    x = NULL,
    y = "Damage (USD, nominal)"
  )
# Join items into an English list for inline text.
#
# x: vector of items (coerced to character).
# Returns one string: "" when x is empty, the single item when there is one,
# "A and B" for a pair, and "A, B and C" for three or more.
fmt_list <- function(x) {
  items <- as.character(x)
  n <- length(items)

  if (n == 0) {
    return("")
  }
  if (n == 1) {
    return(items)
  }
  if (n == 2) {
    return(paste(items, collapse = " and "))
  }

  # Comma-separate everything but the last item, then attach it with "and".
  leading <- paste(items[seq_len(n - 1)], collapse = ", ")
  paste(leading, "and", items[n])
}
Interpretation (Economic). Total nominal damages are led by FLOOD, followed by HURRICANE and TORNADO.
Reproducibility notes
- Source: original NOAA Storm Data CSV (1950–2011), loaded directly from repdata_data_StormData.csv.bz2.
- All transformations are shown above; heavy steps use cache=TRUE where appropriate.
- Figures are limited to two and include descriptive captions.
- Session info:
# Print the R version, platform, locale and package versions of this session.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=German_Switzerland.utf8 LC_CTYPE=German_Switzerland.utf8
## [3] LC_MONETARY=German_Switzerland.utf8 LC_NUMERIC=C
## [5] LC_TIME=German_Switzerland.utf8
##
## time zone: Europe/Zurich
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] conflicted_1.2.0 scales_1.3.0 lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.9 utf8_1.2.4 generics_0.1.3 stringi_1.8.4
## [5] hms_1.1.3 digest_0.6.37 magrittr_2.0.3 evaluate_1.0.0
## [9] grid_4.4.1 timechange_0.3.0 fastmap_1.2.0 jsonlite_1.8.8
## [13] fansi_1.0.6 jquerylib_0.1.4 cli_3.6.3 rlang_1.1.4
## [17] crayon_1.5.3 bit64_4.5.2 munsell_0.5.1 withr_3.0.1
## [21] cachem_1.1.0 yaml_2.3.10 tools_4.4.1 parallel_4.4.1
## [25] tzdb_0.4.0 memoise_2.0.1 colorspace_2.1-1 vctrs_0.6.5
## [29] R6_2.5.1 lifecycle_1.0.4 bit_4.5.0 vroom_1.6.5
## [33] pkgconfig_2.0.3 pillar_1.9.0 bslib_0.8.0 gtable_0.3.5
## [37] glue_1.7.0 highr_0.11 xfun_0.47 tidyselect_1.2.1
## [41] rstudioapi_0.16.0 knitr_1.48 farver_2.1.2 htmltools_0.5.8.1
## [45] rmarkdown_2.28 labeling_0.4.3 compiler_4.4.1