Synopsis

This report uses the US National Oceanic and Atmospheric Administration (NOAA) storm database (1950–2011) to identify which severe weather event types (1) are most harmful to the health of the population and (2) have the greatest economic consequences. We load the raw data file (csv.bz2) directly, standardize event labels, and
compute total fatalities, injuries, and inflation-unadjusted property and crop damages. We find that a small number of event classes dominate impacts, with tornadoes leading human harm and floods driving economic losses. Methods and code are below.

Data Processing Loading the raw data

We start from the original compressed CSV.

# Global chunk options: echo the code, hide package messages and warnings.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
# One-time setup, run only if the packages are not installed yet:
# install.packages(c("tidyverse","lubridate","readr","stringr","scales"))

library(tidyverse)
library(lubridate)
library(readr)
library(stringr)
library(scales)
library(conflicted)
# The compressed storm data file is expected in the working directory and is
# read directly, without unpacking it first.
f <- "repdata_data_StormData.csv.bz2"

raw <- read_csv(
  file = f,
  locale = locale(encoding = "latin1"), # tolerate extended characters
  show_col_types = FALSE
)

# Restrict the raw table to the eight variables used in the analysis, then
# convert the begin date to a date-time and the casualty counts to numeric.
dat <- raw %>%
  select(
    BGN_DATE, EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  ) %>%
  mutate(
    BGN_DATE = mdy_hms(BGN_DATE, quiet = TRUE),
    EVTYPE = as.character(EVTYPE),
    across(c(FATALITIES, INJURIES), as.numeric)
  )

# Quick sanity check of the parsed date range.
summary(dat$BGN_DATE)
##                       Min.                    1st Qu. 
## "1950-01-03 00:00:00.0000" "1995-04-20 00:00:00.0000" 
##                     Median                       Mean 
## "2002-03-18 00:00:00.0000" "1998-12-27 23:37:48.9970" 
##                    3rd Qu.                       Max. 
## "2007-07-28 00:00:00.0000" "2011-11-30 00:00:00.0000"

Event type standardization (justification)

EVTYPE contains many near-duplicates (e.g., “TSTM WIND”, “THUNDERSTORM WIND”, extra spaces). We apply lightweight normalization that preserves meaning while merging obvious variants:

-uppercase, trim spaces, collapse multiple spaces

-alias common synonyms (e.g., TSTM → THUNDERSTORM)

-remove punctuation that doesn’t convey type

-map a few high-frequency patterns to canonical names

This reduces fragmentation and yields more interpretable totals without over-engineering a full taxonomy.

#' Collapse raw EVTYPE labels into a smaller set of canonical event types.
#'
#' Labels are upper-cased, "/" and "-" separators become spaces, whitespace
#' is squished, a few spelling variants are aliased, and labels starting with
#' a known prefix are mapped to one canonical name. Labels that match no
#' prefix are returned in their cleaned form.
#'
#' @param x Character vector of raw event type labels.
#' @return Character vector the same length as `x`; `NA` stays `NA`.
canon_evtype <- function(x) {
  s <- toupper(x)
  s <- stringr::str_replace_all(s, "[/-]", " ") # normalize separators
  # Squish only after the separators became spaces, so labels such as
  # "HURRICANE / TYPHOON" are single-spaced before the aliases are matched.
  s <- stringr::str_squish(s)

  # Fixed (non-regex) aliases to merge obvious variants.
  s <- stringr::str_replace_all(s, stringr::fixed("TSTM"), "THUNDERSTORM")
  s <- stringr::str_replace_all(
    s, stringr::fixed("THUNDERSTORM WINDS"), "THUNDERSTORM WIND"
  )
  s <- stringr::str_replace_all(
    s, stringr::fixed("HURRICANE TYPHOON"), "HURRICANE"
  )
  s <- stringr::str_replace_all(s, stringr::fixed("RIP CURRENTS"), "RIP CURRENT")
  s <- stringr::str_replace_all(
    s, stringr::fixed("EXTREME COLD WIND CHILL"), "EXTREME COLD"
  )

  # "WILD/FOREST FIRE" reads "WILD FOREST FIRE" after separator
  # normalization, so it needs its own prefix to be merged into WILDFIRE.
  is_wildfire <- startsWith(s, "WILDFIRE") |
    startsWith(s, "WILD FIRE") |
    startsWith(s, "WILD FOREST FIRE") |
    startsWith(s, "FOREST FIRE")

  # First matching prefix wins; FLASH FLOOD must be tested before FLOOD.
  dplyr::case_when(
    startsWith(s, "TORNADO") ~ "TORNADO",
    startsWith(s, "HURRICANE") | startsWith(s, "TYPHOON") ~ "HURRICANE",
    startsWith(s, "THUNDERSTORM") ~ "THUNDERSTORM WIND",
    startsWith(s, "FLASH FLOOD") ~ "FLASH FLOOD",
    startsWith(s, "FLOOD") ~ "FLOOD",
    is_wildfire ~ "WILDFIRE",
    startsWith(s, "HEAT") | startsWith(s, "EXCESSIVE HEAT") ~ "HEAT",
    startsWith(s, "COLD") | startsWith(s, "EXTREME COLD") |
      startsWith(s, "RECORD COLD") ~ "COLD",
    startsWith(s, "WINTER STORM") ~ "WINTER STORM",
    startsWith(s, "HIGH WIND") | startsWith(s, "STRONG WIND") ~ "HIGH WIND",
    startsWith(s, "HAIL") ~ "HAIL",
    startsWith(s, "DROUGHT") ~ "DROUGHT",
    startsWith(s, "ICE STORM") ~ "ICE STORM",
    startsWith(s, "STORM SURGE") ~ "STORM SURGE/TIDE",
    TRUE ~ s
  )
}
# Add the canonical event type next to the raw label.
dat <- mutate(dat, EVTYPE_C = canon_evtype(EVTYPE))

Damage exponents (justification)

Property/crop damage units use exponents in PROPDMGEXP / CROPDMGEXP. Following common practice, we interpret H=10², K=10³, M=10⁶, B=10⁹; digits 0–9 as 10^digit; blanks and other symbols as 1.

#' Translate damage exponent codes into numeric multipliers.
#'
#' Letters H/K/M/B (case-insensitive) map to 1e2/1e3/1e6/1e9, a single digit
#' d maps to 10^d, and everything else (blank, NA, other symbols) maps to 1.
#'
#' @param x Vector of exponent codes; coerced with as.character().
#' @return Numeric vector of multipliers, the same length as `x`.
exp_to_mult <- function(x) {
  code <- toupper(trimws(as.character(x)))

  # Name lookup yields NA for every code that is not one of the letters.
  letter_mult <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  mult <- unname(letter_mult[code])

  digit_pos <- grepl("^[0-9]$", code)
  mult[digit_pos] <- 10^as.integer(code[digit_pos])

  # Whatever is still unresolved falls back to a multiplier of 1.
  mult[is.na(mult)] <- 1
  mult
}

# Convert the damage figures to nominal USD and add per-event totals.
# Both row totals treat a missing component as zero, so ECON_DAMAGE is no
# longer NA when only one of property/crop damage is missing; this matches
# how HUMAN_IMPACT is built.
dat <- dat %>%
  mutate(
    PROP_MULT = exp_to_mult(PROPDMGEXP),
    CROP_MULT = exp_to_mult(CROPDMGEXP),
    PROP_DMG_USD = PROPDMG * PROP_MULT,
    CROP_DMG_USD = CROPDMG * CROP_MULT,
    ECON_DAMAGE = coalesce(PROP_DMG_USD, 0) + coalesce(CROP_DMG_USD, 0),
    HUMAN_IMPACT = coalesce(FATALITIES, 0) + coalesce(INJURIES, 0)
  )

Results Which event types are most harmful to population health?

# Total fatalities and injuries per canonical event type; keep the ten types
# with the largest combined count.
health_top <- dat %>%
  group_by(EVTYPE_C) %>%
  summarise(
    Fatalities = sum(FATALITIES, na.rm = TRUE),
    Injuries = sum(INJURIES, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(HumanImpact = Fatalities + Injuries) %>%
  arrange(desc(HumanImpact)) %>%
  slice_head(n = 10)

health_top
## # A tibble: 10 × 4
##    EVTYPE_C          Fatalities Injuries HumanImpact
##    <chr>                  <dbl>    <dbl>       <dbl>
##  1 TORNADO                 5658    91364       97022
##  2 HEAT                    3021     9019       12040
##  3 THUNDERSTORM WIND        710     9508       10218
##  4 FLOOD                    495     6806        7301
##  5 LIGHTNING                816     5230        6046
##  6 FLASH FLOOD             1018     1785        2803
##  7 HIGH WIND                404     1772        2176
##  8 ICE STORM                 89     1977        2066
##  9 WINTER STORM             217     1353        1570
## 10 HURRICANE                135     1333        1468
# Horizontal bar chart, bars ordered by combined fatalities and injuries.
ggplot(
  data = health_top,
  mapping = aes(x = reorder(EVTYPE_C, HumanImpact), y = HumanImpact)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Human Impact by Event Type",
    x = NULL,
    y = "Fatalities + Injuries"
  )

#' Format a vector as an English list: "a", "a and b", "a, b and c".
#'
#' @param x Vector of items; coerced with as.character().
#' @return A single string; "" when `x` is empty.
fmt_list <- function(x) {
  x <- as.character(x)
  n <- length(x)
  # Guard the empty case: the general branch would otherwise paste two empty
  # pieces around "and" and return " and ".
  if (n == 0) return("")
  if (n == 1) return(x)
  if (n == 2) return(paste(x, collapse = " and "))
  paste(paste(x[-n], collapse = ", "), "and", x[n])
}

Interpretation (Health). The top three event types are TORNADO, HEAT and THUNDERSTORM WIND.

Which event types have the greatest economic consequences?

# Total nominal property and crop damage per canonical event type; keep the
# ten types with the largest combined damage.
econ_top <- dat %>%
  group_by(EVTYPE_C) %>%
  summarise(
    Property = sum(PROP_DMG_USD, na.rm = TRUE),
    Crops = sum(CROP_DMG_USD, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Economic = Property + Crops) %>%
  arrange(desc(Economic)) %>%
  slice_head(n = 10)

econ_top
## # A tibble: 10 × 4
##    EVTYPE_C               Property       Crops      Economic
##    <chr>                     <dbl>       <dbl>         <dbl>
##  1 FLOOD             144958136816   5878707950 150836844766 
##  2 HURRICANE          85356410010   5516117800  90872527810 
##  3 TORNADO            58552151876.   417461470  58969613346.
##  4 STORM SURGE/TIDE   47964724000       855000  47965579000 
##  5 HAIL               15977470013.  3026094623  19003564636.
##  6 FLASH FLOOD        17414731089.  1437163150  18851894239.
##  7 DROUGHT             1046106000  13972571780  15018677780 
##  8 THUNDERSTORM WIND   9970811300.  1225458988  11196270288.
##  9 RIVER FLOOD         5118945500   5029459000  10148404500 
## 10 ICE STORM           3944927860   5022113500   8967041360
# Horizontal bar chart, bars ordered by total damage; axis labelled in dollars.
ggplot(
  data = econ_top,
  mapping = aes(x = reorder(EVTYPE_C, Economic), y = Economic)
) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_dollar()) +
  labs(
    title = "Economic Damage by Event Type",
    x = NULL,
    y = "Damage (USD, nominal)"
  )

#' Join items into an English list: "a", "a and b", "a, b and c".
#'
#' @param x Vector of items; coerced with as.character().
#' @return A single string; "" for empty input.
fmt_list <- function(x) {
  items <- as.character(x)
  n <- length(items)
  if (n == 0) {
    ""
  } else if (n == 1) {
    items
  } else if (n == 2) {
    paste(items, collapse = " and ")
  } else {
    # Comma-separate all but the last item, then attach the last with "and".
    leading <- paste(items[seq_len(n - 1)], collapse = ", ")
    paste(leading, "and", items[n])
  }
}

Interpretation (Economic). Total nominal damages are led by FLOOD, followed by HURRICANE and TORNADO.

Reproducibility notes

-Source: original NOAA Storm Data CSV (1950–2011) loaded directly from repdata_data_StormData.csv.bz2.

-All transformations are shown above; heavy steps use cache=TRUE where appropriate.

-Figures are limited to two and include descriptive captions.

-Session info:

sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=German_Switzerland.utf8  LC_CTYPE=German_Switzerland.utf8   
## [3] LC_MONETARY=German_Switzerland.utf8 LC_NUMERIC=C                       
## [5] LC_TIME=German_Switzerland.utf8    
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] conflicted_1.2.0 scales_1.3.0     lubridate_1.9.3  forcats_1.0.0   
##  [5] stringr_1.5.1    dplyr_1.1.4      purrr_1.0.2      readr_2.1.5     
##  [9] tidyr_1.3.1      tibble_3.2.1     ggplot2_3.5.1    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.9        utf8_1.2.4        generics_0.1.3    stringi_1.8.4    
##  [5] hms_1.1.3         digest_0.6.37     magrittr_2.0.3    evaluate_1.0.0   
##  [9] grid_4.4.1        timechange_0.3.0  fastmap_1.2.0     jsonlite_1.8.8   
## [13] fansi_1.0.6       jquerylib_0.1.4   cli_3.6.3         rlang_1.1.4      
## [17] crayon_1.5.3      bit64_4.5.2       munsell_0.5.1     withr_3.0.1      
## [21] cachem_1.1.0      yaml_2.3.10       tools_4.4.1       parallel_4.4.1   
## [25] tzdb_0.4.0        memoise_2.0.1     colorspace_2.1-1  vctrs_0.6.5      
## [29] R6_2.5.1          lifecycle_1.0.4   bit_4.5.0         vroom_1.6.5      
## [33] pkgconfig_2.0.3   pillar_1.9.0      bslib_0.8.0       gtable_0.3.5     
## [37] glue_1.7.0        highr_0.11        xfun_0.47         tidyselect_1.2.1 
## [41] rstudioapi_0.16.0 knitr_1.48        farver_2.1.2      htmltools_0.5.8.1
## [45] rmarkdown_2.28    labeling_0.4.3    compiler_4.4.1