Synopsis

Severe weather events can cause loss of life, injuries, and large economic losses. Using the NOAA Storm Database (1950–November 2011), I identify which event types are most harmful to population health and which have the greatest economic consequences. Early years in the database contain fewer recorded events, and data completeness improves in later decades; the results report totals across the full period and should be read with that caveat in mind (source: NOAA Storm Events database documentation). Event names were standardized toward the National Weather Service’s permitted list (48 event types) to reduce miscoding and duplication (source: NWS Instruction 10-1605). Health impact was measured as fatalities + injuries; economic impact was measured as the sum of property and crop damage after converting magnitude exponents (K/M/B). The analysis shows that tornadoes dominate total injuries and fatalities, while floods and hurricanes/typhoons account for the largest total economic losses. Plots of the top 10 event types for each outcome are provided. All code is shown for full reproducibility.

Data Processing

This analysis starts from the original compressed CSV (repdata_data_StormData.csv.bz2). No preprocessing was done outside this document. The Storm Events database is compiled by NWS and archived by NCDC/NOAA; updates typically lag event months by ~90–120 days, and some inputs come from varied sources with possible uncertainty (source: NOAA Storm Events database documentation). Event type standardization follows the permitted table of event names in NWS Instruction 10-1605.

suppressPackageStartupMessages({
  library(dplyr)
  library(readr)
  library(stringr)
  library(ggplot2)
})
# Fetch the raw archive (first run only) and load it ----
# Remote location of the compressed CSV and the local name it is cached under.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata/data/StormData.csv.bz2"
data_file <- "repdata_data_StormData.csv.bz2"

# Skip the network round-trip when a local copy is already present.
if (!file.exists(data_file)) {
  download.file(
    url = data_url,
    destfile = data_file,
    mode = "wb",   # binary mode so the bz2 archive is written unmodified
    quiet = TRUE
  )
}

# Parse straight from the compressed file through a bzfile connection with
# base read.csv; text columns stay character vectors instead of factors.
df <- read.csv(file = bzfile(data_file), stringsAsFactors = FALSE)

# Narrow the table to the date, event type, casualty and damage columns
# that the rest of the analysis reads.
df <- df[c(
  "BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

Event Type Normalization

NOAA/NWS defines a permitted list of event names; historical data contain many variants (e.g., TSTM WIND, THUNDERSTORM WINDS, etc.). Below, I map common variants to the closest permitted type to reduce fragmentation of counts (source: NWS Instruction 10-1605).

# Helper: normalize exponents to multipliers
#
# Converts a vector of damage-exponent codes into numeric multipliers.
#   x       Vector of exponent codes (character, factor, or anything
#           as.character() accepts). Codes are trimmed and upper-cased first,
#           so " k " and "K" are equivalent.
#   returns Unnamed numeric vector, same length as `x`:
#             "K" -> 1e3, "M" -> 1e6, "B" -> 1e9,
#             "0".."8" -> 10^digit,
#             anything else (blank, "?", "+", "-", NA, ...) -> 1.
#
# Implemented as a named-vector lookup rather than dplyr::case_when():
# case_when() evaluates every right-hand side on the full input, so the old
# `10^as.numeric(x)` branch raised "NAs introduced by coercion" warnings for
# every non-digit code even though those results were discarded.
exp_to_mult <- function(x) {
  code <- toupper(trimws(as.character(x)))

  multipliers <- c(
    K = 1e3,
    M = 1e6,
    B = 1e9,
    stats::setNames(10^(0:8), as.character(0:8))
  )

  # Unknown, blank and NA codes have no entry in the table and come back as NA;
  # they fall back to a neutral multiplier of 1.
  out <- unname(multipliers[code])
  out[is.na(out)] <- 1
  out
}

# Helper: normalize event types toward NWS 48 categories (not exhaustive)
#
# Maps raw EVTYPE labels onto permitted event names.
#   x       Character vector of raw event-type labels.
#   returns Character vector, same length as `x`. Each element is the label of
#           the FIRST rule below whose pattern matches the cleaned input; if no
#           rule matches, the cleaned input itself is returned.
#
# Cleaning: upper-case, replace every character outside A-Z, 0-9, space,
# "/", "(", ")", "&", "-" with a space, then collapse repeated whitespace.
#
# Rule order is significant because the first match wins. Specific rules must
# precede the generic ones they overlap with:
#   * COASTAL FLOOD / LAKESHORE FLOOD come before the generic FLOOD rule;
#     listed after it they could never match.
#   * The "extreme" cold rule requires the word EXTREME, so a plain
#     "COLD/WIND CHILL" stays in its own category and is not promoted to
#     "EXTREME COLD/WIND CHILL".
normalize_evtype <- function(x) {
  y <- toupper(x)
  y <- stringr::str_replace_all(y, "[^A-Z0-9 /()&-]", " ")
  y <- stringr::str_squish(y)

  # Local shorthand: does the cleaned label match `pattern`?
  has <- function(pattern) stringr::str_detect(y, pattern)

  dplyr::case_when(
    has("HURRICANE|TYPHOON") ~ "HURRICANE (TYPHOON)",
    has("TSTM|THUNDERSTORM") & has("MARINE") ~ "MARINE THUNDERSTORM WIND",
    has("THUNDERSTORM|TSTM") ~ "THUNDERSTORM WIND",
    has("TORNADO|TORNDAO|LANDSPOUT") ~ "TORNADO",
    has("WATERSPOUT") ~ "WATERSPOUT",
    has("FUNNEL") ~ "FUNNEL CLOUD",
    # Flood family: most specific first, generic FLOOD last.
    has("FLASH FLOOD") ~ "FLASH FLOOD",
    has("COASTAL FLOOD") ~ "COASTAL FLOOD",
    has("LAKESHORE FLOOD") ~ "LAKESHORE FLOOD",
    # "FLOOD" already covers the RIVER FLOOD / URBAN FLOOD variants.
    has("FLOOD|STREAM") ~ "FLOOD",
    has("STORM SURGE|STORM TIDE|SURGE/TIDE") ~ "STORM SURGE/TIDE",
    has("TSUNAMI") ~ "TSUNAMI",
    has("TROPICAL STORM") ~ "TROPICAL STORM",
    has("TROPICAL DEPRESSION") ~ "TROPICAL DEPRESSION",
    has("WINTER STORM") ~ "WINTER STORM",
    has("WINTER WEATHER|WINTRY") ~ "WINTER WEATHER",
    has("BLIZZARD") ~ "BLIZZARD",
    has("ICE STORM|GLAZE") ~ "ICE STORM",
    has("HEAVY SNOW") ~ "HEAVY SNOW",
    has("LAKE EFFECT SNOW|LAKE-EFFECT") ~ "LAKE-EFFECT SNOW",
    has("SLEET") ~ "SLEET",
    has("FREEZING FOG") ~ "FREEZING FOG",
    has("DENSE FOG") & !has("FREEZING") ~ "DENSE FOG",
    # Only explicitly "extreme" cold / wind chill goes to the extreme category.
    has("EXTREME (COLD|WIND ?CHILL)") ~ "EXTREME COLD/WIND CHILL",
    has("COLD|CHILL") ~ "COLD/WIND CHILL",
    has("EXCESSIVE HEAT|HEAT WAVE") ~ "EXCESSIVE HEAT",
    has("HEAT") ~ "HEAT",
    has("DROUGHT") ~ "DROUGHT",
    has("HAIL") & has("MARINE") ~ "MARINE HAIL",
    has("HAIL") ~ "HAIL",
    has("LIGHTNING") ~ "LIGHTNING",
    has("MARINE HIGH WIND") ~ "MARINE HIGH WIND",
    has("MARINE STRONG WIND") ~ "MARINE STRONG WIND",
    has("HIGH WIND") ~ "HIGH WIND",
    has("STRONG WIND|GUSTY WIND") ~ "STRONG WIND",
    has("HEAVY RAIN|TORRENTIAL|RAINSTORM") ~ "HEAVY RAIN",
    has("RIP CURRENT") ~ "RIP CURRENT",
    has("HIGH SURF|HEAVY SURF|ROUGH SURF|HAZARDOUS SURF") ~ "HIGH SURF",
    has("SEICHE") ~ "SEICHE",
    has("DUST DEVIL") ~ "DUST DEVIL",
    has("DUST STORM|BLOWING DUST|SAHARAN DUST") ~ "DUST STORM",
    has("AVALANCHE") ~ "AVALANCHE",
    has("DEBRIS FLOW|MUDSLIDE|MUD SLIDE|LANDSLIDE") ~ "DEBRIS FLOW",
    has("VOLCANIC ASH") ~ "VOLCANIC ASH",
    has("WILDFIRE|WILD FIRE|GRASS FIRE|FOREST FIRE") ~ "WILDFIRE",
    has("ASTRONOMICAL LOW TIDE") ~ "ASTRONOMICAL LOW TIDE",
    # No rule matched: keep the cleaned label (e.g. FROST/FREEZE, DENSE SMOKE).
    TRUE ~ y
  )
}

# Derive the analysis columns, appended to `df` one at a time in the order the
# later steps expect.

# Standardized event label.
df$EVTYPE_NORM <- normalize_evtype(df$EVTYPE)

# Numeric multipliers decoded from the property / crop exponent codes.
df$prop_mult <- exp_to_mult(df$PROPDMGEXP)
df$crop_mult <- exp_to_mult(df$CROPDMGEXP)

# Damage amount = reported figure scaled by its multiplier.
df$PROP_DMG_USD <- as.numeric(df$PROPDMG) * df$prop_mult
df$CROP_DMG_USD <- as.numeric(df$CROPDMG) * df$crop_mult

# Combined economic damage, and combined health harm (fatalities + injuries).
df$ECON_DMG_USD <- df$PROP_DMG_USD + df$CROP_DMG_USD
df$HEALTH_HARM <- as.numeric(df$FATALITIES) + as.numeric(df$INJURIES)

To reduce noise, I keep only rows whose normalized type is one of the official 48 event types.

# The 48 event names accepted downstream. Rows whose normalized label is not
# an exact member of this set are dropped.
permitted <- c(
  "ASTRONOMICAL LOW TIDE",
  "AVALANCHE",
  "BLIZZARD",
  "COASTAL FLOOD",
  "COLD/WIND CHILL",
  "DEBRIS FLOW",
  "DENSE FOG",
  "DENSE SMOKE",
  "DROUGHT",
  "DUST DEVIL",
  "DUST STORM",
  "EXCESSIVE HEAT",
  "EXTREME COLD/WIND CHILL",
  "FLASH FLOOD",
  "FLOOD",
  "FREEZING FOG",
  "FROST/FREEZE",
  "FUNNEL CLOUD",
  "HAIL",
  "HEAT",
  "HEAVY RAIN",
  "HEAVY SNOW",
  "HIGH SURF",
  "HIGH WIND",
  "HURRICANE (TYPHOON)",
  "ICE STORM",
  "LAKESHORE FLOOD",
  "LAKE-EFFECT SNOW",
  "LIGHTNING",
  "MARINE HAIL",
  "MARINE HIGH WIND",
  "MARINE STRONG WIND",
  "MARINE THUNDERSTORM WIND",
  "RIP CURRENT",
  "SEICHE",
  "SLEET",
  "STORM SURGE/TIDE",
  "STRONG WIND",
  "THUNDERSTORM WIND",
  "TORNADO",
  "TROPICAL DEPRESSION",
  "TROPICAL STORM",
  "TSUNAMI",
  "VOLCANIC ASH",
  "WATERSPOUT",
  "WILDFIRE",
  "WINTER STORM",
  "WINTER WEATHER"
)

# Keep only the rows that were mapped onto a permitted event name.
df_clean <- dplyr::filter(df, EVTYPE_NORM %in% permitted)

Results

Which event types are most harmful to population health?

# Per event type: combined health harm plus its fatality and injury
# components; ranked by the combined figure and cut to the ten largest.
health_top <- df_clean |>
  dplyr::group_by(EVTYPE_NORM) |>
  dplyr::summarise(
    total_health_harm = sum(HEALTH_HARM, na.rm = TRUE),
    fatalities = sum(FATALITIES, na.rm = TRUE),
    injuries = sum(INJURIES, na.rm = TRUE)
  ) |>
  dplyr::arrange(dplyr::desc(total_health_harm)) |>
  dplyr::slice_head(n = 10)

# Show the ranking table.
health_top
## # A tibble: 10 × 4
##    EVTYPE_NORM       total_health_harm fatalities injuries
##    <chr>                         <dbl>      <dbl>    <dbl>
##  1 TORNADO                       97043       5636    91407
##  2 THUNDERSTORM WIND             10245        735     9510
##  3 EXCESSIVE HEAT                 9022       2103     6919
##  4 FLOOD                          7399        518     6881
##  5 LIGHTNING                      6048        817     5231
##  6 HEAT                           3340       1035     2305
##  7 FLASH FLOOD                    2837       1035     1802
##  8 ICE STORM                      2302         96     2206
##  9 HIGH WIND                      1795        293     1502
## 10 WILDFIRE                       1696         90     1606
# Horizontal bar chart of the health ranking. Event types are ordered by
# their total so that, after the axis flip, the largest bar sits on top.
p1 <- ggplot(
  data = health_top,
  mapping = aes(
    x = reorder(EVTYPE_NORM, total_health_harm),
    y = total_health_harm
  )
) +
  geom_col() +
  coord_flip() +
  theme_minimal(base_size = 12) +
  labs(
    title = "Top 10 Event Types by Total Injuries + Fatalities (1950–2011)",
    x = "Event Type",
    y = "Total Injuries + Fatalities",
    caption = "Source: NOAA Storm Database."
  )

print(p1)
Figure 1. Top 10 event types by total injuries + fatalities (1950–2011). Source: NOAA Storm Database.

Which event types have the greatest economic consequences?

# Per event type: total economic damage plus its property and crop
# components; ranked by the total and cut to the ten largest.
econ_top <- df_clean |>
  dplyr::group_by(EVTYPE_NORM) |>
  dplyr::summarise(
    total_econ_usd = sum(ECON_DMG_USD, na.rm = TRUE),
    prop_usd = sum(PROP_DMG_USD, na.rm = TRUE),
    crop_usd = sum(CROP_DMG_USD, na.rm = TRUE)
  ) |>
  dplyr::arrange(dplyr::desc(total_econ_usd)) |>
  dplyr::slice_head(n = 10)

# Display copy: every *_usd column gains a companion *_usd_billions column
# holding the same amount divided by 1e9.
econ_top_print <- dplyr::mutate(
  econ_top,
  dplyr::across(
    dplyr::ends_with("_usd"),
    \(usd) usd / 1e9,
    .names = "{.col}_billions"
  )
)

econ_top_print
## # A tibble: 10 × 7
##    EVTYPE_NORM         total_econ_usd   prop_usd crop_usd total_econ_usd_billi…¹
##    <chr>                        <dbl>      <dbl>    <dbl>                  <dbl>
##  1 FLOOD                161538088439     1.51e11  1.09e10                 162.  
##  2 HURRICANE (TYPHOON)   90872527810     8.54e10  5.52e 9                  90.9 
##  3 TORNADO               57418286946.    5.70e10  4.15e 8                  57.4 
##  4 STORM SURGE/TIDE      47965579000     4.80e10  8.55e 5                  48.0 
##  5 FLASH FLOOD           19120489246.    1.76e10  1.53e 9                  19.1 
##  6 HAIL                  19024447641.    1.60e10  3.05e 9                  19.0 
##  7 DROUGHT               15018672000     1.05e 9  1.40e10                  15.0 
##  8 THUNDERSTORM WIND     14053053110.    1.28e10  1.27e 9                  14.1 
##  9 ICE STORM              8969446660     3.95e 9  5.02e 9                   8.97
## 10 WILDFIRE               8899855130     8.50e 9  4.03e 8                   8.90
## # ℹ abbreviated name: ¹​total_econ_usd_billions
## # ℹ 2 more variables: prop_usd_billions <dbl>, crop_usd_billions <dbl>
# Horizontal bar chart of the economic ranking, with totals rescaled to
# billions on the value axis. Event types are ordered by their total so that,
# after the axis flip, the largest bar sits on top.
p2 <- ggplot(
  data = econ_top,
  mapping = aes(
    x = reorder(EVTYPE_NORM, total_econ_usd),
    y = total_econ_usd/1e9
  )
) +
  geom_col() +
  coord_flip() +
  theme_minimal(base_size = 12) +
  labs(
    title = "Top 10 Event Types by Total Economic Damage (1950–2011)",
    x = "Event Type",
    y = "Total Damage (USD Billions)",
    caption = "Property + crop damage after K/M/B exponent conversion."
  )

print(p2)
Figure 2. Top 10 event types by total economic damage (USD billions), 1950–2011. Property + crop damage after K/M/B exponent conversion.

Reproducibility

# Record the R version, platform, locale and attached/loaded package versions
# used for this run.
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=German_Germany.utf8  LC_CTYPE=German_Germany.utf8   
## [3] LC_MONETARY=German_Germany.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=German_Germany.utf8    
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.5.2 stringr_1.5.2 readr_2.1.5   dplyr_1.1.4  
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     tidyselect_1.2.1  
##  [5] jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
##  [9] R6_2.6.1           labeling_0.4.3     generics_0.1.4     knitr_1.50        
## [13] tibble_3.3.0       bslib_0.9.0        pillar_1.11.0      RColorBrewer_1.1-3
## [17] tzdb_0.5.0         rlang_1.1.6        utf8_1.2.6         cachem_1.1.0      
## [21] stringi_1.8.7      xfun_0.52          sass_0.4.10        cli_3.6.5         
## [25] withr_3.0.2        magrittr_2.0.3     digest_0.6.37      grid_4.5.1        
## [29] rstudioapi_0.17.1  hms_1.1.3          lifecycle_1.0.4    vctrs_0.6.5       
## [33] evaluate_1.0.4     glue_1.8.0         farver_2.1.2       rmarkdown_2.29    
## [37] tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1