Synopsis

This report analyzes the U.S. NOAA Storm Database to identify which event types are most harmful to population health and which have the greatest economic consequences. The analysis starts from the original compressed raw data file (StormData.csv.bz2) and performs all preprocessing in this document. Population health impact is defined as the sum of fatalities and injuries. Economic impact is defined as the sum of property damage and crop damage after converting exponent codes to numeric multipliers. Event-level impacts are aggregated by EVTYPE across the United States. The results are presented using ranked summaries and plots of the top event types. In this dataset, tornadoes dominate health impacts, accounting for far more combined fatalities and injuries than any other event type. Economic impacts are led by floods, hurricanes/typhoons, tornadoes, and storm surge, although the exact ranking depends on how EVTYPE labels are normalized. Because reporting quality varies over time (especially in earlier years), findings should be interpreted as patterns in the recorded data rather than as absolute truth.

Data Processing

Load required packages

# Packages this report depends on. Any that cannot be loaded are installed
# from CRAN before the library() calls run.
required_packages <- c("dplyr", "ggplot2", "readr", "stringr", "tibble")
# vapply() pins the result to one logical per package; sapply() would let the
# return type vary with its input.
missing <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing) > 0) {
  install.packages(missing, repos = "https://cloud.r-project.org")
}

library(dplyr)
library(ggplot2)
library(readr)
library(stringr)
library(tibble)

Download and read raw data

# Remote location of the raw archive and the local path it is cached under.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_file <- "StormData.csv.bz2"

# Download only when no local copy exists; "wb" writes the file in binary mode.
if (isFALSE(file.exists(data_file))) {
  utils::download.file(url = data_url, destfile = data_file, mode = "wb")
}

# The .bz2 path is handed to read.csv() directly; strings stay as character.
storm <- utils::read.csv(file = data_file, stringsAsFactors = FALSE)
dim(storm)
## [1] 902297     37

Transform damage exponents and derive analysis fields

# Convert damage exponent codes into numeric multipliers.
#
# exp_code: vector of exponent codes (character, factor, or anything
#   as.character() accepts). Codes are matched after trimming surrounding
#   whitespace and upper-casing.
#
# Returns an unnamed numeric vector the same length as exp_code:
#   "0".."8" -> 10^digit, "H" -> 1e2, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9,
#   anything else (including "", NA and symbols such as "+", "-", "?") -> 1.
exp_to_mult <- function(exp_code) {
  exp_code <- toupper(trimws(as.character(exp_code)))

  # A lookup table avoids calling as.numeric() on non-digit codes, which
  # would emit "NAs introduced by coercion" warnings for every "K", "M", ...
  multipliers <- c(
    stats::setNames(10^(0:8), as.character(0:8)),
    H = 1e2,
    K = 1e3,
    M = 1e6,
    B = 1e9
  )

  mult <- unname(multipliers[exp_code])
  # Unrecognised codes (and NA) index to NA; treat them as "no scaling".
  mult[is.na(mult)] <- 1
  mult
}

# Derive the analysis columns in three passes: normalise the event label and
# decode the exponent codes, scale the damage amounts by their multipliers,
# then total the two impact measures.
storm2 <- storm %>%
  mutate(
    EVTYPE_CLEAN = EVTYPE %>% toupper() %>% str_squish(),
    PROP_MULT = exp_to_mult(PROPDMGEXP),
    CROP_MULT = exp_to_mult(CROPDMGEXP)
  ) %>%
  mutate(
    PROP_DAMAGE_USD = PROPDMG * PROP_MULT,
    CROP_DAMAGE_USD = CROPDMG * CROP_MULT
  ) %>%
  mutate(
    HEALTH_IMPACT = FATALITIES + INJURIES,
    ECON_IMPACT = PROP_DAMAGE_USD + CROP_DAMAGE_USD
  )

# Distribution of the two per-record impact measures.
summary(storm2[["HEALTH_IMPACT"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0000    0.0000    0.0000    0.1725    0.0000 1742.0000
summary(storm2[["ECON_IMPACT"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.00e+00 0.00e+00 0.00e+00 5.29e+05 1.00e+03 1.15e+11

Aggregate by event type

# Total the health measures per cleaned event type, largest combined impact
# first. Missing values are ignored when summing.
health_by_event <- storm2 %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(
    across(
      c(FATALITIES, INJURIES, HEALTH_IMPACT),
      \(x) sum(x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  arrange(desc(HEALTH_IMPACT))

# Same aggregation for the damage measures, ranked by combined damage.
econ_by_event <- storm2 %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(
    across(
      c(PROP_DAMAGE_USD, CROP_DAMAGE_USD, ECON_IMPACT),
      \(x) sum(x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  arrange(desc(ECON_IMPACT))

head(health_by_event, n = 10)
## # A tibble: 10 × 4
##    EVTYPE_CLEAN      FATALITIES INJURIES HEALTH_IMPACT
##    <chr>                  <dbl>    <dbl>         <dbl>
##  1 TORNADO                 5633    91346         96979
##  2 EXCESSIVE HEAT          1903     6525          8428
##  3 TSTM WIND                504     6957          7461
##  4 FLOOD                    470     6789          7259
##  5 LIGHTNING                816     5230          6046
##  6 HEAT                     937     2100          3037
##  7 FLASH FLOOD              978     1777          2755
##  8 ICE STORM                 89     1975          2064
##  9 THUNDERSTORM WIND        133     1488          1621
## 10 WINTER STORM             206     1321          1527
# First ten rows of the damage ranking.
head(econ_by_event, n = 10)
## # A tibble: 10 × 4
##    EVTYPE_CLEAN      PROP_DAMAGE_USD CROP_DAMAGE_USD   ECON_IMPACT
##    <chr>                       <dbl>           <dbl>         <dbl>
##  1 FLOOD               144657709807       5661968450 150319678257 
##  2 HURRICANE/TYPHOON    69305840000       2607872800  71913712800 
##  3 TORNADO              56947380676.       414953270  57362333946.
##  4 STORM SURGE          43323536000             5000  43323541000 
##  5 HAIL                 15735267513.      3025954473  18761221986.
##  6 FLASH FLOOD          16822723978.      1421317100  18244041078.
##  7 DROUGHT               1046106000      13972566000  15018672000 
##  8 HURRICANE            11868319010       2741910000  14610229010 
##  9 RIVER FLOOD           5118945500       5029459000  10148404500 
## 10 ICE STORM             3944927860       5022113500   8967041360

Results

Across the U.S., which events are most harmful to population health?

# health_by_event is already sorted by descending HEALTH_IMPACT, so the first
# ten rows are the ten most harmful event types.
health_top10 <- head(health_by_event, n = 10)

health_top10
## # A tibble: 10 × 4
##    EVTYPE_CLEAN      FATALITIES INJURIES HEALTH_IMPACT
##    <chr>                  <dbl>    <dbl>         <dbl>
##  1 TORNADO                 5633    91346         96979
##  2 EXCESSIVE HEAT          1903     6525          8428
##  3 TSTM WIND                504     6957          7461
##  4 FLOOD                    470     6789          7259
##  5 LIGHTNING                816     5230          6046
##  6 HEAT                     937     2100          3037
##  7 FLASH FLOOD              978     1777          2755
##  8 ICE STORM                 89     1975          2064
##  9 THUNDERSTORM WIND        133     1488          1621
## 10 WINTER STORM             206     1321          1527
# Horizontal bar chart of the top ten; reorder() orders the bars by total
# health impact and coord_flip() puts the event labels on the vertical axis.
ggplot(
  health_top10,
  aes(x = reorder(EVTYPE_CLEAN, HEALTH_IMPACT), y = HEALTH_IMPACT)
) +
  geom_col(fill = "#D55E00") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Health Impact",
    title = "Top 10 Event Types by Population Health Impact",
    subtitle = "Health impact = Fatalities + Injuries"
  ) +
  theme_minimal(base_size = 12)

The highest health impacts are concentrated in a small set of event types. TORNADO ranks first by a wide margin, with 96,979 combined fatalities and injuries, more than eleven times the total for the next category, EXCESSIVE HEAT (8,428).

Across the U.S., which events have the greatest economic consequences?

# Ten costliest event types, with the total also expressed in billions of
# dollars for display.
econ_top10 <- econ_by_event %>%
  head(n = 10) %>%
  mutate(ECON_IMPACT_BILLION = ECON_IMPACT / 1e9)

# Print the table with its columns in a fixed, explicit order.
econ_top10 %>%
  select(
    all_of(c(
      "EVTYPE_CLEAN",
      "PROP_DAMAGE_USD",
      "CROP_DAMAGE_USD",
      "ECON_IMPACT",
      "ECON_IMPACT_BILLION"
    ))
  )
## # A tibble: 10 × 5
##    EVTYPE_CLEAN  PROP_DAMAGE_USD CROP_DAMAGE_USD ECON_IMPACT ECON_IMPACT_BILLION
##    <chr>                   <dbl>           <dbl>       <dbl>               <dbl>
##  1 FLOOD           144657709807       5661968450     1.50e11              150.  
##  2 HURRICANE/TY…    69305840000       2607872800     7.19e10               71.9 
##  3 TORNADO          56947380676.       414953270     5.74e10               57.4 
##  4 STORM SURGE      43323536000             5000     4.33e10               43.3 
##  5 HAIL             15735267513.      3025954473     1.88e10               18.8 
##  6 FLASH FLOOD      16822723978.      1421317100     1.82e10               18.2 
##  7 DROUGHT           1046106000      13972566000     1.50e10               15.0 
##  8 HURRICANE        11868319010       2741910000     1.46e10               14.6 
##  9 RIVER FLOOD       5118945500       5029459000     1.01e10               10.1 
## 10 ICE STORM         3944927860       5022113500     8.97e 9                8.97
# Horizontal bar chart of the top ten; reorder() orders the bars by total
# economic impact and coord_flip() puts the event labels on the vertical axis.
ggplot(
  econ_top10,
  aes(
    x = reorder(EVTYPE_CLEAN, ECON_IMPACT_BILLION),
    y = ECON_IMPACT_BILLION
  )
) +
  geom_col(fill = "#0072B2") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Economic Impact (Billion USD)",
    title = "Top 10 Event Types by Economic Consequences",
    subtitle = "Economic impact = Property Damage + Crop Damage"
  ) +
  theme_minimal(base_size = 12)

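Economic losses are concentrated in a few event types: FLOOD ranks first (about $150 billion), followed by HURRICANE/TYPHOON ($71.9 billion), TORNADO ($57.4 billion), and STORM SURGE ($43.3 billion). The ranking is sensitive to event labeling conventions in the source data; for example, HURRICANE and HURRICANE/TYPHOON, as well as FLOOD, FLASH FLOOD, and RIVER FLOOD, are counted as separate categories here.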

Reproducibility

# Record the R version, platform, and package versions used for this run.
utils::sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.3.1  stringr_1.6.0 readr_2.2.0   ggplot2_4.0.2 dplyr_1.2.0  
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.2     tidyselect_1.2.1  
##  [5] jquerylib_0.1.4    scales_1.4.0       yaml_2.3.12        fastmap_1.2.0     
##  [9] R6_2.6.1           labeling_0.4.3     generics_0.1.4     knitr_1.51        
## [13] bslib_0.10.0       pillar_1.11.1      RColorBrewer_1.1-3 tzdb_0.5.0        
## [17] rlang_1.1.7        utf8_1.2.6         cachem_1.1.0       stringi_1.8.7     
## [21] xfun_0.56          sass_0.4.10        S7_0.2.1           cli_3.6.5         
## [25] withr_3.0.2        magrittr_2.0.4     digest_0.6.39      grid_4.5.2        
## [29] hms_1.1.4          lifecycle_1.0.5    vctrs_0.7.1        evaluate_1.0.5    
## [33] glue_1.8.0         farver_2.1.2       rmarkdown_2.30     tools_4.5.2       
## [37] pkgconfig_2.0.3    htmltools_0.5.9