1 Synopsis

This report analyzes the U.S. National Oceanic and Atmospheric Administration (NOAA) Storm Events Database to identify which event types are most harmful to population health and which have the greatest economic consequences. Starting from the raw CSV file repdata_data_StormData.csv, we compute, for each event type, the health impact (fatalities plus injuries) and the standardized economic loss in U.S. dollars (property plus crop damage). All R code is shown, and heavy steps are cached for speed. Results typically show tornadoes dominating health impacts, while floods, hurricanes/typhoons, and storm surges dominate economic losses.

2 Data Processing

2.1 Packages (pre-installed via install_packages_windows.R)

library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(readr)
library(forcats)
library(tidyr)

2.2 Load raw data

# Location of the raw storm-events CSV; stop immediately if it is missing.
csv_path <- "repdata_data_StormData.csv"
stopifnot(file.exists(csv_path))

# Read only the nine columns this analysis uses and declare their types
# explicitly. Leaving the types to readr's guessing is fragile: the guess is
# made from a sample of rows, and a column whose sampled cells are all empty
# is guessed as logical, after which exponent codes such as "K" or "M" fail
# to parse and silently become NA. Skipping unused columns also speeds up
# the read.
storm_raw <- readr::read_csv(
  csv_path,
  col_types = readr::cols_only(
    BGN_DATE   = readr::col_character(),
    STATE      = readr::col_character(),
    EVTYPE     = readr::col_character(),
    FATALITIES = readr::col_double(),
    INJURIES   = readr::col_double(),
    PROPDMG    = readr::col_double(),
    PROPDMGEXP = readr::col_character(),
    CROPDMG    = readr::col_double(),
    CROPDMGEXP = readr::col_character()
  ),
  show_col_types = FALSE,
  progress = FALSE
)

# Fix the column order used by the rest of the report.
storm <- storm_raw %>%
  dplyr::select(
    BGN_DATE, STATE, EVTYPE,
    FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP
  )

# The intermediate table is no longer needed.
rm(storm_raw)

2.3 Normalize event types (EVTYPE)

# Canonicalize event labels: coerce to character, upper-case, then trim and
# collapse runs of whitespace so spelling variants that differ only in case
# or spacing fall into the same group.
storm <- storm %>%
  mutate(
    EVTYPE = EVTYPE %>%
      as.character() %>%
      stringr::str_to_upper() %>%
      stringr::str_squish()
  )

2.4 Health outcomes

# Total fatalities and injuries per event type. health_harm is their sum;
# event types with no recorded casualties are dropped.
health_by_event <- storm %>%
  dplyr::group_by(EVTYPE) %>%
  dplyr::summarise(
    fatalities = sum(FATALITIES, na.rm = TRUE),
    injuries = sum(INJURIES, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::mutate(health_harm = fatalities + injuries) %>%
  dplyr::filter(health_harm > 0)

# The ten event types with the largest combined casualty count.
top_health <- health_by_event %>%
  dplyr::arrange(dplyr::desc(health_harm)) %>%
  dplyr::slice_head(n = 10)

top_health

2.5 Economic damages (standardization)

# Translate damage-exponent codes into numeric multipliers.
#
# x: vector of exponent codes (coerced to character; surrounding whitespace
#    and letter case are ignored).
# Returns an unnamed numeric vector the same length as x:
#   "H" -> 1e2, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9,
#   a single digit "0".."8" -> 10^digit,
#   anything else (including NA and "") -> 1.
exp_to_multiplier <- function(x) {
  code <- toupper(trimws(as.character(x)))

  # Letter codes are resolved through a lookup table; codes that are not in
  # the table come back as NA and fall through to the default of 1.
  letter_mult <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  out <- unname(letter_mult[code])
  out[is.na(out)] <- 1

  # Single digits 0-8 are read as powers of ten.
  digit_code <- grepl("^[0-8]$", code)
  out[digit_code] <- 10 ^ as.numeric(code[digit_code])

  out
}

# Convert the recorded damage figures into dollar amounts by applying the
# multiplier encoded in the matching exponent column.
storm <- storm %>%
  mutate(
    prop_mult = exp_to_multiplier(PROPDMGEXP),
    crop_mult = exp_to_multiplier(CROPDMGEXP),
    prop_loss = as.numeric(PROPDMG) * prop_mult,
    crop_loss = as.numeric(CROPDMG) * crop_mult,
    # Treat a missing component as zero in the row total. With a plain `+`,
    # an NA in one component would make the whole total NA, and the known
    # component would then be dropped by later sum(..., na.rm = TRUE) calls,
    # leaving the total below the sum of its parts.
    total_loss = dplyr::coalesce(prop_loss, 0) + dplyr::coalesce(crop_loss, 0)
  )

# Property, crop, and combined dollar losses per event type; event types
# with no recorded loss are dropped.
econ_by_event <- storm %>%
  dplyr::group_by(EVTYPE) %>%
  dplyr::summarise(
    property_damage = sum(prop_loss, na.rm = TRUE),
    crop_damage = sum(crop_loss, na.rm = TRUE),
    economic_loss = sum(total_loss, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::filter(economic_loss > 0)

# The ten event types with the largest combined loss.
top_econ <- econ_by_event %>%
  dplyr::arrange(dplyr::desc(economic_loss)) %>%
  dplyr::slice_head(n = 10)

top_econ

3 Results

3.1 Most harmful to population health

# Long-format table for the stacked bar chart: one row per (event type,
# metric) for the top-10 event types, with EVTYPE ordered by total count so
# the bars appear sorted.
health_long <- health_by_event %>%
  dplyr::filter(EVTYPE %in% top_health$EVTYPE) %>%
  dplyr::select(EVTYPE, fatalities, injuries) %>%
  tidyr::pivot_longer(
    cols = !EVTYPE,
    names_to = "metric",
    values_to = "count"
  ) %>%
  dplyr::mutate(EVTYPE = forcats::fct_reorder(EVTYPE, count, .fun = sum))

# Figure 1: horizontal stacked bars of fatalities and injuries.
ggplot(health_long, aes(x = EVTYPE, y = count, fill = metric)) +
  geom_col() +
  labs(
    title = "Top 10 Event Types by Health Harm (Fatalities + Injuries)",
    x = "Event Type (EVTYPE)",
    y = "People Affected",
    fill = "Health Metric"
  ) +
  coord_flip() +
  theme_minimal(base_size = 12)

3.2 Greatest economic consequences

# Long-format table for the stacked bar chart: one row per (event type,
# damage type) for the top-10 event types, with EVTYPE ordered by total
# loss so the bars appear sorted.
econ_long <- econ_by_event %>%
  dplyr::filter(EVTYPE %in% top_econ$EVTYPE) %>%
  dplyr::select(EVTYPE, property_damage, crop_damage) %>%
  tidyr::pivot_longer(
    cols = !EVTYPE,
    names_to = "type",
    values_to = "usd"
  ) %>%
  dplyr::mutate(EVTYPE = forcats::fct_reorder(EVTYPE, usd, .fun = sum))

# Figure 2: horizontal stacked bars of property and crop damage, with the
# dollar amounts rescaled to billions for the axis.
ggplot(econ_long, aes(x = EVTYPE, y = usd / 1e9, fill = type)) +
  geom_col() +
  labs(
    title = "Top 10 Event Types by Economic Loss (Property + Crop)",
    x = "Event Type (EVTYPE)",
    y = "Economic Loss (Billions of USD)",
    fill = "Damage Type"
  ) +
  coord_flip() +
  theme_minimal(base_size = 12)

4 Reproducibility Notes

  • Start from raw repdata_data_StormData.csv (no external pre-processing).
  • Set the knitr chunk option cache=TRUE on heavy chunks (such as data loading) to speed up re-runs.
  • The report contains exactly two figures: one for health harm and one for economic loss.
  • Knit to HTML in RStudio Desktop and publish to RPubs.

5 Appendix: Session Info

# Record the R version, platform, locale, and package versions of this run.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Spanish_Mexico.utf8  LC_CTYPE=Spanish_Mexico.utf8   
## [3] LC_MONETARY=Spanish_Mexico.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=Spanish_Mexico.utf8    
## 
## time zone: America/Guayaquil
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tidyr_1.3.1   forcats_1.0.0 readr_2.1.5   stringr_1.5.1 ggplot2_3.5.1
## [6] dplyr_1.1.4  
## 
## loaded via a namespace (and not attached):
##  [1] bit_4.0.5         gtable_0.3.5      jsonlite_1.8.8    highr_0.11       
##  [5] crayon_1.5.3      compiler_4.4.1    tidyselect_1.2.1  parallel_4.4.1   
##  [9] jquerylib_0.1.4   scales_1.3.0      yaml_2.3.9        fastmap_1.2.0    
## [13] R6_2.5.1          labeling_0.4.3    generics_0.1.3    knitr_1.48       
## [17] tibble_3.2.1      munsell_0.5.1     bslib_0.7.0       pillar_1.9.0     
## [21] tzdb_0.4.0        rlang_1.1.4       utf8_1.2.4        cachem_1.1.0     
## [25] stringi_1.8.4     xfun_0.45         sass_0.4.9        bit64_4.0.5      
## [29] cli_3.6.3         withr_3.0.0       magrittr_2.0.3    digest_0.6.36    
## [33] grid_4.4.1        vroom_1.6.5       rstudioapi_0.16.0 hms_1.1.3        
## [37] lifecycle_1.0.4   vctrs_0.6.5       evaluate_0.24.0   glue_1.7.0       
## [41] farver_2.1.2      codetools_0.2-20  fansi_1.0.6       colorspace_2.1-0 
## [45] purrr_1.0.2       rmarkdown_2.27    tools_4.4.1       pkgconfig_2.0.3  
## [49] htmltools_0.5.8.1