This analysis uses the NOAA Storm Events database (1950 through November 2011) to identify which event types cause the greatest harm to population health (fatalities + injuries) and which cause the largest economic damage (property + crop losses). The R Markdown document downloads the raw compressed CSV, performs reproducible processing (including interpreting damage exponent codes), summarizes the data by event type, and presents bar charts of the top contributors for both public-health and economic impact. Code is shown for every step so the analysis can be re-run and published to RPubs.
This section shows the R code used to download, read, clean, and prepare the NOAA storm data for analysis. All code chunks are shown (echo = TRUE).
# Load packages used in the analysis
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
if (!requireNamespace("readr", quietly = TRUE)) install.packages("readr")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("stringr", quietly = TRUE)) install.packages("stringr")
library(dplyr)
library(readr)
library(ggplot2)
library(stringr)
# URL for the course dataset (compressed CSV)
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Download the archive to a temporary file and read it directly
# (no manual preprocessing).
tempfile_path <- tempfile(fileext = ".bz2")
# `downloader` is TRUE only when download.file() reports success: it returns
# status 0 on success and may return a non-zero status (with a warning rather
# than an error) on failure, so the status is checked explicitly.
downloader <- tryCatch({
  download.file(url, destfile = tempfile_path, mode = "wb") == 0
}, error = function(e) {
  message("Download failed: ", conditionMessage(e))
  FALSE
})
# Fail fast with a clear message instead of letting read_csv() raise an
# unrelated error on a missing or partially written file.
if (!isTRUE(downloader)) {
  stop("Could not download the storm data from ", url, call. = FALSE)
}
# Read the CSV; guess_max = 100000 makes readr inspect many rows before
# choosing column types.
storm_raw <- read_csv(tempfile_path, guess_max = 100000)
# Inspect the structure
glimpse(storm_raw)
## Rows: 902,297
## Columns: 37
## $ STATE__ <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ BGN_DATE <chr> "4/18/1950 0:00:00", "4/18/1950 0:00:00", "2/20/1951 0:00:0…
## $ BGN_TIME <chr> "0130", "0145", "1600", "0900", "1500", "2000", "0100", "09…
## $ TIME_ZONE <chr> "CST", "CST", "CST", "CST", "CST", "CST", "CST", "CST", "CS…
## $ COUNTY <dbl> 97, 3, 57, 89, 43, 77, 9, 123, 125, 57, 43, 9, 73, 49, 107,…
## $ COUNTYNAME <chr> "MOBILE", "BALDWIN", "FAYETTE", "MADISON", "CULLMAN", "LAUD…
## $ STATE <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",…
## $ EVTYPE <chr> "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TOR…
## $ BGN_RANGE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BGN_AZI <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ BGN_LOCATI <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_DATE <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_TIME <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ COUNTY_END <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ COUNTYENDN <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_RANGE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ END_AZI <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_LOCATI <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ LENGTH <dbl> 14.0, 2.0, 0.1, 0.0, 0.0, 1.5, 1.5, 0.0, 3.3, 2.3, 1.3, 4.7…
## $ WIDTH <dbl> 100, 150, 123, 100, 150, 177, 33, 33, 100, 100, 400, 400, 2…
## $ F <dbl> 3, 2, 2, 2, 2, 2, 2, 1, 3, 3, 1, 1, 3, 3, 3, 4, 1, 1, 1, 1,…
## $ MAG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FATALITIES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 0,…
## $ INJURIES <dbl> 15, 0, 2, 2, 2, 6, 1, 0, 14, 0, 3, 3, 26, 12, 6, 50, 2, 0, …
## $ PROPDMG <dbl> 25.0, 2.5, 25.0, 2.5, 2.5, 2.5, 2.5, 2.5, 25.0, 25.0, 2.5, …
## $ PROPDMGEXP <chr> "K", "K", "K", "K", "K", "K", "K", "K", "K", "K", "M", "M",…
## $ CROPDMG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ CROPDMGEXP <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ WFO <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ STATEOFFIC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ZONENAMES <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ LATITUDE <dbl> 3040, 3042, 3340, 3458, 3412, 3450, 3405, 3255, 3334, 3336,…
## $ LONGITUDE <dbl> 8812, 8755, 8742, 8626, 8642, 8748, 8631, 8558, 8740, 8738,…
## $ LATITUDE_E <dbl> 3051, 0, 0, 0, 0, 0, 0, 0, 3336, 3337, 3402, 3404, 0, 3432,…
## $ LONGITUDE_ <dbl> 8806, 0, 0, 0, 0, 0, 0, 0, 8738, 8737, 8644, 8640, 0, 8540,…
## $ REMARKS <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ REFNUM <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
# Reduce the raw table to the nine columns this analysis uses and normalise
# the event-type label in the same pipeline. EVTYPE is stripped of leading and
# trailing whitespace and upper-cased; internal runs of whitespace are left
# untouched.
storm <- storm_raw %>%
  select(
    all_of(c(
      "BGN_DATE", "STATE", "EVTYPE",
      "FATALITIES", "INJURIES",
      "PROPDMG", "PROPDMGEXP",
      "CROPDMG", "CROPDMGEXP"
    ))
  ) %>%
  mutate(EVTYPE = EVTYPE %>% str_trim() %>% str_to_upper())
# Convert damage exponent codes (PROPDMGEXP / CROPDMGEXP) to numeric
# multipliers.
#
# Arguments:
#   exp  Vector of exponent codes; coerced to character. Matching is
#        case-insensitive and ignores surrounding whitespace.
#
# Returns:
#   A numeric vector the same length as `exp`:
#     "H" -> 1e2, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9
#     all-digit codes such as "2" -> 10^2 (so "0" -> 1)
#     NA, "" and any unrecognised code (e.g. "-", "?", "+") -> 1, so the
#     recorded damage figure is kept unscaled rather than dropped.
exp_to_mult <- function(exp) {
  # Normalise the codes: character, upper-case, NA treated as blank, trimmed.
  code <- toupper(as.character(exp))
  code[is.na(code)] <- ""
  code <- trimws(code)

  # Start from the documented default of 1 instead of using 0 as a sentinel
  # that is patched afterwards; only recognised codes overwrite it.
  mult <- rep(1, length(code))

  # Letter codes are resolved through a lookup table.
  letter_mult <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  is_letter <- code %in% names(letter_mult)
  mult[is_letter] <- unname(letter_mult[code[is_letter]])

  # Numeric exponents (e.g., "2" => 10^2) occasionally appear.
  is_num <- grepl("^[0-9]+$", code)
  mult[is_num] <- 10^as.numeric(code[is_num])

  mult
}
# Derive the analysis columns:
#   PROP_MULT / CROP_MULT        multipliers decoded from the exponent codes
#   PROP_DMG_USD / CROP_DMG_USD  damage figure scaled by its multiplier
#   TOTAL_ECON_DMG               property plus crop damage
#   TOTAL_HEALTH_IMPACT          fatalities plus injuries
storm <- mutate(
  storm,
  PROP_MULT = exp_to_mult(PROPDMGEXP),
  CROP_MULT = exp_to_mult(CROPDMGEXP),
  PROP_DMG_USD = PROP_MULT * PROPDMG,
  CROP_DMG_USD = CROP_MULT * CROPDMG,
  TOTAL_ECON_DMG = PROP_DMG_USD + CROP_DMG_USD,
  TOTAL_HEALTH_IMPACT = FATALITIES + INJURIES
)
# Quick sanity checks: print the distribution (min, quartiles, mean, max) of
# each derived column to confirm the values are non-negative and on a
# plausible scale. The "##" lines below each call appear to be the output
# captured when the document was rendered.
# Property damage after applying PROP_MULT
summary(storm$PROP_DMG_USD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 4.746e+05 5.000e+02 1.150e+11
# Crop damage after applying CROP_MULT
summary(storm$CROP_DMG_USD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 5.442e+04 0.000e+00 5.000e+09
# Fatalities + injuries per recorded event
summary(storm$TOTAL_HEALTH_IMPACT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1725 0.0000 1742.0000
This section computes summaries by event type and produces figures showing the top event types by health impact and by economic damage.
# Casualty totals per event type, ordered from the largest combined
# (fatalities + injuries) total to the smallest. n_events is the number of
# rows recorded for the event type.
health_by_event <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    total_fatalities = sum(FATALITIES, na.rm = TRUE),
    total_injuries = sum(INJURIES, na.rm = TRUE),
    total_health = sum(TOTAL_HEALTH_IMPACT, na.rm = TRUE),
    n_events = n()
  ) %>%
  arrange(desc(total_health))

# The ten event types with the highest combined casualty count
top10_health <- slice_head(health_by_event, n = 10)

knitr::kable(
  top10_health,
  caption = "Top 10 EVTYPE by total (fatalities + injuries)",
  digits = 0
)
| EVTYPE | total_fatalities | total_injuries | total_health | n_events |
|---|---|---|---|---|
| TORNADO | 5633 | 91346 | 96979 | 60652 |
| EXCESSIVE HEAT | 1903 | 6525 | 8428 | 1678 |
| TSTM WIND | 504 | 6957 | 7461 | 219946 |
| FLOOD | 470 | 6789 | 7259 | 25327 |
| LIGHTNING | 816 | 5230 | 6046 | 15755 |
| HEAT | 937 | 2100 | 3037 | 767 |
| FLASH FLOOD | 978 | 1777 | 2755 | 54278 |
| ICE STORM | 89 | 1975 | 2064 | 2006 |
| THUNDERSTORM WIND | 133 | 1488 | 1621 | 82564 |
| WINTER STORM | 206 | 1321 | 1527 | 11433 |
# Stacked bar chart of the top-10 health table. The two casualty columns are
# reshaped to long form (one row per event type and measure) so they can be
# stacked, and EVTYPE becomes a factor ordered by its summed count.
plot_health <- tidyr::pivot_longer(
  top10_health,
  cols = c(total_fatalities, total_injuries),
  names_to = "type",
  values_to = "count"
)
plot_health <- mutate(
  plot_health,
  EVTYPE = forcats::fct_reorder(EVTYPE, -count, .fun = sum)
)

ggplot(data = plot_health, mapping = aes(x = EVTYPE, y = count, fill = type)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Total Health Impact (Fatalities + Injuries)",
    x = "Event Type",
    y = "Count",
    fill = "Measure"
  ) +
  theme_minimal()
# Damage totals (USD) per event type, ordered from the largest combined
# property + crop total to the smallest. n_events is the number of rows
# recorded for the event type.
econ_by_event <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    total_prop = sum(PROP_DMG_USD, na.rm = TRUE),
    total_crop = sum(CROP_DMG_USD, na.rm = TRUE),
    total_econ = sum(TOTAL_ECON_DMG, na.rm = TRUE),
    n_events = n()
  ) %>%
  arrange(desc(total_econ))

# The ten event types with the highest combined damage
top10_econ <- slice_head(econ_by_event, n = 10)

# Dollar totals are rounded to whole units for display only; top10_econ
# itself keeps the unrounded values.
knitr::kable(
  mutate(
    top10_econ,
    across(c(total_prop, total_crop, total_econ), ~ round(.x, 0))
  ),
  caption = "Top 10 EVTYPE by total economic damage (USD)"
)
| EVTYPE | total_prop | total_crop | total_econ | n_events |
|---|---|---|---|---|
| FLOOD | 144657709807 | 5661968450 | 150319678257 | 25327 |
| HURRICANE/TYPHOON | 69305840000 | 2607872800 | 71913712800 | 88 |
| TORNADO | 56947380676 | 414953270 | 57362333946 | 60652 |
| STORM SURGE | 43323536000 | 5000 | 43323541000 | 261 |
| HAIL | 15735267513 | 3025954473 | 18761221986 | 288661 |
| FLASH FLOOD | 16822723978 | 1421317100 | 18244041078 | 54278 |
| DROUGHT | 1046106000 | 13972566000 | 15018672000 | 2488 |
| HURRICANE | 11868319010 | 2741910000 | 14610229010 | 174 |
| RIVER FLOOD | 5118945500 | 5029459000 | 10148404500 | 173 |
| ICE STORM | 3944927860 | 5022113500 | 8967041360 | 2006 |
# Stacked bar chart of the top-10 damage table. Property and crop totals are
# reshaped to long form (one row per event type and damage type) so they can
# be stacked, and EVTYPE becomes a factor ordered by its summed amount.
plot_econ <- tidyr::pivot_longer(
  top10_econ,
  cols = c(total_prop, total_crop),
  names_to = "type",
  values_to = "amount"
)
plot_econ <- mutate(
  plot_econ,
  EVTYPE = forcats::fct_reorder(EVTYPE, -amount, .fun = sum)
)

ggplot(data = plot_econ, mapping = aes(x = EVTYPE, y = amount, fill = type)) +
  geom_col(position = "stack") +
  coord_flip() +
  # Label the damage axis as dollar amounts
  scale_y_continuous(labels = scales::dollar_format(prefix = "$", scale = 1)) +
  labs(
    title = "Top 10 Event Types by Total Economic Damage (Property + Crop)",
    x = "Event Type",
    y = "Damage (USD)",
    fill = "Damage Type"
  ) +
  theme_minimal()
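The tables and figures above identify which event types are most harmful to population health and which cause the largest economic losses across the period covered. Note that event types are ranked by their recorded EVTYPE label, so closely related labels (for example TSTM WIND and THUNDERSTORM WIND, HEAT and EXCESSIVE HEAT, or HURRICANE and HURRICANE/TYPHOON) appear as separate rows; their combined impact is larger than any single row suggests. The code and plots are suitable for inclusion in a short briefing to municipal managers. When allocating resources and setting priorities, managers may wish to consider both frequency (number of events) and severity (per-event damage). The code can be modified to restrict the analysis to more recent years (e.g., 1990 onward) if that is preferred.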
# Appendix: list the distinct exponent codes present in the raw data, as a
# reference for the codes exp_to_mult() has to interpret. sort() drops NA by
# default, so missing codes are not shown in these listings.
unique_props <- sort(unique(storm_raw$PROPDMGEXP))
unique_crops <- sort(unique(storm_raw$CROPDMGEXP))
# Distinct property-damage exponent codes
unique_props
## [1] "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K" "m" "M"
# Distinct crop-damage exponent codes
unique_crops
## [1] "?" "0" "2" "B" "k" "K" "m" "M"
# Record the R version, platform and package versions used for this run
sessionInfo()
## R version 4.5.0 (2025-04-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Africa/Nairobi
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stringr_1.5.1 ggplot2_3.5.2 readr_2.1.5 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] bit_4.6.0 gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3
## [5] compiler_4.5.0 tidyselect_1.2.1 parallel_4.5.0 tidyr_1.3.1
## [9] jquerylib_0.1.4 scales_1.3.0 yaml_2.3.10 fastmap_1.2.0
## [13] R6_2.6.1 labeling_0.4.3 generics_0.1.3 knitr_1.50
## [17] forcats_1.0.0 tibble_3.2.1 munsell_0.5.1 bslib_0.9.0
## [21] pillar_1.10.2 tzdb_0.5.0 rlang_1.1.6 cachem_1.1.0
## [25] stringi_1.8.7 xfun_0.52 sass_0.4.10 bit64_4.6.0-1
## [29] cli_3.6.4 withr_3.0.2 magrittr_2.0.3 digest_0.6.37
## [33] grid_4.5.0 vroom_1.6.5 rstudioapi_0.17.1 hms_1.1.3
## [37] lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.3 glue_1.8.0
## [41] farver_2.1.2 colorspace_2.1-1 purrr_1.0.4 rmarkdown_2.29
## [45] tools_4.5.0 pkgconfig_2.0.3 htmltools_0.5.8.1