Using the U.S. National Oceanic and Atmospheric Administration (NOAA) Storm Database (1950–Nov 2011), this analysis identifies which event types are most harmful to population health and which have the greatest economic impact. After cleaning event labels and converting damage estimates using the provided exponent fields, tornadoes account for the most fatalities and injuries, while floods and hurricanes/typhoons dominate total economic losses. Results are reproducible from the original CSV. Plots summarize the top event types and magnitudes.
# Global knitr chunk options: echo the code and cache chunk results.
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

# Packages the analysis depends on.
req_pkgs <- c(
  "dplyr", "ggplot2", "readr", "stringr",
  "forcats", "tidyr", "scales", "lubridate"
)

# Probe each package individually with requireNamespace(); the help page for
# installed.packages() advises against using it to test for a named package
# because it scans every installed library.
new_pkgs <- req_pkgs[
  !vapply(req_pkgs, requireNamespace, logical(1), quietly = TRUE)
]

# Install only what could not be loaded.
if (length(new_pkgs) > 0) {
  install.packages(new_pkgs, repos = "https://cloud.r-project.org")
}
library(dplyr)
library(ggplot2)
library(readr)
library(stringr)
library(forcats)
library(tidyr)
library(scales)
library(lubridate)
# Download and Load Data
# Remote location of the compressed storm data and the local file it is
# saved to.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
zip_path <- "StormData.csv.bz2"

# Download only when no local copy exists; "wb" writes the file in binary mode.
if (!file.exists(zip_path)) {
  download.file(url = url, destfile = zip_path, mode = "wb")
}

# read_csv() is handed the .bz2 path directly; column-type messages are
# suppressed.
storm <- zip_path %>%
  read_csv(show_col_types = FALSE)

# Preview the first rows of the raw table.
head(storm)
## # A tibble: 6 × 37
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 1 4/18/1950… 0130 CST 97 MOBILE AL TORNA… 0
## 2 1 4/18/1950… 0145 CST 3 BALDWIN AL TORNA… 0
## 3 1 2/20/1951… 1600 CST 57 FAYETTE AL TORNA… 0
## 4 1 6/8/1951 … 0900 CST 89 MADISON AL TORNA… 0
## 5 1 11/15/195… 1500 CST 43 CULLMAN AL TORNA… 0
## 6 1 11/15/195… 2000 CST 77 LAUDERDALE AL TORNA… 0
## # ℹ 28 more variables: BGN_AZI <chr>, BGN_LOCATI <chr>, END_DATE <chr>,
## # END_TIME <chr>, COUNTY_END <dbl>, COUNTYENDN <lgl>, END_RANGE <dbl>,
## # END_AZI <chr>, END_LOCATI <chr>, LENGTH <dbl>, WIDTH <dbl>, F <dbl>,
## # MAG <dbl>, FATALITIES <dbl>, INJURIES <dbl>, PROPDMG <dbl>,
## # PROPDMGEXP <chr>, CROPDMG <dbl>, CROPDMGEXP <chr>, WFO <chr>,
## # STATEOFFIC <chr>, ZONENAMES <chr>, LATITUDE <dbl>, LONGITUDE <dbl>,
## # LATITUDE_E <dbl>, LONGITUDE_ <dbl>, REMARKS <chr>, REFNUM <dbl>
# Clean Data
# Convert a damage-exponent code into a numeric multiplier.
#
# x: vector of exponent codes (character, factor, or anything as.character()
#    accepts). Codes are trimmed and upper-cased before matching.
#
# Returns a numeric vector the same length as x:
#   "H" -> 1e2, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9,
#   a single digit "0".."8" -> 10^digit,
#   anything else (blank, NA, "+", "-", "?", ...) -> 1.
#
# A lookup table is used instead of case_when() with as.numeric(): case_when()
# evaluates every right-hand side on the whole vector, so as.numeric() raised
# an "NAs introduced by coercion" warning for every non-digit code. "H"
# (hundreds) previously fell through to a multiplier of 1.
exp_to_mult <- function(x) {
  code <- toupper(trimws(as.character(x)))

  letter_mult <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  digit_mult <- stats::setNames(10^(0:8), as.character(0:8))
  lookup <- c(letter_mult, digit_mult)

  # Indexing by name yields NA for unknown codes and for NA input.
  mult <- unname(lookup[code])
  mult[is.na(mult)] <- 1
  mult
}
# Build the analysis table from the raw storm data: normalise the free-text
# EVTYPE labels into a small set of categories, convert the damage figures to
# dollars with exp_to_mult(), and add combined-harm and year columns.
storm_clean <- storm %>%
  mutate(
    # Upper-case and collapse whitespace. str_squish() also turns embedded
    # newlines into single spaces, so no separate newline replacement is
    # needed. Then expand the TSTM abbreviation.
    EVTYPE = str_squish(toupper(EVTYPE)),
    EVTYPE = str_replace_all(EVTYPE, "TSTM", "THUNDERSTORM"),
    # The first matching rule wins, so the order of the rules matters
    # (e.g. FLASH FLOOD must be tested before FLOOD).
    EVTYPE = case_when(
      str_detect(EVTYPE, "HURRICANE|TYPHOON") ~ "HURRICANE/TYPHOON",
      str_detect(EVTYPE, "TORNADO") ~ "TORNADO",
      # TSTM was already expanded above, so one alternative is enough.
      str_detect(EVTYPE, "THUNDERSTORM") ~ "THUNDERSTORM WIND",
      str_detect(EVTYPE, "FLASH FLOOD") ~ "FLASH FLOOD",
      str_detect(EVTYPE, "FLOOD") ~ "FLOOD",
      str_detect(EVTYPE, "HEAT|WARM") ~ "EXCESSIVE HEAT",
      str_detect(EVTYPE, "COLD|CHILL|HYPOTHERMIA|LOW TEMP") ~
        "EXTREME COLD/WIND CHILL",
      str_detect(EVTYPE, "WINTER|SNOW|BLIZZARD|ICE STORM") ~ "WINTER WEATHER",
      str_detect(EVTYPE, "HAIL") ~ "HAIL",
      str_detect(EVTYPE, "DROUGHT") ~ "DROUGHT",
      str_detect(EVTYPE, "FIRE") ~ "WILDFIRE",
      str_detect(EVTYPE, "LIGHTNING") ~ "LIGHTNING",
      # SEAS is matched as a whole word: as a bare substring it also matches
      # labels containing UNSEASONABLY / UNSEASONAL and would file them under
      # marine hazards.
      str_detect(EVTYPE, "SURF|RIP CURRENT|\\bSEAS\\b|SWELL|WAVE") ~
        "MARINE/COASTAL HAZARDS",
      # Anything unmatched keeps its cleaned label.
      TRUE ~ EVTYPE
    ),
    # Dollar amounts = reported figure x multiplier decoded from the exponent
    # column.
    PROP_MULT = exp_to_mult(PROPDMGEXP),
    CROP_MULT = exp_to_mult(CROPDMGEXP),
    PROP_DMG_USD = PROPDMG * PROP_MULT,
    CROP_DMG_USD = CROPDMG * CROP_MULT,
    ECON_DMG_USD = PROP_DMG_USD + CROP_DMG_USD,
    HEALTH_HARM = FATALITIES + INJURIES,
    # BGN_DATE is parsed as month/day/year plus a time component.
    YEAR = as.integer(lubridate::year(lubridate::mdy_hms(BGN_DATE)))
  )

# Preview the first rows of the cleaned table.
head(storm_clean)
## # A tibble: 6 × 44
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 1 4/18/1950… 0130 CST 97 MOBILE AL TORNA… 0
## 2 1 4/18/1950… 0145 CST 3 BALDWIN AL TORNA… 0
## 3 1 2/20/1951… 1600 CST 57 FAYETTE AL TORNA… 0
## 4 1 6/8/1951 … 0900 CST 89 MADISON AL TORNA… 0
## 5 1 11/15/195… 1500 CST 43 CULLMAN AL TORNA… 0
## 6 1 11/15/195… 2000 CST 77 LAUDERDALE AL TORNA… 0
## # ℹ 35 more variables: BGN_AZI <chr>, BGN_LOCATI <chr>, END_DATE <chr>,
## # END_TIME <chr>, COUNTY_END <dbl>, COUNTYENDN <lgl>, END_RANGE <dbl>,
## # END_AZI <chr>, END_LOCATI <chr>, LENGTH <dbl>, WIDTH <dbl>, F <dbl>,
## # MAG <dbl>, FATALITIES <dbl>, INJURIES <dbl>, PROPDMG <dbl>,
## # PROPDMGEXP <chr>, CROPDMG <dbl>, CROPDMGEXP <chr>, WFO <chr>,
## # STATEOFFIC <chr>, ZONENAMES <chr>, LATITUDE <dbl>, LONGITUDE <dbl>,
## # LATITUDE_E <dbl>, LONGITUDE_ <dbl>, REMARKS <chr>, REFNUM <dbl>, …
# Most harmful to population health
# Per event type: total fatalities, total injuries and their sum; keep the ten
# event types with the largest combined count.
health_top <- storm_clean %>%
  group_by(EVTYPE) %>%
  summarise(
    Fatalities = sum(FATALITIES, na.rm = TRUE),
    Injuries = sum(INJURIES, na.rm = TRUE),
    # Later summaries may reuse the ones defined just above.
    Total = Fatalities + Injuries
  ) %>%
  arrange(desc(Total)) %>%
  head(n = 10)

# Print the ranking.
health_top
## # A tibble: 10 × 4
## EVTYPE Fatalities Injuries Total
## <chr> <dbl> <dbl> <dbl>
## 1 TORNADO 5661 91407 97068
## 2 EXCESSIVE HEAT 3178 9243 12421
## 3 THUNDERSTORM WIND 729 9544 10273
## 4 FLOOD 490 6802 7292
## 5 WINTER WEATHER 622 5852 6474
## 6 LIGHTNING 817 5231 6048
## 7 FLASH FLOOD 1035 1802 2837
## 8 WILDFIRE 90 1608 1698
## 9 MARINE/COASTAL HAZARDS 763 810 1573
## 10 HURRICANE/TYPHOON 135 1333 1468
# Horizontal bar chart of the event types in health_top, bars ordered by Total.
health_top %>%
  ggplot(mapping = aes(x = reorder(EVTYPE, Total), y = Total)) +
  geom_col(fill = "firebrick") +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Health Impact",
    subtitle = "Fatalities + Injuries (1950–2011)",
    x = "Event Type",
    y = "Total Health Harm"
  )
# Figure 1: Tornadoes cause the highest combined fatalities and injuries, followed by excessive heat and thunderstorms.
# Greatest economic consequences
# Per event type: total property damage, total crop damage and their sum (in
# dollars); keep the ten event types with the largest combined damage.
econ_top <- storm_clean %>%
  group_by(EVTYPE) %>%
  summarise(
    Property = sum(PROP_DMG_USD, na.rm = TRUE),
    Crop = sum(CROP_DMG_USD, na.rm = TRUE),
    # Later summaries may reuse the ones defined just above.
    Total = Property + Crop
  ) %>%
  arrange(desc(Total)) %>%
  head(n = 10)

# Print the ranking.
econ_top
## # A tibble: 10 × 4
## EVTYPE Property Crop Total
## <chr> <dbl> <dbl> <dbl>
## 1 FLOOD 150623398739 10847881950 161471280689
## 2 HURRICANE/TYPHOON 85356410010 5516117800 90872527810
## 3 TORNADO 58603317926. 417461520 59020779446.
## 4 STORM SURGE 43323536000 5000 43323541000
## 5 FLASH FLOOD 17588292096. 1532197150 19120489246.
## 6 HAIL 15977564018. 3046887623 19024451641.
## 7 WINTER WEATHER 12407006811 5316280600 17723287411
## 8 DROUGHT 1046106000 13972566000 15018672000
## 9 THUNDERSTORM WIND 11184751522. 1271708988 12456460510.
## 10 WILDFIRE 8501628500 403281630 8904910130
# Horizontal bar chart of the event types in econ_top, bars ordered by Total;
# the value axis is rescaled from dollars to billions of dollars.
econ_top %>%
  ggplot(mapping = aes(x = reorder(EVTYPE, Total), y = Total / 1e9)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Economic Damage",
    subtitle = "Property + Crop Damage (1950–2011)",
    x = "Event Type",
    y = "Total Damage (Billion USD)"
  )
# Figure 2: Floods cause the largest economic losses, followed by hurricanes/typhoons and tornadoes.
## Discussion
# The analysis shows that tornadoes are responsible for the highest combined fatalities and injuries in the U.S. from 1950 to 2011.
# Excessive heat and thunderstorms also have a significant impact on population health.
# Economically, floods cause the largest combined property and crop damage, followed by hurricanes/typhoons and tornadoes; for crop damage alone, drought ranks first.
# This suggests that while tornadoes are deadly, floods and hurricanes/typhoons carry the greatest financial burden.
# These insights can guide policymakers in prioritizing disaster preparedness and resource allocation to reduce both human and economic losses.
## Conclusion
# The analysis shows that tornadoes are the most harmful to U.S. population health, causing the highest combined fatalities and injuries.
# Floods and hurricanes/typhoons lead to the greatest economic losses.
# This can help guide disaster preparedness and resource allocation.
# Reproducibility
# Record the R version, platform, locale and package versions used for this run.
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Qatar
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lubridate_1.9.4 scales_1.4.0 tidyr_1.3.1 forcats_1.0.0
## [5] stringr_1.5.1 readr_2.1.5 ggplot2_3.5.2 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] bit_4.6.0 gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3
## [5] compiler_4.5.1 tidyselect_1.2.1 parallel_4.5.1 jquerylib_0.1.4
## [9] yaml_2.3.10 fastmap_1.2.0 R6_2.6.1 labeling_0.4.3
## [13] generics_0.1.4 knitr_1.50 tibble_3.3.0 bslib_0.9.0
## [17] pillar_1.11.0 RColorBrewer_1.1-3 tzdb_0.5.0 rlang_1.1.6
## [21] utf8_1.2.6 cachem_1.1.0 stringi_1.8.7 xfun_0.52
## [25] sass_0.4.10 bit64_4.6.0-1 timechange_0.3.0 cli_3.6.5
## [29] withr_3.0.2 magrittr_2.0.3 digest_0.6.37 grid_4.5.1
## [33] vroom_1.6.5 rstudioapi_0.17.1 hms_1.1.3 lifecycle_1.0.4
## [37] vctrs_0.6.5 evaluate_1.0.4 glue_1.8.0 farver_2.1.2
## [41] purrr_1.1.0 rmarkdown_2.29 tools_4.5.1 pkgconfig_2.0.3
## [45] htmltools_0.5.8.1