This report analyzes storm event data from the U.S. National Oceanic and Atmospheric Administration (NOAA) to identify which weather event types are most harmful to population health and which have the greatest economic impact. The dataset spans 1950 to 2011 and records injuries, fatalities, property damage, and crop damage for each event. Using data transformation and aggregation in R, we rank event types by total health impact (fatalities plus injuries) and by total economic damage (property plus crop damage), and report the top ten of each. The results show that tornadoes cause the most harm to human health, while floods, hurricanes, and storm surges cause the highest financial losses. These findings can inform municipal planning and emergency preparedness.
We begin by loading the raw .csv.bz2 file provided and extracting the
relevant variables: event type, health impact (fatalities and injuries),
and economic losses (property and crop damage). The damage amounts are
stored together with a separate exponent column that gives their magnitude
(e.g., "K" for thousands), which we convert to a numeric multiplier. We then
calculate total property and crop damage in dollars and aggregate the data
by event type.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Read the bz2-compressed storm CSV into a tibble
storm_raw <- read_csv(file = "repdata_data_StormData.csv.bz2")
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl (1): COUNTYENDN
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns the analysis needs: begin date, event type, the two
# casualty counts, and the damage amounts with their magnitude codes.
#
# EVTYPE is the grouping key for every summary below, so labels that differ
# only in letter case or in leading/trailing/repeated whitespace would
# otherwise be aggregated as separate event types. Normalise them here, once,
# so all downstream summaries agree.
storm <- storm_raw %>%
  select(
    BGN_DATE, EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  ) %>%
  mutate(EVTYPE = str_to_upper(str_squish(EVTYPE)))
# Parse the begin-date strings (month/day/year followed by a time) into
# date-time values
storm <- storm %>%
  mutate(BGN_DATE = mdy_hms(BGN_DATE))
# Convert damage magnitude codes into numeric multipliers.
#
# Args:
#   exp: vector of magnitude codes (character, or anything `toupper()` can
#        coerce). Matching is case-insensitive.
#
# Returns:
#   A double vector the same length as `exp`:
#     "H" -> 1e2, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9.
#   Every other value (NA, empty string, digits, symbols) maps to 1, so the
#   accompanying damage amount is taken at face value.
#
# NOTE(review): "H" is treated as hundreds, the usual reading of this code in
# the storm data; previously it fell through to the default multiplier of 1.
exp_convert <- function(exp) {
  multipliers <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

  # Name-based lookup yields NA for any code not in the table (including NA
  # and ""); those fall back to a multiplier of 1.
  out <- unname(multipliers[toupper(exp)])
  out[is.na(out)] <- 1
  out
}
# Replace both magnitude-code columns with their numeric multipliers, then
# scale the raw damage amounts into dollar values
storm <- storm %>%
  mutate(
    across(c(PROPDMGEXP, CROPDMGEXP), exp_convert),
    PROP_DMG = PROPDMG * PROPDMGEXP,
    CROP_DMG = CROPDMG * CROPDMGEXP
  )
# Per event type: total fatalities, total injuries, and their sum.
# Keep the ten event types with the largest combined count.
health <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    Fatalities = sum(FATALITIES, na.rm = TRUE),
    Injuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  mutate(Total_Health_Impact = Fatalities + Injuries) %>%
  arrange(desc(Total_Health_Impact)) %>%
  slice_head(n = 10)
# Horizontal bar chart of the top-ten health table, bars ordered by
# combined fatalities and injuries
health %>%
  mutate(EVTYPE = reorder(EVTYPE, Total_Health_Impact)) %>%
  ggplot(aes(x = EVTYPE, y = Total_Health_Impact)) +
  geom_col(fill = "firebrick") +
  coord_flip() +
  ggtitle("Top 10 Weather Events Most Harmful to Population Health") +
  xlab("Event Type") +
  ylab("Total Fatalities + Injuries") +
  theme_minimal()
# Per event type: total property damage, total crop damage, and their sum.
# Keep the ten event types with the largest combined damage.
economic <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    Property_Damage = sum(PROP_DMG, na.rm = TRUE),
    Crop_Damage = sum(CROP_DMG, na.rm = TRUE)
  ) %>%
  mutate(Total_Damage = Property_Damage + Crop_Damage) %>%
  arrange(desc(Total_Damage)) %>%
  slice_head(n = 10)
# Horizontal bar chart of the top-ten economic table, bars ordered by
# combined damage, with the value axis formatted as dollars
economic %>%
  mutate(EVTYPE = reorder(EVTYPE, Total_Damage)) %>%
  ggplot(aes(x = EVTYPE, y = Total_Damage)) +
  geom_col(fill = "darkblue") +
  coord_flip() +
  ggtitle("Top 10 Weather Events with Greatest Economic Impact") +
  xlab("Event Type") +
  ylab("Total Damage (USD)") +
  scale_y_continuous(labels = dollar_format()) +
  theme_minimal()