Synopsis

This report analyzes storm event data from the U.S. National Oceanic and Atmospheric Administration (NOAA) to identify which weather event types are most harmful to population health and which have the greatest economic impact. The dataset spans 1950 to 2011 and records injuries, fatalities, property damage, and crop damage for each event. Using data transformation and aggregation in R, we rank event types by total health impact (fatalities plus injuries) and by total economic damage (property plus crop damage). The results show that tornadoes cause the most harm to human health, while floods, hurricanes, and storm surges cause the highest financial losses. These findings can inform municipal planning and emergency preparedness.

Data Processing

We begin by loading the provided raw .csv.bz2 file and extracting the relevant variables: event type, health impact (fatalities and injuries), and economic losses (property and crop damage). The damage amounts are stored alongside magnitude codes (e.g., "K" for thousands, "M" for millions, "B" for billions), which we convert to numeric multipliers; missing or unrecognized codes are treated as a multiplier of 1. We then calculate total property and crop damage in dollars and aggregate the data by event type.

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Read the compressed NOAA storm CSV straight from the .bz2 archive;
# readr decompresses it transparently and guesses the column types.
storm_raw <- read_csv(file = "repdata_data_StormData.csv.bz2")
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl  (1): COUNTYENDN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns the analysis uses (date, event type, casualty counts,
# damage amounts and their magnitude codes) and parse the begin date from
# month/day/year hour:minute:second text into a date-time in the same pipeline.
storm <- storm_raw %>%
  select(
    BGN_DATE, EVTYPE,
    FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP
  ) %>%
  mutate(BGN_DATE = mdy_hms(BGN_DATE))





# Convert damage magnitude codes to numeric multipliers.
#
# Args:
#   exp: vector of magnitude codes (character, or anything coercible to
#        character); matching is case-insensitive.
#
# Returns:
#   An unnamed numeric vector the same length as `exp`:
#     "H" -> 1e2, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9.
#   Missing, empty, or unrecognized codes map to 1 so that the raw damage
#   figure is kept as-is rather than dropped.
#
# NOTE(review): digit codes ("0"-"8") and symbols ("+", "-", "?") reportedly
# also occur in this dataset; their meaning is ambiguous, so they keep the
# fallback multiplier of 1 here -- confirm against the data documentation
# before extending the table.
exp_convert <- function(exp) {
  multipliers <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

  # Name-based lookup: any code absent from the table (including NA and "")
  # yields NA, which is then replaced by the neutral multiplier.
  out <- unname(multipliers[toupper(as.character(exp))])
  out[is.na(out)] <- 1
  out
}

# Replace both magnitude-code columns with their numeric multipliers, then
# scale the raw damage figures into dollar amounts.
storm <- storm %>%
  mutate(across(c(PROPDMGEXP, CROPDMGEXP), exp_convert)) %>%
  mutate(
    PROP_DMG = PROPDMG * PROPDMGEXP,
    CROP_DMG = CROPDMG * CROPDMGEXP
  )








# Total fatalities and injuries per event type, keeping the ten event types
# with the largest combined casualty count.
health <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    Fatalities = sum(FATALITIES, na.rm = TRUE),
    Injuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  mutate(Total_Health_Impact = Fatalities + Injuries) %>%
  arrange(desc(Total_Health_Impact)) %>%
  slice_head(n = 10)

# Horizontal bar chart of the ten most harmful event types; bars are ordered
# by combined casualties so the largest appears at the top after the flip.
health %>%
  mutate(EVTYPE = reorder(EVTYPE, Total_Health_Impact)) %>%
  ggplot(aes(x = EVTYPE, y = Total_Health_Impact)) +
  geom_col(fill = "firebrick") +
  coord_flip() +
  ggtitle("Top 10 Weather Events Most Harmful to Population Health") +
  xlab("Event Type") +
  ylab("Total Fatalities + Injuries") +
  theme_minimal()

# Total property and crop damage (in dollars) per event type, keeping the ten
# event types with the largest combined damage.
economic <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    Property_Damage = sum(PROP_DMG, na.rm = TRUE),
    Crop_Damage = sum(CROP_DMG, na.rm = TRUE)
  ) %>%
  mutate(Total_Damage = Property_Damage + Crop_Damage) %>%
  arrange(desc(Total_Damage)) %>%
  slice_head(n = 10)

# Horizontal bar chart of the ten costliest event types; bars are ordered by
# total damage and the value axis is labelled in dollars.
economic %>%
  mutate(EVTYPE = reorder(EVTYPE, Total_Damage)) %>%
  ggplot(aes(x = EVTYPE, y = Total_Damage)) +
  geom_col(fill = "darkblue") +
  coord_flip() +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Top 10 Weather Events with Greatest Economic Impact") +
  xlab("Event Type") +
  ylab("Total Damage (USD)") +
  theme_minimal()