Synopsis

This report analyzes the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to determine which weather events are most harmful to population health and have the greatest economic consequences. The analysis involves cleaning and standardizing event types, aggregating fatality and injury data, and calculating total economic damage from property and crop loss. The results show that tornadoes are the most harmful to human health, while floods cause the most significant economic damage.

Loading and preprocessing the data

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
# Load the raw storm data from the compressed CSV, keeping text columns as
# character vectors rather than factors.
storm_raw <- read.csv(
  file = "repdata_data_StormData.csv.bz2",
  stringsAsFactors = FALSE
)

Question 1: Across the United States, which types of events (as indicated in the EVTYPE) are most harmful with respect to population health?

Subsetting raw data

# Keep only the event type and the two casualty counts needed for Question 1.
health_raw <- select(storm_raw, EVTYPE, FATALITIES, INJURIES)

Aggregating fatalities and injuries by raw event type

# One row per raw EVTYPE: total deaths, total injuries, and their sum.
impact_by_evtype <- health_raw %>%
  group_by(EVTYPE) %>%
  summarise(
    total_fatality = sum(FATALITIES, na.rm = TRUE),
    total_injury = sum(INJURIES, na.rm = TRUE),
    # Built from the two per-group totals defined just above.
    total_health = total_fatality + total_injury,
    .groups = "drop"
  )

Ordering raw event types by total health impact to guide keyword identification

# The ten raw event types with the largest combined casualty count,
# sorted from most to least harmful.
impact_top10 <- impact_by_evtype %>%
  arrange(desc(total_health)) %>%
  head(10)

impact_top10
## # A tibble: 10 × 4
##    EVTYPE            total_fatality total_injury total_health
##    <chr>                      <dbl>        <dbl>        <dbl>
##  1 TORNADO                     5633        91346        96979
##  2 EXCESSIVE HEAT              1903         6525         8428
##  3 TSTM WIND                    504         6957         7461
##  4 FLOOD                        470         6789         7259
##  5 LIGHTNING                    816         5230         6046
##  6 HEAT                         937         2100         3037
##  7 FLASH FLOOD                  978         1777         2755
##  8 ICE STORM                     89         1975         2064
##  9 THUNDERSTORM WIND            133         1488         1621
## 10 WINTER STORM                 206         1321         1527

Processing raw data of health with top keywords

Based on the health-impact ranking of the raw event types above, the following keywords were prioritized for cleaning:

# Standardise EVTYPE into EVTYPE_CLEAN so that spelling variants of the same
# hazard are aggregated together. The substitutions run in order; any text
# that no rule matches is kept unchanged.
health_clean <- health_raw %>%
  mutate(
    EVTYPE_CLEAN = EVTYPE %>%
      toupper() %>%
      # Turn punctuation into a space instead of deleting it. Deleting fused
      # "HURRICANE/TYPHOON" into "HURRICANETYPHOON" and "WILD/FOREST FIRE"
      # into "WILDFOREST FIRE", which the rules below then mangled.
      str_replace_all("[[:punct:]]", " ") %>%
      # Trim both ends and collapse internal runs of whitespace in one step.
      str_squish() %>%
      str_replace_all("TSTM", "THUNDERSTORM") %>%
      # One pass for every thunderstorm spelling. The optional WIND suffix is
      # consumed by the match, so an existing "THUNDERSTORM WIND" is no longer
      # rewritten to "THUNDERSTORM WIND WIND".
      str_replace_all("THUNDERSTORMS?( ?WINDS?)?", "THUNDERSTORM WIND") %>%
      str_replace_all("LIGHTNING|LIGTNING", "LIGHTNING") %>%
      str_replace_all("HEAT WAVE|HEATWAVE", "EXCESSIVE HEAT") %>%
      str_replace_all("FLASH FLOODING|FLASHFLOOD", "FLASH FLOOD") %>%
      # "(ING)?" swallows the suffix so "URBAN FLOODING" does not end up as
      # "FLASH FLOODING" after the flash-flood rule has already run.
      str_replace_all("URBAN.*FLOOD(ING)?", "FLASH FLOOD") %>%
      str_replace_all("RIVER.*FLOOD(ING)?", "RIVER FLOOD") %>%
      str_replace_all("COASTAL.*FLOOD(ING)?", "COASTAL FLOOD") %>%
      # Match the "HURRICANE TYPHOON" pair as a unit so it is replaced once,
      # not once per word.
      str_replace_all(
        "HURRICANE( TYPHOON)?|TYPHOON|TROPICAL STORM",
        "TROPICAL CYCLONE"
      ) %>%
      str_replace_all("WINTER STORM|BLIZZARD|ICE STORM", "WINTER STORM") %>%
      # Accept the plural forms as well; the anchors keep longer labels intact.
      str_replace_all("^(HIGH|STRONG) WINDS?$", "HIGH WIND") %>%
      # The longest alternative comes first so "WILD FOREST FIRE" is replaced
      # as a whole rather than leaving a stray "WILD" in front.
      str_replace_all(
        "WILD ?FOREST FIRES?|WILD ?FIRES?|FOREST FIRES?|BRUSH FIRES?",
        "WILDFIRE"
      )
  )

Group health data by EVTYPE_CLEAN

Fatalities and injuries are summed with equal weight, as both represent direct harm to population health.

# Casualty totals per cleaned event type, ordered from the largest combined
# count (fatalities + injuries) to the smallest.
health_summary <- health_clean %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(
    total_fatality = sum(FATALITIES, na.rm = TRUE),
    total_injury = sum(INJURIES, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_fatality + total_injury)) %>%
  mutate(total_health = total_fatality + total_injury)

Top 10 Most Harmful Events

# Horizontal bar chart of the ten cleaned event types with the largest
# combined casualty count. health_summary is already sorted in descending
# order, so its first ten rows are the top ten.
ggplot(
  data = slice_head(health_summary, n = 10),
  mapping = aes(x = reorder(EVTYPE_CLEAN, total_health), y = total_health)
) +
  geom_col(fill = "darkred") +
  coord_flip() +
  ggtitle(
    label = "Top 10 Weather Events Most Harmful to Population Health",
    subtitle = "Based on total fatalities and injuries (1950-2011)"
  ) +
  xlab("Event Type") +
  ylab("Total Fatalities + Injuries") +
  theme_minimal()

Final answer

Across the United States, tornadoes are the most harmful weather events with respect to population health, causing the highest combined number of fatalities and injuries. Excessive heat, flash floods, and thunderstorm winds also rank among the top contributors to weather-related health impacts.

Question 2: Across the United States, which types of events have the greatest economic consequences?

Subsetting raw data

dmg_raw <- storm_raw %>%
    select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

Converting the damage exponent codes into numeric multipliers

Creating a function that multiplies each damage value by the multiplier for its exponent code

# Scale a damage figure by the multiplier encoded in its exponent column.
#
# dmg_value: damage magnitudes (coerced with as.numeric()).
# dmg_exp:   exponent codes. Digits "0" to "8" mean 10^digit; H/K/M/B in
#            either case mean hundred/thousand/million/billion. Every other
#            code ("", "+", "-", "?", NA, anything unexpected) means x1.
#
# Returns a numeric vector of dmg_value * multiplier; an NA in dmg_value
# stays NA in the result.
calculate_total_damage <- function(dmg_value, dmg_exp) {
  exponent_scale <- c(
    "0" = 1, "1" = 10, "2" = 100, "3" = 1000, "4" = 10000,
    "5" = 100000, "6" = 1000000, "7" = 10000000, "8" = 100000000,
    "H" = 100, "h" = 100,
    "K" = 1000, "k" = 1000,
    "M" = 1000000, "m" = 1000000,
    "B" = 1000000000, "b" = 1000000000
  )

  # Look up by name; codes absent from the table (including "" and NA)
  # come back as NA and fall through to the default multiplier of 1.
  multiplier <- unname(exponent_scale[as.character(dmg_exp)])
  multiplier[is.na(multiplier)] <- 1

  as.numeric(dmg_value) * multiplier
}

Calculating total damage and cleaning EVTYPE

# Per-event damage table: dollar totals for property and crop damage plus a
# coarse event category (EVTYPE_CLEAN) used for the economic ranking.
dmg_clean <- dmg_raw %>%
  mutate(
    # Step 1: Calculate total damage per row
    total_propdmg = calculate_total_damage(PROPDMG, PROPDMGEXP),
    total_cropdmg = calculate_total_damage(CROPDMG, CROPDMGEXP),
    # A missing figure on one side must not blank out the other: the later
    # sum(..., na.rm = TRUE) would otherwise drop the whole row.
    total_dmg = coalesce(total_propdmg, 0) + coalesce(total_cropdmg, 0),

    # Step 2: Clean EVTYPE immediately.
    # Match against an upper-cased copy, because EVTYPE contains mixed-case
    # entries (e.g. "Flood", "Tstm Wind") that the upper-case patterns would
    # otherwise send to "OTHER".
    evtype_upper = toupper(EVTYPE),
    # The first matching rule wins, so the more specific patterns (FLASH,
    # RIVER) must stay above the generic FLOOD rule.
    EVTYPE_CLEAN = case_when(
      grepl("HURRICANE|TYPHOON|TROPICAL", evtype_upper) ~ "TROPICAL CYCLONE",
      grepl("THUNDERSTORM|TSTM|SEVERE", evtype_upper) ~ "THUNDERSTORM WIND",
      grepl("HIGH WIND", evtype_upper) ~ "HIGH WIND",
      grepl("FLASH", evtype_upper) ~ "FLASH FLOOD",
      grepl("RIVER", evtype_upper) ~ "RIVER FLOOD",
      grepl("FLOOD", evtype_upper) ~ "FLOOD",
      grepl("WINTER|ICE|BLIZZARD|SNOW|FROST|FREEZE", evtype_upper) ~
        "WINTER STORM",
      grepl("WILD|FOREST", evtype_upper) ~ "WILDFIRE",
      grepl("TORNADO", evtype_upper) ~ "TORNADO",
      grepl("HAIL", evtype_upper) ~ "HAIL",
      grepl("DROUGHT", evtype_upper) ~ "DROUGHT",
      grepl("STORM SURGE", evtype_upper) ~ "STORM SURGE",
      grepl("HEAVY RAIN", evtype_upper) ~ "HEAVY RAIN",
      grepl("LIGHTNING", evtype_upper) ~ "LIGHTNING",
      grepl("EXTREME COLD", evtype_upper) ~ "EXTREME COLD/WIND CHILL",
      TRUE ~ "OTHER"
    )
  ) %>%
  # Drop the helper so the result keeps the same columns as before.
  select(-evtype_upper)

Summing total damage by cleaned event type to rank the most damaging categories

# Total damage (property + crop) per cleaned event type, largest first.
dmg_top <- dmg_clean %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(total_dmg = sum(total_dmg, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_dmg))

Top 10 Most Damaging Events

# dmg_top is already sorted, so its first ten rows are the ten costliest types.
head(dmg_top, n = 10)
## # A tibble: 10 × 2
##    EVTYPE_CLEAN          total_dmg
##    <chr>                     <dbl>
##  1 FLOOD             151124876189 
##  2 TROPICAL CYCLONE   99283551360 
##  3 TORNADO            57418279446.
##  4 STORM SURGE        47965579000 
##  5 WINTER STORM       19603220711 
##  6 FLASH FLOOD        19121509246.
##  7 HAIL               19024452136.
##  8 THUNDERSTORM WIND  16558987688.
##  9 DROUGHT            15018927780 
## 10 RIVER FLOOD        10160248500

Final answer

Based on the analysis of property and crop damage data, floods are the most economically damaging weather events in the United States, accounting for the highest total costs. Tropical cyclones (hurricanes/typhoons) and storm surge also rank among the top contributors to economic losses, highlighting the severe financial impact of large-scale meteorological systems.