Introduction

This report analyzes the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to identify storm event types that are most harmful to population health and those that have the greatest economic consequences across the United States. The data spans from 1950 to November 2011.

Data Preprocessing

This section details the steps taken to clean and prepare the data for analysis. This includes selecting relevant columns, cleaning the event type (EVTYPE) variable, and calculating numerical values for economic damages.

1. Load the Data

file_path <- "repdata_data_StormData.csv.bz2"
if (file.exists(file_path)) {
  storm_data_raw <- read.csv(bzfile(file_path), stringsAsFactors = FALSE)
  print(paste("Successfully loaded", nrow(storm_data_raw), "rows and", ncol(storm_data_raw), "columns."))
} else {
  stop("Data file not found. Please check the file_path variable.")
}
## [1] "Successfully loaded 902297 rows and 37 columns."

2. Select Relevant Columns and Initial Transformation

storm_data <- storm_data_raw %>%
  select(BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,
         CROPDMG, CROPDMGEXP, STATE, REFNUM)

storm_data$BGN_DATE_PARSED <- as.Date(storm_data$BGN_DATE, format="%m/%d/%Y %H:%M:%S")
storm_data$YEAR <- as.numeric(format(storm_data$BGN_DATE_PARSED, "%Y"))

3. Clean Event Types (EVTYPE)

storm_data$EVTYPE_CLEAN <- toupper(trimws(storm_data$EVTYPE))

storm_data$EVTYPE_CLEAN <- str_replace_all(storm_data$EVTYPE_CLEAN,
                                           c("^TSTM WIND.*|^THUNDERSTORM WIND.*S$" = "THUNDERSTORM WIND",
                                             "^THUNDERSTORM$" = "THUNDERSTORM WIND",
                                             "^FLASH FLOOD.*" = "FLASH FLOOD",
                                             "^FLOOD$" = "FLOOD",
                                             "^(COASTAL|CSTL) FLOOD.*" = "COASTAL FLOOD",
                                             "^RIP CURRENT.*" = "RIP CURRENT",
                                             "^TORNADO.*|^TORNDAO$" = "TORNADO",
                                             "^HAIL.*" = "HAIL",
                                             "^HURRICANE.*|^TYPHOON$" = "HURRICANE/TYPHOON",
                                             "^STORM SURGE.*" = "STORM SURGE/TIDE",
                                             "^HEAVY SNOW.*" = "HEAVY SNOW",
                                             "^BLIZZARD.*" = "BLIZZARD",
                                             "^HEAVY RAIN.*" = "HEAVY RAIN",
                                             "^WINTER STORM.*" = "WINTER STORM",
                                             "^WINTER WEATHER.*" = "WINTER WEATHER",
                                             "^(EXTREME )?COLD.*|^(EXTREME )?WIND CHILL.*" = "EXTREME COLD/WIND CHILL",
                                             "^(EXCESSIVE )?HEAT.*" = "HEAT",
                                             "^DROUGHT.*" = "DROUGHT",
                                             "^WILD.*FIRE.*|^FOREST FIRE.*" = "WILDFIRE",
                                             "^LIGHTNING.*" = "LIGHTNING",
                                             "^DENSE FOG.*" = "DENSE FOG",
                                             "^ICE STORM.*" = "ICE STORM",
                                             "^AVALANCHE.*" = "AVALANCHE",
                                             "^(HIGH|STRONG) WIND.*" = "HIGH WIND",
                                             "^WATERSPOUT.*" = "WATERSPOUT",
                                             "^DUST DEVIL.*" = "DUST DEVIL",
                                             "^VOLCANIC ASH.*" = "VOLCANIC ASH",
                                             "^MARINE TSTM WIND|^MARINE THUNDERSTORM WIND" = "MARINE THUNDERSTORM WIND",
                                             "^MARINE HAIL" = "MARINE HAIL",
                                             "^MARINE HIGH WIND" = "MARINE HIGH WIND",
                                             "^MARINE STRONG WIND" = "MARINE STRONG WIND",
                                             "SUMMARY.*" = "SUMMARY"))

storm_data <- storm_data %>% filter(!grepl("SUMMARY", EVTYPE_CLEAN))

4. Calculate Numerical Economic Damage

get_multiplier <- function(exp_char) {
  exp_char <- toupper(as.character(exp_char))
  if (is.na(exp_char) || exp_char %in% c("", "+", "-", "?")) return(1)
  else if (exp_char == "H") return(100)
  else if (exp_char == "K") return(1000)
  else if (exp_char == "M") return(1000000)
  else if (exp_char == "B") return(1000000000)
  else if (grepl("^[0-9]$", exp_char)) return(10^as.numeric(exp_char))
  else return(1)
}

storm_data$PROPDMG_VALUE <- storm_data$PROPDMG * sapply(storm_data$PROPDMGEXP, get_multiplier)
storm_data$CROPDMG_VALUE <- storm_data$CROPDMG * sapply(storm_data$CROPDMGEXP, get_multiplier)
storm_data$TOTAL_ECONOMIC_DMG <- storm_data$PROPDMG_VALUE + storm_data$CROPDMG_VALUE

Analysis of Storm Impact

Question 1: Most Harmful Events to Population Health

health_impact <- storm_data %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(TotalFatalities = sum(FATALITIES, na.rm = TRUE),
            TotalInjuries = sum(INJURIES, na.rm = TRUE),
            .groups = 'drop') %>%
  mutate(TotalCasualties = TotalFatalities + TotalInjuries) %>%
  filter(TotalCasualties > 0) %>%
  arrange(desc(TotalCasualties))

kable(head(health_impact, 10), caption = "Top 10 Event Types by Total Casualties")
Top 10 Event Types by Total Casualties
EVTYPE_CLEAN TotalFatalities TotalInjuries TotalCasualties
TORNADO 5658 91364 97022
HEAT 3021 9019 12040
THUNDERSTORM WIND 708 9468 10176
FLOOD 470 6789 7259
LIGHTNING 817 5232 6049
FLASH FLOOD 1018 1785 2803
HIGH WIND 404 1772 2176
ICE STORM 89 1977 2066
WILDFIRE 90 1606 1696
WINTER STORM 217 1353 1570
top_n_health_plot <- head(health_impact, 10)
ggplot(top_n_health_plot, aes(x = reorder(EVTYPE_CLEAN, TotalCasualties), y = TotalCasualties, fill = EVTYPE_CLEAN)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top 10 Most Harmful Event Types to Population Health",
       x = "Event Type",
       y = "Total Casualties") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Question 2: Events with Greatest Economic Consequences

economic_impact <- storm_data %>%
  group_by(EVTYPE_CLEAN) %>%
  summarise(TotalPropertyDamage_USD = sum(PROPDMG_VALUE, na.rm = TRUE),
            TotalCropDamage_USD = sum(CROPDMG_VALUE, na.rm = TRUE),
            TotalCombinedDamage_USD = sum(TOTAL_ECONOMIC_DMG, na.rm = TRUE),
            .groups = 'drop') %>%
  filter(TotalCombinedDamage_USD > 0) %>%
  arrange(desc(TotalCombinedDamage_USD))

kable(head(economic_impact, 10), caption = "Top 10 Event Types by Total Economic Damage (USD)")
Top 10 Event Types by Total Economic Damage (USD)
EVTYPE_CLEAN TotalPropertyDamage_USD TotalCropDamage_USD TotalCombinedDamage_USD
FLOOD 144657709807 5661968450 150319678257
HURRICANE/TYPHOON 85356410010 5516117800 90872527810
TORNADO 58552152377 417461470 58969613847
STORM SURGE/TIDE 47964724000 855000 47965579000
HAIL 15977470013 3026094623 19003564636
FLASH FLOOD 17414731089 1437163150 18851894239
DROUGHT 1046106000 13972571780 15018677780
THUNDERSTORM WIND 9968511026 1225260988 11193772014
RIVER FLOOD 5118945500 5029459000 10148404500
ICE STORM 3944927860 5022113500 8967041360
top_n_economic_plot <- head(economic_impact, 10)
ggplot(top_n_economic_plot, aes(x = reorder(EVTYPE_CLEAN, TotalCombinedDamage_USD), 
                                y = TotalCombinedDamage_USD / 1e9, fill = EVTYPE_CLEAN)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top 10 Event Types by Economic Consequences",
       x = "Event Type",
       y = "Total Economic Damage (Billions USD)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))