Synopsis

This analysis explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to identify:

  1. The most harmful weather events for public health (fatalities & injuries).
  2. The weather events with the greatest economic consequences (property & crop damage).

Using storm records from 1950 to 2011, this study applies data processing, aggregation, and visualization techniques in R to answer both questions.

Load Required Libraries

library(dplyr)   # For data manipulation
## Warning: package 'dplyr' was built under R version 4.2.3
library(ggplot2) # For data visualization
## Warning: package 'ggplot2' was built under R version 4.2.3
library(readr)   # For reading CSV files
## Warning: package 'readr' was built under R version 4.2.3
# Print large dollar totals in fixed notation rather than scientific notation
options(scipen = 999)

# knitr chunk defaults: figures go to figure/, code is echoed, and messages
# and warnings are suppressed in the rendered report.
knitr::opts_chunk$set(
  fig.path = "figure/",
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)

Load the Dataset

# Download the compressed dataset once; later runs reuse the local copy.
# No separate extraction step is performed: the .bz2 file is passed straight
# to read_csv() below.
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file_name <- "StormData.csv.bz2"

if (!file.exists(file_name)) {
  # Use R's default download method instead of forcing method = "curl", which
  # fails on machines without a curl executable on the PATH. mode = "wb"
  # keeps the binary archive byte-exact on Windows.
  download.file(file_url, destfile = file_name, mode = "wb")
}

# Read the data
storm_data <- read_csv(file_name)

Data Inspection

# Size of the raw table: rows, then columns
dim(storm_data)
## [1] 902297     37
# Preview the first six records
head(storm_data, n = 6L)
## # A tibble: 6 × 37
##   STATE__ BGN_DATE   BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE
##     <dbl> <chr>      <chr>    <chr>      <dbl> <chr>      <chr> <chr>      <dbl>
## 1       1 4/18/1950… 0130     CST           97 MOBILE     AL    TORNA…         0
## 2       1 4/18/1950… 0145     CST            3 BALDWIN    AL    TORNA…         0
## 3       1 2/20/1951… 1600     CST           57 FAYETTE    AL    TORNA…         0
## 4       1 6/8/1951 … 0900     CST           89 MADISON    AL    TORNA…         0
## 5       1 11/15/195… 1500     CST           43 CULLMAN    AL    TORNA…         0
## 6       1 11/15/195… 2000     CST           77 LAUDERDALE AL    TORNA…         0
## # ℹ 28 more variables: BGN_AZI <chr>, BGN_LOCATI <chr>, END_DATE <chr>,
## #   END_TIME <chr>, COUNTY_END <dbl>, COUNTYENDN <lgl>, END_RANGE <dbl>,
## #   END_AZI <chr>, END_LOCATI <chr>, LENGTH <dbl>, WIDTH <dbl>, F <dbl>,
## #   MAG <dbl>, FATALITIES <dbl>, INJURIES <dbl>, PROPDMG <dbl>,
## #   PROPDMGEXP <chr>, CROPDMG <dbl>, CROPDMGEXP <chr>, WFO <chr>,
## #   STATEOFFIC <chr>, ZONENAMES <chr>, LATITUDE <dbl>, LONGITUDE <dbl>,
## #   LATITUDE_E <dbl>, LONGITUDE_ <dbl>, REMARKS <chr>, REFNUM <dbl>

Data Cleaning and Preprocessing

# Keep only the event type plus the casualty and damage columns used below
storm_data <- select(
  storm_data,
  EVTYPE,
  FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP,
  CROPDMG, CROPDMGEXP
)

Convert Damage Multiplier Variables

# Convert damage exponent codes into numeric multipliers.
#
# exp: vector of exponent codes (case-insensitive), e.g. PROPDMGEXP.
# Returns a numeric vector the same length as `exp`:
#   "K" -> 1e3, "M" -> 1e6, "B" -> 1e9, anything else -> 1.
#
# Missing codes (NA) are also mapped to 1. Previously they yielded an NA
# multiplier, which turned the whole damage total for that record into NA
# and caused it to be silently dropped by sum(..., na.rm = TRUE).
convert_multiplier <- function(exp) {
  codes <- toupper(as.character(exp))
  lookup <- c(K = 1e3, M = 1e6, B = 1e9)

  # Name-based lookup returns NA for NA, empty, and unrecognised codes
  multiplier <- unname(lookup[codes])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Replace the exponent codes with numeric multipliers in place, then derive
# the property, crop, and combined damage amounts from them.
storm_data <- storm_data %>%
  mutate(across(c(PROPDMGEXP, CROPDMGEXP), convert_multiplier)) %>%
  mutate(
    PropertyDamage = PROPDMG * PROPDMGEXP,
    CropDamage = CROPDMG * CROPDMGEXP,
    TotalDamage = PropertyDamage + CropDamage
  )

RESULTS

Most Harmful Events to Population Health

# Sum fatalities and injuries per event type, then keep the ten event types
# with the largest combined casualty count.
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    TotalFatalities = sum(FATALITIES, na.rm = TRUE),
    TotalInjuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  arrange(desc(TotalFatalities + TotalInjuries)) %>%
  head(n = 10)

print(health_impact)
## # A tibble: 10 × 3
##    EVTYPE            TotalFatalities TotalInjuries
##    <chr>                       <dbl>         <dbl>
##  1 TORNADO                      5633         91346
##  2 EXCESSIVE HEAT               1903          6525
##  3 TSTM WIND                     504          6957
##  4 FLOOD                         470          6789
##  5 LIGHTNING                     816          5230
##  6 HEAT                          937          2100
##  7 FLASH FLOOD                   978          1777
##  8 ICE STORM                      89          1975
##  9 THUNDERSTORM WIND             133          1488
## 10 WINTER STORM                  206          1321
# Horizontal bar chart of combined fatalities and injuries for the top 10
# event types; bar fill is mapped to the fatality count.
health_impact %>%
  mutate(TotalHarm = TotalFatalities + TotalInjuries) %>%
  ggplot(
    mapping = aes(
      x = reorder(EVTYPE, TotalHarm),
      y = TotalHarm,
      fill = TotalFatalities
    )
  ) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 10 Most Harmful Weather Events (Health Impact)",
    x = "Event Type",
    y = "Total Fatalities & Injuries"
  )

Weather Events with the Greatest Economic Impact

# Sum the combined property and crop damage per event type and keep the ten
# costliest event types.
economic_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    TotalDamage = sum(TotalDamage, na.rm = TRUE)
  ) %>%
  arrange(desc(TotalDamage)) %>%
  head(n = 10)

print(economic_impact)
## # A tibble: 10 × 2
##    EVTYPE              TotalDamage
##    <chr>                     <dbl>
##  1 FLOOD             138007444500 
##  2 HURRICANE/TYPHOON  29348167800 
##  3 TORNADO            16570326363 
##  4 HURRICANE          12405268000 
##  5 RIVER FLOOD        10108369000 
##  6 HAIL               10045596620 
##  7 FLASH FLOOD         8715885183.
##  8 ICE STORM           5925150850 
##  9 STORM SURGE/TIDE    4641493000 
## 10 THUNDERSTORM WIND   3813647990
# Dot plot of total damage for the ten costliest event types, drawn
# horizontally and ordered by damage.
economic_impact %>%
  ggplot(mapping = aes(x = reorder(EVTYPE, TotalDamage), y = TotalDamage)) +
  geom_point(size = 4, color = "blue") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 10 Weather Events with Highest Economic Impact",
    x = "Event Type",
    y = "Total Damage (USD)"
  )

Conclusion