Synopsis

This report analyzes severe weather events in the United States using the NOAA Storm Database. The goal is to determine:

  1. Which types of events are most harmful to population health (fatalities and injuries)?
  2. Which types of events cause the greatest economic damage (property and crop losses)?

The dataset covers events from 1950 to 2011, though earlier years contain fewer recorded events.

Data Processing

Load Libraries and Dataset

library(dplyr)
library(ggplot2)
library(readr)
# Remote location of the compressed storm data and the name of the local copy
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file_name <- "StormData.csv.bz2"

# Fetch the archive only when no file with that name exists yet;
# mode = "wb" asks download.file() for a binary-mode write
if (!file.exists(file_name)) {
  download.file(url = file_url, destfile = file_name, mode = "wb")
}

# Load the table, handing the .bz2 path straight to read_csv()
storm_data <- read_csv(file = file_name)

Data Inspection and Cleaning

# Inspect the freshly loaded table: dim() reports its row and column counts,
# and str() lists every column with its type and first few values.
# Lines starting with "##" are the console output recorded with the report.
dim(storm_data)
## [1] 902297     37
str(storm_data)
## spc_tbl_ [902,297 × 37] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ STATE__   : num [1:902297] 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr [1:902297] "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr [1:902297] "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr [1:902297] "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num [1:902297] 97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr [1:902297] "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr [1:902297] "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr [1:902297] "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr [1:902297] NA NA NA NA ...
##  $ BGN_LOCATI: chr [1:902297] NA NA NA NA ...
##  $ END_DATE  : chr [1:902297] NA NA NA NA ...
##  $ END_TIME  : chr [1:902297] NA NA NA NA ...
##  $ COUNTY_END: num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi [1:902297] NA NA NA NA NA NA ...
##  $ END_RANGE : num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr [1:902297] NA NA NA NA ...
##  $ END_LOCATI: chr [1:902297] NA NA NA NA ...
##  $ LENGTH    : num [1:902297] 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num [1:902297] 100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : num [1:902297] 3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num [1:902297] 0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num [1:902297] 15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num [1:902297] 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr [1:902297] "K" "K" "K" "K" ...
##  $ CROPDMG   : num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr [1:902297] NA NA NA NA ...
##  $ WFO       : chr [1:902297] NA NA NA NA ...
##  $ STATEOFFIC: chr [1:902297] NA NA NA NA ...
##  $ ZONENAMES : chr [1:902297] NA NA NA NA ...
##  $ LATITUDE  : num [1:902297] 3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num [1:902297] 8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num [1:902297] 3051 0 0 0 0 ...
##  $ LONGITUDE_: num [1:902297] 8806 0 0 0 0 ...
##  $ REMARKS   : chr [1:902297] NA NA NA NA ...
##  $ REFNUM    : num [1:902297] 1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   STATE__ = col_double(),
##   ..   BGN_DATE = col_character(),
##   ..   BGN_TIME = col_character(),
##   ..   TIME_ZONE = col_character(),
##   ..   COUNTY = col_double(),
##   ..   COUNTYNAME = col_character(),
##   ..   STATE = col_character(),
##   ..   EVTYPE = col_character(),
##   ..   BGN_RANGE = col_double(),
##   ..   BGN_AZI = col_character(),
##   ..   BGN_LOCATI = col_character(),
##   ..   END_DATE = col_character(),
##   ..   END_TIME = col_character(),
##   ..   COUNTY_END = col_double(),
##   ..   COUNTYENDN = col_logical(),
##   ..   END_RANGE = col_double(),
##   ..   END_AZI = col_character(),
##   ..   END_LOCATI = col_character(),
##   ..   LENGTH = col_double(),
##   ..   WIDTH = col_double(),
##   ..   F = col_double(),
##   ..   MAG = col_double(),
##   ..   FATALITIES = col_double(),
##   ..   INJURIES = col_double(),
##   ..   PROPDMG = col_double(),
##   ..   PROPDMGEXP = col_character(),
##   ..   CROPDMG = col_double(),
##   ..   CROPDMGEXP = col_character(),
##   ..   WFO = col_character(),
##   ..   STATEOFFIC = col_character(),
##   ..   ZONENAMES = col_character(),
##   ..   LATITUDE = col_double(),
##   ..   LONGITUDE = col_double(),
##   ..   LATITUDE_E = col_double(),
##   ..   LONGITUDE_ = col_double(),
##   ..   REMARKS = col_character(),
##   ..   REFNUM = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Keep only the event type plus the casualty and damage fields used below
storm_data <- select(
  storm_data,
  EVTYPE,
  FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP,
  CROPDMG, CROPDMGEXP
)

Convert Damage Values

# Convert damage-exponent codes into numeric multipliers.
#
# exp: one exponent code or a vector of them (character, factor, or numeric).
# Returns a numeric vector with one multiplier per element of `exp`:
#   NA or blank (after trimming whitespace)  -> 1
#   "H" / "K" / "M" / "B" (either case)      -> 1e2 / 1e3 / 1e6 / 1e9
#   a single digit "0".."9"                  -> 10^digit
#   any other code (e.g. "+", "-", "?")      -> NA, without a coercion warning
# Numeric input is treated as already converted and returned unchanged
# (NA -> 1), so re-applying the function to its own output is harmless.
#
# NOTE(review): digit codes are read as power-of-ten exponents; confirm this
# against the NOAA Storm Data documentation before relying on those rows.
convert_exp <- function(exp) {
  if (is.numeric(exp)) {
    already_numeric <- as.numeric(exp)
    already_numeric[is.na(already_numeric)] <- 1
    return(already_numeric)
  }

  # Normalise: missing -> blank, strip surrounding whitespace, upper-case
  code <- as.character(exp)
  code[is.na(code)] <- ""
  code <- toupper(trimws(code))

  # Unrecognised codes stay NA so they remain visible downstream
  multiplier <- rep(NA_real_, length(code))
  multiplier[code == ""] <- 1

  letter_values <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  is_letter <- code %in% names(letter_values)
  multiplier[is_letter] <- letter_values[code[is_letter]]

  is_digit <- grepl("^[0-9]$", code)
  multiplier[is_digit] <- 10^as.numeric(code[is_digit])

  multiplier
}

# Turn the exponent codes into numeric multipliers, then scale the raw damage
# figures into PROPERTY_DAMAGE and CROP_DAMAGE.
# vapply() with numeric(1) guarantees a plain numeric column whatever the
# input, and USE.NAMES = FALSE stops the exponent strings from being attached
# as element names to every row of the result.
storm_data <- storm_data %>%
  mutate(
    PROPDMGEXP = vapply(PROPDMGEXP, convert_exp, numeric(1), USE.NAMES = FALSE),
    CROPDMGEXP = vapply(CROPDMGEXP, convert_exp, numeric(1), USE.NAMES = FALSE),
    PROPERTY_DAMAGE = PROPDMG * PROPDMGEXP,
    CROP_DAMAGE = CROPDMG * CROPDMGEXP
  )

Results

Most Harmful Events to Population Health

# Per event type, total the fatalities and the injuries (ignoring NAs), then
# keep the ten event types with the largest combined count
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    Total_Fatalities = sum(FATALITIES, na.rm = TRUE),
    Total_Injuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Fatalities + Total_Injuries)) %>%
  slice_head(n = 10)

# Horizontal bar chart of combined fatalities and injuries for the event
# types held in health_impact.
# NOTE(review): ordering by the negated total together with coord_flip()
# appears to put the largest bar at the bottom; confirm that is intended.
health_impact %>%
  mutate(Total_Casualties = Total_Fatalities + Total_Injuries) %>%
  ggplot(aes(x = reorder(EVTYPE, -Total_Casualties), y = Total_Casualties)) +
  geom_col(fill = "red") +
  coord_flip() +
  labs(
    title = "Top 10 Most Harmful Weather Events to Population Health",
    x = "Event Type",
    y = "Total Fatalities & Injuries"
  )

Events with the Greatest Economic Consequences

# Per event type, total the property and crop damage (ignoring NAs), add the
# two into an overall figure, and keep the ten costliest event types
economic_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    Total_Property_Damage = sum(PROPERTY_DAMAGE, na.rm = TRUE),
    Total_Crop_Damage = sum(CROP_DAMAGE, na.rm = TRUE)
  ) %>%
  mutate(
    Total_Economic_Damage = Total_Property_Damage + Total_Crop_Damage
  ) %>%
  arrange(desc(Total_Economic_Damage)) %>%
  slice_head(n = 10)

# Horizontal bar chart of total economic damage for the event types held in
# economic_impact
ggplot(
  data = economic_impact,
  mapping = aes(
    x = reorder(EVTYPE, -Total_Economic_Damage),
    y = Total_Economic_Damage
  )
) +
  geom_col(fill = "blue") +
  labs(
    title = "Top 10 Most Costly Weather Events",
    x = "Event Type",
    y = "Total Economic Damage (USD)"
  ) +
  coord_flip()

Conclusion

  1. Tornadoes are the most harmful weather event to population health, causing the highest combined number of fatalities and injuries.
  2. Hurricanes and floods cause the greatest economic damage, affecting property and crops significantly.
  3. The findings suggest that preparedness efforts should focus on tornadoes for public safety and hurricanes/floods for economic protection.

Reproducibility

All code for this analysis is provided in the document. The results can be fully reproduced using the NOAA Storm Database.