This analysis explores the NOAA Storm Database to determine the types of severe weather events that are most harmful to population health and those that have the greatest economic consequences across the United States. The analysis includes data processing steps, summary statistics, and visualizations to identify the most impactful event types.
# Load required packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Download the compressed NOAA storm dataset once and cache it in the working
# directory; later runs reuse the cached copy.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file <- "StormData.csv.bz2"
if (!file.exists(file)) {
  # Fetch into a temporary name and rename only on success, so an interrupted
  # transfer cannot leave a truncated archive that the existence check above
  # would accept on the next run. mode = "wb" requests a binary transfer.
  partial_file <- paste0(file, ".part")
  status <- download.file(url, partial_file, mode = "wb")
  if (status != 0 || !file.rename(partial_file, file)) {
    stop("Failed to download ", url, call. = FALSE)
  }
}
# Read the data; bzfile() opens the bzip2 archive as a connection for read.csv()
storm_data <- read.csv(bzfile(file))
# Display the structure of the data
str(storm_data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Keep only the event type plus the casualty and damage columns that the
# health and economic summaries below rely on
storm_data <- select(
  storm_data,
  EVTYPE,
  FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP,
  CROPDMG, CROPDMGEXP
)
# Convert property/crop damage exponent codes to numeric multipliers.
#
# exp: character vector (or single value) of exponent codes; matching is
#      case-insensitive.
# Returns a numeric vector of the same length as `exp`:
#   "H" -> 1e2, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9
#   NA, "", "0", "+", "-", "?" -> 1 (amount left unscaled)
#   anything else -> as.numeric(code), i.e. NA with a warning if non-numeric
# NOTE(review): remaining digit codes are used as literal multipliers
# (e.g. "5" -> 5); confirm whether powers of ten were intended instead.
convert_exp <- function(exp) {
  # Record missing entries before upper-casing the codes.
  is_missing <- is.na(exp)
  code <- toupper(exp)
  known <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  is_unscaled <- code %in% c("", "0", "+", "-", "?")
  is_known <- code %in% names(known)
  is_other <- !(is_missing | is_unscaled | is_known)

  multiplier <- rep(NA_real_, length(exp))
  multiplier[is_missing | is_unscaled] <- 1
  multiplier[is_known] <- unname(known[code[is_known]])
  multiplier[is_other] <- as.numeric(code[is_other])
  multiplier
}
# Replace the exponent codes with numeric multipliers and scale the damage
# figures to absolute amounts.
# Each distinct code is converted once and rows are filled by lookup, so
# convert_exp() is not called once per row (the dataset has ~900k rows).
# vapply() enforces that every conversion yields exactly one numeric value.
exp_multiplier <- function(codes) {
  distinct_codes <- unique(codes)
  multipliers <- vapply(distinct_codes, convert_exp, numeric(1),
                        USE.NAMES = FALSE)
  multipliers[match(codes, distinct_codes)]
}
storm_data <- storm_data %>%
  mutate(
    PROPDMGEXP = exp_multiplier(PROPDMGEXP),
    CROPDMGEXP = exp_multiplier(CROPDMGEXP),
    PROPDMG = PROPDMG * PROPDMGEXP,
    CROPDMG = CROPDMG * CROPDMGEXP
  )
# Total casualties per event type, ranked by deaths and then by injuries
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    TotalFatalities = sum(FATALITIES, na.rm = TRUE),
    TotalInjuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  arrange(desc(TotalFatalities), desc(TotalInjuries))
# Total property, crop, and combined damage per event type, ranked by the
# combined figure
economic_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    TotalPropDamage = sum(PROPDMG, na.rm = TRUE),
    TotalCropDamage = sum(CROPDMG, na.rm = TRUE)
  ) %>%
  mutate(TotalDamage = TotalPropDamage + TotalCropDamage) %>%
  arrange(desc(TotalDamage))
# Bar chart of the ten event types with the most fatalities, tallest bar first
top_health_impact <- top_n(health_impact, 10, wt = TotalFatalities)
ggplot(
  data = top_health_impact,
  mapping = aes(x = reorder(EVTYPE, -TotalFatalities), y = TotalFatalities)
) +
  geom_col(fill = "red") +
  # Print each total just above its bar
  geom_text(aes(label = TotalFatalities), vjust = -0.5) +
  ggtitle("Top 10 Events Causing Fatalities") +
  xlab("Event Type") +
  ylab("Total Fatalities") +
  # Slant the event names so long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar chart of the ten event types with the most injuries, tallest bar first
top_injuries_impact <- top_n(health_impact, 10, wt = TotalInjuries)
ggplot(
  data = top_injuries_impact,
  mapping = aes(x = reorder(EVTYPE, -TotalInjuries), y = TotalInjuries)
) +
  geom_col(fill = "blue") +
  # Print each total just above its bar
  geom_text(aes(label = TotalInjuries), vjust = -0.5) +
  ggtitle("Top 10 Events Causing Injuries") +
  xlab("Event Type") +
  ylab("Total Injuries") +
  # Slant the event names so long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot combined property and crop damage for the ten costliest event types,
# tallest bar first
top_economic_impact <- economic_impact %>%
  top_n(10, wt = TotalDamage)
ggplot(
  top_economic_impact,
  aes(x = reorder(EVTYPE, -TotalDamage), y = TotalDamage)
) +
  geom_col(fill = "green") +
  # Label bars in billions of dollars; the raw totals are too long to read
  # and overlap between neighbouring bars.
  geom_text(aes(label = sprintf("$%.1fB", TotalDamage / 1e9)), vjust = -0.5) +
  labs(
    title = "Top 10 Events with Greatest Economic Consequences",
    x = "Event Type",
    y = "Total Damage (USD)"
  ) +
  # Slant the event names so long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The analysis shows that tornadoes are the most harmful event type for population health, causing both the highest number of fatalities and the highest number of injuries. In terms of economic consequences, floods cause the greatest combined property and crop damage.