This report analyzes the impact of severe weather events in the United States using data from the NOAA Storm Database. The analysis focuses on identifying (1) the weather events most harmful to population health, measured by fatalities and injuries, and (2) the weather events causing the greatest economic damage.
The dataset includes storm data from 1950 to November 2011, but early records may be incomplete. Data processing consists of keeping only events with recorded fatalities, injuries, or damage, selecting the relevant variables, and converting the damage exponent codes (H, K, M, B) into numeric multipliers; event type labels are used as recorded, without consolidation. The results are presented through summaries and visualizations to assist policymakers in prioritizing disaster preparedness.
# Load required libraries
library(knitr)
library(ggplot2)
library(dplyr)
library(readr)
# Location of the bzip2-compressed NOAA storm data extract
file_path <- "repdata_data_StormData.csv.bz2"
# Load the archive directly from its compressed form; text columns are kept
# as character vectors rather than being converted to factors
storm_data <- read.csv(file = file_path, stringsAsFactors = FALSE)
# Inspect the column names and types, with a sample of values from each
str(object = storm_data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Preview the first six records of the raw data
head(storm_data, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
# Narrow the data to the event type, casualty, and damage columns, then keep
# only events that recorded at least one fatality, injury, or damage amount
storm_filtered <- storm_data %>%
  select(
    EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  ) %>%
  filter(FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)
# Total fatalities and injuries per event type, deadliest first
events_grouped <- group_by(storm_filtered, EVTYPE)
casualty_totals <- summarise(
  events_grouped,
  Total_Fatalities = sum(FATALITIES, na.rm = TRUE),
  Total_Injuries = sum(INJURIES, na.rm = TRUE)
)
health_impact <- arrange(casualty_totals, desc(Total_Fatalities))
# Show the ten event types with the most fatalities
head(health_impact, n = 10)
## # A tibble: 10 × 3
## EVTYPE Total_Fatalities Total_Injuries
## <chr> <dbl> <dbl>
## 1 TORNADO 5633 91346
## 2 EXCESSIVE HEAT 1903 6525
## 3 FLASH FLOOD 978 1777
## 4 HEAT 937 2100
## 5 LIGHTNING 816 5230
## 6 TSTM WIND 504 6957
## 7 FLOOD 470 6789
## 8 RIP CURRENT 368 232
## 9 HIGH WIND 248 1137
## 10 AVALANCHE 224 170
# Keep the ten deadliest event types. health_impact is already sorted by
# fatalities; head() returns at most ten rows, whereas indexing with 1:10
# yields all-NA filler rows when fewer than ten event types are present.
top_health_events <- head(health_impact, 10)
# Bar chart of fatalities, bars ordered from most to least deadly.
# geom_col() draws bar heights straight from the y values.
ggplot(
  top_health_events,
  aes(x = reorder(EVTYPE, -Total_Fatalities), y = Total_Fatalities)
) +
  geom_col(fill = "red") +
  # Tilt the event labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 10 Deadliest Weather Events in the U.S.",
    x = "Event Type",
    y = "Number of Fatalities"
  )
#' Convert damage exponent codes to numeric multipliers
#'
#' Vectorised: accepts a single code or a whole column of codes.
#'
#' @param exp Character vector of exponent codes; matching ignores case.
#' @return Unnamed numeric vector the same length as `exp`: 1e9 for "B",
#'   1e6 for "M", 1e3 for "K", 1e2 for "H", and 1 for every other value
#'   (blank, NA, digits, symbols).
convert_damage_exp <- function(exp) {
  multipliers <- c(B = 1e9, M = 1e6, K = 1e3, H = 1e2)
  # match() gives NA for any code that is not in the table (NA included)
  idx <- match(toupper(exp), names(multipliers))
  result <- unname(multipliers[idx])
  # Unrecognised or unspecified codes leave the amount unscaled
  result[is.na(idx)] <- 1
  result
}
# Replace each exponent code with its numeric multiplier.
# vapply() pins the result to one double per element (sapply() would return
# an empty list on empty input), and USE.NAMES = FALSE avoids attaching the
# raw code as a name to every value.
storm_filtered$PROPDMGEXP <- vapply(
  storm_filtered$PROPDMGEXP, convert_damage_exp, numeric(1),
  USE.NAMES = FALSE
)
storm_filtered$CROPDMGEXP <- vapply(
  storm_filtered$CROPDMGEXP, convert_damage_exp, numeric(1),
  USE.NAMES = FALSE
)
# Scale the recorded amounts by their multipliers to get dollar figures
storm_filtered <- mutate(
  storm_filtered,
  Property_Damage = PROPDMG * PROPDMGEXP,
  Crop_Damage = CROPDMG * CROPDMGEXP
)
# Combined economic loss per event: property plus crop damage
storm_filtered <- mutate(
  storm_filtered,
  Total_Economic_Damage = Property_Damage + Crop_Damage
)
# Sum the combined damage per event type, costliest first
damage_grouped <- group_by(storm_filtered, EVTYPE)
damage_totals <- summarise(
  damage_grouped,
  Total_Economic_Damage = sum(Total_Economic_Damage, na.rm = TRUE)
)
economic_impact <- arrange(damage_totals, desc(Total_Economic_Damage))
# Show the ten event types with the largest total damage
head(economic_impact, n = 10)
## # A tibble: 10 × 2
## EVTYPE Total_Economic_Damage
## <chr> <dbl>
## 1 FLOOD 150319678257
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57352114049.
## 4 STORM SURGE 43323541000
## 5 HAIL 18758222016.
## 6 FLASH FLOOD 17562129167.
## 7 DROUGHT 15018672000
## 8 HURRICANE 14610229010
## 9 RIVER FLOOD 10148404500
## 10 ICE STORM 8967041360
# Keep the ten costliest event types. economic_impact is already sorted by
# total damage; head() returns at most ten rows, whereas indexing with 1:10
# yields all-NA filler rows when fewer than ten event types are present.
top_economic_events <- head(economic_impact, 10)
# Bar chart of total damage, bars ordered from most to least costly.
# geom_col() draws bar heights straight from the y values.
ggplot(
  top_economic_events,
  aes(
    x = reorder(EVTYPE, -Total_Economic_Damage),
    y = Total_Economic_Damage
  )
) +
  geom_col(fill = "blue") +
  # Tilt the event labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 10 Weather Events with Highest Economic Impact in the U.S.",
    x = "Event Type",
    y = "Total Economic Damage (USD)"
  )
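The analysis of NOAA storm data shows that tornadoes are by far the most harmful events to population health (5,633 fatalities and 91,346 injuries), followed by excessive heat (1,903 fatalities) and flash floods (978 fatalities), and that floods cause the greatest economic damage (about $150 billion in combined property and crop losses), followed by hurricanes/typhoons (about $72 billion) and tornadoes (about $57 billion).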
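Further improvements could involve consolidating overlapping event type labels (for example, HEAT and EXCESSIVE HEAT, or HURRICANE and HURRICANE/TYPHOON, which are counted separately in the tables above), adjusting damage figures for inflation, and restricting the analysis to more recent years, for which the records are more complete.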