Synopsis

This analysis explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to identify the types of severe weather events that are most harmful to population health and have the greatest economic consequences. The data spans events from 1950 to 2011. The raw dataset is cleaned, and only events with injuries, fatalities, or economic damage are considered. Events are grouped and standardized for clarity. Health impacts are evaluated by summing injuries and fatalities, while economic impacts are measured by property and crop damage. The results indicate tornadoes cause the most health-related harm, while floods result in the highest economic damage.

Data Processing

Data Import

# Load the raw storm dataset from the working directory.
# stringsAsFactors is pinned to FALSE so text columns (EVTYPE, PROPDMGEXP,
# CROPDMGEXP) are read as character on every R version; before R 4.0.0 the
# default was TRUE, which would hand factors to the later cleaning steps.
data <- read.csv("repdata_data_StormData.csv", stringsAsFactors = FALSE)

Data Cleaning

# Keep only the columns used by the health and economic analyses.
data <- data[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

# Keep only events that report at least one fatality, injury, or some
# property/crop damage. which() also drops rows where the test is NA.
data <- data[which(data$FATALITIES != 0 | data$INJURIES != 0 | data$PROPDMG != 0 | data$CROPDMG != 0), ]

# Normalise the event labels: strip periods, then upper-case everything.
data$EVTYPE <- toupper(gsub(".", "", data$EVTYPE, fixed = TRUE))

# Remove qualifier words so that only the event keyword remains.
# fixed = TRUE makes every entry a literal string; without it "?" is a regex
# quantifier and literal question marks are never removed.
# NOTE(review): removal is by substring, so e.g. "LOW" is also stripped from
# inside longer words such as "BLOWING" - confirm this is acceptable.
noise_words <- c("AGRICULTURAL", "UNSEASONABLE", "RAPIDLY RISING", "GUSTY", "?",
                 "ASTRONOMICAL", "LOW", "HIGH", "HAZARDOUS", "MARINE", "BRUSH")
for (word in noise_words) {
  data$EVTYPE <- gsub(word, "", data$EVTYPE, fixed = TRUE)
}

# Remove leading and trailing whitespace left behind by the removals.
data$EVTYPE <- trimws(data$EVTYPE)

# Unify event types: any label containing the pattern (name) is replaced by
# the category (value). Rules are applied top to bottom and ORDER MATTERS:
# a label rewritten by an earlier rule is what later rules see (e.g.
# "THUNDERSTORM WIND" becomes "THUNDERSTORM" before the "WIND" rule runs).
# NOTE(review): the final "STORM" rule is a catch-all that files every
# remaining "... STORM" label under THUNDERSTORM - confirm this is intended.
evtype_rules <- c(
  "THUNDERSTORM" = "THUNDERSTORM",
  "TSTM"         = "THUNDERSTORM",
  "THUDERSTORM"  = "THUNDERSTORM",
  "TUNDERSTORM"  = "THUNDERSTORM",
  "THUNDERSTROM" = "THUNDERSTORM",
  "THUNDERTORM"  = "THUNDERSTORM",
  "HAIL"         = "HAIL",
  "SNOW"         = "SNOW",
  "TORNADO"      = "TORNADO",
  "FIRE"         = "FIRE",
  "TORNDAO"      = "TORNADO",
  "GUSTNADO"     = "TORNADO",
  "WIND"         = "WIND",
  "ICE"          = "ICE",
  "FREEZ"        = "ICE",
  "FLOOD"        = "FLOOD",
  "HURRICANE"    = "HURRICANE",
  "TYPHOON"      = "HURRICANE",
  "SURGE"        = "FLOOD",
  "STORM"        = "THUNDERSTORM"
)
for (pattern in names(evtype_rules)) {
  data$EVTYPE[grepl(pattern, data$EVTYPE, fixed = TRUE)] <- evtype_rules[[pattern]]
}

Filtering out rare event types

# Event types that occur 5 times or fewer are treated as noise and dropped.
# Tabulate once and reuse the counts.
evtype_counts <- table(data$EVTYPE)
EVTYPE_removal <- names(evtype_counts[evtype_counts <= 5])

# Share of rows affected by the removal (well below 0.1 % of the dataset,
# see the printed value).
is_rare <- data$EVTYPE %in% EVTYPE_removal
removal_percentage <- sum(is_rare) / nrow(data) * 100
print(paste("Removed entry percentage:", removal_percentage))
## [1] "Removed entry percentage: 0.0475193710163255"
data <- data[!is_rare, ]

Results

Events most harmful to population health

# Total fatalities and total injuries per event type.
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = data, FUN = sum)
injuries <- aggregate(INJURIES ~ EVTYPE, data = data, FUN = sum)

# Combine both totals into one table. The overall health impact is the plain
# sum, i.e. one fatality and one injury are given the same weight.
health <- merge(fatalities, injuries, by = "EVTYPE")
health$TOTAL <- with(health, FATALITIES + INJURIES)

# Rank event types from most to least harmful and keep the ten worst.
health_sorted <- health[order(health$TOTAL, decreasing = TRUE), ]
top_health <- head(health_sorted, 10)

# Horizontal bar chart. The rows are reversed so the most harmful event type
# is drawn at the top; the wider left margin leaves room for the labels.
par(mar = c(5.1, 8.5, 4.1, 2.1))
with(
  top_health[rev(seq_len(nrow(top_health))), ],
  barplot(TOTAL,
          names.arg = EVTYPE,
          horiz = TRUE,
          las = 1,
          col = "firebrick",
          main = "Top 10 Weather Events Impacting Population Health",
          xlab = "Total Fatalities and Injuries")
)

Events with greatest economic consequences

# Drop every row whose property or crop damage exponent is one of the
# ambiguous codes; such a row is removed if either column carries one.
unclear_exp <- c("-", "+", "?", "0")
has_clear_exp <- !(data$PROPDMGEXP %in% unclear_exp) & !(data$CROPDMGEXP %in% unclear_exp)
data <- data[has_clear_exp, ]

# Convert damage exponent codes into numeric multipliers.
#
# exp: vector of exponent codes (character, or factor - factors are converted
#      to their labels first, so a digit code "5" is read as 5 and not as the
#      factor's internal level number).
#
# Returns a numeric vector of the same length as exp:
#   "H"/"h" -> 1e2, "K"/"k" -> 1e3, "M"/"m" -> 1e6, "B"/"b" -> 1e9,
#   ""      -> 1,
#   anything else is parsed as a number n and mapped to 10^n;
#   codes that cannot be parsed (and NA inputs) yield NA.
exp_convert <- function(exp) {
  exp <- as.character(exp)
  code <- toupper(exp)

  # Letter codes are resolved through a lookup table; unknown codes give NA.
  letter_multipliers <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  multiplier <- unname(letter_multipliers[code])

  # An empty code means "no scaling".
  multiplier[!is.na(code) & code == ""] <- 1

  # Everything still unresolved is treated as a power of ten. Only these
  # entries are parsed, so the suppressed coercion warnings are limited to
  # codes that are genuinely not numeric.
  unresolved <- is.na(multiplier) & !is.na(code)
  multiplier[unresolved] <- suppressWarnings(10^as.numeric(exp[unresolved]))

  multiplier
}

# Replace the exponent codes in both EXP columns by their numeric multipliers.
exp_columns <- c("PROPDMGEXP", "CROPDMGEXP")
data[exp_columns] <- lapply(data[exp_columns], exp_convert)

# Damage in dollars = reported figure times its multiplier; total damage is
# property plus crop damage.
data$PROPDMGMULT <- with(data, PROPDMG * PROPDMGEXP)
data$CROPDMGMULT <- with(data, CROPDMG * CROPDMGEXP)
data$TOTALDMG <- with(data, PROPDMGMULT + CROPDMGMULT)

# Total damage per event type, ranked from highest to lowest; keep the top 10.
dmg_summary <- aggregate(TOTALDMG ~ EVTYPE, data = data, FUN = sum)
econ_sorted <- dmg_summary[order(-dmg_summary[["TOTALDMG"]]), ]
top_economic <- head(econ_sorted, 10)

# Horizontal bar chart in billions of USD. The rows are reversed so the
# costliest event type is drawn at the top; the wider left margin leaves room
# for the labels.
par(mar = c(5.1, 8.5, 4.1, 2.1))
with(
  top_economic[rev(seq_len(nrow(top_economic))), ],
  barplot(TOTALDMG / 1e9,
          names.arg = EVTYPE,
          horiz = TRUE,
          las = 1,
          col = "darkblue",
          main = "Top 10 Weather Events by Economic Damage",
          xlab = "Total Damage (in Billions USD)")
)