Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
Since the data originally contained hundreds of different event types, they needed to be scrubbed and categorized into one of 9 classes: COLD, FIRE, FLOOD, HAIL, HEAT, OTHER, RAIN, TORNADO, WIND. Tornadoes were the primary contributors to both death and injuries, while hail contributed the lowest numbers of deaths and injuries. This would suggest a strong direct correlation between deaths and injuries in natural disasters.
From an economic perspective, floods had the strongest impact on property damage, while heat had the strongest effect on crop damage. This suggests an inverse relationship between favorable conditions for urban life, and those for agricultural productivity.
Since the source data contains over 985 different overlapping categories of natural disasters, we’ve created a classification algorithm which aggregates all events into one of 9 main classes: COLD, FIRE, FLOOD, HAIL, HEAT, OTHER, RAIN, TORNADO, WIND.
We’ve also eliminated any redundant records which did not generate any injuries, deaths, property damage or crop damage. This classification was subjective, and a different algorithm might yield different results.
set.seed(1)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
temp <- tempfile()
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(fileURL, temp)
tmpData <- read.csv(temp, sep = ",", header=TRUE)
unlink(temp)
tmpData$NEWPROPDMG <- 0
tmpData$NEWCROPDMG <- 0
tmpData$PROPDMGEXP <- as.character(tmpData$PROPDMGEXP)
tmpData$PROPDMGEXP[grep("^M$", tmpData$PROPDMGEXP)] <- 6
tmpData$PROPDMGEXP[grep("^m$", tmpData$PROPDMGEXP)] <- 6
tmpData$PROPDMGEXP[grep("^B$", tmpData$PROPDMGEXP)] <- 9
tmpData$PROPDMGEXP[grep("^b$", tmpData$PROPDMGEXP)] <- 9
tmpData$PROPDMGEXP[grep("^H$", tmpData$PROPDMGEXP)] <- 2
tmpData$PROPDMGEXP[grep("^h$", tmpData$PROPDMGEXP)] <- 2
tmpData$PROPDMGEXP[grep("^K$", tmpData$PROPDMGEXP)] <- 3
tmpData$PROPDMGEXP[grep("^k$", tmpData$PROPDMGEXP)] <- 3
tmpData$PROPDMGEXP[grep("^+$", tmpData$PROPDMGEXP)] <- 0
tmpData$PROPDMGEXP[grep("^-$", tmpData$PROPDMGEXP)] <- 0
tmpData$PROPDMGEXP[grep("^K$", tmpData$PROPDMGEXP)] <- 3
tmpData$PROPDMGEXP[tmpData$PROPDMGEXP == "?"] <- 0
tmpData$PROPDMGEXP[tmpData$PROPDMGEXP == "+"] <- 0
tmpData$PROPDMGEXP[tmpData$PROPDMGEXP == "-"] <- 0
tmpData$PROPDMGEXP[is.na(tmpData$PROPDMGEXP)] <- 0
tmpData$NEWPROPDMG <- tmpData$PROPDMG * (10 ^ as.numeric(tmpData$PROPDMGEXP))
tmpData$CROPDMGEXP <- as.character(tmpData$CROPDMGEXP)
tmpData$CROPDMGEXP[grep("^M$", tmpData$CROPDMGEXP)] <- 6
tmpData$CROPDMGEXP[grep("^m$", tmpData$CROPDMGEXP)] <- 6
tmpData$CROPDMGEXP[grep("^B$", tmpData$CROPDMGEXP)] <- 9
tmpData$CROPDMGEXP[grep("^b$", tmpData$CROPDMGEXP)] <- 9
tmpData$CROPDMGEXP[grep("^H$", tmpData$CROPDMGEXP)] <- 2
tmpData$CROPDMGEXP[grep("^h$", tmpData$CROPDMGEXP)] <- 2
tmpData$CROPDMGEXP[grep("^K$", tmpData$CROPDMGEXP)] <- 3
tmpData$CROPDMGEXP[grep("^k$", tmpData$CROPDMGEXP)] <- 3
tmpData$CROPDMGEXP[grep("^+$", tmpData$CROPDMGEXP)] <- 0
tmpData$CROPDMGEXP[grep("^-$", tmpData$CROPDMGEXP)] <- 0
tmpData$CROPDMGEXP[grep("^K$", tmpData$CROPDMGEXP)] <- 3
tmpData$CROPDMGEXP[tmpData$CROPDMGEXP == "?"] <- 0
tmpData$CROPDMGEXP[tmpData$CROPDMGEXP == "+"] <- 0
tmpData$CROPDMGEXP[tmpData$CROPDMGEXP == "-"] <- 0
tmpData$CROPDMGEXP[is.na(tmpData$CROPDMGEXP)] <- 0
tmpData$NEWCROPDMG <- tmpData$CROPDMG * (10 ^ as.numeric(tmpData$CROPDMGEXP))
filterSet <- tmpData[, c("FATALITIES", "EVTYPE", "INJURIES", "NEWPROPDMG", "NEWCROPDMG" )]
filterSet <- subset(filterSet, (FATALITIES + INJURIES + NEWPROPDMG + NEWCROPDMG) > 0)
filterSet$EVTYPE <- as.factor(as.character(toupper(filterSet$EVTYPE)))
filterSet$EVTYPE <- gsub(",", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("/", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("\" ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("EXTREME ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("HIGH ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("RECORDEXCESSIVE ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("HEAVY ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("EXCESSIVE ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("UNSEASONABLY ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("RECORD ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("HAZARDOUS ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("ABNORMALLY ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("MONTHLY ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("UNSEASONAL ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("PROLONGED ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("VERY ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("SEVERE ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub(" ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("UNSEASONABLE ", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub(" WEATHER", "", filterSet$EVTYPE)
filterSet$EVTYPE <- gsub("EXTENDED ", "", filterSet$EVTYPE)
filterSet$EVTYPE[which(filterSet$EVTYPE == "?")] <- "OTHER"
filterSet$EVTYPE[grep("TSTM", filterSet$EVTYPE)] <- "RAIN"
filterSet$EVTYPE[grep("FLOOD", filterSet$EVTYPE)] <- "FLOOD"
filterSet$EVTYPE[grep("MARINE", filterSet$EVTYPE)] <- "MARITIME"
filterSet$EVTYPE[grep("CURRENT", filterSet$EVTYPE)] <- "MARITIME"
filterSet$EVTYPE[grep("TORNADO", filterSet$EVTYPE)] <- "TORNADO"
filterSet$EVTYPE[grep("TSUNAMI", filterSet$EVTYPE)] <- "FLOOD"
filterSet$EVTYPE[grep("TIDE", filterSet$EVTYPE)] <- "FLOOD"
filterSet$EVTYPE[grep("TROPICAL", filterSet$EVTYPE)] <- "RAIN"
filterSet$EVTYPE[grep("FIRE", filterSet$EVTYPE)] <- "FIRE"
filterSet$EVTYPE[grep("HEAT WAVE", filterSet$EVTYPE)] <- "HEAT"
filterSet$EVTYPE[grep("THUNDER", filterSet$EVTYPE)] <- "RAIN"
filterSet$EVTYPE[grep("GLAZE", filterSet$EVTYPE)] <- "COLD"
filterSet$EVTYPE[grep("SNOW", filterSet$EVTYPE)] <- "COLD"
filterSet$EVTYPE[grep("WINTRY", filterSet$EVTYPE)] <- "COLD"
filterSet$EVTYPE[grep("WINDHAIL", filterSet$EVTYPE)] <- "HAIL"
filterSet$EVTYPE[grep("TURBULENCE", filterSet$EVTYPE)] <- "WIND"
filterSet$EVTYPE[grep("COLD AND", filterSet$EVTYPE)] <- "COLD"
filterSet$EVTYPE[grep("SMOKE", filterSet$EVTYPE)] <- "FIRE"
filterSet$EVTYPE[grep("DROUGHT", filterSet$EVTYPE)] <- "HEAT"
filterSet$EVTYPE[grep("FROST", filterSet$EVTYPE)] <- "COLD"
COLDgrep <- "^WINTER STORM$|^WINTER STORMS$|BLIZZARD|^COLD AND SNOW$|CHILL|^COLD$|FREEZ|^ICE STORM$|^COLDWIND CHILL$|^WINTER WEATHERMIX$|^GLAZE$|^ICY ROADS$|^ICE$|^WINTER WEATHER MIX$|^BLACK ICE$|^WINTER WEATHER$|ICE ON|PERTHERM|POTHERMI|$COLD|COLD TEMPERATURES|COLD TEMPERATURE|COLD WAVE|^ICE|LOW TEMPERATURE|WINTER"
filterSet$EVTYPE[grep(COLDgrep, filterSet$EVTYPE)] <- "COLD"
OTHERgrep <- "^MARITIME$|^WINDSEA$|MUDSLIDE|^AVALANCHE$|^LANDSLIDE$|^ROUGH SEAS$|^FOG$|^WINDSEAS$|^DENSE FOG$|^SUMMARY OF |NON WIND|NON-WIND|EROSION|SLIDE|DROWNING|SURF ADVISORY|VOLCAN|AVALANCE|MIX|LANDSL|LIGHTING|APACHE COUNTY|ROUGH SURF|ROGUE WAVE|^WAVES$|LIGNTNING|^HIGH$|$WATER$|^SEAS$|$HIGH^|^WATER$"
filterSet$EVTYPE[grep(OTHERgrep, filterSet$EVTYPE)] <- "OTHER"
WINDgrep <- "^TSTM WIND$|^DUST STORM$|^STRONG WIND$|^HURRICANE|^HURRICANETYPHOON$|^STORM SURGE$|^WIND|^WINTER STORM WINDS$|^WIND AND SEAS$|^GUSTY WINDS$|^NON-SEVERE WIND DAMAGE$|BLOWING|WIND$|WINDS$|TYPHOON|WIND (G45)"
filterSet$EVTYPE[grep(WINDgrep, filterSet$EVTYPE)] <- "WIND"
FLOODgrep <- "^$STORM SURGE|^SURF|^SURFSURF$|^URBANSML STREAM FLD$|FLDG|SEICHE|URBANSMALL|URBAN AND SMALL|SMALL STREAM URBAN|URBAN SMALL|COASTAL STORM|DAM BREAK|RAPIDLY RISING WATER|COASTAL SURGE|^SWELLS$"
filterSet$EVTYPE[grep(FLOODgrep, filterSet$EVTYPE)] <- "FLOOD"
RAINgrep <- "^THUNDERSTORM WINDS$|^LIGHTNING$|^THUNDERSTORM WIND$|^MIXED PRECIP$|^FREEZING DRIZZLE$|^FREEZING RAIN$|^RAINFALL$|COASTALSTORM|PRECIPITATION|RAIN$|RAINSURF|$RAIN|RAINFALL|RAINS$|^SHOWER$|COOL AND WET|WETNESS|RAINWEATHER|RAINSTORM"
filterSet$EVTYPE[grep(RAINgrep, filterSet$EVTYPE)] <- "RAIN"
HEATgrep <- "^EXCESSIVE HEAT$|^WARM$|^$|^WARM AND DRY$|^DRY"
filterSet$EVTYPE[grep(HEATgrep, filterSet$EVTYPE)] <- "HEAT"
TORNADOgrep <- "^DUST DEVIL$|^WATERSPOUT$|MICROBURST|FUNNEL|LIGHTNING|SPOUT|GUSTNADO|DOWNBURST|TORNDAO"
filterSet$EVTYPE[grep(TORNADOgrep, filterSet$EVTYPE)] <- "TORNADO"
HAILgrep <- "SLEET|PELLETS|HAIL"
filterSet$EVTYPE[grep(HAILgrep, filterSet$EVTYPE)] <- "HAIL"
We’ve analyzed the effective ranges of 4 different outcomes, across 7 classes of natural disasters. For each of the 4 outcomes, we’ve found the following:
plotData <- aggregate(. ~ EVTYPE, filterSet, sum)
colnames(plotData)[which(names(plotData) %in% c("NEWPROPDMG", "NEWCROPDMG"))] <- c("PROPERTY_DAMAGE", "CROP_DAMAGE")
plotDataHarm <- melt(plotData, id=c("EVTYPE", "PROPERTY_DAMAGE", "CROP_DAMAGE"))
plotDataCost <- melt(plotData, id=c("EVTYPE", "FATALITIES", "INJURIES"))
ggplot(plotDataHarm, aes(x = factor(EVTYPE), y = as.numeric(value), fill=variable) ) + labs(title= "Most Harmful To Health", y= "Number of Injuries/Deaths", x="Event") + geom_bar(stat="identity", position="dodge")
Lowest Property Damage: OTHER @ 300 Million
Lowest Crop Damage: OTHER @ 20 Million
ggplot(plotDataCost, aes(x = factor(EVTYPE), y = (as.numeric(value) / 1000000000), fill=variable) ) + labs(title= "Greatest Economic Consequences", y= "Cost (in $USD-Billions)", x="Event") + facet_wrap(~variable, nrow=2) + geom_bar(stat="identity", position="dodge") + guides(fill=FALSE)
Lowest Fatality: HAIL @ 17
Lowest Injury: HAIL @ 1371
plotDataBillion <- plotData
plotDataBillion$PROPERTY_DAMAGE <- (plotDataBillion$PROPERTY_DAMAGE / 1000000000)
plotDataBillion$CROP_DAMAGE <- (plotDataBillion$CROP_DAMAGE / 1000000000)
grid.table(plotDataBillion)