Using the U.S.A. National Oceanic and Atmospheric Administration’s (NOAA) Storm Data spanning 1950 through November 2011 this report endeavours to address the following two questions:
We start by reading the raw data from NOAA’s Storm Data followed by a fair amount of text manipulation to the EVTYPE (Event Types) field. Next we aggregate the Health and Economic per severe weather event type over the span of our dataset. We then find the top Health and Economic severe weather event types. In conclusion, we report our findings using a combinations of plots and tables.
The NOAA Storm Data consists of 902,297 observations containing, among other, the following fields. For a full description of the data definition please refer to the NOAA Storm Data Documentation.
| Field | Definition |
|---|---|
EVTYPE |
Event Type |
FATALITIES |
Number of Fatalities |
INJURIES |
Number of Injuries |
PROPDMG |
USD Property Damage |
PROPDMGEXP |
USD Propery Damage Magnitude (H = hundreds, K = thousands, M = millions, B = billions) |
CROPDMG |
USD Crop Damage |
CROPDMGEXP |
USD Crop Damage Magnitude (H = hundreds, K = thousands, M = millions, B = billions) |
Start by initializing a few R libraries. Turn echo on for R code chunks, center figures and suppress messages.
require(gdata)
require(lattice)
require(knitr)
require(plyr)
require(reshape2)
require(xtable)
opts_chunk$set(echo=TRUE, fig.align='center', message=FALSE, cache=TRUE)
Check to see if we’ve already downloaded the file, if not, download it now.
filename <- 'data/StormData.csv.bz2'
dir.create(file.path(getwd(), 'data'), showWarnings = FALSE)
if (!file.exists(filename))
{
download.file(
"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
filename,
method="curl")
}
Create a function to calculate damage using the Damage Magnitude (H = hunderds, K = thousands, M = millions, B = billions) scale.
CalculateDamage <- function(dmg, exp){
if (exp == 'H') { dmg * 10^2 }
else if (exp == 'K') { dmg * 10^3 }
else if (exp == 'M') { dmg * 10^6 }
else if (exp == 'B') { dmg * 10^9 }
else { dmg }
}
Read data from StormData.csv.bz2 into dataRaw and then proceed to ready data by converting columns to the right data types and lastly calculate DMG (damage total) using CalculateDamage().
if (!exists("dataRaw")){
dataRaw <- read.csv(bzfile(filename))
data <- dataRaw[ ,c('EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP')]
data$EVTYPE <- toupper(data$EVTYPE)
data$FATALITIES > as.numeric(as.character(data$FATALITIES))
data$INJURIES > as.numeric(as.character(data$INJURIES))
data$PROPDMG > as.numeric(as.character(data$PROPDMG))
data$CROPDMG > as.numeric(as.character(data$CROPDMG))
data$PROPDMGEXP <- toupper(data$PROPDMGEXP)
data$CROPDMGEXP <- toupper(data$CROPDMGEXP)
data <- data[data$FATALITIES > 0 |data$INJURIES > 0 |
data$PROPDMG > 0 | data$CROPDMG > 0, ]
data <- cbind(data, DMG =
mapply(CalculateDamage, data$PROPDMG, data$PROPDMGEXP) +
mapply(CalculateDamage, data$CROPDMG, data$CROPDMGEXP))
}
The EVTYPE (Event Types) seems to have been populated by hand and as a result suffers from a great deal of finger trouble, spelling mistakes and inconsistent logging. Create two functions to clean up EVTYPE (Event Types).
EventTypeCleanupStartsWith <- function(eventType){
if (startsWith(eventType, "THUNDERSTORM WIND")){
if (eventType != "THUNDERSTORM WIND FLOOD" |
eventType != "THUNDERSTORM WIND HAIL" |
eventType != "THUNDERSTORM WIND LIGHTNING" |
eventType != "THUNDERSTORM WIND FUNNEL CLOU" |
eventType != "THUNDERSTORM WIND LIGHTNING" |
eventType != "THUNDERSTORM WIND TREE") {
eventType <- "THUNDERSTORM WIND"
}
}
if (startsWith(eventType, "TROPICAL STORM")){
eventType <- "TROPICAL STORM"
}
return(eventType)
}
EventTypeCleanup <- function(data) {
# Remove unwanted chars
data$EVTYPE <- gsub("[(]", " ", data$EVTYPE)
data$EVTYPE <- gsub("[)]", " ", data$EVTYPE)
data$EVTYPE <- gsub("[&]", " ", data$EVTYPE)
data$EVTYPE <- gsub("[,]", " ", data$EVTYPE)
data$EVTYPE <- gsub("[-]", " ", data$EVTYPE)
data$EVTYPE <- gsub("[/]", " ", data$EVTYPE)
data$EVTYPE <- gsub("[\\]", " ", data$EVTYPE)
data$EVTYPE <- gsub("AND", " ", data$EVTYPE)
# Drop Suffix
data$EVTYPE <- gsub(" DAMAGE$", "", data$EVTYPE)
data$EVTYPE <- gsub(" DAMAGE TO$", "", data$EVTYPE)
data$EVTYPE <- gsub("MINOR$", "", data$EVTYPE)
data$EVTYPE <- gsub("G40", "", data$EVTYPE)
data$EVTYPE <- gsub("48", "", data$EVTYPE)
# De-pluralize
data$EVTYPE <- gsub("FIRE", " FIRE", data$EVTYPE)
data$EVTYPE <- gsub("FIRES", " FIRE", data$EVTYPE)
data$EVTYPE <- gsub("FLOOD", " FLOOD", data$EVTYPE)
data$EVTYPE <- gsub("FLOODING", " FLOOD", data$EVTYPE)
data$EVTYPE <- gsub("FLOODS", " FLOOD", data$EVTYPE)
data$EVTYPE <- gsub("SLIDE", " SLIDE", data$EVTYPE)
data$EVTYPE <- gsub("SLIDES", " SLIDE", data$EVTYPE)
data$EVTYPE <- gsub("STORM", " STORM", data$EVTYPE)
data$EVTYPE <- gsub("STORMS", " STORM", data$EVTYPE)
data$EVTYPE <- gsub("SQUALLS", "SQUALL", data$EVTYPE)
data$EVTYPE <- gsub("TREES", "TREE", data$EVTYPE)
data$EVTYPE <- gsub("TORNADOES", "TORNADO", data$EVTYPE)
data$EVTYPE <- gsub("WINS", " WIND", data$EVTYPE)
data$EVTYPE <- gsub("WINDS", " WIND", data$EVTYPE)
data$EVTYPE <- gsub("WINDHAIL", "WIND HAIL", data$EVTYPE)
data$EVTYPE <- gsub("CURRENTS", "CURRENTS", data$EVTYPE)
# Reference
data$EVTYPE <- gsub("UNSEASONABLE", "UNSEASONABLY", data$EVTYPE)
# HAIL Numbers
data$EVTYPE <- gsub(" 0.75$", " 075", data$EVTYPE)
data$EVTYPE <- gsub(" 75$", " 075", data$EVTYPE)
# Address some spelling / type issues
data$EVTYPE <- gsub("THUNER", "THUNDER", data$EVTYPE)
data$EVTYPE <- gsub("THUDER", "THUNDER", data$EVTYPE)
data$EVTYPE <- gsub("TUNDER", "THUNDER", data$EVTYPE)
data$EVTYPE <- gsub("THUNDER", "THUNDER", data$EVTYPE)
data$EVTYPE <- gsub("THUNDEER", "THUNDER", data$EVTYPE)
data$EVTYPE <- gsub("THUNDERE", "THUNDER", data$EVTYPE)
data$EVTYPE <- gsub("THUNDER", "THUNDER", data$EVTYPE)
data$EVTYPE <- gsub("THUNDERTORM", "THUNDERSTORM", data$EVTYPE)
data$EVTYPE <- gsub("TSTMW", "THUNDERSTORM WIND", data$EVTYPE)
data$EVTYPE <- gsub("TSTM", "THUNDERSTORM", data$EVTYPE)
data$EVTYPE <- gsub("THUNDER STORMW", "THUNDERSTORM WIND", data$EVTYPE)
data$EVTYPE <- gsub("THUNDERSNOW", "THUNDER SNOW", data$EVTYPE)
data$EVTYPE <- gsub("TORNDAO", "TORNADO", data$EVTYPE)
data$EVTYPE <- gsub("THUNDERSTROM", "THUNDERSTORM", data$EVTYPE)
data$EVTYPE <- gsub("THUNDERSTORM", "THUNDERSTORM", data$EVTYPE)
data$EVTYPE <- gsub("THUNDER STORMW", "THUNDERSTORM WIND", data$EVTYPE)
data$EVTYPE <- gsub("THUNDER STORM", "THUNDERSTORM", data$EVTYPE)
# Clean Up spaces
data$EVTYPE <- gsub(" ", " ", data$EVTYPE)
data$EVTYPE <- gsub(" ", " ", data$EVTYPE)
data$EVTYPE <- gsub(" ", " ", data$EVTYPE)
data$EVTYPE <- gsub(" ", " ", data$EVTYPE)
data$EVTYPE <- gsub("^\\s+|\\s+$", "", data$EVTYPE)
# Clean up a few Starts With
data$EVTYPE <- mapply(EventTypeCleanupStartsWith, data$EVTYPE)
# Final Thunderstorm
data$EVTYPE <- gsub("THUNDER STORM", "THUNDERSTORM", data$EVTYPE)
return(data)
}
Create a data frame dataHltSum as a summary related to health (fatalities and injuries). Use the EventTypeCleanup() function to clean up EVTYPE and then aggregate again. Ranking the EVTYPE from a health perspective is tricky because just adding FATALITIES and INJURIES together does not make sense from a social perspective. It is clear that a fatality is more severe than an injury. But how much? For the purposes of this report we assume 10 times.
dataHlt <- data[data$FATALITIES > 0 | data$INJURIES > 0, c('EVTYPE', 'FATALITIES', 'INJURIES')]
dataHltSum <- ddply(dataHlt, .(EVTYPE), colwise(sum))
dataHltSum <- EventTypeCleanup(dataHltSum)
dataHltSum <- ddply(dataHltSum, .(EVTYPE), colwise(sum))
dataHltSum <- cbind(dataHltSum,
TOTAL=dataHltSum$FATALITIES + dataHltSum$INJURIES,
TOTAL_ADJUSTED=dataHltSum$FATALITIES + (dataHltSum$INJURIES/10))
dataHltSum <- dataHltSum[with(dataHltSum, order(-TOTAL_ADJUSTED)), ]
dataHltSum <- dataHltSum[1:20,]
Create a data frame dataDmgSum as a summary related to property damage. Use the EventTypeCleanup() function to clean up EVTYPE and then aggregate again. Order by DMG (damage) descending and select the 20 most servere weather events.
dataDmg <- data[data$DMG > 0 , c('EVTYPE','DMG')]
dataDmgSum <- ddply(dataDmg, .(EVTYPE), colwise(sum))
dataDmgSum <- EventTypeCleanup(dataDmgSum)
dataDmgSum <- ddply(dataDmgSum, .(EVTYPE), colwise(sum))
dataDmgSum <- dataDmgSum[with(dataDmgSum, order(-DMG)), ]
dataDmgSum <- dataDmgSum[1:20,]
dataDmgSum$DMG <- dataDmgSum$DMG / 10^9
Print the 20 most severe Event Types from a Public Health perspective using xtable(). Order the list descending by TOTAL_ADJUSTED which is calculated as FATALITIES + (INJURIES/10) in an effort to convey a weighed public health harm attributed to the respective event types.
dataHltSum$FATALITIES <- format(dataHltSum$FATALITIES, big.mark=',')
dataHltSum$INJURIES <- format(dataHltSum$INJURIES, big.mark=',')
dataHltSum$TOTAL <- format(dataHltSum$TOTAL, big.mark=',')
dataHltSum$TOTAL_ADJUSTED <- format(dataHltSum$TOTAL_ADJUSTED, big.mark=',')
colnames(dataHltSum) <- c(
'Event Type', 'Fatalities', 'Injuries',
'Fatalities + Injuries', 'Fatalities + Injuries (Weighted)')
print(xtable(dataHltSum), type='HTML', html.table.attributes="align='center', border='1px'", include.rownames=FALSE)
| Event Type | Fatalities | Injuries | Fatalities + Injuries | Fatalities + Injuries (Weighted) |
|---|---|---|---|---|
| TORNADO | 5,633 | 91,346 | 96,979 | 14,767.6 |
| EXCESSIVE HEAT | 1,903 | 6,525 | 8,428 | 2,555.5 |
| THUNDERSTORM WIND | 710 | 9,496 | 10,206 | 1,659.6 |
| LIGHTNING | 816 | 5,230 | 6,046 | 1,339.0 |
| FLASH FLOOD | 999 | 1,785 | 2,784 | 1,177.5 |
| FLOOD | 476 | 6,791 | 7,267 | 1,155.1 |
| HEAT | 937 | 2,100 | 3,037 | 1,147.0 |
| HIGH WIND | 283 | 1,440 | 1,723 | 427.0 |
| RIP CURRENT | 368 | 232 | 600 | 391.2 |
| WINTER STORM | 216 | 1,338 | 1,554 | 349.8 |
| ICE STORM | 89 | 1,975 | 2,064 | 286.5 |
| AVALANCHE | 224 | 170 | 394 | 241.0 |
| RIP CURRENTS | 204 | 297 | 501 | 233.7 |
| HEAVY SNOW | 127 | 1,021 | 1,148 | 229.1 |
| HEAT WAVE | 172 | 379 | 551 | 209.9 |
| HURRICANE TYPHOON | 64 | 1,275 | 1,339 | 191.5 |
| EXTREME COLD | 162 | 231 | 393 | 185.1 |
| WILD FIRE | 78 | 1,061 | 1,139 | 184.1 |
| BLIZZARD | 101 | 805 | 906 | 181.5 |
| HAIL | 15 | 1,361 | 1,376 | 151.1 |
Create a bar chart showing the 20 most severe Event Types from a Property Damage perspective.
dataDmgSum$EVTYPE <- reorder(dataDmgSum$EVTYPE, -dataDmgSum$DMG)
print(barchart(EVTYPE ~ DMG,
data=dataDmgSum[1:15,],
xlim=c(0,155),
xlab="Damage caused in USD Billion (10^9) between 1950 and November 2011",
main="Top 15 Severe Event Types from a Property Damage perspective"
))
Print the 20 most severe Event Types from a Property Damage perspective using xtable(). DMG (damage) is shown in Billion (10^9) USD
dataDmgSum$DMG <- format(dataDmgSum$DMG, big.mark=',')
colnames(dataDmgSum) <- c('Event Type', 'Damage in USD Billion (10^9)')
print(xtable(dataDmgSum), type='HTML', html.table.attributes="align='center', border='1px'", include.rownames=FALSE)
| Event Type | Damage in USD Billion (10^9) |
|---|---|
| FLOOD | 150.443 |
| HURRICANE TYPHOON | 71.914 |
| TORNADO | 57.352 |
| STORM SURGE | 43.324 |
| HAIL | 18.758 |
| FLASH FLOOD | 17.894 |
| DROUGHT | 15.019 |
| HURRICANE | 14.610 |
| THUNDERSTORM WIND | 10.986 |
| RIVER FLOOD | 10.293 |
| ICE STORM | 8.967 |
| TROPICAL STORM | 8.409 |
| WINTER STORM | 6.716 |
| HIGH WIND | 6.559 |
| WILD FIRE | 5.786 |
| STORM SURGE TIDE | 4.642 |
| HURRICANE OPAL | 3.192 |
| WILD FOREST FIRE | 3.109 |
| HEAVY RAIN SEVERE WEATHER | 2.500 |
| TORNADO THUNDERSTORM WIND HAIL | 1.603 |