Synopsis

Using the U.S.A. National Oceanic and Atmospheric Administration’s (NOAA) Storm Data spanning 1950 through November 2011 this report endeavours to address the following two questions:

We start by reading the raw data from NOAA’s Storm Data followed by a fair amount of text manipulation to the EVTYPE (Event Types) field. Next we aggregate the Health and Economic per severe weather event type over the span of our dataset. We then find the top Health and Economic severe weather event types. In conclusion, we report our findings using a combinations of plots and tables.

Data Dictionary

The NOAA Storm Data consists of 902,297 observations containing, among other, the following fields. For a full description of the data definition please refer to the NOAA Storm Data Documentation.

Field Definition
EVTYPE Event Type
FATALITIES Number of Fatalities
INJURIES Number of Injuries
PROPDMG USD Property Damage
PROPDMGEXP USD Propery Damage Magnitude (H = hundreds, K = thousands, M = millions, B = billions)
CROPDMG USD Crop Damage
CROPDMGEXP USD Crop Damage Magnitude (H = hundreds, K = thousands, M = millions, B = billions)

Data Processing

Start by initializing a few R libraries. Turn echo on for R code chunks, center figures and suppress messages.

require(gdata)
require(lattice)
require(knitr)
require(plyr)
require(reshape2)
require(xtable)
opts_chunk$set(echo=TRUE, fig.align='center', message=FALSE, cache=TRUE)

Check to see if we’ve already downloaded the file, if not, download it now.

filename <- 'data/StormData.csv.bz2'
dir.create(file.path(getwd(), 'data'), showWarnings = FALSE)
if (!file.exists(filename))
{
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", 
    filename, 
    method="curl")
}

Create a function to calculate damage using the Damage Magnitude (H = hunderds, K = thousands, M = millions, B = billions) scale.

CalculateDamage <- function(dmg, exp){
  if (exp == 'H') { dmg * 10^2 }  
  else if (exp == 'K') { dmg * 10^3 }
  else if (exp == 'M') { dmg * 10^6 }
  else if (exp == 'B') { dmg * 10^9 }
  else { dmg }
}

Read data from StormData.csv.bz2 into dataRaw and then proceed to ready data by converting columns to the right data types and lastly calculate DMG (damage total) using CalculateDamage().

if (!exists("dataRaw")){
  dataRaw <- read.csv(bzfile(filename))
  
  data <- dataRaw[ ,c('EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP')]
  data$EVTYPE <- toupper(data$EVTYPE)
  data$FATALITIES > as.numeric(as.character(data$FATALITIES))
  data$INJURIES > as.numeric(as.character(data$INJURIES))
  data$PROPDMG > as.numeric(as.character(data$PROPDMG))
  data$CROPDMG > as.numeric(as.character(data$CROPDMG))
  data$PROPDMGEXP <- toupper(data$PROPDMGEXP) 
  data$CROPDMGEXP <- toupper(data$CROPDMGEXP)  
  
  data <- data[data$FATALITIES > 0 |data$INJURIES > 0 |
                 data$PROPDMG > 0 | data$CROPDMG > 0, ]
  
  data <- cbind(data, DMG = 
                  mapply(CalculateDamage, data$PROPDMG, data$PROPDMGEXP) +
                  mapply(CalculateDamage, data$CROPDMG, data$CROPDMGEXP))
}

The EVTYPE (Event Types) seems to have been populated by hand and as a result suffers from a great deal of finger trouble, spelling mistakes and inconsistent logging. Create two functions to clean up EVTYPE (Event Types).

EventTypeCleanupStartsWith <- function(eventType){
  if (startsWith(eventType, "THUNDERSTORM WIND")){
    if (eventType != "THUNDERSTORM WIND FLOOD" |
          eventType != "THUNDERSTORM WIND HAIL" |
          eventType != "THUNDERSTORM WIND LIGHTNING" |
          eventType != "THUNDERSTORM WIND FUNNEL CLOU" |
          eventType != "THUNDERSTORM WIND LIGHTNING" |
          eventType != "THUNDERSTORM WIND TREE") {
      eventType <- "THUNDERSTORM WIND"
    }
  }
  
  if (startsWith(eventType, "TROPICAL STORM")){
    eventType <- "TROPICAL STORM"
  }
  
  return(eventType)
}

EventTypeCleanup <- function(data) {
  # Remove unwanted chars 
  data$EVTYPE <- gsub("[(]", " ", data$EVTYPE)
  data$EVTYPE <- gsub("[)]", " ", data$EVTYPE)
  data$EVTYPE <- gsub("[&]", " ", data$EVTYPE)
  data$EVTYPE <- gsub("[,]", " ", data$EVTYPE)
  data$EVTYPE <- gsub("[-]", " ", data$EVTYPE)
  data$EVTYPE <- gsub("[/]", " ", data$EVTYPE)
  data$EVTYPE <- gsub("[\\]", " ", data$EVTYPE)
  data$EVTYPE <- gsub("AND", " ", data$EVTYPE)
  
  # Drop Suffix
  data$EVTYPE <- gsub(" DAMAGE$", "", data$EVTYPE)
  data$EVTYPE <- gsub(" DAMAGE TO$", "", data$EVTYPE)
  data$EVTYPE <- gsub("MINOR$", "", data$EVTYPE)
  data$EVTYPE <- gsub("G40", "", data$EVTYPE)
  data$EVTYPE <- gsub("48", "", data$EVTYPE)
  
  # De-pluralize
  data$EVTYPE <- gsub("FIRE", " FIRE", data$EVTYPE)
  data$EVTYPE <- gsub("FIRES", " FIRE", data$EVTYPE)
  data$EVTYPE <- gsub("FLOOD", " FLOOD", data$EVTYPE)
  data$EVTYPE <- gsub("FLOODING", " FLOOD", data$EVTYPE)
  data$EVTYPE <- gsub("FLOODS", " FLOOD", data$EVTYPE)
  data$EVTYPE <- gsub("SLIDE", " SLIDE", data$EVTYPE)
  data$EVTYPE <- gsub("SLIDES", " SLIDE", data$EVTYPE)
  data$EVTYPE <- gsub("STORM", " STORM", data$EVTYPE)
  data$EVTYPE <- gsub("STORMS", " STORM", data$EVTYPE)
  data$EVTYPE <- gsub("SQUALLS", "SQUALL", data$EVTYPE)
  data$EVTYPE <- gsub("TREES", "TREE", data$EVTYPE)
  data$EVTYPE <- gsub("TORNADOES", "TORNADO", data$EVTYPE)
  data$EVTYPE <- gsub("WINS", " WIND", data$EVTYPE)
  data$EVTYPE <- gsub("WINDS", " WIND", data$EVTYPE)
  data$EVTYPE <- gsub("WINDHAIL", "WIND HAIL", data$EVTYPE)
  data$EVTYPE <- gsub("CURRENTS", "CURRENTS", data$EVTYPE)
  
  # Reference
  data$EVTYPE <- gsub("UNSEASONABLE", "UNSEASONABLY", data$EVTYPE)
  
  # HAIL Numbers
  data$EVTYPE <- gsub(" 0.75$", " 075", data$EVTYPE)
  data$EVTYPE <- gsub(" 75$", " 075", data$EVTYPE)
  
  # Address some spelling / type issues
  data$EVTYPE <- gsub("THUNER", "THUNDER", data$EVTYPE)
  data$EVTYPE <- gsub("THUDER", "THUNDER", data$EVTYPE)
  data$EVTYPE <- gsub("TUNDER", "THUNDER", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDER", "THUNDER", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDEER", "THUNDER", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDERE", "THUNDER", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDER", "THUNDER", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDERTORM", "THUNDERSTORM", data$EVTYPE)
  data$EVTYPE <- gsub("TSTMW", "THUNDERSTORM WIND", data$EVTYPE)
  data$EVTYPE <- gsub("TSTM", "THUNDERSTORM", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDER STORMW", "THUNDERSTORM WIND", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDERSNOW", "THUNDER SNOW", data$EVTYPE)
  data$EVTYPE <- gsub("TORNDAO", "TORNADO", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDERSTROM", "THUNDERSTORM", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDERSTORM", "THUNDERSTORM", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDER STORMW", "THUNDERSTORM WIND", data$EVTYPE)
  data$EVTYPE <- gsub("THUNDER STORM", "THUNDERSTORM", data$EVTYPE)
  
  # Clean Up spaces
  data$EVTYPE <- gsub("  ", " ", data$EVTYPE)
  data$EVTYPE <- gsub("  ", " ", data$EVTYPE)
  data$EVTYPE <- gsub("  ", " ", data$EVTYPE)
  data$EVTYPE <- gsub("  ", " ", data$EVTYPE)
  data$EVTYPE <- gsub("^\\s+|\\s+$", "", data$EVTYPE)
  
  # Clean up a few Starts With
  data$EVTYPE <- mapply(EventTypeCleanupStartsWith, data$EVTYPE)
  
  # Final Thunderstorm
  data$EVTYPE <- gsub("THUNDER STORM", "THUNDERSTORM", data$EVTYPE)
  
  return(data)
}

Create a data frame dataHltSum as a summary related to health (fatalities and injuries). Use the EventTypeCleanup() function to clean up EVTYPE and then aggregate again. Ranking the EVTYPE from a health perspective is tricky because just adding FATALITIES and INJURIES together does not make sense from a social perspective. It is clear that a fatality is more severe than an injury. But how much? For the purposes of this report we assume 10 times.

dataHlt <- data[data$FATALITIES > 0 | data$INJURIES > 0, c('EVTYPE', 'FATALITIES', 'INJURIES')]
dataHltSum <- ddply(dataHlt, .(EVTYPE), colwise(sum))
dataHltSum <- EventTypeCleanup(dataHltSum)
dataHltSum <- ddply(dataHltSum, .(EVTYPE), colwise(sum))

dataHltSum <- cbind(dataHltSum, 
                    TOTAL=dataHltSum$FATALITIES + dataHltSum$INJURIES,
                    TOTAL_ADJUSTED=dataHltSum$FATALITIES + (dataHltSum$INJURIES/10))

dataHltSum <- dataHltSum[with(dataHltSum, order(-TOTAL_ADJUSTED)), ]
dataHltSum <- dataHltSum[1:20,] 

Create a data frame dataDmgSum as a summary related to property damage. Use the EventTypeCleanup() function to clean up EVTYPE and then aggregate again. Order by DMG (damage) descending and select the 20 most servere weather events.

dataDmg <- data[data$DMG > 0 , c('EVTYPE','DMG')]
dataDmgSum <- ddply(dataDmg, .(EVTYPE), colwise(sum))
dataDmgSum <- EventTypeCleanup(dataDmgSum)
dataDmgSum <- ddply(dataDmgSum, .(EVTYPE), colwise(sum))
dataDmgSum <- dataDmgSum[with(dataDmgSum, order(-DMG)), ]
dataDmgSum <- dataDmgSum[1:20,] 
dataDmgSum$DMG <- dataDmgSum$DMG / 10^9

Results

Which types of events are most harmful with respect to population health?

Print the 20 most severe Event Types from a Public Health perspective using xtable(). Order the list descending by TOTAL_ADJUSTED which is calculated as FATALITIES + (INJURIES/10) in an effort to convey a weighed public health harm attributed to the respective event types.

dataHltSum$FATALITIES <- format(dataHltSum$FATALITIES, big.mark=',')
dataHltSum$INJURIES <- format(dataHltSum$INJURIES, big.mark=',')
dataHltSum$TOTAL <- format(dataHltSum$TOTAL, big.mark=',')
dataHltSum$TOTAL_ADJUSTED <- format(dataHltSum$TOTAL_ADJUSTED, big.mark=',')
colnames(dataHltSum) <- c(
  'Event Type', 'Fatalities', 'Injuries', 
  'Fatalities + Injuries', 'Fatalities + Injuries (Weighted)')
print(xtable(dataHltSum), type='HTML', html.table.attributes="align='center', border='1px'", include.rownames=FALSE)
Event Type Fatalities Injuries Fatalities + Injuries Fatalities + Injuries (Weighted)
TORNADO 5,633 91,346 96,979 14,767.6
EXCESSIVE HEAT 1,903 6,525 8,428 2,555.5
THUNDERSTORM WIND 710 9,496 10,206 1,659.6
LIGHTNING 816 5,230 6,046 1,339.0
FLASH FLOOD 999 1,785 2,784 1,177.5
FLOOD 476 6,791 7,267 1,155.1
HEAT 937 2,100 3,037 1,147.0
HIGH WIND 283 1,440 1,723 427.0
RIP CURRENT 368 232 600 391.2
WINTER STORM 216 1,338 1,554 349.8
ICE STORM 89 1,975 2,064 286.5
AVALANCHE 224 170 394 241.0
RIP CURRENTS 204 297 501 233.7
HEAVY SNOW 127 1,021 1,148 229.1
HEAT WAVE 172 379 551 209.9
HURRICANE TYPHOON 64 1,275 1,339 191.5
EXTREME COLD 162 231 393 185.1
WILD FIRE 78 1,061 1,139 184.1
BLIZZARD 101 805 906 181.5
HAIL 15 1,361 1,376 151.1


Which types of events have the greatest economic consequences?

Create a bar chart showing the 20 most severe Event Types from a Property Damage perspective.

dataDmgSum$EVTYPE <- reorder(dataDmgSum$EVTYPE, -dataDmgSum$DMG) 
print(barchart(EVTYPE ~ DMG, 
               data=dataDmgSum[1:15,], 
               xlim=c(0,155),
               xlab="Damage caused in USD Billion (10^9) between 1950 and November 2011",
               main="Top 15 Severe Event Types from a Property Damage perspective"
               ))

plot of chunk unnamed-chunk-9

Print the 20 most severe Event Types from a Property Damage perspective using xtable(). DMG (damage) is shown in Billion (10^9) USD

dataDmgSum$DMG <- format(dataDmgSum$DMG, big.mark=',')
colnames(dataDmgSum) <- c('Event Type', 'Damage in USD Billion (10^9)')

print(xtable(dataDmgSum), type='HTML', html.table.attributes="align='center', border='1px'", include.rownames=FALSE)
Event Type Damage in USD Billion (10^9)
FLOOD 150.443
HURRICANE TYPHOON 71.914
TORNADO 57.352
STORM SURGE 43.324
HAIL 18.758
FLASH FLOOD 17.894
DROUGHT 15.019
HURRICANE 14.610
THUNDERSTORM WIND 10.986
RIVER FLOOD 10.293
ICE STORM 8.967
TROPICAL STORM 8.409
WINTER STORM 6.716
HIGH WIND 6.559
WILD FIRE 5.786
STORM SURGE TIDE 4.642
HURRICANE OPAL 3.192
WILD FOREST FIRE 3.109
HEAVY RAIN SEVERE WEATHER 2.500
TORNADO THUNDERSTORM WIND HAIL 1.603