Synopsis

In this report, we examine data from the NOAA National Climatic Data Center Storm Events Database to determine what kind of events are (a) the deadliest, as determined by total fatalities and injuries, and (b) the costliest, as determined by total property and crop damage. Having determined the largest-magnitude events, we examine the trend over time to see which events have the fastest-increasing impact.

Data Processing

Download the raw data file from the internet. The ultimate sorce of these data is the NOAA National Climatic Data Center Storm Events Database.

fileUrl <- "https://d396qusza40orc.cloudfront.net/"

fileName <- "repdata%2Fdata%2FStormData.csv.bz2"

filePath <- "./"

## Download file if we don't already have it
if(!file.exists(paste(filePath, fileName, sep = ""))) {
        download.file(paste(fileUrl, fileName, sep = ""),
                      destfile = paste(filePath, fileName, sep = ""))
}

## Read file into dataframe; note that read.csv can unzip compressed files
stormdata <- read.csv(paste(filePath, fileName, sep = ""),
                      stringsAsFactors = FALSE)

Now, we will perform the pre-processing and data cleaning before we can perform our data analysis:

## Convert fields BGN_DATE/END_DATE to date fields BEGDATE/ENDDATE
stormdata$BEGDATE <- strptime(stormdata$BGN_DATE, "%m/%d/%Y")
stormdata$ENDDATE <- strptime(stormdata$END_DATE, "%m/%d/%Y")

## Create a year column
stormdata$YEAR <- 1900 + stormdata$BEGDATE$year

## Calculate per-event property and crop damage
stormdata$PROPERTY <- stormdata$PROPDMG * 
        ifelse(stormdata$PROPDMGEXP == "K", 1e3,
               ifelse(stormdata$PROPDMGEXP == "M", 1e6,
                      ifelse(stormdata$PROPDMGEXP == "B", 1e9,
                             1)))
stormdata$CROP <- stormdata$CROPDMG * 
        ifelse(stormdata$CROPDMGEXP == "K", 1e3,
               ifelse(stormdata$CROPDMGEXP == "M", 1e6,
                      ifelse(stormdata$CROPDMGEXP == "B", 1e9,
                             1)))
## Field EVTYPE has 985 unique entries, mostly due to inconsistencies in
## recording.
## Clean EVTYPE field, creating new EVTYPE2 field

## Convert to upper case
stormdata$EVTYPE2 <- toupper(stormdata$EVTYPE)

## Remove leading/trailing spaces, and multiple spaces
stormdata$EVTYPE2 <- gsub("^ *|(?<= ) | *$", "", stormdata$EVTYPE2, perl = TRUE)

## Remove parentheses, since they are used inconsistently
stormdata$EVTYPE2 <- gsub("\\(|\\)", "", stormdata$EVTYPE2)

This simplistic cleanup on field EVTYPE results in a slight consolidation. NWS Directive 10-1605 lists 48 standard event types. We will use this to consolidate some of the event types remaining.

## If plyr package not already installed, install it, then load it
if(!require(plyr)) {install.packages("plyr")}
## Loading required package: plyr
require(plyr)

stormdata$EVTYPE2 <- revalue(stormdata$EVTYPE2, 
                             c(COASTALSTORM = "COASTAL STORM",
                               "COASTAL FLOOD" = "COASTAL FLOODING",
                               COLD = "COLD/WIND CHILL",
                               "COLD TEMPERATURE" = "COLD/WIND CHILL",
                               "COLD WEATHER" = "COLD/WIND CHILL",
                               "EROSION/CSTL FLOOD" = "COASTAL FLOODING/EROSION",
                               "EXTREME COLD" = "EXTREME COLD/WIND CHILL",
                               "EXTREME WINDCHILL" = "EXTREME COLD/WIND CHILL",
                               "FLOOD/FLASH/FLOOD" = "FLASH FLOOD/FLOOD",
                               "GUSTY WINDS" = "GUSTY WIND",
                               "HEAT WAVE" = "HEAT",
                               "HIGH WINDS" = "HIGH WIND",
                               "HIGH WIND G40" = "HIGH WIND",
                               "HURRICANE EDOUARD" = "HURRICANE",
                               "HURRICANE/TYPHOON" = "HURRICANE",
                               "ICE ON ROAD" = "ICY ROADS",
                               "ICE ROADS" = "ICY ROADS",
                               "LAKE EFFECT SNOW" = "LAKE-EFFECT SNOW",
                               LANDSLIDES = "LANDSLIDE",
                               "LIGHT SNOWFALL" = "LIGHT SNOW",
                               "MARINE TSTM WIND" = "MARINE THUNDERSTORM WIND",
                               "MIXED PRECIP" = "MIXED PRECIPITATION",
                               "MUD SLIDE" = "MUDSLIDE",
                               "MUDSLIDES" = "MUDSLIDE",
                               "NON TSTM WIND" = "NON-TSTM WIND",
                               "RIP CURRENTS" = "RIP CURRENT",
                               "RIVER FLOODING" = "RIVER FLOOD",
                               "SNOW SQUALLS" = "SNOW SQUALL",
                               "STORM SURGE" = "STORM SURGE/TIDE",
                               "STRONG WINDS" = "STRONG WIND",
                               "THUNDERSTORM WIND G40" = "THUNDERSTORM WIND",
                               "TSTM WIND" = "THUNDERSTORM WIND",
                               "TSTM WIND 40" = "THUNDERSTORM WIND",
                               "TSTM WIND 41" = "THUNDERSTORM WIND",
                               "TSTM WIND 45" = "THUNDERSTORM WIND",
                               "TSTM WIND G35" = "THUNDERSTORM WIND",
                               "TSTM WIND G40" = "THUNDERSTORM WIND",
                               "TSTM WIND G45" = "THUNDERSTORM WIND",
                               "TYPHOON" = "HURRICANE",
                               "UNSEASONABLE COLD" = "UNSEASONABLY COLD",
                               "WINDS" = "WIND",
                               "WIND DAMAGE" = "WIND",
                               "WINTER WEATHER MIX" = "WINTER WEATHER/MIX",
                               "WINTRY MIX" = "WINTER WEATHER/MIX"
                               )
                             )

## Set EVTYPE2 as factor
stormdata$EVTYPE2 <- as.factor(stormdata$EVTYPE2)

We are not concerned with events that caused no death/injury or property/crop damage, so we will exclude these. Furthermore, recordkeeping was standardised in 1996, so data prior to 1/1/1996 is unlikely to be comparable to data afterwards. With more time, we might perform a separate analysis on data prior to 1996.

## Subset just data with fatalities, injuries property damage, crop damage
stormdata2 <- stormdata[stormdata$FATALITIES > 0 | 
                                stormdata$INJURIES > 0 | 
                                stormdata$PROPDMG > 0 | 
                                stormdata$CROPDMG > 0, ]

## Subset data since 1/1/1996, when recordkeeping was standardized
start.date <- strptime("1/1/1996", "%m/%d/%Y")
stormdata3 <- stormdata2[stormdata2$BEGDATE >= start.date, ]

In order to determine which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health, we will aggregate and sum fatalities and injuries by event type, total them, and sort them in descending order of total.

## Aggregate fatalities and injuries by event type, and sum them
storm.death <- aggregate(cbind(stormdata3$FATALITIES, stormdata3$INJURIES) 
                         ~ stormdata3$EVTYPE2, 
                         FUN = sum)

## Give columns in resulting table useful names
colnames(storm.death) = c("EvType", "Fatalities", "Injuries")

## Create a total column
storm.death$Total <- storm.death$Fatalities + storm.death$Injuries

## Exclude rows in which the total is zero, and order by descending total
storm.death <- storm.death[storm.death$Total > 0, ]
storm.death <- storm.death[order(-storm.death$Total), ]

To determine which types of events have the greatest economic consequences, we will aggregate and sum property and crop damage by event type, total them, and sort them in descending order of total.

## Aggregate property and crop damage by event type, and sum them
storm.damage <- aggregate(cbind(stormdata3$PROPERTY, stormdata3$CROP) 
                          ~ stormdata3$EVTYPE2, 
                          FUN = sum)

## Give columns in resulting table useful names
colnames(storm.damage) = c("EvType", "Property", "Crop")

## Create a total column
storm.damage$Total <- storm.damage$Property + storm.damage$Crop

## Exclude rows in which the total is zero, and order by descending total
storm.damage <- storm.damage[storm.damage$Total > 0, ]
storm.damage <- storm.damage[order(-storm.damage$Total), ]

Results

We will simply view the 9 largest-magnitude events.
The nine deadliest events are:

storm.death[1:9,]
##                EvType Fatalities Injuries Total
## 114           TORNADO       1511    20667 22178
## 28     EXCESSIVE HEAT       1797     6391  8188
## 35              FLOOD        414     6758  7172
## 112 THUNDERSTORM WIND        373     5033  5406
## 82          LIGHTNING        651     4141  4792
## 33        FLASH FLOOD        887     1674  2561
## 54               HEAT        237     1292  1529
## 134      WINTER STORM        191     1292  1483
## 69          HURRICANE        125     1328  1453

We can determine how these are changing over time:

## Obtain a list of the event types for deadliest events
evtype <- storm.death[1:9, "EvType"]

## Subset data corresponding to deadliest events
stormdata4 <- stormdata3[stormdata3$EVTYPE2 %in% evtype, ]

## Aggregate fatalities and injuries by event type and year, and sum them
storm.death2 <- aggregate(cbind(stormdata4$FATALITIES, stormdata4$INJURIES) 
                          ~ stormdata4$EVTYPE2 + stormdata4$YEAR,
                          FUN = sum)

## Give columns in resulting table useful names
colnames(storm.death2) = c("EvType", "Year", "Fatalities", "Injuries")

## Create a total column
storm.death2$Total <- storm.death2$Fatalities + storm.death2$Injuries

## If ggplot2 package not already installed, install it, then load it
if(!require(ggplot2)) {install.packages("ggplot2")}
## Loading required package: ggplot2
require(ggplot2)

## Setup ggplot with data frame
g <- ggplot(storm.death2, aes(Year, Total))

## Add layers
p <- g + geom_point()
p <- p + geom_smooth(method="lm", se=FALSE)
p <- p + theme_bw()
p <- p + facet_grid(EvType ~ ., scales = "free")
p <- p + labs(title = "Sum of Fatalities and Injuries by Year and Event Type")

## Print plot
print(p)

It would appear that the event type increasing the most in concern is risk of death or injury from tornados.

The nine costliest events are:

storm.damage[1:9,]
##                EvType     Property        Crop        Total
## 35              FLOOD 143944833550  4974778400 148919611950
## 69          HURRICANE  81718889010  5350107800  87068996810
## 109  STORM SURGE/TIDE  47834724000      855000  47835579000
## 114           TORNADO  24616945710   283425010  24900370720
## 51               HAIL  14595143420  2476029450  17071172870
## 33        FLASH FLOOD  15222253910  1334901700  16557155610
## 22            DROUGHT   1046101000 13367566000  14413667000
## 112 THUNDERSTORM WIND   7869140380   952246350   8821386730
## 117    TROPICAL STORM   7642475550   677711000   8320186550

We can determine how these are changing over time:

## Obtain a list of the event types for costliest events
evtype2 <- storm.damage[1:9, "EvType"]

## Subset data corresponding to costliest events
stormdata5 <- stormdata3[stormdata3$EVTYPE2 %in% evtype2, ]

## Aggregate property and crop damage by event type and year, and sum them
storm.damage2 <- aggregate(cbind(stormdata5$PROPERTY, stormdata5$CROP)
                           ~ stormdata5$EVTYPE2 + stormdata5$YEAR,
                           FUN = sum)

## Give columns in resulting table useful names
colnames(storm.damage2) = c("EvType", "Year", "Property", "Crop")

## Create a total column
storm.damage2$Total <- storm.damage2$Property + storm.damage2$Crop

## Setup ggplot with data frame
g <- ggplot(storm.damage2, aes(Year, Total))

## Add layers
p <- g + geom_point()
p <- p + geom_smooth(method="lm", se=FALSE)
p <- p + theme_bw()
p <- p + facet_grid(EvType ~ ., scales = "free")
p <- p + labs(y= "Total Damage (US Dollars)")
p <- p + labs(title = "Sum of Property and Crop Damage by Year and Event Type")

## Print plot
print(p)

It would appear that the event type increasing the most in concern is risk of property and crop damage from hurricanes.