In this report, we examine data from the NOAA National Climatic Data Center Storm Events Database to determine what kind of events are (a) the deadliest, as determined by total fatalities and injuries, and (b) the costliest, as determined by total property and crop damage. Having determined the largest-magnitude events, we examine the trend over time to see which events have the fastest-increasing impact.
Download the raw data file from the internet. The ultimate sorce of these data is the NOAA National Climatic Data Center Storm Events Database.
fileUrl <- "https://d396qusza40orc.cloudfront.net/"
fileName <- "repdata%2Fdata%2FStormData.csv.bz2"
filePath <- "./"
## Download file if we don't already have it
if(!file.exists(paste(filePath, fileName, sep = ""))) {
download.file(paste(fileUrl, fileName, sep = ""),
destfile = paste(filePath, fileName, sep = ""))
}
## Read file into dataframe; note that read.csv can unzip compressed files
stormdata <- read.csv(paste(filePath, fileName, sep = ""),
stringsAsFactors = FALSE)
Now, we will perform the pre-processing and data cleaning before we can perform our data analysis:
## Convert fields BGN_DATE/END_DATE to date fields BEGDATE/ENDDATE
stormdata$BEGDATE <- strptime(stormdata$BGN_DATE, "%m/%d/%Y")
stormdata$ENDDATE <- strptime(stormdata$END_DATE, "%m/%d/%Y")
## Create a year column
stormdata$YEAR <- 1900 + stormdata$BEGDATE$year
## Calculate per-event property and crop damage
stormdata$PROPERTY <- stormdata$PROPDMG *
ifelse(stormdata$PROPDMGEXP == "K", 1e3,
ifelse(stormdata$PROPDMGEXP == "M", 1e6,
ifelse(stormdata$PROPDMGEXP == "B", 1e9,
1)))
stormdata$CROP <- stormdata$CROPDMG *
ifelse(stormdata$CROPDMGEXP == "K", 1e3,
ifelse(stormdata$CROPDMGEXP == "M", 1e6,
ifelse(stormdata$CROPDMGEXP == "B", 1e9,
1)))
## Field EVTYPE has 985 unique entries, mostly due to inconsistencies in
## recording.
## Clean EVTYPE field, creating new EVTYPE2 field
## Convert to upper case
stormdata$EVTYPE2 <- toupper(stormdata$EVTYPE)
## Remove leading/trailing spaces, and multiple spaces
stormdata$EVTYPE2 <- gsub("^ *|(?<= ) | *$", "", stormdata$EVTYPE2, perl = TRUE)
## Remove parentheses, since they are used inconsistently
stormdata$EVTYPE2 <- gsub("\\(|\\)", "", stormdata$EVTYPE2)
This simplistic cleanup on field EVTYPE results in a slight consolidation. NWS Directive 10-1605 lists 48 standard event types. We will use this to consolidate some of the event types remaining.
## If plyr package not already installed, install it, then load it
if(!require(plyr)) {install.packages("plyr")}
## Loading required package: plyr
require(plyr)
stormdata$EVTYPE2 <- revalue(stormdata$EVTYPE2,
c(COASTALSTORM = "COASTAL STORM",
"COASTAL FLOOD" = "COASTAL FLOODING",
COLD = "COLD/WIND CHILL",
"COLD TEMPERATURE" = "COLD/WIND CHILL",
"COLD WEATHER" = "COLD/WIND CHILL",
"EROSION/CSTL FLOOD" = "COASTAL FLOODING/EROSION",
"EXTREME COLD" = "EXTREME COLD/WIND CHILL",
"EXTREME WINDCHILL" = "EXTREME COLD/WIND CHILL",
"FLOOD/FLASH/FLOOD" = "FLASH FLOOD/FLOOD",
"GUSTY WINDS" = "GUSTY WIND",
"HEAT WAVE" = "HEAT",
"HIGH WINDS" = "HIGH WIND",
"HIGH WIND G40" = "HIGH WIND",
"HURRICANE EDOUARD" = "HURRICANE",
"HURRICANE/TYPHOON" = "HURRICANE",
"ICE ON ROAD" = "ICY ROADS",
"ICE ROADS" = "ICY ROADS",
"LAKE EFFECT SNOW" = "LAKE-EFFECT SNOW",
LANDSLIDES = "LANDSLIDE",
"LIGHT SNOWFALL" = "LIGHT SNOW",
"MARINE TSTM WIND" = "MARINE THUNDERSTORM WIND",
"MIXED PRECIP" = "MIXED PRECIPITATION",
"MUD SLIDE" = "MUDSLIDE",
"MUDSLIDES" = "MUDSLIDE",
"NON TSTM WIND" = "NON-TSTM WIND",
"RIP CURRENTS" = "RIP CURRENT",
"RIVER FLOODING" = "RIVER FLOOD",
"SNOW SQUALLS" = "SNOW SQUALL",
"STORM SURGE" = "STORM SURGE/TIDE",
"STRONG WINDS" = "STRONG WIND",
"THUNDERSTORM WIND G40" = "THUNDERSTORM WIND",
"TSTM WIND" = "THUNDERSTORM WIND",
"TSTM WIND 40" = "THUNDERSTORM WIND",
"TSTM WIND 41" = "THUNDERSTORM WIND",
"TSTM WIND 45" = "THUNDERSTORM WIND",
"TSTM WIND G35" = "THUNDERSTORM WIND",
"TSTM WIND G40" = "THUNDERSTORM WIND",
"TSTM WIND G45" = "THUNDERSTORM WIND",
"TYPHOON" = "HURRICANE",
"UNSEASONABLE COLD" = "UNSEASONABLY COLD",
"WINDS" = "WIND",
"WIND DAMAGE" = "WIND",
"WINTER WEATHER MIX" = "WINTER WEATHER/MIX",
"WINTRY MIX" = "WINTER WEATHER/MIX"
)
)
## Set EVTYPE2 as factor
stormdata$EVTYPE2 <- as.factor(stormdata$EVTYPE2)
We are not concerned with events that caused no death/injury or property/crop damage, so we will exclude these. Furthermore, recordkeeping was standardised in 1996, so data prior to 1/1/1996 is unlikely to be comparable to data afterwards. With more time, we might perform a separate analysis on data prior to 1996.
## Subset just data with fatalities, injuries property damage, crop damage
stormdata2 <- stormdata[stormdata$FATALITIES > 0 |
stormdata$INJURIES > 0 |
stormdata$PROPDMG > 0 |
stormdata$CROPDMG > 0, ]
## Subset data since 1/1/1996, when recordkeeping was standardized
start.date <- strptime("1/1/1996", "%m/%d/%Y")
stormdata3 <- stormdata2[stormdata2$BEGDATE >= start.date, ]
In order to determine which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health, we will aggregate and sum fatalities and injuries by event type, total them, and sort them in descending order of total.
## Aggregate fatalities and injuries by event type, and sum them
storm.death <- aggregate(cbind(stormdata3$FATALITIES, stormdata3$INJURIES)
~ stormdata3$EVTYPE2,
FUN = sum)
## Give columns in resulting table useful names
colnames(storm.death) = c("EvType", "Fatalities", "Injuries")
## Create a total column
storm.death$Total <- storm.death$Fatalities + storm.death$Injuries
## Exclude rows in which the total is zero, and order by descending total
storm.death <- storm.death[storm.death$Total > 0, ]
storm.death <- storm.death[order(-storm.death$Total), ]
To determine which types of events have the greatest economic consequences, we will aggregate and sum property and crop damage by event type, total them, and sort them in descending order of total.
## Aggregate property and crop damage by event type, and sum them
storm.damage <- aggregate(cbind(stormdata3$PROPERTY, stormdata3$CROP)
~ stormdata3$EVTYPE2,
FUN = sum)
## Give columns in resulting table useful names
colnames(storm.damage) = c("EvType", "Property", "Crop")
## Create a total column
storm.damage$Total <- storm.damage$Property + storm.damage$Crop
## Exclude rows in which the total is zero, and order by descending total
storm.damage <- storm.damage[storm.damage$Total > 0, ]
storm.damage <- storm.damage[order(-storm.damage$Total), ]
We will simply view the 9 largest-magnitude events.
The nine deadliest events are:
storm.death[1:9,]
## EvType Fatalities Injuries Total
## 114 TORNADO 1511 20667 22178
## 28 EXCESSIVE HEAT 1797 6391 8188
## 35 FLOOD 414 6758 7172
## 112 THUNDERSTORM WIND 373 5033 5406
## 82 LIGHTNING 651 4141 4792
## 33 FLASH FLOOD 887 1674 2561
## 54 HEAT 237 1292 1529
## 134 WINTER STORM 191 1292 1483
## 69 HURRICANE 125 1328 1453
We can determine how these are changing over time:
## Obtain a list of the event types for deadliest events
evtype <- storm.death[1:9, "EvType"]
## Subset data corresponding to deadliest events
stormdata4 <- stormdata3[stormdata3$EVTYPE2 %in% evtype, ]
## Aggregate fatalities and injuries by event type and year, and sum them
storm.death2 <- aggregate(cbind(stormdata4$FATALITIES, stormdata4$INJURIES)
~ stormdata4$EVTYPE2 + stormdata4$YEAR,
FUN = sum)
## Give columns in resulting table useful names
colnames(storm.death2) = c("EvType", "Year", "Fatalities", "Injuries")
## Create a total column
storm.death2$Total <- storm.death2$Fatalities + storm.death2$Injuries
## If ggplot2 package not already installed, install it, then load it
if(!require(ggplot2)) {install.packages("ggplot2")}
## Loading required package: ggplot2
require(ggplot2)
## Setup ggplot with data frame
g <- ggplot(storm.death2, aes(Year, Total))
## Add layers
p <- g + geom_point()
p <- p + geom_smooth(method="lm", se=FALSE)
p <- p + theme_bw()
p <- p + facet_grid(EvType ~ ., scales = "free")
p <- p + labs(title = "Sum of Fatalities and Injuries by Year and Event Type")
## Print plot
print(p)
It would appear that the event type increasing the most in concern is risk of death or injury from tornados.
The nine costliest events are:
storm.damage[1:9,]
## EvType Property Crop Total
## 35 FLOOD 143944833550 4974778400 148919611950
## 69 HURRICANE 81718889010 5350107800 87068996810
## 109 STORM SURGE/TIDE 47834724000 855000 47835579000
## 114 TORNADO 24616945710 283425010 24900370720
## 51 HAIL 14595143420 2476029450 17071172870
## 33 FLASH FLOOD 15222253910 1334901700 16557155610
## 22 DROUGHT 1046101000 13367566000 14413667000
## 112 THUNDERSTORM WIND 7869140380 952246350 8821386730
## 117 TROPICAL STORM 7642475550 677711000 8320186550
We can determine how these are changing over time:
## Obtain a list of the event types for costliest events
evtype2 <- storm.damage[1:9, "EvType"]
## Subset data corresponding to costliest events
stormdata5 <- stormdata3[stormdata3$EVTYPE2 %in% evtype2, ]
## Aggregate property and crop damage by event type and year, and sum them
storm.damage2 <- aggregate(cbind(stormdata5$PROPERTY, stormdata5$CROP)
~ stormdata5$EVTYPE2 + stormdata5$YEAR,
FUN = sum)
## Give columns in resulting table useful names
colnames(storm.damage2) = c("EvType", "Year", "Property", "Crop")
## Create a total column
storm.damage2$Total <- storm.damage2$Property + storm.damage2$Crop
## Setup ggplot with data frame
g <- ggplot(storm.damage2, aes(Year, Total))
## Add layers
p <- g + geom_point()
p <- p + geom_smooth(method="lm", se=FALSE)
p <- p + theme_bw()
p <- p + facet_grid(EvType ~ ., scales = "free")
p <- p + labs(y= "Total Damage (US Dollars)")
p <- p + labs(title = "Sum of Property and Crop Damage by Year and Event Type")
## Print plot
print(p)
It would appear that the event type increasing the most in concern is risk of property and crop damage from hurricanes.