The storm data shows that tornadoes are by far the worst type of storm and are the leading storm-related cause of fatalities, injuries, and property damage. Excessive heat and floods follow far behind but are also leading causes of injury and death. Leading causes of property damage, after tornadoes, are floods, flash floods, hurricanes, hail, thunderstorm wind, storm surges, and wildfires.
Drought is the leading cause of crop-damage followed by flood and hurricanes.
The storm data are loaded from repdata-data-StormData.csv.
Next, the EVTYPE (event types) field is “cleaned up” and mapped to the official event types (as described in http://www.nws.noaa.gov/directives/sym/pd01016005curr.pdf). The mapped event type is stored in a new field named OfficialEventType. The EVTYPE field is mapped first using the amatch() function (method=“soundex”). After that the mapping was done using grepl to find certain character strings that were then mapped to an official event type.
The property and crop damage fields (PROPDMG and CROPDMG, respectively) were converted to common “thousands of dollars”" fields (PropertyDamageInThousands and CropDamageInThousands) to simplify comparisons.
The data were then grouped by event type.
# DATA PROCESSING
## Load required libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
## Warning: package 'data.table' was built under R version 3.1.3
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, last
library(grid)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.1.3
library(stringdist)
## Warning: package 'stringdist' was built under R version 3.1.3
library(XLConnect)
## Warning: package 'XLConnect' was built under R version 3.1.3
## Loading required package: XLConnectJars
## Warning: package 'XLConnectJars' was built under R version 3.1.3
## XLConnect 0.2-11 by Mirai Solutions GmbH [aut],
## Martin Studer [cre],
## The Apache Software Foundation [ctb, cph] (Apache POI, Apache Commons
## Codec),
## Stephen Colebourne [ctb, cph] (Joda-Time Java library)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com
### Set working directory
setwd("D:/Diversions/CourseraCourses/DataScientist/Course5_ReproducibleResearch")
## Load the data from the bz2 file.
stormData <- read.csv(bzfile("repdata-data-StormData.csv.bz2","repdata-data-StormData.csv"))
## Process Data
### Need to map each actual EVTYPE to an official EVTYPE
### 1. Get complete list of actual EVTYPEs --> Excel
# Load the official event types from a spreadsheet created (manually) from
# http://www.nws.noaa.gov/directives/sym/pd01016005curr.pdf
eventTypes <- (readWorksheet(loadWorkbook("OfficialStormEventTypes.xlsx"),sheet=1, header=TRUE))
### Upper case EVTYPE and trim spaces
eventTypes$EVTYPE = str_trim(toupper(eventTypes$EVTYPE))
### For consistency let's upper case EVTYPE in stormData.
stormData$EVTYPE <- toupper(stormData$EVTYPE)
### Match stormData$EVTYPE to official list in eventTypes$EVTYPE --> store in OfficialEventType
### Keep original EVTYPE for validation
### First, use amatch() in stringdist package returns position of closest match in table.
stormData$EVTYPEIndex <- amatch(stormData$EVTYPE, eventTypes$EVTYPE, method="soundex", nomatch=0)
### Initialize OfficialEventType.
stormData$OfficialEventType <- ""
### Now store the value, as identified by amatch(), into stormData$OfficialEventType
stormData$OfficialEventType[stormData$EVTYPEIndex > 0] <-
eventTypes$EVTYPE[stormData$EVTYPEIndex[stormData$EVTYPEIndex > 0]]
table(stormData$OfficialEventType[stormData$OfficialEventType==""])
231327
### There are 231327 without matches.
### Looking at frequency of values that weren't matched:
### 219940 = TSTM WIND
### 3392 = URBAN/SML STREAM FLD
### 1028 = TSTM WIND/HAIL
### Use grepl to populate as many as we can of the rest.
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("MARINE TSTM WIND", stormData$EVTYPE)] <-
"MARINE THUNDERSTORM WIND"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("MARINE TSTM WIND", stormData$EVTYPE)] <- -1
### Use grepl to update rows where xxx is in any EVTYPE.
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("TSTM WIND", stormData$EVTYPE)] <-
"THUNDERSTORM WIND"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("TSTM WIND", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("WIND", stormData$EVTYPE)] <- "HIGH WIND"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("WIND", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("FIRE", stormData$EVTYPE)] <- "WILDFIRE"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("FIRE", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("SPOUT", stormData$EVTYPE)] <- "WATERSPOUT"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("SPOUT", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("LIGHTNING|LIGNTNING", stormData$EVTYPE)] <-
"LIGHTNING"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("LIGHTNING|LIGNTNING", stormData$EVTYPE)] <- -1
### Pick up any flash floods
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("FLOOD FLASH|FLOOD/FLASH", stormData$EVTYPE)] <- "FLASH FLOOD"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("FLOOD FLASH|FLOOD/FLASH", stormData$EVTYPE)] <- -1
### Now pick up any other floods. ["URBAN" has been used for urban flooding.]
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("FLOOD|FLD|URBAN", stormData$EVTYPE)] <- "FLOOD"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("FLOOD|FLD|URBAN", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("TYPHOON", stormData$EVTYPE)] <- "HURRICANE (TYPHOON)"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("TYPHOON", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("LANDSLIDE|MUD|SLIDE", stormData$EVTYPE)] <- "DEBRIS FLOW"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("LANDSLIDE|MUD|SLIDE", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("SNOW", stormData$EVTYPE)] <- "HEAVY SNOW"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("SNOW", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("FOG|VOG", stormData$EVTYPE)] <- "DENSE FOG"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("FOG|VOG", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("HAIL", stormData$EVTYPE)] <- "HAIL"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("HAIL", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("DRY", stormData$EVTYPE)] <- "DROUGHT"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("DRY", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("DUST", stormData$EVTYPE)] <- "DUST STORM"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("DUST", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("HEAVY PREC|HEAVY SHOW|HEAVY MIX", stormData$EVTYPE)] <- "HEAVY RAIN"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("HEAVY PREC|HEAVY SHOW|HEAVY MIX", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("THUNDERSTORM|TSTM", stormData$EVTYPE)] <- "THUNDERSTORM WIND"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("THUNDERSTORM|TSTM", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("HEAT|HOT|WARM", stormData$EVTYPE)] <- "HEAT"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("HEAT|HOT|WARM", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("SURF|HEAVY SWELLS|HEAVY SEAS|HIGH SWELLS|HIGH SEAS|HIGH SWELLS|HIGH TIDES|HIGH WATER|HIGH WAVES|ROUGH SEAS|ROGUE WAVE|TIDE", stormData$EVTYPE)] <- "HEAVY SURF"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("SURF|HEAVY SWELLS|HEAVY SEAS|HIGH SWELLS|HIGH SEAS|HIGH SWELLS|HIGH TIDES|HIGH WATER|HIGH WAVES|ROUGH SEAS|ROGUE WAVE|TIDE", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("RECORD HIGH|RECORD TEMP|TEMPERATURE RECORD", stormData$EVTYPE)] <- "EXCESSIVE HEAT"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("RECORD HIGH|RECORD TEMP|TEMPERATURE RECORD", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("FREEZE|ICE|GLAZE|ICY", stormData$EVTYPE)] <- "FROST/FREEZE"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("FREEZE|ICE|GLAZE|ICY", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("MICROBURST", stormData$EVTYPE)] <- "STRONG WIND"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("MICROBURST", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("COLD|COOL", stormData$EVTYPE)] <- "EXTREME COLD/WIND CHILL"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("COLD|COOL", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("RAIN|WET", stormData$EVTYPE)] <- "HEAVY RAIN"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("RAIN|WET", stormData$EVTYPE)] <- -1
stormData$OfficialEventType[stormData$EVTYPEIndex == 0 & grepl("FUNNEL|ROTATING", stormData$EVTYPE)] <- "FUNNEL CLOUD"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0 & grepl("FUNNEL|ROTATING", stormData$EVTYPE)] <- -1
###Put everything left in OTHER.
stormData$OfficialEventType[stormData$EVTYPEIndex == 0] <- "OTHER"
stormData$EVTYPEIndex[stormData$EVTYPEIndex == 0] <- -1
### Convert all property and crop damages to THOUSANDS using PROPDMGEXP and CROPDMGEXP.
### From class Discussion board:
### h/H = hundreds; k/K = thousands; m/M = millions; b/B = billions
### [1-9] = powers of 10 (e.g. 2 = 10-squared=100; 3=10-cubed = 1000
### other values - junk-- multiply by 1 (but to convert to THOUSANDS multiply by 0.001)
df <- function(n) {
x <- character(n)
y <- double(n)
for (i in 1:n) {
x[i] <- ""
y[i] <- 0.001
}
# data frame is returned
data.frame(x, y, stringsAsFactors=FALSE)
}
DamageExps <- df(length(unique(toupper(stormData$PROPDMGEXP))))
### Update x using unique values of PROPDMGEXP (CROPDMGEXP is a subset of PROPDMGEXP)
DamageExps$x <- unique(toupper(stormData$PROPDMGEXP))
### Now update the multiplier (y) (for THOUSANDS).
for (i in 1:length(DamageExps$x)) {
if (DamageExps$x[i] == "H") {DamageExps$y[i] <- 0.1}
else if (DamageExps$x[i] == "K") {DamageExps$y[i] <- 1.0}
else if (DamageExps$x[i] == "M") {DamageExps$y[i] <- 1000.0}
else if (DamageExps$x[i] == "B") {DamageExps$y[i] <- 100000.0}
else if (DamageExps$x[i] == "1") {DamageExps$y[i] <- 0.01}
else if (DamageExps$x[i] == "2") {DamageExps$y[i] <- 0.1}
else if (DamageExps$x[i] == "3") {DamageExps$y[i] <- 1.0}
else if (DamageExps$x[i] == "4") {DamageExps$y[i] <- 10.0}
else if (DamageExps$x[i] == "5") {DamageExps$y[i] <- 100.0}
else if (DamageExps$x[i] == "6") {DamageExps$y[i] <- 1000.0}
else if (DamageExps$x[i] == "7") {DamageExps$y[i] <- 10000.0}
else if (DamageExps$x[i] == "8") {DamageExps$y[i] <- 100000.0}
else if (DamageExps$x[i] == "9") {DamageExps$y[i] <- 1000000.0}
else {DamageExps$y[i] <- 0.001}
}
### Now update the PropertyDamageInThousands using the multipliers.
for (i in 1:length(DamageExps$x)) {
# Filter the stormData to this PROPDMGEXP.
# Set "PropertyDamageMultiplier" for these rows.
stormData$PropertyDamageMultiplier[toupper(stormData$PROPDMGEXP)==DamageExps$x[i]] <- DamageExps$y[i]
}
## Warning: closing unused connection 5 (repdata-data-StormData.csv.bz2)
stormData$PropertyDamageInThousands <- stormData$PropertyDamageMultiplier * stormData$PROPDMG
### Now update the CropDamageInThousands using the multipliers.
for (i in 1:length(DamageExps$x)) {
# Filter the stormData to this CROPDMGEXP.
# Set "CropDamageMultiplier" for these rows.
stormData$CropDamageMultiplier[toupper(stormData$CROPDMGEXP)==DamageExps$x[i]] <- DamageExps$y[i]
}
stormData$CropDamageInThousands <- stormData$CropDamageMultiplier * stormData$CROPDMG
## Summarize Fatalities and Injuries by Event Type
### Sort then group by Official Event Type (OfficialEventType).
stormDataByEventType <- arrange(stormData, OfficialEventType) %>% group_by(OfficialEventType)
### Total Property Damage, Crop Damage, Injuries, and Fatalities by EventType
stormDataByEventType <- summarize(stormDataByEventType
,totalPropertyDamageInThousands = sum(PropertyDamageInThousands, na.rm = TRUE)
,totalCropDamageInThousands = sum(CropDamageInThousands, na.rm = TRUE)
,totalInjuries = sum(INJURIES, na.rm=TRUE)
,totalFatalities = sum(FATALITIES, na.rm=TRUE)
,avgPropertyDamageInThousands = mean(PropertyDamageInThousands, na.rm = TRUE)
,avgCropDamageInThousands = mean(CropDamageInThousands, na.rm = TRUE)
,avgInjuries = mean(INJURIES, na.rm=TRUE)
,avgFatalities = mean(FATALITIES, na.rm=TRUE)
)
The results are plotted using one plot to show the count of fatalities and injuries by event type and a second plot to show the property damage and crop damage in thousands of dollars by event type.
# RESULTS
## Increase bottom margin to make room for rotated labels [bottom,left,top,right]
par(mar = c(8,8,2,4))
# Plot Injuries and Fatalities together.
p1 <- ggplot(stormDataByEventType) +
xlab("Official Event Type") +
ylab("Count") +
ggtitle("Offical Event Type vs Injuries & Fatalities") +
geom_point(aes(x=OfficialEventType,y = totalFatalities, shape=5)) +
geom_point(aes(x=OfficialEventType,y = totalInjuries, shape=15)) + scale_shape_identity()
p1 <- p1 + theme(axis.text.x=
element_text(size=8,angle=90,color="blue", vjust=0.75, hjust=1.0))
p1
### Use options so we don't get scientific notation on y-axis
options(scipen=3)
# Plot Property and Crop Damage together.
p2 <- ggplot(stormDataByEventType) +
xlab("Official Event Type") +
ylab("Damage ($ Thousands)") +
ggtitle("Offical Event Type vs Property & Crop Damage") +
geom_point(aes(x=OfficialEventType,y = totalPropertyDamageInThousands)) +
geom_point(aes(x=OfficialEventType,y = totalCropDamageInThousands, shape=5)) +
scale_shape_identity()
p2 <- p2 + scale_fill_continuous(guide = guide_legend(title = "V"))
p2 <- p2 + theme(axis.text.x=
element_text(size=8,angle=90,color="blue", vjust=0.75, hjust=1.0))
p2