Synopsis

Weather events produce many adverse effects on human health and the economy. We can mitigate the risk of loss or injury by directing investment and attention to the most consequential types of storms. This analysis looks at NOAA storm data over a 60-year period and summarizes the impact of various weather-related events.

Data Processing

Obtaining and Loading the data

This analysis is based on the NOAA data obtained from cloudfront.net on Jan 24, 2014. The raw data file has an expected MD5 checksum: 33ab0bd27d935eeefef0dd7300f800af.

library("dplyr")
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tools")      #needed to calculate the MD5 hash
library("lubridate")
library("ggplot2")
library("reshape2")
# Working directory for the project, the remote data source, and the local
# file names used for the compressed archive and the expanded CSV.
ProjectDir <- "~/R/Class5/proj2"
dataURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
dataArchive <- "repdata%2Fdata%2FStormData.csv.bz2"
dataFile <- "repdata%2Fdata%2FStormData.csv"
now <- Sys.time()

# dir.create() replaces the shell `mkdir -p`, so no external shell is needed.
# recursive = TRUE creates missing parent directories and showWarnings = FALSE
# stays quiet when the directory already exists.
dir.create(ProjectDir, recursive = TRUE, showWarnings = FALSE)
setwd(ProjectDir)

#Check to see if the expected data file exists. Download if necessary. Then show the hash signature of the file used.
if (file.exists(dataFile)){
        message("Reusing the existing data file with MD5 hash:")
} else {
        # download.file() replaces the external `wget`; mode = "wb" keeps the
        # archive byte-exact, and a non-zero status aborts the run instead of
        # silently continuing without data.
        status <- download.file(dataURL, destfile = dataArchive, mode = "wb")
        if (status != 0) {
                stop("Download failed with status ", status, ": ", dataURL, call. = FALSE)
        }
        message(paste("Got a fresh download of the data",now))

        # Expand the archive with a bzfile() connection instead of the external
        # `bunzip2`. The output goes to a temporary name first so that an
        # interrupted run cannot leave a truncated CSV that a later run would
        # mistake for a complete download.
        partFile <- paste0(dataFile, ".part")
        archiveCon <- bzfile(dataArchive, open = "rb")
        csvCon <- file(partFile, open = "wb")
        tryCatch({
                repeat {
                        chunk <- readBin(archiveCon, what = raw(), n = 1048576L)
                        if (length(chunk) == 0) break
                        writeBin(chunk, csvCon)
                }
        }, finally = {
                close(csvCon)
                close(archiveCon)
        })
        if (!file.rename(partFile, dataFile)) {
                stop("Could not move ", partFile, " to ", dataFile, call. = FALSE)
        }
        # `bunzip2` removes the archive once it has been expanded; do the same.
        file.remove(dataArchive)
}
## Reusing the existing data file with MD5 hash:
# MD5 signature of the CSV actually used, for comparison with the expected value quoted in the text.
chksum <- tools::md5sum(files = dataFile)

The computed MD5 checksum is: 33ab0bd27d935eeefef0dd7300f800af

Cleanup

The data file contains many non-standardised terms for similar weather events. For example, “light snow”, “Light Snow” and “LIGHT SNOW” are three distinct categories of weather events in the data. There are also variations in punctuation, spelling and abbreviations. The only cleanup implemented in this analysis is to convert all event types to upper case prior to grouping and summing.

# Read the CSV only when dfRaw is not already in the workspace, so re-running
# this chunk in the same session does not re-read the file.
if (!exists("dfRaw")) {
    dfRaw <- read.csv(dataFile, header = TRUE, sep = ",")
}

# Possible EVTYPE cleanups: case, punctuation, consecutive spaces, stop words
# Implemented cleanups: only converting to upper case.
# NOTE(review): PROPDMG and CROPDMG are summed exactly as stored. If the file
# carries separate magnitude columns for these figures (presumably PROPDMGEXP /
# CROPDMGEXP -- confirm against the data dictionary), they are not applied here.
melted <- dfRaw %>%
    melt(id.vars = c("EVTYPE"),
         measure.vars = c("FATALITIES", "INJURIES", "PROPDMG", "CROPDMG"))

# One row per upper-cased event type, one summed column per measure.
recast <- melted %>%
    dcast(toupper(EVTYPE) ~ variable, fun.aggregate = sum)
names(recast) <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPERTY_DAMAGE", "CROP_DAMAGE")

# OUCH = combined human harm, BOOM = combined damage; each ranking is sorted
# from worst to least on its combined column.
totals <- recast %>%
    mutate(OUCH = FATALITIES + INJURIES,
           BOOM = PROPERTY_DAMAGE + CROP_DAMAGE)
worstHealth <- totals %>% arrange(desc(OUCH))
worstDamage <- totals %>% arrange(desc(BOOM))

Results

The top ten weather events that have caused the most injuries and fatalities are

# First ten rows of the health ranking, showing the event name and its two health counts.
worstHealth[1:10, c("EVTYPE", "FATALITIES", "INJURIES")]
##               EVTYPE FATALITIES INJURIES
## 1            TORNADO       5633    91346
## 2     EXCESSIVE HEAT       1903     6525
## 3          TSTM WIND        504     6957
## 4              FLOOD        470     6789
## 5          LIGHTNING        816     5230
## 6               HEAT        937     2100
## 7        FLASH FLOOD        978     1777
## 8          ICE STORM         89     1975
## 9  THUNDERSTORM WIND        133     1488
## 10      WINTER STORM        206     1321

The top ten weather events that have caused the most property and crop damage are

# First ten rows of the damage ranking, showing the event name and its two damage sums.
worstDamage[1:10, c("EVTYPE", "PROPERTY_DAMAGE", "CROP_DAMAGE")]
##                EVTYPE PROPERTY_DAMAGE CROP_DAMAGE
## 1             TORNADO       3212258.2   100018.52
## 2         FLASH FLOOD       1420124.6   179200.46
## 3           TSTM WIND       1335995.6   109202.60
## 4                HAIL        688693.4   579596.28
## 5               FLOOD        899938.5   168037.88
## 6   THUNDERSTORM WIND        876844.2    66791.45
## 7           LIGHTNING        603351.8     3580.61
## 8  THUNDERSTORM WINDS        446293.2    18684.93
## 9           HIGH WIND        324731.6    17283.21
## 10       WINTER STORM        132720.6     1978.99

Items on both top lists

# Event types that appear in the top ten of both rankings.
intersect(worstHealth$EVTYPE[1:10], worstDamage$EVTYPE[1:10])
## [1] "TORNADO"           "TSTM WIND"         "FLOOD"            
## [4] "LIGHTNING"         "FLASH FLOOD"       "THUNDERSTORM WIND"
## [7] "WINTER STORM"

Figures

To show damage over time, each event type is summarized to the decade in which it happened.

# Event types that appear in the top ten of either ranking.
worstEvents <- union(worstHealth[seq(1,10),1],worstDamage[seq(1,10),1])

# The rankings were built on toupper(EVTYPE), so worstEvents holds upper-case
# names. Compare in upper case here as well; otherwise rows whose raw EVTYPE is
# in mixed or lower case are counted in the rankings but dropped from the plots.
dfWorst <- subset(dfRaw, toupper(EVTYPE) %in% worstEvents)

# Sum a melted measure per upper-cased event type and decade.
#   meltedDf: output of melt() with id columns EVTYPE and BGN_DATE.
# Returns one row per EVENT / DECADE / variable with the total in VALUE.
summariseByDecade <- function(meltedDf) {
    meltedDf %>%
        mutate(EVENT = toupper(EVTYPE),
               STARTDATE = mdy_hms(BGN_DATE),
               EVYEAR = year(STARTDATE),
               DECADE = trunc(EVYEAR / 10) * 10) %>%   # e.g. 1987 -> 1980
        group_by(EVENT, DECADE, variable) %>%
        summarise(VALUE = sum(value)) %>%
        ungroup()   # drop leftover grouping so later steps see a plain table
}

melted2a <- melt(data=dfWorst, id.vars=c("EVTYPE","BGN_DATE"), measure.vars=c("FATALITIES"))
melted2b <- melt(data=dfWorst, id.vars=c("EVTYPE","BGN_DATE"), measure.vars=c("PROPDMG"))
melted3a <- summariseByDecade(melted2a)
# NOTE(review): PROPDMG is summed exactly as stored; if the data has a separate
# magnitude column for it, that scaling is not applied here -- confirm.
melted3b <- summariseByDecade(melted2b)

# One panel per event type, one bar per decade.
ggplot(melted3a, aes(x=DECADE, y = VALUE)) +
    facet_wrap(~EVENT) +
    geom_bar(aes(fill = variable), stat="identity", position="dodge") +
    ggtitle("Total Fatalities by Decade")

ggplot(melted3b, aes(x=DECADE, y = VALUE)) +
    facet_wrap(~EVENT) +
    geom_bar(aes(fill = variable), stat="identity", position="dodge") +
    ggtitle("Total Property Damage by Decade")