Often severe weather in the United States causes significant damage to public health in the form of injury, mortality, property and foodstuff destruction. The National Oceanic Atmospheric Administration has been keeping records across the United States for well over fifty years as it relates to severe weather and the financial and human impact. Our report is an exploration of such a dataset that span datum collected from as far back as the 1950’s all the way to the 2000’s. We will explore both fiscal and human impacts as a result of severe weather.
require("knitr")
time <- format(Sys.time(), "%a %b %d %X %Y")
opts_chunk$set(echo=TRUE, results = "show", warning = FALSE, message = FALSE)
require("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require("ggplot2")
## Loading required package: ggplot2
require("lubridate")
## Loading required package: lubridate
require("data.table")
## Loading required package: data.table
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, mday, month, quarter, wday, week, yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, last
require("gridExtra")
## Loading required package: gridExtra
Establish global options and libraries
#check to see if the file exists and if not then get it and create a df
if(!file.exists("storm_data.csv.bz2")){
tempstorm<-tempfile()
download.file(
"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
destfile = "storm_data.csv.bz2")
unlink(tempstorm)
events <- read.csv(
"storm_data.csv.bz2"
, sep = ","
, stringsAsFactors = FALSE)
} else
if(file.exists("storm_data.csv.bz2")){
events <- read.csv(
"storm_data.csv.bz2"
, sep = ","
, stringsAsFactors = FALSE)
} else stop("You haven't followed the instructions and extracted the raw data and the code to the same working directory. Please start over.")
#K = 1,000, M = 1,000,000, B = 1,000,000,000 ... 25.0K = $25,000
#To account for years with missing or dad data a subset of all of the
#available data was created where only values that reflected some type of damage or cost were identified. I.e., value/event > 0
colnames(events) <- tolower(colnames(events))
events.cols <- events %>% select(bgn_date, state, evtype, fatalities, injuries
,propdmg, propdmgexp, cropdmg, cropdmgexp)
colnames(events.cols)[1] <- "date"
events.cols$date <- as.Date(events.cols$date, "%m/%d/%Y")
events.cols$year <- year(events.cols$date)
#dplyr filtering subset
dat <- filter(events.cols
, injuries > 0
| fatalities > 0
| propdmg > 0
| cropdmg > 0)
Plotting event frequency by years to identify further event filtering #opportunities. By further filtering we ensure some measure of parity #in consideration of the density of events tracked from 1950 to 2011.
#Figure 1 - Histogram of event type frequency by year
hist(dat$year, breaks = 14, col = "red", xlab = "Event Years",
ylab = "Frequency", main = "Figure 1 - Event Frequency by Years", ylim = c(0, 70000))
#Tidy up the event type codes by subsetting c/p/ropdmgexp by exp values
#We only want to keep observations from recognized values from the NOAA Handbook
dat$cropdmgexp[dat$cropdmgexp == 'h' | dat$cropdmgexp == 'H'] <- "H"
dat$cropdmgexp[dat$cropdmgexp == 'k' | dat$cropdmgexp == 'K'] <- "K"
dat$cropdmgexp[dat$cropdmgexp == 'm' | dat$cropdmgexp == 'M'] <- "M"
dat$cropdmgexp[dat$cropdmgexp == 'b' | dat$cropdmgexp == 'B'] <- "B"
dat$propdmgexp[dat$propdmgexp == 'h' | dat$propdmgexp == 'H'] <- "H"
dat$propdmgexp[dat$propdmgexp == 'k' | dat$propdmgexp == 'K'] <- "K"
dat$propdmgexp[dat$propdmgexp == 'm' | dat$propdmgexp == 'M'] <- "M"
dat$propdmgexp[dat$propdmgexp == 'b' | dat$propdmgexp == 'B'] <- "B"
#We determined that better data was available from 1990 onward and therefore to ensure we used the most accurate or 'modern' data we constructed our data frames from that date onward.
dat.years <- filter(dat, dat$year >= '1990')
#Create a new column 'pdmg' & 'cdmg', which are calculated values corresponding to the {H, K, M, B} multipliers
dat.years %>%
mutate(pdmg = ifelse(propdmgexp %in% c("K"), propdmg * 10^3
,ifelse(propdmgexp %in% c("M"),propdmg * 10^6
,ifelse(propdmgexp %in% c("B"), propdmg * 10^9
,propdmg)))
, cdmg = ifelse(cropdmgexp %in% c("K"), cropdmg * 10^3
,ifelse(cropdmgexp %in% c("M"), cropdmg * 10^6
,ifelse(cropdmgexp %in% c("B"), cropdmg * 10^9
,cropdmg)))) -> df
In the raw dataset there were over 800 unique event types even though the NOAA procedures specify a set of only 48 official event types. With this in mind we attempted to tidy up the event types into logical classifications. For certain event types such as ‘Tornados’ and ‘Thunderstorm Wind’ these types were mapped into a single category called ‘Damaging Wind Events’. This process was followed for all other event types until we ended up with a much more managable classification system.
#While they are not the officially recognized NOAA events we felt that the exploratory nature of the assignment gave us the latitude to explore the data
#more freely and still derive appropriate conclusions from the modestly tidied dataset.
#Create a new column called tidyevents, map aggregate event types to it
#and preserve the original event types column
df$tidyevents <- c("null")
df[grep("^[^spout]*torn|funnel|wind|turb|thunder|gust|tstm", df$evtype
,ignore.case = TRUE), "tidyevents"] <- "DAMAGING WIND EVENTS"
df[grep("(spout)", df$evtype,ignore.case = TRUE)
, "tidyevents"] <- "WATERSPOUT"
df[grep("FLOOD|rapid|flash|fld|stream|urban|dam|high water", df$evtype, ignore.case = TRUE)
, "tidyevents"] <- "FLOOD"
df[grep("TYPHOON|HURRICAN|tropical|TROPICAL STORM", df$evtype, ignore.case = TRUE)
, "tidyevents"] <- "OCEAN STORM EVENTS"
df[grep("HAIL", df$evtype, ignore.case = TRUE)
, "tidyevents"] <- "HAIL EVENTS"
df[grep("tide|ocean|tsunami|glaze|surf|seas$|coast|rip|swell|wave|rouge|surge|seiche|low temp|hyp", df$evtype, ignore.case = TRUE),
"tidyevents"] <- "TIDE OR SURF EVENTS"
df[grep("FIRE|fog|smoke", df$evtype, ignore.case = TRUE),
"tidyevents"] <- "FIRE | ATMOSPHERIC EVENTS"
df[grep("heat|warm", df$evtype, ignore.case = TRUE),
"tidyevents"] <- "HEAT EVENTS"
df[grep("LIGHTNING|lig", df$evtype, ignore.case = TRUE),
"tidyevents"] <- "LIGHTNING"
df[grep("ice|^icy|mix|wint|snow|cold|chill|freez|frost|avala|hypo|sleet|blizz", df$evtype, ignore.case = TRUE),
"tidyevents"] <- "WINTER WEATHER EVENTS"
df[grep("rain|burst|precip|wet|heavy shower", df$evtype, ignore.case = TRUE),
"tidyevents"] <- "RAIN EVENTS"
df[grep("land|mud|rock|ash|drought|dust", df$evtype, ignore.case = TRUE),
"tidyevents"] <- "EARTH & SOIL EVENTS"
The first table is the listing of the count of tidied event types. The following are figures and brief analysis of the graphics generated for the final data sets. Our exploratory analysis was performed from these tables and figured.
eventsTbl <- table(df$tidyevents)
print(eventsTbl)
##
## DAMAGING WIND EVENTS EARTH & SOIL EVENTS
## 144638 22297
## FIRE | ATMOSPHERIC EVENTS FLOOD
## 1440 11344
## HAIL EVENTS HEAT EVENTS
## 26623 978
## LIGHTNING null
## 13309 42
## OCEAN STORM EVENTS RAIN EVENTS
## 686 1365
## TIDE OR SURF EVENTS WATERSPOUT
## 1419 63
## WINTER WEATHER EVENTS
## 5908
#Turn the df into a data.table for simple aggregation by evtype
df.1 <- as.data.table(df)
df.2 <- df.1[ , list(
sum(fatalities, na.rm = TRUE)
,sum(injuries, na.rm = TRUE)
,sum(cropdmg, na.rm = TRUE)
,sum(propdmg, na.rm = TRUE))
,by = list(tidyevents)]
names(df.2) <- c("tidyevents", "fatalities", "injuries"
, "crop.damage", "prop.damage")
## Warning in `names<-.data.table`(`*tmp*`, value = c("tidyevents",
## "fatalities", : The names(x)<-value syntax copies the whole table. This
## is due to <- in R itself. Please change to setnames(x,old,new) which does
## not copy and is faster. See help('setnames'). You can safely ignore this
## warning if it is inconvenient to change right now. Setting options(warn=2)
## turns this warning into an error, so you can then use traceback() to find
## and change your names<- calls.