Synopsis

This analysis uses “Storm Data”, an official publication of the National Oceanic and Atmospheric Administration (NOAA). The events in the data start in the year 1950 and end in November 2011. The results of this analysis show that tornadoes cause the most fatalities, excessive wetness causes the most crop damage, and tornadoes, hail, and tstm wind all tie for the most property damage.

Downloading, Loading and Processing the Data

Packages Used

library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
library(plyr)
## Warning: package 'plyr' was built under R version 3.2.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Download File

# Source URL of the Storm Data archive (a bzip2-compressed CSV file).
con <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# Fetch the archive only when it is not already on disk, so re-running the
# analysis does not re-download the whole file each time. Delete
# "stormdata.csv.bz2" to force a fresh download.
# mode = "wb" writes the compressed bytes unmodified on every platform.
if (!file.exists("stormdata.csv.bz2")) {
  download.file(con, "stormdata.csv.bz2", mode = "wb")
}

Loading the data

# Read the downloaded archive into a data frame. The first row holds the
# column names, and text columns are kept as character vectors rather than
# being converted to factors.
raw.storm.data <- read.csv(
  file = "stormdata.csv.bz2",
  header = TRUE,
  stringsAsFactors = FALSE
)

Processing the data

First, we subset the data, because only certain columns are important for this analysis. To tidy the data, we remove extra spaces from the event type (EVTYPE) and change the column names and the event type values to lower case letters. Finally, we convert the damage amounts into plain US dollars, as they are recorded in a variety of units (H = hundreds, K = thousands, M = millions, B = billions). All other unit codes are set to NA, as we assume they are typos from the data collection process.

# Subset Data: keep only the columns this analysis uses.
NameList <- c("EVTYPE", "FATALITIES", "PROPDMG", "CROPDMG",
              "PROPDMGEXP", "CROPDMGEXP")
col.num <- which(colnames(raw.storm.data) %in% NameList)
subsetted.data <- raw.storm.data[, col.num]

# Remove leading/trailing spaces from the event type
subsetted.data$EVTYPE <- str_trim(subsetted.data$EVTYPE, side = "both")

# Change variable names to lower case letters
names(subsetted.data) <- tolower(names(subsetted.data))

# Change evtype to lower case letters
subsetted.data$evtype <- tolower(subsetted.data$evtype)

# Change PROPDMG and CROPDMG to standard currency units.
# Multipliers are looked up by the exponent code itself instead of by the
# position of each code in unique(), so the mapping does not depend on the row
# order of the input and cannot shift when a code is added or missing.
exp.multipliers <- c(H = 100, K = 1000, M = 1000000, B = 1000000000)

# Translate a vector of exponent codes into numeric multipliers.
#   exp.codes: character vector of unit codes (matched case-insensitively).
#   Returns a numeric vector of the same length; any code other than
#   H, K, M or B (including the empty string) yields NA.
exp.to.multiplier <- function(exp.codes) {
  unname(exp.multipliers[match(toupper(exp.codes), names(exp.multipliers))])
}

# Property Damage Multiplier
# (the distinct raw codes are kept so they can be inspected after the run)
unique.propdmg.values <- unique(subsetted.data$propdmgexp)
subsetted.data$propmultiplier <- exp.to.multiplier(subsetted.data$propdmgexp)

# Create cropdmg multiplier
unique.cropdmg.values <- unique(subsetted.data$cropdmgexp)
subsetted.data$cropmultiplier <- exp.to.multiplier(subsetted.data$cropdmgexp)

# Create final cost of property and crop damage with consistent units.
# Rows whose multiplier is NA produce an NA cost.
subsetted.data$finalpropdmg <- with(subsetted.data, propdmg * propmultiplier)
subsetted.data$finalcropdmg <- with(subsetted.data, cropdmg * cropmultiplier)

# Create a data frame with final cleaned data used for analysis
clean.data <- with(subsetted.data,
                   data.frame(evtype, fatalities, finalpropdmg, finalcropdmg))

# The large intermediate objects are no longer needed; free the memory.
rm(raw.storm.data)
rm(subsetted.data)

Results Section

Fatalities Results

Tornadoes caused the most fatalities (5633), close to three times as many as the next event type (excessive heat, 1903 fatalities).

# Create Top 10 Most Dangerous Event types by fatalities.
# Total the fatalities per event type. dplyr::summarize is called explicitly
# because plyr exports a function of the same name, and na.rm = TRUE belongs on
# sum() (head() has no such argument).
fatalities.data <- clean.data %>%
  group_by(evtype) %>%
  dplyr::summarize(fatalities = sum(fatalities, na.rm = TRUE))

# Sort by total, largest first, and keep the ten deadliest event types.
fatalities.plot.data <- head(
  fatalities.data[order(fatalities.data$fatalities, decreasing = TRUE), ],
  n = 10
)

# Rebuild evtype as a factor so it only carries the levels of the ten rows kept
fatalities.plot.data$evtype <- factor(as.character(fatalities.plot.data$evtype))
rm(fatalities.data)

# Top 10 Fatalities Plot
# evtype is on the x axis and the fatality total on the y axis, so the axis
# labels are assigned accordingly.
with(fatalities.plot.data,
     plot(evtype, fatalities, type = "p", pch = 20, col = "black",
          main = "Top 10 Fatalities by Environment Type",
          xlab = "Environment Type", ylab = "Fatalities"))

Property and Crop Damage Results

Tornadoes, tstm wind, and hail all caused an equal cost in property damage, while excessive wetness was the leading cause of crop damage.

# Create Top 10 Property and Crop Damage Events.
# Total both damage columns per event type. na.rm = TRUE is required on sum():
# without it, one NA cost makes the whole event type total NA and pushes it to
# the bottom of the ranking. dplyr::summarize is called explicitly because
# plyr exports a function of the same name.
# NOTE(review): totals now ignore NA costs, so re-check the figures quoted in
# the narrative after re-running.
economic.cost.data <- clean.data %>%
  group_by(evtype) %>%
  dplyr::summarize(finalpropdmg = sum(finalpropdmg, na.rm = TRUE),
                   finalcropdmg = sum(finalcropdmg, na.rm = TRUE))

# Ten costliest event types for property damage, largest first.
prop.cost.data <- head(
  economic.cost.data[order(economic.cost.data$finalpropdmg, decreasing = TRUE), ],
  n = 10
)
# Rebuild evtype as a factor limited to the levels of the ten rows kept
prop.cost.data$evtype <- factor(as.character(prop.cost.data$evtype))

# Ten costliest event types for crop damage, largest first.
crop.cost.data <- head(
  economic.cost.data[order(economic.cost.data$finalcropdmg, decreasing = TRUE), ],
  n = 10
)
crop.cost.data$evtype <- factor(as.character(crop.cost.data$evtype))

# Plot Cost Data
# Draw the two panels side by side, then restore the previous graphics
# settings so later plots are not affected by the 1 x 2 layout.
old.par <- par(mfrow = c(1, 2))
with(prop.cost.data,
     plot(evtype, finalpropdmg, type = "p", pch = 20, col = "black",
          main = "Top 10 Property Damage Environment Types",
          xlab = "Environment Type", ylab = "Cost (in US dollars)"))
with(crop.cost.data,
     plot(evtype, finalcropdmg, type = "p", pch = 20, col = "black",
          main = "Top 10 Crop Damage Environment Types",
          xlab = "Environment Type", ylab = "Cost (in US dollars)"))
par(old.par)