# Print the R version, platform, locale and loaded packages used for this run
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.0  magrittr_1.5    tools_3.6.0     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.1      stringi_1.4.3   rmarkdown_1.12 
##  [9] knitr_1.22      stringr_1.4.0   xfun_0.7        digest_0.6.18  
## [13] evaluate_0.13

Synopsis

What types of events in the USA are most harmful to people and property according to the NOAA dataset? From the human perspective, tornadoes are by far the most fatal and cause the most injuries. From an economic perspective, flooding causes the most property damage and drought causes the most crop damage. When you look at both the property and crop damage combined, the property damage dwarfs the crop damage values in magnitude.

One major caveat to this data set is that there are 985 different event types, but some of the separations seem arbitrary, for example: flood vs. river flood vs. flash flood.

Questions

  1. Across the United States, which types of events are most harmful with respect to population health?

  2. Across the United States, which types of events have the greatest economic consequences?

Data Processing

The dataset I’m starting with is from the file: repdata_data_StormData.csv which was downloaded from https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2 in a compressed format.

Variables of Interest

STATE: State Name
EVTYPE: Event Type
FATALITIES: Number of Fatalities
INJURIES: Number of Injuries
PROPDMG: Property Damage (without multiplier scale)
PROPDMGEXP: Multiplier Code for Property Damage
CROPDMG: Crop Damage (without multiplier scale)
CROPDMGEXP: Multiplier Code for Crop Damage

Damage Codes Explained

These are possible values of CROPDMGEXP and PROPDMGEXP:

H,h,K,k,M,m,B,b,+,-,?,0,1,2,3,4,5,6,7,8, and blank-character

H,h = hundreds = 100

K,k = kilos = thousands = 1,000

M,m = millions = 1,000,000

B,b = billions = 1,000,000,000

(+) = 1

(-) = 0

(?) = 0

blank/empty character = 0

numeric 0..8 = 10

Data Processing Strategy

First, we will download the data directly from the source and load it into R. Then we will subset the data down to only our variables of interest. Next, we’ll create subsets for fatalities, injuries, property damage, crop damage, and total damage. To create the subsets for damage, we’ll have to factor in the damage multiplier. We’ll create a lookup table to convert the multiplier code to a number and add new columns to the data frame. We’ll then calculate the damage by multiplying each damage amount by its converted multiplier value.

#Load Packages
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Download Data
# The archive is saved relative to the current working directory instead of
# after a setwd() to a machine-specific path, so the analysis can be re-run
# on any machine (when knitting, that is the directory of the document).
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_file <- "data.csv.bz2"

# Only fetch the archive when no local copy exists yet; re-running the
# report should not download it again.
if (!file.exists(data_file)) {
  # mode = "wb" requests a binary transfer explicitly so the compressed
  # file cannot be altered by text-mode handling on Windows.
  download.file(url, data_file, mode = "wb")
}

#Load Data (no need to decompress file: read.csv reads the .bz2 directly)
data <- read.csv(data_file)

# Keep only the variables of interest from the raw data
df <- data %>%
  select(
    STATE, EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  )

# Total fatalities per event type, ordered from most to least deadly
df.fatalities <- df %>%
  group_by(EVTYPE) %>%
  summarise(fatalities_total = sum(FATALITIES)) %>%
  arrange(desc(fatalities_total))

head(df.fatalities)
## # A tibble: 6 x 2
##   EVTYPE         fatalities_total
##   <fct>                     <dbl>
## 1 TORNADO                    5633
## 2 EXCESSIVE HEAT             1903
## 3 FLASH FLOOD                 978
## 4 HEAT                        937
## 5 LIGHTNING                   816
## 6 TSTM WIND                   504
# Total injuries per event type, ordered from most to fewest injuries
df.injuries <- df %>%
  group_by(EVTYPE) %>%
  summarise(Injuries_total = sum(INJURIES)) %>%
  arrange(desc(Injuries_total))

head(df.injuries)
## # A tibble: 6 x 2
##   EVTYPE         Injuries_total
##   <fct>                   <dbl>
## 1 TORNADO                 91346
## 2 TSTM WIND                6957
## 3 FLOOD                    6789
## 4 EXCESSIVE HEAT           6525
## 5 LIGHTNING                5230
## 6 HEAT                     2100
# Create the lookup table for the damage multiplier codes.
#
# Every code is paired with its multiplier explicitly. Previously the codes
# came from sort(unique(PROPDMGEXP)) and were paired by position with a
# hard-coded vector, which (a) depends on the locale's sort order and
# (b) leaves any code that occurs only in CROPDMGEXP without a match, so the
# affected rows silently got NA damage values.
# Letter codes are stored in upper case; lookups are case-insensitive.
code <- c("", "-", "?", "+", as.character(0:8), "H", "K", "M", "B")
value_mult <- c(0, 0, 0, 1, rep(10, 9), 10^2, 10^3, 10^6, 10^9)
table_mult <- data.frame(code, value_mult, stringsAsFactors = FALSE)

# Translate a vector (or factor) of multiplier codes into numeric multipliers.
# Codes missing from table_mult yield NA and trigger a warning naming them,
# so unmapped codes cannot go unnoticed.
lookup_mult <- function(exp_code) {
  key <- toupper(as.character(exp_code))
  mult <- table_mult$value_mult[match(key, table_mult$code)]
  unmapped <- unique(key[is.na(mult)])
  if (length(unmapped) > 0) {
    warning(
      "Unmapped damage multiplier code(s): ",
      paste(unmapped, collapse = ", "),
      call. = FALSE
    )
  }
  mult
}

# Add columns to df with the multiplier matching each code
df$PROPDMG_MULT <- lookup_mult(df$PROPDMGEXP)
df$CROPDMG_MULT <- lookup_mult(df$CROPDMGEXP)

# Add columns for the damage values: amount times its multiplier
df$PROPDMG_VAL <- df$PROPDMG * df$PROPDMG_MULT
df$CROPDMG_VAL <- df$CROPDMG * df$CROPDMG_MULT

df$TOTAL_DAM_VAL <- df$PROPDMG_VAL + df$CROPDMG_VAL

# Total property damage per event type, largest first
df.propdam <- df %>%
  group_by(EVTYPE) %>%
  summarise(propdam_total = sum(PROPDMG_VAL)) %>%
  arrange(desc(propdam_total))

head(df.propdam)
## # A tibble: 6 x 2
##   EVTYPE            propdam_total
##   <fct>                     <dbl>
## 1 FLOOD              144657709800
## 2 HURRICANE/TYPHOON   69305840000
## 3 TORNADO             56937162897
## 4 STORM SURGE         43323536000
## 5 FLASH FLOOD         16140815011
## 6 HAIL                15732269877
# Total crop damage per event type, largest first
df.cropdam <- df %>%
  group_by(EVTYPE) %>%
  summarise(cropdam_total = sum(CROPDMG_VAL)) %>%
  arrange(desc(cropdam_total))

head(df.cropdam)
## # A tibble: 6 x 2
##   EVTYPE            cropdam_total
##   <fct>                     <dbl>
## 1 DROUGHT             13972566000
## 2 FLOOD                5661968450
## 3 RIVER FLOOD          5029459000
## 4 ICE STORM            5022113500
## 5 HURRICANE            2741910000
## 6 HURRICANE/TYPHOON    2607872800
# Combined property and crop damage per event type, largest first
df.totaldam <- df %>%
  group_by(EVTYPE) %>%
  summarise(damage_total = sum(TOTAL_DAM_VAL)) %>%
  arrange(desc(damage_total))

head(df.totaldam)
## # A tibble: 6 x 2
##   EVTYPE            damage_total
##   <fct>                    <dbl>
## 1 FLOOD             150319678250
## 2 HURRICANE/TYPHOON  71913712800
## 3 TORNADO            57352117607
## 4 STORM SURGE        43323541000
## 5 FLASH FLOOD        17562132111
## 6 DROUGHT            15018672000

Results

Fatalities and Injuries

To answer our first question, we’ll plot the total fatalities and injuries for the top 5 event types.

#Load Packages
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Fatalities plot: bars for the five event types with the most fatalities,
# ordered from highest to lowest, one fill colour per event type
plot1 <- ggplot(
  df.fatalities[1:5, ],
  aes(
    x = reorder(EVTYPE, -fatalities_total),
    y = fatalities_total,
    fill = EVTYPE
  )
) +
  geom_bar(stat = "identity") +
  ggtitle("Top 5 Events: Fatalities") +
  labs(x = "EVENT TYPE", y = "Total Fatalities") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  )

# Injuries plot: same layout for the five event types with the most injuries
plot2 <- ggplot(
  df.injuries[1:5, ],
  aes(
    x = reorder(EVTYPE, -Injuries_total),
    y = Injuries_total,
    fill = EVTYPE
  )
) +
  geom_bar(stat = "identity") +
  ggtitle("Top 5 Events: Injuries") +
  labs(x = "EVENT TYPE", y = "Total Injuries") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  )

# Show both plots side by side
grid.arrange(plot1, plot2, nrow = 1)

Damages: Property and Crop

To answer our second question, we’ll plot the total property, crop, and combined damages for the top 5 event types.

#Load Packages
library(ggplot2)
library(gridExtra)

# Property damage plot: bars for the five event types with the highest
# property damage, ordered from highest to lowest
plot3 <- ggplot(
  df.propdam[1:5, ],
  aes(
    x = reorder(EVTYPE, -propdam_total),
    y = propdam_total,
    fill = EVTYPE
  )
) +
  geom_bar(stat = "identity") +
  ggtitle("Top 5 Events: Property Damage") +
  labs(x = "EVENT TYPE", y = "Total Property Damage") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  )

# Crop damage plot: same layout for the top five crop-damage event types
plot4 <- ggplot(
  df.cropdam[1:5, ],
  aes(
    x = reorder(EVTYPE, -cropdam_total),
    y = cropdam_total,
    fill = EVTYPE
  )
) +
  geom_bar(stat = "identity") +
  ggtitle("Top 5 Events: Crop Damage") +
  labs(x = "EVENT TYPE", y = "Total Crop Damage") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  )

# Total damage plot: same layout for the top five combined-damage event types
plot5 <- ggplot(
  df.totaldam[1:5, ],
  aes(
    x = reorder(EVTYPE, -damage_total),
    y = damage_total,
    fill = EVTYPE
  )
) +
  geom_bar(stat = "identity") +
  ggtitle("Top 5 Events: Total Damage") +
  labs(x = "EVENT TYPE", y = "Total Damage") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  )

# Show the three damage plots side by side
grid.arrange(plot3, plot4, plot5, nrow = 1)