Synopsis

The NOAA (National Oceanic & Atmospheric Administration) tracks various weather events in the US and, in its National Weather Service Instruction 10-1605 published in August 2007, identifies 48 distinct weather event types capable of impacting human health and of causing economic damage to property and crops.

In fact, using data dating back to 1950, tornadoes have been by far the most consequential event for human health, with in excess of 5500 fatalities and 87500 injuries across the US. Excessive heat has been the second and third most impactful event for fatalities and injuries, respectively. Thunderstorm winds are the second leading cause of injuries, and flash floods contributed to about 1000 deaths, placing them third among causes of fatalities.

Economic impact has been widely felt as well. The first observation from the study is that damage to property is significantly larger than damage to crops when considering the cumulative impact over time. Flood, tornado and storm surge are the three most impactful events for property damage, while drought, flood and hurricane/typhoon contribute most to crop damage. Flood is by far the most impactful event: its total economic cost is approximately 150 USD Billions when combining property with crop damage, of which 140 USD Billions come from property alone. Drought, ranked seventh on total economic cost, is the largest contributor to crop damage, with about 10 USD Billions.

Data processing

We first present the reader with the session info, to ensure package consistency when reproducing the results.

library(plyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.1
library(rmarkdown)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(knitr)
library(stringr)
# Print the R version, platform and attached package versions for this run.
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] stringr_1.4.0  knitr_1.23     ggplot2_3.1.1  dplyr_0.8.3   
## [5] rmarkdown_1.13 tidyr_1.0.0    plyr_1.8.4    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1       magrittr_1.5     munsell_0.5.0    tidyselect_0.2.5
##  [5] colorspace_1.4-1 R6_2.4.0         rlang_0.4.0      tools_3.6.0     
##  [9] grid_3.6.0       gtable_0.3.0     xfun_0.7         withr_2.1.2     
## [13] htmltools_0.3.6  lazyeval_0.2.2   yaml_2.2.0       digest_0.6.19   
## [17] assertthat_0.2.1 lifecycle_0.1.0  tibble_2.1.2     crayon_1.3.4    
## [21] purrr_0.3.2      vctrs_0.2.0      zeallot_0.1.0    glue_1.3.1      
## [25] evaluate_0.14    stringi_1.4.3    compiler_3.6.0   pillar_1.4.1    
## [29] scales_1.0.0     backports_1.1.4  pkgconfig_2.0.2

We then load the underlying data for analysis and keep only the columns we need.

# Download the storm data archive on first use, read it, and keep only the
# columns used by the rest of the analysis.
#
# The archive path is relative to the current working directory, so the report
# runs on any machine without calling setwd() on a user-specific folder.
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "repdata-data-StormData.csv.bz2"

if (!file.exists(storm_file)) {
  # The archive is binary: request a binary transfer explicitly so the file is
  # written byte-for-byte on every platform.
  download.file(storm_url, storm_file, mode = "wb")
}

data <- read.csv(bzfile(storm_file), header = TRUE)

# Event label, casualty counts, and damage amounts with their magnitude codes.
storm_data <- data %>%
  select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

We next use pattern matching on the event-type labels to reassign events to the list of 48 official event types.

# Recode the free-text EVTYPE labels into the new column EVTYPE2.
#
# Each rule is a regular expression, matched case-insensitively against the
# ORIGINAL EVTYPE label, followed by the label to assign. Rules are applied top
# to bottom and a later match overwrites an earlier one, so a broad rule must
# come BEFORE the narrower rules that refine it:
#   * "COLD" precedes "EXTREME COLD"; in the opposite order every extreme-cold
#     event ends up relabelled as plain "COLD/WIND CHILL".
#   * "FLOODING" precedes "FLASH FLOODING*"; in the opposite order every
#     flash-flooding event ends up relabelled as "FLOOD".
#   * "FLOOD/FLASH FLOOD" comes last of the flood rules so that mixed labels
#     stay in "FLOOD".
# Labels matching no rule keep their original EVTYPE value.
#
# In a regular expression `*` means "zero or more of the preceding character"
# (it is not a shell-style wildcard), so e.g. "HIGH WIND*" also matches
# "HIGH WIN". The patterns are kept as they are so existing matches are unchanged.
# NOTE(review): "GLAZE" is mapped to "FREEZING FOG" -- confirm this is the
# intended official category.
evtype_rules <- matrix(
  c(
    ".*TORNADO.*",              "TORNADO",
    ".*THUNDERSTORM* WIND*.*",  "THUNDERSTORM WIND",
    ".*THUNDERSTORMW",          "THUNDERSTORM WIND",
    ".*THUNDERSTORM WINDS*.*",  "THUNDERSTORM WIND",
    "TSTM WIND",                "THUNDERSTORM WIND",
    "HEAT WAVE",                "EXCESSIVE HEAT",
    "HIGH WIND*",               "HIGH WIND",
    "STRONG WIND*",             "HIGH WIND",
    "RIP CURRENTS",             "RIP CURRENT",
    "GLAZE",                    "FREEZING FOG",
    "WILD/FOREST FIRE",         "WILDFIRE",
    "EXTREME HEAT",             "EXCESSIVE HEAT",
    "WILDFIRE*",                "WILDFIRE",
    "WILD FIRE*",               "WILDFIRE",
    "RECORD HEAT",              "EXCESSIVE HEAT",
    "RECORD/EXCESSIVE HEAT",    "EXCESSIVE HEAT",
    "COLD",                     "COLD/WIND CHILL",
    "EXTREME COLD",             "EXTREME COLD/WIND CHILL",
    "HEAVY SURF/HIGH SURF",     "HIGH SURF",
    "HEAVY SURF",               "HIGH SURF",
    "TROPICAL STORM*",          "TROPICAL STORM",
    "STORM SURGE",              "STORM SURGE/TIDE",
    "HURRICANE*",               "HURRICANE/TYPHOON",
    "FLOODING",                 "FLOOD",
    "FLASH FLOODING*",          "FLASH FLOOD",
    "FLOOD/FLASH FLOOD",        "FLOOD"
  ),
  ncol = 2,
  byrow = TRUE,
  dimnames = list(NULL, c("pattern", "label"))
)

# Start from character labels: assigning a new label into a factor would
# silently produce NA whenever that label is not already one of its levels.
storm_data$EVTYPE2 <- as.character(storm_data$EVTYPE)

for (i in seq_len(nrow(evtype_rules))) {
  hits <- grepl(evtype_rules[i, "pattern"], storm_data$EVTYPE, ignore.case = TRUE)
  storm_data$EVTYPE2[hits] <- evtype_rules[i, "label"]
}

We finally convert the alphanumeric magnitude codes to ensure the proper multiplier is applied to crop and property damage.

# Turn the damage magnitude codes (PROPDMGEXP / CROPDMGEXP) into numeric
# multipliers, then compute the total damage amounts.

# Multiplier for each recognised code; letters are matched case-insensitively.
# NOTE(review): digit codes are used as literal multipliers ("2" -> x2) and
# "0" / "-" / "?" / "+" zero the amount, as in the original mapping -- confirm
# against the data documentation whether digits should be powers of ten.
dmg_exp_values <- c(
  "-" = 0, "?" = 0, "+" = 0,
  "0" = 0, "1" = 1, "2" = 2, "3" = 3, "4" = 4,
  "5" = 5, "6" = 6, "7" = 7, "8" = 8,
  "H" = 1e2, "K" = 1e3, "M" = 1e6, "B" = 1e9
)

# Convert a vector of magnitude codes to numeric multipliers.
#
# codes: character (or factor) vector of magnitude codes.
# known: named numeric vector mapping upper-case codes to multipliers.
# Returns a numeric vector the same length as `codes`.
exp_multiplier <- function(codes, known = dmg_exp_values) {
  codes <- as.character(codes)
  blank <- is.na(codes) | codes == ""
  multiplier <- unname(known[toupper(codes)])
  # A blank code carries no magnitude, so the amount is taken at face value.
  # Leaving it as NA would make aggregate() drop the row from every total.
  multiplier[blank] <- 1
  unknown <- unique(codes[!blank & is.na(multiplier)])
  if (length(unknown) > 0) {
    # Surface unexpected codes instead of letting them vanish as NA.
    warning(
      "Unrecognised damage exponent code(s): ",
      paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
  multiplier
}

storm_data$PROPDMGEXP <- exp_multiplier(storm_data$PROPDMGEXP)
storm_data$CROPDMGEXP <- exp_multiplier(storm_data$CROPDMGEXP)

storm_data$PROPDMGTOT <- storm_data$PROPDMG * storm_data$PROPDMGEXP
storm_data$CROPDMGTOT <- storm_data$CROPDMG * storm_data$CROPDMGEXP

Results

Injuries by event types

Based on the data, tornadoes are by far the event type causing the most injuries, with in excess of 91,000 in records going back to 1950. They outpace the next event type, thunderstorm winds, by a factor of 10.

# Sum injuries per recoded event type.
injuries <- aggregate(INJURIES ~ EVTYPE2, data = storm_data, FUN = sum)

# Bar chart of the 15 event types with the most injuries, largest first.
graph1 <- ggplot(
  data = head(arrange(injuries, desc(INJURIES)), 15),
  mapping = aes(x = reorder(EVTYPE2, desc(INJURIES)), y = INJURIES)
) +
  geom_bar(stat = "identity", fill = "blueviolet") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Injuries by weather event types",
    x = "Weather event types",
    y = "Total injuries"
  )
plot(graph1)

Fatalities by event types

Much like injuries, tornadoes are the number one cause of fatalities in the US. They cause more fatalities than the next five factors combined, namely excessive heat, flash flood, heat, lightning, and thunderstorm winds.

# Sum fatalities per recoded event type.
fatalities <- aggregate(FATALITIES ~ EVTYPE2, data = storm_data, FUN = sum)

# Bar chart of the 15 event types with the most fatalities, largest first.
graph2 <- ggplot(
  data = head(arrange(fatalities, desc(FATALITIES)), 15),
  mapping = aes(x = reorder(EVTYPE2, desc(FATALITIES)), y = FATALITIES)
) +
  geom_bar(stat = "identity", fill = "palegreen") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Fatalities by weather event types",
    x = "Weather event types",
    y = "Total fatalities"
  )
plot(graph2)

Economic impact

Economic impact was measured and represented across both damage to property and damage to crops. One can easily observe from the graph that the bulk of the economic damage is caused by property damage. In fact, the first event by size that mainly impacts crops is ranked seventh, and is drought. Flood, hurricane and tornado are the three largest contributors to economic damage.

# Total property and crop damage per recoded event type.
property <- aggregate(PROPDMGTOT ~ EVTYPE2, storm_data, FUN = sum)
crop <- aggregate(CROPDMGTOT ~ EVTYPE2, storm_data, FUN = sum)

# Long format: one row per (event type, damage kind), amount in DMGTOT.
property_cost <- property %>%
  dplyr::mutate(DMGTYPE = "PROPERTY") %>%
  dplyr::rename(DMGTOT = PROPDMGTOT)
crop_cost <- crop %>%
  dplyr::mutate(DMGTYPE = "CROP") %>%
  dplyr::rename(DMGTOT = CROPDMGTOT)
total_cost <- rbind(property_cost, crop_cost)

# Rank event types by COMBINED (property + crop) damage and keep the 15
# costliest. Selecting by event type rather than by individual row keeps both
# the property and the crop component of every charted event, so each stacked
# bar shows its full total.
event_totals <- aggregate(DMGTOT ~ EVTYPE2, total_cost, FUN = sum)
top_events <- head(
  event_totals$EVTYPE2[order(event_totals$DMGTOT, decreasing = TRUE)],
  15
)
top_cost <- total_cost[total_cost$EVTYPE2 %in% top_events, ]

# FUN = sum orders the bars by combined damage; reorder()'s default (mean)
# would average an event's property and crop rows instead.
graph5 <- ggplot(
  top_cost,
  aes(
    x = reorder(EVTYPE2, -DMGTOT, FUN = sum),
    y = DMGTOT / 1000000000,
    fill = DMGTYPE
  )
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Economic impact due to most damaging weather event types") +
  xlab("Weather event types") +
  ylab("Total damage in $ Billion")
plot(graph5)