Setup

# Silence tidyverse startup messages
options(tidyverse.quiet = TRUE)

# Silence every other package
suppressPackageStartupMessages({ 
library(lubridate)
library(tidyverse)
library(ggplot2)
library(stringr)
library(usmap)
library(proj4)
library(spdep)
library(maptools)
library(rgdal)
})

# Set knitr options
knitr::opts_chunk$set(echo = TRUE, fig.align = "center", fig.retina = 3, fig.fullwidth=TRUE)

# Set default theme
theme_set(
        theme_light() +
        theme(
                legend.position = "bottom"
        )
)

# Define number formatter 
formatTwoSignificant <- scales::number_format(accuracy=.1)
formatNumber <- function(x) {
        ifelse(x < 1000, x,
               ifelse(x >= 10^3 & x < 10^6, paste0(formatTwoSignificant(x/10^3), " k"),
                      ifelse(x >= 10^6 & x < 10^9, paste0(formatTwoSignificant(x/10^6), " M"),
                             ifelse(x >= 10^9 & x < 10^12, paste0(formatTwoSignificant(x/10^9), " B"),
                                    ifelse(x >= 10^12 & x < 10^15, paste0(formatTwoSignificant(x/10^12), " T"),
                                        ifelse(x >= 10^15 & x < 10^18, paste0(formatTwoSignificant(x/10^12),"P"),NA))))))
}

# Import reorder within
# By David Robinson
# github: https://github.com/dgrtwo/drlib
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  new_x <- paste(x, within, sep = sep)
  stats::reorder(new_x, by, FUN = fun)
}

scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}

scale_y_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}

Abstract | Synopsis

Natural disasters, like storm related events, pose a financial risk and a risk of injuries and fatalities. In this analysis I analyzed the Storm Data Documentation by National Weather Service and can conclude that the most harmful events are (1) tornados, floods, flash floods and excessive heat with focus on health and (2) droughs, floods, hurricanes and typhoons with focus on financial losses. These events do not occur uniformly across the US, thus resources should be prioritized not only based on the cumulative damages but also on the probability of such an event in the region.

Data Processing

Reading and Rescaling

Date was reformatted to a Date datatype and a new column fips was introduced as an identifier for each individual county by the library usmap. The observation time was limited to events with a beginning date later than 1993 since only few events were observed before that date. For calculating the number of events per year NumberOfDays represents the number of days from start to end of observation.

df <- tibble(read.csv("repdata%2Fdata%2FStormData.csv.bz2", stringsAsFactors = F))
 
df$BGN_DATE <- as.Date(df$BGN_DATE, format="%m/%d/%Y")

# County identifier used by usmap
df$fips <- paste0(str_pad(df$STATE__, width=2, side="left", pad="0"),
                  str_pad(df$COUNTY, width=3, side="left", pad="0"))

# Only few events observed before 1993 
df <- df %>% filter(BGN_DATE >= "1993-01-01")

# Number of Days in Dataset
NumberOfDays <- as.integer(max(df$BGN_DATE) - as.Date("1993-01-01")) + 1

Financial losses

Financial losses must be multiplied by their respective exponent. When there is an unknown Exponent given the financial loss is ignored (NA).

# Lookup table
Exponents <- tribble(~EXP, ~Factor,
                      "K", 10^3,
                      "M", 10^6,
                      "B", 10^9,
                      "", 1)
# Join lookup table
df <- df %>% 
        left_join(Exponents, by=c("PROPDMGEXP" = "EXP")) %>% 
        mutate(PROPDMG = PROPDMG * Factor) %>% 
        select(-Factor)

df <- df %>% 
        left_join(Exponents, by=c("CROPDMGEXP" = "EXP")) %>% 
        mutate(CROPDMG = CROPDMG * Factor) %>% 
        select(-Factor)

Memory Management

Many columns given by the dataset were of no direct interest to this analysis, so they were dropped accordingly.

# Preserve Columns
df <- df %>% 
        select(BGN_DATE,
               EVTYPE,
               STATE,
               COUNTYNAME,
               fips,
               INJURIES,
               FATALITIES,
               PROPDMG,
               CROPDMG)

Category Management

A great amount of EVTYPE categories were not appropriately named. Thus categories with were uppercased and matched to their bigger (singular) category names.

# Clear EVTYPE
df$EVTYPE <- str_to_upper(df$EVTYPE)
df$EVTYPE <- gsub("WILD/FOREST FIRE", "WILDFIRE", df$EVTYPE)

# Split mixed EVTYPE
df <- df %>% 
        mutate(EVTYPE = str_split(EVTYPE, "/")) %>% 
        unnest_legacy(EVTYPE)
               

# Clear EVTYPE
df$EVTYPE <- gsub("TSTM","THUNDERSTORM", df$EVTYPE)
df$EVTYPE <- gsub("WINDS","WIND", df$EVTYPE)
df$EVTYPE <- gsub("FLD","FLOOD", df$EVTYPE)
df$EVTYPE <- gsub("FLOODING","FLOOD", df$EVTYPE)
df$EVTYPE <- gsub("FIRES","FIRE", df$EVTYPE)
df$EVTYPE <- gsub("\\d*\\.*\\d*", "", df$EVTYPE)
df$EVTYPE <- gsub("\\(.*\\)", "", df$EVTYPE)

# Squish strings
df$EVTYPE <- str_squish(df$EVTYPE)

# Remove "Urban" arisen from splitting mixed EVTYPE
df <- df %>% filter(EVTYPE!="URBAN")

df <- df %>% rename(`EVENT TYPE` = EVTYPE)

Result Preprocessing

Cumulative Sum

I want to focus on the top-10 events per damage type with highest cumulative damage and highlight the top-3 of each damage type by color. Thus the following data transformation is performed:

  • Sum up all damage by event type and damage type
  • Grab TOP-3 events per damage type
  • Grab TOP-10 events per damage type and colorize the TOP-3 events of each damage type
# Compute the cumulative damage
df.Cumulative <- df %>% 
                        ungroup() %>% 
                        select(INJURIES,FATALITIES,PROPDMG, CROPDMG, everything()) %>% 
                        pivot_longer(INJURIES:CROPDMG, 
                                      names_to = "DAMAGE TYPE",
                                      values_to = "N") %>%
                        filter(!is.na(N) & N>0) %>% 
                        group_by(`EVENT TYPE`, `DAMAGE TYPE`) %>% 
                        summarise(N = sum(N)) %>% 
                        arrange(desc(N)) %>% 
                        ungroup()

# Select top-3 events per damage type
df.Cumulative.TOP3 <-  df.Cumulative %>% 
                        group_by(`DAMAGE TYPE`) %>% 
                        slice(1:3) %>% 
                        ungroup() %>% 
                        distinct(`EVENT TYPE`) %>% 
                        mutate(Colorize = T)

# Colorize and reorder (within damage category)
df.Cumulative <- df.Cumulative %>% 
                        left_join(df.Cumulative.TOP3, by=c("EVENT TYPE")) %>% 
                        mutate(TOP3 = ifelse(!is.na(Colorize), `EVENT TYPE`, NA)) %>% 
                        group_by(`DAMAGE TYPE`) %>% 
                        slice(1:10) %>% 
                        ungroup() %>% 
                        mutate(`EVENT TYPE` = reorder_within(`EVENT TYPE`, -N, `DAMAGE TYPE`),
                               `DAMAGE TYPE` = factor(`DAMAGE TYPE`, levels=c("INJURIES", "FATALITIES", "CROPDMG", "PROPDMG")))

Events per Annum on Regional Level

I want to further emphasize the importance of regionality and frequency given the TOP-3 events per damage type. Thus following data transformation is performed:

  • The frequency (Events per Year) of each event is pivoted
  • This frequency is joined with the US map data
# Pivot TOP-3 events 
df.Regional <- df %>% 
                inner_join(df.Cumulative.TOP3, by=c("EVENT TYPE")) %>% 
                group_by(fips, `EVENT TYPE`) %>% 
                summarize(`Events per Year` = n() / NumberOfDays * 365.25) %>% 
                pivot_wider(names_from = `EVENT TYPE`, values_from = `Events per Year`)

# Aquire US map data
mapdata <- us_map(regions="county")

# Join the TOP-3 events to mapdata
df.Regional <- mapdata %>% 
                left_join(df.Regional, by="fips")

# Unpivot to get a full US-map per event
df.Regional <- df.Regional %>% 
                        pivot_longer(-x:-county,
                                     names_to = "EVENT TYPE",
                                     values_to = "Events per Year")

Results

Cumulative Damages

The events that are most harmful with respect to population health by raw numbers are tornados, floods and excessive heat. The most financial losses can be attributed to flood, hurricane, drought and typhoon. There is an overlap between events that are harmful to population health and financial losses.

ggplot(df.Cumulative, aes(x=`EVENT TYPE`, y=`N`, fill=TOP3)) +
        geom_bar(stat="identity") +
        scale_x_reordered() +
        scale_y_continuous(labels = formatNumber) +
        facet_wrap(.~`DAMAGE TYPE`, scale="free", nrow=4) +
        labs(title = "Storm Events in the US",
                subtitle = "Cumulative Damage since 1993",
                caption = "Source: Storm Data Documentation by National Weather Service") +
        ylab("Damage: Number of Cases or $") +
        xlab("Event Type")

Events per Annum on Regional Level

The events show great differences in overall frequency and frequency per region. These events do not occur uniformly accross the US, for example hurricanes do occur much more frequent directly adjancent to coastal regions in the east coast and tornadoes have a tendency to be a lot less common in the western regions of the US.

ggplot(df.Regional, aes(x=x,y=y,group=group, fill=`Events per Year`)) +
        geom_polygon() +
        facet_wrap(.~`EVENT TYPE`) +
        scale_fill_viridis_c(trans="log10") +
        theme_void() +
        labs(title = "Storm Events in the US",
            subtitle = "Events per Year",
            caption = "Source: Storm Data Documentation by National Weather Service")