# Silence tidyverse startup messages
options(tidyverse.quiet = TRUE)
# Silence every other package
suppressPackageStartupMessages({
library(lubridate)
library(tidyverse)
library(ggplot2)
library(stringr)
library(usmap)
library(proj4)
library(spdep)
library(maptools)
library(rgdal)
})
# Set knitr options
knitr::opts_chunk$set(echo = TRUE, fig.align = "center", fig.retina = 3, fig.fullwidth=TRUE)
# Set default theme
theme_set(
theme_light() +
theme(
legend.position = "bottom"
)
)
# Define number formatter
formatTwoSignificant <- scales::number_format(accuracy=.1)
formatNumber <- function(x) {
ifelse(x < 1000, x,
ifelse(x >= 10^3 & x < 10^6, paste0(formatTwoSignificant(x/10^3), " k"),
ifelse(x >= 10^6 & x < 10^9, paste0(formatTwoSignificant(x/10^6), " M"),
ifelse(x >= 10^9 & x < 10^12, paste0(formatTwoSignificant(x/10^9), " B"),
ifelse(x >= 10^12 & x < 10^15, paste0(formatTwoSignificant(x/10^12), " T"),
ifelse(x >= 10^15 & x < 10^18, paste0(formatTwoSignificant(x/10^12),"P"),NA))))))
}
# Import reorder within
# By David Robinson
# github: https://github.com/dgrtwo/drlib
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
scale_y_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}
Natural disasters, like storm related events, pose a financial risk and a risk of injuries and fatalities. In this analysis I analyzed the Storm Data Documentation by National Weather Service and can conclude that the most harmful events are (1) tornados, floods, flash floods and excessive heat with focus on health and (2) droughs, floods, hurricanes and typhoons with focus on financial losses. These events do not occur uniformly across the US, thus resources should be prioritized not only based on the cumulative damages but also on the probability of such an event in the region.
Date was reformatted to a Date datatype and a new column fips was introduced as an identifier for each individual county by the library usmap. The observation time was limited to events with a beginning date later than 1993 since only few events were observed before that date. For calculating the number of events per year NumberOfDays represents the number of days from start to end of observation.
df <- tibble(read.csv("repdata%2Fdata%2FStormData.csv.bz2", stringsAsFactors = F))
df$BGN_DATE <- as.Date(df$BGN_DATE, format="%m/%d/%Y")
# County identifier used by usmap
df$fips <- paste0(str_pad(df$STATE__, width=2, side="left", pad="0"),
str_pad(df$COUNTY, width=3, side="left", pad="0"))
# Only few events observed before 1993
df <- df %>% filter(BGN_DATE >= "1993-01-01")
# Number of Days in Dataset
NumberOfDays <- as.integer(max(df$BGN_DATE) - as.Date("1993-01-01")) + 1
Financial losses must be multiplied by their respective exponent. When there is an unknown Exponent given the financial loss is ignored (NA).
# Lookup table
Exponents <- tribble(~EXP, ~Factor,
"K", 10^3,
"M", 10^6,
"B", 10^9,
"", 1)
# Join lookup table
df <- df %>%
left_join(Exponents, by=c("PROPDMGEXP" = "EXP")) %>%
mutate(PROPDMG = PROPDMG * Factor) %>%
select(-Factor)
df <- df %>%
left_join(Exponents, by=c("CROPDMGEXP" = "EXP")) %>%
mutate(CROPDMG = CROPDMG * Factor) %>%
select(-Factor)
Many columns given by the dataset were of no direct interest to this analysis, so they were dropped accordingly.
# Preserve Columns
df <- df %>%
select(BGN_DATE,
EVTYPE,
STATE,
COUNTYNAME,
fips,
INJURIES,
FATALITIES,
PROPDMG,
CROPDMG)
A great amount of EVTYPE categories were not appropriately named. Thus categories with were uppercased and matched to their bigger (singular) category names.
# Clear EVTYPE
df$EVTYPE <- str_to_upper(df$EVTYPE)
df$EVTYPE <- gsub("WILD/FOREST FIRE", "WILDFIRE", df$EVTYPE)
# Split mixed EVTYPE
df <- df %>%
mutate(EVTYPE = str_split(EVTYPE, "/")) %>%
unnest_legacy(EVTYPE)
# Clear EVTYPE
df$EVTYPE <- gsub("TSTM","THUNDERSTORM", df$EVTYPE)
df$EVTYPE <- gsub("WINDS","WIND", df$EVTYPE)
df$EVTYPE <- gsub("FLD","FLOOD", df$EVTYPE)
df$EVTYPE <- gsub("FLOODING","FLOOD", df$EVTYPE)
df$EVTYPE <- gsub("FIRES","FIRE", df$EVTYPE)
df$EVTYPE <- gsub("\\d*\\.*\\d*", "", df$EVTYPE)
df$EVTYPE <- gsub("\\(.*\\)", "", df$EVTYPE)
# Squish strings
df$EVTYPE <- str_squish(df$EVTYPE)
# Remove "Urban" arisen from splitting mixed EVTYPE
df <- df %>% filter(EVTYPE!="URBAN")
df <- df %>% rename(`EVENT TYPE` = EVTYPE)
I want to focus on the top-10 events per damage type with highest cumulative damage and highlight the top-3 of each damage type by color. Thus the following data transformation is performed:
# Compute the cumulative damage
df.Cumulative <- df %>%
ungroup() %>%
select(INJURIES,FATALITIES,PROPDMG, CROPDMG, everything()) %>%
pivot_longer(INJURIES:CROPDMG,
names_to = "DAMAGE TYPE",
values_to = "N") %>%
filter(!is.na(N) & N>0) %>%
group_by(`EVENT TYPE`, `DAMAGE TYPE`) %>%
summarise(N = sum(N)) %>%
arrange(desc(N)) %>%
ungroup()
# Select top-3 events per damage type
df.Cumulative.TOP3 <- df.Cumulative %>%
group_by(`DAMAGE TYPE`) %>%
slice(1:3) %>%
ungroup() %>%
distinct(`EVENT TYPE`) %>%
mutate(Colorize = T)
# Colorize and reorder (within damage category)
df.Cumulative <- df.Cumulative %>%
left_join(df.Cumulative.TOP3, by=c("EVENT TYPE")) %>%
mutate(TOP3 = ifelse(!is.na(Colorize), `EVENT TYPE`, NA)) %>%
group_by(`DAMAGE TYPE`) %>%
slice(1:10) %>%
ungroup() %>%
mutate(`EVENT TYPE` = reorder_within(`EVENT TYPE`, -N, `DAMAGE TYPE`),
`DAMAGE TYPE` = factor(`DAMAGE TYPE`, levels=c("INJURIES", "FATALITIES", "CROPDMG", "PROPDMG")))
I want to further emphasize the importance of regionality and frequency given the TOP-3 events per damage type. Thus following data transformation is performed:
# Pivot TOP-3 events
df.Regional <- df %>%
inner_join(df.Cumulative.TOP3, by=c("EVENT TYPE")) %>%
group_by(fips, `EVENT TYPE`) %>%
summarize(`Events per Year` = n() / NumberOfDays * 365.25) %>%
pivot_wider(names_from = `EVENT TYPE`, values_from = `Events per Year`)
# Aquire US map data
mapdata <- us_map(regions="county")
# Join the TOP-3 events to mapdata
df.Regional <- mapdata %>%
left_join(df.Regional, by="fips")
# Unpivot to get a full US-map per event
df.Regional <- df.Regional %>%
pivot_longer(-x:-county,
names_to = "EVENT TYPE",
values_to = "Events per Year")
The events that are most harmful with respect to population health by raw numbers are tornados, floods and excessive heat. The most financial losses can be attributed to flood, hurricane, drought and typhoon. There is an overlap between events that are harmful to population health and financial losses.
ggplot(df.Cumulative, aes(x=`EVENT TYPE`, y=`N`, fill=TOP3)) +
geom_bar(stat="identity") +
scale_x_reordered() +
scale_y_continuous(labels = formatNumber) +
facet_wrap(.~`DAMAGE TYPE`, scale="free", nrow=4) +
labs(title = "Storm Events in the US",
subtitle = "Cumulative Damage since 1993",
caption = "Source: Storm Data Documentation by National Weather Service") +
ylab("Damage: Number of Cases or $") +
xlab("Event Type")
The events show great differences in overall frequency and frequency per region. These events do not occur uniformly accross the US, for example hurricanes do occur much more frequent directly adjancent to coastal regions in the east coast and tornadoes have a tendency to be a lot less common in the western regions of the US.
ggplot(df.Regional, aes(x=x,y=y,group=group, fill=`Events per Year`)) +
geom_polygon() +
facet_wrap(.~`EVENT TYPE`) +
scale_fill_viridis_c(trans="log10") +
theme_void() +
labs(title = "Storm Events in the US",
subtitle = "Events per Year",
caption = "Source: Storm Data Documentation by National Weather Service")