knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(patchwork)
The United States is hit every year by a diverse sort of weather events varying in both type an intensity. Unfortunately, some of these weather event can lead to a negative economic impact, and in many cases, to the loss of human life.
The U.S. National Oceanic and Atmospheric Administration’s (NOAA) has tracked major storms and weather events in the country during the period of 1950-2011, including data on dates and location of occurrence, fatalities, injuries, and property damage. The following analysis provides an overview on the impact of these weather events, and attempts to answer which such events are more harmful to human life and have a higher economic impact.
Data was processed and the economic impact as well as the effect on human health was analyzed on a nominal an per event basis, in order to understand the events with the biggest consequences.
Data set can be downloaded from here.
#Download data from source
tmp <- tempfile(fileext = ".bz2")
if (!file.exists(tmp))
{
download.file(
"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
destfile = tmp,
mode = "wb"
)
}
df_raw <- read.csv(bzfile(tmp))
The following offers a quick overview of the data before any processing had taken place. Given the amount of columns in the data, the table is shown split for better visualization.
The following offers a quick overview of the data before any processing had taken place. Given the amount of columns in the data, the table is shown split for better visualization.
n <- ncol(df_raw)
breaks <- round(seq(0, n, length.out = 5))
kable(head(df_raw[, (breaks[1] + 1):breaks[2]]), caption = "Part 1")
| STATE__ | BGN_DATE | BGN_TIME | TIME_ZONE | COUNTY | COUNTYNAME | STATE | EVTYPE | BGN_RANGE |
|---|---|---|---|---|---|---|---|---|
| 1 | 4/18/1950 0:00:00 | 0130 | CST | 97 | MOBILE | AL | TORNADO | 0 |
| 1 | 4/18/1950 0:00:00 | 0145 | CST | 3 | BALDWIN | AL | TORNADO | 0 |
| 1 | 2/20/1951 0:00:00 | 1600 | CST | 57 | FAYETTE | AL | TORNADO | 0 |
| 1 | 6/8/1951 0:00:00 | 0900 | CST | 89 | MADISON | AL | TORNADO | 0 |
| 1 | 11/15/1951 0:00:00 | 1500 | CST | 43 | CULLMAN | AL | TORNADO | 0 |
| 1 | 11/15/1951 0:00:00 | 2000 | CST | 77 | LAUDERDALE | AL | TORNADO | 0 |
kable(head(df_raw[, (breaks[2] + 1):breaks[3]]), caption = "Part 2")
| BGN_AZI | BGN_LOCATI | END_DATE | END_TIME | COUNTY_END | COUNTYENDN | END_RANGE | END_AZI | END_LOCATI |
|---|---|---|---|---|---|---|---|---|
| 0 | NA | 0 | ||||||
| 0 | NA | 0 | ||||||
| 0 | NA | 0 | ||||||
| 0 | NA | 0 | ||||||
| 0 | NA | 0 | ||||||
| 0 | NA | 0 |
kable(head(df_raw[, (breaks[3] + 1):breaks[4]]), caption = "Part 3")
| LENGTH | WIDTH | F | MAG | FATALITIES | INJURIES | PROPDMG | PROPDMGEXP | CROPDMG | CROPDMGEXP |
|---|---|---|---|---|---|---|---|---|---|
| 14.0 | 100 | 3 | 0 | 0 | 15 | 25.0 | K | 0 | |
| 2.0 | 150 | 2 | 0 | 0 | 0 | 2.5 | K | 0 | |
| 0.1 | 123 | 2 | 0 | 0 | 2 | 25.0 | K | 0 | |
| 0.0 | 100 | 2 | 0 | 0 | 2 | 2.5 | K | 0 | |
| 0.0 | 150 | 2 | 0 | 0 | 2 | 2.5 | K | 0 | |
| 1.5 | 177 | 2 | 0 | 0 | 6 | 2.5 | K | 0 |
kable(head(df_raw[, (breaks[4] + 1):breaks[5]]), caption = "Part 4")
| WFO | STATEOFFIC | ZONENAMES | LATITUDE | LONGITUDE | LATITUDE_E | LONGITUDE_ | REMARKS | REFNUM |
|---|---|---|---|---|---|---|---|---|
| 3040 | 8812 | 3051 | 8806 | 1 | ||||
| 3042 | 8755 | 0 | 0 | 2 | ||||
| 3340 | 8742 | 0 | 0 | 3 | ||||
| 3458 | 8626 | 0 | 0 | 4 | ||||
| 3412 | 8642 | 0 | 0 | 5 | ||||
| 3450 | 8748 | 0 | 0 | 6 |
Based on the content of the data, and the requirements of this analysis, the following columns were dropped because they do not contribute to the study:
| Variables Removed | Variables Removed |
|---|---|
| STATE__ | END_AZI |
| BGN_DATE | END_LOCATI |
| BGN_TIME | LENGTH |
| TIME_ZONE | WIDTH |
| COUNTY | F |
| COUNTYNAME | MAG |
| STATE | WFO |
| BGN_RANGE | STATEOFFIC |
| BGN_AZI | ZONENAMES |
| BGN_LOCATI | LATITUDE |
| END_DATE | LONGITUDE |
| END_TIME | LATITUDE_E |
| COUNTY_END | LONGITUDE_ |
| COUNTYENDN | REMARKS |
| END_RANGE | REFNUM |
In addition, the data columns representing both property and crop damages are split in unit (PROPDMG) and exponent columns (PROPDMGEXP), with the exponent column representing a multiplier (e.g. thousands or millions of USD). In order to obtain the true numerical value of the damage in US dollars it becomes thus necessary to multiply all values for their respective multiplier. This however also requires an extra step, as exponent data is not represented numerically inside the data, but rather as strings.
As a last step, it would make sense for the analysis to compute the total amount of damage and human related incidents.
#List all columns which should not be present in the final data set.
cols_to_drop <- c(
"STATE__", "BGN_DATE", "BGN_TIME", "TIME_ZONE", "COUNTY",
"COUNTYNAME", "STATE", "BGN_RANGE", "BGN_AZI", "BGN_LOCATI",
"END_DATE", "END_TIME", "COUNTY_END", "COUNTYENDN", "END_RANGE",
"END_AZI", "END_LOCATI", "LENGTH", "WIDTH", "F", "MAG", "WFO",
"STATEOFFIC", "ZONENAMES", "LATITUDE", "LONGITUDE", "LATITUDE_E",
"LONGITUDE_", "REMARKS", "REFNUM"
)
#Defines a map to connect exponent string values to a numeric value.
mult_map <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
cols_to_drop_2 <- c("PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
df <- df_raw %>%
select(-all_of(cols_to_drop)) %>% #Removes not needed columns.
mutate(
HUMAN_INCIDENTS = FATALITIES + INJURIES,
PROP_DMG_USD = #Define new columns with calculated value in USD
PROPDMG * coalesce( #coalesce allows to define a default value of 1,in case value in column is NA.
#unname is necessary to remove the key from the key value pair and keep only the mapped value.
unname(
#With mult_map[key] it is possible to get the numerical value mapped to a letter.
mult_map[str_to_upper(PROPDMGEXP)] #str_to_upper is needed in case data is lowercase.
),
1
),
CROP_DMG_USD =
CROPDMG *
coalesce(
unname(mult_map[str_to_upper(CROPDMGEXP)]),
1
),
COMBINED_DMG_USD = PROP_DMG_USD + CROP_DMG_USD
) %>% select(-all_of(cols_to_drop_2))
kable(head(df), caption = "Processed Data")
| EVTYPE | FATALITIES | INJURIES | HUMAN_INCIDENTS | PROP_DMG_USD | CROP_DMG_USD | COMBINED_DMG_USD |
|---|---|---|---|---|---|---|
| TORNADO | 0 | 15 | 15 | 25000 | 0 | 25000 |
| TORNADO | 0 | 0 | 0 | 2500 | 0 | 2500 |
| TORNADO | 0 | 2 | 2 | 25000 | 0 | 25000 |
| TORNADO | 0 | 2 | 2 | 2500 | 0 | 2500 |
| TORNADO | 0 | 2 | 2 | 2500 | 0 | 2500 |
| TORNADO | 0 | 6 | 6 | 2500 | 0 | 2500 |
In order to understand the impact of a specific weather event on human health it is necessary to calculate the total impact per specific type of event (i.e. sum the total amount of injuries or deaths that a certain weather event type caused).
However this only gives a partial view of the truth, as an event type might cause more deaths in total just because there are more events of this type. In this regard it would make sense to also understand the danger a single occurrence of a certain event can pose to human lives.
Finally it would make sense to filter out events with less than 10 occurrences, as such a small sample of event could no provide enough information to infer whether the event corresponds to an outlier or not.
df_summary <- df %>%
group_by(EVTYPE) %>%
summarise(
N_EVENTS = n(),
FATALITIES = sum(FATALITIES, na.rm = TRUE),
INJURIES = sum(INJURIES, na.rm = TRUE),
HUMAN_INCIDENTS = sum(HUMAN_INCIDENTS, na.rm = TRUE),
PROP_DAMAGE = sum(PROP_DMG_USD, na.rm = TRUE),
CROP_DAMAGE = sum(CROP_DMG_USD, na.rm = TRUE),
COMBINED_DMG = sum(COMBINED_DMG_USD, na.rm = TRUE)
) %>%
filter(N_EVENTS >= 10) %>%
mutate(
FATALITIES_PER_EVENT = FATALITIES / N_EVENTS,
INJURIES_PER_EVENT = INJURIES / N_EVENTS,
HUMAN_INCIDENTS_PER_EVENT = HUMAN_INCIDENTS / N_EVENTS,
PROP_DAMAGE_PER_EVENT = PROP_DAMAGE / N_EVENTS,
CROP_DAMAGE_PER_EVENT = CROP_DAMAGE / N_EVENTS,
COMBINED_DMG_PER_EVENT = COMBINED_DMG / N_EVENTS
)
# Total human impact
total_human_impact <- df_summary %>%
select(EVTYPE, N_EVENTS, FATALITIES, INJURIES, HUMAN_INCIDENTS) %>%
arrange(desc(HUMAN_INCIDENTS))
kable(head(total_human_impact,10), caption = "Top 10 Event Types by Total Human Impact")
| EVTYPE | N_EVENTS | FATALITIES | INJURIES | HUMAN_INCIDENTS |
|---|---|---|---|---|
| TORNADO | 60652 | 5633 | 91346 | 96979 |
| EXCESSIVE HEAT | 1678 | 1903 | 6525 | 8428 |
| TSTM WIND | 219940 | 504 | 6957 | 7461 |
| FLOOD | 25326 | 470 | 6789 | 7259 |
| LIGHTNING | 15754 | 816 | 5230 | 6046 |
| HEAT | 767 | 937 | 2100 | 3037 |
| FLASH FLOOD | 54277 | 978 | 1777 | 2755 |
| ICE STORM | 2006 | 89 | 1975 | 2064 |
| THUNDERSTORM WIND | 82563 | 133 | 1488 | 1621 |
| WINTER STORM | 11433 | 206 | 1321 | 1527 |
# Human impact per event
per_event_human_impact <- df_summary %>%
select(EVTYPE, N_EVENTS,
FATALITIES_PER_EVENT,
INJURIES_PER_EVENT,
HUMAN_INCIDENTS_PER_EVENT) %>%
arrange(desc(HUMAN_INCIDENTS_PER_EVENT)) %>%
mutate(
across(
ends_with("_PER_EVENT"),
~ round(.x, 2)
)
)
kable(head(per_event_human_impact,10), caption = "Top 10 Event Types by Human Impact per Event")
| EVTYPE | N_EVENTS | FATALITIES_PER_EVENT | INJURIES_PER_EVENT | HUMAN_INCIDENTS_PER_EVENT |
|---|---|---|---|---|
| HURRICANE/TYPHOON | 88 | 0.73 | 14.49 | 15.22 |
| EXTREME HEAT | 22 | 4.36 | 7.05 | 11.41 |
| TSUNAMI | 20 | 1.65 | 6.45 | 8.10 |
| GLAZE | 32 | 0.22 | 6.75 | 6.97 |
| HEAT WAVE | 74 | 2.32 | 4.18 | 6.50 |
| EXCESSIVE HEAT | 1678 | 1.13 | 3.89 | 5.02 |
| HEAT | 767 | 1.22 | 2.74 | 3.96 |
| MIXED PRECIP | 10 | 0.20 | 2.60 | 2.80 |
| ICE | 61 | 0.10 | 2.25 | 2.34 |
| UNSEASONABLY WARM AND DRY | 13 | 2.23 | 0.00 | 2.23 |
p1 <- total_human_impact %>%
head(10) %>%
mutate(EVTYPE = reorder(EVTYPE, HUMAN_INCIDENTS)) %>%
ggplot(aes(x = EVTYPE, y = HUMAN_INCIDENTS)) +
geom_col(fill = "darkblue") +
coord_flip() +
labs(
title = "Total Human Incidents",
x = "Event Type",
y = "Total Incidents"
)
p2 <- per_event_human_impact %>%
head(10) %>%
mutate(EVTYPE = reorder(EVTYPE, HUMAN_INCIDENTS_PER_EVENT)) %>%
ggplot(aes(x = EVTYPE, y = HUMAN_INCIDENTS_PER_EVENT)) +
geom_col(fill = "darkred") +
coord_flip() +
labs(
title = "Human Incidents per Event",
x = "Event Type",
y = "Incidents per Event"
)
p1 + p2 + plot_layout(ncol = 2) + plot_annotation(title = "Figure 1: Human Impact by Event Type")
From the figure it can be seen that by far tornadoes are the higher
cause of human health related incidents, however the high amount of such
yearly events in the contiguous United States is the likely cause of
such impact. On a per event basis however, it can be observed that
hurricanes are the most dangerous to humans with more than 15 health
incidents per event.
Similar to the case for human impact, it makes sense to analyse the amount of economic damage caused by an event both a nominal and per event basis.
# Total economic damage
total_economic_impact <- df_summary %>%
select(EVTYPE, N_EVENTS, PROP_DAMAGE, CROP_DAMAGE, COMBINED_DMG) %>%
arrange(desc(COMBINED_DMG))
kable(head(total_economic_impact, 10),caption = "Top 10 Event Types by Total Economic Damage")
| EVTYPE | N_EVENTS | PROP_DAMAGE | CROP_DAMAGE | COMBINED_DMG |
|---|---|---|---|---|
| FLOOD | 25326 | 144657709807 | 5661968450 | 150319678257 |
| HURRICANE/TYPHOON | 88 | 69305840000 | 2607872800 | 71913712800 |
| TORNADO | 60652 | 56937160779 | 414953270 | 57352114049 |
| STORM SURGE | 261 | 43323536000 | 5000 | 43323541000 |
| HAIL | 288661 | 15732267543 | 3025954473 | 18758222016 |
| FLASH FLOOD | 54277 | 16140812067 | 1421317100 | 17562129167 |
| DROUGHT | 2488 | 1046106000 | 13972566000 | 15018672000 |
| HURRICANE | 174 | 11868319010 | 2741910000 | 14610229010 |
| RIVER FLOOD | 173 | 5118945500 | 5029459000 | 10148404500 |
| ICE STORM | 2006 | 3944927860 | 5022113500 | 8967041360 |
# Economic damage per event
per_event_economic_impact <- df_summary %>%
select(EVTYPE, N_EVENTS,
PROP_DAMAGE_PER_EVENT,
CROP_DAMAGE_PER_EVENT,
COMBINED_DMG_PER_EVENT) %>%
arrange(desc(COMBINED_DMG_PER_EVENT))
kable(head(per_event_economic_impact,10), caption = "Top 10 Event Types by Economic Damage per Event")
| EVTYPE | N_EVENTS | PROP_DAMAGE_PER_EVENT | CROP_DAMAGE_PER_EVENT | COMBINED_DMG_PER_EVENT |
|---|---|---|---|---|
| HURRICANE/TYPHOON | 88 | 787566364 | 2.963492e+07 | 817201282 |
| STORM SURGE | 261 | 165990559 | 1.915709e+01 | 165990579 |
| SEVERE THUNDERSTORM | 13 | 92720000 | 1.538462e+04 | 92735385 |
| HURRICANE | 174 | 68208730 | 1.575810e+07 | 83966833 |
| RIVER FLOOD | 173 | 29589280 | 2.907202e+07 | 58661298 |
| TYPHOON | 11 | 54566364 | 7.500000e+04 | 54641364 |
| STORM SURGE/TIDE | 148 | 31359378 | 5.743243e+03 | 31365122 |
| FLASH FLOOD/FLOOD | 22 | 12384091 | 2.522727e+04 | 12409318 |
| TROPICAL STORM | 690 | 11165059 | 9.831101e+05 | 12148169 |
| TSUNAMI | 20 | 7203100 | 1.000000e+03 | 7204100 |
p3 <- total_economic_impact %>%
head(10) %>%
mutate(EVTYPE = reorder(EVTYPE, COMBINED_DMG)) %>%
ggplot(aes(x = EVTYPE, y = COMBINED_DMG)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(
title = "Total Economic Damage",
x = "Event Type",
y = "Combined Damage (USD)"
)
p4 <- per_event_economic_impact %>%
head(10) %>%
mutate(EVTYPE = reorder(EVTYPE, COMBINED_DMG_PER_EVENT)) %>%
ggplot(aes(x = EVTYPE, y = COMBINED_DMG_PER_EVENT)) +
geom_col(fill = "darkorange") +
coord_flip() +
labs(
title = "Economic Damage per Event",
x = "Event Type",
y = "Damage per Event (USD)"
)
p3 + p4 +
plot_layout(ncol = 2) +
plot_annotation(
title = "Figure 2: Economic Impact by Event Type"
)
It can be seen that floods have lead to the highest combined economic
impact in the United States, however this could likely be explained by a
higher frequency of events of this type. On a per event basis,
hurricanes lead to the highest economic impact among all events.
After analyzing the data, the following can be concluded:
Limitations: