knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(patchwork)

Synopsis

The United States is hit every year by a diverse sort of weather events varying in both type an intensity. Unfortunately, some of these weather event can lead to a negative economic impact, and in many cases, to the loss of human life.

The U.S. National Oceanic and Atmospheric Administration’s (NOAA) has tracked major storms and weather events in the country during the period of 1950-2011, including data on dates and location of occurrence, fatalities, injuries, and property damage. The following analysis provides an overview on the impact of these weather events, and attempts to answer which such events are more harmful to human life and have a higher economic impact.

Data was processed and the economic impact as well as the effect on human health was analyzed on a nominal an per event basis, in order to understand the events with the biggest consequences.

Data Processing

Data set can be downloaded from here.

#Download data from source
tmp <- tempfile(fileext = ".bz2")

if (!file.exists(tmp))
{
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = tmp,
    mode = "wb"
  )
}

df_raw <- read.csv(bzfile(tmp))

The following offers a quick overview of the data before any processing had taken place. Given the amount of columns in the data, the table is shown split for better visualization.

Raw Data Overview

The following offers a quick overview of the data before any processing had taken place. Given the amount of columns in the data, the table is shown split for better visualization.

n <- ncol(df_raw)

breaks <- round(seq(0, n, length.out = 5))

kable(head(df_raw[, (breaks[1] + 1):breaks[2]]), caption = "Part 1")
Part 1
STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE
1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO 0
1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO 0
1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO 0
1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO 0
1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO 0
1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO 0
kable(head(df_raw[, (breaks[2] + 1):breaks[3]]), caption = "Part 2")
Part 2
BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI
0 NA 0
0 NA 0
0 NA 0
0 NA 0
0 NA 0
0 NA 0
kable(head(df_raw[, (breaks[3] + 1):breaks[4]]), caption = "Part 3")
Part 3
LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
14.0 100 3 0 0 15 25.0 K 0
2.0 150 2 0 0 0 2.5 K 0
0.1 123 2 0 0 2 25.0 K 0
0.0 100 2 0 0 2 2.5 K 0
0.0 150 2 0 0 2 2.5 K 0
1.5 177 2 0 0 6 2.5 K 0
kable(head(df_raw[, (breaks[4] + 1):breaks[5]]), caption = "Part 4")
Part 4
WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
3040 8812 3051 8806 1
3042 8755 0 0 2
3340 8742 0 0 3
3458 8626 0 0 4
3412 8642 0 0 5
3450 8748 0 0 6

Data Cleanup

Based on the content of the data, and the requirements of this analysis, the following columns were dropped because they do not contribute to the study:

Variables Removed Variables Removed
STATE__ END_AZI
BGN_DATE END_LOCATI
BGN_TIME LENGTH
TIME_ZONE WIDTH
COUNTY F
COUNTYNAME MAG
STATE WFO
BGN_RANGE STATEOFFIC
BGN_AZI ZONENAMES
BGN_LOCATI LATITUDE
END_DATE LONGITUDE
END_TIME LATITUDE_E
COUNTY_END LONGITUDE_
COUNTYENDN REMARKS
END_RANGE REFNUM

In addition, the data columns representing both property and crop damages are split in unit (PROPDMG) and exponent columns (PROPDMGEXP), with the exponent column representing a multiplier (e.g. thousands or millions of USD). In order to obtain the true numerical value of the damage in US dollars it becomes thus necessary to multiply all values for their respective multiplier. This however also requires an extra step, as exponent data is not represented numerically inside the data, but rather as strings.

As a last step, it would make sense for the analysis to compute the total amount of damage and human related incidents.

#List all columns which should not be present in the final data set.
cols_to_drop <- c(
  "STATE__", "BGN_DATE", "BGN_TIME", "TIME_ZONE", "COUNTY",
  "COUNTYNAME", "STATE", "BGN_RANGE", "BGN_AZI", "BGN_LOCATI",
  "END_DATE", "END_TIME", "COUNTY_END", "COUNTYENDN", "END_RANGE",
  "END_AZI", "END_LOCATI", "LENGTH", "WIDTH", "F", "MAG", "WFO",
  "STATEOFFIC", "ZONENAMES", "LATITUDE", "LONGITUDE", "LATITUDE_E",
  "LONGITUDE_", "REMARKS", "REFNUM"
)

#Defines a map to connect exponent string values to a numeric value.
mult_map <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

cols_to_drop_2 <- c("PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")

df <- df_raw %>%
  select(-all_of(cols_to_drop)) %>% #Removes not needed columns.
  mutate(
    HUMAN_INCIDENTS = FATALITIES + INJURIES, 
    PROP_DMG_USD = #Define new columns with calculated value in USD
      PROPDMG * coalesce( #coalesce allows to define a default value of 1,in case value in column is NA. 
        #unname is necessary to remove the key from the key value pair and keep only the mapped value.
        unname(
            #With mult_map[key] it is possible to get the numerical value mapped to a letter.
            mult_map[str_to_upper(PROPDMGEXP)] #str_to_upper is needed in case data is lowercase.
          ),
        1
      ),
    CROP_DMG_USD =
      CROPDMG *
      coalesce(
        unname(mult_map[str_to_upper(CROPDMGEXP)]),
        1
      ),
    COMBINED_DMG_USD = PROP_DMG_USD + CROP_DMG_USD
  ) %>% select(-all_of(cols_to_drop_2))

kable(head(df), caption = "Processed Data")
Processed Data
EVTYPE FATALITIES INJURIES HUMAN_INCIDENTS PROP_DMG_USD CROP_DMG_USD COMBINED_DMG_USD
TORNADO 0 15 15 25000 0 25000
TORNADO 0 0 0 2500 0 2500
TORNADO 0 2 2 25000 0 25000
TORNADO 0 2 2 2500 0 2500
TORNADO 0 2 2 2500 0 2500
TORNADO 0 6 6 2500 0 2500

Results

Events Most Harmful to Human Population

In order to understand the impact of a specific weather event on human health it is necessary to calculate the total impact per specific type of event (i.e. sum the total amount of injuries or deaths that a certain weather event type caused).

However this only gives a partial view of the truth, as an event type might cause more deaths in total just because there are more events of this type. In this regard it would make sense to also understand the danger a single occurrence of a certain event can pose to human lives.

Finally it would make sense to filter out events with less than 10 occurrences, as such a small sample of event could no provide enough information to infer whether the event corresponds to an outlier or not.

df_summary <- df %>%
  group_by(EVTYPE) %>%
  summarise(
    N_EVENTS = n(),
    FATALITIES = sum(FATALITIES, na.rm = TRUE),
    INJURIES = sum(INJURIES, na.rm = TRUE),
    HUMAN_INCIDENTS = sum(HUMAN_INCIDENTS, na.rm = TRUE),
    PROP_DAMAGE = sum(PROP_DMG_USD, na.rm = TRUE),
    CROP_DAMAGE = sum(CROP_DMG_USD, na.rm = TRUE),
    COMBINED_DMG = sum(COMBINED_DMG_USD, na.rm = TRUE)
  ) %>%
  filter(N_EVENTS >= 10) %>%
  mutate(
    FATALITIES_PER_EVENT = FATALITIES / N_EVENTS,
    INJURIES_PER_EVENT = INJURIES / N_EVENTS,
    HUMAN_INCIDENTS_PER_EVENT = HUMAN_INCIDENTS / N_EVENTS,
    PROP_DAMAGE_PER_EVENT = PROP_DAMAGE / N_EVENTS,
    CROP_DAMAGE_PER_EVENT = CROP_DAMAGE / N_EVENTS,
    COMBINED_DMG_PER_EVENT = COMBINED_DMG / N_EVENTS
  )


# Total human impact

total_human_impact <- df_summary %>%
    select(EVTYPE, N_EVENTS, FATALITIES, INJURIES, HUMAN_INCIDENTS) %>%
    arrange(desc(HUMAN_INCIDENTS))
  
kable(head(total_human_impact,10), caption = "Top 10 Event Types by Total Human Impact")
Top 10 Event Types by Total Human Impact
EVTYPE N_EVENTS FATALITIES INJURIES HUMAN_INCIDENTS
TORNADO 60652 5633 91346 96979
EXCESSIVE HEAT 1678 1903 6525 8428
TSTM WIND 219940 504 6957 7461
FLOOD 25326 470 6789 7259
LIGHTNING 15754 816 5230 6046
HEAT 767 937 2100 3037
FLASH FLOOD 54277 978 1777 2755
ICE STORM 2006 89 1975 2064
THUNDERSTORM WIND 82563 133 1488 1621
WINTER STORM 11433 206 1321 1527
# Human impact per event
per_event_human_impact <- df_summary %>%
    select(EVTYPE, N_EVENTS,
           FATALITIES_PER_EVENT,
           INJURIES_PER_EVENT,
           HUMAN_INCIDENTS_PER_EVENT) %>%
    arrange(desc(HUMAN_INCIDENTS_PER_EVENT)) %>%
    mutate(
      across(
        ends_with("_PER_EVENT"),
        ~ round(.x, 2)
      )
    ) 

kable(head(per_event_human_impact,10), caption = "Top 10 Event Types by Human Impact per Event")
Top 10 Event Types by Human Impact per Event
EVTYPE N_EVENTS FATALITIES_PER_EVENT INJURIES_PER_EVENT HUMAN_INCIDENTS_PER_EVENT
HURRICANE/TYPHOON 88 0.73 14.49 15.22
EXTREME HEAT 22 4.36 7.05 11.41
TSUNAMI 20 1.65 6.45 8.10
GLAZE 32 0.22 6.75 6.97
HEAT WAVE 74 2.32 4.18 6.50
EXCESSIVE HEAT 1678 1.13 3.89 5.02
HEAT 767 1.22 2.74 3.96
MIXED PRECIP 10 0.20 2.60 2.80
ICE 61 0.10 2.25 2.34
UNSEASONABLY WARM AND DRY 13 2.23 0.00 2.23
p1 <- total_human_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, HUMAN_INCIDENTS)) %>%
  ggplot(aes(x = EVTYPE, y = HUMAN_INCIDENTS)) +
  geom_col(fill = "darkblue") +
  coord_flip() +
  labs(
    title = "Total Human Incidents",
    x = "Event Type",
    y = "Total Incidents"
  )

p2 <- per_event_human_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, HUMAN_INCIDENTS_PER_EVENT)) %>%
  ggplot(aes(x = EVTYPE, y = HUMAN_INCIDENTS_PER_EVENT)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(
    title = "Human Incidents per Event",
    x = "Event Type",
    y = "Incidents per Event"
  )

p1 + p2 + plot_layout(ncol = 2) + plot_annotation(title = "Figure 1: Human Impact by Event Type")

From the figure it can be seen that by far tornadoes are the higher cause of human health related incidents, however the high amount of such yearly events in the contiguous United States is the likely cause of such impact. On a per event basis however, it can be observed that hurricanes are the most dangerous to humans with more than 15 health incidents per event.

Events Leading to Highest Economic Damage

Similar to the case for human impact, it makes sense to analyse the amount of economic damage caused by an event both a nominal and per event basis.

# Total economic damage
total_economic_impact <- df_summary %>%
  select(EVTYPE, N_EVENTS, PROP_DAMAGE, CROP_DAMAGE, COMBINED_DMG) %>%
  arrange(desc(COMBINED_DMG))

kable(head(total_economic_impact, 10),caption = "Top 10 Event Types by Total Economic Damage")
Top 10 Event Types by Total Economic Damage
EVTYPE N_EVENTS PROP_DAMAGE CROP_DAMAGE COMBINED_DMG
FLOOD 25326 144657709807 5661968450 150319678257
HURRICANE/TYPHOON 88 69305840000 2607872800 71913712800
TORNADO 60652 56937160779 414953270 57352114049
STORM SURGE 261 43323536000 5000 43323541000
HAIL 288661 15732267543 3025954473 18758222016
FLASH FLOOD 54277 16140812067 1421317100 17562129167
DROUGHT 2488 1046106000 13972566000 15018672000
HURRICANE 174 11868319010 2741910000 14610229010
RIVER FLOOD 173 5118945500 5029459000 10148404500
ICE STORM 2006 3944927860 5022113500 8967041360
# Economic damage per event
per_event_economic_impact <- df_summary %>%
  select(EVTYPE, N_EVENTS,
         PROP_DAMAGE_PER_EVENT,
         CROP_DAMAGE_PER_EVENT,
         COMBINED_DMG_PER_EVENT) %>%
  arrange(desc(COMBINED_DMG_PER_EVENT))
kable(head(per_event_economic_impact,10), caption = "Top 10 Event Types by Economic Damage per Event")
Top 10 Event Types by Economic Damage per Event
EVTYPE N_EVENTS PROP_DAMAGE_PER_EVENT CROP_DAMAGE_PER_EVENT COMBINED_DMG_PER_EVENT
HURRICANE/TYPHOON 88 787566364 2.963492e+07 817201282
STORM SURGE 261 165990559 1.915709e+01 165990579
SEVERE THUNDERSTORM 13 92720000 1.538462e+04 92735385
HURRICANE 174 68208730 1.575810e+07 83966833
RIVER FLOOD 173 29589280 2.907202e+07 58661298
TYPHOON 11 54566364 7.500000e+04 54641364
STORM SURGE/TIDE 148 31359378 5.743243e+03 31365122
FLASH FLOOD/FLOOD 22 12384091 2.522727e+04 12409318
TROPICAL STORM 690 11165059 9.831101e+05 12148169
TSUNAMI 20 7203100 1.000000e+03 7204100
p3 <- total_economic_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, COMBINED_DMG)) %>%
  ggplot(aes(x = EVTYPE, y = COMBINED_DMG)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Total Economic Damage",
    x = "Event Type",
    y = "Combined Damage (USD)"
  )

p4 <- per_event_economic_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, COMBINED_DMG_PER_EVENT)) %>%
  ggplot(aes(x = EVTYPE, y = COMBINED_DMG_PER_EVENT)) +
  geom_col(fill = "darkorange") +
  coord_flip() +
  labs(
    title = "Economic Damage per Event",
    x = "Event Type",
    y = "Damage per Event (USD)"
  )

p3 + p4 +
  plot_layout(ncol = 2) +
  plot_annotation(
    title = "Figure 2: Economic Impact by Event Type"
  )

It can be seen that floods have lead to the highest combined economic impact in the United States, however this could likely be explained by a higher frequency of events of this type. On a per event basis, hurricanes lead to the highest economic impact among all events.

Conclusions

After analyzing the data, the following can be concluded:

Limitations: