knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(stringr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(patchwork)

Synopsis

The United States is hit every year by a diverse sort of weather events varying in both type an intensity. Unfortunately, some of these weather event can lead to a negative economic impact, and in many cases, to the loss of human life.

The U.S. National Oceanic and Atmospheric Administration’s (NOAA) has tracked major storms and weather events in the country during the period of 1950-2011, including data on dates and location of occurrence, fatalities, injuries, and property damage. The following analysis provides an overview on the impact of these weather events, and attempts to answer which such events are more harmful to human life and have a higher economic impact.

Data was processed and the economic impact as well as the effect on human health was analyzed on a nominal an per event basis, in order to understand the events with the biggest consequences.

Data Processing

Data set can be downloaded from here.

#Download data from source
tmp <- tempfile(fileext = ".bz2")

if (!file.exists(tmp))
{
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = tmp,
    mode = "wb"
  )
}

df_raw <- read.csv(bzfile(tmp))

The following offers a quick overview of the data before any processing had taken place. Given the amount of columns in the data, the table is shown split for better visualization.

Raw Data Overview

The following offers a quick overview of the data before any processing had taken place. Given the amount of columns in the data, the table is shown split for better visualization.

n <- ncol(df_raw)

breaks <- round(seq(0, n, length.out = 5))

kable(head(df_raw[, (breaks[1] + 1):breaks[2]]), caption = "Part 1")

Part 1
STATE__	BGN_DATE	BGN_TIME	TIME_ZONE	COUNTY	COUNTYNAME	STATE	EVTYPE
1	4/18/1950 0:00:00	0130	CST	97	MOBILE	AL	TORNADO
1	4/18/1950 0:00:00	0145	CST	3	BALDWIN	AL	TORNADO
1	2/20/1951 0:00:00	1600	CST	57	FAYETTE	AL	TORNADO
1	6/8/1951 0:00:00	0900	CST	89	MADISON	AL	TORNADO
1	11/15/1951 0:00:00	1500	CST	43	CULLMAN	AL	TORNADO
1	11/15/1951 0:00:00	2000	CST	77	LAUDERDALE	AL	TORNADO

kable(head(df_raw[, (breaks[2] + 1):breaks[3]]), caption = "Part 2")

Part 2
BGN_AZI	BGN_LOCATI	END_DATE	END_TIME	COUNTY_END	COUNTYENDN	END_RANGE	END_AZI	END_LOCATI
				0	NA	0
				0	NA	0
				0	NA	0
				0	NA	0
				0	NA	0
				0	NA	0

kable(head(df_raw[, (breaks[3] + 1):breaks[4]]), caption = "Part 3")

Part 3
LENGTH	WIDTH	F	INJURIES	PROPDMG	PROPDMGEXP
14.0	100	3	15	25.0	K
2.0	150	2	0	2.5	K
0.1	123	2	2	25.0	K
0.0	100	2	2	2.5	K
0.0	150	2	2	2.5	K
1.5	177	2	6	2.5	K

kable(head(df_raw[, (breaks[4] + 1):breaks[5]]), caption = "Part 4")

Part 4
LATITUDE	LONGITUDE	LATITUDE_E	LONGITUDE_	REFNUM
3040	8812	3051	8806	1
3042	8755	0	0	2
3340	8742	0	0	3
3458	8626	0	0	4
3412	8642	0	0	5
3450	8748	0	0	6

Data Cleanup

Based on the content of the data, and the requirements of this analysis, the following columns were dropped because they do not contribute to the study:

Variables Removed	Variables Removed
STATE__	END_AZI
BGN_DATE	END_LOCATI
BGN_TIME	LENGTH
TIME_ZONE	WIDTH
COUNTY	F
COUNTYNAME	MAG
STATE	WFO
BGN_RANGE	STATEOFFIC
BGN_AZI	ZONENAMES
BGN_LOCATI	LATITUDE
END_DATE	LONGITUDE
END_TIME	LATITUDE_E
COUNTY_END	LONGITUDE_
COUNTYENDN	REMARKS
END_RANGE	REFNUM

In addition, the data columns representing both property and crop damages are split in unit (PROPDMG) and exponent columns (PROPDMGEXP), with the exponent column representing a multiplier (e.g. thousands or millions of USD). In order to obtain the true numerical value of the damage in US dollars it becomes thus necessary to multiply all values for their respective multiplier. This however also requires an extra step, as exponent data is not represented numerically inside the data, but rather as strings.

As a last step, it would make sense for the analysis to compute the total amount of damage and human related incidents.

#List all columns which should not be present in the final data set.
cols_to_drop <- c(
  "STATE__", "BGN_DATE", "BGN_TIME", "TIME_ZONE", "COUNTY",
  "COUNTYNAME", "STATE", "BGN_RANGE", "BGN_AZI", "BGN_LOCATI",
  "END_DATE", "END_TIME", "COUNTY_END", "COUNTYENDN", "END_RANGE",
  "END_AZI", "END_LOCATI", "LENGTH", "WIDTH", "F", "MAG", "WFO",
  "STATEOFFIC", "ZONENAMES", "LATITUDE", "LONGITUDE", "LATITUDE_E",
  "LONGITUDE_", "REMARKS", "REFNUM"
)

#Defines a map to connect exponent string values to a numeric value.
mult_map <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

cols_to_drop_2 <- c("PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")

df <- df_raw %>%
  select(-all_of(cols_to_drop)) %>% #Removes not needed columns.
  mutate(
    HUMAN_INCIDENTS = FATALITIES + INJURIES, 
    PROP_DMG_USD = #Define new columns with calculated value in USD
      PROPDMG * coalesce( #coalesce allows to define a default value of 1,in case value in column is NA. 
        #unname is necessary to remove the key from the key value pair and keep only the mapped value.
        unname(
            #With mult_map[key] it is possible to get the numerical value mapped to a letter.
            mult_map[str_to_upper(PROPDMGEXP)] #str_to_upper is needed in case data is lowercase.
          ),
        1
      ),
    CROP_DMG_USD =
      CROPDMG *
      coalesce(
        unname(mult_map[str_to_upper(CROPDMGEXP)]),
        1
      ),
    COMBINED_DMG_USD = PROP_DMG_USD + CROP_DMG_USD
  ) %>% select(-all_of(cols_to_drop_2))

kable(head(df), caption = "Processed Data")

Processed Data
EVTYPE	INJURIES	HUMAN_INCIDENTS	PROP_DMG_USD	COMBINED_DMG_USD
TORNADO	15	15	25000	25000
TORNADO	0	0	2500	2500
TORNADO	2	2	25000	25000
TORNADO	2	2	2500	2500
TORNADO	2	2	2500	2500
TORNADO	6	6	2500	2500

Results

Events Most Harmful to Human Population

In order to understand the impact of a specific weather event on human health it is necessary to calculate the total impact per specific type of event (i.e. sum the total amount of injuries or deaths that a certain weather event type caused).

However this only gives a partial view of the truth, as an event type might cause more deaths in total just because there are more events of this type. In this regard it would make sense to also understand the danger a single occurrence of a certain event can pose to human lives.

Finally it would make sense to filter out events with less than 10 occurrences, as such a small sample of event could no provide enough information to infer whether the event corresponds to an outlier or not.

df_summary <- df %>%
  group_by(EVTYPE) %>%
  summarise(
    N_EVENTS = n(),
    FATALITIES = sum(FATALITIES, na.rm = TRUE),
    INJURIES = sum(INJURIES, na.rm = TRUE),
    HUMAN_INCIDENTS = sum(HUMAN_INCIDENTS, na.rm = TRUE),
    PROP_DAMAGE = sum(PROP_DMG_USD, na.rm = TRUE),
    CROP_DAMAGE = sum(CROP_DMG_USD, na.rm = TRUE),
    COMBINED_DMG = sum(COMBINED_DMG_USD, na.rm = TRUE)
  ) %>%
  filter(N_EVENTS >= 10) %>%
  mutate(
    FATALITIES_PER_EVENT = FATALITIES / N_EVENTS,
    INJURIES_PER_EVENT = INJURIES / N_EVENTS,
    HUMAN_INCIDENTS_PER_EVENT = HUMAN_INCIDENTS / N_EVENTS,
    PROP_DAMAGE_PER_EVENT = PROP_DAMAGE / N_EVENTS,
    CROP_DAMAGE_PER_EVENT = CROP_DAMAGE / N_EVENTS,
    COMBINED_DMG_PER_EVENT = COMBINED_DMG / N_EVENTS
  )


# Total human impact

total_human_impact <- df_summary %>%
    select(EVTYPE, N_EVENTS, FATALITIES, INJURIES, HUMAN_INCIDENTS) %>%
    arrange(desc(HUMAN_INCIDENTS))
  
kable(head(total_human_impact,10), caption = "Top 10 Event Types by Total Human Impact")

Top 10 Event Types by Total Human Impact
EVTYPE	N_EVENTS	FATALITIES	INJURIES	HUMAN_INCIDENTS
TORNADO	60652	5633	91346	96979
EXCESSIVE HEAT	1678	1903	6525	8428
TSTM WIND	219940	504	6957	7461
FLOOD	25326	470	6789	7259
LIGHTNING	15754	816	5230	6046
HEAT	767	937	2100	3037
FLASH FLOOD	54277	978	1777	2755
ICE STORM	2006	89	1975	2064
THUNDERSTORM WIND	82563	133	1488	1621
WINTER STORM	11433	206	1321	1527

# Human impact per event
per_event_human_impact <- df_summary %>%
    select(EVTYPE, N_EVENTS,
           FATALITIES_PER_EVENT,
           INJURIES_PER_EVENT,
           HUMAN_INCIDENTS_PER_EVENT) %>%
    arrange(desc(HUMAN_INCIDENTS_PER_EVENT)) %>%
    mutate(
      across(
        ends_with("_PER_EVENT"),
        ~ round(.x, 2)
      )
    ) 

kable(head(per_event_human_impact,10), caption = "Top 10 Event Types by Human Impact per Event")

Top 10 Event Types by Human Impact per Event
EVTYPE	N_EVENTS	FATALITIES_PER_EVENT	INJURIES_PER_EVENT	HUMAN_INCIDENTS_PER_EVENT
HURRICANE/TYPHOON	88	0.73	14.49	15.22
EXTREME HEAT	22	4.36	7.05	11.41
TSUNAMI	20	1.65	6.45	8.10
GLAZE	32	0.22	6.75	6.97
HEAT WAVE	74	2.32	4.18	6.50
EXCESSIVE HEAT	1678	1.13	3.89	5.02
HEAT	767	1.22	2.74	3.96
MIXED PRECIP	10	0.20	2.60	2.80
ICE	61	0.10	2.25	2.34
UNSEASONABLY WARM AND DRY	13	2.23	0.00	2.23

p1 <- total_human_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, HUMAN_INCIDENTS)) %>%
  ggplot(aes(x = EVTYPE, y = HUMAN_INCIDENTS)) +
  geom_col(fill = "darkblue") +
  coord_flip() +
  labs(
    title = "Total Human Incidents",
    x = "Event Type",
    y = "Total Incidents"
  )

p2 <- per_event_human_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, HUMAN_INCIDENTS_PER_EVENT)) %>%
  ggplot(aes(x = EVTYPE, y = HUMAN_INCIDENTS_PER_EVENT)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(
    title = "Human Incidents per Event",
    x = "Event Type",
    y = "Incidents per Event"
  )

p1 + p2 + plot_layout(ncol = 2) + plot_annotation(title = "Figure 1: Human Impact by Event Type")

From the figure it can be seen that by far tornadoes are the higher cause of human health related incidents, however the high amount of such yearly events in the contiguous United States is the likely cause of such impact. On a per event basis however, it can be observed that hurricanes are the most dangerous to humans with more than 15 health incidents per event.

Events Leading to Highest Economic Damage

Similar to the case for human impact, it makes sense to analyse the amount of economic damage caused by an event both a nominal and per event basis.

# Total economic damage
total_economic_impact <- df_summary %>%
  select(EVTYPE, N_EVENTS, PROP_DAMAGE, CROP_DAMAGE, COMBINED_DMG) %>%
  arrange(desc(COMBINED_DMG))

kable(head(total_economic_impact, 10),caption = "Top 10 Event Types by Total Economic Damage")

Top 10 Event Types by Total Economic Damage
EVTYPE	N_EVENTS	PROP_DAMAGE	CROP_DAMAGE	COMBINED_DMG
FLOOD	25326	144657709807	5661968450	150319678257
HURRICANE/TYPHOON	88	69305840000	2607872800	71913712800
TORNADO	60652	56937160779	414953270	57352114049
STORM SURGE	261	43323536000	5000	43323541000
HAIL	288661	15732267543	3025954473	18758222016
FLASH FLOOD	54277	16140812067	1421317100	17562129167
DROUGHT	2488	1046106000	13972566000	15018672000
HURRICANE	174	11868319010	2741910000	14610229010
RIVER FLOOD	173	5118945500	5029459000	10148404500
ICE STORM	2006	3944927860	5022113500	8967041360

# Economic damage per event
per_event_economic_impact <- df_summary %>%
  select(EVTYPE, N_EVENTS,
         PROP_DAMAGE_PER_EVENT,
         CROP_DAMAGE_PER_EVENT,
         COMBINED_DMG_PER_EVENT) %>%
  arrange(desc(COMBINED_DMG_PER_EVENT))
kable(head(per_event_economic_impact,10), caption = "Top 10 Event Types by Economic Damage per Event")

Top 10 Event Types by Economic Damage per Event
EVTYPE	N_EVENTS	PROP_DAMAGE_PER_EVENT	CROP_DAMAGE_PER_EVENT	COMBINED_DMG_PER_EVENT
HURRICANE/TYPHOON	88	787566364	2.963492e+07	817201282
STORM SURGE	261	165990559	1.915709e+01	165990579
SEVERE THUNDERSTORM	13	92720000	1.538462e+04	92735385
HURRICANE	174	68208730	1.575810e+07	83966833
RIVER FLOOD	173	29589280	2.907202e+07	58661298
TYPHOON	11	54566364	7.500000e+04	54641364
STORM SURGE/TIDE	148	31359378	5.743243e+03	31365122
FLASH FLOOD/FLOOD	22	12384091	2.522727e+04	12409318
TROPICAL STORM	690	11165059	9.831101e+05	12148169
TSUNAMI	20	7203100	1.000000e+03	7204100

p3 <- total_economic_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, COMBINED_DMG)) %>%
  ggplot(aes(x = EVTYPE, y = COMBINED_DMG)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Total Economic Damage",
    x = "Event Type",
    y = "Combined Damage (USD)"
  )

p4 <- per_event_economic_impact %>%
  head(10) %>%
  mutate(EVTYPE = reorder(EVTYPE, COMBINED_DMG_PER_EVENT)) %>%
  ggplot(aes(x = EVTYPE, y = COMBINED_DMG_PER_EVENT)) +
  geom_col(fill = "darkorange") +
  coord_flip() +
  labs(
    title = "Economic Damage per Event",
    x = "Event Type",
    y = "Damage per Event (USD)"
  )

p3 + p4 +
  plot_layout(ncol = 2) +
  plot_annotation(
    title = "Figure 2: Economic Impact by Event Type"
  )

It can be seen that floods have lead to the highest combined economic impact in the United States, however this could likely be explained by a higher frequency of events of this type. On a per event basis, hurricanes lead to the highest economic impact among all events.

Conclusions

After analyzing the data, the following can be concluded:

The events with the highest combined effect on human lives are tornadoes.
The events with the highest combined economic effect on the US economy are floods.
On a per even basis, hurricanes are by far the leading cause of economic damage and human related incidents during the 61 year period under study.

Limitations:

The study only takes into consideration nominal values (i.e. not adjusted for inflation)
The study does not take into consideration population size, thus cannot really quantify the human impact on a true per capita basis.

Severe weather events in the United States: A 60-year insight

Marlon Cabrera

2026-06-03