Abstract

The following is a basic analysis of severe weather data from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database contains information from 1950 to 2011 related to the characteristics of major storms and weather events in the United States, including injuries, fatalities, and property damage. This analysis addresses the questions of which types of severe weather events are most harmful with respect to population health and which types of events have the greatest economic consequences. The researcher has selected property damage estimates and injuries as the metrics by which to measure economic impact and harm to population health respectively.

Results

The data indicate clearly that tornadoes are the most costly severe weather events in terms of personal injuries, while floods are the most costly in terms of estimated property damage.

Data Processing

setwd("~/R for DS/StormResearch/")

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Big Sur 11.7.10
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.33     R6_2.5.1          fastmap_1.1.1     xfun_0.40        
##  [5] cachem_1.0.8      knitr_1.43        htmltools_0.5.6   rmarkdown_2.24   
##  [9] cli_3.6.1         sass_0.4.7        jquerylib_0.1.4   compiler_4.3.1   
## [13] rstudioapi_0.15.0 tools_4.3.1       evaluate_0.21     bslib_0.5.1      
## [17] yaml_2.3.7        rlang_1.1.2       jsonlite_1.8.7
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(httr)
library(readr)
library(knitr)

# Set source url
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# Save local file destination
src <- "src/StormData.bz2"

# Download the file
download.file(url, destfile = src, method = "curl")

# Read the .bzip2 file into a data frame
df <- read.csv(pipe(paste("bzcat", src)), header = TRUE)

Explore Data

kable(head(df,10))
STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO 0 0 NA 0 14.0 100 3 0 0 15 25.0 K 0 3040 8812 3051 8806 1
1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO 0 0 NA 0 2.0 150 2 0 0 0 2.5 K 0 3042 8755 0 0 2
1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO 0 0 NA 0 0.1 123 2 0 0 2 25.0 K 0 3340 8742 0 0 3
1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO 0 0 NA 0 0.0 100 2 0 0 2 2.5 K 0 3458 8626 0 0 4
1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO 0 0 NA 0 0.0 150 2 0 0 2 2.5 K 0 3412 8642 0 0 5
1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO 0 0 NA 0 1.5 177 2 0 0 6 2.5 K 0 3450 8748 0 0 6
1 11/16/1951 0:00:00 0100 CST 9 BLOUNT AL TORNADO 0 0 NA 0 1.5 33 2 0 0 1 2.5 K 0 3405 8631 0 0 7
1 1/22/1952 0:00:00 0900 CST 123 TALLAPOOSA AL TORNADO 0 0 NA 0 0.0 33 1 0 0 0 2.5 K 0 3255 8558 0 0 8
1 2/13/1952 0:00:00 2000 CST 125 TUSCALOOSA AL TORNADO 0 0 NA 0 3.3 100 3 0 1 14 25.0 K 0 3334 8740 3336 8738 9
1 2/13/1952 0:00:00 2000 CST 57 FAYETTE AL TORNADO 0 0 NA 0 2.3 100 3 0 0 0 25.0 K 0 3336 8738 3337 8737 10
kable(summary(df))
STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
Min. : 1.0 Length:902297 Length:902297 Length:902297 Min. : 0 Length:902297 Length:902297 Length:902297 Min. : 0.00 Length:902297 Length:902297 Length:902297 Length:902297 Min. :0 Mode:logical Min. : 0.000 Length:902297 Length:902297 Min. : 0.00 Min. : 0.0 Min. :0 Min. : 0.0 Min. : 0.000 Min. : 0.000 Min. : 0.0 Length:902297 Min. : 0.00 Length:902297 Length:902297 Length:902297 Length:902297 Min. : 0 Min. :-14451 Min. : 0 Min. :-14455 Length:902297 Min. : 1
1st Qu.:19.0 Class :character Class :character Class :character 1st Qu.: 31 Class :character Class :character Class :character 1st Qu.: 0.00 Class :character Class :character Class :character Class :character 1st Qu.:0 NA’s:902297 1st Qu.: 0.000 Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:0 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0 Class :character 1st Qu.: 0.00 Class :character Class :character Class :character Class :character 1st Qu.:2802 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0 Class :character 1st Qu.:225575
Median :30.0 Mode :character Mode :character Mode :character Median : 75 Mode :character Mode :character Mode :character Median : 0.00 Mode :character Mode :character Mode :character Mode :character Median :0 NA Median : 0.000 Mode :character Mode :character Median : 0.00 Median : 0.0 Median :1 Median : 50.0 Median : 0.000 Median : 0.000 Median : 0.0 Mode :character Median : 0.00 Mode :character Mode :character Mode :character Mode :character Median :3540 Median : 8707 Median : 0 Median : 0 Mode :character Median :451149
Mean :31.2 NA NA NA Mean :101 NA NA NA Mean : 1.48 NA NA NA NA Mean :0 NA Mean : 0.986 NA NA Mean : 0.23 Mean : 7.5 Mean :1 Mean : 46.9 Mean : 0.017 Mean : 0.156 Mean : 12.1 NA Mean : 1.53 NA NA NA NA Mean :2875 Mean : 6940 Mean :1452 Mean : 3509 NA Mean :451149
3rd Qu.:45.0 NA NA NA 3rd Qu.:131 NA NA NA 3rd Qu.: 1.00 NA NA NA NA 3rd Qu.:0 NA 3rd Qu.: 0.000 NA NA 3rd Qu.: 0.00 3rd Qu.: 0.0 3rd Qu.:1 3rd Qu.: 75.0 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0.5 NA 3rd Qu.: 0.00 NA NA NA NA 3rd Qu.:4019 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735 NA 3rd Qu.:676723
Max. :95.0 NA NA NA Max. :873 NA NA NA Max. :3749.00 NA NA NA NA Max. :0 NA Max. :925.000 NA NA Max. :2315.00 Max. :4400.0 Max. :5 Max. :22000.0 Max. :583.000 Max. :1700.000 Max. :5000.0 NA Max. :990.00 NA NA NA NA Max. :9706 Max. : 17124 Max. :9706 Max. :106220 NA Max. :902297
NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA’s :843563 NA NA NA NA NA NA NA NA NA NA NA’s :47 NA NA’s :40 NA NA NA

Clean up formatting and filter data

Because the earlier years represented in the database record fewer events and are less reliable due to inferior record-keeping, this analysis focuses on the 25 year period from 1986 to 2011.

beg <- ymd("1986-01-01", tz = "America/Chicago")
end <- ymd("2011-11-30", tz = "America/Chicago")

# Convert BGD_DATE field from character to date; Create normalized property
# damage field by using PROPDMG value multiplied by the PROPDMGEXP value. The
# property damage is normalized to represent millions of dollars.
df <- df %>% 
        mutate(DMGCOSTNORM = case_when(
                PROPDMGEXP == "M" ~ (PROPDMG * 1000000),
                PROPDMGEXP == "B" ~ (PROPDMG * 1000000000),
                PROPDMG > 0 ~ (PROPDMG * 1000)),
               DMGCOSTNORM = ifelse(DMGCOSTNORM > 0, DMGCOSTNORM / 1000000,
                                    DMGCOSTNORM),
               BGN_DATE = mdy_hms(BGN_DATE, tz = "America/Chicago")) %>% 
        filter(BGN_DATE >= beg, BGN_DATE <= end) %>% 
        relocate(DMGCOSTNORM, .before = CROPDMG)
# Analyze Event Types by Injury count
injuries <- df %>%
  group_by(EVTYPE) %>%
  summarize(totalInjuries = sum(INJURIES, na.rm = TRUE)) %>% 
        filter(totalInjuries > 50) %>%
  arrange(desc(totalInjuries))

# Display top 5 injurious events
kable(head(injuries, 5))
EVTYPE totalInjuries
TORNADO 30186
FLOOD 6789
EXCESSIVE HEAT 6525
TSTM WIND 6418
LIGHTNING 5230
# Create a bar plot showing injuries counts by events with more than 50 injuries
ggplot(injuries, aes(x = reorder(EVTYPE, -totalInjuries), y = totalInjuries)) +
  geom_bar(stat = "identity", fill = "cornflowerblue") +
  labs(title = "Total Injuries by Event Type",
       x = "Event Type",
       y = "Total Injuries") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Calculate total injuries from severe weather events and total minus tornadoes.
allInj <- sum(df$INJURIES)
twisterInj <- df %>% 
        filter(EVTYPE == "TORNADO") %>% 
        summarize(twistInj = sum(INJURIES))
allTornado <- twisterInj[1,1]
allOthers <- allInj - allTornado

The data in this plot are difficult to analyze because of the extremely high number of tornado injuries as compared to injuries from other events. tornadoes accounted roughly three-fifths of all severe weather injuries throughout the United States from 1986 to 2011. During this period, there were 78735 total severe weather injuries, of which 30186 were from tornadoes and 48549 were from other events.

# Plot non-tornado injuries
nonTornado <- df %>%
  group_by(EVTYPE) %>%
  summarize(totalInjuries = sum(INJURIES, na.rm = TRUE)) %>% 
        filter(totalInjuries > 50,
               EVTYPE != "TORNADO") %>%
  arrange(desc(totalInjuries))

# Create a bar plot showing injuries counts by non-Tornado events with more than 
# 50 injuries
ggplot(nonTornado, aes(x = reorder(EVTYPE, -totalInjuries), y = totalInjuries)) +
  geom_bar(stat = "identity", fill = "cornflowerblue") +
  labs(title = "Total Injuries by Event Type Excluding tornadoes",
       x = "Event Type",
       y = "Total Injuries") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Property Damage

# Analyze Event Types by Injury count
damage <- df %>%
  group_by(EVTYPE) %>%
  summarize(totalDamage = sum(DMGCOSTNORM, na.rm = TRUE)) %>% 
        filter(totalDamage > 500) %>%
  arrange(desc(totalDamage))

# Display the 5 most costly events in terms of property damage estimates
kable(head(damage, 5))
EVTYPE totalDamage
FLOOD 144657.7
HURRICANE/TYPHOON 69305.8
STORM SURGE 43323.5
TORNADO 34774.3
FLASH FLOOD 16141.4
# Create a bar plot showing injuries counts by events with more than 50 injuries
ggplot(damage, aes(x = reorder(EVTYPE, -totalDamage), y = totalDamage)) +
  geom_bar(stat = "identity", fill = "cornflowerblue") +
  labs(title = "Total Property Damage Estimate by Event Type",
       x = "Event Type",
       y = "Total Damage in Millions $") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))