Synopsis

This data analysis involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

This report addresses two questions:
1. Across the United States, which types of events are most harmful with respect to population health?
2. Across the United States, which types of events have the greatest economic consequences?

Data Processing

I read the data from a local file instead of calling download.file() because my network connection is very slow: every time I tried download.file(), the transfer stalled and only part of the file was downloaded. I therefore downloaded the file with Chrome into my local repository first and loaded it with read.csv(). The data can be found HERE.

Load the data

library(knitr)
library(readr)
# Load the raw storm records from the local comma-separated copy.
sdata <- read.csv(file = "StormData.csv", sep = ",")

Process the data

  1. Keep only the events that began on or after 1 January 1996, and keep only the columns needed for the analysis.
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Parse the begin-date strings (month/day/year hour:minute:second) into
# date-times; mdy_hms() produces UTC values unless a tz is supplied.
st <- sdata %>% mutate(BGN_DATE = mdy_hms(BGN_DATE))
# Keep events from 1 January 1996 onward and only the columns used later.
# The cutoff is built explicitly in UTC: comparing a date-time against the bare
# string "1996-01-01" makes R parse the string in the session's local time
# zone, which shifts the boundary by the UTC offset and can drop (or admit)
# events dated right at the cutoff.
st <- st %>%
  filter(BGN_DATE >= as.POSIXct("1996-01-01", tz = "UTC")) %>%
  select(BGN_DATE, EVTYPE, FATALITIES, INJURIES,
         PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
  1. Convert the damage exponent codes (B, M, K) to numeric multipliers.
# Translate the damage-exponent codes into numeric multipliers:
#   "B" -> 10^9, "M" -> 10^6, "K" -> 10^3, anything else (including "") -> 0.
# A single lookup replaces the chain of in-place assignments, which pushed
# numbers through a character column, failed on factor columns, and turned
# lowercase or unrecognised codes into NA (which then poisons later sums).
exp_multiplier <- c(B = 10^9, M = 10^6, K = 10^3)

# Map a vector of exponent codes to multipliers.
#   code: character (or factor) vector of exponent codes.
#   Returns a numeric vector of the same length with no NA values.
exp_to_multiplier <- function(code) {
  # as.character() makes factor input safe; toupper() accepts "k"/"m"/"b".
  mult <- unname(exp_multiplier[toupper(as.character(code))])
  # Codes absent from the table (blank, NA, digits, symbols) index to NA;
  # treat them as "no multiplier", matching how blank codes were handled.
  mult[is.na(mult)] <- 0
  mult
}

st <- st %>%
  mutate(PROPDMGEXP = exp_to_multiplier(PROPDMGEXP),
         CROPDMGEXP = exp_to_multiplier(CROPDMGEXP))
# Use lower-case column names from here on.
names(st) <- tolower(names(st))
  1. Quantify the losses. The economic loss is the sum of property damage and crop damage. The health loss combines fatalities and injuries, weighted 1 and 0.3, respectively.
# Derive the two loss measures for every event, then keep the columns from
# bgn_date through injuries plus the two new measures.
#   ecnmcloss: property damage and crop damage, each scaled by its multiplier
#   hlthloss:  fatalities (weight 1) plus injuries (weight 0.3)
st <- st %>%
  mutate(ecnmcloss = propdmg * propdmgexp + cropdmg * cropdmgexp) %>%
  mutate(hlthloss = fatalities + 0.3 * injuries) %>%
  select(bgn_date:injuries, ecnmcloss, hlthloss)
  1. Load the 48 official events. Classification found HERE. I manually input the data in a csv file! Maybe there is a better way to do it.
# Read the hand-entered table of official event names, then preview the
# first and last five rows.
events48 <- read.csv(file = "events.csv")
head(events48, n = 5)
tail(events48, n = 5)
  1. Now, reduce the number of events to 48!
library(stringr)
library(stringdist)
## Warning: package 'stringdist' was built under R version 4.0.2
  1. Convert the event names in st to title case. Replace “Tstm” with “Thunderstorm” for more accurate matching.
# Title-case the raw event labels, then expand the "Tstm" abbreviation so the
# labels sit closer to the official event names before approximate matching.
st$evtype <- st$evtype %>%
  str_to_title() %>%
  gsub(pattern = "Tstm", replacement = "Thunderstorm", x = .)
  1. Use amatch in the stringdist package for approximate matching.
# Assign each event label the official name that amatch() reports as closest.
# NOTE(review): maxDist = 100 looks large enough that nearly every label gets
# some match, however dissimilar - confirm this is intended.
st$evclass <- events48$events[
  amatch(st$evtype, events48$events, maxDist = 100)
]

Results

1. Across the United States, which types of events are most harmful with respect to population health?

library(ggplot2)
# Total health loss per official event class, largest first.
# .groups = "drop" returns an ungrouped result without the summarise message.
ev_hlth <- st %>%
  group_by(evclass) %>%
  summarize(health_loss = sum(hlthloss), .groups = "drop") %>%
  rename(event = evclass) %>%
  arrange(desc(health_loss))
# head() returns at most five rows; indexing with 1:5 would pad the table
# with all-NA rows if fewer than five classes were present.
ev_hlth_top5 <- head(ev_hlth, 5)

# Bar chart of the five most harmful event classes, tallest bar first.
g1 <- ggplot(data = ev_hlth_top5,
             aes(x = reorder(event, -health_loss), y = health_loss))
g1 +
  geom_col(fill = "steelblue") +
  # Round the printed value so accumulated floating-point error from the
  # 0.3 weighting cannot leak into the labels (the economic plot also rounds).
  geom_text(aes(label = round(health_loss, digits = 1)),
            vjust = -0.3, size = 3) +
  labs(x = "Event",
       y = "Loss",
       title = "Population Health Loss v. Severe Weather Event in US Since 1996",
       subtitle = "Note: Loss = 1 * fatalities + 0.3 * Injuries") +
  theme_classic()

2. Across the United States, which types of events have the greatest economic consequences?

library(ggplot2)
# Sum the economic loss within each official event class and rank descending.
ev_eco <- st %>%
  group_by(evclass) %>%
  summarize(economic_loss = sum(ecnmcloss)) %>%
  rename(event = evclass) %>%
  arrange(desc(economic_loss))
## `summarise()` ungrouping output (override with `.groups` argument)
# Take the first five rows and express the loss in billions, to one decimal.
ev_eco_top5 <- ev_eco[seq_len(5), ] %>%
  mutate(scl_eco_loss = round(economic_loss / 10^9, digits = 1))

# Bar chart of the five costliest event classes, tallest bar first.
g1 <- ggplot(data = ev_eco_top5,
             aes(x = reorder(event, -scl_eco_loss), y = scl_eco_loss))
g1 +
  geom_col(fill = "brown") +
  geom_text(aes(label = scl_eco_loss), vjust = -0.3, size = 3) +
  labs(x = "Event",
       y = "Loss",
       title = "Economic Loss v. Severe Weather Event in US Since 1996",
       subtitle = "Note: Economic Loss = Property Loss + Crop Loss (in billions)") +
  theme_classic()