Synopsis: This document describes the effects of storms and severe weather events on public health and economic outcomes. Data were downloaded from a publicly available dataset from NOAA. Public health outcomes were described using the total number of injuries and fatalities per event, while economic outcomes were described using the cost of property and crop damage. Due to the large number of severe weather events, I only depicted the top 15 events that caused the greatest adverse health outcomes (total injuries + fatalities) and greatest economic consequences (total property + crop damage). Missing data were not included in this specific analysis. Although these values may be imputed in a variety of ways, I did not want to risk any confusion during the peer review process.

Data Processing

# Download the compressed storm data set, but only when no local copy exists
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# The archive is written in binary mode (mode = "wb")
if (!file.exists("StormData.csv.bz2")) {
  download.file(url = url, destfile = "StormData.csv.bz2", mode = "wb")
}

# Read the raw data; read_csv() decompresses the .bz2 archive on the fly
storm.df <- read_csv(file = "StormData.csv.bz2")
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl  (1): COUNTYENDN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Columns that hold categorical codes and should be stored as factors
factor.vars <- c("TIME_ZONE", "STATE", "EVTYPE", "CROPDMGEXP", "PROPDMGEXP")

# Convert each of those columns in place, leaving all other columns untouched
storm.df <- storm.df %>%
  mutate(across(all_of(factor.vars), as.factor))


###############################  Data processing: Question 1  ######################

# Q1: Across the United States, which types of events are most harmful with respect to population health?

# Note to reviewer: Health here is being defined as injuries and fatalities. I describe these using the total number, mean, and SD of these events. I also calculate the total number of injuries + fatalities as a broad summary measure

# Per-event-type summary of the health outcomes.
# For both injuries and fatalities: the total count plus the mean and SD per
# recorded event (rounded to 2 decimals). The mean/SD may not give the full
# picture, since events may not occur in (densely) populated areas.
# TotalHealthImpact is the combined number of fatalities and injuries.
pop.health <- summarise(
  group_by(storm.df, EVTYPE),
  N_Events          = n(),
  InjuriesTotal     = sum(INJURIES, na.rm = TRUE),
  InjuriesMean      = round(mean(INJURIES, na.rm = TRUE), 2),
  InjuriesSD        = round(sd(INJURIES, na.rm = TRUE), 2),
  FatalitiesTotal   = sum(FATALITIES, na.rm = TRUE),
  FatalitiesMean    = round(mean(FATALITIES, na.rm = TRUE), 2),
  FatalitiesSD      = round(sd(FATALITIES, na.rm = TRUE), 2),
  TotalHealthImpact = FatalitiesTotal + InjuriesTotal
)

# Sort event types from highest to lowest Total Health Impact
# (fatalities + injuries) and keep the first 15 rows
top.concerns <- head(arrange(pop.health, desc(TotalHealthImpact)), n = 15)

# Reshape to long format for plotting: one row per event type and outcome
# (Injuries / Fatalities), with the Total, Mean and SD kept as columns.
# Outcome is stored as a factor with Injuries ordered before Fatalities.
top.concerns.l <- top.concerns %>%
  pivot_longer(cols = InjuriesTotal:FatalitiesSD,
               names_to = c("Outcome", ".value"),
               names_pattern = "(Injuries|Fatalities)(Total|Mean|SD)") %>%
  mutate(Outcome = factor(Outcome, levels = c("Injuries", "Fatalities")))


# Concise summary table: total number of injuries and fatalities per event,
# with reader-friendly column headers
top.concerns2 <- select(top.concerns,
                        `Event` = EVTYPE,
                        `Number of Events` = N_Events,
                        `Total Number of Injuries` = InjuriesTotal,
                        `Total Number of Fatalities` = FatalitiesTotal)



###############################  Data processing: Question 2  ######################

# Q2: Across the United States, which types of events have the greatest economic consequences?

# Convert economic damage (crop and property) to more usable numeric format
# First identify the levels of these two variables
# Both columns were converted to factors above, so levels() lists every
# distinct exponent code present in the data. The printed levels show a mix of
# upper- and lower-case letters, digits and punctuation symbols, all of which
# the multiplier mapping below has to account for.
levels(storm.df$CROPDMGEXP)
## [1] "?" "0" "2" "B" "k" "K" "m" "M"
levels(storm.df$PROPDMGEXP)
##  [1] "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K" "m" "M"
# Adjust the multipliers to be more compatible with data analysis in R
#
# dmgexp.multiplier() maps a vector of damage-exponent codes to numeric
# multipliers:
#   "H"/"h" -> 1e2, "K"/"k" -> 1e3, "M"/"m" -> 1e6, "B" -> 1e9,
#   a single digit d -> 10^d,
#   anything else ("-", "?", "+", missing) -> NA
# Argument: exp.code, a factor or character vector of exponent codes.
# Returns:  a numeric vector of the same length as exp.code.
dmgexp.multiplier <- function(exp.code) {
  # Work on the labels, not the factor itself: as.numeric() applied to a
  # factor returns the internal level index rather than the printed digit.
  code <- as.character(exp.code)
  mult <- rep(NA_real_, length(code))

  mult[code %in% c("H", "h")] <- 1e2
  mult[code %in% c("K", "k")] <- 1e3
  mult[code %in% c("M", "m")] <- 1e6
  mult[code %in% "B"] <- 1e9

  # Only the digit codes are converted, so no coercion warnings are raised
  digit.rows <- code %in% as.character(0:9)
  mult[digit.rows] <- 10^as.numeric(code[digit.rows])

  mult
}

# Each multiplier is derived from its own exponent column
econ.df <- storm.df %>%
  mutate(crop.mult = dmgexp.multiplier(CROPDMGEXP),
         prop.mult = dmgexp.multiplier(PROPDMGEXP))


# Build the corrected numeric cost columns:
#   prop.cor / crop.cor = reported damage times its multiplier
#   tot.dam             = property + crop damage for the same record
econ.df <- econ.df %>%
  mutate(prop.cor = PROPDMG * prop.mult,
         crop.cor = CROPDMG * crop.mult,
         tot.dam = prop.cor + crop.cor)


# Per-event-type totals of property and crop damage (missing values are
# skipped), plus their sum as the overall economic impact
econ.dmg <- summarise(
  group_by(econ.df, EVTYPE),
  N_Events        = n(),
  PropDmgTotal    = sum(prop.cor, na.rm = TRUE),
  CropDmgTotal    = sum(crop.cor, na.rm = TRUE),
  TotalEconImpact = PropDmgTotal + CropDmgTotal
)

# Sort event types from highest to lowest Total Economic Impact and keep the
# first 15 rows
top.econ <- head(arrange(econ.dmg, desc(TotalEconImpact)), n = 15)

# Reshape to long format for plotting: drop the combined total, stack the
# property and crop totals into one EconomicDamage column, and give the Type
# factor more readable level names
top.econ.l <- top.econ %>%
  select(-TotalEconImpact) %>%
  pivot_longer(cols = c(PropDmgTotal, CropDmgTotal),
               names_to = "Type", values_to = "EconomicDamage") %>%
  mutate(Type = fct_recode(as.factor(Type),
                           "PropertyDamage" = "PropDmgTotal",
                           "CropDamage" = "CropDmgTotal"))


# Copy of the top-15 table with descriptive column headers for the kable table
top.econ2 <- rename(top.econ,
                    `Event` = EVTYPE,
                    `Number of Events` = N_Events,
                    `Total Property Damage (USD)` = PropDmgTotal,
                    `Total Crop Damage (USD)` = CropDmgTotal,
                    `Total Economic Damage (USD)` = TotalEconImpact)

Results

###################################   Question 1   ######################################
# Figure 1: Bar graph of the top 15 events that adversely affect population
# health. One panel per event type (each with its own y scale), showing the
# total number of injuries and fatalities.

fig1.bar <- ggplot(top.concerns.l,
                   aes(x = Outcome, y = Total, fill = Outcome)) +
  geom_col(position = "dodge") +
  facet_wrap(~EVTYPE, ncol = 3, scales = "free_y") +
  labs(x = "Outcome",
       y = "Number of Observations",
       title = "Most Harmful Events: Injuries and Fatalities") +
  theme_bw() +
  theme(strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(hjust = 0.5))

fig1.bar

# Table 1: Public health outcomes as a centred, striped HTML table
health.table <- kable_styling(
  kbl(top.concerns2, align = "c"),
  bootstrap_options = c("striped", "hover"),
  position = "center"
)

health.table
Event Number of Events Total Number of Injuries Total Number of Fatalities
TORNADO 60652 91346 5633
EXCESSIVE HEAT 1678 6525 1903
TSTM WIND 219944 6957 504
FLOOD 25326 6789 470
LIGHTNING 15755 5230 816
HEAT 767 2100 937
FLASH FLOOD 54278 1777 978
ICE STORM 2006 1975 89
THUNDERSTORM WIND 82563 1488 133
WINTER STORM 11433 1321 206
HIGH WIND 20212 1137 248
HAIL 288661 1361 15
HURRICANE/TYPHOON 88 1275 64
HEAVY SNOW 15708 1021 127
WILDFIRE 2761 911 75
########################################    Question 2   ##########################################

# Figure 2: Bar graph of the top 15 events with the greatest economic
# consequences. One panel per damage type (property / crop), showing the total
# damage ($) per event type. The fill colour only distinguishes event types
# already named on the y axis, so the legend is hidden.

fig2.bar <- ggplot(top.econ.l,
                   mapping = aes(x = EconomicDamage, y = EVTYPE, fill = EVTYPE)) +
  geom_col(position = "dodge") +
  facet_wrap(~Type, ncol = 1, scales = "free_y") +
  labs(x = "Total Economic Consequences ($)", y = "Event Type",
       title = "Events with Greatest Economic Consequences") +
  theme_bw() +
  theme(strip.background = element_rect(colour = "black", fill = "white"),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none")

fig2.bar

# Table 2: Economic damage as a centred, striped HTML table
econ.table <- kable_styling(
  kbl(top.econ2, align = "c"),
  bootstrap_options = c("striped", "hover"),
  position = "center"
)

econ.table
Event Number of Events Total Property Damage (USD) Total Crop Damage (USD) Total Economic Damage (USD)
FLOOD 25326 144657709800 5661968450 150319678250
HURRICANE/TYPHOON 88 69305840000 2607872800 71913712800
TORNADO 60652 56937160480 414969110 57352129590
STORM SURGE 261 43323536000 5000 43323541000
HAIL 288661 15732267220 3025956450 18758223670
FLASH FLOOD 54278 16140861510 1421317100 17562178610
DROUGHT 2488 1046106000 13972566000 15018672000
HURRICANE 174 11868319010 2741910000 14610229010
RIVER FLOOD 173 5118945500 5029459000 10148404500
ICE STORM 2006 3944927810 5022113500 8967041310
TROPICAL STORM 690 7703890550 678346000 8382236550
WINTER STORM 11433 6688497250 26944000 6715441250
HIGH WIND 20212 5270046260 638571300 5908617560
WILDFIRE 2761 4765114000 295472800 5060586800
TSTM WIND 219944 4493028440 554007350 5047035790