library(ggplot2)
suppressMessages(library(dplyr))  
suppressMessages(library(R.utils))
## Warning: package 'R.utils' was built under R version 4.4.2

Analysis of Storm Data on U.S. Population Health and Economic Damage.

Synopsis:

This paper analyzes data in the NOAA Storm Data database spanning from 1950 to November 2011. It identifies the top five weather event types contributing to fatalities in the U.S. as Tornado, Excessive Heat, Flash Flood, Heat, and Lightning. It also identifies the top five weather event types contributing to injuries in the U.S. as Tornado, TSTM Wind, Flood, Excessive Heat, and Lightning.

The top 10 weather event types contributing to economic damage in the U.S., either through property or crop damage, are Flood, Hurricane, Tornado, Storm Surge, Hail, Flash Flood, Drought, River Flood, Ice Storm, and Tropical Storm.

Data Processing

Data was obtained from NOAA as a bzip2-compressed csv file (repdata_data_StormData.csv.bz2), placed in the R working directory. The csv file (repdata_data_StormData.csv) was extracted from it into the same directory.
* Values of EVTYPE (event type) were trimmed of leading and trailing whitespace and converted to upper case for uniformity.
* Two values of EVTYPE (“HURRICANE/TYPHOON” and “HURRICANE”), each pertaining to rows with significant data, were merged into one value, i.e. “HURRICANE”.

# Unpack the compressed download beside it: the .bz2 archive is kept and any
# previously extracted csv is overwritten.
bunzip2("repdata_data_StormData.csv.bz2", "repdata_data_StormData.csv",
        overwrite = TRUE, remove = FALSE)

# Load the raw table, then normalise EVTYPE by stripping surrounding
# whitespace and upper-casing, so spelling variants collapse into one value.
sd = read.csv("repdata_data_StormData.csv", header = TRUE, sep = ",")
sd$EVTYPE = toupper(trimws(sd$EVTYPE))

# Fold the HURRICANE/TYPHOON label into HURRICANE so both are totalled together.
sd$EVTYPE = replace(sd$EVTYPE, sd$EVTYPE == "HURRICANE/TYPHOON", "HURRICANE")

Data Analysis - Fatalities and Injuries

# Total fatalities per event type.
fat_by_ev = aggregate(FATALITIES ~ EVTYPE, data = sd, FUN = sum)

# Keep the five largest totals, sorted from highest to lowest. EVTYPE is then
# turned into a factor whose level order follows that sort, so plots keep
# the ranking instead of falling back to alphabetical order.
fat_ev5 = top_n(fat_by_ev, 5, FATALITIES) %>%
  arrange(desc(FATALITIES)) %>%
  mutate(EVTYPE = factor(EVTYPE, levels = EVTYPE))

# Total injuries per event type.
inj_by_ev = aggregate(INJURIES ~ EVTYPE, data = sd, FUN = sum)

# Keep the five largest totals, sorted from highest to lowest. EVTYPE is then
# turned into a factor whose level order follows that sort, so plots keep
# the ranking instead of falling back to alphabetical order.
inj_ev5 = top_n(inj_by_ev, 5, INJURIES) %>%
  arrange(desc(INJURIES)) %>%
  mutate(EVTYPE = factor(EVTYPE, levels = EVTYPE))

Results - Fatalities

# Column chart of the top fatality totals, one colour per event type.
# The legend is hidden because the x axis already names each event type.
ggplot(data = fat_ev5,
       mapping = aes(x = EVTYPE, y = FATALITIES, fill = EVTYPE)) +
  geom_col() +
  labs(
    x = "Event Type",
    y = "Total Fatalities",
    title = "Top 5 Event Types Harmful to Population Health - Fatalities.",
    caption = "source: NOAA Storm Data Database"
  ) +
  theme_bw() +
  theme(legend.position = "none")

Results - Injuries

# Column chart of the top injury totals, one colour per event type.
# The legend is hidden because the x axis already names each event type.
ggplot(data = inj_ev5,
       mapping = aes(x = EVTYPE, y = INJURIES, fill = EVTYPE)) +
  geom_col() +
  labs(
    x = "Event Type",
    y = "Total Injuries",
    title = "Top 5 Event Types Harmful to Population Health - Injuries.",
    caption = "source: NOAA Storm Data Database"
  ) +
  theme_bw() +
  theme(legend.position = "none")

The analysis shows that tornado is by far the greatest cause of fatality and injury in the U.S. Variations of heat, flood, wind, and lightning are the next greatest contributors after tornado.

Data Analysis - Economic Damage

calc_multiplier = function(dmgexp){
  # Translates the text codes found in the ___DMGEXP fields of Storm Data into
  # the numeric factor the matching ___DMG amount is multiplied by.
  #
  # Args:
  #   dmgexp: vector of exponent codes (character or factor).
  #
  # Returns:
  #   Unnamed numeric vector with one multiplier per element of dmgexp.
  #   Letter codes are matched case-insensitively (K = 1e3, M = 1e6, B = 1e9),
  #   a digit d from 1 to 9 gives 10^d, and every other value - including "",
  #   "0", NA and symbols - gives 1.
  multipliers = c(
    "1" = 1e1, "2" = 1e2, "3" = 1e3, "K" = 1e3, "4" = 1e4, "5" = 1e5,
    "6" = 1e6, "M" = 1e6, "7" = 1e7, "8" = 1e8, "9" = 1e9, "B" = 1e9
  )
  # Upper-casing before the lookup makes "b" count as billions, the same way
  # "k" and "m" are already accepted alongside "K" and "M".
  hit = match(toupper(as.character(dmgexp)), names(multipliers))
  multiplier = unname(multipliers[hit])
  # Codes absent from the table leave the raw damage figure unscaled.
  multiplier[is.na(hit)] = 1
  return(multiplier)
}
# Keep only the events that report some property or crop damage, along with
# the columns needed to put a dollar figure on that damage.
propcrop = sd[sd$PROPDMG + sd$CROPDMG != 0,
              c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

# Dollar value = reported amount scaled by the multiplier that
# calc_multiplier() derives from the matching ___DMGEXP column.
propcrop$propdmg_value = propcrop$PROPDMG * calc_multiplier(propcrop$PROPDMGEXP)
propcrop$cropdmg_value = propcrop$CROPDMG * calc_multiplier(propcrop$CROPDMGEXP)

# Sum both damage values per event type.
propcrop_totals = aggregate(cbind(propdmg_value, cropdmg_value) ~ EVTYPE,
                            data = propcrop, FUN = sum)

# Keep the 10 event types with the largest combined damage. The rows are not
# sorted here; ordering is left to the plot.
propcrop_totals = top_n(propcrop_totals, 10, propdmg_value + cropdmg_value)
propcrop_totals$EVTYPE = factor(propcrop_totals$EVTYPE,
                                levels = propcrop_totals$EVTYPE)

# Melt for stacked col chart
suppressMessages(library(reshape2))
## Warning: package 'reshape2' was built under R version 4.4.2
# Give the two damage columns presentable names first: after melting, the
# column names become the Damage_Type factor levels shown in the chart legend.
names(propcrop_totals) = c("EVTYPE", "Property_Damage", "Crop_Damage")

# Long format: one row per (event type, damage type) pair, as needed for a
# stacked column chart.
pct_melt = melt(propcrop_totals,
                id.vars = "EVTYPE",
                variable.name = "Damage_Type",
                value.name = "dollar_value")

Results - Economic Damage

# Stacked column chart of property and crop damage per event type, in billions
# of dollars. reorder() on the negated value orders the bars from the largest
# damage to the smallest.
ggplot(data = pct_melt,
       mapping = aes(x = reorder(EVTYPE, -(dollar_value)),
                     y = dollar_value / 1e9,
                     fill = Damage_Type)) +
  geom_col() +
  labs(
    x = "Event Type",
    y = "Damage (billion $)",
    title = "Top 10 Event Types Causing Economic Damage.",
    caption = "source: NOAA Storm Data Database"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),  # vertical labels keep long names legible
    legend.position = "right"
  )

The analysis shows that Flood has the greatest impact on U.S. economic interests and that most of this damage is to property. Hurricane, tornado, and storm surge - related but distinct weather event types - also have a large impact. In the case of drought, river flood, and ice storm, crops suffer as much damage as property, or more.