Synopsis

This analysis makes use of data from weather events that occurred in the United States between 1950 and 2011, publicly available from the U.S. National Oceanic and Atmospheric Administration (NOAA). The goal of this work is to identify the types of events which 1) have the most severe human impacts, and 2) have the highest economic cost. We first clean the data and categorize all events by type. We then chart the total number of human fatalities, human injuries, and total economic costs (estimated as the value of property damage and crop damage) due to each type of event over the whole period. We also plot the average economic costs of damages per event. Results suggest that over the 1950-2011 period, tornadoes and floods had the highest human costs, while storms/winds had the highest economic costs. However, floods had the highest average cost per event.

DATA PROCESSING

  1. Loading data into R

This section loads the data and processes it for analysis. In particular, the following steps are taken:

# Make sure every package this report relies on is installed; anything that
# is missing from the local library is installed on the fly.
list.of.packages <- c("plyr", "dplyr")
already.installed <- installed.packages()[, "Package"]
new.packages <- list.of.packages[!(list.of.packages %in% already.installed)]
if (length(new.packages)) {
  install.packages(new.packages)
}

# Fetch the compressed storm data once (skipped when a local copy already
# exists) and read it into memory. read.csv() is given the .bz2 file directly.
dataUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
local_archive <- "MyStormData.csv.bz2"
have_local_copy <- file.exists(local_archive)
if (!have_local_copy) {
  download.file(url = dataUrl, destfile = local_archive)
}
rawdata <- read.csv(file = local_archive)
  2. Cleaning the data

This section cleans the data in preparation for the analysis. The following steps are taken:

library(dplyr)
rd <- rawdata

# Normalise the damage-exponent codes to upper case in BOTH columns, so that
# lower-case codes such as "k" or "m" are treated like "K" / "M" for property
# damage as well as for crop damage (previously only CROPDMGEXP was
# normalised, so lower-case PROPDMGEXP rows were silently filtered out).
rd$PROPDMGEXP <- toupper(rd$PROPDMGEXP)
rd$CROPDMGEXP <- toupper(rd$CROPDMGEXP)

# Keep only rows whose property AND crop exponents are one of K, M or B.
# NOTE(review): this drops every event whose exponent is blank or any other
# code in either column, even when the event reports fatalities or injuries.
# Confirm that restricting the human-cost analysis this way is intended.
valid_exp <- c("K", "M", "B")
rd <- filter(rd, PROPDMGEXP %in% valid_exp, CROPDMGEXP %in% valid_exp)

# Convert a damage amount to millions USD given its exponent code:
# "K" amounts are divided by 1000, "B" amounts are multiplied by 1000 and
# everything else (here: "M") is returned unchanged.
to_millions <- function(amount, exponent) {
  exponent <- as.character(exponent)
  in_thousands <- exponent == "K"
  in_billions <- exponent == "B"
  converted <- amount
  converted[in_thousands] <- amount[in_thousands] / 1000
  converted[in_billions] <- amount[in_billions] * 1000
  converted
}

# Convert crop and property damages to millions USD
rd4 <- rd
rd4$CROPDMGM <- to_millions(rd4$CROPDMG, rd4$CROPDMGEXP)
rd4$PROPDMGM <- to_millions(rd4$PROPDMG, rd4$PROPDMGEXP)

# Sum into a single variable for total damages (TOTDMGM)
rd4$TOTDMGM <- rd4$CROPDMGM + rd4$PROPDMGM

# Keep only observations with non-zero values for at least one of our
# criteria of interest
rd5 <- filter(rd4, FATALITIES != 0 | INJURIES != 0 | TOTDMGM != 0)

## Re-categorize the EVTYPE variable
#===============================================

# Map raw event-type labels onto a smaller set of consolidated categories.
#
# evtype: vector of raw event-type labels (character or factor).
# Returns a character vector of the same length. Labels that match none of
# the rules below are returned upper-cased, with the filler words removed
# and surrounding whitespace trimmed.
recategorize_evtype <- function(evtype) {
  evt <- toupper(evtype)

  # Literal substitutions, applied in this order. Order matters: "WINDSS"
  # must be handled before "WINDS".
  # "TSTM" is the abbreviation for THUNDERSTORM; it was previously expanded
  # to "TROPICAL STORM", which filed thunderstorm-wind events under tropical
  # storms in the consolidation step further down.
  substitutions <- c(
    # Get rid of all plurals / abbreviations
    "FLOODS"   = "FLOOD",
    "FLD"      = "FLOOD",
    "WINDSS"   = "WIND",
    "WINDS"    = "WIND",
    "STORMS"   = "STORM",
    "FIRES"    = "FIRE",
    "RAINS"    = "RAIN",
    "TSTM"     = "THUNDERSTORM",
    # Get rid of -ing suffix
    "FLOODING" = "FLOOD",
    # Get rid of "severe" and similar adjectives or filler words
    "SEVERE"   = "",
    "DAMAGE"   = "",
    "STRONG"   = ""
  )
  for (pattern in names(substitutions)) {
    evt <- gsub(pattern, substitutions[[pattern]], evt, fixed = TRUE)
  }
  evt <- trimws(evt)

  # Correct a typo
  evt <- gsub("THUDERSTORM", "THUNDERSTORM", evt, fixed = TRUE)

  # Remove the "first names" of hurricanes and tropical storms
  # (e.g. "Katrina"): any label containing the keyword becomes the keyword.
  evt[grepl("HURRICANE", evt, fixed = TRUE)] <- "HURRICANE"
  evt[grepl("TROPICAL STORM", evt, fixed = TRUE)] <- "TROPICAL STORM"

  # Merge names that are essentially the same (exact matches only).
  # Each list name is the target category; its value lists the labels that
  # are folded into it.
  merges <- list(
    "HIGH SURF"         = c("HEAVY SURF/HIGH SURF", "HEAVY RAIN/HIGH SURF"),
    "WILD/FOREST FIRE"  = c("WILDFIRE", "FOREST FIRE"),
    "TORNADO"           = c("TORNADO F0", "COLD AIR TORNADO", "FUNNEL CLOUD",
                            "WATERSPOUT"),
    "HIGH WIND"         = c("GUSTNADO", "GUSTY WIND"),
    "HAIL"              = c("HAIL 100", "THUNDERSTORM HAIL", "HAIL/WIND",
                            "ICE STORM"),
    "DUST STORM"        = c("DUST STORM/HIGH WIND"),
    "FOG"               = c("DENSE FOG"),
    "COLD/FROST/FREEZE" = c("FROST", "FREEZE", "GLAZE ICE", "ICY ROADS",
                            "COLD/WIND CHILL", "EXTREME COLD",
                            "EXTREME COLD/WIND CHILL", "WINTER WEATHER"),
    "SURGE/TIDE"        = c("STORM SURGE"),
    "LIGHTNING"         = c("THUNDERSTORM WIND LIGHTNING")
  )
  for (target in names(merges)) {
    evt[evt %in% merges[[target]]] <- target
  }

  # Finally, consolidate into merged categories (where possible): any label
  # containing the keyword is replaced by the category. Applied in order, so
  # e.g. a label containing both "HAIL" and "WIND" ends up as "HAIL".
  consolidations <- c(
    "FLOOD"        = "FLOOD",
    "HAIL"         = "HAIL",
    "HEAT"         = "HEAT",
    "FOG"          = "FOG",
    "SNOW"         = "SNOW",
    "THUNDERSTORM" = "Strm/Wind",
    "WIND"         = "Strm/Wind"
  )
  for (keyword in names(consolidations)) {
    evt[grepl(keyword, evt, fixed = TRUE)] <- consolidations[[keyword]]
  }

  evt
}

rd5$EVTYPE_new <- recategorize_evtype(rd5$EVTYPE)

# Make the category labels easier to read: lower-case everything, then
# capitalise just the first character of each label.
rd6 <- mutate(rd5, EVTYPE_new = tolower(EVTYPE_new))
substr(rd6$EVTYPE_new, 1, 1) <- toupper(substr(rd6$EVTYPE_new, 1, 1))

# The final dataset keeps only the identifying columns, the raw and
# re-categorised event types, and the casualty / damage measures.
cleandata <- rd6 %>%
  select(
    STATE__, STATE,
    EVTYPE, EVTYPE_new,
    FATALITIES, INJURIES,
    PROPDMGM, CROPDMGM, TOTDMGM
  )

# Drop the intermediate data frames; only `cleandata` is used from here on.
rm(list = c("rd", "rd4", "rd5", "rd6"))

DATA ANALYSIS

This section takes the clean data and generates the variables of interest:

library(dplyr)

# One summary row per consolidated event type: number of events, total
# fatalities, total injuries, and the mean and sum of TOTDMGM.
# Despite the "PropDmg" in their names, avgPropDmg and sumPropDmg are both
# computed from the TOTDMGM column.
by_evtype <- cleandata %>% group_by(EVTYPE_new)
evt_sum <- by_evtype %>%
  summarize(
    count = n(),
    nFatal = sum(FATALITIES),
    nInjur = sum(INJURIES),
    avgPropDmg = mean(TOTDMGM),
    sumPropDmg = sum(TOTDMGM)
  )

# Sort `tbl` by the given ordering expression(s) and keep rows 1 to 5.
top_five <- function(tbl, ...) {
  ranked <- arrange(tbl, ...)
  ranked[1:5, ]
}

mostFatal5 <- top_five(evt_sum, desc(nFatal))
mostInjur5 <- top_five(evt_sum, desc(nInjur))
mostMeanDam5 <- top_five(evt_sum, desc(avgPropDmg))
mostTotDam5 <- top_five(evt_sum, desc(sumPropDmg))

RESULTS

  1. Human costs

Figure 1 plots the five top-ranking event types for most fatalities and most injuries over the whole 1950-2011 period. Tornadoes rank at the top in both categories, with over 1000 deaths and nearly 12000 injuries sustained. Next-highest are floods, followed by heat and storms/wind.

# Figure 1: two side-by-side bar charts of the five event types with the
# most fatalities (left) and the most injuries (right).
# `oma` reserves two lines of outer margin at the top of the page; without
# it the outer margin has zero height and the title drawn by
# mtext(outer = TRUE) has no room to appear.
fig1_par <- par(mfrow = c(1, 2), mar = c(2, 2, 1, 0), oma = c(0, 0, 2, 0))
barplot(mostFatal5$nFatal, names.arg = mostFatal5$EVTYPE_new, cex.names = 0.6,
        main = "Most Fatalities (count)", cex.main = 0.8)
barplot(mostInjur5$nInjur, names.arg = mostInjur5$EVTYPE_new, cex.names = 0.6,
        main = "Most Injuries (count)", cex.main = 0.8)
mtext("Figure 1: Human costs", side = 3, outer = TRUE, cex = 1, col = "grey20")
# Restore the previous graphical parameters so the outer margin does not
# leak into later figures.
par(fig1_par)

  2. Economic costs

Figure 2 plots the five top-ranking event types for highest overall economic costs over the 1950-2011 period, and the highest average cost per event. Although storms / winds have tallied the highest economic costs in total over the period (150 billion dollars), floods generate the highest cost per event (400 million on average).

# Figure 2: two side-by-side horizontal bar charts of the five event types
# with the highest total damages (left) and the highest mean damages per
# event (right).
# Horizontal bars are drawn from the bottom up, so the values are reversed to
# put the top-ranked type at the top of each panel. The labels must be
# reversed in the same way, otherwise each bar is annotated with the name of
# a different event type.
par(mfrow = c(1, 2), mar = c(2, 2, 1, 0))
barplot(rev(mostTotDam5$sumPropDmg), names.arg = rev(mostTotDam5$EVTYPE_new),
        cex.names = 0.6, main = "Highest Total Damages (millions USD)",
        cex.main = 0.8, horiz = TRUE)
barplot(rev(mostMeanDam5$avgPropDmg), names.arg = rev(mostMeanDam5$EVTYPE_new),
        cex.names = 0.6, main = "Highest Mean Damages per Event",
        cex.main = 0.8, horiz = TRUE)

CONCLUSIONS

While storms and high winds imposed the highest long-term economic damage in terms of property and crop destruction over the 1950-2011 period, the highest human cost in terms of fatalities and injuries was due to tornadoes and floods. These results bear relevance for policymakers looking to reduce the costs of weather-related events in the United States.