Impacts of Extreme Weather Events in the United States

Synopsis

This analysis makes use of data from weather events that occurred in the United States between 1950 and 2011, publicly available from the U.S. National Oceanic and Atmospheric Administration (NOAA). The goal of this work is to identify the types of events that 1) have the most severe human impacts, and 2) have the highest economic cost. We first clean the data and categorize all events by type. We then chart the total number of human fatalities, human injuries, and total economic costs (estimated as the value of property damage and crop damage) due to each type of event over the whole period. We also plot the average economic costs of damages per event. Results suggest that over the 1950-2011 period, tornadoes and floods had the highest human costs, while storms/winds had the highest economic costs. However, floods had the highest average cost per event.

DATA PROCESSING

Loading data into R

This section loads the data and processes it for analysis. In particular, the following steps are taken:

Load the relevant R packages for this analysis
Download the data from the relevant url
Read the data into R

# Install whichever of the required packages is not available yet
list.of.packages <- c("plyr", "dplyr")
new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
if (length(new.packages) > 0) {
  install.packages(new.packages)
}

# Download the compressed data file once (skipped when a local copy already
# exists), then read the local copy
dataUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
localFile <- "MyStormData.csv.bz2"
if (!file.exists(localFile)) {
  download.file(dataUrl, destfile = localFile)
}
rawdata <- read.csv(localFile)

Cleaning the data

This section cleans the data in preparation for the analysis. The following steps are taken:

Missing values are removed
Economic costs are converted to millions of USD and summed (crop damage + property damage)
Categories of weather events are consolidated (for example, events described as “Floods”, “Flood”, “Fld”, “Rain/Flood”, are all merged into the same category)

library(dplyr)
rd <- rawdata

# Convert crop and property damages to millions USD.
# CROPDMGEXP / PROPDMGEXP carry the unit of the matching damage figure:
# "K" (thousands), "M" (millions) or "B" (billions). Upper-case BOTH columns so
# that lower-case codes such as "k" or "m" are recognised for property damage
# as well as for crop damage.
rd$CROPDMGEXP <- toupper(rd$CROPDMGEXP)
rd$PROPDMGEXP <- toupper(rd$PROPDMGEXP)

# Rows whose unit code is anything other than K/M/B, in either column, are
# treated as missing and removed.
# NOTE(review): this also removes events that report fatalities or injuries
# but have no recognised unit code in one of the two columns -- confirm that
# this is the intended definition of "missing".
valid_units <- c("K", "M", "B")
rd <- filter(rd, PROPDMGEXP %in% valid_units, CROPDMGEXP %in% valid_units)

# Express a damage figure in millions of USD given its unit code.
#   amount: numeric vector of damage figures
#   unit:   vector of unit codes ("K", "M" or "B"), same length as amount
# Returns a numeric vector the same length as amount.
to_millions <- function(amount, unit) {
  scaled <- amount # figures coded "M" are already in millions
  in_thousands <- unit == "K"
  in_billions <- unit == "B"
  scaled[in_thousands] <- amount[in_thousands] / 1000
  scaled[in_billions] <- amount[in_billions] * 1000
  scaled
}

rd4 <- rd
rd4$CROPDMGM <- to_millions(rd4$CROPDMG, rd4$CROPDMGEXP)
rd4$PROPDMGM <- to_millions(rd4$PROPDMG, rd4$PROPDMGEXP)

# Sum into a single variable for total damages (TOTDMGM)
rd4$TOTDMGM <- rd4$CROPDMGM + rd4$PROPDMGM

# Keep only observations with non-zero values for at least one of our criteria of interest
rd5 <- filter(rd4, FATALITIES != 0 | INJURIES != 0 | TOTDMGM != 0)

## Re-categorize the EVTYPE variable
#===============================================
# Collapse free-text event labels into a small set of consolidated categories.
#   evtype: vector of raw event-type labels
# Returns a character vector of the same length holding the consolidated label.
# The steps are order-dependent: each one works on the output of the previous.
recategorize_evtype <- function(evtype) {
  out <- toupper(evtype)

  # Literal substring rewrites, applied in the order listed.
  rewrites <- list(
    # Get rid of plurals and expand abbreviations
    c("FLOODS", "FLOOD"),
    c("FLD", "FLOOD"),
    c("WINDSS", "WIND"),
    c("WINDS", "WIND"),
    c("STORMS", "STORM"),
    c("FIRES", "FIRE"),
    c("RAINS", "RAIN"),
    # "TSTM" is the storm database's abbreviation for THUNDERSTORM (as in
    # "TSTM WIND"); it must not be expanded to "TROPICAL STORM".
    c("TSTM", "THUNDERSTORM"),
    # Get rid of -ing suffix
    c("FLOODING", "FLOOD"),
    # Get rid of adjectives and other words that carry no category information
    c("SEVERE", ""),
    c("DAMAGE", ""),
    c("STRONG", "")
  )
  for (rw in rewrites) {
    out <- gsub(rw[1], rw[2], out, fixed = TRUE)
  }
  # Removing words above can leave leading/trailing blanks
  out <- trimws(out)

  # Correct a typo
  out <- gsub("THUDERSTORM", "THUNDERSTORM", out, fixed = TRUE)

  # Remove the "first names" of hurricanes and tropical storms (e.g. "Katrina")
  out[grepl("HURRICANE", out, fixed = TRUE)] <- "HURRICANE"
  out[grepl("TROPICAL STORM", out, fixed = TRUE)] <- "TROPICAL STORM"

  # Merge names that are essentially the same: each entry maps a set of exact
  # labels (second element) onto one target label (first element).
  merges <- list(
    list("HIGH SURF", c("HEAVY SURF/HIGH SURF", "HEAVY RAIN/HIGH SURF")),
    list("WILD/FOREST FIRE", c("WILDFIRE", "FOREST FIRE")),
    list("TORNADO", c("TORNADO F0", "COLD AIR TORNADO", "FUNNEL CLOUD", "WATERSPOUT")),
    list("HIGH WIND", c("GUSTNADO", "GUSTY WIND")),
    list("HAIL", c("HAIL 100", "THUNDERSTORM HAIL", "HAIL/WIND", "ICE STORM")),
    list("DUST STORM", c("DUST STORM/HIGH WIND")),
    list("FOG", c("DENSE FOG")),
    list("COLD/FROST/FREEZE", c("FROST", "FREEZE", "GLAZE ICE", "ICY ROADS")),
    list("COLD/FROST/FREEZE", c("COLD/WIND CHILL", "EXTREME COLD", "EXTREME COLD/WIND CHILL", "WINTER WEATHER")),
    list("SURGE/TIDE", c("STORM SURGE")),
    list("LIGHTNING", c("THUNDERSTORM WIND LIGHTNING"))
  )
  for (m in merges) {
    out[out %in% m[[2]]] <- m[[1]]
  }

  # Finally, consolidate into merged categories (where possible): any label
  # containing the keyword becomes the keyword itself. Earlier keywords win
  # when a label contains several of them.
  for (keyword in c("FLOOD", "HAIL", "HEAT", "FOG", "SNOW")) {
    out[grepl(keyword, out, fixed = TRUE)] <- keyword
  }
  # Thunderstorm and wind events share one category
  out[grepl("THUNDERSTORM", out, fixed = TRUE)] <- "Strm/Wind"
  out[grepl("WIND", out, fixed = TRUE)] <- "Strm/Wind"

  out
}

rd5$EVTYPE_new <- recategorize_evtype(rd5$EVTYPE)


# Show category names in sentence case (lower case with a capital initial)
# so they are easier to read
rd6 <- rd5
rd6$EVTYPE_new <- tolower(rd6$EVTYPE_new)
substr(rd6$EVTYPE_new, 1, 1) <- toupper(substr(rd6$EVTYPE_new, 1, 1))

# The final dataset keeps only the columns used in the analysis
cleandata <- rd6 %>%
  select(
    STATE__, STATE,
    EVTYPE, EVTYPE_new,
    FATALITIES, INJURIES,
    PROPDMGM, CROPDMGM, TOTDMGM
  )

# Drop the intermediate data frames
rm(rd, rd4, rd5, rd6)

DATA ANALYSIS

This section takes the clean data and generates the variables of interest:

Sum of all fatalities, by event type
Sum of all injuries, by event type
Total economic cost in millions, by event type
Average economic cost of an event in millions, by event type

library(dplyr)

# One row per consolidated event type: number of events, total fatalities,
# total injuries, and mean / total of the combined damages (TOTDMGM).
by_evtype <- cleandata %>% group_by(EVTYPE_new)
evt_sum <- by_evtype %>%
  summarize(
    count = n(),
    nFatal = sum(FATALITIES),
    nInjur = sum(INJURIES),
    avgPropDmg = mean(TOTDMGM),
    sumPropDmg = sum(TOTDMGM)
  )

# First five rows of an already-ranked table
first_five <- function(ranked) {
  ranked[1:5, ]
}

# Top five event types under each criterion, largest first
mostFatal5 <- evt_sum %>% arrange(desc(nFatal)) %>% first_five()
mostInjur5 <- evt_sum %>% arrange(desc(nInjur)) %>% first_five()
mostMeanDam5 <- evt_sum %>% arrange(desc(avgPropDmg)) %>% first_five()
mostTotDam5 <- evt_sum %>% arrange(desc(sumPropDmg)) %>% first_five()

RESULTS

Human costs

Figure 1 plots the five top-ranking event types for most fatalities and most injuries over the whole 1950-2011 period. Tornadoes rank at the top in both categories, with over 1000 deaths and nearly 12000 injuries sustained. Next-highest are floods, followed by heat and storms/wind.

# Figure 1: two bar charts side by side (fatalities, injuries).
# oma reserves two lines of outer margin at the top; without it the outer
# margin has zero size and the shared title written by mtext(outer = TRUE)
# has no room to be drawn.
par(mfrow = c(1, 2), mar = c(2, 2, 1, 0), oma = c(0, 0, 2, 0))
barplot(mostFatal5$nFatal, names.arg = mostFatal5$EVTYPE_new, cex.names = 0.6,
        main = "Most Fatalities (count)", cex.main = 0.8)
barplot(mostInjur5$nInjur, names.arg = mostInjur5$EVTYPE_new, cex.names = 0.6,
        main = "Most Injuries (count)", cex.main = 0.8)
mtext("Figure 1: Human costs", side = 3, outer = TRUE, cex = 1, col = "grey20")

Economic costs

Figure 2 plots the five top-ranking event types for highest overall economic costs over the 1950-2011 period, and for highest average cost per event. Although storms/winds tallied the highest total economic costs over the period (150 billion dollars), floods generated the highest cost per event (400 million dollars on average).

# Figure 2: two horizontal bar charts side by side (total damages, mean
# damages per event). Horizontal bars are drawn from the bottom up, so the
# values are reversed to put the largest bar at the top. The labels must be
# reversed together with the values, otherwise each bar is shown with another
# event type's name.
# oma reserves room at the top for the shared figure title.
par(mfrow = c(1, 2), mar = c(2, 2, 1, 0), oma = c(0, 0, 2, 0))
barplot(rev(mostTotDam5$sumPropDmg), names.arg = rev(mostTotDam5$EVTYPE_new),
        cex.names = 0.6, main = "Highest Total Damages (millions USD)",
        cex.main = 0.8, horiz = TRUE)
barplot(rev(mostMeanDam5$avgPropDmg), names.arg = rev(mostMeanDam5$EVTYPE_new),
        cex.names = 0.6, main = "Highest Mean Damages per Event",
        cex.main = 0.8, horiz = TRUE)
mtext("Figure 2: Economic costs", side = 3, outer = TRUE, cex = 1, col = "grey20")

CONCLUSIONS