Synopsis

Using data from the National Weather Service, we attempt to determine which types of weather events are most harmful to population health (counted as the number of deaths and injuries) and which types of events have the greatest economic consequences (in then-current dollars, i.e. valued at the time of the incident) across the United States. From these data, we found that tornadoes were the leading cause of death between 1950 and 2011, and that flooding was the leading cause of economic loss.

Data Processing

library (dplyr)
library (ggplot2)
library (R.utils)
library (lemon)
# Use lemon's printer as the knit_print method for data frames.
# NOTE(review): presumably this makes knitr render auto-printed data frames as
# formatted tables -- confirm against the lemon package documentation.
knit_print.data.frame <- lemon_print

From the Coursera “Reproducible Research, Week 4” website, we obtained the URL of a National Weather Service “Storm Data” data set for events between 1950 and November, 2011. Two files of documentation were also obtained via URLs provided, and are shown below.

# Create the local cache directory on first run.
if (!dir.exists("Data")) {
  dir.create("Data")
}

# Download each file only when it is not already cached locally.
# mode = "wb" forces a binary transfer: on Windows the default text mode
# rewrites line endings, which corrupts binary files such as PDFs.
if (!file.exists("Data/StormData.bz2")) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    "Data/StormData.bz2",
    mode = "wb"
  )
}
if (!file.exists("Data/StormDataDocs.pdf")) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf",
    "Data/StormDataDocs.pdf",
    mode = "wb"
  )
}
if (!file.exists("Data/StormDataFaq.pdf")) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf",
    "Data/StormDataFaq.pdf",
    mode = "wb"
  )
}

# Decompress once, keeping the archive (remove = FALSE) so it is not
# downloaded again on the next run, then load the CSV into memory.
if (!file.exists("Data/StormData.csv")) {
  bunzip2("Data/StormData.bz2", "Data/StormData.csv", remove = FALSE)
}
rawStormData <- read.csv("Data/StormData.csv")

We check that the read worked correctly, expecting 902297 rows of 37 variables, and inspect the first and last few rows.

# Confirm the number of rows and columns that were read.
dim(rawStormData)
## [1] 902297     37
# Preview the first five rows, restricted to columns 2-3 and 5-8
# (begin date/time, county, county name, state and event type).
head(rawStormData[, c(2:3, 5:8)], n = 5L)
##             BGN_DATE BGN_TIME COUNTY COUNTYNAME STATE  EVTYPE
## 1  4/18/1950 0:00:00     0130     97     MOBILE    AL TORNADO
## 2  4/18/1950 0:00:00     0145      3    BALDWIN    AL TORNADO
## 3  2/20/1951 0:00:00     1600     57    FAYETTE    AL TORNADO
## 4   6/8/1951 0:00:00     0900     89    MADISON    AL TORNADO
## 5 11/15/1951 0:00:00     1500     43    CULLMAN    AL TORNADO
# Same columns for the last five rows.
tail(rawStormData[, c(2:3, 5:8)], n = 5L)
##                  BGN_DATE    BGN_TIME COUNTY   COUNTYNAME STATE     EVTYPE
## 902293 11/30/2011 0:00:00 10:30:00 PM      7 WYZ007 - 017    WY  HIGH WIND
## 902294 11/10/2011 0:00:00 02:48:00 PM      9 MTZ009 - 010    MT  HIGH WIND
## 902295  11/8/2011 0:00:00 02:58:00 PM    213       AKZ213    AK  HIGH WIND
## 902296  11/9/2011 0:00:00 10:21:00 AM    202       AKZ202    AK   BLIZZARD
## 902297 11/28/2011 0:00:00 08:00:00 PM      6       ALZ006    AL HEAVY SNOW

We select only a few columns from the dataset, keeping only those rows that record fatalities, injuries, property damage, or crop damage. We add two new numeric “multiplier” columns, both initialized to 1, to make the damage estimates easier to calculate.

# Build the working data set: keep the event type plus the health and damage
# columns, drop events that caused no fatalities, injuries or damage, and add
# both damage-multiplier columns with an initial value of 1.
fsd <- rawStormData %>%
  select(
    EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP
  ) %>%
  filter(FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0) %>%
  mutate(
    propertyDamageMultiplier = 1,
    cropDamageMultiplier = 1
  )

First for property damage, and then for crop damage, we calculate numerical multipliers based on the coded columns ending in “EXP”.

For property damage, PROPDMGEXP codings of (blank), “-”, “+”, and “0” result in default multipliers of “1.”

# Lower-case the exponent codes so upper- and lower-case variants compare equal.
fsd <- mutate(fsd, PROPDMGEXP = tolower(PROPDMGEXP))
sort(unique(fsd$PROPDMGEXP))
##  [1] ""  "-" "+" "0" "2" "3" "4" "5" "6" "7" "b" "h" "k" "m"
# Map each exponent code to its power-of-ten multiplier. Codes that are not
# listed fall through to the final branch and keep their current multiplier.
# The dollar amount is then the reported figure times that multiplier.
fsd <- fsd %>%
  mutate(
    propertyDamageMultiplier = case_when(
      PROPDMGEXP %in% c("h", "2") ~ 1e+2,
      PROPDMGEXP %in% c("k", "3") ~ 1e+3,
      PROPDMGEXP == "4" ~ 1e+4,
      PROPDMGEXP == "5" ~ 1e+5,
      PROPDMGEXP %in% c("m", "6") ~ 1e+6,
      PROPDMGEXP == "7" ~ 1e+7,
      PROPDMGEXP == "8" ~ 1e+8,
      PROPDMGEXP %in% c("b", "9") ~ 1e+9,
      TRUE ~ propertyDamageMultiplier
    ),
    propertyDamage = PROPDMG * propertyDamageMultiplier
  )

For crop damage, CROPDMGEXP codings of (blank), “?”, and “0” result in default multipliers of “1.”

# Lower-case the crop exponent codes before comparing them.
fsd <- mutate(fsd, CROPDMGEXP = tolower(CROPDMGEXP))
sort(unique(fsd$CROPDMGEXP))
## [1] ""  "?" "0" "b" "k" "m"
# "k", "m" and "b" scale the reported figure by 1e+3, 1e+6 and 1e+9; every
# other code keeps its current multiplier. The dollar amount is the reported
# figure times that multiplier.
fsd <- fsd %>%
  mutate(
    cropDamageMultiplier = case_when(
      CROPDMGEXP == "k" ~ 1e+3,
      CROPDMGEXP == "m" ~ 1e+6,
      CROPDMGEXP == "b" ~ 1e+9,
      TRUE ~ cropDamageMultiplier
    ),
    cropDamage = CROPDMG * cropDamageMultiplier
  )

We clean up the “EVTYPE” column by adding a new column, “eventType”, and filling it in with standardized values, rather than the somewhat random spellings, additions, and abbreviations in the “EVTYPE” column.

# Valid current event types were obtained from https://www.ncdc.noaa.gov/stormevents/pd01016005curr.pdf on June 4, 2020
# The final entry, "Unknown", is the fallback label for events matching no pattern.
validEventTypes <- factor (c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood", "Cold/Wind Chill", "Debris Flow", "Dense Fog", "Dense Smoke", "Drought", "Dust Devil", 
        "Dust Storm", "Excessive Heat", "Extreme Cold/Wind Chill", "Flash Flood", "Flood", "Frost/Freeze", "Funnel Cloud", "Freezing Fog", "Hail", "Heat", 
        "Heavy Rain", "Heavy Snow", "High Surf", "High Wind", "Hurricane (Typhoon)", "Ice Storm", "Lake-Effect Snow", "Lakeshore Flood", "Lightning", "Marine Dense Fog", 
        "Marine Hail", "Marine Heavy Freezing Spray", "Marine High Wind", "Marine Hurricane/Typhoon", "Marine Lightning", "Marine Strong Wind", "Marine Thunderstorm Wind", "Marine Tropical Depression", "Marine Tropical Storm", "Rip Current", 
        "Seiche", "Sleet", "Sneaker Wave", "Storm Surge/Tide", "Strong Wind", "Thunderstorm Wind", "Tornado", "Tropical Depression", "Tropical Storm", "Tsunami", 
        "Volcanic Ash", "Waterspout", "Wildfire", "Winter Storm", "Winter Weather", "Unknown"))
# Regular expressions paired by position with validEventTypes: pattern i
# selects the rows that are labelled validEventTypes[i].
mySearchStrings <- c("astron", "avalan", "bliz", "coastal flood", "cold|wind chill|low temp", "debris", "se fog", "smoke", "drought", "devil",
        "dust", "heat", "extreme cold", "flash", "flood|high water| fld", "frost|freeze", "funnel", "fog", "hail", "heat", 
        "rain", "snow", "surf", "wind", "icane|typhoo", "ice", "effect", "lakeshore", "light", "marine.*fog", 
        "marine.*hail", "spray", "marine.*wind", "marine.*icane|marine.*phoon", "marine.*lightning", "marine.*wind", "marine.*thunder", "marine.*tropical dep", "marine.*tropical storm", "current", 
        "seiche", "sleet", "sneaker", "surge", "ong.*wind", "tstm|thund", "torn|nado", "tropical.*dep", "tropical.*storm", "tsunami", 
        "volcanic", "waterspout", "fire", "winter.*sto", "winter.*weath", "Unknown")

# The two vectors are matched purely by index, so a missing or extra entry in
# either one would silently shift every label; fail fast instead.
stopifnot(length(mySearchStrings) == length(validEventTypes))

# Patterns are lower case, so match against a lower-cased copy of EVTYPE.
evTypes <- tolower(as.character(fsd$EVTYPE))

# Start every row as "Unknown", then overwrite the rows each pattern matches.
unknownEvent <- validEventTypes[length(validEventTypes)]
fsd <- mutate(fsd, eventType = unknownEvent)

# NOTE(review): patterns are applied in order and a later match overwrites an
# earlier one. For example "flash flood" is first labelled "Flash Flood" and
# then relabelled "Flood" by the next pattern, and the second "marine.*wind"
# entry always overrides the first, so no row is ever left as
# "Marine High Wind". The order is left untouched here because changing it
# would change the totals reported in the Results section; confirm whether
# this precedence is intended.
for (i in seq_along(mySearchStrings)) {
  matches <- grep(mySearchStrings[i], evTypes)
  fsd$eventType[matches] <- validEventTypes[i]
}

# Total the health and damage figures per standardized event type.
byEvent <- group_by(fsd, eventType)
summary <- summarize(
  byEvent,
  Fatalities = sum(FATALITIES),
  Injuries = sum(INJURIES),
  PropertyDamage = sum(propertyDamage),
  CropDamage = sum(cropDamage),
  TotalDamage = PropertyDamage + CropDamage
)

Results

With the data cleaned up, we select the top storm events in terms of Fatalities, Injuries, and Total Economic Loss.

# Rank the per-event totals three ways, keeping the five largest of each.
topFatalities <- summary %>% arrange(desc(Fatalities)) %>% head(5)
topInjuries <- summary %>% arrange(desc(Injuries)) %>% head(5)
topTotalDamage <- summary %>% arrange(desc(TotalDamage)) %>% head(5)

# Event types that appear in any of the three top-five lists, without
# duplicates, and the summary rows belonging to them.
topTroubleEvents <- unique(unlist(lapply(
  list(topFatalities, topInjuries, topTotalDamage),
  function(top) as.character(top$eventType)
)))
topTrouble <- summary[as.character(summary$eventType) %in% topTroubleEvents, ]

# Side-by-side table of the three rankings; total damage is rounded to whole
# billions of dollars.
summaryData <- setNames(
  data.frame(
    topFatalities$eventType, topFatalities$Fatalities,
    topInjuries$eventType, topInjuries$Injuries,
    topTotalDamage$eventType, round(topTotalDamage$TotalDamage / 1e+9)
  ),
  c("EventType: ", "Fatalities", "EventType: ", "Injuries", "EventType:", "USD Damages (Blns)")
)

We can see that the most fatalities and injuries were caused by tornado events, and that flooding had the greatest economic consequences.

# Auto-print the top-five table (no explicit print() call).
# NOTE(review): presumably rendered through the knit_print.data.frame method
# assigned near the top of the file -- confirm before wrapping this in print().
summaryData
Top 5 ranked storm events in each category: Fatalities, Injuries, and USD Damages, 1950 - 2011
EventType: Fatalities EventType: Injuries EventType: USD Damages (Blns)
Tornado 5658 Tornado 91364 Flood 181
Heat 3138 Thunderstorm Wind 9545 Hurricane (Typhoon) 91
Flood 1555 Heat 9224 Tornado 59
Lightning 818 Flood 8681 Storm Surge/Tide 48
Thunderstorm Wind 731 Lightning 5233 Hail 19

Visualized all together:

# Bubble chart of the top event types: fatalities on the x axis, total damage
# in billions of dollars on the y axis, bubble area scaled by injuries.
ggplot(
  data = topTrouble,
  mapping = aes(x = Fatalities, y = TotalDamage / 1e+9, size = Injuries)
) +
  geom_point(colour = "red", alpha = 1) +
  # Label each bubble, nudged up and to the right of its centre.
  geom_text(
    mapping = aes(label = eventType),
    position = position_nudge(x = 12, y = 12),
    size = 4
  ) +
  scale_size(range = c(1, 24)) +
  # Extra room on the left so the bubbles nearest zero are not clipped.
  scale_x_continuous(limits = c(-800, 6000)) +
  labs(y = "Total Damage (Billions of USD)") +
  ggtitle("Fatalities, Injuries, and USD Damage of Storm Events in the US, 1950 - 2011") +
  theme(legend.position = "right")