Storm Data Reveals Events that Cause the Most Damage and the Most Deaths

Synopsis

This report examines the storm data, which consists of storm events (for example, tornadoes, thunderstorms, or excessive heat) and the number of fatalities, injuries, property damage, and crop damage caused by these events. The analysis explores which type(s) of storm events caused the most harm, in terms of death and injury, and in terms of economic damage.

Data Processing

The dataset, repdata_data_StormData.csv, can be downloaded as a compressed file from the Reproducible Research course website. I assume that this compressed file is already on the local computer hard drive for the purposes of data processing.

  • Only events that had fatalities, injuries, property damage, or crop damage were looked at for the purposes of this evaluation.
  • Spelling for the event types (variable EVTYPE) was normalized for the purposes of this analysis.
  • Two new variables (fields) were added to contain the values of the property damage and crop damage fields. These were created by multiplying the base damage (PROPDMG or CROPDMG) by the value implied by the exponent field (PROPDMGEXP or CROPDMGEXP). If the exponent value was K, M, or B, the base damage was multiplied by one thousand, one million, or one billion, respectively. If the exponent field held none of these, just the base damages were used; fewer than 1000 records had unknown values in the exponent field, out of more than 200,000 events.
## load the raw storm records straight from the compressed CSV file
storm_data_backup <- read.csv(file = "repdata_data_StormData.csv.bz2",
                              header = TRUE)

## keep only the records that report some harm: a fatality, an injury,
## or a non-zero property or crop damage figure
storm_data <- storm_data_backup[
  which(with(storm_data_backup,
             FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)),
]

## clean up EVTYPE so comparisons can happen
library(stringr)

## Normalize raw event-type labels so that spelling variants of the same
## event collapse into a single label.
##
## evtype: character vector (or factor) of raw EVTYPE values.
## Returns a character vector of the same length: upper-cased, trimmed, and
## rewritten by the ordered substitution rules below.
##
## Each rule is a (regular expression, replacement) pair. The rules run top to
## bottom and later rules rely on the output of earlier ones, so the order
## matters.
normalize_evtype <- function(evtype) {
  rules <- matrix(c(
    ## whitespace and separators; collapse runs of any length, not just pairs
    " {2,}", " ",
    "-", " ",
    "AVALANCE", "AVALANCHE",
    ## flood variants
    "FLOODS", "FLOOD",
    "FLOODING", "FLOOD",
    "FLASH FLOOD", "FLOOD/FLASH",
    "FLOOD FLASH", "FLOOD/FLASH",
    "/$", "",
    "/ ", "/",
    "FLOOD/FLASH/FLOOD", "FLOOD/FLASH",
    "FLOOD/FLASH LANDSLIDES", "FLOOD/FLASH/LANDSLIDE",
    "FLOOD/FLASHFLOOD", "FLOOD/FLASH",
    "FLOOD/FLOOD/FLASH", "FLOOD/FLASH",
    ## the backslash must be escaped for the regex engine as well as for the
    ## R string, otherwise the pattern never matches a literal "\"
    "FROST\\\\FREEZE", "FROST/FREEZE",
    ## plurals
    "WINDS", "WIND",
    "WAVES", "WAVE",
    "SLIDES", "SLIDE",
    "RAINS", "RAIN",
    "TREES", "TREE",
    ## thunderstorm abbreviations and misspellings
    "TSTM", "THUNDERSTORM",
    "THUDERSTORM", "THUNDERSTORM",
    "THUNDEERSTORM", "THUNDERSTORM",
    "THUNDERESTORM", "THUNDERSTORM",
    "THUNDERSNOW", "THUNDERSTORM",
    "THUNDERSTROM", "THUNDERSTORM",
    "THUNDERTORM", "THUNDERSTORM",
    "THUNERSTORM", "THUNDERSTORM",
    "TUNDERSTORM", "THUNDERSTORM",
    ## split the run-together form BEFORE stripping a stray trailing "W";
    ## in the opposite order "THUNDERSTORMWIND" turns into "THUNDERSTORMIND"
    "THUNDERSTORMWIND", "THUNDERSTORM WIND",
    "THUNDERSTORMW", "THUNDERSTORM",
    "PRECIP$", "PRECIPITATION",
    "CURRENTS", "CURRENT",
    "STORMS", "STORM",
    "ROADS", "ROAD",
    "SQUALLS", "SQUALL",
    "FIRES", "FIRE",
    "MUD SLIDE", "MUDSLIDE",
    "ICE ROAD", "ICY ROAD",
    ## snow and winter variants
    "SNOWFALL", "SNOW",
    "RAINTORM", "RAINSTORM",
    "SNOW AND ICE STORM", "SNOW/ICE",
    "SNOW/ICE STORM", "SNOW/ICE",
    "SNOW AND ICE", "SNOW/ICE",
    "SNOW FREEZING RAIN", "SNOW/FREEZING RAIN",
    "SNOW AND HEAVY SNOW", "SNOW/HEAVY SNOW",
    "SNOMELT", "SNOWMELT",
    "UNSEASONABLE", "UNSEASONABLY",
    "TORNDAO", "TORNADO",
    "WINTER WEATHER MIX", "WINTER WEATHER/MIX",
    "WINTRY MIX", "WINTER WEATHER/MIX"
  ), ncol = 2, byrow = TRUE)

  evtype <- str_trim(toupper(evtype), side = "both")
  for (i in seq_len(nrow(rules))) {
    evtype <- gsub(rules[i, 1], rules[i, 2], evtype)
  }
  evtype
}

storm_data$EVTYPE <- normalize_evtype(storm_data$EVTYPE)

## upper-case both exponent columns so the codes can be compared against
## "K", "M" and "B" regardless of the case they were recorded in
storm_data[c("PROPDMGEXP", "CROPDMGEXP")] <-
  lapply(storm_data[c("PROPDMGEXP", "CROPDMGEXP")], toupper)

## records with a positive property damage figure, reduced to the event type,
## the base amount and its exponent code
property <- storm_data[which(storm_data$PROPDMG > 0),
                       c("EVTYPE", "PROPDMG", "PROPDMGEXP")]

## the same cut for crop damage
crops <- storm_data[which(storm_data$CROPDMG > 0),
                    c("EVTYPE", "CROPDMG", "CROPDMGEXP")]

## create new variables to hold the actual values for property damage and crop damage
## Scale base damage amounts by their exponent codes.
##
## DMG:    numeric vector of base damage amounts.
## DMGEXP: character vector (or factor) of exponent codes, the same length as
##         DMG or length 1. "K" multiplies by one thousand, "M" by one million
##         and "B" by one billion. Any other value, including "" and NA,
##         leaves the base amount unchanged.
##
## Returns a numeric vector of damage amounts. The function is vectorized, so
## it accepts whole columns as well as single values.
mult_damages <- function(DMG, DMGEXP) {
  multipliers <- c(K = 1000, M = 1000000, B = 1000000000)
  ## look each code up by name; codes without an entry come back as NA
  scale <- unname(multipliers[as.character(DMGEXP)])
  scale[is.na(scale)] <- 1
  DMG * scale
}

## use the subset with property damage > 0
len_prop <- nrow(property)
## seq_len() gives an empty index when the subset has no rows, whereas
## 1:len_prop would iterate over c(1, 0) and read rows that do not exist
new_prop_damages <- vapply(
  seq_len(len_prop),
  function(i) mult_damages(property$PROPDMG[i], property$PROPDMGEXP[i]),
  numeric(1)
)
## add new data field to data frame
property$new_damages <- new_prop_damages

## use the subset with crop damage > 0
len_crops <- nrow(crops)
new_crop_damages <- vapply(
  seq_len(len_crops),
  function(i) mult_damages(crops$CROPDMG[i], crops$CROPDMGEXP[i]),
  numeric(1)
)
## add new data field to data frame
crops$new_damages <- new_crop_damages

Analysis - Harmful to Population Health

Across the United States, which types of events are most harmful with respect to population health?

I took a subset of the storm data to look at just the event types that had caused injuries or fatalities. A box plot of these fatalities and of the injuries showed that there were clear outliers at the top of the chart, but that most of the events that caused fatalities and injuries were clustered at the bottom. Since I'm interested in the worst events, I found the events that individually had the worst number of fatalities and the worst number of injuries. I also cut off the data where the boxplot began to show the highest outliers and summed up the total fatalities and injuries for each of these types of events.
## records with at least one fatality, reduced to event type and death count
fatalities <- storm_data[which(storm_data$FATALITIES > 0),
                         c("EVTYPE", "FATALITIES")]

## the record(s) with the highest fatality count
max_deaths <- max(fatalities$FATALITIES)
worst_death <- subset(fatalities, FATALITIES == max_deaths)

## records with more than 75 fatalities, the cutoff chosen from the boxplot
worst_deaths <- fatalities[which(fatalities$FATALITIES > 75), ]
## total fatalities of those records, grouped by event type
sum_fatalities <- with(worst_deaths, tapply(FATALITIES, EVTYPE, sum))

## records with at least one injury, reduced to event type and injury count
injuries <- storm_data[which(storm_data$INJURIES > 0),
                       c("EVTYPE", "INJURIES")]

## the record(s) with the highest injury count
max_injuries <- max(injuries$INJURIES)
worst_injury <- subset(injuries, INJURIES == max_injuries)

## records with more than 500 injuries, the cutoff chosen from the boxplot
worst_injuries <- injuries[which(injuries$INJURIES > 500), ]
## total injuries of those records, grouped by event type
sum_injuries <- with(worst_injuries, tapply(INJURIES, EVTYPE, sum))

Analysis - Greatest Economic Consequences

Across the United States, which types of events have the greatest economic consequences?

I took a subset of the storm data to look at just the event types that had caused property damage or crop damage. A box plot of these damages showed, like with fatalities and injuries, that there were clear outliers at the top of the chart, but that most of the events that caused damages were clustered at the bottom. Since I'm interested in the worst events, I found the events that individually had the highest amount of property damage and crop damage. I also cut off the data where the boxplot began to show the highest outliers and summed up the total damages for each of these types of events.
## reuse the `property` subset built in the Data Processing section

## the record(s) with the largest property damage amount
max_prop_damage <- max(property$new_damages)
worst_prop_damage <- subset(property, new_damages == max_prop_damage)

## records with more than one billion dollars of property damage
worst_prop_damages <- property[which(property$new_damages > 1000000000), ]
## total property damage of those records, grouped by event type
sum_prop_damages <- with(worst_prop_damages,
                         tapply(new_damages, EVTYPE, sum))


## reuse the `crops` subset built in the Data Processing section

## the record(s) with the largest crop damage amount
max_crop_damage <- max(crops$new_damages)
worst_crop_damage <- subset(crops, new_damages == max_crop_damage)

## records with more than 500 million dollars of crop damage
worst_crop_damages <- crops[which(crops$new_damages > 500000000), ]
## total crop damage of those records, grouped by event type
sum_crop_damages <- with(worst_crop_damages,
                         tapply(new_damages, EVTYPE, sum))

Results

# side-by-side boxplots: fatality counts on the left, injury counts on the right
options(scipen = 999) ## print axis labels as plain numbers, not in exponent form
par(mfrow = c(1, 2)) ## one row, two panels
boxplot(x = fatalities[["FATALITIES"]],
        main = "Number of Fatalities per Event Type")
boxplot(x = injuries[["INJURIES"]],
        main = "Number of Injuries per Event Type")

plot of chunk unnamed-chunk-4

Types of Events Most Harmful

Boxplots of Fatalities and Injuries show that most fatalities and injuries are clustered at low numbers. From the analysis above, I found that the event types that caused the most fatalities and the most injuries were:

  • Event type HEAT caused 583 fatalities, the most for any event type.
  • Event type TORNADO caused 1700 injuries, the most for any event type.

The boxplots showed us that although these single events caused the most fatalities and injuries, other event types that occurred on different days may have caused more harm in total than these highest events. Looking at just the events with more than 75 fatalities (based on the boxplot, which showed the higher-numbered events starting there), and at just the events with more than 500 injuries (again, based on the boxplot showing the higher-numbered events starting around there), and summing up the totals per event type, we get:

Major fatalities per event type:

sum_fatalities
## EXCESSIVE HEAT           HEAT        TORNADO 
##             99            583            478

Major injuries per event type:

sum_injuries
##    EXCESSIVE HEAT             FLOOD HURRICANE/TYPHOON         ICE STORM 
##               519              2700               780              1568 
##           TORNADO 
##              9174

You can see that the event types that caused the greatest amount of damage as single events still caused the most damage even when other event types have all of their fatalities and injuries added together. So, HEAT and TORNADO appear to be the most dangerous event types regarding fatalities and injuries.

# side-by-side boxplots: property damage on the left, crop damage on the right
options(scipen = 999) ## print axis labels as plain numbers, not in exponent form
par(mfrow = c(1, 2)) ## one row, two panels
boxplot(x = property[["new_damages"]],
        main = "Property Damages per Event Type")
boxplot(x = crops[["new_damages"]],
        main = "Crop Damages per Event Type")

plot of chunk unnamed-chunk-7

Types of Events Causing Worst Damage

Boxplots of Property Damages and Crop Damages show that most damage amounts are clustered at low values. From the analysis above, I found that the event types that caused the most damages were:

  • Event type FLOOD caused the worst property damage, which was 115000000000 dollars in damages, the most for any event type.
  • Event types RIVER FLOOD and ICE STORM tied as the cause of the worst crop damage, which was 5000000000 dollars in damages.

The boxplots showed us that although these single events caused the most property damage and crop damage, other event types that occurred on different days may have caused more damage in total than these highest events. Looking at just the events with more than $1,000,000,000 in property damage (based on the boxplot, which showed the higher-numbered events starting there), and at just the events with more than $500,000,000 in crop damage (again, based on the boxplot showing the higher-numbered events starting around there), and summing up the totals per event type, we get:

Highest amount of property damage per event type:

sum_prop_damages
##                              FLOOD                               HAIL 
##                       121500000000                         1800000000 
##          HEAVY RAIN/SEVERE WEATHER                          HIGH WIND 
##                         2500000000                         1300000000 
##                          HURRICANE                     HURRICANE OPAL 
##                         4700000000                         2100000000 
##                  HURRICANE/TYPHOON                        RIVER FLOOD 
##                        64500000000                         5000000000 
##                SEVERE THUNDERSTORM                        STORM SURGE 
##                         1200000000                        42560000000 
##                   STORM SURGE/TIDE                            TORNADO 
##                         4000000000                         4300000000 
## TORNADOES, THUNDERSTORM WIND, HAIL                     TROPICAL STORM 
##                         1600000000                         5150000000 
##                   WILD/FOREST FIRE                           WILDFIRE 
##                         1500000000                         1040000000 
##                       WINTER STORM 
##                         5000000000

Highest amount of crop damage per event type:

sum_crop_damages
##           DROUGHT      EXTREME COLD HURRICANE/TYPHOON         ICE STORM 
##        2093850000         596000000        1510000000        5000000000 
##       RIVER FLOOD 
##        5000000000

You can see that the event types that caused the greatest amount of damage as single events still caused the most damage even when other event types have all of their damages added together. So, FLOOD appears to be the event type that causes the most property damage, and RIVER FLOOD and ICE STORM appear to be the event types that cause the most crop damage, though for both types of damages other event types can be close.