This report examines the storm data, which consists of storm events (for example, tornadoes, thunderstorms, or excessive heat) and the number of fatalities and injuries, and the amount of property damage and crop damage, caused by these events. The analysis explores which type(s) of storm events caused the most harm, both in terms of deaths and injuries and in terms of economic damage.
The dataset, repdata_data_StormData.csv, can be downloaded as a compressed file (repdata_data_StormData.csv.bz2) from the Reproducible Research course website. I assume that this compressed file is already in the working directory on the local computer for the purposes of data processing.
## Read the raw storm data directly from the compressed CSV file
storm_data_backup <- read.csv(
  file = "repdata_data_StormData.csv.bz2",
  header = TRUE
)
## Keep only the records that report some harm: at least one fatality or
## injury, or a non-zero property or crop damage figure. which() drops rows
## whose condition is NA, just as subset() does.
storm_data <- storm_data_backup[
  which(with(
    storm_data_backup,
    FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
  )),
]
## Clean up EVTYPE so that spelling variants of the same event type compare
## equal. The values are upper-cased and trimmed, then an ordered table of
## (regex pattern, replacement) pairs is applied with gsub(). Order matters:
## later rules rely on the output of earlier ones.
library(stringr)
storm_data$EVTYPE <- local({
  evtype <- str_trim(toupper(storm_data$EVTYPE), side = "both")

  fixes <- matrix(c(
    ## separators and whitespace
    "-", " ",
    " +", " ",                      # collapse runs of spaces into one
    "AVALANCE", "AVALANCHE",
    ## flood variants
    "FLOODS", "FLOOD",
    "FLOODING", "FLOOD",
    "FLASH FLOOD", "FLOOD/FLASH",
    "FLOOD FLASH", "FLOOD/FLASH",
    "/$", "",
    "/ ", "/",
    "FLOOD/FLASH/FLOOD", "FLOOD/FLASH",
    "FLOOD/FLASH LANDSLIDES", "FLOOD/FLASH/LANDSLIDE",
    "FLOOD/FLASHFLOOD", "FLOOD/FLASH",
    "FLOOD/FLOOD/FLASH", "FLOOD/FLASH",
    ## four backslashes in the R string = one literal backslash in the regex
    "FROST\\\\FREEZE", "FROST/FREEZE",
    ## plurals
    "WINDS", "WIND",
    "WAVES", "WAVE",
    "SLIDES", "SLIDE",
    "RAINS", "RAIN",
    "TREES", "TREE",
    ## thunderstorm abbreviations and misspellings
    "TSTM", "THUNDERSTORM",
    "THUDERSTORM", "THUNDERSTORM",
    "THUNDEERSTORM", "THUNDERSTORM",
    "THUNDERESTORM", "THUNDERSTORM",
    "THUNDERSNOW", "THUNDERSTORM",
    "THUNDERSTROM", "THUNDERSTORM",
    "THUNDERTORM", "THUNDERSTORM",
    "THUNERSTORM", "THUNDERSTORM",
    "TUNDERSTORM", "THUNDERSTORM",
    ## split the run-together form BEFORE stripping a stray trailing "W";
    ## the other way round turns THUNDERSTORMWIND into THUNDERSTORMIND
    "THUNDERSTORMWIND", "THUNDERSTORM WIND",
    "THUNDERSTORMW", "THUNDERSTORM",
    ## remaining abbreviations, plurals and spelling variants
    "PRECIP$", "PRECIPITATION",
    "CURRENTS", "CURRENT",
    "STORMS", "STORM",
    "ROADS", "ROAD",
    "SQUALLS", "SQUALL",
    "FIRES", "FIRE",
    "MUD SLIDE", "MUDSLIDE",
    "ICE ROAD", "ICY ROAD",
    "SNOWFALL", "SNOW",
    "RAINTORM", "RAINSTORM",
    "SNOW AND ICE STORM", "SNOW/ICE",
    "SNOW/ICE STORM", "SNOW/ICE",
    "SNOW AND ICE", "SNOW/ICE",
    "SNOW FREEZING RAIN", "SNOW/FREEZING RAIN",
    "SNOW AND HEAVY SNOW", "SNOW/HEAVY SNOW",
    "SNOMELT", "SNOWMELT",
    "UNSEASONABLE", "UNSEASONABLY",
    "TORNDAO", "TORNADO",
    "WINTER WEATHER MIX", "WINTER WEATHER/MIX",
    "WINTRY MIX", "WINTER WEATHER/MIX"
  ), ncol = 2, byrow = TRUE)

  ## apply every rule in table order
  for (k in seq_len(nrow(fixes))) {
    evtype <- gsub(fixes[k, 1], fixes[k, 2], evtype)
  }
  evtype
})
## clean up PROPDMGEXP and CROPDMGEXP so multiplication can happen
storm_data$PROPDMGEXP <- toupper(storm_data$PROPDMGEXP)
storm_data$CROPDMGEXP <- toupper(storm_data$CROPDMGEXP)
## get a subset of the data with just property damage information
property <- subset(storm_data, PROPDMG > 0, select = c(EVTYPE, PROPDMG, PROPDMGEXP))
## get a subset of the data with just crop damage information
crops <- subset(storm_data, CROPDMG > 0, select = c(EVTYPE, CROPDMG, CROPDMGEXP))
## Convert damage figures to their actual values by applying the magnitude
## code that accompanies each figure.
##
## DMG     numeric vector of damage amounts.
## DMGEXP  magnitude codes, one per element of DMG: "K" multiplies by 1e3,
##         "M" by 1e6 and "B" by 1e9. Any other value (including "" and NA)
##         leaves the amount unchanged. Matching is case-sensitive.
##
## Returns a numeric vector the same length as DMG. The function is
## vectorized, and still accepts the single-value calls used previously.
mult_damages <- function(DMG, DMGEXP) {
  multipliers <- c(K = 1e3, M = 1e6, B = 1e9)
  ## look each code up by name; unknown, empty or missing codes come back NA
  mult <- unname(multipliers[as.character(DMGEXP)])
  mult[is.na(mult)] <- 1
  DMG * mult
}
## Use the subset with property damage > 0: apply the magnitude code to each
## property damage figure, one record at a time.
## seq_len() is used rather than 1:len_prop so that an empty subset produces
## an empty result instead of iterating over c(1, 0).
len_prop <- length(property$PROPDMG)
new_prop_damages <- vapply(
  seq_len(len_prop),
  function(i) mult_damages(property$PROPDMG[i], property$PROPDMGEXP[i]),
  numeric(1)
)
## add new data field to data frame
property$new_damages <- new_prop_damages
## Use the subset with crop damage > 0: apply the magnitude code to each crop
## damage figure, one record at a time.
## seq_len() is used rather than 1:len_crops so that an empty subset produces
## an empty result instead of iterating over c(1, 0).
len_crops <- length(crops$CROPDMG)
new_crop_damages <- vapply(
  seq_len(len_crops),
  function(i) mult_damages(crops$CROPDMG[i], crops$CROPDMGEXP[i]),
  numeric(1)
)
## add new data field to data frame
crops$new_damages <- new_crop_damages
Across the United States, which types of events are most harmful with respect to population health?
I took a subset of the storm data to look at just the event types that had caused injuries or fatalities. A box plot of these fatalities and of the injuries showed that there were clear outliers at the top of the chart, but that most of the events that caused fatalities and injuries were clustered at the bottom. Since I'm interested in the worst events, I found the events that individually had the worst number of fatalities and the worst number of injuries. I also cut off the data where the boxplot began to show the highest outliers and summed up the total fatalities and injuries for each of these types of events.
## Records with at least one fatality, keeping only event type and count
fatalities <- storm_data[
  which(storm_data$FATALITIES > 0),
  c("EVTYPE", "FATALITIES")
]
## Highest fatality count of any single record, and the record(s) reaching it
max_deaths <- max(fatalities$FATALITIES)
worst_death <- subset(fatalities, FATALITIES == max_deaths)
## Records in the upper tail: more than 75 fatalities
worst_deaths <- fatalities[fatalities$FATALITIES > 75, ]
## Total fatalities per event type across those upper-tail records
sum_fatalities <- with(worst_deaths, tapply(FATALITIES, EVTYPE, sum))
## Records with at least one injury, keeping only event type and count
injuries <- storm_data[
  which(storm_data$INJURIES > 0),
  c("EVTYPE", "INJURIES")
]
## Highest injury count of any single record, and the record(s) reaching it
max_injuries <- max(injuries$INJURIES)
worst_injury <- subset(injuries, INJURIES == max_injuries)
## Records in the upper tail: more than 500 injuries
worst_injuries <- injuries[injuries$INJURIES > 500, ]
## Total injuries per event type across those upper-tail records
sum_injuries <- with(worst_injuries, tapply(INJURIES, EVTYPE, sum))
Across the United States, which types of events have the greatest economic consequences?
I took a subset of the storm data to look at just the event types that had caused property damage or crop damage. A box plot of these damages showed, like with fatalities and injuries, that there were clear outliers at the top of the chart, but that most of the events that caused damages were clustered at the bottom. Since I'm interested in the worst events, I found the events that individually had the highest amount of property damage and crop damage. I also cut off the data where the boxplot began to show the highest outliers and summed up the total damages for each of these types of events.
## Works on the property damage subset (with its new_damages column) built in
## the Data Processing section
## Largest single property damage figure, and the record(s) reaching it
max_prop_damage <- max(property$new_damages)
worst_prop_damage <- subset(property, new_damages == max_prop_damage)
## Records in the upper tail: property damage above 1,000,000,000
worst_prop_damages <- property[property$new_damages > 1000000000, ]
## Total property damage per event type across those upper-tail records
sum_prop_damages <- with(
  worst_prop_damages,
  tapply(new_damages, EVTYPE, sum)
)
## Works on the crop damage subset (with its new_damages column) built in the
## Data Processing section
## Largest single crop damage figure, and the record(s) reaching it
max_crop_damage <- max(crops$new_damages)
worst_crop_damage <- subset(crops, new_damages == max_crop_damage)
## Records in the upper tail: crop damage above 500,000,000
worst_crop_damages <- crops[crops$new_damages > 500000000, ]
## Total crop damage per event type across those upper-tail records
sum_crop_damages <- with(
  worst_crop_damages,
  tapply(new_damages, EVTYPE, sum)
)
# Side-by-side boxplots: fatality counts (left) and injury counts (right)
options(scipen = 999) ## print axis labels in fixed rather than exponent form
par(mfrow = c(1, 2)) ## one row, two plotting panels
with(
  fatalities,
  boxplot(FATALITIES, main = "Number of Fatalities per Event Type")
)
with(
  injuries,
  boxplot(INJURIES, main = "Number of Injuries per Event Type")
)
Boxplots of fatalities and injuries show that most fatality and injury counts are clustered at low numbers. From the analysis above, I found that the event types that caused the most fatalities and the most injuries were:
The boxplots showed us which single events caused the most fatalities and injuries; however, other event types that occurred on different days may have caused more harm in total than these highest events. Looking at just the events with more than 75 fatalities (based on where the boxplot showed the higher-count events starting), and at just the events with more than 500 injuries (again, based on where the boxplot showed the higher-count events starting), and summing up the totals per event type, we get:
Major fatalities per event type:
# Show the fatality totals per event type for records with more than 75
# fatalities; the lines starting with ## are the recorded output
sum_fatalities
## EXCESSIVE HEAT HEAT TORNADO
## 99 583 478
Major injuries per event type:
# Show the injury totals per event type for records with more than 500
# injuries; the lines starting with ## are the recorded output
sum_injuries
## EXCESSIVE HEAT FLOOD HURRICANE/TYPHOON ICE STORM
## 519 2700 780 1568
## TORNADO
## 9174
You can see that the event types that caused the greatest amount of damage as single events still caused the most damage even when other event types have all of their fatalities and injuries added together. So, HEAT and TORNADO appear to be the most dangerous event types regarding fatalities and injuries.
# Side-by-side boxplots: property damage (left) and crop damage (right)
options(scipen = 999) ## print axis labels in fixed rather than exponent form
par(mfrow = c(1, 2)) ## one row, two plotting panels
with(
  property,
  boxplot(new_damages, main = "Property Damages per Event Type")
)
with(
  crops,
  boxplot(new_damages, main = "Crop Damages per Event Type")
)
Boxplots of property damages and crop damages show that most damage amounts are clustered at low values. From the analysis above, I found that the event types that caused the most damages were:
The boxplots showed us which single events caused the most property damage and crop damage; however, other event types that occurred on different days may have caused more damage in total than these highest events. Looking at just the events with more than $1,000,000,000 in property damage (based on where the boxplot showed the higher-value events starting), and at just the events with more than $500,000,000 in crop damage (again, based on where the boxplot showed the higher-value events starting), and summing up the totals per event type, we get:
Highest amount of property damage per event type:
# Show the property damage totals per event type for records with more than
# 1,000,000,000 in property damage; the lines starting with ## are the
# recorded output
sum_prop_damages
## FLOOD HAIL
## 121500000000 1800000000
## HEAVY RAIN/SEVERE WEATHER HIGH WIND
## 2500000000 1300000000
## HURRICANE HURRICANE OPAL
## 4700000000 2100000000
## HURRICANE/TYPHOON RIVER FLOOD
## 64500000000 5000000000
## SEVERE THUNDERSTORM STORM SURGE
## 1200000000 42560000000
## STORM SURGE/TIDE TORNADO
## 4000000000 4300000000
## TORNADOES, THUNDERSTORM WIND, HAIL TROPICAL STORM
## 1600000000 5150000000
## WILD/FOREST FIRE WILDFIRE
## 1500000000 1040000000
## WINTER STORM
## 5000000000
Highest amount of crop damage per event type:
# Show the crop damage totals per event type for records with more than
# 500,000,000 in crop damage; the lines starting with ## are the recorded
# output
sum_crop_damages
## DROUGHT EXTREME COLD HURRICANE/TYPHOON ICE STORM
## 2093850000 596000000 1510000000 5000000000
## RIVER FLOOD
## 5000000000
You can see that the event types that caused the greatest amount of damage as single events still caused the most damage even when other event types have all of their damages added together. So, FLOOD appears to be the event type that causes the most property damage, and RIVER FLOOD and ICE STORM appear to be the event types that cause the most crop damage, though for both types of damages other event types can be close.