Sanyam Jain
In this report, the goal is to analyze the impact of different weather events on public health and the economy, based on the storm database collected by the U.S. National Oceanic and Atmospheric Administration (NOAA) from 1950 to 2011. Estimates of fatalities, injuries, property damage, and crop damage are used to decide which types of events are most harmful to population health and the economy. From these data, we found that high temperatures and tornadoes are most harmful with respect to population health, while floods, droughts, and hurricanes/typhoons have the greatest economic impact.
Loading Packages:
library(ggplot2)
library(reshape2)
library(plyr)
# Download the compressed storm data set into the working directory (only if
# it is not already there) and read it into the variable StormData.
filename <- "stormData.csv.bz2"
# Test for the very file that is downloaded and read below. Checking a
# different name would trigger a fresh download on every run.
if (!file.exists(filename)) {
  fileurl <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  # The archive is binary, so request binary mode explicitly instead of
  # relying on platform-specific defaults.
  download.file(fileurl, filename, method = "auto", mode = "wb")
}
# read.csv is handed the .bz2 file directly, exactly as before.
StormData <- read.csv(filename)
Aggregating the number of fatalities and injuries per weather event.
# Total fatalities and total injuries per event type, joined into one table
# with the columns EVTYPE, Fatalities and Injuries.
fatalities <- ddply(
  StormData, "EVTYPE", summarize,
  Fatalities = sum(FATALITIES, na.rm = TRUE)
)
injuries <- ddply(
  StormData, "EVTYPE", summarize,
  Injuries = sum(INJURIES, na.rm = TRUE)
)
harmdata <- merge(fatalities, injuries, by = "EVTYPE")
Cleaning up misspelled and inconsistently named event types (EVTYPE values):
# Collapse every event type whose name matches a pattern into a single row.
#
# Args:
#   param: Regular expression matched case-insensitively against EVTYPE. It
#          is also used as the EVTYPE label of the combined row.
#   data:  Data frame with the columns EVTYPE, Fatalities and Injuries.
#          Defaults to the global `harmdata`, so existing one-argument calls
#          keep working unchanged.
#
# Returns:
#   `data` without the matching rows, followed by one appended row holding
#   the summed Fatalities and Injuries of the rows that matched. EVTYPE is
#   returned as a character column.
cleanHarm <- function(param, data = harmdata) {
  # Evaluate the pattern once and reuse the mask for the totals and the filter.
  matched <- grepl(param, data$EVTYPE, ignore.case = TRUE)
  newRow <- data.frame(
    EVTYPE = param,
    Fatalities = sum(data$Fatalities[matched]),
    Injuries = sum(data$Injuries[matched])
  )
  # Work on character labels so the new label can be appended freely.
  data$EVTYPE <- as.character(data$EVTYPE)
  rbind(data[!matched, ], newRow)
}
# Ordered clean-up rules for the health data: c(pattern, label).
# Each pattern is passed to cleanHarm(), which merges all matching event types
# into one row appended at the end; that row is then given `label`.
# The order matters: a later pattern can match a label produced by an earlier
# rule (for example "ICE" absorbs the "ICE ON ROAD" row again, and "low|cold"
# absorbs the earlier "COLD" row).
# Compared with the previous hand-written sequence, the separate "TORM" and
# "TSTM" passes were dropped (the "TORM|TSTM" rule covers both), and so was
# the second "WARM" pass, which reverted the "WARM WEATHER" label.
harm_rules <- list(
  c("HEAT", "HEAT"),
  c("TORNADO", "TORNADO"),
  c("FLOOD", "FLOOD"),
  c("HAIL", "HAIL"),
  c("SNOW", "SNOW"),
  c("CURRENT", "CURRENT"),
  c("COLD", "COLD"),
  c("TORM|TSTM", "THUNDERSTORM"),
  c("AVALAN", "AVALANCHE"),
  c("WIND", "WIND"),
  c("HURRICANE", "HURRICANE"),
  c("WINTER WEATHER", "WINTER WEATHER"),
  c("WILD", "WILD FIRE"),
  c("LIGHT", "LIGHTNING"),
  c("DUST DEVIL", "DUST DEVIL"),
  c("ROAD", "ICE ON ROAD"),
  c("ICE", "ICE"),
  c("FOG", "FOG"),
  c("HYPOTHERMIA", "HYPOTHERMIA"),
  c("LANDSLIDE", "LANDSLIDE"),
  c("MUDSLIDE", "MUDSLIDE"),
  c("SURF", "HAZARDOUS SURF"),
  c("WARM", "WARM WEATHER"),
  c("MARINE", "MARINE ACCIDENT"),
  c("low|cold", "COLD"),
  c("RAINFALL|HEAVY RAIN", "HEAVY RAIN")
)
for (rule in harm_rules) {
  harmdata <- cleanHarm(rule[1])
  # cleanHarm() appends the merged row last; relabel it.
  harmdata[nrow(harmdata), 1] <- rule[2]
}
Fatality totals are kept separate from injury totals in the final dataset. Merging the top 10 causes of injuries with the top 10 causes of fatalities:
# Rank the event types by injuries and by fatalities, highest first.
byInjuries <- arrange(harmdata, desc(Injuries))
byFatalities <- arrange(harmdata, desc(Fatalities))
# An event is kept when it is among the first ten of either ranking.
topHarmEvents <- union(byInjuries[1:10, 1], byFatalities[1:10, 1])
MostHarmful <- harmdata[harmdata$EVTYPE %in% topHarmEvents, ]
# Long format (one row per event and measure) for plotting.
MostHarmful_melt <- melt(MostHarmful, id = "EVTYPE")
Cleaning up some of the odd notations:
# Letter exponent codes and the power of ten each one is recoded to.
expLetters <- c("B", "b", "M", "m", "K", "k", "H", "h")
expPowers <- c("9", "9", "6", "6", "3", "3", "2", "2")
# Exponent symbols whose rows are discarded.
expRejected <- c("+", "-", "?")
damageExpCols <- c("PROPDMGEXP", "CROPDMGEXP")

# Recode the letters in both exponent columns.
for (expCol in damageExpCols) {
  StormData[[expCol]] <- mapvalues(
    StormData[[expCol]],
    from = expLetters, to = expPowers, warn_missing = FALSE
  )
}
# Then drop the rows carrying a rejected symbol, property column first.
for (expCol in damageExpCols) {
  keepRow <- !is.na(StormData[[expCol]]) &
    !(StormData[[expCol]] %in% expRejected)
  StormData <- StormData[keepRow, ]
}
Calculating and aggregating the damage value for properties and crops per tracked weather event:
# Value of a record: the reported amount scaled by ten to the power of its
# (recoded) exponent column.
scaleByExponent <- function(amount, exponent) {
  amount * 10^as.numeric(as.character(exponent))
}
# Sum the scaled property and crop damage per event type, then join the two
# totals into one table keyed by EVTYPE.
dataPropDmg <- ddply(
  StormData, "EVTYPE", summarize,
  PropertyDamage = sum(scaleByExponent(PROPDMG, PROPDMGEXP), na.rm = TRUE)
)
dataCropDmg <- ddply(
  StormData, "EVTYPE", summarize,
  CropDamage = sum(scaleByExponent(CROPDMG, CROPDMGEXP), na.rm = TRUE)
)
damage <- merge(dataPropDmg, dataCropDmg, by = "EVTYPE")
Cleaning up misspelled and inconsistently named event types (EVTYPE values):
# Collapse every event type whose name matches a pattern into a single row.
#
# Args:
#   arg:  Regular expression matched case-insensitively against EVTYPE. It is
#         also used as the EVTYPE label of the combined row.
#   data: Data frame with the columns EVTYPE, PropertyDamage and CropDamage.
#         Defaults to the global `damage`, so existing one-argument calls
#         keep working unchanged.
#
# Returns:
#   `data` without the matching rows, followed by one appended row holding
#   the summed PropertyDamage and CropDamage of the rows that matched. EVTYPE
#   is returned as a character column.
cleanDamage <- function(arg, data = damage) {
  # Evaluate the pattern once and reuse the mask for the totals and the filter.
  matched <- grepl(arg, data$EVTYPE, ignore.case = TRUE)
  newRow <- data.frame(
    EVTYPE = arg,
    PropertyDamage = sum(data$PropertyDamage[matched]),
    CropDamage = sum(data$CropDamage[matched])
  )
  # Work on character labels so the new label can be appended freely.
  data$EVTYPE <- as.character(data$EVTYPE)
  rbind(data[!matched, ], newRow)
}
# Ordered clean-up rules for the damage data: c(pattern, label).
# Each pattern is passed to cleanDamage(), which merges all matching event
# types into one row appended at the end; that row is then given `label`.
# The order matters: a later pattern can match a label produced by an earlier
# rule (for example "ICE" absorbs the "ICE ON ROAD" row again, and "low|cold"
# absorbs the earlier "COLD" row).
# Compared with the previous hand-written sequence, the separate "TORM" and
# "TSTM" passes were dropped (the "TORM|TSTM" rule covers both), and so was
# the second "WARM" pass, which reverted the "WARM WEATHER" label.
damage_rules <- list(
  c("HEAT", "HEAT"),
  c("TORNADO", "TORNADO"),
  c("FLOOD", "FLOOD"),
  c("HAIL", "HAIL"),
  c("SNOW", "SNOW"),
  c("CURRENT", "CURRENT"),
  c("COLD", "COLD"),
  c("TORM|TSTM", "THUNDERSTORM"),
  c("AVALAN", "AVALANCHE"),
  c("WIND", "WIND"),
  c("HURRICANE", "HURRICANE"),
  c("WINTER WEATHER", "WINTER WEATHER"),
  c("WILD", "WILD FIRE"),
  c("LIGHT", "LIGHTNING"),
  c("DUST DEVIL", "DUST DEVIL"),
  c("ROAD", "ICE ON ROAD"),
  c("ICE", "ICE"),
  c("FOG", "FOG"),
  c("HYPOTHERMIA", "HYPOTHERMIA"),
  c("LANDSLIDE", "LANDSLIDE"),
  c("MUDSLIDE", "MUDSLIDE"),
  c("SURF", "HAZARDOUS SURF"),
  c("WARM", "WARM WEATHER"),
  c("MARINE", "MARINE ACCIDENT"),
  c("low|cold", "COLD"),
  c("RAINFALL|HEAVY RAIN", "HEAVY RAIN")
)
for (rule in damage_rules) {
  damage <- cleanDamage(rule[1])
  # cleanDamage() appends the merged row last; relabel it.
  damage[nrow(damage), 1] <- rule[2]
}
Finally, merging the top 6 events by property damage with the top 6 events by crop damage (8 distinct events in total). In addition, we’ll rescale the values to USD billions for better comparison/visualization:
# Rank the event types by property damage and by crop damage, highest first.
byProperty <- arrange(damage, desc(PropertyDamage))
byCrop <- arrange(damage, desc(CropDamage))
# An event is kept when it is among the first six of either ranking.
topDamageEvents <- union(byProperty[1:6, 1], byCrop[1:6, 1])
MostDamage <- damage[damage$EVTYPE %in% topDamageEvents, ]
# Long format for plotting, with the values divided by 10^9 (billions).
MostDamage_melt <- melt(MostDamage, id = "EVTYPE")
MostDamage_melt$value <- MostDamage_melt$value / (10^9)
# Show the selected events, ordered by event type.
arrange(MostHarmful, EVTYPE)
## EVTYPE Fatalities Injuries
## 1 AVALANCHE 225 170
## 2 COLD 443 320
## 3 CURRENT 577 529
## 4 FLOOD 1525 8604
## 5 HAIL 20 1467
## 6 HEAT 3138 9224
## 7 HURRICANE 133 1328
## 8 LIGHTNING 817 5231
## 9 SNOW 169 1166
## 10 THUNDERSTORM 1148 13659
## 11 TORNADO 5661 91407
## 12 WILD FIRE 90 1606
## 13 WIND 469 1896
# Horizontal bar chart of log(value) per event type, one fill colour per
# measure (Fatalities / Injuries).
g <- ggplot(
  MostHarmful_melt,
  aes(x = EVTYPE, y = log(value), fill = variable)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Events", y = "Log (Total) ", title = "Injuries and Fatalities")
print(g)
From the economic perspective, damage to property is higher than damage to crops. The most damaging weather event is flood, both when considering property and crops together and when analysing property alone. For crops, the worst weather event is drought.
# Show the selected events, ordered by event type.
arrange(MostDamage, EVTYPE)
## EVTYPE PropertyDamage CropDamage
## 1 COLD 245869400 1416765500
## 2 DROUGHT 1046106000 13972566000
## 3 FLOOD 168211315589 12380109100
## 4 HAIL 16022991456 3111712870
## 5 HURRICANE 84656180010 5505292800
## 6 THUNDERSTORM 77519697134 6960767230
## 7 TORNADO 58603317864 417461520
## 8 WILD FIRE 8491563500 402781630
# Horizontal bar chart of the rescaled damage values per event type, one fill
# colour per measure (PropertyDamage / CropDamage).
g <- ggplot(
  MostDamage_melt,
  aes(x = EVTYPE, y = value, fill = variable)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "Event",
    y = "Total Damage USD Billions",
    title = "Property and Crop Damage"
  ) +
  coord_flip()
print(g)
1. Tornadoes cause the highest number of total injuries and fatalities in the U.S. 2. Excessive heat, although it occurs less often than tornadoes, causes a relatively high number of injuries and fatalities per event. 3. Flooding causes the highest total damage expense. 4. Hurricanes and typhoons, although they occur less frequently than flooding, cause a relatively high cost/impact per event.