Tornadoes and thunderstorms have the highest total health impact (defined as fatalities plus injuries) when summed across all recorded occurrences. Floods and hurricanes cause the most total economic damage (defined as property plus crop damage) when summed across all recorded occurrences.
library(plyr)
library(dplyr)
library(ggplot2)
First we load the dataset - either from the URL or locally.
# Load the storm data set into `stormData`.
# An already-extracted StormData.csv is used when present. Otherwise the
# bzip2 archive is downloaded (only if it is not on disk yet) and read
# directly: read.csv() decompresses bzip2 input transparently, so the
# bunzip2() helper from R.utils -- a package this script never loads -- is
# not needed.
stormCsv <- "StormData.csv"
stormArchive <- "StormData.csv.bz2"
if (file.exists(stormCsv)) {
  stormSource <- stormCsv
} else {
  if (!file.exists(stormArchive)) {
    # mode = "wb" keeps the binary archive intact on Windows.
    download.file(
      url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
      destfile = stormArchive,
      mode = "wb"
    )
  }
  stormSource <- stormArchive
}
stormData <- read.csv(file = stormSource)
# Convert the damage figures to absolute amounts.
# PROPDMGEXP / CROPDMGEXP hold a code for the power of ten that the matching
# PROPDMG / CROPDMG value must be multiplied by. `exponentLevel` lists the
# recognised codes and `exponentLevel.Fixed` the exponent used for each one
# (same position in both vectors): blank and symbol codes map to 0, digits to
# themselves, B/H/K/M (either case) to 9/2/3/6.
exponentLevel <- c("", "-", "?", "+", 0:9, "B", "b", "H", "h", "K", "k", "M", "m")
exponentLevel.Fixed <- c(0, 0, 0, 0, 0:9, 9, 9, 2, 2, 3, 3, 6, 6)

# Look the exponents up by position. Going through as.character() makes the
# lookup correct whether read.csv() produced factors or character vectors;
# calling as.numeric() on a factor would return its internal level codes
# instead of the mapped exponents. Codes that are not listed yield NA.
propExponent <- exponentLevel.Fixed[
  match(as.character(stormData$PROPDMGEXP), exponentLevel)
]
cropExponent <- exponentLevel.Fixed[
  match(as.character(stormData$CROPDMGEXP), exponentLevel)
]

stormData$PROPDMG <- stormData$PROPDMG * 10^propExponent
stormData$CROPDMG <- stormData$CROPDMG * 10^cropExponent

# The exponent columns are folded into the damage values now; drop them.
stormData <- subset(stormData, select = -c(PROPDMGEXP, CROPDMGEXP))
There are many anomalies in the event type (EVTYPE) variable of this dataset. This section will attempt to clean and group like events.
# Normalise the event type labels and merge spelling variants.
# Upper-case the values themselves and build the factor explicitly, so the
# step works whether read.csv() delivered EVTYPE as a factor or as a
# character vector (upper-casing only the levels does nothing for a
# character column, which has no levels).
stormData$EVTYPE <- factor(toupper(as.character(stormData$EVTYPE)))

# Ordered rewrite rules: every level matching `pattern` is replaced by
# `replacement`. Order matters -- e.g. the FLASH rule must run before the
# generic FLOOD rule, and the THUNDERSTORM rule before the HAIL rule.
# `perl` marks the one pattern that needs a PCRE-only look-ahead.
evtypeRules <- data.frame(
  pattern = c(
    "^TSTM.*|^THU.*|^\\sTSTM.*|.*THUNDER.*",
    "^TORN.*",
    "^HIGH\\sWIND.*",
    "^TROPICAL\\sS.*",
    "^HURRICANE.*|^TYPHOON.*",
    "^LIGHTNING.*|^LIGHTING.*",
    ".*HAIL.*",
    ".*FLASH.*",
    "^(?!FLASH).*FLOOD.*",
    "^HEAVY.*SNOW.*",
    "^HEAVY.*RAIN.*",
    "^SNOW.*",
    "^WILD.*",
    "^GUST.*",
    "^WATERSP.*"
  ),
  replacement = c(
    "THUNDERSTORM",
    "TORNADO",
    "HIGH WIND",
    "TROPICAL STORM",
    "HURRICANE (TYPHOON)",
    "LIGHTNING",
    "HAIL",
    "FLASH FLOOD",
    "FLOOD",
    "HEAVY SNOW",
    "HEAVY RAIN",
    "HEAVY SNOW",
    "WILDFIRE",
    "HIGH WIND",
    "WATERSPOUT"
  ),
  perl = c(
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    TRUE,
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
  ),
  stringsAsFactors = FALSE
)

# Run every rule over the level labels, then assign once; assigning
# duplicated labels to levels() merges the corresponding factor levels.
evtypeLevels <- levels(stormData$EVTYPE)
for (i in seq_len(nrow(evtypeRules))) {
  evtypeLevels <- gsub(
    evtypeRules$pattern[i],
    evtypeRules$replacement[i],
    evtypeLevels,
    perl = evtypeRules$perl[i]
  )
}
levels(stormData$EVTYPE) <- evtypeLevels
This code creates separate data frames for economic damage and health damage. While creating those data sets we create new variables with total health and economic impacts.
By creating these datasets the code required to create the plots will be easier to read and understand.
# Add per-record totals: economic damage (property + crop) and health
# damage (fatalities + injuries).
stormData <- mutate(
  stormData,
  ECONOMICDMG = PROPDMG + CROPDMG,
  HEALTHDMG = FATALITIES + INJURIES
)

# Ten event types with the largest combined fatalities and injuries.
# Event types with no health impact are removed before ranking.
# (The column name `injuryTotall` is kept exactly as originally spelled.)
stormDataHealth <- stormData %>%
  select(EVTYPE, FATALITIES, INJURIES, HEALTHDMG) %>%
  group_by(EVTYPE) %>%
  summarise(
    fatalityTotal = sum(FATALITIES),
    injuryTotall = sum(INJURIES),
    totalHealthImpact = sum(HEALTHDMG)
  ) %>%
  filter(totalHealthImpact > 0) %>%
  arrange(desc(totalHealthImpact)) %>%
  head(10)

# Ten event types with the largest combined property and crop damage.
# totalEconImpact is divided by 1e9, i.e. expressed in billions.
stormDataEcon <- stormData %>%
  select(EVTYPE, PROPDMG, CROPDMG, ECONOMICDMG) %>%
  group_by(EVTYPE) %>%
  summarise(
    propTotal = sum(PROPDMG),
    cropTotal = sum(CROPDMG),
    totalEconImpact = sum(ECONOMICDMG) / 1e9
  ) %>%
  filter(totalEconImpact > 0) %>%
  arrange(desc(totalEconImpact)) %>%
  head(10)
The results of the weather and health analysis are shown below. Health effects are defined as fatalities plus injuries and are grouped together under a new variable, totalHealthImpact. The weather events with the greatest health impacts for this data set were tornadoes and thunderstorms.
# Bar chart of the ten event types with the highest health impact, bars
# ordered from largest to smallest total.
ggplot(
  stormDataHealth,
  aes(x = reorder(EVTYPE, -totalHealthImpact), y = totalHealthImpact)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10 Weather Events\nWith Highest Impact To Human Health",
    x = "Event",
    y = "Number of Fatalities/Injuries"
  ) +
  # Rotate the event labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = -90, hjust = 0))
The results of the weather and economic analysis are shown below. Economic effects are defined as total property and crop damage per weather instance and are grouped together under a new variable, totalEconImpact. The weather events with the greatest economic damage were floods and hurricanes.
# Bar chart of the ten event types with the highest economic damage, bars
# ordered from largest to smallest total.
ggplot(
  stormDataEcon,
  aes(x = reorder(EVTYPE, -totalEconImpact), y = totalEconImpact)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10 Weather Events\nWith Highest Economic Damage",
    x = "Event",
    y = "Dollars (in Billions)"
  ) +
  # Rotate the event labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = -90, hjust = 0))