We are looking at data from the US National Oceanic and Atmospheric Administration’s (NOAA) storm database at the National Weather Service. The goal is to download the data, clean it, and use it to answer a few simple questions about severe weather events. From the analysis, we identify the 7 weather events that cause the highest number of fatalities and the 7 weather events that cause the highest number of injuries. The data suggest that tornadoes cause by far the highest number of both fatalities and injuries. In terms of economic damage, if we combine crop and property damage, floods cause the most damage, followed by hurricanes and tornadoes.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Download the compressed storm data once; later runs reuse the local copy.
fname <- "repdata_data_StormData.csv.bz2"
furl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists(fname)) {
  # mode = "wb" keeps the binary archive byte-exact on every platform.
  download.file(furl, fname, mode = "wb")
}
# Read the archive directly: read.csv() opens the path through file(), which
# detects bzip2 compression when reading, so no separate extraction step (and
# no change of working directory) is needed. unzip() only handles zip
# archives and cannot extract a .bz2 file.
stormData <- read.csv(fname, sep = ",", header = TRUE)
The raw data contains extraneous variables we don’t need, and property damage and crop damage are each split across two columns: a numeric value (PROPDMG, CROPDMG) and a magnitude code (PROPDMGEXP, CROPDMGEXP). We need to tidy up the data and make it more intuitive and manageable. We also need to combine crop damage and property damage into a single variable that estimates the total economic damage (stored in “totalDamage”).
# Keep only the columns used by the analysis. Each column is listed once so
# the subset does not gain a duplicated STATE column.
keep_cols <- c(
  "STATE", "BGN_DATE", "BGN_TIME", "TIME_ZONE",
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
stormData <- stormData[, keep_cols]

# Multipliers for the magnitude codes provided in the data
# (H = hundreds, K = thousands, M = millions, B = billions).
exp_multiplier <- c(H = 10^2, K = 10^3, M = 10^6, B = 10^9)

# Translate a vector of magnitude codes into numeric multipliers.
# Matching is case-insensitive; any code not listed above (including an
# empty or missing code) gets a multiplier of 1, leaving the value unchanged.
damage_multiplier <- function(code) {
  mult <- unname(exp_multiplier[toupper(as.character(code))])
  mult[is.na(mult)] <- 1
  mult
}

stormData$propExp <- damage_multiplier(stormData$PROPDMGEXP)
stormData$cropExp <- damage_multiplier(stormData$CROPDMGEXP)

# Scale the reported damage figures by their multipliers.
stormData$PROPTOTAL <- stormData$PROPDMG * stormData$propExp
stormData$CROPTOTAL <- stormData$CROPDMG * stormData$cropExp

# Total economic damage = property damage + crop damage.
stormData$totalDamage <- stormData$PROPTOTAL + stormData$CROPTOTAL
# Fatalities: total per event type, keeping the seven largest totals.
# head() returns at most seven rows, so no all-NA rows are produced when
# fewer than seven event types exist.
fatal <- aggregate(FATALITIES ~ EVTYPE, data = stormData, sum)
fatal <- head(fatal[order(-fatal$FATALITIES), ], 7)
# Fix the factor levels to the sorted order so the bars are drawn from the
# largest total to the smallest instead of alphabetically.
fatal$EVTYPE <- factor(fatal$EVTYPE, levels = fatal$EVTYPE)
gg1 <- ggplot(fatal, aes(x = EVTYPE, y = FATALITIES)) +
  geom_bar(stat = "identity", fill = "red") +
  xlab("Event Type") +
  ylab("Fatalities") +
  ggtitle("Weather Event Fatalities")

# Injuries: same treatment as fatalities.
injury <- aggregate(INJURIES ~ EVTYPE, data = stormData, sum)
injury <- head(injury[order(-injury$INJURIES), ], 7)
injury$EVTYPE <- factor(injury$EVTYPE, levels = injury$EVTYPE)
gg2 <- ggplot(injury, aes(x = EVTYPE, y = INJURIES)) +
  geom_bar(stat = "identity", fill = "orange") +
  xlab("Event Type") +
  ylab("Injuries") +
  ggtitle("Weather Event Injuries")

# Stack the two charts vertically. The call is namespace-qualified, so no
# extra require() is needed (require() only returns FALSE if loading fails).
gridExtra::grid.arrange(gg1, gg2, nrow = 2)
In the above plots, we see that tornadoes cause by far the most fatalities and injuries, having about as much impact as the next 6 severe weather events combined.
# Total (crop + property) damage per event type, keeping the seven largest
# totals. head() returns at most seven rows, so no all-NA rows are produced
# when fewer than seven event types exist.
totDamage <- aggregate(totalDamage ~ EVTYPE, data = stormData, sum)
totDamage <- head(totDamage[order(-totDamage$totalDamage), ], 7)
# Fix the factor levels to the sorted order so the bars are drawn from the
# largest total to the smallest instead of alphabetically.
totDamage$EVTYPE <- factor(totDamage$EVTYPE, levels = totDamage$EVTYPE)
# Plot damages.
gg1 <- ggplot(totDamage, aes(x = EVTYPE, y = totalDamage)) +
  geom_bar(stat = "identity", fill = "green") +
  xlab("Event Type") +
  ylab("Total Damage") +
  ggtitle("Weather Event Damages (Crop + Property)")
print(gg1)
In the above plot, we can see that floods cause the most severe economic damage, followed by hurricanes/typhoons and tornadoes.