This paper has been produced as a component of the Reproducible Research course by Roger D. Peng, PhD, Jeff Leek, PhD, Brian Caffo, PhD which is a requirement of the Data Science specialization offered by Johns Hopkins University through Coursera.
Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern. [1]
In this paper we will address 2 simple questions:
To answer these questions the paper relied on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. [1]
The first step in the analysis will be to download and load the provided database. In this case we will directly load the compressed .csv data. We then check for missing values. Finally, we compactly display the internal structure of data frame.
setwd("~/Documents/coursera/Reproducible Research/PeerAssessment2")
if (!"NOAAData" %in% ls()){
if (!file.exists("repdata-data-StormData.csv.bz2"))
{
download.file("http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", "repdata-data-StormData.csv.bz2", quiet=FALSE)
sprintf("File repdata-data-StormData.csv.bz2 downloaded on %s", date())
}
NOAAData <- read.csv("repdata-data-StormData.csv.bz2")
}
sapply(NOAAData, function(x) sum(is.na(x)))
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME
## 0 0 0 0 0 0
## STATE EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE
## 0 0 0 0 0 0
## END_TIME COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI
## 0 0 902297 0 0 0
## LENGTH WIDTH F MAG FATALITIES INJURIES
## 0 0 843563 0 0 0
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC
## 0 0 0 0 0 0
## ZONENAMES LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## 0 47 0 40 0 0
## REFNUM
## 0
str(NOAAData, width=80, strict.width = "cut")
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11..
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 ..
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE"..
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 83..
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1..
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1..
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ..
## $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 ..
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 ..
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","%SD",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 ..
## $ ZONENAMES : Factor w/ 25112 levels ""," "..
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ..
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Question 1
Now that we have the data downloaded and loaded into a data frame. We will anser the first question:
First we will group and count the fatalities (deaths) and injuries using plyr.
library(plyr, quietly = TRUE)
summaryDeathData <- ddply(.data = NOAAData, c("EVTYPE"), summarize,
total_deaths = sum(FATALITIES))
summaryDeathData <- arrange(summaryDeathData, desc(total_deaths))[1:7,]
summaryDeathData$EVTYPE <- factor(summaryDeathData$EVTYPE, as.character(summaryDeathData$EVTYPE))
summaryInjuryData <- ddply(.data = NOAAData, c("EVTYPE"), summarize,
total_injuries = sum(INJURIES))
summaryInjuryData <- arrange(summaryInjuryData, desc(total_injuries))[1:7,]
summaryInjuryData$EVTYPE <- factor(summaryInjuryData$EVTYPE, as.character(summaryInjuryData$EVTYPE))
Now let’s create a 2 panel plot to get the answer.
library(ggplot2, quietly = TRUE)
plot1 <- ggplot(data=summaryDeathData, aes(x=EVTYPE, y=total_deaths, fill=1)) +
geom_bar(colour="black", stat="identity") +
ggtitle("Seven Most Deadly Events") +
ylab("Deaths") + xlab ("Event") +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
guides(fill=FALSE)
plot2 <- ggplot(data=summaryInjuryData, aes(x=EVTYPE, y=total_injuries, fill=1)) +
geom_bar(colour="black", stat="identity") +
ggtitle("Top Seven Injury Types") +
ylab("Deaths") + xlab ("Event") +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
guides(fill=FALSE)
library(gridExtra, quietly = TRUE)
grid.arrange(plot1, plot2, ncol=2)
We clearly see tornados have the largest impact on health.
Question 2
We will now answer the second question:
We will first do some data cleanup. Per the documentation, I will interpret the EXP related fields as follows:
B=1,000,000,000 M=1,000,000 K= 1,000 H = 100
Further cleanup of these fields are out of scope per the forum post at https://class.coursera.org/repdata-005/forum/thread?thread_id=99.
NOAAData$CropMultiplier <- 1
NOAAData$PropMultiplier <- 1
NOAAData$CROPDMGEXP <- toupper(NOAAData$CROPDMGEXP)
NOAAData$PROPDMGEXP <- toupper(NOAAData$PROPDMGEXP)
NOAAData$CropMultiplier[NOAAData$CROPDMGEXP == "B"] <- 1000000000
NOAAData$PropMultiplier[NOAAData$PROPDMGEXP == "B"] <- 1000000000
NOAAData$CropMultiplier[NOAAData$CROPDMGEXP == "M"] <- 1000000
NOAAData$PropMultiplier[NOAAData$PROPDMGEXP == "M"] <- 1000000
NOAAData$CropMultiplier[NOAAData$CROPDMGEXP == "K"] <- 1000
NOAAData$PropMultiplier[NOAAData$PROPDMGEXP == "K"] <- 1000
NOAAData$CropMultiplier[NOAAData$CROPDMGEXP == "H"] <- 100
NOAAData$PropMultiplier[NOAAData$PROPDMGEXP == "H"] <- 100
NOAAData$CropTotalDamage <- NOAAData$CropMultiplier * NOAAData$CROPDMG
NOAAData$PropTotalDamage <- NOAAData$PropMultiplier * NOAAData$PROPDMG
Now that we have the data cleaned up, we will group and count the crop damage and property damage using plyr.
library(plyr, quietly = TRUE)
summaryCropData <- ddply(.data = NOAAData, c("EVTYPE"), summarize,
total_crop_dmg = sum(CropTotalDamage))
summaryCropData <- arrange(summaryCropData, desc(total_crop_dmg))[1:7,]
summaryCropData$EVTYPE <- factor(summaryCropData$EVTYPE, as.character(summaryCropData$EVTYPE))
summaryPropData <- ddply(.data = NOAAData, c("EVTYPE"), summarize,
total_prop_dmg = sum(PropTotalDamage))
summaryPropData <- arrange(summaryPropData, desc(total_prop_dmg))[1:7,]
summaryPropData$EVTYPE <- factor(summaryPropData$EVTYPE, as.character(summaryPropData$EVTYPE))
We then can create a 2 panel plot to answer the question.
library(ggplot2, quietly = TRUE)
plot1 <- ggplot(data=summaryCropData, aes(x=EVTYPE, y=total_crop_dmg, fill=1)) +
geom_bar(colour="black", stat="identity") +
ggtitle("Seven Most Costly Crop Events") +
ylab("Financial Impact") + xlab ("Event") +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
guides(fill=FALSE)
plot2 <- ggplot(data=summaryPropData, aes(x=EVTYPE, y=total_prop_dmg, fill=1)) +
geom_bar(colour="black", stat="identity") +
ggtitle("Seven Most Costly Property Events") +
ylab("Financial Impact") + xlab ("Event") +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
guides(fill=FALSE)
library(gridExtra, quietly = TRUE)
grid.arrange(plot1, plot2, ncol=2)
We can see from the plots that droughts have caused the most crop damage and floods have created the most property damage.