The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database tracks the characteristics of major storms and weather events in the United States. This study examines the health and economic impact of those events as measured by fatalities, injuries and the estimated dollar impact. The ten largest events and the ten largest event types were identified for each metric and plotted to aid in visually understanding what types of storms and events had the largest impacts.
# Location of the bzip2-compressed CSV export of the storm database.
theURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
The NOAA database included data from 1950 to Nov 2011 and was downloaded from a link provided by the instructor (above).
Several data quality issues were noticed and addressed.
The economic damage was recorded for property and crops. The NOAA documentation states that the numerical damage quantity should be scaled by the related “EXP” “dollar factor code” (K=thousand, M=million and B=billion). Other values that were observed in this field were ignored; however ideally, these relatively few records could be researched and corrected.
Surprisingly, the largest property damage event found in NOAA was a flood in Napa Valley in Dec 2005 and Jan 2006. It had two records in NOAA, one on 12/31/2005 for 115 million and another on 01/01/2006 for 115 billion. Research in this NOAA database and the current U.S. NOAA database, which can be viewed here, reveals that these are two essentially duplicate records: the one of Dec 31, 2005 has 115 million in property damage and the one of Jan 1, 2006 has 115 billion in property damage. The 115 billion item was excluded because the NOAA event narrative and a Google search both confirm that actual damages were in the range of 100 million, not 100 billion.
One date field was needed for this analysis, so “BGN_DATE” was converted into an R date format.
During development, turn on R markdown caching so each step happens only once.
# Cache chunk results while developing so each step runs only once.
knitr::opts_chunk$set(cache = TRUE)
# Switch to the author's project folder only when it exists. On any other
# machine the report keeps running from the current working directory instead
# of aborting because the directory cannot be changed.
project.dir <- "~/Desktop/Online-Classes/Johns Hopkins Data Science/Reproducable Research/RepData_PeerA2"
if (dir.exists(project.dir)) {
  setwd(project.dir)
}
#### download to temp space if necessary and read the file.
# temp <- tempfile()
# download.file(theURL,dest=temp,method="curl")
# noaa <- read.csv(temp,colClasses=c(REMARKS="character",CROPDMGEXP="factor"),nrows=10000)
# unlink(temp)
#### download to working directory (only when the archive is missing) and
#### read the file
bzfile <- "repdata-data-StormData.csv.bz2"
if (!file.exists(bzfile)) {
  # mode = "wb" keeps the compressed archive byte-exact on every platform.
  download.file(theURL, destfile = bzfile, mode = "wb")
}
# The archive path is handed to read.csv() as-is. REMARKS is kept as text and
# the two damage exponent columns as factors.
noaa <- read.csv(
  bzfile,
  colClasses = c(REMARKS = "character", PROPDMGEXP = "factor", CROPDMGEXP = "factor")
) # ,nrows=1000)
# Small sample for quick experiments; head() never pads with NA rows when the
# data has fewer than 100 records.
noaa100 <- head(noaa, 100)
# Exploratory totals of the raw PROPDMG figure grouped by exponent code, by
# event type, and by both.
codesum <- aggregate(noaa$PROPDMG, list(code = noaa$PROPDMGEXP), sum)
EVsum <- aggregate(noaa$PROPDMG, list(evtype = noaa$EVTYPE), sum)
EVcodesum <- aggregate(noaa$PROPDMG, list(evtype = noaa$EVTYPE, code = noaa$PROPDMGEXP), sum)
# create valid dates from the beginning date of each event
noaa100$date <- as.Date(noaa100$BGN_DATE, format = "%m/%d/%Y")
noaa$date <- as.Date(noaa$BGN_DATE, format = "%m/%d/%Y")
# create consistent property damage quantities in millions of dollars
# Map damage "EXP" codes to the power of ten that scales the damage figure.
#
# code: one or more exponent codes (character or factor). Only "K", "M" and
#       "B" (thousand, million, billion) are recognised.
# Returns an unnamed numeric vector with one exponent per element of `code`:
# 3, 6 or 9 for K, M, B and 0 for anything else (blank, NA or any other
# value), so a caller computing 10^exponent leaves such figures unscaled.
# The earlier fallback exponent of 1 multiplied those figures by ten, and an
# NA code aborted the `if` comparison.
dollar.factor <- function(code) {
  known <- c(K = 3, M = 6, B = 9)
  # Lookup by name; codes without an entry (including "" and NA) come back NA.
  exponent <- unname(known[as.character(code)])
  exponent[is.na(exponent)] <- 0
  exponent
}
# Derive damage in millions of dollars for the 100-row sample and for the full
# data. Columns are appended in a fixed order: property exponent, property
# dollars, crop exponent, crop dollars, combined total.

# -- sample --
noaa100[["prop.dollar.factor"]] <- sapply(noaa100[["PROPDMGEXP"]], dollar.factor)
noaa100[["prop.dollars"]] <- with(noaa100, PROPDMG * 10^prop.dollar.factor / 10^6)
noaa100[["crop.dollar.factor"]] <- sapply(noaa100[["CROPDMGEXP"]], dollar.factor)
noaa100[["crop.dollars"]] <- with(noaa100, CROPDMG * 10^crop.dollar.factor / 10^6)
noaa100[["m.of.dollars"]] <- with(noaa100, prop.dollars + crop.dollars)

# -- full data --
noaa[["prop.dollar.factor"]] <- sapply(noaa[["PROPDMGEXP"]], dollar.factor)
noaa[["prop.dollars"]] <- with(noaa, PROPDMG * 10^prop.dollar.factor / 10^6)
noaa[["crop.dollar.factor"]] <- sapply(noaa[["CROPDMGEXP"]], dollar.factor)
noaa[["crop.dollars"]] <- with(noaa, CROPDMG * 10^crop.dollar.factor / 10^6)
noaa[["m.of.dollars"]] <- with(noaa, prop.dollars + crop.dollars)
# Row number of the record carrying the 115 billion property figure. It was
# determined once with which.max() and is kept as a constant here.
# NOTE(review): the earlier comments spoke of restoring m.of.dollars to 115B
# at this point, but no statement here does that; confirm whether a restore
# step is still needed when re-running against cached data.
B115 <- 605953L
# Show the derived columns of that row before any adjustment. Columns are
# selected by name so the display stays correct if the column layout changes.
noaa[B115, c("date", "prop.dollar.factor", "prop.dollars",
             "crop.dollar.factor", "crop.dollars", "m.of.dollars")]
## date prop.dollar.factor prop.dollars crop.dollar.factor
## 605953 2006-01-01 9 115000 6
## crop.dollars m.of.dollars
## 605953 32.5 115032.5
# Row number of the record with the largest combined damage figure.
max.rownum <- which.max(noaa[["m.of.dollars"]])
print(max.rownum)
## [1] 605953
# Exclude the 115 billion record from the combined damage figures.
# The guard holds only while row B115 carries the 115000 (millions) property
# figure. isTRUE(all.equal()) tolerates floating-point noise and yields FALSE
# for a missing value, where a bare `==` inside if() would abort.
if (isTRUE(all.equal(noaa[B115, "prop.dollars"], 115000))) { # only adjust this one record
  NAPA115B <- noaa[B115, ] # save a copy of the record being adjusted
  noaa[B115, "m.of.dollars"] <- 0 # take it out of the combined totals
  # Report what was excluded. This lives inside the guard because NAPA115B is
  # only created when the adjustment is made.
  print(with(NAPA115B, {
    paste("Date: ", date, ". Property damage in millions:", prop.dollars, ". Remarks:", as.character(REMARKS))
  }))
} else {
  warning("Row ", B115, " does not hold the expected 115000 property figure; ",
          "no adjustment was made.", call. = FALSE)
}
## [1] "Date: 2006-01-01 . Property damage in millions: 115000 . Remarks: Major flooding continued into the early hours of January 1st, before the Napa River finally fell below flood stage and the water receeded. Flooding was severe in Downtown Napa from the Napa Creek and the City and Parks Department was hit with $6 million in damage alone. The City of Napa had 600 homes with moderate damage, 150 damaged businesses with costs of at least $70 million."
# Show the derived columns of row B115 again, after the adjustment. Columns
# are selected by name instead of by position 38:43, so the display does not
# depend on the column layout.
noaa[B115, c("date", "prop.dollar.factor", "prop.dollars",
             "crop.dollar.factor", "crop.dollars", "m.of.dollars")]
## date prop.dollar.factor prop.dollars crop.dollar.factor
## 605953 2006-01-01 9 115000 6
## crop.dollars m.of.dollars
## 605953 32.5 0
# Re-check which record now has the largest combined damage figure.
max.rownum <- which.max(noaa[["m.of.dollars"]])
print(max.rownum)
## [1] 577676
# Totals per event type for each impact measure. Each result holds the
# grouping column plus the summed value in column `x`.
fatalEVsum <- aggregate(x = noaa$FATALITIES, by = list(evtype = noaa$EVTYPE), FUN = sum)
injurEVsum <- aggregate(x = noaa$INJURIES, by = list(evtype = noaa$EVTYPE), FUN = sum)
dollarEVsum <- aggregate(x = noaa$m.of.dollars, by = list(code = noaa$EVTYPE), FUN = sum)

# Rows of `df` that hold the ten largest entries of `values`, largest first.
top.ten <- function(df, values) {
  df[order(values, decreasing = TRUE)[seq_len(10)], ]
}

# Ten largest single events per measure.
noaa.10big.injuries <- top.ten(noaa, noaa$INJURIES)
noaa.10big.fatalities <- top.ten(noaa, noaa$FATALITIES)
noaa.10big.m.of.dollars <- top.ten(noaa, noaa$m.of.dollars)

# Ten largest event types per measure, the grand total over all event types,
# and what remains outside the top ten.
noaa.10bigEV.injuries <- top.ten(injurEVsum, injurEVsum$x)
noaa.allEV.injuries <- sum(injurEVsum$x)
noaa.othEV.injuries <- noaa.allEV.injuries - sum(noaa.10bigEV.injuries$x)

noaa.10bigEV.fatalities <- top.ten(fatalEVsum, fatalEVsum$x)
noaa.allEV.fatalities <- sum(fatalEVsum$x)
noaa.othEV.fatalities <- noaa.allEV.fatalities - sum(noaa.10bigEV.fatalities$x)

noaa.10bigEV.m.of.dollars <- top.ten(dollarEVsum, dollarEVsum$x)
noaa.allEV.m.of.dollars <- sum(dollarEVsum$x)
noaa.othEV.m.of.dollars <- noaa.allEV.m.of.dollars - sum(noaa.10bigEV.m.of.dollars$x)
Turn cache off.
knitr::opts_chunk$set(cache = FALSE)
par(mfrow = c(2, 1)) # two panels per page

# ---- Plot1: fatalities of the ten largest single events ----
# Each bar label combines event type, date and fatality count.
p1.df <- with(noaa.10big.fatalities, data.frame(
  "Event.Type" = paste(as.character(EVTYPE), " Date: ", as.character(date),
                       " Fatalities: ", as.character(FATALITIES)),
  "Fatalities" = FATALITIES
))
par(cex.axis = 1, mar = c(4, 2, 2, 1))
with(p1.df, {
  bar.centres <- barplot(Fatalities,
                         horiz = TRUE, las = 1,
                         xlab = "Count for each item",
                         main = "Fatality Counts for Top Single Events")
  # The first label sits at a fixed x position; the rest go just past their bar.
  text(c(300, Fatalities[2:10] + 150), bar.centres, labels = Event.Type, cex = .6)
})

# ---- Plot3: fatalities of the ten largest event types ----
p3.df <- with(noaa.10bigEV.fatalities, data.frame(
  "Event.Type" = as.character(evtype),
  "Fatalities" = x
))
par(cex.axis = 1, mar = c(4, 2, 2, 1))
with(p3.df, {
  bar.centres <- barplot(Fatalities,
                         horiz = TRUE, las = 1,
                         xlab = "Total sum of all years 1950 to 2011",
                         main = "Fatality Counts for Top Event Types")
  text(c(3000, Fatalities[2:10] + 600), bar.centres, labels = Event.Type, cex = .7)
})
#par(mfrow = c(2, 1)) # put two on a page

# ---- Plot2: injuries of the ten largest single events ----
# Each bar label combines event type and date.
p2.df <- with(noaa.10big.injuries, data.frame(
  "Event.Type" = paste(as.character(EVTYPE), " Date: ", as.character(date)),
  "Injuries" = INJURIES
))
par(cex.axis = 1, mar = c(4, 2, 2, 1))
with(p2.df, {
  bar.centres <- barplot(Injuries,
                         horiz = TRUE, las = 1,
                         xlab = "Count for each item",
                         main = "Injury Counts for Top Single Events")
  # Every label is drawn at the same x position.
  text(400, bar.centres, labels = Event.Type, cex = .7)
})

# ---- Plot4: injuries of the ten largest event types ----
p4.df <- with(noaa.10bigEV.injuries, data.frame(
  "Event.Type" = as.character(evtype),
  "Injuries" = x
))
par(cex.axis = 1, mar = c(4, 2, 2, 1))
with(p4.df, {
  bar.centres <- barplot(Injuries,
                         horiz = TRUE, las = 1,
                         xlab = "Total sum of all years 1950 to 2011",
                         main = "Injury Counts for Top Event Types")
  text(c(25000, Injuries[2:10] + 11000), bar.centres, labels = Event.Type, cex = .7)
})
par(mfrow = c(2, 1)) # two panels per page

# ---- Plot5: economic impact of the ten largest single events ----
# m.of.dollars is in millions; dividing by 1000 plots billions.
p5.df <- with(noaa.10big.m.of.dollars, data.frame(
  "Event.Type" = paste(as.character(EVTYPE), " Date: ", as.character(date)),
  "Dollars" = m.of.dollars / 1000
))
par(cex.axis = 1, mar = c(4, 2, 2, 1))
with(p5.df, {
  bar.centres <- barplot(Dollars,
                         horiz = TRUE, las = 1,
                         xlab = "Billions of dollars for each item",
                         main = "Economic Impact of Top Individual Events")
  text(c(15, Dollars[2:10] + 10), bar.centres, labels = Event.Type, cex = .7)
})

# ---- Plot6: economic impact of the ten largest event types ----
p6.df <- with(noaa.10bigEV.m.of.dollars, data.frame(
  "Event.Type" = as.character(code),
  "Dollars" = x / 1000
))
par(cex.axis = 1, mar = c(4, 2, 2, 1))
with(p6.df, {
  bar.centres <- barplot(Dollars,
                         horiz = TRUE, las = 1,
                         xlab = "Billions of dollars - sum of all years 1950 to 2011",
                         main = "Economic Impact of Top Event Types")
  text(c(50, Dollars[2:10] + 6), bar.centres, labels = Event.Type, cex = .7)
})
# report is limited to three figures, so the table output below stays
# commented out; gridExtra is loaded for its grid.table() calls.
library(gridExtra)
# Titles below are corrected to match the data: p1.df and p3.df hold
# fatalities, p2.df and p4.df hold injuries.
#grid.table(p1.df,main="Fatality - Top 10 Single Events")
#grid.newpage()
#grid.table(p3.df,main="Fatality - Top 10 Event Types")
#grid.newpage()
#grid.table(p2.df,main="Injury - Top 10 Single Events")
#grid.newpage()
#grid.table(p4.df,main="Injury - Top 10 Event Types")
#grid.newpage()
#grid.table(p5.df,main="Economic Impact - Top 10 Single Events")
#grid.newpage()
#grid.table(p6.df,main="Economic Impact - Top 10 Event Types")