Synopsis

The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database tracks the characteristics of major storms and weather events in the United States. This study examines the health and economic impact of those events as measured by fatalities, injuries and the estimated dollar impact. The ten largest individual events and the ten largest event types were identified for each metric and plotted to aid in visually understanding what types of storms and events had the largest impacts.

Data Processing

# Download location of the compressed (.bz2) NOAA storm data CSV.
theURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

The NOAA database included data from 1950 to November 2011 and was downloaded from a link provided by the instructor (above).

Several data quality issues were noticed and addressed.

The following steps were used to load, clean, summarize, analyze and report the results.

During development, turn on R markdown caching so each step happens only once.

# Cache chunk results from here on so each processing step runs only once.
knitr::opts_chunk$set(cache = TRUE)
  1. Download the NOAA data and read it into a dataframe. Make field “REMARKS” character to speed things up.
# Switch to the author's project directory only when it exists; on any other
# machine stay in the current working directory so the report still runs.
project.dir <- "~/Desktop/Online-Classes/Johns Hopkins Data Science/Reproducable Research/RepData_PeerA2"
if (dir.exists(project.dir)) {
  setwd(project.dir)
}

#### download to temp space if necessary and read the file.
 #  temp <- tempfile()
 #  download.file(theURL,destfile=temp,method="curl")
 #  noaa <- read.csv(temp,colClasses=c(REMARKS="character",CROPDMGEXP="factor"),nrows=10000)
 #  unlink(temp)

#### download to working directory (only when the file is missing) and read it
bzfile <- "repdata-data-StormData.csv.bz2"
if (!file.exists(bzfile)) {
  # mode = "wb" writes the archive as binary so it is not altered on download.
  download.file(theURL, destfile = bzfile, mode = "wb")
}
# REMARKS is read as character; both damage exponent columns are read as
# factors. Add nrows = ... to read.csv() for a quicker partial load.
noaa <- read.csv(
  bzfile,
  colClasses = c(REMARKS = "character", PROPDMGEXP = "factor", CROPDMGEXP = "factor")
)
  1. Examine the data structures by creating a small NOAA dataframe containing 100 rows and making summary aggregations for the EXP codes and the Event Types.
# First 100 rows of the full data set, kept as a small sample for inspection.
noaa100 <- noaa[seq_len(100), ]

# Property damage (PROPDMG) summed by exponent code, by event type, and by
# the combination of event type and exponent code.
codesum <- aggregate(x = noaa$PROPDMG, by = list(code = noaa$PROPDMGEXP), FUN = sum)
EVsum <- aggregate(x = noaa$PROPDMG, by = list(evtype = noaa$EVTYPE), FUN = sum)
EVcodesum <- aggregate(
  x = noaa$PROPDMG,
  by = list(evtype = noaa$EVTYPE, code = noaa$PROPDMGEXP),
  FUN = sum
)
  1. Convert BGN_DATE to R date format, interpret CROPDMGEXP and PROPDMGEXP dollar factor codes and combine dollar amounts for crop and property damage into one field.
# Turn each event's BGN_DATE (month/day/year text) into an R Date, stored in a
# new `date` column on both the full data set and the 100-row sample.
noaa$date <- as.Date(noaa$BGN_DATE, format = "%m/%d/%Y")
noaa100$date <- as.Date(noaa100$BGN_DATE, format = "%m/%d/%Y")
# create consistent property damage quantities in millions of dollars
# Translate damage exponent codes into powers of ten.
#
# code:    character or factor vector of exponent codes (any length), e.g. the
#          values of PROPDMGEXP or CROPDMGEXP.
# returns: numeric vector the same length as `code`: 9 for "B", 6 for "M",
#          3 for "K" (upper or lower case) and 0 for anything else.
#
# The result is used as the exponent in `amount * 10^factor`, so blank,
# missing or unrecognized codes map to 0 and leave the amount unscaled.
dollar.factor <- function(code) {
  exponents <- c(K = 3, M = 6, B = 9)
  # Factors are reduced to their labels; lower-case codes such as "k" and "m"
  # are treated like their upper-case forms.
  key <- toupper(as.character(code))
  out <- unname(exponents[match(key, names(exponents))])
  out[is.na(out)] <- 0
  out
}

# For the 100-row sample: look up the power of ten for each exponent code,
# express property and crop damage in millions of dollars, and add the two.
# vapply() guarantees one numeric value per row, where sapply() would
# silently change its return type on unexpected input.
noaa100$prop.dollar.factor <- vapply(noaa100$PROPDMGEXP, dollar.factor, numeric(1))
noaa100$prop.dollars <- noaa100$PROPDMG * 10^noaa100$prop.dollar.factor / 10^6
noaa100$crop.dollar.factor <- vapply(noaa100$CROPDMGEXP, dollar.factor, numeric(1))
noaa100$crop.dollars <- noaa100$CROPDMG * 10^noaa100$crop.dollar.factor / 10^6
noaa100$m.of.dollars <- noaa100$prop.dollars + noaa100$crop.dollars

# The same derivation for the full data set.
noaa$prop.dollar.factor <- vapply(noaa$PROPDMGEXP, dollar.factor, numeric(1))
noaa$prop.dollars <- noaa$PROPDMG * 10^noaa$prop.dollar.factor / 10^6
noaa$crop.dollar.factor <- vapply(noaa$CROPDMGEXP, dollar.factor, numeric(1))
noaa$crop.dollars <- noaa$CROPDMG * 10^noaa$crop.dollar.factor / 10^6
noaa$m.of.dollars <- noaa$prop.dollars + noaa$crop.dollars
  1. Change the m.of.dollars value from $115,032.5 million to zero for the apparently duplicate event in NAPA. Print the remark that shows the actual damage was in the range of 100 million dollars.
# Row number of the record carrying the 115 billion dollar property damage
# figure; it was determined with the which.max function.
B115 <- 605953L
# Show the derived date and damage columns for that row before any adjustment.
# Columns are selected by name so the display does not depend on their position.
noaa[B115, c("date", "prop.dollar.factor", "prop.dollars",
             "crop.dollar.factor", "crop.dollars", "m.of.dollars")]
##              date prop.dollar.factor prop.dollars crop.dollar.factor
## 605953 2006-01-01                  9       115000                  6
##        crop.dollars m.of.dollars
## 605953         32.5     115032.5
# find the max m.of.dollar rownumber
max.rownum <- which.max(noaa$m.of.dollars)
max.rownum
## [1] 605953
# Keep a copy of the record so the report below can always be printed, even
# when the adjustment is not applied.
NAPA115B <- noaa[B115, ]
# Adjust only this one record: zero its m.of.dollars when its property damage
# is the 115000 (millions of dollars) figure. isTRUE(all.equal()) tolerates
# floating-point rounding and gives FALSE, not an error, when the value is NA.
if (isTRUE(all.equal(noaa[B115, "prop.dollars"], 115000))) {
  noaa[B115, "m.of.dollars"] <- 0
}
# Print the date, recorded property damage and free-text remarks of the record.
with(NAPA115B, {
  paste("Date: ", date, ".  Property damage in millions:", prop.dollars, ".  Remarks:", as.character(REMARKS))
})
## [1] "Date:  2006-01-01 .  Property damage in millions: 115000 .  Remarks: Major flooding continued into the early hours of January 1st, before the Napa River finally fell below flood stage and the water receeded. Flooding was severe in Downtown Napa from the Napa Creek and the City and Parks Department was hit with $6 million in damage alone. The City of Napa had 600 homes with moderate damage, 150 damaged businesses with costs of at least $70 million."
# Show the derived date and damage columns of row B115 again, selected by name
# so the display does not depend on column position.
noaa[B115, c("date", "prop.dollar.factor", "prop.dollars",
             "crop.dollar.factor", "crop.dollars", "m.of.dollars")]
##              date prop.dollar.factor prop.dollars crop.dollar.factor
## 605953 2006-01-01                  9       115000                  6
##        crop.dollars m.of.dollars
## 605953         32.5            0
# Recompute which row now holds the largest m.of.dollars value.
max.rownum <- which.max(noaa$m.of.dollars)
max.rownum
## [1] 577676
  1. Summarize the data into Event Types and sort to reveal the top 10 Event Types for fatalities, injuries and damages (in dollars). Also find the 10 biggest individual events for the entire time period, in terms of fatalities, injuries and damages.
# Totals per event type (EVTYPE): fatalities, injuries, and combined damage in
# millions of dollars. The grouping column is named `evtype` in the first two
# summaries and `code` in the dollar summary; each total is in column `x`.
fatalEVsum <- aggregate(x = noaa$FATALITIES, by = list(evtype = noaa$EVTYPE), FUN = sum)
injurEVsum <- aggregate(x = noaa$INJURIES, by = list(evtype = noaa$EVTYPE), FUN = sum)
dollarEVsum <- aggregate(x = noaa$m.of.dollars, by = list(code = noaa$EVTYPE), FUN = sum)

# Ten largest individual events by injuries, by fatalities and by combined
# damage. `x1` holds the row positions of the current top ten and is reused.
x1 <- order(noaa$INJURIES, decreasing = TRUE)[seq_len(10)]
noaa.10big.injuries <- noaa[x1, ]

x1 <- order(noaa$FATALITIES, decreasing = TRUE)[seq_len(10)]
noaa.10big.fatalities <- noaa[x1, ]

x1 <- order(noaa$m.of.dollars, decreasing = TRUE)[seq_len(10)]
noaa.10big.m.of.dollars <- noaa[x1, ]

# For each measure: the ten event types with the largest totals, the grand
# total over all event types, and the remainder left for all other types.

# Injuries
x1 <- order(injurEVsum[["x"]], decreasing = TRUE)[seq_len(10)]
noaa.10bigEV.injuries <- injurEVsum[x1, ]
noaa.allEV.injuries <- sum(injurEVsum[["x"]])
noaa.othEV.injuries <- noaa.allEV.injuries - sum(noaa.10bigEV.injuries[["x"]])

# Fatalities
x1 <- order(fatalEVsum[["x"]], decreasing = TRUE)[seq_len(10)]
noaa.10bigEV.fatalities <- fatalEVsum[x1, ]
noaa.allEV.fatalities <- sum(fatalEVsum[["x"]])
noaa.othEV.fatalities <- noaa.allEV.fatalities - sum(noaa.10bigEV.fatalities[["x"]])

# Damage in millions of dollars
x1 <- order(dollarEVsum[["x"]], decreasing = TRUE)[seq_len(10)]
noaa.10bigEV.m.of.dollars <- dollarEVsum[x1, ]
noaa.allEV.m.of.dollars <- sum(dollarEVsum[["x"]])
noaa.othEV.m.of.dollars <- noaa.allEV.m.of.dollars - sum(noaa.10bigEV.m.of.dollars[["x"]])

Turn cache off.

# Stop caching: the chunks that follow are evaluated on every run.
knitr::opts_chunk$set(cache = FALSE)

RESULTS

The plots below provide a clear understanding of the events and types of events that impact human health and the economy.

  1. Plot the 10 biggest single events and event types in terms of fatalities and then injuries.
par(mfrow = c(2, 1)) # put two on a page
####### Plot1
# One row per top-ten fatality event: a label (event type, date, fatality
# count) and the fatality count itself. The result of with() is assigned
# directly rather than with `<<-` from inside the expression.
p1.df <- with(
  noaa.10big.fatalities,
  data.frame(
    Event.Type = paste(as.character(EVTYPE), " Date: ", as.character(date),
                       " Fatalities: ", as.character(FATALITIES)),
    Fatalities = FATALITIES
  )
)

par(cex.axis = 1, mar = c(4, 2, 2, 1))

with(p1.df, {
  # barplot() returns the bar midpoints, reused as the label positions.
  midpoints <- barplot(Fatalities,
                       xlab = "Count for each item", horiz = TRUE, las = 1,
                       main = "Fatality Counts for Top Single Events")
  # First bar is labelled at x = 300; the others 150 past the end of the bar.
  text(c(300, Fatalities[2:10] + 150), midpoints, labels = Event.Type, cex = .6)
})


###### Plot3
# One row per top-ten event type: its name and its total fatalities.
p3.df <- data.frame(
  Event.Type = as.character(noaa.10bigEV.fatalities$evtype),
  Fatalities = noaa.10bigEV.fatalities$x
)

par(cex.axis = 1, mar = c(4, 2, 2, 1))

with(p3.df, {
  # barplot() returns the bar midpoints, reused as the label positions.
  # horiz = TRUE is spelled out because `T` is an ordinary, reassignable name.
  midpoints <- barplot(Fatalities,
                       xlab = "Total sum of all years 1950 to 2011", horiz = TRUE, las = 1,
                       main = "Fatality Counts for Top Event Types")
  # First bar is labelled at x = 3000; the others 600 past the end of the bar.
  text(c(3000, Fatalities[2:10] + 600), midpoints, labels = Event.Type, cex = .7)
})

#par(mfrow = c(2, 1)) # put two on a page

## Plot2
# One row per top-ten injury event: a label (event type and date) and the
# injury count. The result of with() is assigned directly rather than with
# `<<-` from inside the expression.
p2.df <- with(
  noaa.10big.injuries,
  data.frame(
    Event.Type = paste(as.character(EVTYPE), " Date: ", as.character(date)),
    Injuries = INJURIES
  )
)

par(cex.axis = 1, mar = c(4, 2, 2, 1))

with(p2.df, {
  # barplot() returns the bar midpoints, reused as the label positions.
  midpoints <- barplot(Injuries,
                       xlab = "Count for each item", horiz = TRUE, las = 1,
                       main = "Injury Counts for Top Single Events")
  # Every label is drawn at the same x position (400).
  text(c(400), midpoints, labels = Event.Type, cex = .7)
})



## Plot4
# One row per top-ten event type: its name and its total injuries.
# Ordinary assignment replaces the top-level `<<-`.
p4.df <- data.frame(
  Event.Type = as.character(noaa.10bigEV.injuries$evtype),
  Injuries = noaa.10bigEV.injuries$x
)

par(cex.axis = 1, mar = c(4, 2, 2, 1))
with(p4.df, {
  # barplot() returns the bar midpoints, reused as the label positions.
  midpoints <- barplot(Injuries,
                       xlab = "Total sum of all years 1950 to 2011", horiz = TRUE, las = 1,
                       main = "Injury Counts for Top Event Types")
  # First bar is labelled at x = 25000; the others 11000 past the end of the bar.
  text(c(25000, Injuries[2:10] + 11000), midpoints, labels = Event.Type, cex = .7)
})

The charts above show the health impact in numbers of fatalities and injuries for the top 10 events and event types over the period from 1950 until November, 2011. Tornadoes have a far greater impact on human fatalities and injuries than any other event type.

  • The top fatality event was the Chicago Heat Wave of 1995 link
  • The top fatality event type was TORNADO
  • The top injury event was a tornado on “Terrible Tuesday”: April 10, 1979 in Wichita Falls, Texas link
  • The top injury event type was TORNADO
  1. Plot the 10 biggest events and event types in terms of billions of dollars.
par(mfrow = c(2, 1)) # put two on a page
## Plot5 
# One row per top-ten damage event: a label (event type and date) and the
# damage converted from millions to billions of dollars. The result of with()
# is assigned directly rather than with `<<-` from inside the expression.
p5.df <- with(
  noaa.10big.m.of.dollars,
  data.frame(
    Event.Type = paste(as.character(EVTYPE), "   Date: ", as.character(date)),
    Dollars = m.of.dollars / 1000
  )
)

par(cex.axis = 1, mar = c(4, 2, 2, 1))

with(p5.df, {
  # barplot() returns the bar midpoints, reused as the label positions.
  midpoints <- barplot(Dollars,
                       xlab = "Billions of dollars for each item", horiz = TRUE, las = 1,
                       main = "Economic Impact of Top Individual Events")
  # First bar is labelled at x = 15; the others 10 past the end of the bar.
  text(c(15, Dollars[2:10] + 10), midpoints, labels = Event.Type, cex = .7)
})


## Plot6
# One row per top-ten event type: its name (held in the `code` column of the
# dollar summary) and its total damage converted from millions to billions.
p6.df <- data.frame(
  Event.Type = as.character(noaa.10bigEV.m.of.dollars$code),
  Dollars = noaa.10bigEV.m.of.dollars$x / 1000
)
par(cex.axis = 1, mar = c(4, 2, 2, 1))

with(p6.df, {
  # barplot() returns the bar midpoints, reused as the label positions.
  # horiz = TRUE is spelled out because `T` is an ordinary, reassignable name.
  midpoints <- barplot(Dollars,
                       xlab = "Billions of dollars - sum of all years 1950 to 2011", horiz = TRUE, las = 1,
                       main = "Economic Impact of Top Event Types")
  # First bar is labelled at x = 50; the others 6 past the end of the bar.
  text(c(50, Dollars[2:10] + 6), midpoints, labels = Event.Type, cex = .7)
})

The plots above show the economic impact of the top 10 events and event types over the period from 1950 through November, 2011.

  • The top economic impact event was the Hurricane Katrina Storm Surge in August 2005 link
  • The top economic impact event type was HURRICANE/TYPHOON
# report is limited to three figures
# The table calls below are therefore left commented out; gridExtra is
# presumably loaded only so they can be re-enabled (TODO confirm, and drop the
# library() call if the tables are not needed).
# p1.df and p3.df hold fatality data, p2.df and p4.df hold injury data; the
# titles below are labelled to match.
library(gridExtra)
#grid.table(p1.df,main="Fatality - Top 10 Single Events")
#grid.newpage()
#grid.table(p3.df,main="Fatality - Top 10 Event Types")
#grid.newpage()
#grid.table(p2.df,main="Injury - Top 10 Single Events")
#grid.newpage()
#grid.table(p4.df,main="Injury - Top 10 Event Types")
#grid.newpage()
#grid.table(p5.df,main="Economic Impact - Top 10 Single Events")
#grid.newpage()
#grid.table(p6.df,main="Economic Impact - Top 10 Event Types")