Analysis of fatalities, injuries and damages due to severe weather events

Synopsis

Catastrophic weather and environmental disasters have substantial effects on human health and on the economy. We study the impact of these events through an exploratory data analysis.

To provide some insight into the effects of severe weather on the public, we use the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm data covering 1950 through 2011. This kind of analysis helps in planning for severe weather events and in preparing contingency plans.

You can download the Storm Data, Storm Data Documentation and FAQ for your own reference and to understand the analysis better.

In the earlier years, there are fewer events (as indicated in the EVTYPE variable) recorded, most likely due to a lack of availability of records. More recent years should be considered more complete. We found that 1992 is the cut-off year: before it, TORNADO is the only event type responsible for fatalities, injuries and property damage in the filtered data.

Points to Analyze

Across the United States, which types of events are most harmful with respect to population health
Across the United States, which types of events have the greatest economic consequences

The above points are analyzed in R and the report is generated through a markdown file.

Data Processing

As mentioned before, we are considering the U.S. National Oceanic and Atmospheric Administration's (NOAA) Storm Data for our analysis

Loading the Raw Data into R

# Fetch the compressed NOAA storm data (only if it is not already in the
# working directory) and load it into `stormdata`.
storm.archive <- "./repdata_data_StormData.bz2"
storm.url     <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

if (!file.exists(storm.archive)) {
  # mode = "wb": the archive is binary, and a text-mode transfer can corrupt
  # it on Windows.
  download.file(storm.url, storm.archive, mode = "wb")
}

# Text columns are kept as character (not factor) so that EVTYPE and the
# damage exponent codes can be cleaned and compared as plain strings.
stormdata <- read.csv(bzfile(storm.archive), stringsAsFactors = FALSE)

Creating Tidy Data for Analysis

Reducing the data set to the necessary columns: “BGN_DATE”, “EVTYPE”, “FATALITIES”, “INJURIES”, “PROPDMG”, “PROPDMGEXP”, “CROPDMG”, “CROPDMGEXP”

# Columns used by the analysis: event date and type, casualty counts, and the
# property/crop damage figures together with their exponent codes.
columns.reqd <- c(
  "BGN_DATE", "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
stormdata.reqd <- stormdata[, columns.reqd]

# BGN_DATE is parsed as "month/day/year hour:minute:second"; only the calendar
# date is kept, in the new BGN_DATE2 column.
stormdata.reqd[["BGN_DATE2"]] <- as.Date(
  strptime(
    as.character(stormdata.reqd[["BGN_DATE"]]),
    format = "%m/%d/%Y %H:%M:%S"
  )
)

Let us check the total # of records and distinct catastrophic weather events of the original data

# Size of the untouched subset: the number of records first, then the number of
# distinct event labels.
events <- unique(stormdata.reqd[["EVTYPE"]])
nrow(stormdata.reqd)
length(events)

## [1] 902297

## [1] 985

The event labels (EVTYPE column) contain many typos, near-duplicate wordings, mixed upper/lower case and leading/trailing spaces, so we correct them as far as possible

require(stringr)

## Loading required package: stringr

# Normalise the event labels: upper-case them, trim surrounding whitespace,
# then apply the substitutions below one after another.
#
# The rules are ORDER-SENSITIVE: each one sees the output of the previous one
# (for example "FLOODING/" becomes "FLOOD/" and then "FLOOD").
#
# Fix relative to the earlier version of this chunk: the short pattern
# "THUNDERSTORMW" used to run before "THUNDERSTORMWIND". It therefore rewrote
# "THUNDERSTORMWIND" into "THUNDERSTORM WINDIND" and "THUNDERSTORMW WIND" into
# "THUNDERSTORM WIND WIND". The longer patterns are now applied first.
evtype.rules <- matrix(
  c(
    # --- flood variants ---
    "FLOODING",              "FLOOD",
    "FLOODS",                "FLOOD",
    "FLOOD/",                "FLOOD",
    "FLOOD/FLOOD",           "FLOOD",
    "FLOODFLOOD",            "FLOOD",
    "FLOODFLASH FLOOD",      "FLOOD FLASH",
    "FLOODFLASH",            "FLOOD FLASH",
    "FLOODFLASH/FLOOD",      "FLOOD FLASH",
    "FLOODFLASHFLOOD",       "FLOOD FLASH",
    "COASTAL  FLOODEROSION", "COASTAL FLOOD EROSION",
    "COASTAL FLOODEROSION",  "COASTAL FLOOD EROSION",
    # --- plurals and run-together words ---
    "WINDS",                 "WIND",
    "STORMS",                "STORM",
    "FIRES",                 "FIRE",
    "WINDCHILL",             "WIND CHILL",
    # --- thunderstorm misspellings ---
    "THUDERSTORM",           "THUNDERSTORM",
    "THUNDERESTORM",         "THUNDERSTORM",
    "THUNDERTORM",           "THUNDERSTORM",
    "THUNDERSTROM",          "THUNDERSTORM",
    "THUNDEERSTORM",         "THUNDERSTORM",
    "THUNDERSTORMS",         "THUNDERSTORM",
    "THUNDERSTORM  WIND",    "THUNDERSTORM WIND",
    "THUNDERSTORM WINDS",    "THUNDERSTORM WIND",
    "THUNDERSTORM WINS",     "THUNDERSTORM WIND",
    # Longer "THUNDERSTORMW..." forms must precede the bare "THUNDERSTORMW".
    "THUNDERSTORMWIND",      "THUNDERSTORM WIND",
    "THUNDERSTORMW WIND",    "THUNDERSTORM WIND",
    "THUNDERSTORMW",         "THUNDERSTORM WIND",
    "THUNERSTORM WIND",      "THUNDERSTORM WIND",
    "TUNDERSTORM WIND",      "THUNDERSTORM WIND"
  ),
  ncol = 2, byrow = TRUE,
  dimnames = list(NULL, c("pattern", "replacement"))
)

evtype.clean <- str_trim(toupper(stormdata.reqd$EVTYPE))
for (rule in seq_len(nrow(evtype.rules))) {
  evtype.clean <- gsub(
    evtype.rules[rule, "pattern"],
    evtype.rules[rule, "replacement"],
    evtype.clean
  )
}
stormdata.reqd$EVTYPE <- evtype.clean

Here, we remove the records that do not carry the information we are interested in

Filtering out data that did not cause fatalities, injuries or damage.
Removing the records that have unusable exponent values (“”, “-”, “?” or “+”) for both property damage and crop damage.
Removing the records having EVTYPE marked as “?”

# Step 1: keep only events that actually caused harm, i.e. at least one
# fatality or injury, or a non-zero property or crop damage figure.
# The exponent columns (PROPDMGEXP / CROPDMGEXP) are character codes and are
# deliberately not tested here. Comparing them with 0 coerces 0 to "0" and
# performs a string comparison whose result depends on the locale's collation;
# it kept rows that merely carry an exponent code but record no harm.
# NOTE(review): rows whose four measures are all zero are now dropped, so
# record and event counts printed by later steps will be lower than with the
# previous filter. Per-event totals are unaffected as long as no measure is
# negative.
has.impact <- with(
  stormdata.reqd,
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
)
stormdata.filtered1 <- stormdata.reqd[has.impact, ]

# Step 2: drop rows where BOTH damage exponents are unusable.
# %in% never yields NA, so a missing code cannot inject NA rows into the subset.
unusable.exp <- c("", "-", "?", "+")
records.rm.set1 <- stormdata.filtered1$PROPDMGEXP %in% unusable.exp &
                   stormdata.filtered1$CROPDMGEXP %in% unusable.exp
stormdata.filtered2 <- stormdata.filtered1[!records.rm.set1, ]

# Step 3: drop rows whose event type is the placeholder "?".
records.rm.set2 <- stormdata.filtered2$EVTYPE == "?"
stormdata.filtered3 <- stormdata.filtered2[!records.rm.set2, ]

We noticed that 01/01/1992 is a cut-off date: in the filtered data above, TORNADO is the only event type recorded on or before it

# Cut-off date used to split the filtered records into "early" and "recent".
cutoff <- as.Date("01/01/1992", format = "%m/%d/%Y")

# Early records: dated on or before the cut-off.
on.or.before.cutoff <- stormdata.filtered3$BGN_DATE2 <= cutoff
stormdata.filtered4 <- stormdata.filtered3[on.or.before.cutoff, ]

# Distinct (upper-cased) event types among the early records, printed for
# inspection.
events <- unique(toupper(stormdata.filtered4$EVTYPE))
print(events)

## [1] "TORNADO"

We are ignoring the records before 1992 to make our analysis more meaningful for advanced planning against the severe weather events. This is our tidy data for analysis

# Tidy data set for the analysis: records dated strictly after the cut-off.
after.cutoff <- stormdata.filtered3$BGN_DATE2 > cutoff
stormdata.filtered92 <- stormdata.filtered3[after.cutoff, ]

Let us check the total # of records and distinct catastrophic weather events of this tidy data

# Size of the tidy data set: the number of records first, then the number of
# distinct (upper-cased) event types.
tidy.evtype <- toupper(stormdata.filtered92$EVTYPE)
events <- unique(tidy.evtype)
nrow(stormdata.filtered92)
length(events)

## [1] 407297

## [1] 350

We are preparing dataset with the relevant information for the analysis performing the following

PROPDMGEXP and CROPDMGEXP are modified in order to obtain the real values
The required values for different events have been calculated
The new data are converted to data frame

# Container that the per-event summary is built into further down.
eventdata <- NULL

# Translate a vector of damage exponent codes into numeric multipliers:
# K/k = 1e3, M/m = 1e6, B/b = 1e9; every other code (including "") counts as 1.
# This is a single vectorised lookup rather than one switch() call per record.
# It also returns a plain numeric vector for zero-length input.
exp.to.multiplier <- function(exp.code) {
  known <- c(K = 1e3, k = 1e3, M = 1e6, m = 1e6, B = 1e9, b = 1e9)
  multiplier <- unname(known[as.character(exp.code)])
  multiplier[is.na(multiplier)] <- 1  # code not in the table -> multiplier 1
  multiplier
}

# Damage in dollars = reported figure * multiplier implied by its exponent code.
stormdata.filtered92$totalPropDamage <-
  stormdata.filtered92$PROPDMG * exp.to.multiplier(stormdata.filtered92$PROPDMGEXP)
stormdata.filtered92$totalCropDamage <-
  stormdata.filtered92$CROPDMG * exp.to.multiplier(stormdata.filtered92$CROPDMGEXP)

# Per-event totals of fatalities, injuries, property damage and crop damage.
#
# rowsum() adds up all four measures in one pass over the data, instead of
# rescanning the whole table four times per event type. reorder = FALSE keeps
# the event types in order of first appearance, which is the order unique()
# gives, so row numbers match an index-based loop over unique(EVTYPE).
# The grouping key is EVTYPE itself; it was upper-cased during cleanup, so it
# equals unique(toupper(EVTYPE)).
event.totals <- rowsum(
  cbind(
    fatalities = stormdata.filtered92$FATALITIES,
    injuries   = stormdata.filtered92$INJURIES,
    pDMG       = stormdata.filtered92$totalPropDamage,
    cDMG       = stormdata.filtered92$totalCropDamage
  ),
  group   = stormdata.filtered92$EVTYPE,
  reorder = FALSE
)

# Event names come from the row names of the summary; they are then cleared so
# the data frame gets plain 1..k row numbers.
events <- rownames(event.totals)
rownames(event.totals) <- NULL

# Columns, in order: event, fatalities, injuries, pDMG, cDMG.
eventdata <- data.frame(event = events, event.totals)

Now, the top 10 events that cause strongest impact have been extracted

# Rank event types by combined casualties and by combined damage, largest first.
health.rank  <- order(eventdata$fatalities + eventdata$injuries, decreasing = TRUE)
finance.rank <- order(eventdata$pDMG + eventdata$cDMG, decreasing = TRUE)

# Keep at most the ten highest-ranked event types in each view.
# head() is used rather than [1:10] so that a data set with fewer than ten
# event types does not produce rows of NA.
eventdata.healthSubset <-
  eventdata[head(health.rank, 10), c("event", "fatalities", "injuries")]
eventdata.financeSubset <-
  eventdata[head(finance.rank, 10), c("event", "pDMG", "cDMG")]

# Preview the five most harmful event types for population health.
head(eventdata.healthSubset, 5)

##                event fatalities injuries
## 1            TORNADO       1618    23741
## 22             FLOOD        408     6748
## 43         TSTM WIND        169     2742
## 4  THUNDERSTORM WIND        177     2201
## 11       FLASH FLOOD        752     1559

# Preview the five costliest event types.
head(eventdata.financeSubset, n = 5)

##                 event      pDMG      cDMG
## 22              FLOOD 1.448e+11 5.671e+09
## 330 HURRICANE/TYPHOON 6.931e+10 2.608e+09
## 69        STORM SURGE 4.332e+10 5.000e+03
## 1             TORNADO 2.775e+10 4.150e+08
## 10               HAIL 1.573e+10 3.026e+09

Results

Severe weather events most harmful to population health

library(ggplot2)
library(plyr)
library(reshape2)

# Long format: one row per (event, measure) pair, so fatalities and injuries
# can be drawn as differently filled segments of the same bar.
toPlot <- melt(eventdata.healthSubset, id.vars = 1)

# Bar chart of the selected events; x-axis labels are rotated to stay readable.
ggplot(toPlot, aes(x = reorder(event, -value), y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10 harmful catastrophic events",
    y = "Total Fatalities and Injuries",
    x = "Event"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot of chunk unnamed-chunk-11

Severe weather events having the greatest economic consequences

# Long format: one row per (event, damage type) pair, so property and crop
# damage can be drawn as differently filled segments of the same bar.
toPlot <- melt(eventdata.financeSubset, id.vars = 1)

# Bar chart of the selected events; x-axis labels are small and rotated to
# stay readable.
ggplot(toPlot, aes(x = reorder(event, -value), y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10  economical damageable catastrophic events",
    y = "Damage ($)",
    x = "Event"
  ) +
  theme(axis.text.x = element_text(size = 8, angle = 90, hjust = 1))

plot of chunk unnamed-chunk-12