Catastrophic weather events and environmental disasters have significant effects on human health and finances. We study the impact of these events through an exploratory data analysis.
To provide some insight into the effects of severe weather on the public, we use the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm data from 1950 through 2011. This kind of analysis helps in planning for severe weather events and in preparing contingency plans.
You can download the Storm Data, Storm Data Documentation and FAQ for your own reference and to understand the analysis better.
In the earlier years, fewer events (as indicated in the EVTYPE variable) are recorded, most likely due to a lack of available records. More recent years should be considered more complete. We found that 1992 is the cut-off year: before it, TORNADO is the only event type responsible for the recorded fatalities, injuries and property damage.
Across the United States, which types of events are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
The above points are analyzed in R and the report is generated through a markdown file.
As mentioned before, we are considering the U.S. National Oceanic and Atmospheric Administration's (NOAA) Storm Data for our analysis
# Download the compressed NOAA storm data once, then load it.
storm.archive <- "./repdata_data_StormData.bz2"
if (!file.exists(storm.archive)) {
  # mode = "wb": the archive is a binary file, so request a binary transfer
  # explicitly instead of relying on the platform default.
  download.file(
    "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    storm.archive,
    mode = "wb"
  )
}
# The bzfile() connection lets read.csv() read the compressed file directly.
stormdata <- read.csv(bzfile(storm.archive), stringsAsFactors = FALSE)
Reducing the data set to the necessary columns: “BGN_DATE”, “EVTYPE”, “FATALITIES”, “INJURIES”, “PROPDMG”, “PROPDMGEXP”, “CROPDMG”, “CROPDMGEXP”
# Keep only the columns the analysis uses.
columns.reqd <- c(
  "BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
stormdata.reqd <- stormdata[, columns.reqd]

# Parse BGN_DATE as month/day/year hour:minute:second and keep the calendar
# date in a separate column, BGN_DATE2.
bgn.timestamp <- strptime(
  as.character(stormdata.reqd$BGN_DATE),
  format = "%m/%d/%Y %H:%M:%S"
)
stormdata.reqd$BGN_DATE2 <- as.Date(bgn.timestamp)
Let us check the total # of records and distinct catastrophic weather events of the original data
# Number of records and number of distinct raw event labels.
events <- unique(stormdata.reqd$EVTYPE)
nrow(stormdata.reqd)
length(events)
## [1] 902297
## [1] 985
The event labels (EVTYPE column) contain many typos, near-duplicate wordings, mixed upper/lower case and leading/trailing spaces, so we correct them as far as possible.
library(stringr)

# Normalise the EVTYPE labels: upper-case and trim them, then apply the literal
# substitutions below one after another.
stormdata.reqd$EVTYPE <- str_trim(toupper(stormdata.reqd$EVTYPE))

# Order matters: each rule sees the output of the rules before it, so a longer
# pattern must come before any shorter pattern that is a prefix of it.
# (If "THUNDERSTORMW" ran before "THUNDERSTORMWIND", the latter would already
# have been turned into "THUNDERSTORM WINDIND".)
# Rules that cannot change anything are not listed: "FLOOD/FLOOD" is covered by
# "FLOOD/" followed by "FLOODFLOOD", and "THUNDERSTORMS" / "THUNDERSTORM WINDS"
# are covered by the generic "STORMS" / "WINDS" rules.
evtype.rules <- matrix(
  c(
    "FLOODING",             "FLOOD",
    "FLOODS",               "FLOOD",
    "FLOOD/",               "FLOOD",
    "FLOODFLOOD",           "FLOOD",
    "FLOODFLASH/FLOOD",     "FLOOD FLASH",
    "FLOODFLASHFLOOD",      "FLOOD FLASH",
    "FLOODFLASH FLOOD",     "FLOOD FLASH",
    "FLOODFLASH",           "FLOOD FLASH",
    "COASTAL FLOODEROSION", "COASTAL FLOOD EROSION",
    "WINDS",                "WIND",
    "STORMS",               "STORM",
    "FIRES",                "FIRE",
    "WINDCHILL",            "WIND CHILL",
    "THUDERSTORM",          "THUNDERSTORM",
    "THUNDERESTORM",        "THUNDERSTORM",
    "THUNDERTORM",          "THUNDERSTORM",
    "THUNDERSTROM",         "THUNDERSTORM",
    "THUNDEERSTORM",        "THUNDERSTORM",
    "THUNDERSTORM WINS",    "THUNDERSTORM WIND",
    "THUNDERSTORMWIND",     "THUNDERSTORM WIND",
    "THUNDERSTORMW",        "THUNDERSTORM WIND",
    "THUNERSTORM WIND",     "THUNDERSTORM WIND",
    "TUNDERSTORM WIND",     "THUNDERSTORM WIND"
  ),
  ncol = 2, byrow = TRUE,
  dimnames = list(NULL, c("pattern", "replacement"))
)

for (i in seq_len(nrow(evtype.rules))) {
  # fixed = TRUE: every pattern is a plain string, not a regular expression.
  stormdata.reqd$EVTYPE <- gsub(
    evtype.rules[i, "pattern"],
    evtype.rules[i, "replacement"],
    stormdata.reqd$EVTYPE,
    fixed = TRUE
  )
}
Here, we remove the records that do not contain the information we are interested in.
# Step 1: keep rows where any casualty count, damage amount or damage exponent
# is greater than 0.
# NOTE(review): PROPDMGEXP / CROPDMGEXP are compared with 0 exactly as before;
# if these columns are character this is a string comparison -- confirm intent.
has.any.value <- with(
  stormdata.reqd,
  FATALITIES > 0 | INJURIES > 0 |
    PROPDMG > 0 | PROPDMGEXP > 0 |
    CROPDMG > 0 | CROPDMGEXP > 0
)
stormdata.filtered1 <- stormdata.reqd[has.any.value, ]

# Step 2: drop rows whose property AND crop exponents are both one of
# "", "-", "?" or "+".
# NOTE(review): this test does not look at FATALITIES / INJURIES, so rows with
# casualties but no such exponent are dropped as well -- confirm intent.
is.blank.or.symbol <- function(exp) {
  exp == "" | exp == "-" | exp == "?" | exp == "+"
}
records.rm.set1 <- is.blank.or.symbol(stormdata.filtered1$PROPDMGEXP) &
  is.blank.or.symbol(stormdata.filtered1$CROPDMGEXP)
stormdata.filtered2 <- stormdata.filtered1[!records.rm.set1, ]

# Step 3: drop rows whose event label is just "?".
records.rm.set2 <- stormdata.filtered2$EVTYPE == "?"
stormdata.filtered3 <- stormdata.filtered2[!records.rm.set2, ]
We noticed that 01/01/1992 is the cut-off date: on or before it, TORNADO is the only event type present in the filtered data above.
# Event types present in the rows dated on or before the cut-off.
cutoff <- as.Date("01/01/1992", format = "%m/%d/%Y")
on.or.before.cutoff <- stormdata.filtered3$BGN_DATE2 <= cutoff
stormdata.filtered4 <- stormdata.filtered3[on.or.before.cutoff, ]
events <- unique(toupper(stormdata.filtered4$EVTYPE))
events
## [1] "TORNADO"
We are ignoring the records before 1992 to make our analysis more meaningful for advanced planning against the severe weather events. This is our tidy data for analysis
# Keep only the rows dated strictly after the cut-off.
after.cutoff <- stormdata.filtered3$BGN_DATE2 > cutoff
stormdata.filtered92 <- stormdata.filtered3[after.cutoff, ]
Let us check the total # of records and distinct catastrophic weather events of this tidy data
# Number of records and number of distinct event labels in the tidy data.
events <- unique(toupper(stormdata.filtered92$EVTYPE))
nrow(stormdata.filtered92)
length(events)
## [1] 407297
## [1] 350
We prepare a data set with the relevant information for the analysis by performing the following steps:
# Scale the damage amounts by their exponent code, then total fatalities,
# injuries, property damage and crop damage for every label in `events`.

# Multiplier for each recognised exponent code; any other code counts as 1.
exp.multiplier <- c(
  K = 1000, k = 1000,
  M = 1e+06, m = 1e+06,
  B = 1e+09, b = 1e+09
)
damage.multiplier <- function(exp) {
  # Exact-name lookup over the whole column at once; unmatched codes
  # (including "") come back as NA and fall through to 1.
  mult <- unname(exp.multiplier[as.character(exp)])
  mult[is.na(mult)] <- 1
  mult
}

stormdata.filtered92$totalPropDamage <-
  stormdata.filtered92$PROPDMG *
  damage.multiplier(stormdata.filtered92$PROPDMGEXP)
stormdata.filtered92$totalCropDamage <-
  stormdata.filtered92$CROPDMG *
  damage.multiplier(stormdata.filtered92$CROPDMGEXP)

# Group the rows once by event label. Using `events` as the factor levels keeps
# the result in the same order as `events`; a label with no rows sums to 0.
event.group <- factor(stormdata.filtered92$EVTYPE, levels = events)
total.by.event <- function(values) {
  # unname() so data.frame() below keeps the default 1..n row names.
  unname(vapply(split(values, event.group), sum, numeric(1)))
}

eventdata <- data.frame(
  event = events,
  fatalities = total.by.event(stormdata.filtered92$FATALITIES),
  injuries = total.by.event(stormdata.filtered92$INJURIES),
  pDMG = total.by.event(stormdata.filtered92$totalPropDamage),
  cDMG = total.by.event(stormdata.filtered92$totalCropDamage)
)
Next, we extract the top 10 events with the strongest impact.
# Rank the events by combined casualties and by combined damage, then keep the
# ten highest of each.
health.rank <- order(eventdata$fatalities + eventdata$injuries, decreasing = TRUE)
finance.rank <- order(eventdata$pDMG + eventdata$cDMG, decreasing = TRUE)

# head(rank, 10) returns at most 10 indices, so a table with fewer than ten
# events does not produce NA rows the way rank[1:10] would.
eventdata.healthSubset <- eventdata[
  head(health.rank, 10),
  c("event", "fatalities", "injuries")
]
eventdata.financeSubset <- eventdata[
  head(finance.rank, 10),
  c("event", "pDMG", "cDMG")
]
head(eventdata.healthSubset, 5)
## event fatalities injuries
## 1 TORNADO 1618 23741
## 22 FLOOD 408 6748
## 43 TSTM WIND 169 2742
## 4 THUNDERSTORM WIND 177 2201
## 11 FLASH FLOOD 752 1559
# Show the first five rows of the damage ranking.
head(eventdata.financeSubset, n = 5)
## event pDMG cDMG
## 22 FLOOD 1.448e+11 5.671e+09
## 330 HURRICANE/TYPHOON 6.931e+10 2.608e+09
## 69 STORM SURGE 4.332e+10 5.000e+03
## 1 TORNADO 2.775e+10 4.150e+08
## 10 HAIL 1.573e+10 3.026e+09
library(ggplot2)
library(plyr)
library(reshape2)
# Health impact: reshape to one row per (event, measure) and draw one bar per
# event, filled by measure.
toPlot <- melt(eventdata.healthSubset, id.vars = "event")
ggplot(toPlot, aes(x = reorder(event, -value), y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10 harmful catastrophic events",
    y = "Total Fatalities and Injuries",
    x = "Event"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Economic impact: same layout for property and crop damage.
toPlot <- melt(eventdata.financeSubset, id.vars = "event")
ggplot(toPlot, aes(x = reorder(event, -value), y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10 economical damageable catastrophic events",
    y = "Damage ($)",
    x = "Event"
  ) +
  theme(axis.text.x = element_text(size = 8, angle = 90, hjust = 1))