This exploratory analysis examines data from the U.S. National Oceanic and Atmospheric Administration’s(NOAA) storm database to address two fundamental questions.
The analysis consits of getting and cleaning a raw data set of 37 variables obtained form obserbation of US storm events from the 1950 to 2011.
#Import required libraries.
library(data.table)
library(ggplot2)
library(reshape)
library(gridExtra)
## Loading required package: grid
#Create data directory to house unzipped activity files
if(!file.exists("./data")){
dir.create("./data")
}
## download and unzip input data
if(!file.exists("./data/repdata-data-StormData.csv.bz2")){
fileUrl <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(fileUrl, "./data/repdata-data-StormData.csv.bz2")
}
if(!file.exists("./data/repdata-data-StormData.csv.bz2")){
bunzip2("./data/repdata-data-StormData.csv.bz2")
}
stormData <-read.csv("./data/repdata-data-StormData.csv",
header = TRUE,
stringsAsFactors = F)
extract out necessary variables to address the analysis question
BGN_DATE Date the storm event began
EVTYPE Type of storm event. Notice similar storm events can be listed using different wording e.g. “HURRICANE” and “TYPHOON”
FATALITIES Number of people killed
INJURIES Number of people injured
PROPDMG Property damage in whole numbers
PROPDMGEXP Property damage multiplier Hundred (H), Thousand (K), Million (M), Billion (B)
CROPDMG crop damage in whole numbers
CROPDMGEXP crop damage multiplier Hundred (H), Thousand (K), Million (M), Billion (B)
weather.dt<-data.table(stormData[,c("BGN_DATE",
"EVTYPE",
"FATALITIES",
"INJURIES",
"PROPDMG",
"PROPDMGEXP",
"CROPDMG",
"CROPDMGEXP")])
#clean up initial large data set
rm(stormData)
gc()
similar storm events using different wording are consolidated e.g. “HURRICANE” and “TYPHOON”.
weather.dt[EVTYPE %like% "TORNADO", EVTYPE:="TORNADO"]
weather.dt[EVTYPE %like% "HURRICANE|TYPHOON", EVTYPE:="HURRICANE"]
weather.dt[EVTYPE %like% "TSTM|T+ORM", EVTYPE:="THUNDERSTORM"]
weather.dt[EVTYPE %like% "SNOW", EVTYPE:="SNOW"]
weather.dt[EVTYPE %like% "BLIZZARD", EVTYPE:="BLIZZARD"]
weather.dt[EVTYPE %like% "HAIL", EVTYPE:="HAIL"]
weather.dt[EVTYPE %like% "RAIN|PRECIP", EVTYPE:="RAIN"]
weather.dt[EVTYPE %like% "TROPICALSTORM", EVTYPE:="TROPICALSTORM"]
weather.dt[EVTYPE %like% "FLOOD", EVTYPE:="FLOOD"]
weather.dt[EVTYPE %like% "FIRE", EVTYPE:="FIRE"]
weather.dt[EVTYPE %like% "LIGHTNING", EVTYPE:="LIGHTNING"]
weather.dt[EVTYPE %like% "WIND", EVTYPE:="WIND"]
weather.dt[EVTYPE %like% "HEAT", EVTYPE:="HEAT"]
weather.dt[EVTYPE %like% "FROST", EVTYPE:="FROST"]
weather.dt[EVTYPE %like% "COLD", EVTYPE:="COLD"]
weather.dt[, EVTYPE:=as.factor(EVTYPE)]
In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete. Therefore a subset of the last 15 years is taken
weather.dt[,BGN_DATE:=as.Date(as.POSIXct(BGN_DATE, format = "%m/%d/%Y %H:%M:%S"))]
setkey(weather.dt,BGN_DATE)
#subtract 15 years from last recorded date
dateFrom<-weather.dt[.N]$BGN_DATE-floor(15*365.25)
#subset the data table with last 15 years
weather.dt<-weather.dt[BGN_DATE>dateFrom]
Combines:
PROPDMG and PROPDMGEXP fields to calculate total dollar value of property damage
CROPDMG and CROPDMGEXP fields to calculate total dollar value of crop damage
total dollar value of property damage and total dollar value of crop damage to calculate total damage dollar amount
#calculate dollar value of property damage
weather.dt[!(PROPDMGEXP %in% "H") &
!(PROPDMGEXP %in% "K") &
!(PROPDMGEXP %in% "M") &
!(PROPDMGEXP %in% "B") ,
PROPDMGTOTAL :=round(PROPDMG ,2)]
weather.dt[PROPDMGEXP %in% "H", PROPDMGTOTAL :=round(PROPDMG*100,2)]
weather.dt[PROPDMGEXP %in% "K", PROPDMGTOTAL :=round(PROPDMG*1000,2)]
weather.dt[PROPDMGEXP %in% "M", PROPDMGTOTAL :=round(PROPDMG*1000000,2)]
weather.dt[PROPDMGEXP %in% "B", PROPDMGTOTAL :=round(PROPDMG*1000000000,2)]
weather.dt[,PROPDMGTOTAL:= PROPDMGTOTAL/1000000000]
#calculate dollar value of crop damage
weather.dt[!(CROPDMGEXP %in% "H") &
!(CROPDMGEXP %in% "K") &
!(CROPDMGEXP %in% "M") &
!(CROPDMGEXP %in% "B") ,
CROPDMGTOTAL :=round(CROPDMG ,2)]
weather.dt[CROPDMGEXP %in% "H", CROPDMGTOTAL :=round(CROPDMG*100,2)]
weather.dt[CROPDMGEXP %in% "K", CROPDMGTOTAL :=round(CROPDMG*1000,2)]
weather.dt[CROPDMGEXP %in% "M", CROPDMGTOTAL :=round(CROPDMG*1000000,2)]
weather.dt[CROPDMGEXP %in% "B", CROPDMGTOTAL :=round(CROPDMG*1000000000,2)]
weather.dt[,CROPDMGTOTAL:= CROPDMGTOTAL/1000000000]
weather.dt[, DMGTOTAL :=PROPDMGTOTAL+CROPDMGTOTAL]
#calculate total damage dollar amount, and frequency of each storm event type
weather.damage.dt<-weather.dt[,j=list(CROPDMGTOTAL = sum(CROPDMGTOTAL),
PROPDMGTOTAL = sum(PROPDMGTOTAL),
DMGTOTAL = sum(DMGTOTAL),
DMGPEREVTYPE=round(sum(DMGTOTAL)/.N,2),
TFREQ=.N,
FREQ=round(log2(.N))), by=list(EVTYPE)]
setkey(weather.damage.dt,DMGTOTAL)
#sort event type by total damage numbers
setorder(weather.damage.dt,-DMGTOTAL)
sum INJURIES and FATALITIES to obtain total human CASUALTY
## economic damage calculation and consolidation
#calculate total storm CASUALTY numbers, and frequency of each storm event type
weather.harm.dt<-weather.dt[,j=list(FATALITIES = sum(FATALITIES),
INJURIES=sum(INJURIES),
CASUALTY=sum(INJURIES)+sum(FATALITIES),
CASUALTYPEREVTYPE=round((sum(INJURIES)+sum(FATALITIES))/.N,2),
TFREQ=.N,
FREQ=round(log2(.N))), by=list(EVTYPE)]
setkey(weather.harm.dt,CASUALTY)
#sort event type by CASUALTY numbers
setorder(weather.harm.dt,-CASUALTY )
To generate the most relevant results, the top 10 most Dangerous events is used for plot generation.
# take only the top 10 events
weather.damage.melt.dt<-head(weather.damage.dt, 10)
# average damage bubble plot
p1 <- ggplot(data=weather.damage.melt.dt, aes(DMGPEREVTYPE, DMGTOTAL,color=EVTYPE, size=FREQ, label=EVTYPE)) +
geom_point(alpha = .6) +
geom_text(aes(label=EVTYPE),size=3, color="black")+
theme_bw()+
xlab("Mean Damages per Event Type") +
ylab("Total Damages in Billions of Dollars") +
scale_size_area(max_size=20,guide="none")
# total damage barplot
weather.damage.melt.dt<-weather.damage.melt.dt[,c("EVTYPE","CROPDMGTOTAL","PROPDMGTOTAL"), with=FALSE]
weather.damage.melt.dt<-melt(weather.damage.melt.dt, id=c("EVTYPE"))
temp.plot1<-ggplot(weather.damage.melt.dt, aes(x=EVTYPE, y=value, fill=variable)) +
geom_bar(stat="identity",position = "stack")+
theme(legend.position="top")+
theme(legend.title=element_blank())+
ylab("Billons of Dollars")+
theme(axis.text.x = element_text(angle = 90, hjust = 1),axis.title.x=element_blank())
# take only the top 10 events
weather.harm.melt.dt<-head(weather.harm.dt, 10)
# average CASUALTY bubble plot
p2 <- ggplot(data=weather.harm.melt.dt, aes(CASUALTYPEREVTYPE, CASUALTY,color=EVTYPE, size=FREQ,label=EVTYPE)) +
geom_point(alpha = .6) +
geom_text(aes(label=EVTYPE),size=3,color="black")+
theme_bw()+
xlab("Mean Casualty per Event Type ") +
ylab("Total Human Casualty") +
scale_size_area(max_size=20,guide="none")
# total damage barplot
weather.harm.melt.dt<-weather.harm.melt.dt[,c("EVTYPE","FATALITIES","INJURIES"), with=FALSE]
weather.harm.melt.dt<-melt(weather.harm.melt.dt, id=c("EVTYPE"))
temp.plot2<-ggplot(weather.harm.melt.dt, aes(x=EVTYPE, y=value, fill=variable)) +
geom_bar(stat="identity",position = "stack")+
theme(legend.position="top")+
theme(legend.title=element_blank())+
ylab("Casualty Count")+
theme(axis.text.x = element_text(angle = 90, hjust = 1),axis.title.x=element_blank())
grid.arrange(arrangeGrob(temp.plot1,temp.plot2, nrow = 1,ncol=2),
main=textGrob("Total Losses of Due to Each Storm Type", gp=gpar(cex=2.2, fontface="bold", col="#990000")),
nrow=1)
From the perliminary anaylsis, the storm type with the greatest economic consequences in billions of dollars are:
knitr::kable(head(weather.damage.dt[,c("EVTYPE", "DMGTOTAL"), with=FALSE], 10), digits = 2)
| EVTYPE | DMGTOTAL |
|---|---|
| FLOOD | 163.48 |
| HURRICANE | 85.34 |
| THUNDERSTORM | 69.67 |
| TORNADO | 24.17 |
| HAIL | 16.29 |
| DROUGHT | 13.80 |
| FIRE | 8.09 |
| WIND | 5.94 |
| RAIN | 1.32 |
| COLD | 1.17 |
From the perliminary anaylsis, the most harmful storm types with respect to population health:
knitr::kable(head(weather.harm.dt[,c("EVTYPE", "CASUALTY"), with=FALSE], 10), digits = 2)
| EVTYPE | CASUALTY |
|---|---|
| TORNADO | 21449 |
| FLOOD | 9562 |
| HEAT | 9554 |
| THUNDERSTORM | 7559 |
| LIGHTNING | 4427 |
| WIND | 2041 |
| FIRE | 1512 |
| HURRICANE | 1398 |
| SNOW | 789 |
| FOG | 765 |
Since the frequency of events isn’t taken into account, analysis of total losses can be misleading. To further investigate the economic and human danger to each storm types, average losses per storm type must be evaluated.
grid.arrange(arrangeGrob(p1,p2, nrow = 2,ncol=1),
main=textGrob("Average Losses of Due to Each Storm Type", gp=gpar(cex=2.2, fontface="bold", col="#990000")),
nrow=1)
From the Average losses per event type, a more complete picture of actual danger due to each storm type is presented. We can take into account not just total losses, but how much damage and harm each event type has been known to cause.