Setting the environment
# Global knitr chunk options: echo the code and use wide, short figures.
knitr::opts_chunk$set(echo = TRUE, fig.width = 12, fig.height = 4)
# Switch to the author's data directory only when it exists. An unconditional
# setwd() on a hard-coded absolute path stops the whole document on any
# machine that lacks that directory; otherwise stay in the current directory.
data_dir <- "/Users/rameshmaganti/Desktop/coursera/data/RepData_StormData"
if (file.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data directory not found; using ", getwd())
}
Loading the required libraries and downloading data files
library(plyr)
library(ggplot2)
library(gridExtra)
## Loading required package: grid
# Fetch and decompress the storm data only when the CSV is not already cached.
if (!file.exists("repdata-data-StormData.csv")) {
  download.file(
    "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    "repdata-data-StormData.csv.bz2",
    mode = "wb" # the archive is binary; never translate line endings
  )
  dateDownloaded <- date()
  # Decompress with base R connections. bunzip2() belongs to the R.utils
  # package, which this document never attaches, so calling it bare fails.
  archive_con <- bzfile("repdata-data-StormData.csv.bz2", open = "rb")
  csv_con <- file("repdata-data-StormData.csv", open = "wb")
  tryCatch(
    repeat {
      chunk <- readBin(archive_con, what = "raw", n = 1e6)
      if (length(chunk) == 0L) {
        break
      }
      writeBin(chunk, csv_con)
    },
    finally = {
      # Release both connections whether or not the copy succeeded.
      close(csv_con)
      close(archive_con)
    }
  )
  # unlink("repdata-data-StormData.csv.bz2")
}
Loading the data.
# Read the decompressed CSV into a data frame.
stormdata <- read.csv(file = "repdata-data-StormData.csv")
Pre-processing: Cleaning, filtering and TRANSFORMING the raw data loaded
Based on the analysis requirement, we firstly examine the data:
# Report the number of rows and columns of the raw data.
c(nrow(stormdata), ncol(stormdata))
## [1] 902297 37
The storm data set has 902297 rows and 37 columns. The column names of the storm data set are:
STATE__, BGN_DATE, BGN_TIME, TIME_ZONE, COUNTY, COUNTYNAME, STATE, EVTYPE, BGN_RANGE, BGN_AZI, BGN_LOCATI, END_DATE, END_TIME, COUNTY_END, COUNTYENDN, END_RANGE, END_AZI, END_LOCATI, LENGTH, WIDTH, F, MAG, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP, WFO, STATEOFFIC, ZONENAMES, LATITUDE, LONGITUDE, LATITUDE_E, LONGITUDE_, REMARKS, REFNUM
# Keep only the columns needed for the health and economic impact analysis,
# dropping the rows whose event type is a "Summary" entry.
impact_columns <- c(
  "EVTYPE", "PROPDMG", "PROPDMGEXP",
  "FATALITIES", "INJURIES", "CROPDMG", "CROPDMGEXP"
)
is_summary_row <- grepl("Summary", stormdata$EVTYPE)
impactdata <- stormdata[!is_summary_row, impact_columns]
# Upper-case the event type and both exponent codes so that values differing
# only in letter case collapse into a single factor level.
for (code_column in c("EVTYPE", "PROPDMGEXP", "CROPDMGEXP")) {
  impactdata[[code_column]] <- as.factor(toupper(impactdata[[code_column]]))
}
After the minor cleanup and filtering above, the impactdata data set has 902224 rows and 7 columns and looks like this:
# Preview the first five rows of the filtered data.
head(x = impactdata, n = 5L)
## EVTYPE PROPDMG PROPDMGEXP FATALITIES INJURIES CROPDMG CROPDMGEXP
## 1 TORNADO 25.0 K 0 15 0
## 2 TORNADO 2.5 K 0 0 0
## 3 TORNADO 25.0 K 0 2 0
## 4 TORNADO 2.5 K 0 2 0
## 5 TORNADO 2.5 K 0 2 0
The data analysis must address the following questions:
1: Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
2: Across the United States, which types of events have the greatest economic consequences?
To compute the total dollar amounts for property and crop damage, the approach followed here (based on the documentation and on examining the data set) is to multiply each PROPDMG value by the magnitude encoded in PROPDMGEXP, and each CROPDMG value by the magnitude encoded in CROPDMGEXP, and then to total the results by event type. However, as the summaries of PROPDMGEXP and CROPDMGEXP below show, the codes in both variables are inconsistent: they mix letters, digits and symbols.
# Count how often each property-damage exponent code occurs.
summary(impactdata[["PROPDMGEXP"]])
## - ? + 0 1 2 3 4 5
## 465861 1 8 5 216 25 13 4 4 28
## 6 7 8 B H K M
## 4 5 1 40 7 424665 11337
# Count how often each crop-damage exponent code occurs.
summary(impactdata[["CROPDMGEXP"]])
## ? 0 2 B K M
## 618340 7 19 1 9 281853 1995
# Share of rows whose property-damage exponent is one of the codes that the
# conversion below turns into a number.
has_known_code <- impactdata$PROPDMGEXP %in% c("", "0", "B", "H", "M", "K")
sum(has_known_code) / length(has_known_code)
## [1] 0.9999
# Turn the property-damage exponent codes into numeric multipliers.
prop <- as.character(impactdata$PROPDMGEXP)
# Codes this analysis does not interpret are marked as missing.
uninterpreted <- c("?", "-", "+", "1", "2", "3", "4", "5", "6", "7", "8")
prop[is.element(prop, uninterpreted)] <- NA
# A blank or "0" code becomes a zero multiplier.
prop[is.element(prop, c("", "0"))] <- 0
# Replace each magnitude letter with its value, in the order B, K, H, M.
magnitudes <- c(B = 10^9, K = 1000, H = 100, M = 10^6)
for (letter in names(magnitudes)) {
  prop <- gsub(letter, magnitudes[[letter]], prop)
}
impactdata$PROPDMGEXP <- as.numeric(prop)
# Property damage in dollars = multiplier * reported amount.
impactdata$TOTALPROPDMG <- with(impactdata, PROPDMGEXP * PROPDMG)
# Turn the crop-damage exponent codes into numeric multipliers.
crop <- as.character(impactdata$CROPDMGEXP)
# Codes this analysis does not interpret are marked as missing.
crop[is.element(crop, c("?", "2"))] <- NA
# A blank or "0" code becomes a zero multiplier.
crop[is.element(crop, c("", "0"))] <- 0
# Replace each magnitude letter with its value, in the order B, K, M.
magnitudes <- c(B = 10^9, K = 1000, M = 10^6)
for (letter in names(magnitudes)) {
  crop <- gsub(letter, magnitudes[[letter]], crop)
}
impactdata$CROPDMGEXP <- as.numeric(crop)
# Crop damage in dollars = multiplier * reported amount.
impactdata$TOTALCROPDMG <- with(impactdata, CROPDMGEXP * CROPDMG)
# Total casualties and dollar damages per event type.
# Rows whose exponent code was set to NA carry an NA damage amount. Without
# na.rm = TRUE a single such row turns the whole total for its event type
# into NA, and that event type then sorts last and vanishes from the damage
# rankings. Dropping the NA rows keeps the remaining, valid amounts.
totalImpactdata <- ddply(impactdata, .(EVTYPE), summarize,
  totalFatal = sum(FATALITIES, na.rm = TRUE),
  totalInjured = sum(INJURIES, na.rm = TRUE),
  totalPropDmg = sum(TOTALPROPDMG, na.rm = TRUE),
  totalCropDmg = sum(TOTALCROPDMG, na.rm = TRUE),
  # Combined economic impact, built from the two totals computed just above.
  totalDamages = sum(totalPropDmg, totalCropDmg)
)
# Ten event types with the highest fatality and injury totals, in descending
# order.
top_ten <- 1:10
fataldata <- totalImpactdata[order(-totalImpactdata[["totalFatal"]])[top_ten], ]
injurdata <- totalImpactdata[order(-totalImpactdata[["totalInjured"]])[top_ten], ]
The ten event types that caused the most fatalities are:
# Show event type and fatality total only, without row names.
print(fataldata[, c("EVTYPE", "totalFatal")], row.names = FALSE)
## EVTYPE totalFatal
## TORNADO 5633
## EXCESSIVE HEAT 1903
## FLASH FLOOD 978
## HEAT 937
## LIGHTNING 816
## TSTM WIND 504
## FLOOD 470
## RIP CURRENT 368
## HIGH WIND 248
## AVALANCHE 224
The ten event types that caused the most injuries are:
# Show event type and injury total only, without row names.
print(injurdata[, c("EVTYPE", "totalInjured")], row.names = FALSE)
## EVTYPE totalInjured
## TORNADO 91346
## TSTM WIND 6957
## FLOOD 6789
## EXCESSIVE HEAT 6525
## LIGHTNING 5230
## HEAT 2100
## ICE STORM 1975
## FLASH FLOOD 1777
## THUNDERSTORM WIND 1488
## HAIL 1361
Impact of storm events on health: the ten event types causing the highest numbers of fatalities and injuries. Tornadoes were by far the leading cause of both fatalities and injuries across the USA, followed by excessive heat for fatalities and thunderstorm winds (TSTM WIND) for injuries.
# Horizontal bar chart of the ten event types with the most fatalities,
# with bars ordered by the fatality total.
g1 <- ggplot(
  fataldata,
  aes(x = reorder(EVTYPE, totalFatal), y = totalFatal)
) +
  geom_bar(stat = "identity", fill = "orange") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Fatalities",
    title = "Top Event Types by Total Fatalities"
  )
# Same layout for the ten event types with the most injuries.
g2 <- ggplot(
  injurdata,
  aes(x = reorder(EVTYPE, totalInjured), y = totalInjured)
) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Injuries",
    title = "Top Event Types by Total Injuries"
  )
# Draw both charts side by side.
grid.arrange(g1, g2, ncol = 2)
Impact of storm events on the economy: top ten events causing the highest amount of property and crop damages
# Ten event types with the highest property, crop and combined damage totals,
# each in descending order.
top_ten <- 1:10
propdmgdata <- totalImpactdata[order(-totalImpactdata[["totalPropDmg"]])[top_ten], ]
cropdmgdata <- totalImpactdata[order(-totalImpactdata[["totalCropDmg"]])[top_ten], ]
totalEconImpact <- totalImpactdata[order(-totalImpactdata[["totalDamages"]])[top_ten], ]
The top ten events that caused the maximum property damage are:
# Show event type and property-damage total only, without row names.
print(propdmgdata[, c("EVTYPE", "totalPropDmg")], row.names = FALSE)
## EVTYPE totalPropDmg
## FLOOD 1.447e+11
## HURRICANE/TYPHOON 6.931e+10
## STORM SURGE 4.332e+10
## HURRICANE 1.187e+10
## TROPICAL STORM 7.704e+09
## WINTER STORM 6.688e+09
## RIVER FLOOD 5.119e+09
## WILDFIRE 4.765e+09
## STORM SURGE/TIDE 4.641e+09
## TSTM WIND 4.485e+09
The top ten events that caused the maximum crop damage are:
# Show event type and crop-damage total only, without row names.
print(cropdmgdata[, c("EVTYPE", "totalCropDmg")], row.names = FALSE)
## EVTYPE totalCropDmg
## DROUGHT 1.397e+10
## FLOOD 5.662e+09
## RIVER FLOOD 5.029e+09
## ICE STORM 5.022e+09
## HAIL 3.026e+09
## HURRICANE 2.742e+09
## HURRICANE/TYPHOON 2.608e+09
## FLASH FLOOD 1.421e+09
## EXTREME COLD 1.313e+09
## FROST/FREEZE 1.094e+09
Economic impact analysis plots of severe storm events: Property Damages, Crop Damages and the combined total damage of both property and crops
# Horizontal bar chart of the ten event types with the most property damage,
# scaled to billions of dollars.
g3 <- ggplot(
  propdmgdata,
  aes(x = reorder(EVTYPE, totalPropDmg), y = totalPropDmg / 10^9)
) +
  geom_bar(stat = "identity", fill = "orange") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Property Damages (US$ Billions)",
    title = "Top Event Types by Total Property Damages"
  )
# Same layout for the ten event types with the most crop damage.
g4 <- ggplot(
  cropdmgdata,
  aes(x = reorder(EVTYPE, totalCropDmg), y = totalCropDmg / 10^9)
) +
  geom_bar(stat = "identity", fill = "orange") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Crop Damages (US$ Billions)",
    title = "Top Event Types by Total Crop Damages"
  )
# Draw both charts side by side.
grid.arrange(g3, g4, ncol = 2)
The plots above show the ten event types that caused the most property damage (left) and the most crop damage (right) in the USA.
The plot below shows the storm events that caused the maximum combined economic impact (the sum of property and crop damages) in the USA
# Horizontal bar chart of the ten event types with the largest combined
# property and crop damage, scaled to billions of dollars.
ggplot(
  totalEconImpact,
  aes(x = reorder(EVTYPE, totalDamages), y = totalDamages / 10^9)
) +
  geom_bar(stat = "identity", fill = "orange") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Damages (US$ Billions)",
    title = "Top Event Types by Total Damages"
  )