The aim of this report is to explore the NOAA Storm Database and answer some basic questions about severe weather events. More specifically, we want to know which types of weather events are most harmful to population health and to the economy. The findings are summarized in the “Results” section at the end.
Load the libraries needed for the analysis
library(ggplot2)
library(plyr)
Download and read the data
# Remote archive and the local path it is cached under. The local name keeps
# its historical ".cvs" spelling so an already-downloaded copy is reused.
dataUrl <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
dataFile <- "StormData.cvs.bz2"
# Only fetch the archive when it is not already on disk; mode = "wb" keeps the
# bzip2 file byte-exact on every platform.
if (!file.exists(dataFile)) {
  download.file(dataUrl, destfile = dataFile, mode = "wb")
}
# stringsAsFactors = TRUE: the steps below call levels() on text columns such as
# PROPDMGEXP, CROPDMGEXP and EVTYPE. Since R 4.0.0 read.csv() returns character
# columns by default, which would make those levels() calls return NULL.
stormData <- read.csv(bzfile(dataFile), header = TRUE, stringsAsFactors = TRUE)
The columns of interest are EVTYPE, INJURIES, FATALITIES, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP. We need to combine the information in PROPDMG with PROPDMGEXP, and in CROPDMG with CROPDMGEXP. Let's take a closer look at these columns.
# List the distinct exponent codes recorded for property damage.
# factor() makes the listing work whether the column was read as a factor or
# as plain character (the read.csv() default since R 4.0.0).
levels(factor(stormData$PROPDMGEXP))
## [1] "" "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
# Same listing for the crop damage exponent codes.
levels(factor(stormData$CROPDMGEXP))
## [1] "" "?" "0" "2" "B" "k" "K" "m" "M"
We see that there are several factor levels in PROPDMGEXP and CROPDMGEXP, such as “+”, “-”, “0”, “1”, “2”… etc., for which we cannot find an explanation in the documentation of the database. So we look at the data by year to understand the EXP columns better.
# Understanding the Property and Crop Damage
# POSIXlt's $year field counts years since 1900, so YEAR holds values such as
# 93 for 1993; it is stored as a factor.
stormData$YEAR <- as.factor(
  strptime(stormData$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")$year
)
# Cross-tabulate the property damage exponent codes by year. The table is named
# ExpByYear rather than T so that the built-in T (shorthand for TRUE) is not
# masked for the rest of the session.
ExpByYear <- table(stormData$YEAR, stormData$PROPDMGEXP)
# Rows for 1993-1995 (YEAR 93, 94, 95), matched exactly instead of by regex
# substring, restricted to the columns of interest (about 61000 rows). Columns
# are selected by name rather than by position so the selection does not
# depend on the column order of the file.
Data.93to95 <- stormData[
  stormData$YEAR %in% c("93", "94", "95"),
  c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP",
    "CROPDMG", "CROPDMGEXP", "YEAR")
]
The above code shows that the factor levels “+”, “-”, “0”, “1”, “2”… etc. occur only in the years 1993 to 1995. We further examine the data for these years in terms of health and economic damage (in Data.93to95) and see that it is not unreasonable to discard the data for these years and still draw useful conclusions. This also frees us from making erroneous assumptions about “+”, “-”, “0”, “1”, “2”… and thereby introducing incorrect values. So we discard the data for the years 1993, 1994 and 1995 and continue with the analysis.
Below we combine the information in PROPDMG and PROPDMGEXP, and CROPDMG and CROPDMGEXP. Then we compute Health Damage (Injuries+Fatalities), and Economic Damage (Crop+Property).
# Drop every record whose YEAR matches 93, 94 or 95; keep everything else
in93to95 <- grepl('93|94|95', stormData$YEAR)
subData <- stormData[!in93to95, ]
# Translate damage exponent codes into numeric multipliers, expressed in
# hundreds of dollars: "" and h/H -> 1, k/K -> 10, m/M -> 10000, b/B -> 10000000.
#
# expCode: factor or character vector of exponent codes.
# Returns a numeric vector of the same length. Any other code is parsed as a
# plain number, exactly as the earlier gsub()/as.numeric() chain did, so
# non-numeric codes such as "?" become NA with a coercion warning.
expToMagnitude <- function(expCode) {
  # as.character() works for factor and character input alike; the previous
  # levels(x)[x] idiom returned only NA when the column was not a factor.
  code <- toupper(as.character(expCode))
  known <- c(H = 1, K = 10, M = 10000, B = 10000000)
  magnitude <- unname(known[code])
  # An empty code means the amount is taken as-is (multiplier 1)
  magnitude[!is.na(code) & code == ""] <- 1
  # Whatever is still unresolved falls back to numeric parsing
  other <- is.na(magnitude) & !is.na(code)
  magnitude[other] <- as.numeric(code[other])
  magnitude
}
# Numeric magnitude columns for crop and property damage
subData$CropDmgMag <- expToMagnitude(subData$CROPDMGEXP)
subData$PropDmgMag <- expToMagnitude(subData$PROPDMGEXP)
# Damage amounts in hundreds of dollars ($H): reported figure times its magnitude
subData[["PropDmgAmt"]] <- with(subData, PROPDMG * PropDmgMag)
subData[["CropDmgAmt"]] <- with(subData, CROPDMG * CropDmgMag)
# Combined measures: people affected (fatalities plus injuries) and total
# economic loss (property plus crop damage)
subData[["HealthDmg"]] <- with(subData, FATALITIES + INJURIES)
subData[["EconoDmg"]] <- with(subData, PropDmgAmt + CropDmgAmt)
Now we create a new data frame with the total Health and Economic Damage per event type, and remove the rows that have neither Health nor Economic Damage.
# Sum health and economic damage per original event type; naming the grouping
# list element gives the EVTYPE column its name directly.
Data.Event <- aggregate(
  subData[, c("HealthDmg", "EconoDmg")],
  by = list(EVTYPE = subData$EVTYPE),
  FUN = sum
)
# Store event types as character. as.character() is correct for both factor
# and character input, whereas levels(x)[x] yields only NA for character input.
Data.Event$EVTYPE <- as.character(Data.Event$EVTYPE)
# Keep event types with some health or economic damage. which() skips rows
# whose condition is NA instead of inserting all-NA rows into the result.
hasDamage <- Data.Event$HealthDmg > 0 | Data.Event$EconoDmg > 0
Data.Event1 <- Data.Event[which(hasDamage), ]
We use regular expressions to combine related event types into broader categories.
# Merge related event types into broader categories.
# Rules are applied in the order listed, and each rule is matched against the
# labels left behind by the rules before it, so the order must not be changed.
Data.Event2 <- local({
  merged <- Data.Event1
  rules <- c(
    "FLOOD" = 'flood|coastal flood|flash flood|fld',
    "TORNADO" = "TORNADO",
    "LIGHTNING" = "LIGHTNING",
    "COLD" = 'COLD|HYPOTHERMIA|LOW TEMP|FREEZE|FROST',
    "ICE/SNOW" = 'ICE|ICY|SNOW|WINTER|FREEZING|MIX',
    "HURRICANE" = "HURRI",
    "HEAT" = 'HEAT|HYPERTHERMIA|WARM',
    "WILDFIRE" = 'WILD|FIRE',
    "STORM" = 'STORM|BLIZZARD|THUNDERSTORM|TSTM',
    "WIND" = "WIND",
    "FOG" = "FOG",
    "SEA/CURRENTS" = 'CURRENT|SURF|WAVE|SEAS|WATER|SWELLS|TSUNAMI|TIDE',
    "LANDSLIDE" = 'SLUMP|SLIDE'
  )
  for (label in names(rules)) {
    hits <- grep(rules[[label]], merged$EVTYPE, ignore.case = TRUE)
    merged[hits, "EVTYPE"] <- label
  }
  # Store the merged labels as a factor
  merged$EVTYPE <- as.factor(merged$EVTYPE)
  merged
})
# Show the resulting set of event categories
levels(Data.Event2$EVTYPE)
## [1] "AVALANCHE" "Beach Erosion" "BLOWING DUST"
## [4] "COASTAL EROSION" "COLD" "DAM BREAK"
## [7] "DENSE SMOKE" "DOWNBURST" "DROUGHT"
## [10] "DROWNING" "DRY MICROBURST" "Dust Devil"
## [13] "DUST DEVIL" "FLOOD" "FOG"
## [16] "FUNNEL CLOUD" "Glaze" "GLAZE"
## [19] "HAIL" "HEAT" "HEAVY RAIN"
## [22] "HURRICANE" "ICE/SNOW" "LANDSLIDE"
## [25] "LANDSPOUT" "LIGHTNING" "Marine Accident"
## [28] "MARINE HAIL" "Microburst" "Other"
## [31] "OTHER" "RAIN" "SEA/CURRENTS"
## [34] "SEICHE" "SMALL HAIL" "STORM"
## [37] "TORNADO" "Torrential Rainfall" "TROPICAL DEPRESSION"
## [40] "TYPHOON" "VOLCANIC ASH" "WET MICROBURST"
## [43] "WILDFIRE" "WIND"
## Finally we add the Health and Economic Damage values by these new event types
# Re-total both damage measures over the merged event categories; the grouping
# column is named EVTYPE directly through the by-list
Data.Event3 <- aggregate(
  Data.Event2[, c("HealthDmg", "EconoDmg")],
  by = list(EVTYPE = Data.Event2$EVTYPE),
  FUN = sum
)
This reduces the data to 44 distinct weather event types.
# Bar graph for Health Damage
# Order the event types by their casualty totals so the bars come out sorted
Data.Event3$EVTYPE <- reorder(Data.Event3$EVTYPE, Data.Event3$HealthDmg)
# Plot only event types with more than 1000 people injured or killed
healthTop <- Data.Event3[Data.Event3$HealthDmg > 1000, ]
ggplot(data = healthTop, aes(EVTYPE, HealthDmg)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  ylab("Total Number of People Injured or Killed") +
  xlab("Weather Event")
According to our analysis, Tornado is the most harmful type of weather event with respect to population health. It is followed by Storm, Flood and Heat, which cause roughly equal harm to one another but are only about one tenth as harmful as Tornado.
# Bar graph for Economic Damage
# Order the event types by total economic damage so the bars come out sorted
Data.Event3 <- transform(Data.Event3, EVTYPE = reorder(EVTYPE, EconoDmg))
# EconoDmg is built from multipliers of 1 for "H" and 10 for "K", i.e. it is
# measured in hundreds of dollars. The 10000000 cut-off therefore keeps event
# types with more than $1 billion of damage, and the axis label says so
# (the earlier label read "$100 M or more").
ggplot(
  data = Data.Event3[Data.Event3$EconoDmg > 10000000, ],
  aes(EVTYPE, EconoDmg)
) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  ylab("Property and Crop Damage (hundreds of USD; event types above $1 billion)") +
  xlab("Weather Event")
The most harmful event type for the economy is Flood, which includes Flood, Coastal Flood and Flash Flood. It is followed by Hurricane, Storm and Tornado.