=========================================================================================================================
Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage. Preventing such outcomes to the extent possible is a key concern. This report explores the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database to determine which types of events are most harmful with respect to population health, as well as which types of events have the greatest economic consequences.
The data for this analysis come in the form of a comma-separated-value (CSV) file compressed via the bzip2 algorithm to reduce its size. The storm data file was downloaded from the Coursera Reproducible Research website on August 20, 2014. The events in the database start in the year 1950 and end in November 2011. In the earlier years, there are generally fewer events recorded due to a lack of good records. More recent years should be more complete.
library(knitr)
# Package-level knitr behaviour: no progress bar, verbose logging while knitting.
opts_knit$set(
  progress = FALSE,
  verbose = TRUE
)

# Defaults applied to every chunk: echo the code, hide messages, tidy the
# source, print output without a comment prefix, and write centred 10 x 6
# figures under "figure/".
opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  tidy = TRUE,
  comment = NA,
  fig.path = "figure/",
  fig.keep = "high",
  fig.width = 10,
  fig.height = 6,
  fig.align = "center"
)
Load needed libraries.
require(plyr)
require(ggplot2)
# Fetch the bzip2-compressed NOAA storm database (only if it is not already on
# disk) and load it into the data frame `stormdata`.
storm_url <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "~/Data/repdata_data_StormData.csv.bz2"

# Make sure the target directory exists so download.file() cannot fail on a
# missing folder.
dir.create(dirname(storm_file), showWarnings = FALSE, recursive = TRUE)

if (!file.exists(storm_file)) {
  # mode = "wb": the archive is binary and must not be newline-translated.
  download.file(storm_url, storm_file, mode = "wb")
}

# The second argument of bzfile() is the open mode, not a file name; open the
# archive explicitly for text reading.
csv <- bzfile(storm_file, open = "rt")

# dec = "," reproduces the previous read.csv2(sep = ",") call exactly, so
# fields such as "25.00" are still read as character and converted explicitly
# later in the analysis. The connection is closed even if parsing fails.
stormdata <- tryCatch(
  read.csv(csv, dec = ",", stringsAsFactors = FALSE),
  finally = close(csv)
)

# The former unlink(csv) call was dropped: unlink() expects file paths, and
# when handed the connection object it never touched the downloaded archive.
=========================================================================================================================
# Compactly display the column names, types and leading values of the loaded data.
str(stormdata)
'data.frame': 902297 obs. of 37 variables:
$ STATE__ : chr "1.00" "1.00" "1.00" "1.00" ...
$ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
$ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
$ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
$ COUNTY : chr "97.00" "3.00" "57.00" "89.00" ...
$ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
$ STATE : chr "AL" "AL" "AL" "AL" ...
$ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
$ BGN_RANGE : chr "0.00" "0.00" "0.00" "0.00" ...
$ BGN_AZI : chr "" "" "" "" ...
$ BGN_LOCATI: chr "" "" "" "" ...
$ END_DATE : chr "" "" "" "" ...
$ END_TIME : chr "" "" "" "" ...
$ COUNTY_END: chr "0.00" "0.00" "0.00" "0.00" ...
$ COUNTYENDN: logi NA NA NA NA NA NA ...
$ END_RANGE : chr "0.00" "0.00" "0.00" "0.00" ...
$ END_AZI : chr "" "" "" "" ...
$ END_LOCATI: chr "" "" "" "" ...
$ LENGTH : chr "14.00" "2.00" "0.10" "0.00" ...
$ WIDTH : chr "100.00" "150.00" "123.00" "100.00" ...
$ F : int 3 2 2 2 2 2 2 1 3 3 ...
$ MAG : chr "0.00" "0.00" "0.00" "0.00" ...
$ FATALITIES: chr "0.00" "0.00" "0.00" "0.00" ...
$ INJURIES : chr "15.00" "0.00" "2.00" "2.00" ...
$ PROPDMG : chr "25.00" "2.50" "25.00" "2.50" ...
$ PROPDMGEXP: chr "K" "K" "K" "K" ...
$ CROPDMG : chr "0.00" "0.00" "0.00" "0.00" ...
$ CROPDMGEXP: chr "" "" "" "" ...
$ WFO : chr "" "" "" "" ...
$ STATEOFFIC: chr "" "" "" "" ...
$ ZONENAMES : chr "" "" "" "" ...
$ LATITUDE : chr "3040.00" "3042.00" "3340.00" "3458.00" ...
$ LONGITUDE : chr "8812.00" "8755.00" "8742.00" "8626.00" ...
$ LATITUDE_E: chr "3051.00" "0.00" "0.00" "0.00" ...
$ LONGITUDE_: chr "8806.00" "0.00" "0.00" "0.00" ...
$ REMARKS : chr "" "" "" "" ...
$ REFNUM : chr "1.00" "2.00" "3.00" "4.00" ...
Show First 10 Rows of Data.
# Print the first ten rows of stormdata.
head(stormdata, 10)
STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
1 1.00 4/18/1950 0:00:00 0130 CST 97.00 MOBILE AL
2 1.00 4/18/1950 0:00:00 0145 CST 3.00 BALDWIN AL
3 1.00 2/20/1951 0:00:00 1600 CST 57.00 FAYETTE AL
4 1.00 6/8/1951 0:00:00 0900 CST 89.00 MADISON AL
5 1.00 11/15/1951 0:00:00 1500 CST 43.00 CULLMAN AL
6 1.00 11/15/1951 0:00:00 2000 CST 77.00 LAUDERDALE AL
7 1.00 11/16/1951 0:00:00 0100 CST 9.00 BLOUNT AL
8 1.00 1/22/1952 0:00:00 0900 CST 123.00 TALLAPOOSA AL
9 1.00 2/13/1952 0:00:00 2000 CST 125.00 TUSCALOOSA AL
10 1.00 2/13/1952 0:00:00 2000 CST 57.00 FAYETTE AL
EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
1 TORNADO 0.00 0.00
2 TORNADO 0.00 0.00
3 TORNADO 0.00 0.00
4 TORNADO 0.00 0.00
5 TORNADO 0.00 0.00
6 TORNADO 0.00 0.00
7 TORNADO 0.00 0.00
8 TORNADO 0.00 0.00
9 TORNADO 0.00 0.00
10 TORNADO 0.00 0.00
COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
1 NA 0.00 14.00 100.00 3 0.00 0.00
2 NA 0.00 2.00 150.00 2 0.00 0.00
3 NA 0.00 0.10 123.00 2 0.00 0.00
4 NA 0.00 0.00 100.00 2 0.00 0.00
5 NA 0.00 0.00 150.00 2 0.00 0.00
6 NA 0.00 1.50 177.00 2 0.00 0.00
7 NA 0.00 1.50 33.00 2 0.00 0.00
8 NA 0.00 0.00 33.00 1 0.00 0.00
9 NA 0.00 3.30 100.00 3 0.00 1.00
10 NA 0.00 2.30 100.00 3 0.00 0.00
INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
1 15.00 25.00 K 0.00
2 0.00 2.50 K 0.00
3 2.00 25.00 K 0.00
4 2.00 2.50 K 0.00
5 2.00 2.50 K 0.00
6 6.00 2.50 K 0.00
7 1.00 2.50 K 0.00
8 0.00 2.50 K 0.00
9 14.00 25.00 K 0.00
10 0.00 25.00 K 0.00
LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
1 3040.00 8812.00 3051.00 8806.00 1.00
2 3042.00 8755.00 0.00 0.00 2.00
3 3340.00 8742.00 0.00 0.00 3.00
4 3458.00 8626.00 0.00 0.00 4.00
5 3412.00 8642.00 0.00 0.00 5.00
6 3450.00 8748.00 0.00 0.00 6.00
7 3405.00 8631.00 0.00 0.00 7.00
8 3255.00 8558.00 0.00 0.00 8.00
9 3334.00 8740.00 3336.00 8738.00 9.00
10 3336.00 8738.00 3337.00 8737.00 10.00
Convert Fatalities and Injury Fields to Numeric
# The casualty counts were read as text; turn both columns into numbers in one
# pass.
stormdata[c("FATALITIES", "INJURIES")] <- lapply(
  stormdata[c("FATALITIES", "INJURIES")],
  as.numeric
)
Show First 10 Rows of Data.
# Spot-check the first ten fatality counts after the numeric conversion.
head(stormdata$FATALITIES, 10)
[1] 0 0 0 0 0 0 0 0 1 0
# [1] 0 0 0 0 0 0 0 0 1 0
# Spot-check the first ten injury counts after the numeric conversion.
head(stormdata$INJURIES, 10)
[1] 15 0 2 2 2 6 1 0 14 0
# [1] 15 0 2 2 2 6 1 0 14 0
Convert Event Type to Factor
# Upper-case the event labels so that spellings differing only in case
# collapse into one category, keep the sorted distinct labels in `eventtype`,
# and store the column as a factor.
# NOTE(review): labels are not trimmed, so values with leading whitespace
# (e.g. " TSTM WIND") remain separate categories -- confirm this is intended.
stormdata$EVTYPE <- factor(toupper(stormdata$EVTYPE))
eventtype <- sort(unique(as.character(stormdata$EVTYPE)))
eventtype[seq_len(10)]
[1] " HIGH SURF ADVISORY" " COASTAL FLOOD"
[3] " FLASH FLOOD" " LIGHTNING"
[5] " TSTM WIND" " TSTM WIND (G45)"
[7] " WATERSPOUT" " WIND"
[9] "?" "ABNORMAL WARMTH"
# Note there are 898 event types
Consolidate Fatalities and Injuries for Graphs
# Total injuries per event type, ranked from highest to lowest; keep the 20
# largest. The summed column is named `x` by aggregate().
injuries <- with(
  stormdata,
  aggregate(x = INJURIES, by = list(EVTYPE = EVTYPE), FUN = sum)
)
injuries <- injuries[order(injuries$x, decreasing = TRUE), ]
top20injuries <- injuries[seq_len(20), ]

# Same ranking for fatalities.
fatalities <- with(
  stormdata,
  aggregate(x = FATALITIES, by = list(EVTYPE = EVTYPE), FUN = sum)
)
fatalities <- fatalities[order(fatalities$x, decreasing = TRUE), ]
top20fatalities <- fatalities[seq_len(20), ]
The graphs indicate that the type of event most harmful with respect to population health is the tornado.
# Horizontal bar chart: injuries for the 20 event types with the most injuries.
ggplot(top20injuries, aes(x = EVTYPE, y = x)) +
  geom_bar(stat = "Identity", fill = "red") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Number of Injuries",
    title = "Top 20 Injuries by Event Type"
  ) +
  theme(legend.position = "none")

# Horizontal bar chart: fatalities for the 20 event types with the most
# fatalities.
ggplot(top20fatalities, aes(x = EVTYPE, y = x)) +
  geom_bar(stat = "Identity", fill = "blue") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Number of Fatalities",
    title = "Top 20 Fatalities by Event Type"
  ) +
  theme(legend.position = "none")
=========================================================================================================================
# List the distinct property-damage exponent codes present in the raw data.
unique(stormdata$PROPDMGEXP)
[1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
[18] "1" "8"
# [1] 'K' 'M' '' 'B' 'm' '+' '0' '5' '6' '?' '4' '2' '3' 'h' '7' 'H' '-' '1'
# '8'
# List the distinct crop-damage exponent codes present in the raw data.
unique(stormdata$CROPDMGEXP)
[1] "" "M" "K" "m" "B" "?" "0" "k" "2"
# [1] '' 'M' 'K' 'm' 'B' '?' '0' 'k' '2'
Convert PROPDMGEXP and CROPDMGEXP to All Upper Case Letters
# Fold the exponent codes to upper case so "m"/"M", "k"/"K" and "h"/"H" are
# treated alike, then show the combined set of codes that remain.
stormdata[["PROPDMGEXP"]] <- toupper(stormdata[["PROPDMGEXP"]])
stormdata[["CROPDMGEXP"]] <- toupper(stormdata[["CROPDMGEXP"]])
unique(unlist(stormdata[c("PROPDMGEXP", "CROPDMGEXP")], use.names = FALSE))
[1] "K" "M" "" "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
# [1] 'K' 'M' '' 'B' '+' '0' '5' '6' '?' '4' '2' '3' 'H' '7' '-' '1' '8'
Convert PROPDMGEXP and CROPDMGEXP to Numeric Using Exponential Multiplier
# Work on just the columns needed for the damage estimate.
stormdatasub <- stormdata[c(
  "EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

# Empty or symbolic exponent codes carry no magnitude: map them to exponent
# "0", i.e. a multiplier of 10^0.
stormdatasub$PROPDMGEXP[stormdatasub$PROPDMGEXP %in% c("", "+", "-", "?")] <- "0"
stormdatasub$CROPDMGEXP[stormdatasub$CROPDMGEXP %in% c("", "+", "-", "?")] <- "0"

unique(unlist(stormdatasub[c("PROPDMGEXP", "CROPDMGEXP")], use.names = FALSE))
[1] "K" "M" "0" "B" "5" "6" "4" "2" "3" "H" "7" "1" "8"
# [1] 'K' 'M' '0' 'B' '5' '6' '4' '2' '3' 'H' '7' '1' '8'
# Replace the letter suffixes by their power-of-ten exponent:
# B(illion) -> 9, M(illion) -> 6, K (thousand) -> 3, H(undred) -> 2.
# The columns are still character here, so the exponents are stored as text.
stormdatasub <- within(stormdatasub, {
  PROPDMGEXP[PROPDMGEXP == "B"] <- "9"
  CROPDMGEXP[CROPDMGEXP == "B"] <- "9"
  PROPDMGEXP[PROPDMGEXP == "M"] <- "6"
  CROPDMGEXP[CROPDMGEXP == "M"] <- "6"
  PROPDMGEXP[PROPDMGEXP == "K"] <- "3"
  CROPDMGEXP[CROPDMGEXP == "K"] <- "3"
  PROPDMGEXP[PROPDMGEXP == "H"] <- "2"
  CROPDMGEXP[CROPDMGEXP == "H"] <- "2"
})
unique(unlist(stormdatasub[c("PROPDMGEXP", "CROPDMGEXP")], use.names = FALSE))
[1] "3" "6" "0" "9" "5" "4" "2" "7" "1" "8"
# [1] '3' '6' '0' '9' '5' '4' '2' '7' '1' '8'
Combine Exponent and Value
# Turn each exponent into its multiplier (10^exponent), convert the damage
# amounts from text to numbers, and treat amounts that fail to parse as zero.
stormdatasub <- within(stormdatasub, {
  PROPDMGEXP <- 10^as.numeric(PROPDMGEXP)
  CROPDMGEXP <- 10^as.numeric(CROPDMGEXP)
  PROPDMG <- as.numeric(PROPDMG)
  CROPDMG <- as.numeric(CROPDMG)
  PROPDMG[is.na(PROPDMG)] <- 0
  CROPDMG[is.na(CROPDMG)] <- 0
})
Calculate the Total Storm Damage
# Total damage per record = property damage + crop damage, each scaled by its
# multiplier.
stormdatasub$TOTALDMG <- with(
  stormdatasub,
  PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP
)

# Sum the totals per event type (aggregate() names the summed column `x`),
# rank from most to least costly, and show the ten costliest.
damagetype <- with(
  stormdatasub,
  aggregate(x = TOTALDMG, by = list(EVTYPE = EVTYPE), FUN = sum)
)
damagetype <- damagetype[order(damagetype$x, decreasing = TRUE), ]
head(damagetype, n = 10)
EVTYPE x
154 FLOOD 1.503e+11
372 HURRICANE/TYPHOON 7.191e+10
758 TORNADO 5.736e+10
599 STORM SURGE 4.332e+10
212 HAIL 1.876e+10
138 FLASH FLOOD 1.824e+10
84 DROUGHT 1.502e+10
363 HURRICANE 1.461e+10
529 RIVER FLOOD 1.015e+10
387 ICE STORM 8.967e+09
# EVTYPE x 154 FLOOD 150319678257 372 HURRICANE/TYPHOON 71913712800 758
# TORNADO 57362333946 599 STORM SURGE 43323541000 212 HAIL 18761221986 138
# FLASH FLOOD 18243991078 84 DROUGHT 15018672000 363 HURRICANE 14610229010
# 529 RIVER FLOOD 10148404500 387 ICE STORM 8967041360
The graph indicates that the type of event with the greatest economic consequences is the flood.
# Bar chart of total damage for the 20 costliest event types; the x-axis
# labels are rotated so the long event names stay legible.
damagesub <- damagetype[seq_len(20), ]
ggplot(damagesub, aes(x = EVTYPE, y = x)) +
  geom_bar(stat = "Identity", fill = "orange") +
  labs(
    x = "Event Type",
    y = "Damage in Dollars",
    title = "Top 20 Greatest Economical Consequences by Event Type"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  )