Using a U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which “tracks characteristics of major storms and weather events in the United States” between 1950 and 2011, a comparison of the different weather event types was completed to assess which had the largest public health and economic impacts.
Defined either as cumulative total injuries or deaths for all recorded incidences of an event type, tornado was the weather event with worst public health impact. By contrats, flooding had the highest economic impact, as measured by combined crop and property damage estimates.
Data is downloded from the below URL into a temporary file on your computer, and then read into the R environment as a dataframe called ‘df’. It is a large dataframe, so only the variables of interest are kept for analysis, which are:
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
temp <- tempfile()
download.file(url,temp)
df <- read.csv(temp)
unlink(temp)
rm(temp, url)
df <- df[ ,c(8,23:28)]
Some of the exponents on the property damage and crop damage estimates are given as characters (B = billions etc), which needs to be converted to numeric for calculating totals. Any blank or ambiguous entries (eg “+” and “?”) will be made NA. The exponents are applied to the numbers in PROPDMG and CROPDMG to extract the actual estimates in one numeric variable each (property.cost and crop.cost). These two estimates are added for each event to give a combined “economic.cost” variable, with and ‘NA’ missing values assumed to be zero.
df$CROPDMGEXP <- as.character(df$CROPDMGEXP)
df$CROPDMGEXP[df$CROPDMGEXP %in% c("B")] = "9"
df$CROPDMGEXP[df$CROPDMGEXP %in% c("M")] = "6"
df$CROPDMGEXP[df$CROPDMGEXP %in% c("m")] = "6"
df$CROPDMGEXP[df$CROPDMGEXP %in% c("K")] = "3"
df$CROPDMGEXP[df$CROPDMGEXP %in% c("k")] = "3"
df$CROPDMGEXP[df$CROPDMGEXP %in% c("H")] = "2"
df$CROPDMGEXP[df$CROPDMGEXP %in% c("h")] = "2"
df$CROPDMGEXP <- as.numeric(df$CROPDMGEXP)
## Warning: NAs introduced by coercion
df <- dplyr::mutate(df, crop.costs=CROPDMG*(10^CROPDMGEXP))
df$PROPDMGEXP <- as.character(df$PROPDMGEXP)
df$PROPDMGEXP[df$PROPDMGEXP %in% c("B")] = "9"
df$PROPDMGEXP[df$PROPDMGEXP %in% c("M")] = "6"
df$PROPDMGEXP[df$PROPDMGEXP %in% c("m")] = "6"
df$PROPDMGEXP[df$PROPDMGEXP %in% c("K")] = "3"
df$PROPDMGEXP[df$PROPDMGEXP %in% c("H")] = "2"
df$PROPDMGEXP[df$PROPDMGEXP %in% c("h")] = "2"
df$PROPDMGEXP <- as.numeric(df$PROPDMGEXP)
## Warning: NAs introduced by coercion
df <- dplyr::mutate(df, property.costs=PROPDMG*(10^PROPDMGEXP))
df$property.costs[is.na(df$property.costs)==TRUE] <- 0
df$crop.costs[is.na(df$crop.costs)==TRUE] <- 0
df <- dplyr::mutate(df, economic.cost=crop.costs+property.costs)
To estimate public health impact, we look at the total number of injuries recorded for each event type and give the top 10. The same is repeated for deaths.
events <- dplyr::group_by(df,EVTYPE)
events <- dplyr::summarise(events, total.injuries=sum(INJURIES))
events <- events[order(events$total.injuries, decreasing = TRUE), ]
events <- events[c(1:10),]
print(events)
## Source: local data frame [10 x 2]
##
## EVTYPE total.injuries
## (fctr) (dbl)
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
## 7 ICE STORM 1975
## 8 FLASH FLOOD 1777
## 9 THUNDERSTORM WIND 1488
## 10 HAIL 1361
p1 <- ggplot(events, aes(x=EVTYPE, y=log10(total.injuries), fill=EVTYPE))
p1 + geom_bar(stat = "identity", alpha=0.7) + xlab("") + ylab("Log10 total injuries count") + ggtitle("Worst 10 weather events for injuries") +
guides(fill=FALSE) + theme(axis.text.x=element_text(angle = 45, hjust=1, vjust=1))
events <- dplyr::group_by(df,EVTYPE)
events <- dplyr::summarise(events, total.deaths=sum(FATALITIES))
events <- events[order(events$total.deaths, decreasing = TRUE), ]
events <- events[c(1:10),]
print(events)
## Source: local data frame [10 x 2]
##
## EVTYPE total.deaths
## (fctr) (dbl)
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
## 7 FLOOD 470
## 8 RIP CURRENT 368
## 9 HIGH WIND 248
## 10 AVALANCHE 224
p2 <- ggplot(events, aes(x=EVTYPE, y=log10(total.deaths), fill=EVTYPE))
p2 + geom_bar(stat = "identity", alpha=0.7) + xlab("") + ylab("Log10 total deaths count") + ggtitle("Worst 10 weather events for deaths") +
guides(fill=FALSE) + theme(axis.text.x=element_text(angle = 45, hjust=1, vjust=1))
To estimate economic impact we look at estimated total combined cost of property damage and crop loss associated with different weather events, again giving top ten.
events <- dplyr::group_by(df,EVTYPE)
events <- dplyr::summarise(events, cumulative.cost=sum(economic.cost))
events <- events[order(events$cumulative.cost, decreasing = TRUE), ]
events <- events[c(1:10),]
print(events)
## Source: local data frame [10 x 2]
##
## EVTYPE cumulative.cost
## (fctr) (dbl)
## 1 FLOOD 150319678250
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57362333884
## 4 STORM SURGE 43323541000
## 5 HAIL 18761221926
## 6 FLASH FLOOD 18243990872
## 7 DROUGHT 15018672000
## 8 HURRICANE 14610229010
## 9 RIVER FLOOD 10148404500
## 10 ICE STORM 8967041360
p3 <- ggplot(events, aes(x=EVTYPE, y=(cumulative.cost), fill=EVTYPE))
p3 + geom_bar(stat = "identity", alpha=0.7) + xlab("") + ylab("Cumulative cost in property damage and crop loss") + ggtitle("Ten weather events with highest economic impact") +
guides(fill=FALSE) + theme(axis.text.x=element_text(angle = 45, hjust=1, vjust=1))