In this paper we attempt to explore the NOAA Storm Database and answer some basic questions about severe weather events. Storms and other severe weather events cause are a major source of health and economic issues for nearly every person on the planet. This analysis will begin to assess the effect of those events using the data housed by the U.S. National Oceanic and Atmospheric Administration (NOAA) in the US and it’s territories between 1950 and 2011.
Across the United States, which types of events are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
echo = TRUE # Always make code visible
library(ggplot2)
library(plyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following object is masked from 'package:base':
##
## date
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following objects are masked from 'package:plyr':
##
## rename, round_any
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
rawFfile <- "repdata-data-StormData.csv.bz2"
# check if the target file exists, download file if not
if(!file.exists(rawFfile))
{
download(url, "repdata-data-StormData.csv.bz2")
}
# check if the dataframe exists in memory, import it if not
if (!"rawStormData" %in% ls())
{
rawStormData <- read.csv(bzfile("repdata-data-StormData.csv.bz2"))
}
rawStormData$BGN_DATE <- as.Date(rawStormData$BGN_DATE, format = "%m/%d/%Y")
rawStormData$BGN_YEAR<-year(as.Date(rawStormData$BGN_DATE))
count(rawStormData, "BGN_YEAR")
## BGN_YEAR freq
## 1 1950 223
## 2 1951 269
## 3 1952 272
## 4 1953 492
## 5 1954 609
## 6 1955 1413
## 7 1956 1703
## 8 1957 2184
## 9 1958 2213
## 10 1959 1813
## 11 1960 1945
## 12 1961 2246
## 13 1962 2389
## 14 1963 1968
## 15 1964 2348
## 16 1965 2855
## 17 1966 2388
## 18 1967 2688
## 19 1968 3312
## 20 1969 2926
## 21 1970 3215
## 22 1971 3471
## 23 1972 2168
## 24 1973 4463
## 25 1974 5386
## 26 1975 4975
## 27 1976 3768
## 28 1977 3728
## 29 1978 3657
## 30 1979 4279
## 31 1980 6146
## 32 1981 4517
## 33 1982 7132
## 34 1983 8322
## 35 1984 7335
## 36 1985 7979
## 37 1986 8726
## 38 1987 7367
## 39 1988 7257
## 40 1989 10410
## 41 1990 10946
## 42 1991 12522
## 43 1992 13534
## 44 1993 12607
## 45 1994 20631
## 46 1995 27970
## 47 1996 32270
## 48 1997 28680
## 49 1998 38128
## 50 1999 31289
## 51 2000 34471
## 52 2001 34962
## 53 2002 36293
## 54 2003 39752
## 55 2004 39363
## 56 2005 39184
## 57 2006 44034
## 58 2007 43289
## 59 2008 55663
## 60 2009 45817
## 61 2010 48161
## 62 2011 62174
As you can see from the data above, there is a significant change in number of observations made after 1981. So we will restrict the analysis to the 1982 - 2011 timeperiod.
StormDataClean <- rawStormData[rawStormData$BGN_YEAR > 1981,]
The property and crop damage estimates came in a difficult format with the value in one column and the magnitude of the value in a second column. To handle this, first we inspect the Unique exponents, then devise assign the appropriate power of 10. These will be used to create 2 new columns with the values in US Dollars (USD).
unique(rawStormData$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(rawStormData$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
exponentLabel <- c("", "+", "-", "?", 0:9, "h", "H", "k", "K", "m", "M", "b", "B");
exponentFactor <- c(0, 0, 0, 0, 0:9, 2, 2, 3, 3, 6, 6, 9, 9)
expLookUp <- data.frame (exponentLabel, exponentFactor)
#Expand damage value by the appropriate exponent factor
StormDataClean$PROPDMG_USD <- StormDataClean$PROPDMG*10^expLookUp[match(StormDataClean$PROPDMGEXP,expLookUp$exponentLabel),2]
StormDataClean$CROPDMG_USD <- StormDataClean$CROPDMG*10^expLookUp[match(StormDataClean$CROPDMGEXP,expLookUp$exponentLabel),2]
Aggregate and inspect the results.
StormDataAgg <- ddply(StormDataClean, .(EVTYPE), summarize, personFatals = sum(FATALITIES), personInjurs = sum(INJURIES), ecoCostProp = sum(PROPDMG_USD), ecoCostCrop = sum(CROPDMG_USD))
subset(StormDataAgg, grepl("WILD", EVTYPE))
## EVTYPE personFatals personInjurs ecoCostProp ecoCostCrop
## 954 WILD FIRES 3 150 624100000 0
## 955 WILD/FOREST FIRE 12 545 3001829500 106796830
## 956 WILD/FOREST FIRES 0 0 20000 12000
## 957 WILDFIRE 75 911 4765114000 295472800
## 958 WILDFIRES 0 0 100500000 500000
As you can see from the example above, there are multiple names for a single event type in this data. The example of WILDFIRE and WILDFIRES shows variability. We will attempt to correct for the major problems by standardizing some of the Event Types in the code below.
StormDataClean$EVTYPE[grepl("AVALANCE", StormDataClean$EVTYPE)] <-"AVALANCHE"
StormDataClean$EVTYPE[grepl("Coastal Flood|Coastal Flooding|COASTAL FLOODING", StormDataClean$EVTYPE)] <-"COASTAL FLOOD"
StormDataClean$EVTYPE[grepl("DENSE FOG", StormDataClean$EVTYPE)] <-"FOG"
StormDataClean$EVTYPE[grepl("EXCESSIVE HEAT", StormDataClean$EVTYPE)] <-"HEAT"
StormDataClean$EVTYPE[grepl("FLASH FLOOD/FLOOD", StormDataClean$EVTYPE)] <-"FLASH FLOOD"
StormDataClean$EVTYPE[grepl("FLASH FLOODING", StormDataClean$EVTYPE)] <-"FLASH FLOOD"
StormDataClean$EVTYPE[grepl("FLOOD/FLASH FLOOD", StormDataClean$EVTYPE)] <-"FLASH FLOOD"
StormDataClean$EVTYPE[grepl("Heat Wave|HEAT WAVE|HEAT WAVES", StormDataClean$EVTYPE)] <-"HEAT"
StormDataClean$EVTYPE[grepl("HEAVY SURF/HIGH SURF", StormDataClean$EVTYPE)] <-"HIGH SURF"
StormDataClean$EVTYPE[grepl("Heavy Surf", StormDataClean$EVTYPE)] <-"HIGH SURF"
StormDataClean$EVTYPE[grepl("HIGH WINDS", StormDataClean$EVTYPE)] <-"HIGH WIND"
StormDataClean$EVTYPE[grepl("HIGH WINDS", StormDataClean$EVTYPE)] <-"HIGH WIND"
StormDataClean$EVTYPE[grepl("HURRICANE", StormDataClean$EVTYPE)] <-"HURRICANE/TYPHOON"
StormDataClean$EVTYPE[grepl("TYPHOON", StormDataClean$EVTYPE)] <-"HURRICANE/TYPHOON"
StormDataClean$EVTYPE[grepl("LIGHTING", StormDataClean$EVTYPE)] <-"LIGHTNING"
StormDataClean$EVTYPE[grepl("RIP CURRENTS", StormDataClean$EVTYPE)] <-"RIP CURRENT"
StormDataClean$EVTYPE[grepl("THUNDERTORM", StormDataClean$EVTYPE)] <- "THUNDERSTORM"
StormDataClean$EVTYPE[grepl("THUNDERSTORM",StormDataClean$EVTYPE)] <- "THUNDERSTORM"
StormDataClean$EVTYPE[grepl("THUNDERSTORM WINDS", StormDataClean$EVTYPE)] <-"THUNDERSTORM WIND"
StormDataClean$EVTYPE[grepl("TSTM WIND", StormDataClean$EVTYPE)] <-"THUNDERSTORM WIND"
StormDataClean$EVTYPE[grepl("TSTM WIND/HAIL", StormDataClean$EVTYPE)] <-"THUNDERSTORM WIND"
StormDataClean$EVTYPE[grepl("TORNADOES, TSTM WIND, HAIL", StormDataClean$EVTYPE)] <-"TORNADO"
StormDataClean$EVTYPE[grepl("WATERSPOUT/TORNADO", StormDataClean$EVTYPE)] <-"TORNADO"
StormDataClean$EVTYPE[grepl("WILDFIRES", StormDataClean$EVTYPE)] <-"WILD FIRE"
## Warning in `[<-.factor`(`*tmp*`, grepl("WILDFIRES", StormDataClean
## $EVTYPE), : invalid factor level, NA generated
StormDataAgg <- ddply(StormDataClean, .(EVTYPE), summarize, personFatals = sum(FATALITIES), personInjurs = sum(INJURIES), ecoCostProp = sum(PROPDMG_USD), ecoCostCrop = sum(CROPDMG_USD))
StormDataAgg<-StormDataAgg[complete.cases(StormDataAgg),]
The following graph shows the relationship between the different types of major weather events and the average number of people’s health effected. The focus is on events with More Than 100 Fatalities or 200 Injuries
StormDataAggC<-subset(StormDataAgg,personFatals > 100 | personInjurs > 200)
StormDataAggC<-StormDataAggC[,c('EVTYPE','personFatals','personInjurs')]
StormDataAggC$EVTYPE <- factor(StormDataAggC$EVTYPE, levels = StormDataAggC$EVTYPE[order(StormDataAggC$personInjurs, decreasing=TRUE)])
StormDataAggC.m <- melt(StormDataAggC, id.vars='EVTYPE')
ggplot(StormDataAggC.m, aes(EVTYPE, value)) +
geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
xlab("Weather Event")+
ylab('Number of Humans Involved (Log 2 Scale)')+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_y_sqrt("Number of Humans Involved (Log 2 Scale)")+
scale_fill_discrete(name="Type of Involvement",labels=c("Fatality", "Injury"))+
ggtitle("Total Injuries and Fatalities by Weather Event in the U.S. 1982 - 2011")
The following graph shows the relationship between the different types of major weather events and the damage done to property and crops. The focus is on events with More Than 1 Billion US Dollars in Property or 100 Million US Dollars in Crop damage.
StormDataAggC<-subset(StormDataAgg,ecoCostProp > 1e+09 | ecoCostCrop > 1e+08)
StormDataAggC<-StormDataAggC[,c('EVTYPE','ecoCostProp','ecoCostCrop')]
StormDataAggC$EVTYPE <- factor(StormDataAggC$EVTYPE, levels = StormDataAggC$EVTYPE[order(StormDataAggC$ecoCostProp, decreasing=TRUE)])
StormDataAggC$ecoCostProp = StormDataAggC$ecoCostProp/(10^6)
StormDataAggC$ecoCostCrop = StormDataAggC$ecoCostCrop/(10^6)
StormDataAggC.m <- melt(StormDataAggC, id.vars='EVTYPE')
ggplot(StormDataAggC.m, aes(EVTYPE, value)) +
geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
xlab("Weather Event")+
scale_y_sqrt("Damage (Millions $) (Log 2 Scale)")+
scale_fill_discrete(name="Damage",labels=c("Property", "Crop"))+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("Total Ecomonic Damage by Weather Event in the U.S. 1982 - 2011")