This is the 2nd Peer Assessment assignment for Reproducible Research, a Coursera class through Johns Hopkins. The data comes from the NOAA storm database. We are tasked to provide information and analysis regarding the impacts of storm events on population health in the US and the economic consequences of those events.
The analysis file is broken up into three sections.
Libraries
library(reshape2)
Set the working directory
setwd("~/Git/RepData_PeerAssessment2")
Define the data directory and filename
fileDirec <- "~/Data Science Specialization/Reproducible Research/Week3/repdata-data-StormData.csv.bz2"
Download the data
if(!file.exists(fileDirec)) {
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", fileDirec)
}
Read in the csv.bz2 data file
rawData <- read.csv(bzfile(fileDirec), stringsAsFactors = FALSE)
Extract relevant fields for this analysis to data data frame
data <- data.frame(startDate = as.Date(rawData$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"), state = rawData$STATE, event = toupper(rawData$EVTYPE), fatalities = rawData$FATALITIES, injuries = rawData$INJURIES, propertyDmg = rawData$PROPDMG, propertyDmgAmt = toupper(rawData$PROPDMGEXP), cropDmg = rawData$CROPDMG, cropDmgAmt = toupper(rawData$CROPDMGEXP))
head(data)
## startDate state event fatalities injuries propertyDmg propertyDmgAmt
## 1 1950-04-18 AL TORNADO 0 15 25.0 K
## 2 1950-04-18 AL TORNADO 0 0 2.5 K
## 3 1951-02-20 AL TORNADO 0 2 25.0 K
## 4 1951-06-08 AL TORNADO 0 2 2.5 K
## 5 1951-11-15 AL TORNADO 0 2 2.5 K
## 6 1951-11-15 AL TORNADO 0 6 2.5 K
## cropDmg cropDmgAmt
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
Clean summary lines from data set
data <- data[grepl("(.*)SUMMARY(.*)", data$event, ignore.case = TRUE) == FALSE,]
Create a table of event frequency by year to identify years for inclusion
yearBrkdwn <- table(as.factor(as.numeric(as.POSIXlt(data$startDate)$year) + 1900))
yearBrkdwn
##
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961
## 223 269 272 492 609 1413 1703 2184 2213 1813 1945 2246
## 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
## 2389 1968 2348 2855 2388 2688 3312 2926 3215 3471 2168 4463
## 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985
## 5386 4975 3768 3728 3657 4279 6146 4517 7132 8322 7335 7979
## 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## 8726 7367 7257 10410 10946 12522 13534 12607 20631 27970 32198 28676
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
## 38128 31289 34471 34962 36293 39752 39363 39184 44034 43289 55663 45817
## 2010 2011
## 48161 62174
Examine the large jump in the number of reported events between 1993 and 1995
yearBrkdwn[c("1992", "1993", "1994", "1995", "1996")]
##
## 1992 1993 1994 1995 1996
## 13534 12607 20631 27970 32198
Plot event frequency by year with a highlight on the period between 1993 and 1995
plot(yearBrkdwn, xlab = "Year", ylab = "Number of Events", main = "Number of Events by Year", col = "blue4")
abline(a = yearBrkdwn["1995"], b = 0, col="red")
text(1955, yearBrkdwn["1995"], "1995", cex = .75, pos = 3)
abline(a = yearBrkdwn["1993"], b = 0, col="red")
text(1955, yearBrkdwn["1993"], "1993", cex = .75, pos = 3)
Examine the percent change rate from 1993 to 1995
pctChange <- as.numeric(((yearBrkdwn["1995"] - yearBrkdwn["1993"]) / yearBrkdwn["1993"]) * 100)
We see a change rate of 121.8609% over the two year span from 1993 to 1995.
Examine the percent change rate for all 2 year periods in the data set
## Set first two entries to NA due to 2 year lag
twoYrChangePct <- vector()
for(i in 1:nrow(yearBrkdwn)){
if(i < 3){
twoYrChangePct[i] <- NA
} else {
twoYrChangePct[i] <- ((yearBrkdwn[i] - yearBrkdwn[i-2]) / yearBrkdwn[i-2]) * 100
}
}
twoYrChng <- data.frame(year = as.numeric(names(yearBrkdwn)), twoYrChangePct)
twoYrChng <- twoYrChng[order(twoYrChng$twoYrChangePct, decreasing = TRUE),]
## First ten years excluded to avoid outsized impact of small number of observations in those years on two year change percent.
head(twoYrChng[twoYrChng$year > min(twoYrChng$year) + 10,])
## year twoYrChangePct
## 25 1974 148.43
## 46 1995 121.86
## 34 1983 84.24
## 31 1980 68.06
## 47 1996 56.07
## 45 1994 52.44
We find that 1993 to 1995 is the 2nd largest 2 year increase outside of the first ten years. The largest is 1972 to 1974.
Look at the 5 years around and including 1972
yearBrkdwn[c("1970", "1971", "1972", "1973", "1974")]
##
## 1970 1971 1972 1973 1974
## 3215 3471 2168 4463 5386
We see that an unexplained single-year drop seems to be responsible for 1972’s large change rate, rather than a trend change.
The large change from 1993 to 1995 however, seems to indicate a change in record keeping method and quality, rather than true jump in occurrence. With that in mind, we will start our analysis with 1995, and exclude the years prior to that which show a smaller number of reported events.
Exclude years in data set before 1995
data <- data[data$startDate >= as.Date("1995-01-01"),]
Clean the event listing in the eventCln column
data$event <- data$event
data$eventCln <- data$event
data$eventCln <- gsub("(.*)COLD(.*)|(.*)WIND\\s?CHILL(.*)", "COLD/WINDCHILL", data$eventCln)
data$eventCln <- gsub("(.*)SURF(.*)|(.*)SEA(.*)|(.*)SWELL(.*)", "HEAVY SURF/SEAS", data$eventCln)
data$eventCln <- gsub("(.*)HEAT(.*)", "HEAT", data$eventCln)
data$eventCln <- gsub("(.*)FOG(.*)", "FOG", data$eventCln)
data$eventCln <- gsub("(.*)HEAT(.*)", "HEAT", data$eventCln)
data$eventCln <- gsub("(.*)FLOOD(.*)", "FLOOD", data$eventCln)
data$eventCln <- gsub("(.*)RIP CURRENT(.*)", "RIP CURRENT", data$eventCln)
data$eventCln <- gsub("(.*)EROSION(.*)", "COASTAL EROSION", data$eventCln)
data$eventCln <- gsub("(.*)LANDSLIDE(.*)|(.*)LANDSLUMP(.*)", "LANDSLIDE", data$eventCln)
data$eventCln <- gsub("(.*)HURRICANE(.*)", "HURRICANE", data$eventCln)
data$eventCln <- gsub("(.*)MARINE(.*)WIND", "MARINE WIND", data$eventCln)
data$eventCln <- gsub("(.*)FIRE(.*)", "WILDFIRE", data$eventCln)
data$eventCln <- gsub("(.*)TSTM(.*)|(.*)THUNDERSTORM(.*)", "THUNDERSTORM", data$eventCln)
data$eventCln <- gsub("(.*)WINTER(.*)", "WINTER WEATHER", data$eventCln)
data$eventCln <- gsub("(.*)RAIN/SNOW(.*)|(.*)MIX(.*)", "MIXED PRECIP", data$eventCln)
data$eventCln <- gsub("(.*)MUD\\s?SLIDE(.*)", "MUD SLIDE", data$eventCln)
data$eventCln <- gsub("(.*)DUST(.*)", "DUST STORM", data$eventCln)
data$eventCln <- gsub("(.*)FREEZE(.*)", "FREEZE", data$eventCln)
data$eventCln <- gsub("(.*)RAIN(.*)|(.*)BURST(.*)|(.*)SPOUT(.*)", "RAIN", data$eventCln)
data$eventCln <- gsub("(.*)SNOW(.*)", "SNOW", data$eventCln)
data$eventCln <- gsub("(.*)MARINE MISHAP(.*)", "MARINE ACCIDENT", data$eventCln)
data$eventCln <- gsub("(.*)LIG.TNING(.*)", "LIGHTNING", data$eventCln)
data$eventCln <- gsub("(.*)HIGH WIND(.*)", "HIGH WIND", data$eventCln)
data$eventCln <- gsub("(.*)GLAZE(.*)|(.*)FREEZING DRIZZLE(.*)", "ICE STORM", data$eventCln)
data$eventCln <- gsub("(.*)TROPICAL(.*)", "TROPICAL STORM", data$eventCln)
data$eventCln <- gsub("(.*)TORNADO(.*)|(.*)FUNNEL CLOUD(.*)|(.*)GUSTNADO(.*)", "TORNADO", data$eventCln)
data$eventCln <- gsub("(.*)STORM SURGE(.*)", "STORM SURGE", data$eventCln)
data$eventCln <- gsub("(.*)HAIL(.*)", "HAIL", data$eventCln)
data$eventCln <- gsub("(.*)WIND(.*)", "HIGH WIND", data$eventCln)
data$eventCln <- gsub("(.*)ROAD(.*)", "ICY ROAD", data$eventCln)
Remove items from the data set with no impacts on population health
hlthData <- data[data$injuries != 0 & data$fatalities != 0,]
Calculate at the overall injury numbers by event
injEvnts <- aggregate(injuries ~ eventCln, data = hlthData, sum)
injEvnts <- injEvnts[order(injEvnts$injuries, decreasing = TRUE),]
head(injEvnts, 10)
## eventCln injuries
## 25 TORNADO 13553
## 10 HEAT 6439
## 6 FLOOD 3298
## 13 HURRICANE 1237
## 30 WINTER WEATHER 828
## 17 LIGHTNING 579
## 24 THUNDERSTORM 553
## 12 HIGH WIND 506
## 7 FOG 432
## 3 BLIZZARD 312
Calculate at the overall fatality numbers by event
ftlEvnts <- aggregate(fatalities ~ eventCln, data = hlthData, sum)
ftlEvnts <- ftlEvnts[order(ftlEvnts$fatalities, decreasing = TRUE),]
head(ftlEvnts, 10)
## eventCln fatalities
## 25 TORNADO 1338
## 10 HEAT 496
## 17 LIGHTNING 256
## 6 FLOOD 247
## 12 HIGH WIND 183
## 24 THUNDERSTORM 165
## 30 WINTER WEATHER 110
## 21 RIP CURRENT 81
## 29 WILDFIRE 62
## 22 SNOW 53
So we can see that tornadoes and heat related issues are the events most harmful to overall population health.
Calculate the percent of total injuries and fatalities that are Tornado related.
torInjPct <- (injEvnts$injuries[injEvnts$eventCln == "TORNADO"] / sum(injEvnts$injuries)) * 100
torFtlPct <- (ftlEvnts$fatalities[ftlEvnts$eventCln == "TORNADO"] / sum(ftlEvnts$fatalities)) * 100
In fact, we find that Tornadoes alone account for 46.1709% of all injuries in our data set, and 39.3414% of all fatalities in our data set.
For both crop and property damage, exclude values from the dataset where crop/property damage = 0 or is missing
pData <- data[data$propertyDmg != 0 & !is.na(data$propertyDmg),]
cData <- data[data$cropDmg != 0 & !is.na(data$propertyDmg),]
Determine the importance of ambiguous values (not = B, K, or M)
pDPctAmb <- (nrow(pData[!pData$propertyDmgAmt %in% c("B","K","M"),]) / nrow(pData)) * 100
cDPctAmb <- (nrow(cData[!cData$cropDmgAmt %in% c("B","K","M"),]) / nrow(cData)) * 100
The ambiguous values of propertyDmgAmt make up 0.1448% of the total number of values, and the ambiguous values of cropDmgAmt make up 0.0204% of the total number of values.
While these are very small percentages, that does not tell the whole story. We must still be careful about excluding values when we do not know their true impact on the data. This is particularly important in a variable that can represent a multiplier of another variable. In cases like this, even a very small number of observations can have a large effect on total.
Property damage thousands vs billions
thouTotal <- sum(pData$propertyDmg[pData$propertyDmgAmt == "K"]) * 1000
bilTotal <- sum(pData$propertyDmg[pData$propertyDmgAmt == "B"]) * 1000000000
obsRatio <- 1/(nrow(pData[pData$propertyDmgAmt == "B",]) / nrow(pData[pData$propertyDmgAmt == "K",]))
For example, while there are 5152.2973 property damage observations measured in thousands of dollars for each observation measured in billions, the total value of the observations measured in billions (2.6425 × 1011) far exceeds the total value of the observations measured in thousands (7.8483 × 109).
So while we can’t know the true value of the ambiguous entries, we also can’t account for them accurately in our analysis due to lack of information. At this point, the best we can do is exclude them from the analysis while acknowledging the problem they create, and continue to look for more information on their meaning for possible consideration at a later date
Exclude ambiguous values from data set
pData <- pData[pData$propertyDmgAmt %in% c("B","K","M"),]
cData <- cData[cData$cropDmgAmt %in% c("B","K","M"),]
Add multiplier column to property and crop damage table
valConv <- data.frame(abbrv = c("K", "M", "B"), mult = c(1000, 1000000, 1000000000))
pData <- merge(pData, valConv, by.x = "propertyDmgAmt", by.y = "abbrv")
cData <- merge(cData, valConv, by.x = "cropDmgAmt", by.y = "abbrv")
Calculate total damage amount per observation
pData$totalPDmg <- pData$propertyDmg * pData$mult
cData$totalCDmg <- cData$cropDmg * cData$mult
Calculate the damage per event type
pDmgEvent <- aggregate(totalPDmg ~ eventCln, data = pData, sum)
cDmgEvent <- aggregate(totalCDmg ~ eventCln, data = cData, sum)
pDmgEvent <- pDmgEvent[order(pDmgEvent$totalPDmg, decreasing = TRUE),]
cDmgEvent <- cDmgEvent[order(cDmgEvent$totalCDmg, decreasing = TRUE),]
head(pDmgEvent)
## eventCln totalPDmg
## 11 FLOOD 1.602e+11
## 19 HURRICANE 8.465e+10
## 34 STORM SURGE 4.783e+10
## 36 TORNADO 2.493e+10
## 15 HAIL 1.529e+10
## 35 THUNDERSTORM 9.856e+09
head(cDmgEvent)
## eventCln totalCDmg
## 2 DROUGHT 1.392e+10
## 6 FLOOD 7.032e+09
## 13 HURRICANE 5.515e+09
## 9 HAIL 2.635e+09
## 12 HIGH WIND 2.160e+09
## 7 FREEZE 1.839e+09
From this we can see that flooding has the greatest economic impact in terms of property damage, while drought has the largest negative impact on crops.
Create a bar plot of high-impact events
combDmg <- merge(pDmgEvent, cDmgEvent, by = "eventCln", all = TRUE)
combDmg[is.na(combDmg)] <- 0
combDmg$totalDmg <- combDmg$totalCDmg + combDmg$totalPDmg
## Look at high impact events; those with the highest 25% of total impact
highImpact <- combDmg[combDmg$totalDmg >= quantile(combDmg$totalDmg, probs = .75)[["75%"]],]
## Measure impact in billions of dollars
highImpact$totalCDmg <- highImpact$totalCDmg / 1000000000
highImpact$totalPDmg <- highImpact$totalPDmg / 1000000000
## Reshape data set for plotting
hIMelt <- melt(highImpact[,1:3], id = "eventCln")
highImpactTx <- acast(hIMelt, variable~eventCln)
par(mar = c(10,5,3,3) + 0.1)
barplot(highImpactTx, col = c("gold1", "firebrick1"), main = "Economic Impact of Highest Impact Events", ylab = "Economic Impact (Billion US$)", las = 3, cex.axis = .85, legend.text = c("Property Damage", "Crop Damage"), args.legend = (x = "topright"))
And so we see flooding and hurricanes having far and away the greatest economic consequences, with storm surges and tornadoes also standing out for their considerable property and crop damage impact.
sessionInfo()
## R version 3.0.3 (2014-03-06)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.3 evaluate_0.5.3 formatR_0.10 htmltools_0.2.4
## [5] knitr_1.6 plyr_1.8.1 Rcpp_0.11.1 rmarkdown_0.2.46
## [9] stringr_0.6.2 tools_3.0.3 yaml_2.1.11