Synopsis

This is the 2nd Peer Assessment assignment for Reproducible Research, a Coursera class through Johns Hopkins. The data comes from the NOAA storm database. We are tasked to provide information and analysis regarding the impacts of storm events on population health in the US and the economic consequences of those events.

The analysis file is broken up into three sections.


Data Processing

Libraries

library(reshape2)

Set the working directory

setwd("~/Git/RepData_PeerAssessment2")

Define the data directory and filename

fileDirec <- "~/Data Science Specialization/Reproducible Research/Week3/repdata-data-StormData.csv.bz2"

Download the data

if(!file.exists(fileDirec)) {
     download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", fileDirec)
}

Read in the csv.bz2 data file

rawData <- read.csv(bzfile(fileDirec), stringsAsFactors = FALSE)

Extract relevant fields for this analysis to data data frame

data <- data.frame(startDate = as.Date(rawData$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"), state = rawData$STATE, event = toupper(rawData$EVTYPE), fatalities = rawData$FATALITIES, injuries = rawData$INJURIES, propertyDmg = rawData$PROPDMG, propertyDmgAmt = toupper(rawData$PROPDMGEXP), cropDmg = rawData$CROPDMG, cropDmgAmt = toupper(rawData$CROPDMGEXP))

head(data)
##    startDate state   event fatalities injuries propertyDmg propertyDmgAmt
## 1 1950-04-18    AL TORNADO          0       15        25.0              K
## 2 1950-04-18    AL TORNADO          0        0         2.5              K
## 3 1951-02-20    AL TORNADO          0        2        25.0              K
## 4 1951-06-08    AL TORNADO          0        2         2.5              K
## 5 1951-11-15    AL TORNADO          0        2         2.5              K
## 6 1951-11-15    AL TORNADO          0        6         2.5              K
##   cropDmg cropDmgAmt
## 1       0           
## 2       0           
## 3       0           
## 4       0           
## 5       0           
## 6       0

Clean summary lines from data set

data <- data[grepl("(.*)SUMMARY(.*)", data$event, ignore.case = TRUE) ==  FALSE,]

Create a table of event frequency by year to identify years for inclusion

yearBrkdwn <- table(as.factor(as.numeric(as.POSIXlt(data$startDate)$year) + 1900))

yearBrkdwn
## 
##  1950  1951  1952  1953  1954  1955  1956  1957  1958  1959  1960  1961 
##   223   269   272   492   609  1413  1703  2184  2213  1813  1945  2246 
##  1962  1963  1964  1965  1966  1967  1968  1969  1970  1971  1972  1973 
##  2389  1968  2348  2855  2388  2688  3312  2926  3215  3471  2168  4463 
##  1974  1975  1976  1977  1978  1979  1980  1981  1982  1983  1984  1985 
##  5386  4975  3768  3728  3657  4279  6146  4517  7132  8322  7335  7979 
##  1986  1987  1988  1989  1990  1991  1992  1993  1994  1995  1996  1997 
##  8726  7367  7257 10410 10946 12522 13534 12607 20631 27970 32198 28676 
##  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009 
## 38128 31289 34471 34962 36293 39752 39363 39184 44034 43289 55663 45817 
##  2010  2011 
## 48161 62174

Examine the large jump in the number of reported events between 1993 and 1995

yearBrkdwn[c("1992", "1993", "1994", "1995", "1996")]
## 
##  1992  1993  1994  1995  1996 
## 13534 12607 20631 27970 32198

Plot event frequency by year with a highlight on the period between 1993 and 1995

plot(yearBrkdwn, xlab = "Year", ylab = "Number of Events", main = "Number of Events by Year", col = "blue4")
abline(a = yearBrkdwn["1995"], b = 0, col="red")
text(1955, yearBrkdwn["1995"], "1995", cex = .75, pos = 3)
abline(a = yearBrkdwn["1993"], b = 0, col="red")
text(1955, yearBrkdwn["1993"], "1993", cex = .75, pos = 3)

plot of chunk byYearPlot

Examine the percent change rate from 1993 to 1995

pctChange <- as.numeric(((yearBrkdwn["1995"] - yearBrkdwn["1993"]) / yearBrkdwn["1993"]) * 100)

We see a change rate of 121.8609% over the two year span from 1993 to 1995.

Examine the percent change rate for all 2 year periods in the data set

## Set first two entries to NA due to 2 year lag
twoYrChangePct <- vector()
for(i in 1:nrow(yearBrkdwn)){
     if(i < 3){
          twoYrChangePct[i] <- NA
     } else {
     twoYrChangePct[i] <- ((yearBrkdwn[i] - yearBrkdwn[i-2]) / yearBrkdwn[i-2]) * 100
     }
}

twoYrChng <- data.frame(year = as.numeric(names(yearBrkdwn)), twoYrChangePct)

twoYrChng <- twoYrChng[order(twoYrChng$twoYrChangePct, decreasing = TRUE),]

## First ten years excluded to avoid outsized impact of small number of observations in those years on two year change percent.
head(twoYrChng[twoYrChng$year > min(twoYrChng$year) + 10,])
##    year twoYrChangePct
## 25 1974         148.43
## 46 1995         121.86
## 34 1983          84.24
## 31 1980          68.06
## 47 1996          56.07
## 45 1994          52.44

We find that 1993 to 1995 is the 2nd largest 2 year increase outside of the first ten years. The largest is 1972 to 1974.

Look at the 5 years around and including 1972

yearBrkdwn[c("1970", "1971", "1972", "1973", "1974")]
## 
## 1970 1971 1972 1973 1974 
## 3215 3471 2168 4463 5386

We see that an unexplained single-year drop seems to be responsible for 1972’s large change rate, rather than a trend change.

The large change from 1993 to 1995 however, seems to indicate a change in record keeping method and quality, rather than true jump in occurrence. With that in mind, we will start our analysis with 1995, and exclude the years prior to that which show a smaller number of reported events.

Exclude years in data set before 1995

data <- data[data$startDate >= as.Date("1995-01-01"),]

Clean the event listing in the eventCln column

data$event <- data$event

data$eventCln <- data$event

data$eventCln <- gsub("(.*)COLD(.*)|(.*)WIND\\s?CHILL(.*)", "COLD/WINDCHILL", data$eventCln)

data$eventCln <- gsub("(.*)SURF(.*)|(.*)SEA(.*)|(.*)SWELL(.*)", "HEAVY SURF/SEAS", data$eventCln)

data$eventCln <- gsub("(.*)HEAT(.*)", "HEAT", data$eventCln)

data$eventCln <- gsub("(.*)FOG(.*)", "FOG", data$eventCln)

data$eventCln <- gsub("(.*)HEAT(.*)", "HEAT", data$eventCln)

data$eventCln <- gsub("(.*)FLOOD(.*)", "FLOOD", data$eventCln)

data$eventCln <- gsub("(.*)RIP CURRENT(.*)", "RIP CURRENT", data$eventCln)

data$eventCln <- gsub("(.*)EROSION(.*)", "COASTAL EROSION", data$eventCln)

data$eventCln <- gsub("(.*)LANDSLIDE(.*)|(.*)LANDSLUMP(.*)", "LANDSLIDE", data$eventCln)

data$eventCln <- gsub("(.*)HURRICANE(.*)", "HURRICANE", data$eventCln)

data$eventCln <- gsub("(.*)MARINE(.*)WIND", "MARINE WIND", data$eventCln)

data$eventCln <- gsub("(.*)FIRE(.*)", "WILDFIRE", data$eventCln)

data$eventCln <- gsub("(.*)TSTM(.*)|(.*)THUNDERSTORM(.*)", "THUNDERSTORM", data$eventCln)

data$eventCln <- gsub("(.*)WINTER(.*)", "WINTER WEATHER", data$eventCln)

data$eventCln <- gsub("(.*)RAIN/SNOW(.*)|(.*)MIX(.*)", "MIXED PRECIP", data$eventCln)

data$eventCln <- gsub("(.*)MUD\\s?SLIDE(.*)", "MUD SLIDE", data$eventCln)

data$eventCln <- gsub("(.*)DUST(.*)", "DUST STORM", data$eventCln)

data$eventCln <- gsub("(.*)FREEZE(.*)", "FREEZE", data$eventCln)

data$eventCln <- gsub("(.*)RAIN(.*)|(.*)BURST(.*)|(.*)SPOUT(.*)", "RAIN", data$eventCln)

data$eventCln <- gsub("(.*)SNOW(.*)", "SNOW", data$eventCln)

data$eventCln <- gsub("(.*)MARINE MISHAP(.*)", "MARINE ACCIDENT", data$eventCln)

data$eventCln <- gsub("(.*)LIG.TNING(.*)", "LIGHTNING", data$eventCln)

data$eventCln <- gsub("(.*)HIGH WIND(.*)", "HIGH WIND", data$eventCln)

data$eventCln <- gsub("(.*)GLAZE(.*)|(.*)FREEZING DRIZZLE(.*)", "ICE STORM", data$eventCln)

data$eventCln <- gsub("(.*)TROPICAL(.*)", "TROPICAL STORM", data$eventCln)
                      
data$eventCln <- gsub("(.*)TORNADO(.*)|(.*)FUNNEL CLOUD(.*)|(.*)GUSTNADO(.*)", "TORNADO", data$eventCln)

data$eventCln <- gsub("(.*)STORM SURGE(.*)", "STORM SURGE", data$eventCln)

data$eventCln <- gsub("(.*)HAIL(.*)", "HAIL", data$eventCln)

data$eventCln <- gsub("(.*)WIND(.*)", "HIGH WIND", data$eventCln)

data$eventCln <- gsub("(.*)ROAD(.*)", "ICY ROAD", data$eventCln)

Data Assumptions

  • All data in the data set is considered data regarding the United States, since it is collected by NOAA using US based sources.

Results

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Remove items from the data set with no impacts on population health

hlthData <- data[data$injuries != 0 & data$fatalities != 0,]

Calculate at the overall injury numbers by event

injEvnts <- aggregate(injuries ~ eventCln, data = hlthData, sum)

injEvnts <- injEvnts[order(injEvnts$injuries, decreasing = TRUE),]

head(injEvnts, 10)
##          eventCln injuries
## 25        TORNADO    13553
## 10           HEAT     6439
## 6           FLOOD     3298
## 13      HURRICANE     1237
## 30 WINTER WEATHER      828
## 17      LIGHTNING      579
## 24   THUNDERSTORM      553
## 12      HIGH WIND      506
## 7             FOG      432
## 3        BLIZZARD      312

Calculate at the overall fatality numbers by event

ftlEvnts <- aggregate(fatalities ~ eventCln, data = hlthData, sum)

ftlEvnts <- ftlEvnts[order(ftlEvnts$fatalities, decreasing = TRUE),]

head(ftlEvnts, 10)
##          eventCln fatalities
## 25        TORNADO       1338
## 10           HEAT        496
## 17      LIGHTNING        256
## 6           FLOOD        247
## 12      HIGH WIND        183
## 24   THUNDERSTORM        165
## 30 WINTER WEATHER        110
## 21    RIP CURRENT         81
## 29       WILDFIRE         62
## 22           SNOW         53

So we can see that tornadoes and heat related issues are the events most harmful to overall population health.

Calculate the percent of total injuries and fatalities that are Tornado related.

torInjPct <- (injEvnts$injuries[injEvnts$eventCln == "TORNADO"] / sum(injEvnts$injuries)) * 100

torFtlPct <- (ftlEvnts$fatalities[ftlEvnts$eventCln == "TORNADO"] / sum(ftlEvnts$fatalities)) * 100

In fact, we find that Tornadoes alone account for 46.1709% of all injuries in our data set, and 39.3414% of all fatalities in our data set.


Across the United States, which types of events have the greatest economic consequences?

For both crop and property damage, exclude values from the dataset where crop/property damage = 0 or is missing

pData <- data[data$propertyDmg != 0 & !is.na(data$propertyDmg),]

cData <- data[data$cropDmg != 0 & !is.na(data$propertyDmg),]

Ambiguous Damage Values Analysis

Determine the importance of ambiguous values (not = B, K, or M)

pDPctAmb <- (nrow(pData[!pData$propertyDmgAmt %in% c("B","K","M"),]) / nrow(pData)) * 100

cDPctAmb <- (nrow(cData[!cData$cropDmgAmt %in% c("B","K","M"),]) / nrow(cData)) * 100

The ambiguous values of propertyDmgAmt make up 0.1448% of the total number of values, and the ambiguous values of cropDmgAmt make up 0.0204% of the total number of values.

While these are very small percentages, that does not tell the whole story. We must still be careful about excluding values when we do not know their true impact on the data. This is particularly important in a variable that can represent a multiplier of another variable. In cases like this, even a very small number of observations can have a large effect on total.

Property damage thousands vs billions

thouTotal <- sum(pData$propertyDmg[pData$propertyDmgAmt == "K"]) * 1000

bilTotal <- sum(pData$propertyDmg[pData$propertyDmgAmt == "B"]) * 1000000000

obsRatio <- 1/(nrow(pData[pData$propertyDmgAmt == "B",]) / nrow(pData[pData$propertyDmgAmt == "K",]))

For example, while there are 5152.2973 property damage observations measured in thousands of dollars for each observation measured in billions, the total value of the observations measured in billions (2.6425 × 1011) far exceeds the total value of the observations measured in thousands (7.8483 × 109).

So while we can’t know the true value of the ambiguous entries, we also can’t account for them accurately in our analysis due to lack of information. At this point, the best we can do is exclude them from the analysis while acknowledging the problem they create, and continue to look for more information on their meaning for possible consideration at a later date


Exclude ambiguous values from data set

pData <- pData[pData$propertyDmgAmt %in% c("B","K","M"),]

cData <- cData[cData$cropDmgAmt %in% c("B","K","M"),]

Add multiplier column to property and crop damage table

valConv <- data.frame(abbrv = c("K", "M", "B"), mult = c(1000, 1000000, 1000000000))

pData <- merge(pData, valConv, by.x = "propertyDmgAmt", by.y = "abbrv")

cData <- merge(cData, valConv, by.x = "cropDmgAmt", by.y = "abbrv")

Calculate total damage amount per observation

pData$totalPDmg <- pData$propertyDmg * pData$mult

cData$totalCDmg <- cData$cropDmg * cData$mult

Calculate the damage per event type

pDmgEvent <- aggregate(totalPDmg ~ eventCln, data = pData, sum)

cDmgEvent <- aggregate(totalCDmg ~ eventCln, data = cData, sum)

pDmgEvent <- pDmgEvent[order(pDmgEvent$totalPDmg, decreasing = TRUE),]

cDmgEvent <- cDmgEvent[order(cDmgEvent$totalCDmg, decreasing = TRUE),]

head(pDmgEvent)
##        eventCln totalPDmg
## 11        FLOOD 1.602e+11
## 19    HURRICANE 8.465e+10
## 34  STORM SURGE 4.783e+10
## 36      TORNADO 2.493e+10
## 15         HAIL 1.529e+10
## 35 THUNDERSTORM 9.856e+09
head(cDmgEvent)
##     eventCln totalCDmg
## 2    DROUGHT 1.392e+10
## 6      FLOOD 7.032e+09
## 13 HURRICANE 5.515e+09
## 9       HAIL 2.635e+09
## 12 HIGH WIND 2.160e+09
## 7     FREEZE 1.839e+09

From this we can see that flooding has the greatest economic impact in terms of property damage, while drought has the largest negative impact on crops.

Create a bar plot of high-impact events

combDmg <- merge(pDmgEvent, cDmgEvent, by = "eventCln", all = TRUE)

combDmg[is.na(combDmg)] <- 0

combDmg$totalDmg <- combDmg$totalCDmg + combDmg$totalPDmg

## Look at high impact events; those with the highest 25% of total impact
highImpact <- combDmg[combDmg$totalDmg >= quantile(combDmg$totalDmg, probs = .75)[["75%"]],]

## Measure impact in billions of dollars
highImpact$totalCDmg <- highImpact$totalCDmg / 1000000000
highImpact$totalPDmg <- highImpact$totalPDmg / 1000000000

## Reshape data set for plotting
hIMelt <- melt(highImpact[,1:3], id = "eventCln")
highImpactTx <- acast(hIMelt, variable~eventCln)

par(mar = c(10,5,3,3) + 0.1)
barplot(highImpactTx, col = c("gold1", "firebrick1"), main = "Economic Impact of Highest Impact Events", ylab = "Economic Impact (Billion US$)", las = 3, cex.axis = .85, legend.text = c("Property Damage", "Crop Damage"), args.legend = (x = "topright"))

plot of chunk dmgPlot

And so we see flooding and hurricanes having far and away the greatest economic consequences, with storm surges and tornadoes also standing out for their considerable property and crop damage impact.


Analysis Environment Details

sessionInfo()
## R version 3.0.3 (2014-03-06)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape2_1.4
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.3     evaluate_0.5.3   formatR_0.10     htmltools_0.2.4 
##  [5] knitr_1.6        plyr_1.8.1       Rcpp_0.11.1      rmarkdown_0.2.46
##  [9] stringr_0.6.2    tools_3.0.3      yaml_2.1.11