library(knitr)  
library(markdown)  
library(dplyr)  
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)  

I. Synopsis

This paper explores the NOAA Storm Database and identifies weather events that:
1. are most harmful to population health; and
2. have the greatest economic effects.

II. Data Processing

The dataset that forms the basis for the analysis can be downloaded here. The following reads in compressed csv file and verifies the number of unique event weather types. The last line of the code chunk lists the variables collected for each individual weather event. From this list, we’ll retain only those that pertain to the analysis we wish to perform, i.e., events with significant health and economic consequences.

data <- read.csv("repdata-data-StormData.csv.bz2",header=TRUE,stringsAsFactors=FALSE)
length(levels(factor(data$EVTYPE)))  
## [1] 985
names(data)  
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

We will retain “BGN_DATE”" which serves as an indirect index of completeness of the data collected for each weather event: events in more recent years are assumed to be more complete. In fact, the “BGN_DATE” column is particularly relevant and helpful when we notice that the original dataset’s “EVTYPE” column has 985 unique values, in contrast to the 48 event categories identified by the NOAA documentation. That same documentation points out that the revisions provided therein became effective August 3, 2007. To test whether reporting parties, namely, those providing the data, began identifyin events in compliance with the categories identified in the document, we check the data that were reported after that date. For simplicity’s sake, we look at the events that occurred beginning in 2008. The dataset is ordered by date, with more recent events occupying lower rows. Perusal of the data indicates that the first post-2007 data entry occurrs at line 690503. Every entry from line 690503 until we reach the end of the dataset is post-2007.

data <- data[690503:902297,]  
levels(factor(data$EVTYPE))  
##  [1] "ASTRONOMICAL LOW TIDE"    "AVALANCHE"               
##  [3] "BLIZZARD"                 "COASTAL FLOOD"           
##  [5] "COLD/WIND CHILL"          "DENSE FOG"               
##  [7] "DENSE SMOKE"              "DROUGHT"                 
##  [9] "DUST DEVIL"               "DUST STORM"              
## [11] "EXCESSIVE HEAT"           "EXTREME COLD/WIND CHILL" 
## [13] "FLASH FLOOD"              "FLOOD"                   
## [15] "FREEZING FOG"             "FROST/FREEZE"            
## [17] "FUNNEL CLOUD"             "HAIL"                    
## [19] "HEAT"                     "HEAVY RAIN"              
## [21] "HEAVY SNOW"               "HIGH SURF"               
## [23] "HIGH WIND"                "HURRICANE"               
## [25] "ICE STORM"                "LAKE-EFFECT SNOW"        
## [27] "LAKESHORE FLOOD"          "LANDSLIDE"               
## [29] "LIGHTNING"                "MARINE HAIL"             
## [31] "MARINE HIGH WIND"         "MARINE STRONG WIND"      
## [33] "MARINE THUNDERSTORM WIND" "RIP CURRENT"             
## [35] "SEICHE"                   "SLEET"                   
## [37] "STORM SURGE/TIDE"         "STRONG WIND"             
## [39] "THUNDERSTORM WIND"        "TORNADO"                 
## [41] "TROPICAL DEPRESSION"      "TROPICAL STORM"          
## [43] "TSUNAMI"                  "VOLCANIC ASHFALL"        
## [45] "WATERSPOUT"               "WILDFIRE"                
## [47] "WINTER STORM"             "WINTER WEATHER"

We see that eliminating the first 690,502 records simplifies the dataset significantly insofar as the event types correspond to the 48 categories recognized by the NOAA. Moreover, as a practical matter, to the extent that policymakers should rely on temporally relevant data in devising their governance priorities, the elimination effected by the code chunk above may yield more relevant insight. The informal explanation for such intuition might be that significant changes have occurred in:
1. The types and magnitude of weather events;
2. The particular targets of such events and the magnitude of their vulnerability. One salient observation in support of this claim is that the United States has rapidly shifted away from an agricultural economy, which obviously effects the extent to which weather events would cause crop damage.
3. Geogrphic distribution of the population. While one would need hard data in support, one might surmise that the shift away from agricultural priorities contributed to an urbanization of the country, thereby, making population clusters denser than before, which would affect the number of fatalities and injuries that a particular weather event would cause.

Moreover, while the exlusion of about 700,000 observation may seem to distort the data from which we seek to cull useful insight, the fact that the excision still leaves about 200,000 events, which is almost certainly sufficient to provide reliable insight.

Columns relevant to population health are:
1. FATALITIES
2. INJURIES

Columns relevant to economic consequences are:
1. PROPDMG
2. PROPDMGEXP
3. CROPDMG
4. CROPDMGEXP

First, we simplify the current dataset by limiting the columns to those named above.

data <- data[,c("EVTYPE","FATALITIES","INJURIES","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")]
data <- tbl_df(data)

The columns that end in “-EXP” refer to the magnitude of the damage, specifically whether the damage caused is in the thousands (“K”) or millions (“M”) of dollars. In effect, the total amount of property and crop damage is the product of the PROPDMG and PROPDMGEXP, and CROPDMG and CROPDMGEXP, respectively. We want to combine these four columns into two simpler columns explicitly specifying the amount of damage. We do so as follows.

# We check what the unique values are for each "-EXP" variable
levels(factor(data$PROPDMGEXP)) 
## [1] "0" "B" "K" "M"
levels(factor(data$CROPDMGEXP))
## [1] "B" "K" "M"
data$PROPDMGEXP[data$PROPDMGEXP=="0"] = 0
data$PROPDMGEXP[data$PROPDMGEXP=="K"] = 1000
data$PROPDMGEXP[data$PROPDMGEXP=="M"] = 1000000
data$PROPDMGEXP[data$PROPDMGEXP=="B"] = 1000000000
data$CROPDMGEXP[data$CROPDMGEXP=="K"] = 1000
data$CROPDMGEXP[data$CROPDMGEXP=="M"] = 1000000
data$CROPDMGEXP[data$CROPDMGEXP=="B"] = 1000000000
data$PROPDMGEXP <- as.numeric(data$PROPDMGEXP)
data$CROPDMGEXP <- as.numeric(data$CROPDMGEXP)  

Now we simply multiply the pairs of columns relating to property and crop damage to find a dollar figure that’s easier to work with.

PropertyDamage <- as.vector(data$PROPDMG) * as.vector(data$PROPDMGEXP)
CropDamage <- as.vector(data$CROPDMG) * as.vector(data$CROPDMGEXP)
data$PropertyDamage <- PropertyDamage
data$CropDamage <- CropDamage
data <- data[,-c(4:7)]  
rownames(data) <- NULL
data[1:20,]
## Source: local data frame [20 x 5]
## 
##               EVTYPE FATALITIES INJURIES PropertyDamage CropDamage
## 1            DROUGHT          0        0              0          0
## 2            DROUGHT          0        0              0          0
## 3               HAIL          0        0              0          0
## 4               HAIL          0        0              0          0
## 5  THUNDERSTORM WIND          0        0           1000          0
## 6  THUNDERSTORM WIND          0        0              0          0
## 7  THUNDERSTORM WIND          0        0           1500          0
## 8  THUNDERSTORM WIND          0        0              0          0
## 9       FUNNEL CLOUD          0        0              0          0
## 10      FUNNEL CLOUD          0        0              0          0
## 11      FUNNEL CLOUD          0        0              0          0
## 12           TORNADO          0        0         100000          0
## 13 THUNDERSTORM WIND          0        0              0          0
## 14      FUNNEL CLOUD          0        0              0          0
## 15           TORNADO          0        0         105000          0
## 16              HAIL          0        0              0          0
## 17 THUNDERSTORM WIND          0        0              0          0
## 18              HAIL          0        0              0          0
## 19 THUNDERSTORM WIND          0        1          10000          0
## 20 THUNDERSTORM WIND          0        0              0          0

Now, in order to begin analyzing which weather events cause the most physical and economic damage, we split the data frame by EVTYPE.

byEvent <- split(data, data$EVTYPE)

We need to decide whether we’re interested in the economic/health consequences as an aggregate measure, i.e., the sum of consequences; in the average level of damage per class of event; or both. This paper will focus on the aggregate effect, as that choice is intutively more reflective of the priorities policymakers should have in mind when implementing a regulatory framework. The cost/benefit analysis of such a choice is beyond the scope of this paper; we simply assume that a focus on the aggregate damage (even if spread out among a large number of not-so-serious events) is more cost-efficient.

Event_Type <- as.list(names(byEvent))

FatalitiesByEvent <- lapply(byEvent, function(x) {
sum(x$FATALITIES)
})

InjuriesByEvent <- lapply(byEvent, function(x) {
sum(x$INJURIES)
})

PDbyEvent <- lapply(byEvent, function(x) {
sum(x$PropertyDamage)
})

CDbyEvent <- lapply(byEvent, function(x) {
sum(x$CropDamage)
})

evTypeIndex <- c(1:48)
finalData <- as.data.frame(cbind(Event_Type,evTypeIndex,FatalitiesByEvent,InjuriesByEvent,PDbyEvent,CDbyEvent))

finalData$FatalitiesByEvent <- as.numeric(finalData$FatalitiesByEvent)  
finalData$InjuriesByEvent <- as.numeric(finalData$InjuriesByEvent)  
finalData$PDbyEvent <- as.numeric(finalData$PDbyEvent)
finalData$CDbyEvent <- as.numeric(finalData$CDbyEvent)  

III. Analysis and Preliminary Results

fatalities <- arrange(finalData, desc(FatalitiesByEvent))
topKillers <- fatalities[1:10,]  

injuries <- arrange(finalData, desc(InjuriesByEvent))  
topInjurers <- injuries[1:10,]  

propdam <- arrange(finalData, desc(PDbyEvent))
topPropertyDamagers <- propdam[1:10,]

cropdam <- arrange(finalData, desc(CDbyEvent))
topCropDamagers <- cropdam[1:10,]

topKillers$Event_Type<-as.factor(as.character(topKillers$Event_Type))
cutpoints <- quantile(topKillers$FatalitiesByEvent,seq(0,1,length=4),na.rm=TRUE)
topKillers$FatalitiesByEventQuantile <- cut(topKillers$FatalitiesByEvent,cutpoints)
qplot(as.integer(evTypeIndex),log10(InjuriesByEvent),data=topKillers,facets=.~FatalitiesByEventQuantile,color=Event_Type,size=3,xlab="Event Type Index: 1 - 48",ylab="Injuries on Log 10 Scale",main="Number of Injuries by Event Type & 3 Levels of Fatalities")  

Now we plot the property and crop damage caused on separate graphs.

library(RColorBrewer)
topCropDamagers$Event_Type<-as.factor(as.character(topCropDamagers$Event_Type))
barplot(topCropDamagers$CDbyEvent,beside = TRUE,col=brewer.pal(10,name = "Blues"),legend=topCropDamagers$Event_Type)
## Warning in brewer.pal(10, name = "Blues"): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

topPropertyDamagers$Event_Type<-as.factor(as.character(topPropertyDamagers$Event_Type))
barplot(topPropertyDamagers$PDbyEvent,beside = TRUE,col=brewer.pal(10,name = "Set3"),legend=topPropertyDamagers$Event_Type)