SYNOPSIS

This document explores the National Oceanic and Atmospheric Administration (NOAA) Storm Database to adress the severity of weather events between 1950 and November 2011. We begin by defining the questions, give short answers then focus on the the fully reproducible research below.

We prefilter the data by making use of a mapping table for the exponential fields. We present the data separately and for the final results make use of a weighting logic, since fatalities are orders of magnitude more harmful than injuries, similar property vs crop damage.

Data Processing

The Data used for the analysis is downloaded directly from the course website Storm Data [47Mb]

Retrieving data

# fetch data from URL directly into a data.frame
getCSVBz2DataFromUrl <- function(url){
  dest <- tempfile(fileext = ".csv.bz2")
  setInternet2(use = TRUE) # support for HTTPS
  download.file(url, dest, mode = "wb")
  read.csv(dest)
}

# define URL
url = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'

# retrieve storms data
storms = getCSVBz2DataFromUrl(url)

# some benchmarks
dim(storms)
## [1] 902297     37
colnames(storms)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
print(object.size(storms), units="Mb")
## 409.4 Mb

Preprocessing the data, Damage definition

We prefilter the dataset and extract only the columns that contain the data that we need.

Variable Description
STATE the State this event took place
EVTYPE the type of weather event (full )
BGN_DATE Event beginn (trends / timeseries)
FATALITIES number of fatalities
INJURIES number of injuried people
PROPDMG Economic property loss
PROPDMGEXP Economic property loss multiplier value
CROPDMG Economic crop loss
CROPDMGEXP Economic crop loss multiplier

Next we use a powers of 10 mapping table for the exponential fields, merge it twice then normalize the fields.

opts = c('STATE', 'EVTYPE', 'BGN_DATE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP')
ds = storms[, opts]

# converting the date
ds$BGN_DATE = strptime(as.character(ds$BGN_DATE),"%m/%e/%Y %H:%M:%S" )

# retain the year
ds$BGN_YEAR = format(ds$BGN_DATE, '%Y')
ds$BGN_DATE = NULL

#since the ninenties there is a constant increase in the number of recorded weather events
table(ds$BGN_YEAR)
## 
##  1950  1951  1952  1953  1954  1955  1956  1957  1958  1959  1960  1961 
##   223   269   272   492   609  1413  1703  2184  2213  1813  1945  2246 
##  1962  1963  1964  1965  1966  1967  1968  1969  1970  1971  1972  1973 
##  2389  1968  2348  2855  2388  2688  3312  2926  3215  3471  2168  4463 
##  1974  1975  1976  1977  1978  1979  1980  1981  1982  1983  1984  1985 
##  5386  4975  3768  3728  3657  4279  6146  4517  7132  8322  7335  7979 
##  1986  1987  1988  1989  1990  1991  1992  1993  1994  1995  1996  1997 
##  8726  7367  7257 10410 10946 12522 13534 12607 20631 27970 32270 28680 
##  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009 
## 38128 31289 34471 34962 36293 39752 39363 39184 44034 43289 55663 45817 
##  2010  2011 
## 48161 62174
# mapping the exponents
mappingPowersOf10 = data.frame(
  Exp = c("",  "-", "?", "+", "0", "1", "2", "3", "4", "5", "6", "7", "8", "B", "h", "H", "K", "m", "M"),
  Val = c(0,     0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,  9,   2,   2,    3,  6,    6)
)

# merge the exponential markings from the source with the proper values
ds = merge(ds, mappingPowersOf10, by.x = "PROPDMGEXP", by.y = "Exp")

# merge the two fields into a normalised one
ds$PropertyDamage = ds$PROPDMG* (10^ds$Val) 

ds$PROPDMG = NULL
ds$PROPDMGEXP = NULL 
ds$Val = NULL

# merge the exponential markings from the source with the proper values
ds = merge(ds, mappingPowersOf10, by.x = "CROPDMGEXP", by.y = "Exp")

# merge the two fields into a normalised one
ds$CropDamage = ds$CROPDMG* (10^ds$Val) 

ds$CROPDMG = NULL
ds$CROPDMGEXP = NULL 
ds$Val = NULL

# defining the weight
weightFATALITIES = 10

# applying the weight
ds$COMBINED = weightFATALITIES*ds$FATALITIES + ds$INJURIES

# dropping two thirds of the data
ds = subset(ds, COMBINED>0 | PropertyDamage>0 | CropDamage > 0)

# aggregating by Event Type
aggHUMAN = merge (
          aggregate(FATALITIES ~ EVTYPE, ds, sum),
          aggregate(INJURIES ~ EVTYPE, ds, sum)
          )
# applying the weight
aggHUMAN$COMBINED = weightFATALITIES*aggHUMAN$FATALITIES + aggHUMAN$INJURIES

# selecting just top 10, combining all remaining entities into another factor
toDrop = subset(aggHUMAN, COMBINED<3617)
aggHUMAN = head(aggHUMAN[order(- aggHUMAN$COMBINED),], 9)
aggHUMAN = rbind(aggHUMAN,
                data.frame(EVTYPE='OTHERS', FATALITIES = sum(toDrop$FATALITIES), INJURIES = sum(toDrop$INJURIES), COMBINED = sum(toDrop$INJURIES)))

aggPROPERTY = aggregate(PropertyDamage ~ EVTYPE, ds, sum)
aggCROP = aggregate(CropDamage ~ EVTYPE, ds, sum)

aggPROPERTY = head(aggPROPERTY[order(- aggPROPERTY$PropertyDamage),], 10)
aggCROP = head(aggCROP[order(- aggCROP$CropDamage),], 10)

# checking dimensions
sapply(c(aggCROP, aggPROPERTY), length)
##         EVTYPE     CropDamage         EVTYPE PropertyDamage 
##             10             10             10             10

Results

# plot the weather events most harmful to the human population
ggplot(data=aggHUMAN, aes(reorder(EVTYPE, -COMBINED), COMBINED, fill=EVTYPE, color=EVTYPE)) + 
              geom_bar(stat = "identity") + 
              geom_text(aes(label=EVTYPE), vjust=0.8, hjust=0, color="black", angle=10) +
              ggtitle("Most harmful weather events") +
              labs(x="",y="Combined damage to human population")  + 
              theme(text = element_text(size = 8), axis.line=element_blank(),axis.text.x=element_blank(),
              plot.title = element_text(color="#666666", face="bold", size=20) )

# plot the weather events with greatest impact to Propery
ggplot(data=aggPROPERTY, aes(reorder(EVTYPE, -PropertyDamage), PropertyDamage, fill=EVTYPE, color=EVTYPE)) + 
              geom_bar(stat = "identity") + 
              geom_text(aes(label=EVTYPE), vjust=0.8, hjust=0, color="black", angle=10) +
              ggtitle("Events with greatest impact to Property") +
              labs(x="",y="Normalized Damage to Property")  + 
              theme(text = element_text(size = 8), axis.line=element_blank(),axis.text.x=element_blank(),
              plot.title = element_text(color="#666666", face="bold", size=20) )

# plot the weather events with greatest impact to Crops
ggplot(data=aggCROP, aes(reorder(EVTYPE, -CropDamage), CropDamage, fill=EVTYPE, color=EVTYPE)) + 
              geom_bar(stat = "identity") + 
              geom_text(aes(label=EVTYPE), vjust=0.8, hjust=0, color="black", angle=10) +
              ggtitle("Events with greatest impact to Crops") +
              labs(x="",y="Normalized Damage to Crops")  + 
              theme(text = element_text(size = 8), axis.line=element_blank(),axis.text.x=element_blank(),
              plot.title = element_text(color="#666666", face="bold", size=20) )