Synopsis

Hurricanes in the US have a high economic cost and signficant impact on human life. This was most evident in the events of Hurricane Katrina in 2005. Thankfully, hurricanes happen less frequently than other storm events such as Tornados or Floods, which have higher overall impacts on the population and economic cost than Hurricanes, simply because they occur more frequently.

Summary of analysis

This analysis used the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to perform an analysis on these events’ impacts on people and the economy.

The analysis has been split into two parts: Part 1 - Impact on people Part 2 - Impact on the economy

Assumptions

1 - The load files are in your current working directory

Setup

These steps were required for the analysis to be performed

# Load required libraries
library(lubridate)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, last
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, mday, month, quarter, wday, week, yday, year
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.6.1 (2014-01-04) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.18.0 (2014-02-22) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## 
## R.utils v1.34.0 (2014-10-07) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings
# Set options
#options("scipen"=0, "digits"= 4)

# Multiple plot function
# Source: http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  require(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Impact on Population

Data Processing

The first step was to read in the Storm database

# Read in the dataset
bunzip2(filename = "repdata-data-StormData.csv.bz2")
rawData <- read.csv("repdata-data-StormData.csv",header = TRUE, sep = ",")

In addition to the main data table, a table of categorised storm events was loaded This was used to help consolidate the number of storm event categories

# Read in the storm events table 2.1.1
storm <- read.csv("Storm_Table.csv",header = TRUE, sep = ",",)
storm$grpEvent <- tolower(storm$Storm_Event)
storm$grpEvent <- gsub("^\\s+|\\s+$", "", storm$grpEvent)

The next steps manipulate these datasets for the purpose of analysing the impact on the population The key steps are to: Calculate a total population affected field (“AFFECTED”) Clean the EVTYPE (“Events”) field Group these events into fewer categories for summary analysis

# Refine the table for this analysis
# by selecting desired fields
popData <- data.table(subset(x = rawData, select = c("EVTYPE","FATALITIES","INJURIES")))

# Create a variable for total persons affected
popData$AFFECTED <- popData$FATALITIES + popData$INJURIES

# Exclude 'Summary' records from the dataset
popData <- sqldf(x = "SELECT * FROM popData WHERE EVTYPE NOT LIKE 'Summary%'")
## Loading required package: tcltk
popData <- sqldf(x = "SELECT * FROM popData WHERE EVTYPE <> '?'")

# Clean, create a new field and 
# Re-map some categories matching values
# In the storm events table
popData$EVTYPE <- tolower(popData$EVTYPE)
popData$EVTYPE <- gsub("^\\s+|\\s+$", "", popData$EVTYPE)
popData$grpEvent <- popData$EVTYPE

for (i in 1:length(storm$grpEvent)) {
    value <- storm$grpEvent[i]
    print(value)
    popData[grep(value, popData$EVTYPE), "grpEvent"] <- value
}
## [1] "astronomical low tide"
## [1] "avalanche"
## [1] "blizzard"
## [1] "coastal flood"
## [1] "cold/wind chill"
## [1] "debris flow"
## [1] "dense fog"
## [1] "dense smoke"
## [1] "drought"
## [1] "dust devil"
## [1] "dust storm"
## [1] "excessive heat"
## [1] "extreme cold/wind chill"
## [1] "flash flood"
## [1] "flood"
## [1] "frost/freeze"
## [1] "funnel loud"
## [1] "freezing fog"
## [1] "hail"
## [1] "heat"
## [1] "heavy rain"
## [1] "heavy snow"
## [1] "high surf"
## [1] "high wind"
## [1] "hurricane (typhoon)"
## [1] "ice storm"
## [1] "lake-effect snow"
## [1] "lakeshore flood"
## [1] "lightning"
## [1] "marine hail"
## [1] "marine high wind"
## [1] "marine strong wind"
## [1] "marine thunderstorm wind"
## [1] "rip current"
## [1] "seiche"
## [1] "sleet"
## [1] "storm surge/tide"
## [1] "strong wind"
## [1] "thunderstorm wind"
## [1] "tornado"
## [1] "tropical depression"
## [1] "tropical storm"
## [1] "tsunami"
## [1] "volcanic ash"
## [1] "waterspout"
## [1] "wildfire"
## [1] "winter storm"
## [1] "winter weather"
# Also do some manual mapping
popData[grep("hurricane", popData$EVTYPE), "grpEvent"] <- "hurricane"
popData[grep("wind", popData$EVTYPE), "grpEvent"] <- "wind"
popData[grep("tornado", popData$EVTYPE), "grpEvent"] <- "tornado"
popData[grep("snow", popData$EVTYPE), "grpEvent"] <- "snow"

Results

The cleansed dataset was then aggregated by event type The incidents field is the count of incidents The impact field is the number of persons affected by either injury or death per event

The first plot created shows the total affected people on record The second plot shows, for these events, how many people are impacted per event.

# Aggregate and Order by number of people
# affected
popEventSum <- sqldf(x = "SELECT grpEvent as event
                 , COUNT(grpEvent) as incidents
                 , SUM(FATALITIES) as fatalities
                 , SUM(INJURIES) as injuries
                 , SUM(AFFECTED) as affected
                 , SUM(AFFECTED) / COUNT(grpEvent) as impact
                 FROM popData
                 WHERE AFFECTED > 0 
                 GROUP BY grpEvent
                 ORDER BY SUM(AFFECTED) DESC
                 LIMIT 20")

# Create levels for the event variable for chart rank
lvlEvent <- rev(popEventSum$event)
popEventSum$event <- factor(popEventSum$event, levels = lvlEvent)

# Make a bar plot summarised by weather events
# of total population affected 
p1 <- ggplot(data = popEventSum 
             , aes(x = event,y = affected, fill = "firebrick")) +
  geom_bar(position="dodge",stat="identity") +
  labs(x = "Event", y = "Affected population") + 
  theme(legend.title=element_blank()) +
  ggtitle("Total population affected by event") + 
  coord_flip()
#print(p1)

# Make a bar plot summarised by weather events
# of total population affected per event (impact)
p2 <- ggplot(data = popEventSum 
             , aes(x = event,y = impact, fill = "firebrick", )) +
  geom_bar(position="dodge",stat="identity") +
  labs(x = "Event", y = "Affected population per event") +
  theme(legend.title=element_blank()) +
  ggtitle("Total population affected per event") + 
  coord_flip()
#print(p2)

The third plot looks at the most fatal events, and splits it fatalities and injuries The fourth plot presents the number of people affected per incident with the same split

# Reproduce the plot split by injuries and fatalities
popEventSum <- sqldf(x = "SELECT grpEvent as event
                 , COUNT(grpEvent) as incidents
                 , SUM(FATALITIES) as fatalities
                 , SUM(INJURIES) as injuries
                 FROM popData
                 WHERE AFFECTED > 0 
                 GROUP BY grpEvent
                 ORDER BY SUM(FATALITIES) DESC
                 LIMIT 20")

# Create levels for the event variable for chart rank
lvlEvent <- rev(popEventSum$event)
popEventSum$event <- factor(popEventSum$event, levels = lvlEvent)

# Transform the data for a multiple bar plot
popEventSum <- melt(data = popEventSum,id.vars = c("event","incidents"), measure.vars = c("injuries","fatalities"))
popEventSum$impact <- popEventSum$value / popEventSum$incidents
names(popEventSum) <- c("event","incidents","category","persons","impact")

# Create the plot
p3 <- ggplot(data = popEventSum 
             , aes(x = event,y = persons, fill = factor(category))) +
  geom_bar(position="dodge",stat="identity") +
  labs(x = "Event", y = "Affected population") + 
  theme(legend.title = element_text(colour="black", size=10)) + 
  scale_color_discrete(name="Category of persons affected") + 
  ggtitle("Total population affected by event and category") + 
  coord_flip()
#print(p3)

# Create a final plot showing impact split by injuries or fatalities
p4 <- ggplot(data = popEventSum 
             , aes(x = event,y = impact, fill = factor(category))) +
  geom_bar(position="dodge",stat="identity") +
  labs(x = "Event", y = "Affected population per event") +
  theme(legend.title = element_text(colour="black", size=10)) + 
  scale_color_discrete(name="Category of persons affected") + 
  ggtitle("Total population affected per event and category") + 
  coord_flip()
#print(p4)

Overall results showed Tornados to have impacted the population the most over time They also occur frequently enough to rank relatively highly in terms of persons affected per incident. The event of the highest impact are Hurricanes, which, while infrequently, can be catastropic. Perhaps US citizens are better adapted to dealing with Torndaos, but less equipped at coping with Hurricanes.

In terms of the split between fatalities and injuries, Tornados had a much higher injury rate than fatalities. By comparison, heat related events had a closer rates of death to injuries.

# Combine all the plots into a multiplot
mInput <- matrix(c(1,2,3,4), nrow=2, byrow=TRUE)
multiplot(p1, p2, p3, p4, layout = mInput)
## Loading required package: grid

Economic Impact

Data Processing

A dataframe was created for the economic impact of these storm events A date field was retained for this analysis A mapping table was setup to convert the reported value into a value specified by the units Records that did not have legitimate units were excluded A field for total damage was also calculated

# Select relevant fields and include only events where
# damage was more than $0
ecoData <- subset(x = rawData, 
                  select = c("EVTYPE","BGN_DATE","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP"), 
                  subset = CROPDMG > 0 | PROPDMG > 0)
 
# Tidy up the units fields to calculate damage
ecoData$CROPDMGEXP <- tolower(ecoData$CROPDMGEXP)
ecoData$PROPDMGEXP <- tolower(ecoData$PROPDMGEXP)
ecoData <- sqldf(x = "SELECT * FROM ecoData WHERE EVTYPE NOT LIKE 'Summary%'")
ecoData <- sqldf(x = "SELECT * FROM ecoData WHERE EVTYPE <> '?'")

# Create a date field
ecoData$Date <- as.Date(x = ecoData$BGN_DATE, format = "%m/%d/%Y")
ecoData$mo <- strftime(ecoData$Date, "%m")
ecoData$yr <- strftime(ecoData$Date, "%Y")

# Clean, create a new field and 
# Re-map some categories matching values
# In the storm events table
ecoData$EVTYPE <- tolower(ecoData$EVTYPE)
ecoData$EVTYPE <- gsub("^\\s+|\\s+$", "", ecoData$EVTYPE)
ecoData$grpEvent <- ecoData$EVTYPE

for (i in 1:length(storm$grpEvent)) {
  value <- storm$grpEvent[i]
  print(value)
  ecoData[grep(value, ecoData$EVTYPE), "grpEvent"] <- value
}
## [1] "astronomical low tide"
## [1] "avalanche"
## [1] "blizzard"
## [1] "coastal flood"
## [1] "cold/wind chill"
## [1] "debris flow"
## [1] "dense fog"
## [1] "dense smoke"
## [1] "drought"
## [1] "dust devil"
## [1] "dust storm"
## [1] "excessive heat"
## [1] "extreme cold/wind chill"
## [1] "flash flood"
## [1] "flood"
## [1] "frost/freeze"
## [1] "funnel loud"
## [1] "freezing fog"
## [1] "hail"
## [1] "heat"
## [1] "heavy rain"
## [1] "heavy snow"
## [1] "high surf"
## [1] "high wind"
## [1] "hurricane (typhoon)"
## [1] "ice storm"
## [1] "lake-effect snow"
## [1] "lakeshore flood"
## [1] "lightning"
## [1] "marine hail"
## [1] "marine high wind"
## [1] "marine strong wind"
## [1] "marine thunderstorm wind"
## [1] "rip current"
## [1] "seiche"
## [1] "sleet"
## [1] "storm surge/tide"
## [1] "strong wind"
## [1] "thunderstorm wind"
## [1] "tornado"
## [1] "tropical depression"
## [1] "tropical storm"
## [1] "tsunami"
## [1] "volcanic ash"
## [1] "waterspout"
## [1] "wildfire"
## [1] "winter storm"
## [1] "winter weather"
# Also do some manual mapping
ecoData[grep("hurricane", ecoData$EVTYPE), "grpEvent"] <- "hurricane"
ecoData[grep("wind", ecoData$EVTYPE), "grpEvent"] <- "wind"
ecoData[grep("tornado", ecoData$EVTYPE), "grpEvent"] <- "tornado"
ecoData[grep("snow", ecoData$EVTYPE), "grpEvent"] <- "snow"

# Create a mapping table
Symbols <- c("b","m","k")
Units <- c(1000000000,1000000,1000)
tblUnits <- data.frame(Symbols, Units)

# Clean the ecoData table via an inner join
# Also multiply through the units to the values
ecoData <- sqldf("SELECT e.grpEvent
                  , e.Date
                  , e.mo
                  , e.yr
                  , (e.PROPDMG * t.Units) as PropDmg
                  , (e.CROPDMG * t.Units) as CropDmg
                  , (e.PROPDMG * t.Units) + (e.CROPDMG * t.Units) as TotalDmg
                  FROM ecoData e
                  JOIN tblUnits t
                  ON e.CROPDMGEXP = t.Symbols
                  JOIN tblUnits u
                  ON e.PROPDMGEXP = t.Symbols")

Results

In generating results, an aggregate table was formed by event. These results show floods to have the highest economic cost, followed by hurricanes. However when the cost of floods is divided by the number of floods, the cost per flood is small. Hurricanes, happen less frequently, but when they do, they prove to be costly to property and crops.

# Create a summary of total cost by Event
ecoEventSum <- sqldf("SELECT e.grpEvent as event
                      , COUNT(e.grpEvent) as incidents
                      , SUM(PropDmg) as PropDmg
                      , SUM(CropDmg) as CropDmg
                      , SUM(TotalDmg) as TotalDmg
                      , SUM(TotalDmg) / COUNT(e.grpEvent) as impact
                      FROM ecoData e
                      GROUP BY e.grpEvent
                      ORDER BY SUM(TotalDmg) DESC
                      LIMIT 20")

# Create levels for the event variable for chart rank
lvlEvent <- rev(ecoEventSum$event)
ecoEventSum$event <- factor(ecoEventSum$event, levels = lvlEvent)

# Make a bar plot summarised by weather events
# of total cost accumulated over time
p1 <- ggplot(data = ecoEventSum 
             , aes(x = event,y = TotalDmg)) +
  geom_bar(position="dodge",stat="identity", fill = "dodgerblue") +
  labs(x = "Event", y = "Cost per event") +
  ggtitle("Total cost of events") + 
  coord_flip()
#print(p1)

# Make a bar plot summarised by weather events
# of total cost per event
p2 <- ggplot(data = ecoEventSum 
             , aes(x = event,y = impact)) +
  geom_bar(position="dodge",stat="identity", fill = "dodgerblue") +
  labs(x = "Event", y = "Cost per event") +
  ggtitle("Total cost per event") + 
  coord_flip()
#print(p2)

mInput <- matrix(c(1,2), nrow=1, byrow=TRUE)
multiplot(p1, p2, layout = mInput)

This final chart shows the economic cost per month for the top 5 most costly storm events Hurricane Katrina can be identified as the event in 2005 which was a costly hurricane. It also shows the infreqency of hurricanes relative to flooding. This chart also highlights a very expensive flood before 1995, which may need further investigation as it is estimated to have cost more than Katrina.

# Plot a time series of cost for the Top 5 most expensive events
ecoTop <- sqldf("SELECT e.Event FROM ecoEventSum e LIMIT 5")
ecoTime <- sqldf("SELECT e.grpEvent as event
                  , e.Date
                  , e.mo
                  , e.yr
                  , SUM(e.TotalDmg) as TotalDmg
                  FROM ecoData e
                  JOIN ecoTop t
                  ON e.grpEvent = t.event
                  GROUP BY e.grpEvent 
                  , e.Date
                  , e.mo
                  , e.yr")

ecoTime <- aggregate(TotalDmg ~ mo + yr + event, ecoTime, FUN = sum)
ecoTime$Date <- as.POSIXct(paste(ecoTime$yr, ecoTime$mo, "01", sep = "-"))

p3 <- ggplot(data = ecoTime, aes(Date, TotalDmg)) +
      geom_bar(stat="identity") + 
      facet_wrap(~event, ncol = 1) + 
      labs(x = "Event", y = "Cost") +
      ggtitle("Total cost of storm events over time") 
print(p3)