Overview

This analysis uses the NOAA weather database to determine which events cause the most human, property and crop damage. The analysis has been restricted to data from January 1996 onward, since a comparative analysis between event types is only possible after that. The first part of the analysis focuses on cleaning and merging event types. The second part of the analysis looks at the top 10 events that cause human, property or crop damage, on an average basis.

Data Processing

This section reads the data directly from the downloaded bz2 file. The relevant columns are subset into a new dataset and filtered for events that occurred from 1996 onward. The data is further filtered to remove any event types that had 5 or fewer occurrences. Event types that are similar are merged through a series of transformations.

library(knitr)
# Ask knitr to tidy the echoed code, wrapping it at roughly 60 characters.
opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)

# Read the storm data into a data frame.
# NOTE(review): the original comment said "with cache on", but no cache
# option is set in the visible code -- confirm against the chunk header.
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "stormdata.csv.bz2"

# Download the archive only once; later runs reuse the local copy.
if (!file.exists(storm_file)) {
    # mode = "wb": the .bz2 archive is binary, and a text-mode transfer can
    # corrupt it on Windows.
    download.file(storm_url, storm_file, mode = "wb")
}

# read.csv() reads the bzip2-compressed file directly.
stormdata <- read.csv(storm_file, sep = ",", header = TRUE)
# Load packages
library(dplyr)
library(ggplot2)
library(scales)

# Subset and clean data

# Keep only the columns used in this analysis
stormsub <- select(stormdata, STATE, BGN_DATE, EVTYPE, FATALITIES,
    INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

# Keep only observations from 1996 onward, since event types were
# expanded only from that year.
# tz = "UTC" makes the parse independent of the session's time zone:
# without it, a midnight timestamp that does not exist locally (zones
# whose DST change happens at 00:00) can come back as NA or shifted.
year <- as.POSIXct(stormsub$BGN_DATE, format = "%m/%d/%Y %H:%M:%S",
    tz = "UTC")

stormsub$Year <- lubridate::year(year)

after1996 <- filter(stormsub, Year > 1995)

# Keep only event types that have more than 5 occurrences from
# 1996 till 2011
by_event <- after1996 %>%
    group_by(EVTYPE) %>%
    summarise(count = n()) %>%
    filter(count > 5) %>%
    arrange(desc(count))

gt5post96 <- filter(after1996, EVTYPE %in% by_event$EVTYPE)

# Clean data to standardize duplicates
#
# The replacements below run in order on a plain character copy of EVTYPE
# and are written back once at the end. Working on character values keeps
# the result the same whether read.csv() produced a factor (R < 4.0) or a
# character column (R >= 4.0). The previous factor-only idiom
# `levels(x)[x]` yields nothing for character input, so the coastal/tide
# merge was silently skipped on current R versions.
evtype <- as.character(gt5post96$EVTYPE)

# Standardize heat events, including every type that mentions "warm"
evtype[evtype %in% c("EXCESSIVE HEAT", "RECORD HEAT")] <- "HEAT"
evtype[grepl("warm", evtype, ignore.case = TRUE)] <- "HEAT"

# Standardize rip current events
evtype[evtype %in% "RIP CURRENT"] <- "RIP CURRENTS"

# Standardize hurricane events
evtype[evtype %in% c("HURRICANE/TYPHOON", "TYPHOON")] <- "HURRICANE"

# Standardize landslide events
evtype[evtype %in% c("Mudslide", "MUDSLIDE")] <- "LANDSLIDE"

# Standardize storm events
evtype[evtype %in% c("TSTM WIND", "THUNDERSTORM WIND", "TROPICAL STORM",
    "STORM SURGE", "SEVERE THUNDERSTORMS")] <- "THUNDERSTORM"

# Standardize coastal storm events: explicit names first, then every type
# that mentions "coastal", "tidal" or "tide", then the surf/marine wind
# variants.
evtype[evtype %in% c("MARINE THUNDERSTORM WIND", "STORM SURGE/TIDE",
    "HIGH SEAS")] <- "COASTAL STORM"
evtype[grepl("coastal|tidal|tide", evtype, ignore.case = TRUE)] <- "COASTAL STORM"
evtype[evtype %in% c("High Surf", "MARINE STRONG WIND",
    "HEAVY SURF/HIGH SURF")] <- "COASTAL STORM"

# Standardize cold events: every type that mentions "cold"
evtype[grepl("cold", evtype, ignore.case = TRUE)] <- "COLD"

# Standardize freezing rain
evtype[evtype %in% c("GLAZE", "BLACK ICE", "WINTRY MIX", "WINTER WEATHER MIX",
    "MIXED PRECIPITATION", "MIXED PRECIP", "FREEZING DRIZZLE", "Freezing Rain",
    "FREEZING RAIN/SLEET", "FREEZING RAIN", "LIGHT FREEZING RAIN")] <- "FREEZING RAIN"

# Standardize freeze
evtype[evtype %in% c("FROST/FREEZE", "HARD FREEZE", "FROST")] <- "FREEZE"

# Write the cleaned event types back to the data frame
gt5post96$EVTYPE <- evtype

The weather events that cause the maximum fatalities on an average basis are depicted in the graph below.

# Top 10 event types by mean fatalities per event, highest first
fatal_eve <- gt5post96 %>%
    group_by(EVTYPE) %>%
    summarise(meanfatal = mean(FATALITIES, na.rm = TRUE)) %>%
    arrange(desc(meanfatal)) %>%
    head(10)

# Order the factor levels by ascending mean fatalities so that the flipped
# bar chart shows the largest bar at the top
fatal_eve$EVTYPE <- with(fatal_eve, factor(EVTYPE, levels = EVTYPE[order(meanfatal)]))

# Occurrence counts per event type in the cleaned dataset (also used by
# the injuries section)
normalizedevents <- gt5post96 %>%
    group_by(EVTYPE) %>%
    summarise(count = n()) %>%
    filter(count > 5) %>%
    arrange(desc(count))

# Counts for the top 10 fatality events, with levels following the row
# order of fatal_eve
fatalevecnt <- normalizedevents %>%
    filter(EVTYPE %in% fatal_eve$EVTYPE)
fatalevecnt$EVTYPE <- factor(fatalevecnt$EVTYPE, levels = fatal_eve$EVTYPE)

# Horizontal bar chart of mean fatalities for the top 10 events
ggplot(fatal_eve, aes(EVTYPE, meanfatal)) +
    geom_col(width = 0.6, fill = "dark cyan") +
    coord_flip() +
    labs(title = "Top 10 Events with maximum fatalities",
        x = "Events", y = "Mean number of fatalities")

The frequency of occurrence of these events is presented below, to help understand both the potential impact of each event and the probability of its occurrence.

# Alternative bar-chart view of the counts (disabled):
# ggplot(fatalevecnt, aes(EVTYPE, count)) +
#     geom_col(width=.6, fill='dark cyan') + coord_flip() +
#     labs(x= 'Events', y= 'Count of events') +
#     ggtitle('Events with maximum fatalities: Total counts')

# Table of occurrence counts, sorted by the EVTYPE factor levels
fatalevecnt %>%
    arrange(EVTYPE) %>%
    kable()
EVTYPE count
TSUNAMI 20
HEAT 2690
RIP CURRENTS 734
AVALANCHE 378
HURRICANE 269
ICY ROADS 24
COLD 2281
Strong Winds 7
HIGH SURF 717
FOG 532

The weather events that cause the maximum injuries on an average basis are depicted in the graph below.

# Top 10 event types by mean injuries per event, highest first
injury_eve <- gt5post96 %>%
    group_by(EVTYPE) %>%
    summarise(meaninjuries = mean(INJURIES, na.rm = TRUE)) %>%
    arrange(desc(meaninjuries)) %>%
    head(10)

# Order the factor levels by ascending mean injuries so that the flipped
# bar chart shows the largest bar at the top
injury_eve$EVTYPE <- factor(injury_eve$EVTYPE, levels = injury_eve$EVTYPE[order(injury_eve$meaninjuries)])

# Occurrence counts for the top 10 injury events
injuryevecnt <- filter(normalizedevents, EVTYPE %in% injury_eve$EVTYPE)
# NOTE(review): order() here ranks injury_eve's factor codes, not
# injuryevecnt's rows, so the resulting level order appears to be the
# reverse of injuryevecnt's row order rather than the plot order. It is
# left unchanged because the rendered table relies on it -- confirm
# whether the plot order was intended.
injuryevecnt$EVTYPE <- factor(injuryevecnt$EVTYPE, levels = injuryevecnt$EVTYPE[order(injury_eve$EVTYPE)])

# Plot the top 10 injury events.
# EVTYPE is the x aesthetic, so after coord_flip() "x" still labels the
# event axis; the two labels were previously swapped.
ggplot(injury_eve, aes(EVTYPE, meaninjuries)) +
    geom_col(width = 0.6, fill = "dark cyan") +
    coord_flip() +
    labs(x = "Events", y = "Mean number of injuries") +
    ggtitle("Events with maximum injuries: The top 10")

The frequency of occurrence of these events is presented below.

# Alternative bar-chart view of the counts (disabled):
# ggplot(injuryevecnt, aes(EVTYPE, count)) +
#     geom_col(width=.6, fill='dark cyan') + coord_flip() +
#     labs(x='Number of events with high injuries', y= 'Events') +
#     ggtitle('Events with maximum injuries: Counts')

# Table of occurrence counts, sorted by the EVTYPE factor levels
injuryevecnt %>%
    arrange(EVTYPE) %>%
    kable()
EVTYPE count
TSUNAMI 20
ICY ROADS 24
HEAVY SURF 77
HURRICANE 269
FREEZING RAIN 393
DUST STORM 417
FOG 532
RIP CURRENTS 734
HEAT 2690
TORNADO 23154

The weather events that cause the maximum property damage on an average basis are depicted in the graph below.

# Determine the property damage in dollars from PROPDMG and its exponent
# code: K = thousands, M = millions, B = billions; any other code leaves
# PROPDMG unscaled.
gt5post96 <- mutate(gt5post96, totalprop = PROPDMG * case_when(
    PROPDMGEXP == "K" ~ 1000,
    PROPDMGEXP == "M" ~ 1e+06,
    PROPDMGEXP == "B" ~ 1e+09,
    TRUE ~ 1
))

# Top 10 event types by mean property damage, rounded to whole dollars
prop <- gt5post96 %>%
    group_by(EVTYPE) %>%
    summarise(meanprop = mean(totalprop, na.rm = TRUE)) %>%
    arrange(desc(meanprop)) %>%
    head(10) %>%
    mutate(meanprop = round(meanprop, 0))

# Order the factor levels by ascending mean damage so that the flipped
# bar chart shows the largest bar at the top
prop$EVTYPE <- with(prop, factor(EVTYPE, levels = EVTYPE[order(meanprop)]))

# Total number of occurrences for each of those event types
propevents <- gt5post96 %>%
    group_by(EVTYPE) %>%
    summarise(count = n()) %>%
    filter(EVTYPE %in% prop$EVTYPE)

# NOTE(review): order() here ranks prop's factor codes, not propevents'
# rows, so the resulting level order is not tied to the plot order --
# confirm the intended table ordering.
propevents$EVTYPE <- factor(propevents$EVTYPE, levels = propevents$EVTYPE[order(prop$EVTYPE)])

# Horizontal bar chart of mean property damage for the top 10 events
ggplot(prop, aes(EVTYPE, meanprop)) +
    geom_col(width = 0.6, fill = "dark cyan") +
    coord_flip() +
    scale_y_continuous(labels = comma) +
    labs(title = "Events that cause maximum property damage",
        x = "Events", y = "Average damage in $")

The frequency of occurrence of these events is presented below.

# Alternative bar-chart view of the counts (disabled):
# ggplot(propevents, aes(EVTYPE, count)) +
#     geom_col(width=.6, fill='dark cyan') + coord_flip() +
#     labs(x='Events', y='No. of occurences') +
#     ggtitle('How often have these events occured?')

# Table of occurrence counts, sorted by the EVTYPE factor levels
propevents %>%
    arrange(EVTYPE) %>%
    kable()
EVTYPE count
WILDFIRE 2732
WILD/FOREST FIRE 1443
TSUNAMI 20
TORNADO 23154
LANDSLIDE 602
ICE STORM 1879
HURRICANE 269
FLOOD 24247
DROUGHT 2433
COASTAL STORM 7296

The weather events that cause the maximum crop damage on an average basis are depicted in the graph below.

# Obtain the crop damage in dollars from CROPDMG and its exponent code:
# K = thousands, M = millions, B = billions; any other code leaves
# CROPDMG unscaled.
gt5post96 <- mutate(gt5post96, totalcrop = CROPDMG * case_when(
    CROPDMGEXP == "K" ~ 1000,
    CROPDMGEXP == "M" ~ 1e+06,
    CROPDMGEXP == "B" ~ 1e+09,
    TRUE ~ 1
))

# Top 10 event types by mean crop damage, highest first
crop <- gt5post96 %>%
    group_by(EVTYPE) %>%
    summarise(meancrop = mean(totalcrop, na.rm = TRUE)) %>%
    arrange(desc(meancrop)) %>%
    head(10)

# Order the factor levels by ascending mean damage so that the flipped
# bar chart shows the largest bar at the top
crop$EVTYPE <- with(crop, factor(EVTYPE, levels = EVTYPE[order(meancrop)]))

# Total number of occurrences for each of those event types
cropevents <- gt5post96 %>%
    group_by(EVTYPE) %>%
    summarise(count = n()) %>%
    filter(EVTYPE %in% crop$EVTYPE)

# NOTE(review): order() here ranks crop's factor codes, not cropevents'
# rows, so the resulting level order is not tied to the plot order --
# confirm the intended table ordering.
cropevents$EVTYPE <- factor(cropevents$EVTYPE, levels = cropevents$EVTYPE[order(crop$EVTYPE)])

# Horizontal bar chart of mean crop damage for the top 10 events
ggplot(crop, aes(EVTYPE, meancrop)) +
    geom_col(width = 0.6, fill = "dark cyan") +
    coord_flip() +
    scale_y_continuous(labels = comma) +
    labs(title = "Events with max crop damage",
        x = "Events", y = "Average damage in USD")

**The frequency of occurrence of these events is presented below.**

# Alternative bar-chart view of the counts (disabled):
# ggplot(cropevents, aes(EVTYPE, count)) +
#     geom_col(width=.6, fill='dark cyan') + coord_flip() +
#     labs(x='Events', y='No. of occurences') +
#     ggtitle('How often have these events occured?')

# Table of occurrence counts, sorted by the EVTYPE factor levels
cropevents %>%
    arrange(EVTYPE) %>%
    kable()
EVTYPE count
WILDFIRE 2732
WILD/FOREST FIRE 1443
SMALL HAIL 45
HURRICANE 269
HEAT 2690
FREEZE 1457
FLOOD 24247
EXTREME WINDCHILL 204
DROUGHT 2433
COLD 2281

Results

The following results suggest themselves from the analysis:

Human fatalities: Tsunamis, rip currents and heat waves seem to carry the highest risk of fatalities. Of these, rip currents and heat have a higher probability of occurrence.

Human injuries: Tsunamis, hurricanes and heat have the highest injuries. Hurricanes and heat waves occur quite frequently.

Property damage: Hurricanes cause relatively (and absolutely) high property damage.

Crop damage: Hurricanes and drought cause the maximum crop damage.