This analysis uses the NOAA weather database to determine which events cause the most human, property and crop damage. The analysis has been restricted to data from January 1996 onwards, since a comparative analysis between event types is only possible after that. The first part of the analysis focuses on cleaning and merging event types. The second part of the analysis looks at the top 10 events that cause human, property or crop damage, on an average basis.
This section reads the data directly from the downloaded bz2 file. The relevant columns are subset into a new dataset and filtered for events that occurred from 1996 onwards. The data is further filtered to remove any event types that had 5 or fewer occurrences. Event types that are similar are merged through a series of transformations.
library(knitr)
opts_chunk$set(tidy = TRUE, tidy.opts = list(width.cutoff = 60))
# Fetch the compressed storm data only if it is not already on disk,
# then read the bz2 archive into a data frame
storm_file <- "stormdata.csv.bz2"
if (!file.exists(storm_file)) {
  download.file(
    url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = storm_file
  )
}
stormdata <- read.csv(storm_file, header = TRUE, sep = ",")
# Load packages
library(dplyr)
library(ggplot2)
library(scales)
# Subset the raw data ----
# Keep only the columns used by the analysis
stormsub <- stormdata %>%
  select(
    STATE, BGN_DATE, EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  )
# Derive the year each event began in; only events from 1996 onwards
# are kept because event types were expanded only from that year
year <- as.POSIXct(stormsub$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
stormsub$Year <- lubridate::year(year)
after1996 <- stormsub %>%
  filter(Year > 1995)
# Count occurrences per event type and keep the types seen more than
# 5 times, most frequent first
by_event <- after1996 %>%
  group_by(EVTYPE) %>%
  summarise(count = n()) %>%
  filter(count > 5) %>%
  arrange(desc(count))
gt5post96 <- after1996 %>%
  filter(EVTYPE %in% by_event$EVTYPE)
# Clean data to standardize duplicate event types ----
# Work on a plain character copy of EVTYPE so the recoding behaves the
# same whether read.csv() produced a factor (R < 4.0) or a character
# column (R >= 4.0). Assigning a new label to a factor whose levels do
# not contain it would yield NA, and levels() is NULL for character
# vectors, so neither representation is relied on here.
# The steps run in order: later pattern matches see the labels
# produced by earlier steps.
evtype <- as.character(gt5post96$EVTYPE)

# Standardize heat events, including any label mentioning "warm"
evtype[evtype %in% c("EXCESSIVE HEAT", "RECORD HEAT")] <- "HEAT"
evtype[grepl("warm", evtype, ignore.case = TRUE)] <- "HEAT"

# Standardize rip current events
evtype[evtype %in% "RIP CURRENT"] <- "RIP CURRENTS"

# Standardize hurricane events
evtype[evtype %in% c("HURRICANE/TYPHOON", "TYPHOON")] <- "HURRICANE"

# Standardize landslide events
evtype[evtype %in% c("Mudslide", "MUDSLIDE")] <- "LANDSLIDE"

# Standardize storm events
evtype[evtype %in% c(
  "TSTM WIND", "THUNDERSTORM WIND", "TROPICAL STORM", "STORM SURGE",
  "SEVERE THUNDERSTORMS"
)] <- "THUNDERSTORM"

# Standardize coastal storm events: fixed labels first, then every
# label mentioning "coastal", "tidal" or "tide", then surf/marine wind
evtype[evtype %in% c(
  "MARINE THUNDERSTORM WIND", "STORM SURGE/TIDE", "HIGH SEAS"
)] <- "COASTAL STORM"
evtype[grepl("coastal|tidal|tide", evtype, ignore.case = TRUE)] <-
  "COASTAL STORM"
evtype[evtype %in% c(
  "High Surf", "MARINE STRONG WIND", "HEAVY SURF/HIGH SURF"
)] <- "COASTAL STORM"

# Standardize cold events: any label mentioning "cold"
evtype[grepl("cold", evtype, ignore.case = TRUE)] <- "COLD"

# Standardize freezing rain
evtype[evtype %in% c(
  "GLAZE", "BLACK ICE", "WINTRY MIX", "WINTER WEATHER MIX",
  "MIXED PRECIPITATION", "MIXED PRECIP", "FREEZING DRIZZLE",
  "Freezing Rain", "FREEZING RAIN/SLEET", "FREEZING RAIN",
  "LIGHT FREEZING RAIN"
)] <- "FREEZING RAIN"

# Standardize freeze
evtype[evtype %in% c("FROST/FREEZE", "HARD FREEZE", "FROST")] <- "FREEZE"

# Write the cleaned labels back
gt5post96$EVTYPE <- evtype
The weather events that cause the maximum fatalities on an average basis are depicted in the graph below.
# Mean fatalities per event type, keeping the 10 highest (rows stay in
# descending order of the mean)
fatal_eve <- gt5post96 %>%
  group_by(EVTYPE) %>%
  summarise(meanfatal = mean(FATALITIES, na.rm = TRUE)) %>%
  arrange(desc(meanfatal)) %>%
  head(10)
# Turn EVTYPE into a factor whose levels run from the lowest to the
# highest mean, which fixes the bar order in the plot
fatal_eve$EVTYPE <- with(
  fatal_eve,
  factor(EVTYPE, levels = EVTYPE[order(meanfatal)])
)
# Occurrence counts per event type on the cleaned data (types with
# more than 5 events, most frequent first)
normalizedevents <- gt5post96 %>%
  group_by(EVTYPE) %>%
  summarise(count = n()) %>%
  filter(count > 5) %>%
  arrange(desc(count))
# Counts for the top 10 fatality events, with levels following the
# row order of fatal_eve
fatalevecnt <- normalizedevents %>%
  filter(EVTYPE %in% fatal_eve$EVTYPE)
fatalevecnt$EVTYPE <- factor(fatalevecnt$EVTYPE, levels = fatal_eve$EVTYPE)
# Horizontal bar chart of mean fatalities for the top 10 events
ggplot(data = fatal_eve, mapping = aes(x = EVTYPE, y = meanfatal)) +
  geom_col(width = 0.6, fill = "dark cyan") +
  coord_flip() +
  labs(x = "Events", y = "Mean number of fatalities") +
  ggtitle("Top 10 Events with maximum fatalities")
The frequency of occurrence of these events is presented below to help understand the impact in terms of both the potential for impact and the probability of occurrence.
# Disabled alternative: bar chart of the same counts
# ggplot(fatalevecnt, aes(EVTYPE, count)) +
#   geom_col(width = .6, fill = 'dark cyan') + coord_flip() +
#   labs(x = 'Events', y = 'Count of events') +
#   ggtitle('Events with maximum fatalities: Total counts')
# Table of occurrence counts, ordered by the EVTYPE factor levels
fatalevecnt %>%
  arrange(EVTYPE) %>%
  kable()
| EVTYPE | count |
|---|---|
| TSUNAMI | 20 |
| HEAT | 2690 |
| RIP CURRENTS | 734 |
| AVALANCHE | 378 |
| HURRICANE | 269 |
| ICY ROADS | 24 |
| COLD | 2281 |
| Strong Winds | 7 |
| HIGH SURF | 717 |
| FOG | 532 |
The weather events that cause the maximum injuries on an average basis are depicted in the graph below.
# Mean injuries per event type, keeping the 10 highest (rows stay in
# descending order of the mean)
injury_eve <- group_by(gt5post96, EVTYPE) %>%
  summarise(meaninjuries = mean(INJURIES, na.rm = TRUE)) %>%
  arrange(desc(meaninjuries)) %>%
  head(10)
# Turn EVTYPE into a factor whose levels run from the lowest to the
# highest mean, which fixes the bar order in the plot
injury_eve$EVTYPE <- factor(
  injury_eve$EVTYPE,
  levels = injury_eve$EVTYPE[order(injury_eve$meaninjuries)]
)
# Occurrence counts for the same 10 event types
injuryevecnt <- filter(normalizedevents, EVTYPE %in% injury_eve$EVTYPE)
# NOTE(review): the levels below index injuryevecnt's rows with an
# ordering computed from injury_eve, a table with a different row
# order, so the resulting level order does not follow mean injuries
# (the fatalities table uses `levels = fatal_eve$EVTYPE` instead).
# Left unchanged so the table order stays as before; confirm intent.
injuryevecnt$EVTYPE <- factor(
  injuryevecnt$EVTYPE,
  levels = injuryevecnt$EVTYPE[order(injury_eve$EVTYPE)]
)
# Horizontal bar chart of mean injuries for the top 10 events.
# labs() names aesthetics, not screen positions: x is EVTYPE and y is
# meaninjuries even after coord_flip().
ggplot(injury_eve, aes(EVTYPE, meaninjuries)) +
  geom_col(width = 0.6, fill = "dark cyan") +
  coord_flip() +
  labs(x = "Events", y = "Mean number of injuries") +
  ggtitle("Events with maximum injuries: The top 10")
The frequency of occurrence of these events is presented below.
# Disabled alternative: bar chart of the same counts
# ggplot(injuryevecnt, aes(EVTYPE, count)) +
#   geom_col(width = .6, fill = 'dark cyan') + coord_flip() +
#   labs(x = 'Events', y = 'Number of events with high injuries') +
#   ggtitle('Events with maximum injuries: Counts')
# Table of occurrence counts, ordered by the EVTYPE factor levels
injuryevecnt %>%
  arrange(EVTYPE) %>%
  kable()
| EVTYPE | count |
|---|---|
| TSUNAMI | 20 |
| ICY ROADS | 24 |
| HEAVY SURF | 77 |
| HURRICANE | 269 |
| FREEZING RAIN | 393 |
| DUST STORM | 417 |
| FOG | 532 |
| RIP CURRENTS | 734 |
| HEAT | 2690 |
| TORNADO | 23154 |
The weather events that cause the maximum property damage on an average basis are depicted in the graph below.
# Determine property damage from PROPDMG and its exponent code:
# "K", "M" and "B" scale the value by 1e3, 1e6 and 1e9; any other
# code leaves PROPDMG as it is
prop_scale <- c(K = 1e3, M = 1e6, B = 1e9)[as.character(gt5post96$PROPDMGEXP)]
prop_scale[is.na(prop_scale)] <- 1
gt5post96$totalprop <- gt5post96$PROPDMG * unname(prop_scale)
# Mean property damage per event type, top 10, rounded to whole units
prop <- gt5post96 %>%
  group_by(EVTYPE) %>%
  summarise(meanprop = mean(totalprop, na.rm = TRUE)) %>%
  arrange(desc(meanprop)) %>%
  head(10) %>%
  mutate(meanprop = round(meanprop, 0))
# Factor levels run from the lowest to the highest mean so the bars
# are drawn in that order
prop$EVTYPE <- with(prop, factor(EVTYPE, levels = EVTYPE[order(meanprop)]))
# Total number of occurrences for each of those event types
propevents <- gt5post96 %>%
  group_by(EVTYPE) %>%
  summarise(count = n()) %>%
  filter(EVTYPE %in% prop$EVTYPE)
propevents$EVTYPE <- factor(
  propevents$EVTYPE,
  levels = propevents$EVTYPE[order(prop$EVTYPE)]
)
# Horizontal bar chart of mean property damage for the top 10 events
ggplot(data = prop, mapping = aes(x = EVTYPE, y = meanprop)) +
  geom_col(width = 0.6, fill = "dark cyan") +
  coord_flip() +
  labs(x = "Events", y = "Average damage in $") +
  ggtitle("Events that cause maximum property damage") +
  scale_y_continuous(labels = comma)
The frequency of occurrence of these events is presented below.
# Disabled alternative: bar chart of the same counts
# ggplot(propevents, aes(EVTYPE, count)) +
#   geom_col(width = .6, fill = 'dark cyan') + coord_flip() +
#   labs(x = 'Events', y = 'No. of occurences') +
#   ggtitle('How often have these events occured?')
# Table of occurrence counts, ordered by the EVTYPE factor levels
propevents %>%
  arrange(EVTYPE) %>%
  kable()
| EVTYPE | count |
|---|---|
| WILDFIRE | 2732 |
| WILD/FOREST FIRE | 1443 |
| TSUNAMI | 20 |
| TORNADO | 23154 |
| LANDSLIDE | 602 |
| ICE STORM | 1879 |
| HURRICANE | 269 |
| FLOOD | 24247 |
| DROUGHT | 2433 |
| COASTAL STORM | 7296 |
The weather events that cause the maximum crop damage on an average basis are depicted in the graph below.
# Obtain actual crop damage from CROPDMG and its exponent code:
# "K", "M" and "B" scale the value by 1e3, 1e6 and 1e9; any other
# code leaves CROPDMG as it is
crop_scale <- c(K = 1e3, M = 1e6, B = 1e9)[as.character(gt5post96$CROPDMGEXP)]
crop_scale[is.na(crop_scale)] <- 1
gt5post96$totalcrop <- gt5post96$CROPDMG * unname(crop_scale)
# Mean crop damage per event type, keeping the 10 highest
crop <- gt5post96 %>%
  group_by(EVTYPE) %>%
  summarise(meancrop = mean(totalcrop, na.rm = TRUE)) %>%
  arrange(desc(meancrop)) %>%
  head(10)
# Factor levels run from the lowest to the highest mean so the bars
# are drawn in that order
crop$EVTYPE <- with(crop, factor(EVTYPE, levels = EVTYPE[order(meancrop)]))
# Total number of occurrences for each of those event types
cropevents <- gt5post96 %>%
  group_by(EVTYPE) %>%
  summarise(count = n()) %>%
  filter(EVTYPE %in% crop$EVTYPE)
cropevents$EVTYPE <- factor(
  cropevents$EVTYPE,
  levels = cropevents$EVTYPE[order(crop$EVTYPE)]
)
# Horizontal bar chart of mean crop damage for the top 10 events
ggplot(data = crop, mapping = aes(x = EVTYPE, y = meancrop)) +
  geom_col(width = 0.6, fill = "dark cyan") +
  coord_flip() +
  labs(x = "Events", y = "Average damage in USD") +
  ggtitle("Events with max crop damage") +
  scale_y_continuous(labels = comma)
**The frequency of occurrence of these events is presented below.**
# Disabled alternative: bar chart of events against their frequency
# ggplot(cropevents, aes(EVTYPE, count)) +
#   geom_col(width = .6, fill = 'dark cyan') + coord_flip() +
#   labs(x = 'Events', y = 'No. of occurences') +
#   ggtitle('How often have these events occured?')
# Table of occurrence counts, ordered by the EVTYPE factor levels
cropevents %>%
  arrange(EVTYPE) %>%
  kable()
| EVTYPE | count |
|---|---|
| WILDFIRE | 2732 |
| WILD/FOREST FIRE | 1443 |
| SMALL HAIL | 45 |
| HURRICANE | 269 |
| HEAT | 2690 |
| FREEZE | 1457 |
| FLOOD | 24247 |
| EXTREME WINDCHILL | 204 |
| DROUGHT | 2433 |
| COLD | 2281 |
The following results suggest themselves from the analysis:
Human fatalities: Tsunamis, rip currents and heat waves seem to carry the highest risk of fatalities. Of these, rip currents and heat have a higher probability of occurrence.
Human injuries: Tsunamis, hurricanes and heat have the highest injuries. Hurricanes and heat waves occur quite frequently.
Property damage: Hurricanes cause relatively (and absolutely) high property damage.
Crop damage: Hurricanes and drought cause the maximum crop damage.