This paper looks at the data collected by National Oceanic and Atmospheric Administration on storms and other significant weather phenomena occuring between 1950 and 2011, and the effect they have on population (deaths and injuries) and on economy (property and crop damages).
The analysis in the paper will answer two questions:
1. Which types of weather phenomena are the most harmful with respect to population health?
2. Which types of weather phenomena have the greatest economic consequences?
Both questions will be addressed in terms of aggregate impact, average impact of an individual event and total number of event occurrences.
While this paper is not inteded to help with actual policy decisions, it’s written (as part of the assignment) to demonstrate how this type of analysis coud be used to support decision making.
The analysis in this paper is fully reproducible, all code used to produce the results is included below, starting with the code to retrieve the data.
We begin with setup code - loading required libraries and retrieval of the data:
library(ggplot2)
library(gridExtra)
library(stringr)
library(plyr)
library(reshape2)
if(!file.exists('stormdata.RData')){
if (!file.exists('stormdata.csv.bz2')){
download.file(url = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2',
destfile = 'stormdata.csv.bz2', method = 'curl')
}
}
df = read.csv(bzfile('stormdata.csv.bz2'), stringsAsFactors=F)
Next we will fix the Property Damage data - the data is split between two fields, PROPDMG and PROPDMGEXP. In some cases, PROPDMGEXP contains invalid characters. We replace these with NA values:
nas = c('', '-', '?', '+')
df[df$PROPDMGEXP %in% nas, 'PROPDMGEXP'] = NA
In other cases, PROPDMGEXP contains multipliers: M is millions, K is thousands, etc. We apply these multipliers to values in PROPDMG and after that, replace them with NA values:
multipliers = data.frame(code = c('h', 'H', 'k', 'K', 'm', 'M', 'B'),
mult = c(100, 100, 1000, 1000, 1000000, 1000000, 1000000000))
for (i in 1:dim(multipliers)[1]){
m = multipliers[i,'mult']
df[!is.na(df$PROPDMGEXP) & df$PROPDMGEXP == multipliers[i,'code'], 'PROPDMG'] =
df[!is.na(df$PROPDMGEXP) & df$PROPDMGEXP == multipliers[i,'code'], 'PROPDMG'] * m
df[!is.na(df$PROPDMGEXP) & df$PROPDMGEXP == multipliers[i,'code'], 'PROPDMGEXP'] = NA
}
Remaining values of PROPDMGEXP are overflow digits: e.g., number 253 could end up with 25 in PROPDMG and 3 in PROPDMGEXP. We fix these last:
df[!is.na(df$PROPDMGEXP), 'PROPDMG'] = df[!is.na(df$PROPDMGEXP), 'PROPDMG'] * 10 +
as.numeric(df[!is.na(df$PROPDMGEXP), 'PROPDMGEXP'])
Exact same set of corrections needs to be applied to crop damages (CROPDMG and CROPDMGEXP fields):
df[df$CROPDMGEXP %in% nas, 'CROPDMGEXP'] = NA
for (i in 1:dim(multipliers)[1]){
m = multipliers[i,'mult']
df[!is.na(df$CROPDMGEXP) & df$CROPDMGEXP == multipliers[i,'code'], 'CROPDMG'] =
df[!is.na(df$CROPDMGEXP) & df$CROPDMGEXP == multipliers[i,'code'], 'CROPDMG'] * m
df[!is.na(df$CROPDMGEXP) & df$CROPDMGEXP == multipliers[i,'code'], 'CROPDMGEXP'] = NA
}
df[!is.na(df$CROPDMGEXP), 'CROPDMG'] = df[!is.na(df$CROPDMGEXP), 'CROPDMG'] * 10 +
as.numeric(df[!is.na(df$CROPDMGEXP), 'CROPDMGEXP'])
The Event type classifier (EVTYPE field) has a number of typos and redundancies, which could skew the numbers in favor of less impacting type of event that’s easier to spell. While we do not attempt a definitive data cleanup, the most glaring issues are corrected in the code below. The approach to cleaning is to first get rid of special characters, repetitive spaces, plural endings, convert everything to lowercase and remove observations that represent summaries rather than individual events:
df$EVTYPE = str_replace(df$EVTYPE, '/', ' ')
df$EVTYPE = str_replace(df$EVTYPE, '^[ ]+', '')
df$EVTYPE = str_replace(df$EVTYPE, '[ ]+$', '')
df$EVTYPE = str_replace(df$EVTYPE, '[ ]+', ' ')
df = df[df$EVTYPE != '?',]
df$EVTYPE = tolower(df$EVTYPE)
df = df[!str_detect(df$EVTYPE, '^summary'),]
df$EVTYPE = str_replace(df$EVTYPE, 's$', '')
Next step is to take care of more noticeable typos and to combine groups of redundant event type names into single event type:
df$EVTYPE = str_replace(df$EVTYPE, 'avalance', 'avalanche')
df$EVTYPE = str_replace(df$EVTYPE, 'beach erosin', 'beach erosion')
df[str_detect(df$EVTYPE, 'blizzard'), 'EVTYPE'] = 'blizzard'
df[str_detect(df$EVTYPE, '^blowing snow'), 'EVTYPE'] = 'blowing snow'
df[str_detect(df$EVTYPE, '^coastal(.*)storm'), 'EVTYPE'] = 'coastal storm'
df[str_detect(df$EVTYPE, '^cold wind'), 'EVTYPE'] = 'cold wind'
df[str_detect(df$EVTYPE, '^dam '), 'EVTYPE'] = 'dam failure'
df[str_detect(df$EVTYPE, '^downburst'), 'EVTYPE'] = 'downburst wind'
df[str_detect(df$EVTYPE, '^drought '), 'EVTYPE'] = 'drought'
df[str_detect(df$EVTYPE, '^dry'), 'EVTYPE'] = 'dry weather'
df[str_detect(df$EVTYPE, '^dust'), 'EVTYPE'] = 'dust storm'
df[str_detect(df$EVTYPE, '^extreme(.*)wind'), 'EVTYPE'] = 'extreme wind chill'
df[str_detect(df$EVTYPE, 'flo[o]+d'), 'EVTYPE'] = 'flood'
df[str_detect(df$EVTYPE, '^freezing '), 'EVTYPE'] = 'freezing rain'
df[str_detect(df$EVTYPE, '^frost'), 'EVTYPE'] = 'frost'
df[str_detect(df$EVTYPE, '^funnel'), 'EVTYPE'] = 'funnel cloud'
df[str_detect(df$EVTYPE, '^glaze'), 'EVTYPE'] = 'glaze'
df[str_detect(df$EVTYPE, '^gusty'), 'EVTYPE'] = 'gusty wind'
df[str_detect(df$EVTYPE, '^hail'), 'EVTYPE'] = 'hail'
df[str_detect(df$EVTYPE, 'heat'), 'EVTYPE'] = 'heat wave'
df[str_detect(df$EVTYPE, 'heavy(.*)rain'), 'EVTYPE'] = 'heavy rain'
df[str_detect(df$EVTYPE, '^high surf'), 'EVTYPE'] = 'high surf'
df[str_detect(df$EVTYPE, '^high wind'), 'EVTYPE'] = 'high wind'
df[str_detect(df$EVTYPE, 'hurricane'), 'EVTYPE'] = 'hurricane'
df[str_detect(df$EVTYPE, '^late(.*)snow'), 'EVTYPE'] = 'late season snow'
df[str_detect(df$EVTYPE, '^light snow'), 'EVTYPE'] = 'light snow'
df[str_detect(df$EVTYPE, '^lig[nh]t[n]*ing'), 'EVTYPE'] = 'lightning'
df[str_detect(df$EVTYPE, '^low temperature'), 'EVTYPE'] = 'low temperature'
df[str_detect(df$EVTYPE, '^mud(.*)slide'), 'EVTYPE'] = 'mud slide'
df[str_detect(df$EVTYPE, '(tornado)|(torndao)'), 'EVTYPE'] = 'tornado'
df[str_detect(df$EVTYPE, '(t[h]*u[n]*[d]*[e]+[r]*[e]*[s]*torm)|(thunderstrom)|(thundertsorm)|(tstm)'), 'EVTYPE'] = 'thunderstorm'
df[str_detect(df$EVTYPE, 'tropical storm'), 'EVTYPE'] = 'tropical storm'
df[str_detect(df$EVTYPE, '^volcanic ash'), 'EVTYPE'] = 'volcanic ash'
df[str_detect(df$EVTYPE, '^wa[y]*ter(.*)spout'), 'EVTYPE'] = 'water spout'
df[str_detect(df$EVTYPE, 'fire'), 'EVTYPE'] = 'wild fire'
As a final step in event type cleanup, we capitalize each word in event type names for better readability:
cap_each_word = function(x) {
s = strsplit(x, ' ' )[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
sep = '', collapse = ' ')
}
df$EVTYPE = sapply(df$EVTYPE, cap_each_word)
df$EVTYPE = as.factor(df$EVTYPE)
Next, we add two columns - HARM (fatalities + injuries) and DMG (property + crop damages) and aggregate the data by event type:
df$HARM = df$FATALITIES + df$INJURIES
df$DMG = df$PROPDMG + df$CROPDMG
df_agg = ddply(df, .(EVTYPE), summarize, t_harm=sum(HARM), t_fatalities=sum(FATALITIES), m_fatalities=mean(FATALITIES),
t_injuries=sum(INJURIES), m_injuries=mean(INJURIES), t_dmg=sum(DMG), m_dmg=mean(DMG))
For analyzing harm to the population, we will create a narrow dataset, containing only event types that resulted in more than 200 fatalities or more than 300 injuries in total between 1950 and 2011. We will then order these select event types by total harm (total number of fatalities and injuries), create a subset of the detailed data with just these event types and apply ordering both to the detailed data and to aggregate data:
df_agg_melt_harm = melt(df_agg[df_agg$t_fatalities>200 | df_agg$t_injuries>300,], id='EVTYPE')
df_top_melt_harm = df_agg_melt_harm[df_agg_melt_harm$variable == 't_harm',]
df_top_melt_harm = df_top_melt_harm[with(df_top_melt_harm, order(-value)),]
event_list_harm = as.character(df_top_melt_harm$EVTYPE)
df_top_harm = df[df$EVTYPE %in% event_list_harm,]
df_top_harm$EVTYPE = ordered(as.character(df_top_harm$EVTYPE), levels=event_list_harm)
df_agg_melt_harm$EVTYPE = ordered(as.character(df_agg_melt_harm$EVTYPE), levels=event_list_harm)
For analyzing economic impact, we prepare data in the same manner, with narrow dataset, containig only event types resulting in combined damage (property + crop) of over $1,000,000,000 between 1950 and 2011. We then apply same type of transformations as we did for harm data, resulting in event types for both narrow and detailed data ordered by total combined damage:
df_agg_melt_dmg = melt(df_agg[df_agg$t_dmg>1000000000,], id='EVTYPE')
df_top_melt_dmg = df_agg_melt_dmg[df_agg_melt_dmg$variable == 't_dmg',]
df_top_melt_dmg = df_top_melt_dmg[with(df_top_melt_dmg, order(-value)),]
event_list_dmg = as.character(df_top_melt_dmg$EVTYPE)
df_top_dmg = df[df$EVTYPE %in% event_list_dmg,]
df_top_dmg$EVTYPE = ordered(as.character(df_top_dmg$EVTYPE), levels=event_list_dmg)
df_agg_melt_dmg$EVTYPE = ordered(as.character(df_agg_melt_dmg$EVTYPE), levels=event_list_dmg)
harm_p1 = ggplot(df_agg_melt_harm[df_agg_melt_harm$variable %in% c('t_fatalities', 't_injuries'),],
aes(x=EVTYPE, y=value, fill=variable)) + geom_bar(stat='identity') +
theme(axis.text.y = element_text(angle=45, hjust=1), axis.text.x = element_blank(), legend.position=c(.9, .7)) +
scale_y_sqrt(breaks=c(1000, 4000, 8000, 16000, 32000, 128000)) +
xlab('') + ylab('Total Victims') +
scale_fill_discrete(name='', breaks=c('t_fatalities', 't_injuries'), labels=c('Fatalities', 'Injuries'))
harm_p2 = ggplot(df_agg_melt_harm[df_agg_melt_harm$variable %in% c('m_fatalities', 'm_injuries'),],
aes(x=EVTYPE, y=value, fill=variable)) + geom_bar(stat='identity') +
theme(axis.text.x = element_blank(), legend.position='none') +
scale_y_continuous(breaks=0:5, labels=paste('. ', 0:5)) +
xlab('') + ylab('Avg Victims per Event') +
scale_fill_discrete(name='', breaks=c('m_fatalities', 'm_injuries'), labels=c('Fatalities', 'Injuries'))
harm_p3 = ggplot(df_top_harm, aes(x=EVTYPE)) + geom_bar() +
theme(axis.text = element_text(angle=45, hjust=1)) +
xlab('') + ylab('Ocurrences') +
scale_y_sqrt(breaks=c(2000, 16000, 32000, 128000, 256000))
grid.arrange(harm_p1, harm_p2, harm_p3, nrow=3, main='Events Most Harmful to Population')
As the graph shows, while by the total number of victims tornados are the most dangerous, average number of victims per event is fairly small - the large total is due to frequent occurrences.
On the other hand, while hurricanes are the most dangerous (largest average number of victims per event), they are infrequent, so the total number of victims is low in comparison.
The two most frequent event types are thunderstorms and hail, but most of the time there are no victims.
Heat waves seem to have the largest impact, with second highest total number of victims and second highest average number of victims per event.
dmg_p1 = ggplot(df_agg_melt_dmg[df_agg_melt_dmg$variable == 't_dmg',],
aes(x=EVTYPE, y=value)) + geom_bar(stat='identity') +
theme(axis.text.y = element_text(angle=45, hjust=1), axis.text.x = element_blank()) +
scale_y_sqrt(breaks=c(2000000000, 8000000000, 32000000000, 64000000000, 128000000000),
labels=c('$2B', '$8B', '$32B', '$64B', '$128B')) +
xlab('') + ylab('Total Damage')
dmg_p2 = ggplot(df_agg_melt_dmg[df_agg_melt_dmg$variable == 'm_dmg',],
aes(x=EVTYPE, y=value)) + geom_bar(stat='identity') +
theme(axis.text.y = element_text(angle=45, hjust=1), axis.text.x = element_blank()) +
scale_y_sqrt(breaks=c(8000000, 32000000, 128000000, 256000000, 1000000000),
labels=c('$8M', '$32M', '$128M', '$256M', '$1B')) +
xlab('') + ylab('Avg Damage per Event')
dmg_p3 = ggplot(df_top_dmg, aes(x=EVTYPE)) + geom_bar() +
theme(axis.text = element_text(angle=45, hjust=1)) +
xlab('') + ylab('Number of Ocurrences') +
scale_y_sqrt(breaks=c(2000, 10000, 50000, 150000, 250000))
grid.arrange(dmg_p1, dmg_p2, dmg_p3, nrow=3, main='Events with Greatest Economic Consequences')
Similar to population harm data, we look at total damage for all events from 1950 to 2011, average damage per event and number of event occurrences, to understand which types of events have the most impact.
While floods and tornadoes are first and third in total damages, average damage per event is relatively small and the large totals are due to frequent occurrences.
Thunderstorms and hail are the most frequent, but just as with harm, they cause relatively little damage.
Hurricanes and storms cause the most damage per event and have the largest total damage, but aren’t very frequent.
Overall, floods, tornadoes (first and thrid in total damage and third and fourth in number of occurrences), hurricanes and storms (first and second in average damage per event and second and fourth in total damage) can be considered as having the highest impact.