library(stringr)
library(qcc)

1 Synopsis

This project explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States. The database contains estimates of any fatalities, injuries, and property damage for the period starting in the year 1950 and ending in November 2011. This project will emphasize human loss, in the form of fatalities or injuries, and monetary losses due to weather events.

2 Data Processing

# Load the raw NOAA storm data.
# read.csv() is used rather than read.table(sep = ","): read.table()'s
# defaults also treat the apostrophe as a quote character, treat '#' as the
# start of a comment and reject rows with missing trailing fields, any of
# which can split or truncate records that contain free text.
dat <- read.csv(file = "StormData.csv", header = TRUE)
# dim() gives c(number of records, number of variables).
dat.size <- dim(dat)

The data file contains 902297 records and 37 variables. Records from regions other than the 50 states, the Virgin Islands, and Puerto Rico were removed from the data set.

# Number of distinct (non-missing) codes found in the STATE column.
states.number <- nlevels(as.factor(dat$STATE))

The original data lists 72 distinct state codes. Some of these abbreviations do not correspond to recognized states, and the records carrying them were removed from the data set.

# Removes data from other than the 50 states, Virgin Islands and Puerto Rico.
# "ME" is deliberately NOT on the exclusion list: it is the postal
# abbreviation for Maine, one of the 50 states this analysis keeps.
# NOTE(review): any code that is not listed here (for example "DC" or "GU",
# if present in the data) is still retained, although the report text
# mentions only the Virgin Islands and Puerto Rico -- confirm the intent.
excluded.regions <- c("AM", "AN", "AS", "GM", "LC", "LE", "LH", "LM", "LO",
                      "LS", "MH", "PH", "PK", "PM", "PZ", "SL", "ST", "XX")
# Records with a missing STATE are dropped as well.
dat <- dat[!is.na(dat$STATE) & !(dat$STATE %in% excluded.regions), ]
dat.size <- dim(dat)

The resulting set contains data for the 50 USA states, Virgin Islands and Puerto Rico. Other regions in the dataset have been excluded from this analysis.

Some obvious data entry errors were noticed in the event types (EVTYPE). After closer examination of this variable it was noticed that some categories were redundant. Categories believed to be redundant were consolidated.

# Data cleansing
# Consolidates redundant event types and fixes data entry errors in EVTYPE.
#
# The rules run strictly in the order written, and later substring rules see
# the labels produced by earlier ones (for example, everything relabelled
# "thunderstorm" is afterwards absorbed by the "storm" rule). Do not reorder
# them without re-checking the resulting categories.
dat$EVTYPE <- tolower(dat$EVTYPE)

# Eliminate Summary records. A logical mask is used instead of -grep():
# when nothing matches, dat[-integer(0), ] selects zero rows and would
# silently empty the whole data set.
dat <- dat[!grepl("summary", dat$EVTYPE), ]

dat$EVTYPE <- local({
  ev <- dat$EVTYPE

  # Relabel values equal to any element of `from`. Does nothing (rather than
  # raising an error) when no value matches.
  exact <- function(x, from, to) replace(x, x %in% from, to)
  # Relabel values that contain the regular expression `pattern`. Does
  # nothing when no value matches.
  like <- function(x, pattern, to) replace(x, grepl(pattern, x), to)

  # --- winter spelling variants (merged into "winter weather" further down)
  ev <- exact(ev, c("winter storm high winds",
                    "winter storm/high wind",
                    "winter storm/high winds"), "winter storm/high winds")
  ev <- exact(ev, c("winter storm", "winter storms"), "winter storms")
  ev <- exact(ev, c("winter mix", "winter weather mix",
                    "wintery mix", "wintry mix"), "winter mix")

  ev <- exact(ev, c("wnd", "whirlwind"), "wind")

  ev <- exact(ev, c("wild/forest fire", "wild/forest fires", "wildfire",
                    "wild fires", "wildfires"), "wildfire")

  ev <- exact(ev, c("waterspouts", "wayterspout", "waterspout-",
                    "water spout", "waterspout funnel cloud",
                    "waterspout/"), "waterspout")

  ev <- like(ev, "tornado", "tornado")
  ev <- exact(ev, "torndao", "tornado")

  # Must run before the "wind" and "wet" substring rules below, otherwise
  # "winter storm/high winds", "wind chill" and "wet snow" would be claimed
  # by those rules instead.
  ev <- exact(ev, c("winter weather", "winter weather/mix",
                    "winter storm/high winds", "winter mix", "wind chill",
                    "wet snow", "blizzard", "blizzard weather",
                    "ground blizzard", "winter storms"), "winter weather")

  ev <- like(ev, "wind", "wind")
  ev <- like(ev, "wet", "wet weather")
  ev <- like(ev, "slide", "mud/rock/land slide")

  ev <- like(ev, "wall cloud", "wall cloud")

  ev <- exact(ev, c("volcanic ash plume", "volcanic ashfall", "volcanic ash",
                    "vog", "volcanic eruption"), "volcanic activity")

  ev <- exact(ev, c("urban/small stream",
                    "urban/small stream flood",
                    "urban/small stream flooding",
                    "urban/small strm fldg",
                    "urban/sml stream fldg",
                    "urban/street flooding",
                    "urban and small stream flood",
                    "urban and small stream floodin",
                    "urban flood",
                    "urban flood landslide",
                    "urban and small",
                    "urban and small stream",
                    "urban flooding",
                    "urban floods",
                    "urban small",
                    "urban small stream flood",
                    "urban/small",
                    "urban/small flooding",
                    "urban/small stream  flood",
                    "urban/sml stream fld"), "urban/small stream flooding")
  ev <- like(ev, "flood", "flooding")
  ev <- like(ev, "stream", "flooding")

  ev <- like(ev, "warm", "warm weather")

  ev <- like(ev, "rain", "rain")

  ev <- like(ev, "cold", "cold")
  ev <- like(ev, "low temp", "cold")
  ev <- like(ev, "cool", "cold")

  ev <- like(ev, "thunder", "thunderstorm")
  ev <- like(ev, "tstm", "thunderstorm")
  # "thunderstorm" contains "storm", so the two rules above end up in the
  # generic "storm" category as well.
  ev <- like(ev, "storm", "storm")
  ev <- like(ev, "snow", "snow")
  ev <- like(ev, "dry", "dry weather")
  ev <- like(ev, "hail", "hail")
  ev <- like(ev, "current", "current")
  ev <- like(ev, "record", "record temperature")
  ev <- like(ev, "red flag", "red flag criteria")
  ev <- like(ev, "lightning", "lightning")
  ev <- like(ev, "lighting", "lightning")
  ev <- like(ev, "ligntning", "lightning")
  ev <- like(ev, "ice", "ice")
  ev <- like(ev, "marine", "marine accident")
  ev <- like(ev, "hurricane", "hurricane")
  ev <- like(ev, "hypothermia", "hypothermia")
  # NOTE(review): hyperthermia is a heat condition, yet it is filed under
  # "hypothermia" here. Kept as in the original analysis -- confirm whether
  # the source label is a typo for hypothermia or should map to "heat".
  ev <- like(ev, "hyperthermia", "hypothermia")
  # The "heat" rule also covers "excessive heat", "extreme heat" and
  # "heat wave", so no separate rules are needed for those.
  ev <- like(ev, "hot", "heat")
  ev <- like(ev, "heat", "heat")
  # Plain "gust" / "freez": the earlier patterns "gust*" and "freez*" were
  # regular expressions meaning "gus"/"free" followed by any number of the
  # last letter, i.e. they matched more than intended.
  ev <- like(ev, "gust", "gusts")
  ev <- like(ev, "funnel", "funnel cloud")
  ev <- like(ev, "freez", "freeze")
  ev <- like(ev, "frost", "freeze")
  ev <- like(ev, "fire", "fire")
  ev <- like(ev, "smoke", "smoke")
  ev <- like(ev, "dust dev", "dust devil")
  ev <- like(ev, "remnants of floyd", "hurricane")
  ev <- like(ev, "precip", "precipitation")
  ev <- like(ev, "avalance", "avalanche")
  ev <- like(ev, "fog", "fog")

  ev
})

# Number of distinct event types left after consolidation.
num.events <- length(table(dat$EVTYPE))

The resulting data file still has 37 variables but only contains 882758 records. After consolidating events and correcting for relevant data entry errors, the number of events is now 92.

2.1 Types of events that are most harmful with respect to population health?

In this analysis, “harmful events” are defined as those events that result in loss of life (fatalities) or injuries.

2.1.1 Fatalities

Fatalities were classified according to the type of event. The total number of fatalities per event was calculated and the percentage of the total was reported for each event.

# Total number of fatalities for each event type.
fatal <- tapply(X = dat$FATALITIES, INDEX = dat$EVTYPE, FUN = sum)
events <- dimnames(fatal)
freq.f <- matrix(fatal, ncol = 1)
# One row per event type: its name, its total, and its share of all
# fatalities as a percentage rounded to two decimals.
fatal <- data.frame(event = events[[1]], freq = freq.f)
fatal$perc <- round(fatal$freq / sum(fatal$freq) * 100, 2)
# Event type with the highest fatality total.
largest <- fatal$event[which.max(fatal$freq)]

The event with the largest number of fatalities is tornado.

The top 15 types of events account for almost 100% of the fatalities. The Pareto analysis chart shown below displays the top 15 events causing fatalities, in order of importance. The percentages have been computed based on the top 15 events, but they are almost identical to the actual proportions, since the top 15 events account for almost 100% of the fatalities.

# Keep the 15 event types with the most fatalities, largest first.
# head() simply returns fewer rows when fewer than 15 event types exist,
# whereas indexing with [1:15, ] would pad the result with all-NA rows.
t <- head(fatal[order(fatal$freq, decreasing = TRUE), ], 15)
# Named vector of totals: the event names label the entries of the chart.
t.f <- t$freq
names(t.f) <- as.vector(t$event)
pareto.chart(t.f,
             main = "Most Harmful Events with Respect to Fatalities")

plot of chunk graphFatalities

##                 
## Pareto chart analysis for t.f
##                  Frequency Cum.Freq. Percentage Cum.Percent.
##   tornado             5660      5660    38.4615        38.46
##   heat                3119      8779    21.1946        59.66
##   flooding            1545     10324    10.4988        70.15
##   wind                1385     11709     9.4115        79.57
##   lightning            810     12519     5.5042        85.07
##   current              573     13092     3.8937        88.96
##   winter weather       379     13471     2.5754        91.54
##   cold                 236     13707     1.6037        93.14
##   avalanche            225     13932     1.5289        94.67
##   storm                206     14138     1.3998        96.07
##   snow                 142     14280     0.9649        97.04
##   hurricane            133     14413     0.9038        97.94
##   rain                 110     14523     0.7475        98.69
##   high surf            103     14626     0.6999        99.39
##   fire                  90     14716     0.6116       100.00

2.1.2 Injuries

Likewise, injuries were classified according to the event to which they are related. Which event types caused the most injuries?

# Total number of injuries for each event type.
injury <- tapply(X = dat$INJURIES, INDEX = dat$EVTYPE, FUN = sum)
events <- dimnames(injury)
freq.i <- matrix(injury, ncol = 1)
# One row per event type: its name, its total, and its share of all
# injuries as a percentage rounded to two decimals.
injury <- data.frame(event = events[[1]], freq = freq.i)
injury$perc <- round(injury$freq / sum(injury$freq) * 100, 2)
# Event type with the highest injury total.
largest.injury <- injury$event[which.max(injury$freq)]

The analysis shows that the event type that caused the most injuries is tornado.

The Pareto analysis chart shown below displays the top 15 events causing injuries, in order of importance. Just as in the case of fatalities, the top 15 events account for about 100% of the injuries.

# Keep the 15 event types with the most injuries, largest first.
# head() simply returns fewer rows when fewer than 15 event types exist,
# whereas indexing with [1:15, ] would pad the result with all-NA rows.
t <- head(injury[order(injury$freq, decreasing = TRUE), ], 15)
# Named vector of totals: the event names label the entries of the chart.
t.i <- t$freq
names(t.i) <- as.vector(t$event)
pareto.chart(t.i,
             main = "Most Harmful Events with Respect to Injuries")

plot of chunk graphInjuries

FALSE                 
FALSE Pareto chart analysis for t.i
FALSE                  Frequency Cum.Freq. Percentage Cum.Percent.
FALSE   tornado            91388     91388    65.7387        65.74
FALSE   wind               11389    102777     8.1925        73.93
FALSE   heat                9174    111951     6.5992        80.53
FALSE   flooding            8674    120625     6.2395        86.77
FALSE   lightning           5160    125785     3.7118        90.48
FALSE   storm               2898    128683     2.0846        92.57
FALSE   winter weather      2773    131456     1.9947        94.56
FALSE   fire                1608    133064     1.1567        95.72
FALSE   hail                1369    134433     0.9848        96.70
FALSE   hurricane           1304    135737     0.9380        97.64
FALSE   snow                1120    136857     0.8057        98.45
FALSE   fog                 1052    137909     0.7567        99.20
FALSE   current              527    138436     0.3791        99.58
FALSE   rain                 301    138737     0.2165        99.80
FALSE   cold                 280    139017     0.2014       100.00

2.1.3 Joint analysis for event types causing fatalities and injuries

# Event types that appear in the fatalities top list, the injuries top
# list, or both.
both <- union(names(t.f), names(t.i))
# Restrict the records to those event types and add up the people affected
# (injured plus killed) in each record.
subset <- dat[dat$EVTYPE %in% both, ]
subset$total.i.f <- with(subset, INJURIES + FATALITIES)
# Combined total per event type, with its percentage share.
injury.fatal <- tapply(X = subset$total.i.f, INDEX = subset$EVTYPE, FUN = sum)
events <- dimnames(injury.fatal)
freq.i.f <- matrix(injury.fatal, ncol = 1)
injury.fatal <- data.frame(event = events[[1]], freq = freq.i.f)
injury.fatal$perc <- round(injury.fatal$freq / sum(injury.fatal$freq) * 100, 2)
# Event type with the highest combined total.
largest <- injury.fatal$event[which.max(injury.fatal$freq)]

The analysis shows that the event type that caused the most injuries and fatalities is tornado.

The Pareto analysis chart shown below displays the top 15 events causing both injuries and fatalities, in order of importance.

# Keep the (up to) 15 event types with the largest combined total of
# injuries and fatalities, largest first. head() returns fewer rows when
# fewer than 15 event types exist, whereas [1:15, ] would pad with NA rows.
t <- head(injury.fatal[order(injury.fatal$freq, decreasing = TRUE), ], 15)
# Named vector of totals: the event names label the entries of the chart.
t.i.f <- t$freq
names(t.i.f) <- as.vector(t$event)
pareto.chart(t.i.f,
             main = "Most Harmful Events with Respect to Fatalities and Injuries")
##                 
## Pareto chart analysis for t.i.f
##                  Frequency Cum.Freq. Percentage Cum.Percent.
##   tornado            97048     97048    63.2235        63.22
##   wind               12774    109822     8.3218        71.55
##   heat               12293    122115     8.0085        79.55
##   flooding           10219    132334     6.6573        86.21
##   lightning           5970    138304     3.8893        90.10
##   winter weather      3152    141456     2.0534        92.15
##   storm               3104    144560     2.0221        94.18
##   fire                1698    146258     1.1062        95.28
##   hurricane           1437    147695     0.9362        96.22
##   hail                1384    149079     0.9016        97.12
##   snow                1262    150341     0.8221        97.94
##   fog                 1132    151473     0.7375        98.68
##   current             1100    152573     0.7166        99.40
##   cold                 516    153089     0.3362        99.73
##   rain                 411    153500     0.2678       100.00

2.2 Which types of events have the greatest economic consequences?

Economic costs comprise the monetary cost of property damage and the monetary cost of crop damage. This analysis combines both costs, resulting in the total cost.

# Damage amounts are stored in two fields: one gives the magnitude
# (PROPDMG / CROPDMG) and the other a letter code for the order of magnitude
# (PROPDMGEXP / CROPDMGEXP). Convert both damage columns to plain dollars.

# Scale `amount` by its exponent code: K = thousands, M = millions,
# B = billions, matched case-insensitively so both damage fields are treated
# alike. Any other code (blank, missing, digits, symbols) leaves the amount
# unchanged.
# NOTE(review): codes other than K/M/B are not interpreted here -- confirm
# against the data documentation whether any of them should scale the amount.
to.dollars <- function(amount, exponent) {
  multiplier <- c(K = 1e3, M = 1e6, B = 1e9)[toupper(as.character(exponent))]
  multiplier[is.na(multiplier)] <- 1
  amount * unname(multiplier)
}

# The columns are overwritten in place, so run this conversion only once per
# load of the data; running it again would scale the amounts a second time.
dat$PROPDMG <- to.dollars(dat$PROPDMG, dat$PROPDMGEXP)
dat$CROPDMG <- to.dollars(dat$CROPDMG, dat$CROPDMGEXP)

# Total cost per record, then per event type with its percentage share.
dat$totaldmg <- dat$PROPDMG + dat$CROPDMG
expenses <- tapply(dat$totaldmg, dat$EVTYPE, sum)
events <- dimnames(expenses)
freq.e <- matrix(expenses)
expenses <- data.frame(event = events[[1]], freq = freq.e)
expenses$perc <- round(expenses$freq / sum(expenses$freq) * 100, 2)
# Event type with the highest total cost.
largest.expenses <- expenses[which.max(expenses$freq), ]$event

The analysis shows that the event type that caused the most economic cost and damage is flooding.

The Pareto analysis chart shown below displays the top 15 events causing the greatest monetary cost, in order of importance.

# Keep the 15 most expensive event types, largest first.
# head() simply returns fewer rows when fewer than 15 event types exist,
# whereas indexing with [1:15, ] would pad the result with all-NA rows.
t <- head(expenses[order(expenses$freq, decreasing = TRUE), ], 15)
# Named vector of totals: the event names label the entries of the chart.
t.e <- t$freq
names(t.e) <- as.vector(t$event)
pareto.chart(t.e,
             main = "Most Economically Expensive Weather Events",
             ylab = "Dollars")

plot of chunk graphExpense

##                 
## Pareto chart analysis for t.e
##                  Frequency Cum.Freq. Percentage Cum.Percent.
##   flooding       1.796e+11 1.796e+11    37.8709        37.87
##   hurricane      9.010e+10 2.697e+11    18.9983        56.87
##   storm          6.650e+10 3.362e+11    14.0216        70.89
##   tornado        5.900e+10 3.952e+11    12.4407        83.33
##   hail           1.878e+10 4.140e+11     3.9597        87.29
##   wind           1.822e+10 4.322e+11     3.8412        91.13
##   drought        1.502e+10 4.472e+11     3.1668        94.30
##   fire           8.900e+09 4.561e+11     1.8766        96.18
##   winter weather 7.595e+09 4.637e+11     1.6015        97.78
##   rain           4.052e+09 4.678e+11     0.8545        98.63
##   freeze         2.019e+09 4.698e+11     0.4256        99.06
##   cold           1.469e+09 4.713e+11     0.3097        99.37
##   snow           1.141e+09 4.724e+11     0.2406        99.61
##   lightning      9.362e+08 4.733e+11     0.1974        99.81
##   heat           9.248e+08 4.743e+11     0.1950       100.00

3 Results

The analysis shows that about 80% of human loss, be it fatalities or injuries, can be attributed to four main events: tornadoes, strong winds, heat, and flooding.

The analysis of economic loss includes monetary losses in property as well as crops. It shows that about 80% of the losses can be attributed to four main events: flooding, hurricanes, storms, and tornadoes.

In general, most losses are related to flooding, strong winds, tornadoes, hurricanes, and extreme heat.