library(stringr)
library(qcc)
This project explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States. The database contains estimates of any fatalities, injuries, and property damage for the period starting in the year 1950 and ending in November 2011. This project will emphasize human loss, in the form of fatalities or injuries, and monetary losses due to weather events.
# Load the raw NOAA storm data.
# read.csv() is used rather than read.table(sep = ",") because its defaults
# (quote = "\"", comment.char = "", fill = TRUE) are the CSV-appropriate ones:
# apostrophes and '#' characters inside text fields are read as data instead
# of being treated as quote or comment markers.
dat <- read.csv(file = "StormData.csv", header = TRUE)
# Dimensions (rows, columns) of the raw data, quoted in the report text.
dat.size <- dim(dat)
The data file contains 902297 records and 37 variables. Records from regions other than the 50 states, the Virgin Islands, and Puerto Rico were removed from the data set.
# Number of distinct STATE codes in the data: the frequency table of STATE is
# one-dimensional, so its row count is the number of codes.
states.number <- nrow(table(dat$STATE))
The original data lists 72 states. Some of the abbreviations for states are not recognized and were removed from the data set.
# Keep only the 50 states, the Virgin Islands and Puerto Rico by dropping the
# records whose STATE code is in the exclusion list below.
# NOTE: "ME" (the postal abbreviation for Maine) was previously part of this
# list, which removed one of the 50 states from the analysis; it is no longer
# excluded. Record counts and totals quoted in the report were produced with
# Maine missing and must be regenerated.
excluded.states <- c("AM", "AN", "AS", "GM", "LC", "LE", "LH", "LM", "LO",
                     "LS", "MH", "PH", "PK", "PM", "PZ", "SL", "ST", "XX")
# Rows with a missing STATE are dropped as well, matching what subset() did
# with the original chain of `!=` comparisons.
dat <- dat[!is.na(dat$STATE) & !(dat$STATE %in% excluded.states), ]
rm(excluded.states)
# Dimensions (rows, columns) after the filter.
dat.size <- dim(dat)
The resulting set contains data for the 50 USA states, Virgin Islands and Puerto Rico. Other regions in the dataset have been excluded from this analysis.
Some obvious data entry errors were noticed in the event types (EVTYPE). After closer examination of this variable it was noticed that some categories were redundant. Categories believed to be redundant were consolidated.
# Data cleansing
# Consolidates redundant event types and fixes data-entry errors in EVTYPE.
# The rules are applied strictly in the order written: later rules see the
# labels produced by earlier ones, so do not reorder them.
dat$EVTYPE <- tolower(dat$EVTYPE)

# Eliminate "summary" records. A logical mask is used because
# dat[-grep(...), ] would drop every row if nothing matched.
dat <- dat[!grepl("summary", dat$EVTYPE), ]

# Work on a plain character vector. Assigning into evtype[mask] is a no-op
# when a rule matches nothing, whereas dat[mask, ]$EVTYPE <- value stops with
# an error for an empty selection (and copies the selected rows every time).
evtype <- dat$EVTYPE

# --- exact-match consolidations ---------------------------------------------
evtype[evtype %in% c("winter storm high winds", "winter storm/high wind",
                     "winter storm/high winds")] <- "winter storm/high winds"
evtype[evtype %in% c("winter storm", "winter storms")] <- "winter storms"
evtype[evtype %in% c("winter mix", "winter weather mix", "wintery mix",
                     "wintry mix")] <- "winter mix"
evtype[evtype %in% c("wnd", "whirlwind")] <- "wind"
evtype[evtype %in% c("wild/forest fire", "wild/forest fires", "wildfire",
                     "wild fires", "wildfires")] <- "wildfire"
evtype[evtype %in% c("waterspouts", "wayterspout", "waterspout-",
                     "water spout", "waterspout funnel cloud",
                     "waterspout/")] <- "waterspout"

evtype[grepl("tornado", evtype)] <- "tornado"
evtype[evtype %in% "torndao"] <- "tornado"

# Must run before the "wind" pattern below so that "wind chill" and
# "winter storm/high winds" end up as winter weather rather than wind.
evtype[evtype %in% c("winter weather", "winter weather/mix",
                     "winter storm/high winds", "winter mix", "wind chill",
                     "wet snow", "blizzard", "blizzard weather",
                     "ground blizzard", "winter storms")] <- "winter weather"

evtype[grepl("wind", evtype)] <- "wind"
evtype[grepl("wet", evtype)] <- "wet weather"
evtype[grepl("slide", evtype)] <- "mud/rock/land slide"
evtype[grepl("wall cloud", evtype)] <- "wall cloud"

evtype[evtype %in% c("volcanic ash plume", "volcanic ashfall", "volcanic ash",
                     "vog", "volcanic eruption")] <- "volcanic activity"

evtype[evtype %in% c("urban/small stream", "urban/small stream flood",
                     "urban/small stream flooding", "urban/small strm fldg",
                     "urban/sml stream fldg", "urban/street flooding",
                     "urban and small stream flood",
                     "urban and small stream floodin", "urban flood",
                     "urban flood landslide", "urban and small",
                     "urban and small stream", "urban flooding",
                     "urban floods", "urban small",
                     "urban small stream flood", "urban/small",
                     "urban/small flooding",
                     "urban/sml stream fld")] <- "urban/small stream flooding"

# --- ordered substring rules: pattern = replacement label -------------------
# NOTE(review): "storm" runs after "thunder"/"tstm", so every "thunderstorm"
# label is absorbed into "storm". This is the existing behaviour and the
# reported tables rely on it; confirm it is intended.
# NOTE(review): "hyperthermia" (overheating) is mapped to "hypothermia";
# kept as is, but it may belong under "heat".
# "gust" and "freez" replace the former "gust*" / "freez*", where `*` was a
# regex quantifier on the last letter (so they really matched "gus"/"free").
pattern.rules <- c(
  "flood"             = "flooding",
  "stream"            = "flooding",
  "warm"              = "warm weather",
  "rain"              = "rain",
  "cold"              = "cold",
  "low temp"          = "cold",
  "cool"              = "cold",
  "thunder"           = "thunderstorm",
  "tstm"              = "thunderstorm",
  "storm"             = "storm",
  "snow"              = "snow",
  "dry"               = "dry weather",
  "hail"              = "hail",
  "current"           = "current",
  "record"            = "record temperature",
  "red flag"          = "red flag criteria",
  "lightning"         = "lightning",
  "lighting"          = "lightning",
  "ligntning"         = "lightning",
  "ice"               = "ice",
  "marine"            = "marine accident",
  "hurricane"         = "hurricane",
  "hypothermia"       = "hypothermia",
  "hyperthermia"      = "hypothermia",
  "hot"               = "heat",
  "heat"              = "heat",
  "gust"              = "gusts",
  "funnel"            = "funnel cloud",
  "freez"             = "freeze",
  "frost"             = "freeze",
  "fire"              = "fire",
  "smoke"             = "smoke",
  "dust dev"          = "dust devil",
  "remnants of floyd" = "hurricane",
  "precip"            = "precipitation",
  "avalance"          = "avalanche",
  "fog"               = "fog"
)
for (pattern in names(pattern.rules)) {
  evtype[grepl(pattern, evtype)] <- pattern.rules[[pattern]]
}

dat$EVTYPE <- evtype
rm(evtype, pattern.rules, pattern)

# Number of distinct event types left after consolidation.
num.events <- length(table(dat$EVTYPE))
The resulting data file still has 37 variables but only contains 882758 records. After consolidating events and correcting for relevant data entry errors, the number of events is now 92.
In this analysis, “harmful events” is defined as those events which result in loss of life (fatalities) or injuries.
Fatalities were classified according to the type of event. The total number of fatalities per event was calculated and the percentage of the total was reported for each event.
# Total fatalities per event type, plus each type's share (in percent,
# rounded to two decimals) of all fatalities.
fatal <- with(dat, tapply(FATALITIES, EVTYPE, sum))
events <- dimnames(fatal)
freq.f <- matrix(fatal)
fatal <- data.frame(event = events[[1]], freq = freq.f)
fatal <- transform(fatal, perc = round(freq / sum(freq) * 100, 2))
# Event type with the highest fatality count.
largest <- fatal$event[which.max(fatal$freq)]
The event with the largest number of fatalities is tornado.
The top 15 types of events account for almost 100% of the fatalities. The Pareto analysis chart shown below displays the top 15 events causing fatalities, in order of importance. The percentages have been computed based on the top 15 events, but they are almost identical to the actual proportions, since the top 15 events account for almost 100% of the fatalities.
# Pareto chart of the event types with the most fatalities.
# head() keeps at most 15 rows; unlike [1:15, ] it does not pad the result
# with NA rows when fewer than 15 event types are available.
t <- head(fatal[order(fatal$freq, decreasing = TRUE), ], 15)
# Named numeric vector (event type -> fatalities) as expected by pareto.chart.
t.f <- setNames(t$freq, as.vector(t$event))
pareto.chart(t.f,
             main = "Most Harmful Events with Respect to Fatalities")
##
## Pareto chart analysis for t.f
## Frequency Cum.Freq. Percentage Cum.Percent.
## tornado 5660 5660 38.4615 38.46
## heat 3119 8779 21.1946 59.66
## flooding 1545 10324 10.4988 70.15
## wind 1385 11709 9.4115 79.57
## lightning 810 12519 5.5042 85.07
## current 573 13092 3.8937 88.96
## winter weather 379 13471 2.5754 91.54
## cold 236 13707 1.6037 93.14
## avalanche 225 13932 1.5289 94.67
## storm 206 14138 1.3998 96.07
## snow 142 14280 0.9649 97.04
## hurricane 133 14413 0.9038 97.94
## rain 110 14523 0.7475 98.69
## high surf 103 14626 0.6999 99.39
## fire 90 14716 0.6116 100.00
Likewise, injuries were classified according to the event to which they are related. Which event types caused the most injuries?
# Total injuries per event type, plus each type's share (in percent, rounded
# to two decimals) of all injuries.
injury <- with(dat, tapply(INJURIES, EVTYPE, sum))
events <- dimnames(injury)
freq.i <- matrix(injury)
injury <- data.frame(event = events[[1]], freq = freq.i)
injury <- transform(injury, perc = round(freq / sum(freq) * 100, 2))
# Event type with the highest injury count.
largest.injury <- injury$event[which.max(injury$freq)]
The analysis shows that the event type that caused the most injuries is tornado.
The Pareto analysis chart shown below displays the top 15 events causing injuries, in order of importance. Just as in the case of fatalities, the top 15 events account for about 100% of the injuries.
# Pareto chart of the event types with the most injuries.
# head() keeps at most 15 rows; unlike [1:15, ] it does not pad the result
# with NA rows when fewer than 15 event types are available.
t <- head(injury[order(injury$freq, decreasing = TRUE), ], 15)
# Named numeric vector (event type -> injuries) as expected by pareto.chart.
t.i <- setNames(t$freq, as.vector(t$event))
pareto.chart(t.i,
             main = "Most Harmful Events with Respect to Injuries")
FALSE
FALSE Pareto chart analysis for t.i
FALSE Frequency Cum.Freq. Percentage Cum.Percent.
FALSE tornado 91388 91388 65.7387 65.74
FALSE wind 11389 102777 8.1925 73.93
FALSE heat 9174 111951 6.5992 80.53
FALSE flooding 8674 120625 6.2395 86.77
FALSE lightning 5160 125785 3.7118 90.48
FALSE storm 2898 128683 2.0846 92.57
FALSE winter weather 2773 131456 1.9947 94.56
FALSE fire 1608 133064 1.1567 95.72
FALSE hail 1369 134433 0.9848 96.70
FALSE hurricane 1304 135737 0.9380 97.64
FALSE snow 1120 136857 0.8057 98.45
FALSE fog 1052 137909 0.7567 99.20
FALSE current 527 138436 0.3791 99.58
FALSE rain 301 138737 0.2165 99.80
FALSE cold 280 139017 0.2014 100.00
# Combined human harm (injuries + fatalities) per event type, restricted to
# the event types that appear in either of the two top-15 lists.
both <- union(names(t.f), names(t.i))
# The filtered rows get their own name; storing them in a variable called
# `subset` masked base::subset().
harm.rows <- dat[dat$EVTYPE %in% both, ]
harm.rows$total.i.f <- harm.rows$INJURIES + harm.rows$FATALITIES
injury.fatal <- tapply(harm.rows$total.i.f, harm.rows$EVTYPE, sum)
events <- dimnames(injury.fatal)
freq.i.f <- matrix(injury.fatal)
injury.fatal <- data.frame(event = events[[1]], freq = freq.i.f)
# Share of the combined total, in percent, rounded to two decimals.
injury.fatal$perc <- round(injury.fatal$freq / sum(injury.fatal$freq) * 100, 2)
# Event type with the highest combined count.
largest <- injury.fatal[which.max(injury.fatal$freq), ]$event
The analysis shows that the event type that caused the most injuries and fatalities is tornado.
The Pareto analysis chart shown below displays the top 15 events causing both injuries and fatalities, in order of importance.
# Pareto chart of the event types with the most injuries and fatalities
# combined.
# head() keeps at most 15 rows; unlike [1:15, ] it does not pad the result
# with NA rows when fewer than 15 event types are available.
t <- head(injury.fatal[order(injury.fatal$freq, decreasing = TRUE), ], 15)
# Named numeric vector (event type -> combined count) for pareto.chart.
t.i.f <- setNames(t$freq, as.vector(t$event))
pareto.chart(t.i.f,
             main = "Most Harmful Events with Respect to Fatalities and Injuries")
##
## Pareto chart analysis for t.i.f
## Frequency Cum.Freq. Percentage Cum.Percent.
## tornado 97048 97048 63.2235 63.22
## wind 12774 109822 8.3218 71.55
## heat 12293 122115 8.0085 79.55
## flooding 10219 132334 6.6573 86.21
## lightning 5970 138304 3.8893 90.10
## winter weather 3152 141456 2.0534 92.15
## storm 3104 144560 2.0221 94.18
## fire 1698 146258 1.1062 95.28
## hurricane 1437 147695 0.9362 96.22
## hail 1384 149079 0.9016 97.12
## snow 1262 150341 0.8221 97.94
## fog 1132 151473 0.7375 98.68
## current 1100 152573 0.7166 99.40
## cold 516 153089 0.3362 99.73
## rain 411 153500 0.2678 100.00
Economic costs comprise the monetary cost of property damage and the monetary cost of crop damage. This analysis combines both costs, resulting in the total cost.
# Convert the damage figures to dollar amounts. Each amount (PROPDMG, CROPDMG)
# comes with a magnitude code (PROPDMGEXP, CROPDMGEXP): K = thousands,
# M = millions, B = billions.
# Codes are matched case-insensitively for both fields; previously "k" was
# honoured for crops but ignored for property. Any other code, a blank or an
# NA leaves the amount unscaled, as before.
# A multiplier lookup is used instead of dat[mask, ]$X <- ..., which stops
# with an error whenever a code does not occur in the data.
# NOTE: PROPDMG and CROPDMG are rescaled in place, so run this only once per
# load of the data.
magnitude <- c(K = 1e3, M = 1e6, B = 1e9)
prop.scale <- unname(magnitude[toupper(as.character(dat$PROPDMGEXP))])
crop.scale <- unname(magnitude[toupper(as.character(dat$CROPDMGEXP))])
prop.scale[is.na(prop.scale)] <- 1
crop.scale[is.na(crop.scale)] <- 1
dat$PROPDMG <- dat$PROPDMG * prop.scale
dat$CROPDMG <- dat$CROPDMG * crop.scale
rm(magnitude, prop.scale, crop.scale)

# Total economic cost per record, then per event type.
dat$totaldmg <- dat$PROPDMG + dat$CROPDMG
expenses <- tapply(dat$totaldmg, dat$EVTYPE, sum)
events <- dimnames(expenses)
freq.e <- matrix(expenses)
expenses <- data.frame(event = events[[1]], freq = freq.e)
# Share of the overall cost, in percent, rounded to two decimals.
expenses$perc <- round(expenses$freq / sum(expenses$freq) * 100, 2)
# Event type with the highest total cost.
largest.expenses <- expenses[which.max(expenses$freq), ]$event
The analysis shows that the event type that caused the most economic cost and damage is flooding.
The Pareto analysis chart shown below displays the top 15 events causing most monetary cost, in order of importance.
# Pareto chart of the event types with the highest total economic cost.
# head() keeps at most 15 rows; unlike [1:15, ] it does not pad the result
# with NA rows when fewer than 15 event types are available.
t <- head(expenses[order(expenses$freq, decreasing = TRUE), ], 15)
# Named numeric vector (event type -> dollars) as expected by pareto.chart.
t.e <- setNames(t$freq, as.vector(t$event))
pareto.chart(t.e,
             main = "Most Economically Expensive Weather Events",
             ylab = "Dollars")
##
## Pareto chart analysis for t.e
## Frequency Cum.Freq. Percentage Cum.Percent.
## flooding 1.796e+11 1.796e+11 37.8709 37.87
## hurricane 9.010e+10 2.697e+11 18.9983 56.87
## storm 6.650e+10 3.362e+11 14.0216 70.89
## tornado 5.900e+10 3.952e+11 12.4407 83.33
## hail 1.878e+10 4.140e+11 3.9597 87.29
## wind 1.822e+10 4.322e+11 3.8412 91.13
## drought 1.502e+10 4.472e+11 3.1668 94.30
## fire 8.900e+09 4.561e+11 1.8766 96.18
## winter weather 7.595e+09 4.637e+11 1.6015 97.78
## rain 4.052e+09 4.678e+11 0.8545 98.63
## freeze 2.019e+09 4.698e+11 0.4256 99.06
## cold 1.469e+09 4.713e+11 0.3097 99.37
## snow 1.141e+09 4.724e+11 0.2406 99.61
## lightning 9.362e+08 4.733e+11 0.1974 99.81
## heat 9.248e+08 4.743e+11 0.1950 100.00
The analysis reflects that about 80% of human loss, be it fatalities or injuries, can be attributed to four main events: tornadoes, strong winds, heat, and flooding.
The analysis of economic loss includes monetary losses in property as well as crops. This analysis of economic loss reflects that about 80% of the losses can be attributed to four main events: flooding, hurricane, storms, and tornadoes.
In general, most losses are related to flooding, strong winds, tornadoes, hurricanes, and extreme heat.