Synopsis

The NOAA storms data contains information on over 900000 weather events in the US. By collapsing the 985 event types down to 11, I have simplified the data and made it easier to analyse. Storms are the most common event type in the database, followed by cold/ice events. Wind caused the most injuries/fatalities in absolute terms, but high temperature had a higher per-event incidence. Wind also had the largest influence on property in both absolute and relative terms. Other sources of damage caused the most crop damage in relative terms, but cold/ice/snow caused the most crop damage in absolute terms.

Data processing

# Load the raw storm events CSV from the working directory. Strings are kept
# as character vectors rather than being converted to factors.
# `header` is spelled out in full instead of relying on partial argument
# matching (`head`).
#setwd("DataScienceCoursera/RepData_Assessment2")
dd <- read.csv("repdata-data-StormData.csv", header = TRUE, sep = ",",
               stringsAsFactors = FALSE)

names(dd)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Only certain variables are really needed - EVTYPE, MAG, FATALITIES, INJURIES, PROPDMG and CROPDMG.

# Names of the columns retained for the analysis; every other column is
# dropped from `dd`.
# NOTE(review): PROPDMGEXP and CROPDMGEXP are discarded here. If those columns
# hold magnitude multipliers for PROPDMG / CROPDMG, the damage sums and means
# computed later mix different units -- confirm before relying on them.
v <- c(
  "EVTYPE",
  "MAG",
  "FATALITIES",
  "INJURIES",
  "PROPDMG",
  "CROPDMG"
)
dd <- dd[v]

Of the 985 unique types of event, there is a large amount of overlap and many spelling mistakes. This is a typical problem of unstructured data collection instruments. We can reduce this large number somewhat.

# Collapse the free-text EVTYPE values into a small set of broad categories,
# stored in the new column `dd$event`.
#
# Each rule maps a category label to a case-insensitive regular expression.
# Rules are applied in the order listed and a later rule overwrites an earlier
# one, so when an EVTYPE matches several rules the LAST matching rule decides
# its category (for example, anything matching "dry" ends up as "dryness",
# never "high temp"). Rows matched by no rule are labelled "other".
#
# NOTE(review): several patterns are short substrings ("sea", "dam", "ice",
# "wet") and therefore also hit longer words containing them -- e.g. "sea"
# matches "UNSEASONABLY". Confirm this is intended before changing the order
# or the patterns, since the reported counts depend on both.
dd$event <- local({
  rules <- c(
    "wind" = "wind|hurricane|torn|wnd|spout|typhoon|funnel",
    "cold/snow/ice" = "cold|hypotherm|ice|icy|sleet|snow|low t|freez|frost|bliz|winter|wintry|hail|cool",
    "high temp" = "heat|hypertherm|high t|hot|warm|dry|record temp|record high",
    "storm" = "storm|precip|wet|rain|lightning|tstm|shower|lighting|lignt",
    "floods" = "flood|floood|fld|high water|fldg|drown|dam|stream",
    "shore/marine" = "surf|swell|tide|sea|current|coast|beach|marine|red flag|wave|tsun|rising water",
    "landslide" = "slump|slide|aval",
    "fires" = "fire|smoke|volcan",
    "dryness" = "dry|dries",
    "visibility" = "dust|vog|fog"
  )

  labels <- rep(NA_character_, nrow(dd))
  for (label in names(rules)) {
    hits <- grepl(rules[[label]], dd$EVTYPE, ignore.case = TRUE)
    labels[hits] <- label
  }

  # Anything still unlabelled did not match a single rule.
  labels[is.na(labels)] <- "other"
  labels
})

There are now 11 event types.

table(dd$event, useNA = "ifany")
## 
## cold/snow/ice       dryness         fires        floods     high temp 
##        321586           308          4290         85344          2791 
##     landslide         other  shore/marine         storm    visibility 
##          1042          2845         16101        367367          2473 
##          wind 
##         98150

Results

Create some summaries…

library(dplyr); library(ggplot2)
## Warning: package 'dplyr' was built under R version 3.1.2
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Warning: package 'ggplot2' was built under R version 3.1.2
# One row per event category: totals and per-event means for fatalities,
# injuries, property damage and crop damage, plus the number of events.
dd1 <- summarise(
  group_by(dd, event),
  fatal = sum(FATALITIES),
  mean_fatal = mean(FATALITIES),
  injury = sum(INJURIES),
  mean_injury = mean(INJURIES),
  prop = sum(PROPDMG),
  mean_prop = mean(PROPDMG),
  crop = sum(CROPDMG),
  mean_crop = mean(CROPDMG),
  n_events = n()
)

Frequency of events:

# Event counts per category, most frequent first, rendered as a markdown table.
# kable() is exported by knitr, so the public `::` operator is used rather
# than `:::`, which is meant for accessing unexported internals.
knitr::kable(
  dd1[order(dd1$n_events, decreasing = TRUE), c("event", "n_events")],
  format = "markdown"
)
event n_events
storm 367367
cold/snow/ice 321586
wind 98150
floods 85344
shore/marine 16101
fires 4290
other 2845
high temp 2791
visibility 2473
landslide 1042
dryness 308

Injuries or fatalities

# Fatalities and injuries per category (totals and per-event means) as a
# markdown table. kable() is exported by knitr, so `::` is used instead of
# the internal-access operator `:::`.
knitr::kable(
  dd1[, c("event", "fatal", "mean_fatal", "injury", "mean_injury", "n_events")],
  format = "markdown",
  digits = 2
)
event fatal mean_fatal injury mean_injury n_events
cold/snow/ice 807 0.002509 4482 0.01394 321586
dryness 32 0.103896 29 0.09416 308
fires 90 0.020979 1608 0.37483 4290
floods 1551 0.018174 8683 0.10174 85344
high temp 2958 1.059835 8832 3.16446 2791
landslide 269 0.258157 226 0.21689 1042
other 7 0.002460 225 0.07909 2845
shore/marine 1057 0.065648 1430 0.08881 16101
storm 2053 0.005588 18838 0.05128 367367
visibility 105 0.042459 1560 0.63081 2473
wind 6216 0.063332 94615 0.96398 98150

In absolute terms, wind causes the most fatalities and injuries, but high temperature has the highest incidence of injuries and fatalities on a per-event basis.

# Bar chart of mean fatalities per event for each category; x-axis labels are
# rotated 90 degrees.
ggplot(data = dd1, mapping = aes(x = event, y = mean_fatal)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

plot of chunk unnamed-chunk-8

Property and crop damage

# Property and crop damage per category (totals and per-event means).
# kable() is exported by knitr, so `::` is used instead of the
# internal-access operator `:::`.
knitr::kable(
  dd1[, c("event", "prop", "mean_prop", "crop", "mean_crop", "n_events")]
)
event prop mean_prop crop mean_crop n_events
cold/snow/ice 906192 2.8179 601843 1.87148 321586
dryness 1738 5.6416 15 0.04870 308
fires 125823 29.3294 9566 2.22978 4290
floods 2444803 28.6464 367726 4.30875 85344
high temp 1764 0.6319 1168 0.41845 2791
landslide 22448 21.5431 37 0.03551 1042
other 7228 2.5405 34935 12.27929 2845
shore/marine 41171 2.5570 1375 0.08542 16101
storm 3608582 9.8228 226200 0.61573 367367
visibility 23098 9.3400 2102 0.84978 2473
wind 3701654 37.7143 132862 1.35366 98150

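In contrast to injuries and fatalities, where high temperature led on a per-event basis, wind causes the most property damage both in total and per event; on a per-event basis it is followed by fires and floods.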

# Bar chart of mean property damage per event for each category; x-axis
# labels are rotated 90 degrees.
ggplot(data = dd1, mapping = aes(x = event, y = mean_prop)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

plot of chunk unnamed-chunk-10

On a per-event basis, crop damage was highest for the “other” category, but in absolute terms most crop damage was caused by cold/snow/ice events.

# Bar chart of mean crop damage per event for each category; x-axis labels
# are rotated 90 degrees.
ggplot(data = dd1, mapping = aes(x = event, y = mean_crop)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

plot of chunk unnamed-chunk-11