The NOAA storms data contains information on over 900,000 weather events in the US. By collapsing the 985 event types down to 11, I have simplified the data and made it easier to analyse. Storms are the most common event type in the database, followed by cold/ice events. Wind caused the most injuries/fatalities in absolute terms, but high temperature had a higher per-event incidence. Wind also had the largest influence on property in both absolute and relative terms. Other sources of damage caused the most crop damage in relative terms, but cold/ice/snow caused the most crop damage in absolute terms.
#setwd("DataScienceCoursera/RepData_Assessment2")
# Read the storm events CSV from the current working directory. Text columns
# are kept as character vectors (not factors) so they can be pattern-matched.
# `header` is spelled out in full instead of relying on partial argument
# matching of `head`.
dd <- read.csv("repdata-data-StormData.csv", sep = ",",
               stringsAsFactors = FALSE, header = TRUE)
# List the column names to see which variables are available.
names(dd)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Only certain variables are really needed - EVTYPE, MAG, FATALITIES, INJURIES, PROPDMG and CROPDMG.
# Names of the columns retained for the rest of the analysis.
# NOTE(review): PROPDMGEXP and CROPDMGEXP are not retained; if those columns
# hold magnitude codes for PROPDMG/CROPDMG, the damage figures computed later
# are unscaled -- TODO confirm.
v <- c(
  "EVTYPE", "MAG", "FATALITIES",
  "INJURIES", "PROPDMG", "CROPDMG"
)
# Keep only those columns, in the order listed.
dd <- dd[v]
The 985 unique event types overlap heavily and contain many spelling mistakes. This is a typical problem with unstructured data collection instruments. We can reduce this large number considerably by grouping related event types.
# Collapse the free-text EVTYPE values into a small set of categories.
# Each entry maps a category label (the name) to a case-insensitive regular
# expression. Rules are applied in the order listed and each one overwrites
# earlier assignments, so when an EVTYPE matches several rules the LAST
# matching rule decides its category (e.g. anything containing "dry" ends up
# as "dryness" unless the later "visibility" rule also matches).
event_patterns <- c(
  "wind"          = "wind|hurricane|torn|wnd|spout|typhoon|funnel",
  "cold/snow/ice" = "cold|hypotherm|ice|icy|sleet|snow|low t|freez|frost|bliz|winter|wintry|hail|cool",
  "high temp"     = "heat|hypertherm|high t|hot|warm|dry|record temp|record high",
  "storm"         = "storm|precip|wet|rain|lightning|tstm|shower|lighting|lignt",
  "floods"        = "flood|floood|fld|high water|fldg|drown|dam|stream",
  "shore/marine"  = "surf|swell|tide|sea|current|coast|beach|marine|red flag|wave|tsun|rising water",
  "landslide"     = "slump|slide|aval",
  "fires"         = "fire|smoke|volcan",
  "dryness"       = "dry|dries",
  "visibility"    = "dust|vog|fog"
)

# Start with every row unclassified, then apply the rules in sequence.
dd$event <- NA_character_
for (label in names(event_patterns)) {
  hits <- grepl(event_patterns[[label]], dd$EVTYPE, ignore.case = TRUE)
  dd$event[hits] <- label
}

# Rows that no rule matched fall into the catch-all category.
dd$event[is.na(dd$event)] <- "other"
There are now 11 event types.
# Count the rows in each collapsed category; NA would be listed if any
# row were still unclassified.
table(dd[["event"]], useNA = "ifany")
##
## cold/snow/ice dryness fires floods high temp
## 321586 308 4290 85344 2791
## landslide other shore/marine storm visibility
## 1042 2845 16101 367367 2473
## wind
## 98150
Create some summaries…
library(dplyr); library(ggplot2)
## Warning: package 'dplyr' was built under R version 3.1.2
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'ggplot2' was built under R version 3.1.2
# One row per event category: totals and per-event means of fatalities,
# injuries, property damage and crop damage (PROPDMG/CROPDMG are used as
# stored), plus the number of events in the category.
dd1 <- summarise(
  group_by(dd, event),
  fatal = sum(FATALITIES),
  mean_fatal = mean(FATALITIES),
  injury = sum(INJURIES),
  mean_injury = mean(INJURIES),
  prop = sum(PROPDMG),
  mean_prop = mean(PROPDMG),
  crop = sum(CROPDMG),
  mean_crop = mean(CROPDMG),
  n_events = n()
)
Frequency of events:
# Event categories ranked from most to least frequent, rendered as a
# markdown table. kable() is exported from knitr, so it is called with `::`
# rather than the internal-namespace operator `:::`.
knitr::kable(
  dd1[order(dd1$n_events, decreasing = TRUE), c("event", "n_events")],
  format = "markdown"
)
| event | n_events |
|---|---|
| storm | 367367 |
| cold/snow/ice | 321586 |
| wind | 98150 |
| floods | 85344 |
| shore/marine | 16101 |
| fires | 4290 |
| other | 2845 |
| high temp | 2791 |
| visibility | 2473 |
| landslide | 1042 |
| dryness | 308 |
# Fatality and injury totals and per-event means for each category, rendered
# as a markdown table. kable() is exported from knitr, so it is called with
# `::` rather than the internal-namespace operator `:::`.
knitr::kable(
  dd1[, c("event", "fatal", "mean_fatal", "injury", "mean_injury", "n_events")],
  format = "markdown",
  digits = 2
)
| event | fatal | mean_fatal | injury | mean_injury | n_events |
|---|---|---|---|---|---|
| cold/snow/ice | 807 | 0.002509 | 4482 | 0.01394 | 321586 |
| dryness | 32 | 0.103896 | 29 | 0.09416 | 308 |
| fires | 90 | 0.020979 | 1608 | 0.37483 | 4290 |
| floods | 1551 | 0.018174 | 8683 | 0.10174 | 85344 |
| high temp | 2958 | 1.059835 | 8832 | 3.16446 | 2791 |
| landslide | 269 | 0.258157 | 226 | 0.21689 | 1042 |
| other | 7 | 0.002460 | 225 | 0.07909 | 2845 |
| shore/marine | 1057 | 0.065648 | 1430 | 0.08881 | 16101 |
| storm | 2053 | 0.005588 | 18838 | 0.05128 | 367367 |
| visibility | 105 | 0.042459 | 1560 | 0.63081 | 2473 |
| wind | 6216 | 0.063332 | 94615 | 0.96398 | 98150 |
The most fatalities and injuries are caused by wind in absolute terms, but high temperature has the highest incidence of injuries and fatalities on a per-event basis.
# Bar chart of mean fatalities per event for each category; the x-axis
# labels are rotated 90 degrees.
ggplot(dd1, aes(x = event, y = mean_fatal)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
# Property and crop damage totals and per-event means for each category.
# kable() is exported from knitr, so it is called with `::` rather than the
# internal-namespace operator `:::`.
knitr::kable(
  dd1[, c("event", "prop", "mean_prop", "crop", "mean_crop", "n_events")]
)
| event | prop | mean_prop | crop | mean_crop | n_events |
|---|---|---|---|---|---|
| cold/snow/ice | 906192 | 2.8179 | 601843 | 1.87148 | 321586 |
| dryness | 1738 | 5.6416 | 15 | 0.04870 | 308 |
| fires | 125823 | 29.3294 | 9566 | 2.22978 | 4290 |
| floods | 2444803 | 28.6464 | 367726 | 4.30875 | 85344 |
| high temp | 1764 | 0.6319 | 1168 | 0.41845 | 2791 |
| landslide | 22448 | 21.5431 | 37 | 0.03551 | 1042 |
| other | 7228 | 2.5405 | 34935 | 12.27929 | 2845 |
| shore/marine | 41171 | 2.5570 | 1375 | 0.08542 | 16101 |
| storm | 3608582 | 9.8228 | 226200 | 0.61573 | 367367 |
| visibility | 23098 | 9.3400 | 2102 | 0.84978 | 2473 |
| wind | 3701654 | 37.7143 | 132862 | 1.35366 | 98150 |
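In contrast to injuries and fatalities, where high temperature had the highest per-event incidence, wind causes the most property damage per event, followed by fires and floods. Wind also causes the most property damage in absolute terms.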
# Bar chart of mean property damage per event for each category; the x-axis
# labels are rotated 90 degrees.
ggplot(dd1, aes(x = event, y = mean_prop)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
Crop damage per event was highest for “other” causes, but cold/snow/ice events caused the most crop damage in absolute terms.
# Bar chart of mean crop damage per event for each category; the x-axis
# labels are rotated 90 degrees.
ggplot(dd1, aes(x = event, y = mean_crop)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))