Synopsis

This document analyzes the impacts of a set of meteorogical and accidental events across the USA. The events have been collected from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. Data collected ranges from 1950 to 2011.
Data holds details about health and damage impacts:

Data is avaliable here. The analyzes starts by cleaning and refactoring data. Next it is grouped by event and ordered by total loss.

Data Processing

Necessary libraries are loaded and data is assumed to be in a default directory.

library(readr)
library(ggplot2)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)

setwd("D:/products/R/trainning/coursera/DataScience_JohnHopkins/5/peerassigment2/")

df_storm_raw <- read_csv("repdata%2Fdata%2FStormData.csv")

Only the USA 50 states and Washington DC are considered as acceptable data. Other territories under USA sovereignty are not considered. States are available from state {datasets}.

df_storm_USA <- df_storm_raw[toupper(df_storm_raw$STATE) %in% state.abb | df_storm_raw$STATE == "DC", ]

From the total set of columns available, the relevant ones are:

df_storm_USA_selected <- df_storm_USA[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

Damage columns have associated exponencial column indicators that need to be parsed. In those columns, there may be a number of additonal zeros to be added or letters used as multiplier indicators. Exponencial data needs to be converted to numeric multiplers so that data has the same format. Conversions are:

After conversion, the columns that hold the actual values are updated to the same units. By using formula:

\(final = value * 10^{exp}\)

Health data is just converted for numeric data type, no special handling is necessary.

unique(df_storm_USA_selected$PROPDMGEXP)
##  [1] "K" "M" NA  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
df_storm_USA_selected$PROPDMGEXP[df_storm_USA_selected$PROPDMGEXP %in% c("?", "+", "-")] <- 0
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "M"] <- 6
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "K"] <- 3
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "H"] <- 2
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "B"] <- 9
df_storm_USA_selected$PROPDMGEXP[is.na(df_storm_USA_selected$PROPDMGEXP)] <- 0
df_storm_USA_selected$PROPDMGEXP <- as.numeric(df_storm_USA_selected$PROPDMGEXP)

unique(df_storm_USA_selected$CROPDMGEXP)
## [1] NA  "M" "K" "m" "B" "?" "0" "k" "2"
df_storm_USA_selected$CROPDMGEXP[df_storm_USA_selected$CROPDMGEXP %in% c("?", "+", "-")] <- 0
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "M"] <- 6
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "K"] <- 3
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "H"] <- 2
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "B"] <- 9
df_storm_USA_selected$CROPDMGEXP[is.na(df_storm_USA_selected$CROPDMGEXP)] <- 0
df_storm_USA_selected$CROPDMGEXP <- as.numeric(df_storm_USA_selected$CROPDMGEXP)

df_storm_USA_selected$PROPDMG <- as.numeric(df_storm_USA_selected$PROPDMG)
df_storm_USA_selected$CROPDMG <- as.numeric(df_storm_USA_selected$CROPDMG)

df_storm_USA_selected$PROPDMG <- df_storm_USA_selected$PROPDMG * 10^df_storm_USA_selected$PROPDMGEXP
df_storm_USA_selected$CROPDMG <- df_storm_USA_selected$CROPDMG * 10^df_storm_USA_selected$CROPDMGEXP

df_storm_USA_selected <- df_storm_USA_selected[,!names(df_storm_USA_selected) %in% c("PROPDMGEXP", "CROPDMGEXP")]

df_storm_USA_selected$FATALITIES <- as.numeric(df_storm_USA_selected$FATALITIES)
df_storm_USA_selected$INJURIES   <- as.numeric(df_storm_USA_selected$INJURIES)

Event column holds 946 different values. Some of these values result from variations in an event, typos or wrong data. This data needs to be manually grouped with the following rules:

df_storm_USA_selected[(grep("SMOKE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "SMOKE"
df_storm_USA_selected[(grep("AVALANCE|AVALANCHE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "AVALANCHE"
df_storm_USA_selected[(grep("FROST|FREEZE|ICE|ICY|FREEZING|GLAZE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "ICE"
df_storm_USA_selected[(grep("FLOOD|FLOOOD|DAM|TSUNAMI|HIGH WATER|RISING WATER|FLD|STREAM", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "FLOOD"
df_storm_USA_selected[(grep("TORNADO|TORNDAO|FUNNEL|WATER SPOUT|WATERSPOUT|WAYTERSPOUT|GUSTNADO", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "TORNADO"
df_storm_USA_selected[(grep("HURRICANE|TYPHOON|FLOYD", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "HURRICANE"
df_storm_USA_selected[(grep("LIGHTNING|LIGHTING|LIGNTNINGN|LIGNTNING", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "LIGHTNING"
df_storm_USA_selected[(grep("THUNDERSTORM|STORM|TSTM|TROPICAL DEPRESSION", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "STORM"
df_storm_USA_selected[(grep("FIRE|RED FLAG", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "FIRE"
df_storm_USA_selected[(grep("DRY|DRIEST", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "DROUGHT"
df_storm_USA_selected[(grep("HEAT|HOT|WARM|WARMTH|HIGH TEMPERATURE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "HEAT"
df_storm_USA_selected[(grep("EROSION|EROSIN|LANDSLUMP|LANDSPOUT|SLIDE|COASTAL SURGE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "EROSION"
df_storm_USA_selected[(grep("FOG", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "FOG"
df_storm_USA_selected[(grep("RAIN|WET|PRECIPITATION|PRECIPATATION|PRECIP|SNOW|BLIZZARD|WINTRY|HAIL|SLEET", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "PRECIPITATION"
df_storm_USA_selected[(grep("COLD|COOL|LOW TEMPERATURE|WINTER", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "COLD"
df_storm_USA_selected[(grep("DUST", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "DUST"
df_storm_USA_selected[(grep("SEA|SURF|WAVE|TIDE|SEICHE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "ROUGH SEA"
df_storm_USA_selected[(grep("RIP CURRENT|RIP CURRENTS", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "RIP CURRENTS"
df_storm_USA_selected[(grep("VOLCA|VOG", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "VOLCANO"
df_storm_USA_selected[(grep("WIND|WND|BURST", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "WIND"
df_storm_USA_selected[(grep("OTHER|\\?|^SUMMARY|NONE|RECORD|URBAN|ACCIDENT|TEMPERATURE|WEATHER|HIGH|CLOUD|EXPOSURE|MILD|TURBULENCE|MISHAP|MIX", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "OTHER"

df_storm_USA_selected$EVTYPE <- as.factor(df_storm_USA_selected$EVTYPE)

Finally data is aggregated by event. Columns are added to count the number of ocurrences and the total sum for health and damage.

\(Total_{Victims} = Injuries + Fatalities\)
\(Total_{Losses} = Crop + Property\)

NOTE: Events with less that 10 ocurrences are not condsidered. These are typos/issues in data.

df_storm_USA_agg <- df_storm_USA_selected %>%
                    group_by(EVTYPE) %>%
                    summarize(COUNT = n(), 
                              FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES), TOTAL_HEALTH = sum(FATALITIES + INJURIES),
                              CROP = sum(CROPDMG), PROPERTY = sum(PROPDMG), TOTAL_DAMAGE = sum(CROPDMG + PROPDMG))

df_storm_USA_agg <- df_storm_USA_agg[df_storm_USA_agg$COUNT >= 10,]

Result

Question 1: Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Health data is prepared for presentation:

df_storm_USA_agg_health <- df_storm_USA_agg[, c("EVTYPE", "FATALITIES", "INJURIES", "TOTAL_HEALTH")]
df_storm_USA_agg_health <- df_storm_USA_agg_health[order(-df_storm_USA_agg_health$TOTAL_HEALTH),] 

df_storm_USA_agg_health_tidy <- gather(df_storm_USA_agg_health, "CODE", "VALUE", 2:4)

df_storm_USA_agg_health_tidy$CODE <- as.factor(df_storm_USA_agg_health_tidy$CODE)

The following graph shows the total victims per event. Data for injuries, fatalities and the sum of both is presented:

g1 <- ggplot(df_storm_USA_agg_health_tidy, aes(x = reorder(EVTYPE, VALUE), y = VALUE / 1000)) +
      geom_bar(aes(fill = factor(CODE)), stat = "identity", position = position_dodge()) +
      ylab("Victims (in 1 000)") +
      xlab("Events") + 
      theme(legend.title = element_blank()) +
      ggtitle("Events health impacts") +
      coord_flip()

g1

The following table presents health details, ordered by total victims:

EVTYPE FATALITIES INJURIES TOTAL_HEALTH
TORNADO 5664 91438 97102
STORM 1037 11717 12754
HEAT 3149 9243 12392
FLOOD 1508 8677 10185
LIGHTNING 807 5226 6033
PRECIPITATION 353 3694 4047
ICE 127 2454 2581
WIND 442 1839 2281
FIRE 90 1607 1697
COLD 503 857 1360
FOG 81 1077 1158
HURRICANE 113 973 1086
RIP CURRENTS 518 471 989
ROUGH SEA 157 252 409
AVALANCHE 225 171 396
EROSION 41 51 92
DROUGHT 32 33 65
DUST 2 43 45
OTHER 17 12 29
SMOKE 0 0 0

It is possible to conclude that tornados are by far the highest threat event. Even considering fatalities or injuries isolated, tornados are still on top.
The total health issues in tornados are 97.102 K resulting from 91.438 K injuries and 5.664 K fatalities.

Question 2: Across the United States, which types of events have the greatest economic consequences?

Damage data is prepared for presentation:

df_storm_USA_agg_damage <- df_storm_USA_agg[, c("EVTYPE", "CROP", "PROPERTY", "TOTAL_DAMAGE")]
df_storm_USA_agg_damage <- df_storm_USA_agg_damage[order(-df_storm_USA_agg_damage$TOTAL),] 

# Gather columns in name value for further separation
df_storm_USA_agg_damage_tidy <- gather(df_storm_USA_agg_damage, "CODE", "VALUE", 2:4)
df_storm_USA_agg_damage_tidy$CODE <- as.factor(df_storm_USA_agg_damage_tidy$CODE)

The following graphic shows the total damage per event. Data for crops, property and the sum of both is presented:

g2 <- ggplot(df_storm_USA_agg_damage_tidy, aes(x = reorder(EVTYPE, VALUE), y = VALUE / 1000000000)) +
      geom_bar(aes(fill = factor(CODE)), stat = "identity", position = position_dodge()) +
      ylab("Damage (in $ 1 000 000 000)") +
      xlab("Events") + 
      theme(legend.title = element_blank()) +
      ggtitle("Events economical impacts") +
      coord_flip()

g2

The following table presents damage details, ordered by total damage:

EVTYPE CROP PROPERTY TOTAL_DAMAGE
FLOOD 12321240200 167954591285 180275831485
HURRICANE 4958940800 82529119010 87488059810
STORM 1898215988 73736232191 75634448179
TORNADO 417462070 58607481077 59024943147
PRECIPITATION 4312663523 20632697723 24945361246
DROUGHT 13972381000 1047838600 15020219600
ICE 7024175300 4013257110 11037432410
FIRE 402116630 8489501500 8891618130
WIND 766971400 6067326183 6834297583
COLD 1365765500 272827400 1638592900
LIGHTNING 12097090 939729430 951826520
HEAT 904479280 20325750 924805030
EROSION 20017000 323258600 343275600
ROUGH SEA 0 102033650 102033650
FOG 0 22829500 22829500
AVALANCHE 0 8721800 8721800
OTHER 1034400 1470500 2504900
DUST 0 738630 738630
RIP CURRENTS 0 101000 101000
SMOKE 0 100000 100000

It is possible to conclude that floods are the most threat event.
The total loss in floods is $ 180.2758315 B resulting from $ 12.3212402 B in crops and $ 167.9545913 B in property.
If crops are only considered, the biggest cost is in drought with $ 13.972381 B.

Conclusion

The following statements are concluded: