This document analyzes the impacts of a set of meteorogical and accidental events across the USA. The events have been collected from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. Data collected ranges from 1950 to 2011.
Data holds details about health and damage impacts:
Data is avaliable here. The analyzes starts by cleaning and refactoring data. Next it is grouped by event and ordered by total loss.
Necessary libraries are loaded and data is assumed to be in a default directory.
library(readr)
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
setwd("D:/products/R/trainning/coursera/DataScience_JohnHopkins/5/peerassigment2/")
df_storm_raw <- read_csv("repdata%2Fdata%2FStormData.csv")
Only the USA 50 states and Washington DC are considered as acceptable data. Other territories under USA sovereignty are not considered. States are available from state {datasets}.
df_storm_USA <- df_storm_raw[toupper(df_storm_raw$STATE) %in% state.abb | df_storm_raw$STATE == "DC", ]
From the total set of columns available, the relevant ones are:
df_storm_USA_selected <- df_storm_USA[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
Damage columns have associated exponencial column indicators that need to be parsed. In those columns, there may be a number of additonal zeros to be added or letters used as multiplier indicators. Exponencial data needs to be converted to numeric multiplers so that data has the same format. Conversions are:
After conversion, the columns that hold the actual values are updated to the same units. By using formula:
\(final = value * 10^{exp}\)
Health data is just converted for numeric data type, no special handling is necessary.
unique(df_storm_USA_selected$PROPDMGEXP)
## [1] "K" "M" NA "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
df_storm_USA_selected$PROPDMGEXP[df_storm_USA_selected$PROPDMGEXP %in% c("?", "+", "-")] <- 0
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "M"] <- 6
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "K"] <- 3
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "H"] <- 2
df_storm_USA_selected$PROPDMGEXP[toupper(df_storm_USA_selected$PROPDMGEXP) == "B"] <- 9
df_storm_USA_selected$PROPDMGEXP[is.na(df_storm_USA_selected$PROPDMGEXP)] <- 0
df_storm_USA_selected$PROPDMGEXP <- as.numeric(df_storm_USA_selected$PROPDMGEXP)
unique(df_storm_USA_selected$CROPDMGEXP)
## [1] NA "M" "K" "m" "B" "?" "0" "k" "2"
df_storm_USA_selected$CROPDMGEXP[df_storm_USA_selected$CROPDMGEXP %in% c("?", "+", "-")] <- 0
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "M"] <- 6
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "K"] <- 3
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "H"] <- 2
df_storm_USA_selected$CROPDMGEXP[toupper(df_storm_USA_selected$CROPDMGEXP) == "B"] <- 9
df_storm_USA_selected$CROPDMGEXP[is.na(df_storm_USA_selected$CROPDMGEXP)] <- 0
df_storm_USA_selected$CROPDMGEXP <- as.numeric(df_storm_USA_selected$CROPDMGEXP)
df_storm_USA_selected$PROPDMG <- as.numeric(df_storm_USA_selected$PROPDMG)
df_storm_USA_selected$CROPDMG <- as.numeric(df_storm_USA_selected$CROPDMG)
df_storm_USA_selected$PROPDMG <- df_storm_USA_selected$PROPDMG * 10^df_storm_USA_selected$PROPDMGEXP
df_storm_USA_selected$CROPDMG <- df_storm_USA_selected$CROPDMG * 10^df_storm_USA_selected$CROPDMGEXP
df_storm_USA_selected <- df_storm_USA_selected[,!names(df_storm_USA_selected) %in% c("PROPDMGEXP", "CROPDMGEXP")]
df_storm_USA_selected$FATALITIES <- as.numeric(df_storm_USA_selected$FATALITIES)
df_storm_USA_selected$INJURIES <- as.numeric(df_storm_USA_selected$INJURIES)
Event column holds 946 different values. Some of these values result from variations in an event, typos or wrong data. This data needs to be manually grouped with the following rules:
df_storm_USA_selected[(grep("SMOKE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "SMOKE"
df_storm_USA_selected[(grep("AVALANCE|AVALANCHE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "AVALANCHE"
df_storm_USA_selected[(grep("FROST|FREEZE|ICE|ICY|FREEZING|GLAZE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "ICE"
df_storm_USA_selected[(grep("FLOOD|FLOOOD|DAM|TSUNAMI|HIGH WATER|RISING WATER|FLD|STREAM", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "FLOOD"
df_storm_USA_selected[(grep("TORNADO|TORNDAO|FUNNEL|WATER SPOUT|WATERSPOUT|WAYTERSPOUT|GUSTNADO", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "TORNADO"
df_storm_USA_selected[(grep("HURRICANE|TYPHOON|FLOYD", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "HURRICANE"
df_storm_USA_selected[(grep("LIGHTNING|LIGHTING|LIGNTNINGN|LIGNTNING", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "LIGHTNING"
df_storm_USA_selected[(grep("THUNDERSTORM|STORM|TSTM|TROPICAL DEPRESSION", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "STORM"
df_storm_USA_selected[(grep("FIRE|RED FLAG", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "FIRE"
df_storm_USA_selected[(grep("DRY|DRIEST", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "DROUGHT"
df_storm_USA_selected[(grep("HEAT|HOT|WARM|WARMTH|HIGH TEMPERATURE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "HEAT"
df_storm_USA_selected[(grep("EROSION|EROSIN|LANDSLUMP|LANDSPOUT|SLIDE|COASTAL SURGE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "EROSION"
df_storm_USA_selected[(grep("FOG", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "FOG"
df_storm_USA_selected[(grep("RAIN|WET|PRECIPITATION|PRECIPATATION|PRECIP|SNOW|BLIZZARD|WINTRY|HAIL|SLEET", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "PRECIPITATION"
df_storm_USA_selected[(grep("COLD|COOL|LOW TEMPERATURE|WINTER", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "COLD"
df_storm_USA_selected[(grep("DUST", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "DUST"
df_storm_USA_selected[(grep("SEA|SURF|WAVE|TIDE|SEICHE", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "ROUGH SEA"
df_storm_USA_selected[(grep("RIP CURRENT|RIP CURRENTS", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "RIP CURRENTS"
df_storm_USA_selected[(grep("VOLCA|VOG", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "VOLCANO"
df_storm_USA_selected[(grep("WIND|WND|BURST", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "WIND"
df_storm_USA_selected[(grep("OTHER|\\?|^SUMMARY|NONE|RECORD|URBAN|ACCIDENT|TEMPERATURE|WEATHER|HIGH|CLOUD|EXPOSURE|MILD|TURBULENCE|MISHAP|MIX", df_storm_USA_selected$EVTYPE,ignore.case = TRUE)), ]$EVTYPE <- "OTHER"
df_storm_USA_selected$EVTYPE <- as.factor(df_storm_USA_selected$EVTYPE)
Finally data is aggregated by event. Columns are added to count the number of ocurrences and the total sum for health and damage.
\(Total_{Victims} = Injuries + Fatalities\)
\(Total_{Losses} = Crop + Property\)
NOTE: Events with less that 10 ocurrences are not condsidered. These are typos/issues in data.
df_storm_USA_agg <- df_storm_USA_selected %>%
group_by(EVTYPE) %>%
summarize(COUNT = n(),
FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES), TOTAL_HEALTH = sum(FATALITIES + INJURIES),
CROP = sum(CROPDMG), PROPERTY = sum(PROPDMG), TOTAL_DAMAGE = sum(CROPDMG + PROPDMG))
df_storm_USA_agg <- df_storm_USA_agg[df_storm_USA_agg$COUNT >= 10,]
Health data is prepared for presentation:
df_storm_USA_agg_health <- df_storm_USA_agg[, c("EVTYPE", "FATALITIES", "INJURIES", "TOTAL_HEALTH")]
df_storm_USA_agg_health <- df_storm_USA_agg_health[order(-df_storm_USA_agg_health$TOTAL_HEALTH),]
df_storm_USA_agg_health_tidy <- gather(df_storm_USA_agg_health, "CODE", "VALUE", 2:4)
df_storm_USA_agg_health_tidy$CODE <- as.factor(df_storm_USA_agg_health_tidy$CODE)
The following graph shows the total victims per event. Data for injuries, fatalities and the sum of both is presented:
g1 <- ggplot(df_storm_USA_agg_health_tidy, aes(x = reorder(EVTYPE, VALUE), y = VALUE / 1000)) +
geom_bar(aes(fill = factor(CODE)), stat = "identity", position = position_dodge()) +
ylab("Victims (in 1 000)") +
xlab("Events") +
theme(legend.title = element_blank()) +
ggtitle("Events health impacts") +
coord_flip()
g1
The following table presents health details, ordered by total victims:
| EVTYPE | FATALITIES | INJURIES | TOTAL_HEALTH |
|---|---|---|---|
| TORNADO | 5664 | 91438 | 97102 |
| STORM | 1037 | 11717 | 12754 |
| HEAT | 3149 | 9243 | 12392 |
| FLOOD | 1508 | 8677 | 10185 |
| LIGHTNING | 807 | 5226 | 6033 |
| PRECIPITATION | 353 | 3694 | 4047 |
| ICE | 127 | 2454 | 2581 |
| WIND | 442 | 1839 | 2281 |
| FIRE | 90 | 1607 | 1697 |
| COLD | 503 | 857 | 1360 |
| FOG | 81 | 1077 | 1158 |
| HURRICANE | 113 | 973 | 1086 |
| RIP CURRENTS | 518 | 471 | 989 |
| ROUGH SEA | 157 | 252 | 409 |
| AVALANCHE | 225 | 171 | 396 |
| EROSION | 41 | 51 | 92 |
| DROUGHT | 32 | 33 | 65 |
| DUST | 2 | 43 | 45 |
| OTHER | 17 | 12 | 29 |
| SMOKE | 0 | 0 | 0 |
It is possible to conclude that tornados are by far the highest threat event. Even considering fatalities or injuries isolated, tornados are still on top.
The total health issues in tornados are 97.102 K resulting from 91.438 K injuries and 5.664 K fatalities.
Damage data is prepared for presentation:
df_storm_USA_agg_damage <- df_storm_USA_agg[, c("EVTYPE", "CROP", "PROPERTY", "TOTAL_DAMAGE")]
df_storm_USA_agg_damage <- df_storm_USA_agg_damage[order(-df_storm_USA_agg_damage$TOTAL),]
# Gather columns in name value for further separation
df_storm_USA_agg_damage_tidy <- gather(df_storm_USA_agg_damage, "CODE", "VALUE", 2:4)
df_storm_USA_agg_damage_tidy$CODE <- as.factor(df_storm_USA_agg_damage_tidy$CODE)
The following graphic shows the total damage per event. Data for crops, property and the sum of both is presented:
g2 <- ggplot(df_storm_USA_agg_damage_tidy, aes(x = reorder(EVTYPE, VALUE), y = VALUE / 1000000000)) +
geom_bar(aes(fill = factor(CODE)), stat = "identity", position = position_dodge()) +
ylab("Damage (in $ 1 000 000 000)") +
xlab("Events") +
theme(legend.title = element_blank()) +
ggtitle("Events economical impacts") +
coord_flip()
g2
The following table presents damage details, ordered by total damage:
| EVTYPE | CROP | PROPERTY | TOTAL_DAMAGE |
|---|---|---|---|
| FLOOD | 12321240200 | 167954591285 | 180275831485 |
| HURRICANE | 4958940800 | 82529119010 | 87488059810 |
| STORM | 1898215988 | 73736232191 | 75634448179 |
| TORNADO | 417462070 | 58607481077 | 59024943147 |
| PRECIPITATION | 4312663523 | 20632697723 | 24945361246 |
| DROUGHT | 13972381000 | 1047838600 | 15020219600 |
| ICE | 7024175300 | 4013257110 | 11037432410 |
| FIRE | 402116630 | 8489501500 | 8891618130 |
| WIND | 766971400 | 6067326183 | 6834297583 |
| COLD | 1365765500 | 272827400 | 1638592900 |
| LIGHTNING | 12097090 | 939729430 | 951826520 |
| HEAT | 904479280 | 20325750 | 924805030 |
| EROSION | 20017000 | 323258600 | 343275600 |
| ROUGH SEA | 0 | 102033650 | 102033650 |
| FOG | 0 | 22829500 | 22829500 |
| AVALANCHE | 0 | 8721800 | 8721800 |
| OTHER | 1034400 | 1470500 | 2504900 |
| DUST | 0 | 738630 | 738630 |
| RIP CURRENTS | 0 | 101000 | 101000 |
| SMOKE | 0 | 100000 | 100000 |
It is possible to conclude that floods are the most threat event.
The total loss in floods is $ 180.2758315 B resulting from $ 12.3212402 B in crops and $ 167.9545913 B in property.
If crops are only considered, the biggest cost is in drought with $ 13.972381 B.
The following statements are concluded: