Synopsis

This data analysis explores the US NOAA Storm Events database, timeframe 1950 through 2011. Generally fewer events are recorded in the early years of the database, and later years are considered more complete. However, exploratory data analysis reveals that notable recent events are not included; among the missing are Hurricanes Camille (1969) and Katrina (2005).

The objective of this analysis is to determine which event types are most dangerous to population health, and which event types are most expensive in economic impact, specifically in terms of property and crop damage.

Prerequisites

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

Data and Supporting Info Sources

National Oceanic and Atmospheric Administration (NOAA) Storm Events Data

National Weather Service Storm Data Documentation

National Climatic Data Center Storm Events FAQ

Data Processing

Load the Data

Download and unzip the data.

# download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "storm-data.csv.bz2", method = "curl")

Load data with readr::read_csv.

stormData <- read_csv("storm-data.csv.bz2")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   STATE__ = col_double(),
##   COUNTY = col_double(),
##   BGN_RANGE = col_double(),
##   COUNTY_END = col_double(),
##   END_RANGE = col_double(),
##   LENGTH = col_double(),
##   WIDTH = col_double(),
##   F = col_integer(),
##   MAG = col_double(),
##   FATALITIES = col_double(),
##   INJURIES = col_double(),
##   PROPDMG = col_double(),
##   CROPDMG = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   LATITUDE_E = col_double(),
##   LONGITUDE_ = col_double(),
##   REFNUM = col_double()
## )
## See spec(...) for full column specifications.

Exploratory Data Analysis

Examine the data to determine size and structure.

# head(stormData)
# str(stormData)
dim(stormData)
## [1] 902297     37
colnames(stormData)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Examination of the data reveals 902,297 observations and 37 variables. Cross-referencing with the codebook indicates that only 7 of the variables are required to address the analysis objective, as follows:

  • EVTYPE
  • FATALITIES
  • INJURIES
  • PROPDMG
  • PROPDMGEXP
  • CROPDMG
  • CROPDMGEXP

Clean the Data

Convert EVTYPE values to consistent format, and convert PROPDMGEXP AND CROPDMGEXP exponent form to calculate property and crop damage cost.

data <- stormData %>% mutate(EVTYPE = str_to_upper(gsub("[^A-Za-z0-9]", 
        " ", EVTYPE))) %>% 
        mutate_at(vars(PROPDMGEXP, CROPDMGEXP),
        funs(as.numeric(dplyr::recode(., '0'=1, '1'=10, '2'=100, '3'=1000,
        '4'=10000, '5'=100000, '6'=1000000, '7'=10000000, '8'=100000000,
        'B'=1000000000, 'h'=100, 'H'=100, 'k'=1000, 'K'=1000,
        'm'=1000000, 'M'=1000000, ' '=0, '-'=0, '?'=0, '+'=0))))

Analyze the 10 Events Most Dangerous to Human Health

Summarize the value of the variables FATALITIES and INJURIES for each EVTYPE.

var_health <- c("FATALITIES", "INJURIES")

health_loss <- data %>% select(EVTYPE, var_health) %>% 
        group_by(EVTYPE) %>% 
        summarise(count = n(),
        FATALITIES = sum(FATALITIES, na.rm = TRUE),
        INJURIES = sum(INJURIES, na.rm = TRUE)) %>% 
        mutate(Total_Casualties = FATALITIES + INJURIES) %>% 
        arrange(desc(Total_Casualties))

Results: Health Impact

Determine the event with maximum impact.

knitr::kable(health_loss[1,], format.args = list(big.mark = ",", caption = "Most Dangerous"))
EVTYPE count FATALITIES INJURIES Total_Casualties
TORNADO 60,652 5,633 91,346 96,979

Top Ten Most Dangerous (Total Casualties = Fatalities + Injuries)

knitr::kable(head(health_loss, 10), format.args = list(big.mark = ",", caption = "Most Dangerous Event Types"))
EVTYPE count FATALITIES INJURIES Total_Casualties
TORNADO 60,652 5,633 91,346 96,979
EXCESSIVE HEAT 1,678 1,903 6,525 8,428
TSTM WIND 219,946 504 6,957 7,461
FLOOD 25,327 470 6,789 7,259
LIGHTNING 15,755 816 5,230 6,046
HEAT 767 937 2,100 3,037
FLASH FLOOD 54,278 978 1,777 2,755
ICE STORM 2,006 89 1,975 2,064
THUNDERSTORM WIND 82,564 133 1,488 1,621
WINTER STORM 11,433 206 1,321 1,527

Health Impact Plot

casualty <- health_loss[1:10, ] %>% select(-Total_Casualties) %>% 
        gather(Type, Health_Impact, -EVTYPE, -count)
ph <- ggplot(casualty, aes(x = EVTYPE, y = Health_Impact, fill = Type)) +
        theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
        geom_bar(stat = "identity", position = position_dodge()) +
        labs(title = "Environmental Danger: Top Ten Threats to Human Health",
             x = "Event",
             y = "Casualty: Fatality and Injury")
ph + scale_y_continuous(labels = comma)

Analyze Events with Most Expensive Economic Consequences

Summarize the value of the variables PROPDMGEXP, PROPDMG, CROPDMG, CROPDMGEXP for each EVTYPE, determine the maximum and select the top 10.

var_econ <- c("PROPDMGEXP", "PROPDMG", "CROPDMGEXP", "CROPDMG")

econ_loss <- data %>% 
        select(EVTYPE, var_econ) %>% 
        mutate(PROPERTY = PROPDMG * PROPDMGEXP,
        CROPS = CROPDMG * CROPDMGEXP) %>% 
        group_by(EVTYPE) %>% 
        summarise(count = n(),
        PROPERTY = sum(PROPERTY,  na.rm = TRUE),
        CROPS = sum(CROPS, na.rm = TRUE)) %>% 
        mutate(Economic_Loss = PROPERTY + CROPS) %>% 
        arrange(desc(Economic_Loss))

Results: Economic Cost in US Dollars

Determine the Event with Maximum Cost.

knitr::kable(econ_loss[1,], format.args = list(big.mark = ",", caption = "Most Expensive"))
EVTYPE count PROPERTY CROPS Economic_Loss
FLOOD 25,327 144,657,709,800 5,661,968,450 150,319,678,250

Top Ten Most Expensive (Economic Impact = Property Loss + Crop Loss)

knitr::kable(head(econ_loss, 10), format.args = list(big.mark = ",", caption = "Most Expensive Event Types"))
EVTYPE count PROPERTY CROPS Economic_Loss
FLOOD 25,327 144,657,709,800 5,661,968,450 150,319,678,250
HURRICANE TYPHOON 88 69,305,840,000 2,607,872,800 71,913,712,800
TORNADO 60,652 56,947,380,614 414,953,270 57,362,333,884
STORM SURGE 261 43,323,536,000 5,000 43,323,541,000
HAIL 288,661 15,735,267,456 3,025,954,470 18,761,221,926
FLASH FLOOD 54,278 16,822,723,772 1,421,317,100 18,244,040,872
DROUGHT 2,488 1,046,106,000 13,972,566,000 15,018,672,000
HURRICANE 174 11,868,319,010 2,741,910,000 14,610,229,010
RIVER FLOOD 173 5,118,945,500 5,029,459,000 10,148,404,500
ICE STORM 2,006 3,944,927,860 5,022,113,500 8,967,041,360

Economic Impact Plot

econ_impact <- econ_loss[1:10,] %>% select(-Economic_Loss) %>% 
        gather(Type, Economic_Impact, -EVTYPE, -count)
pe <- ggplot(econ_impact, aes(x = EVTYPE, y = Economic_Impact, fill = Type)) +
        theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
        geom_bar(stat = "identity", position = position_dodge()) +
        labs(title = "Environmental Cost: Top Ten Expensive Events",
             x = "Event",
             y = "Cost in US Dollars") 
pe + scale_y_continuous(labels = comma)

Results: Summary

According to the available data in NOAA Storm Events database, the most dangerous environmental event type to human population health is TORNADO and the event type with most expensive economic impact to property and crops is FLOOD.