This data analysis explores the US NOAA Storm Events database, timeframe 1950 through 2011. Generally fewer events are recorded in the early years of the database, and later years are considered more complete. However, exploratory data analysis reveals that notable recent events are not included; among the missing are Hurricanes Camille (1969) and Katrina (2005).
The objective of this analysis is to determine which event types are most dangerous to population health, and which event types are most expensive in economic impact, specifically in terms of property and crop damage.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
National Oceanic and Atmospheric Administration (NOAA) Storm Events Data
National Weather Service Storm Data Documentation
National Climatic Data Center Storm Events FAQ
Download and unzip the data.
# download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "storm-data.csv.bz2", method = "curl")
Load data with readr::read_csv.
stormData <- read_csv("storm-data.csv.bz2")
## Parsed with column specification:
## cols(
## .default = col_character(),
## STATE__ = col_double(),
## COUNTY = col_double(),
## BGN_RANGE = col_double(),
## COUNTY_END = col_double(),
## END_RANGE = col_double(),
## LENGTH = col_double(),
## WIDTH = col_double(),
## F = col_integer(),
## MAG = col_double(),
## FATALITIES = col_double(),
## INJURIES = col_double(),
## PROPDMG = col_double(),
## CROPDMG = col_double(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## LATITUDE_E = col_double(),
## LONGITUDE_ = col_double(),
## REFNUM = col_double()
## )
## See spec(...) for full column specifications.
Examine the data to determine size and structure.
# head(stormData)
# str(stormData)
dim(stormData)
## [1] 902297 37
colnames(stormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Examination of the data reveals 902,297 observations and 37 variables. Cross-referencing with the codebook indicates that only 7 of the variables are required to address the analysis objective, as follows:
Convert EVTYPE values to consistent format, and convert PROPDMGEXP AND CROPDMGEXP exponent form to calculate property and crop damage cost.
data <- stormData %>% mutate(EVTYPE = str_to_upper(gsub("[^A-Za-z0-9]",
" ", EVTYPE))) %>%
mutate_at(vars(PROPDMGEXP, CROPDMGEXP),
funs(as.numeric(dplyr::recode(., '0'=1, '1'=10, '2'=100, '3'=1000,
'4'=10000, '5'=100000, '6'=1000000, '7'=10000000, '8'=100000000,
'B'=1000000000, 'h'=100, 'H'=100, 'k'=1000, 'K'=1000,
'm'=1000000, 'M'=1000000, ' '=0, '-'=0, '?'=0, '+'=0))))
Summarize the value of the variables FATALITIES and INJURIES for each EVTYPE.
var_health <- c("FATALITIES", "INJURIES")
health_loss <- data %>% select(EVTYPE, var_health) %>%
group_by(EVTYPE) %>%
summarise(count = n(),
FATALITIES = sum(FATALITIES, na.rm = TRUE),
INJURIES = sum(INJURIES, na.rm = TRUE)) %>%
mutate(Total_Casualties = FATALITIES + INJURIES) %>%
arrange(desc(Total_Casualties))
Determine the event with maximum impact.
knitr::kable(health_loss[1,], format.args = list(big.mark = ",", caption = "Most Dangerous"))
| EVTYPE | count | FATALITIES | INJURIES | Total_Casualties |
|---|---|---|---|---|
| TORNADO | 60,652 | 5,633 | 91,346 | 96,979 |
knitr::kable(head(health_loss, 10), format.args = list(big.mark = ",", caption = "Most Dangerous Event Types"))
| EVTYPE | count | FATALITIES | INJURIES | Total_Casualties |
|---|---|---|---|---|
| TORNADO | 60,652 | 5,633 | 91,346 | 96,979 |
| EXCESSIVE HEAT | 1,678 | 1,903 | 6,525 | 8,428 |
| TSTM WIND | 219,946 | 504 | 6,957 | 7,461 |
| FLOOD | 25,327 | 470 | 6,789 | 7,259 |
| LIGHTNING | 15,755 | 816 | 5,230 | 6,046 |
| HEAT | 767 | 937 | 2,100 | 3,037 |
| FLASH FLOOD | 54,278 | 978 | 1,777 | 2,755 |
| ICE STORM | 2,006 | 89 | 1,975 | 2,064 |
| THUNDERSTORM WIND | 82,564 | 133 | 1,488 | 1,621 |
| WINTER STORM | 11,433 | 206 | 1,321 | 1,527 |
casualty <- health_loss[1:10, ] %>% select(-Total_Casualties) %>%
gather(Type, Health_Impact, -EVTYPE, -count)
ph <- ggplot(casualty, aes(x = EVTYPE, y = Health_Impact, fill = Type)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(title = "Environmental Danger: Top Ten Threats to Human Health",
x = "Event",
y = "Casualty: Fatality and Injury")
ph + scale_y_continuous(labels = comma)
Summarize the value of the variables PROPDMGEXP, PROPDMG, CROPDMG, CROPDMGEXP for each EVTYPE, determine the maximum and select the top 10.
var_econ <- c("PROPDMGEXP", "PROPDMG", "CROPDMGEXP", "CROPDMG")
econ_loss <- data %>%
select(EVTYPE, var_econ) %>%
mutate(PROPERTY = PROPDMG * PROPDMGEXP,
CROPS = CROPDMG * CROPDMGEXP) %>%
group_by(EVTYPE) %>%
summarise(count = n(),
PROPERTY = sum(PROPERTY, na.rm = TRUE),
CROPS = sum(CROPS, na.rm = TRUE)) %>%
mutate(Economic_Loss = PROPERTY + CROPS) %>%
arrange(desc(Economic_Loss))
Determine the Event with Maximum Cost.
knitr::kable(econ_loss[1,], format.args = list(big.mark = ",", caption = "Most Expensive"))
| EVTYPE | count | PROPERTY | CROPS | Economic_Loss |
|---|---|---|---|---|
| FLOOD | 25,327 | 144,657,709,800 | 5,661,968,450 | 150,319,678,250 |
knitr::kable(head(econ_loss, 10), format.args = list(big.mark = ",", caption = "Most Expensive Event Types"))
| EVTYPE | count | PROPERTY | CROPS | Economic_Loss |
|---|---|---|---|---|
| FLOOD | 25,327 | 144,657,709,800 | 5,661,968,450 | 150,319,678,250 |
| HURRICANE TYPHOON | 88 | 69,305,840,000 | 2,607,872,800 | 71,913,712,800 |
| TORNADO | 60,652 | 56,947,380,614 | 414,953,270 | 57,362,333,884 |
| STORM SURGE | 261 | 43,323,536,000 | 5,000 | 43,323,541,000 |
| HAIL | 288,661 | 15,735,267,456 | 3,025,954,470 | 18,761,221,926 |
| FLASH FLOOD | 54,278 | 16,822,723,772 | 1,421,317,100 | 18,244,040,872 |
| DROUGHT | 2,488 | 1,046,106,000 | 13,972,566,000 | 15,018,672,000 |
| HURRICANE | 174 | 11,868,319,010 | 2,741,910,000 | 14,610,229,010 |
| RIVER FLOOD | 173 | 5,118,945,500 | 5,029,459,000 | 10,148,404,500 |
| ICE STORM | 2,006 | 3,944,927,860 | 5,022,113,500 | 8,967,041,360 |
econ_impact <- econ_loss[1:10,] %>% select(-Economic_Loss) %>%
gather(Type, Economic_Impact, -EVTYPE, -count)
pe <- ggplot(econ_impact, aes(x = EVTYPE, y = Economic_Impact, fill = Type)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(title = "Environmental Cost: Top Ten Expensive Events",
x = "Event",
y = "Cost in US Dollars")
pe + scale_y_continuous(labels = comma)
According to the available data in NOAA Storm Events database, the most dangerous environmental event type to human population health is TORNADO and the event type with most expensive economic impact to property and crops is FLOOD.