This analysis bases on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks the characteristics of major storms and weather events in the United States. This will focus on analyzing how natural disasters affected society in the US in the period of 1950-2011. About the consequence in people’s health, the number of direct/indirect deaths and injuries cases are reported. The impact in the economy is calculated as total damage on properties, crops, and is reported in USD unit.
required_packages <- c('dplyr', 'ggplot2')
for (package in required_packages) {
if (!package %in% rownames(installed.packages())) {
install.packages(package)
}
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data_url <- 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
file_path <- 'raw_data/compressed.csv.bz2'
raw_data_description <- 'https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf'
download.file(data_url, file_path)
download.file(raw_data_description, 'raw_data/data_description.pdf')
storm_data <- read.csv(file_path)
discarded_columns <- c('REMARKS', 'REFNUM', 'LATITUDE', 'LONGITUDE',
'LATITUDE_E', 'LONGITUDE_', 'ZONENAMES', 'STATEOFFIC',
'WFO', 'BGN_TIME', 'TIME_ZONE', 'COUNTY', 'COUNTYNAME',
'END_TIME', 'COUNTY_END', 'COUNTYENDN', 'END_RANGE',
'END_LOCATI', 'BGN_RANGE', 'BGN_LOCATI', 'BGN_AZI',
'END_AZI', 'LENGTH', 'WIDTH', 'STATE__', 'MAG', 'F',
'BGN_DATE', 'END_DATE', 'STATE'
)
for (colname in discarded_columns) {
storm_data[colname] <- NULL
}
original_event_types <- list('tornado', 'hail', 'rain', 'snow', 'storm', 'dry',
'sleet', 'cloud', 'thunder', 'lightning', 'flood',
'drizzle', 'drought', 'frost', 'cold', 'eruption',
'fog', 'dust', 'waterspout', 'blizzard', 'tsunami',
c('wind', 'gust'),
c('tidal', 'tide'),
c('erosion', 'erosin'),
c('avalanche', 'avalance'),
c('tide', 'wave', 'surge'),
c('hurricane', 'whirlwind', 'typhoon'),
c('landslump', 'landslide', 'mudslide')
)
# Convert all character to lower case
storm_data$EVTYPE <- tolower(storm_data$EVTYPE)
storm_data$PROPDMGEXP <- tolower(storm_data$PROPDMGEXP)
storm_data$CROPDMGEXP <- tolower(storm_data$CROPDMGEXP)
construct_new_data <- function (other_storm_data) {
new_variable_name <- 'event_type'
new_storm_data <- list()
counter <- 1
for (i in 1:nrow(other_storm_data)) {
key_words <- strsplit(other_storm_data[i, 'EVTYPE'],
paste(c('/', ' '), collapse = '|'))
# Consider all event class
for (original_event_type in original_event_types) {
found <- F
# Consider all event's name of that class
for (my_type in original_event_type) {
# If that original event type contains the keyword, then this event is one of the current class of event
if (my_type %in% key_words) {
found = T
break
}
}
# The current row is classified into event classes here
if (found) {
# Discard the first column (EVTYPE)
new_row <- other_storm_data[i, -1]
# Add event class
new_row[new_variable_name] <- original_event_type[[1]][1]
# Add to the new data set
new_storm_data[[counter]] <- new_row
counter <- counter + 1
}
}
}
dplyr::bind_rows(new_storm_data)
}
system.time(new_storm_data <- construct_new_data(storm_data))
## user system elapsed
## 809.257 56.213 865.752
create_heath_data <- function (input_data) {
heath_properties <- input_data[, c('FATALITIES', 'INJURIES', 'event_type')]
fatalities <- heath_properties[, c('FATALITIES', 'event_type')]
# All statuses are fatalities
fatalities$heath_type <- rep('fatalities', nrow(heath_properties))
names(fatalities) <- c('count', 'event_type', 'heath_type')
injuries <- heath_properties[, c('INJURIES', 'event_type')]
# All statuses are injuries
injuries$heath_type <- rep('injuries', nrow(heath_properties))
names(injuries) <- c('count', 'event_type', 'heath_type')
# Bind injuries and deaths by row into a new data set
rbind(fatalities, injuries)
}
heath_data <- create_heath_data(new_storm_data)
system.time(
economic_data <- apply(new_storm_data, MARGIN = 1, function (row) {
# 'k', 'm', 'b' means '1000', '1000000', '1000000000' respectively, as the properties and crop damages were written as unit (in the PROPDMG and CROPDMG) and suffix 'k', 'b', 'm' (in the PROPDMGEXP and CROPDMGEXP).
calc_exp <- function (ch) {
if (ch == 'k') {
return (1e3)
} else {
if (ch == 'm') {
return (1e6)
} else {
if (ch == 'b') {
return (1e9)
} else {
return (0)
}
}
}
}
calc_origin <- function (num) {
if (!is.na(as.numeric(num))) {
return (as.numeric(num))
}
return (0)
}
economic_damage <- calc_origin(row[['PROPDMG']]) * calc_exp(row[['PROPDMGEXP']]) +
calc_origin(row[['CROPDMG']]) * calc_exp(row[['CROPDMGEXP']])
data.frame(event_type = row[['event_type']], damage = economic_damage)
}) %>%
dplyr::bind_rows()
)
## user system elapsed
## 325.211 4.794 330.117
heath_data_set <- aggregate(formula = count ~ event_type + heath_type,
data = heath_data,
FUN = sum, na.rm = T)
with(heath_data_set,
ggplot2::ggplot(heath_data_set,
aes(x = reorder(event_type, count),
y = count,
fill = event_type)
) +
ggplot2::coord_flip() +
ggplot2::geom_bar(stat = 'identity') +
ggplot2::ggtitle('Heath problems caused by each event type') +
ggplot2::labs(x = 'Event type', y = 'Deaths and injuries count') +
ggplot2::theme(legend.position = 'none',
plot.title = element_text(hjust = 0.5)) +
ggplot2::facet_grid(. ~ heath_type, scales = 'free_x')
)
heath_data_set
## event_type heath_type count
## 1 avalanche fatalities 225
## 2 blizzard fatalities 101
## 3 cold fatalities 38
## 4 drought fatalities 0
## 5 dry fatalities 0
## 6 flood fatalities 470
## 7 fog fatalities 62
## 8 frost fatalities 1
## 9 hail fatalities 15
## 10 hurricane fatalities 62
## 11 landslump fatalities 42
## 12 lightning fatalities 816
## 13 rain fatalities 0
## 14 sleet fatalities 2
## 15 snow fatalities 5
## 16 tornado fatalities 5633
## 17 tsunami fatalities 33
## 18 waterspout fatalities 3
## 19 wind fatalities 23
## 20 avalanche injuries 170
## 21 blizzard injuries 805
## 22 cold injuries 48
## 23 drought injuries 4
## 24 dry injuries 0
## 25 flood injuries 6789
## 26 fog injuries 734
## 27 frost injuries 3
## 28 hail injuries 1361
## 29 hurricane injuries 51
## 30 landslump injuries 54
## 31 lightning injuries 5230
## 32 rain injuries 0
## 33 sleet injuries 0
## 34 snow injuries 31
## 35 tornado injuries 91346
## 36 tsunami injuries 129
## 37 waterspout injuries 29
## 38 wind injuries 86
sum_damage_by_event <- aggregate(damage ~ event_type,
data = economic_data,
FUN = sum)
# Calculate as million dollars
sum_damage_by_event$damage <- round(sum_damage_by_event$damage / 1e6, 2)
with(sum_damage_by_event,
ggplot2::ggplot(sum_damage_by_event,
aes(x = reorder(event_type, damage),
y = damage,
fill = event_type)) +
ggplot2::coord_flip() +
ggplot2::geom_bar(stat = 'identity') +
ggplot2::ggtitle('Economic damage caused by each event type') +
ggplot2::labs(x = 'Event type', y = 'Monetary damage (million dollars)') +
ggplot2::theme(legend.position = 'none',
plot.title = element_text(hjust = 0.5))
)
sum_damage_by_event
## event_type damage
## 1 tornado 57352.11
## 2 hail 18758.22
## 3 snow 14.84
## 4 lightning 940.75
## 5 wind 8.99
## 6 flood 150319.68
## 7 cold 0.55
## 8 waterspout 9.35
## 9 blizzard 771.27
## 10 avalanche 3.72
## 11 sleet 0.00
## 12 drought 15018.67
## 13 dry 0.00
## 14 hurricane 15211.30
## 15 frost 66.02
## 16 fog 13.16
## 17 rain 6.05
## 18 landslump 346.46
## 19 tsunami 144.08