Data Processing
Load the necessary packages.
# Echo the source of every code chunk in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
# data.table supplies fread(); dplyr supplies %>% and the data verbs used below.
library(data.table)
library(dplyr)
Read the health data into R.
Here we will also replace some of the variable names with more readable ones, as well as collect the totals for the fatality and injury variables.
# Load the storm data ----
# Switch to the author's project directory only when it exists, so the script
# does not abort in setwd() on any other machine.
current_path <- "/home/rob/Data Science/reproducibleresearch/week4"
if (dir.exists(current_path)) {
  setwd(current_path) # Author use only.
}

# One name for the local archive, shared by the existence check, the download
# target and the read, so the three can never disagree.
storm_file <- "StormData.csv.bz2"

# `data_url` is not assigned anywhere in the visible source; supply a fallback
# only when it is still undefined.
# NOTE(review): confirm this is the intended download location.
if (!exists("data_url")) {
  data_url <- paste0(
    "https://d396qusza40orc.cloudfront.net/",
    "repdata%2Fdata%2FStormData.csv.bz2"
  )
}

if (!file.exists(storm_file)) {
  # mode = "wb" keeps the compressed archive byte-exact on every platform.
  download.file(data_url, storm_file, mode = "wb")
  time_data_downloaded <- Sys.time()
}

# Only the columns needed for the health and damage summaries are read.
relevant_variables <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
readable_names <- c(
  "event_type", "fatalities", "injuries",
  "property_damage", "property_damage_exponent",
  "crop_damage", "crop_damage_exponent"
)

# Decompress with bzcat and strip NUL bytes with tr before parsing. Empty
# fields are read as NA because of na.strings = "".
storm_data <- fread(
  cmd = sprintf("bzcat %s | tr -d '\\000'", shQuote(storm_file)),
  na.strings = "",
  select = relevant_variables
)

# Rename by matching on the old names, so the mapping does not depend on the
# order in which fread() returns the selected columns.
setnames(storm_data, relevant_variables, readable_names)
# Total fatalities and injuries per event type, largest first ----
# na.rm = TRUE: the data are read with na.strings = "", so an empty field
# arrives as NA, and a single NA would otherwise turn an event type's whole
# total into NA.
fatality_data <- storm_data %>%
  group_by(event_type) %>%
  summarise(total = sum(fatalities, na.rm = TRUE)) %>%
  arrange(desc(total))

injury_data <- storm_data %>%
  group_by(event_type) %>%
  summarise(total = sum(injuries, na.rm = TRUE)) %>%
  arrange(desc(total))
Read the property data into R.
Handling the property and crop damage data takes more care, as the raw data use letters to indicate which power of ten the given damage value should be multiplied by.
# Property damage cost per event type, largest first ----
# Each damage figure is stored as a number (`property_damage`) plus a letter
# code (`property_damage_exponent`) naming the power of ten to multiply it by.
property_data <- select(
  storm_data,
  event_type,
  property_damage,
  property_damage_exponent
)

property_data <- within(property_data, {
  # Start from a full-length vector of zeros. A scalar 0 would be stretched by
  # the logical sub-assignments below and padded with NA, so every row whose
  # code is not H/K/M/B (blank/NA, "+", "-", "?", digits) would get an NA cost
  # and its event type an NA total.
  exponent <- rep(0, length(property_damage_exponent))
  exponent[property_damage_exponent %in% c("H", "h")] <- 2
  exponent[property_damage_exponent %in% c("K", "k")] <- 3
  exponent[property_damage_exponent %in% c("M", "m")] <- 6
  exponent[property_damage_exponent %in% c("B", "b")] <- 9
  # Every other code keeps exponent 0, i.e. the figure is taken at face value.
  # NOTE(review): digit codes ("0"-"8") are also taken at face value here;
  # confirm that is the intended reading of the raw data.
})

# NOTE(review): the fixed row cut-off that used to follow the sort is dropped.
# It appears to have been tuned to trim the NA totals produced by the old
# exponent handling; without it the shares computed later use the total over
# all event types. The totals change with this fix, so re-check
# `how_many_events`, `stretch` and the written conclusions for Figure 2.
property_damage <- property_data %>%
  mutate(property_damage_cost = property_damage * 10^exponent) %>%
  group_by(event_type) %>%
  summarise(total_property_cost = sum(property_damage_cost, na.rm = TRUE)) %>%
  arrange(desc(total_property_cost))
# Crop damage cost per event type, largest first ----
# Each damage figure is stored as a number (`crop_damage`) plus a letter code
# (`crop_damage_exponent`) naming the power of ten to multiply it by.
crop_data <- select(
  storm_data,
  event_type,
  crop_damage,
  crop_damage_exponent
)

crop_data <- within(crop_data, {
  # Start from a full-length vector of zeros. A scalar 0 would be stretched by
  # the logical sub-assignments below and padded with NA, so every row whose
  # code is not H/K/M/B (blank/NA, "+", "-", "?", digits) would get an NA cost
  # and its event type an NA total.
  exponent <- rep(0, length(crop_damage_exponent))
  exponent[crop_damage_exponent %in% c("H", "h")] <- 2
  exponent[crop_damage_exponent %in% c("K", "k")] <- 3
  exponent[crop_damage_exponent %in% c("M", "m")] <- 6
  exponent[crop_damage_exponent %in% c("B", "b")] <- 9
  # Every other code keeps exponent 0, i.e. the figure is taken at face value.
  # NOTE(review): digit codes ("0"-"8") are also taken at face value here;
  # confirm that is the intended reading of the raw data.
})

# NOTE(review): the fixed row cut-off that used to follow the sort is dropped.
# It appears to have been tuned to trim the NA totals produced by the old
# exponent handling; without it the shares computed later use the total over
# all event types. The totals change with this fix, so re-check
# `how_many_events`, `stretch` and the written conclusions for Figure 2.
crop_damage <- crop_data %>%
  mutate(crop_damage_cost = crop_damage * 10^exponent) %>%
  group_by(event_type) %>%
  summarise(total_crop_cost = sum(crop_damage_cost, na.rm = TRUE)) %>%
  arrange(desc(total_crop_cost))
Results
Here we produce the Pareto charts to see which events are the causes of the majority of the damages to health and property. The code to produce these figures is lengthy, but we believe that the analysis will show them worthwhile. Note that because there were so many unique event types, full Pareto charts would be impractical, as the vast majority of event types cause comparatively little damage. So only the first few events are shown. However, the events shown will account for at least 75% of damage caused.
Create the health related Pareto charts
# Figure 1: Pareto charts for fatalities and injuries ----

# Draw one Pareto panel on the current graphics device.
#
# chart_data  Data frame sorted largest-first with columns `event_type`,
#             `cumsum` (running total) and `cum_freq` (running share, 0-1).
# totals      Per-event totals, in the same row order as `chart_data`.
# how_many    Number of leading events to draw.
# stretch     Factor applied to every drawn height (bars, line, tick
#             positions); the tick labels still show the unstretched values.
# y_max       Upper limit of the y axis.
# y_label     Label for the left axis.
# title       Panel title.
#
# Returns the bar midpoints from barplot(), invisibly.
draw_pareto_chart <- function(chart_data, totals, how_many, stretch, y_max,
                              y_label, title) {
  shown <- seq_len(min(how_many, nrow(chart_data)))
  cum_shown <- chart_data$cumsum[shown]

  bar_mid <- barplot(
    totals[shown] * stretch,
    width = 1, space = 0.2, border = NA, axes = FALSE,
    ylim = c(0, y_max),
    ylab = y_label, cex.names = 0.7,
    names.arg = chart_data$event_type[shown],
    las = 2,
    main = title
  )
  lines(
    bar_mid, cum_shown * stretch,
    type = "b", cex = 0.7, pch = 19, col = "cyan4"
  )
  box(col = "grey62")

  # Both axes share the stretched tick positions. The left axis is labelled
  # with the real running totals; without explicit labels axis() would print
  # the stretched positions, overstating every value by the stretch factor.
  tick_at <- c(0, cum_shown) * stretch
  axis(
    side = 2,
    at = tick_at,
    labels = format(
      round(c(0, cum_shown)),
      big.mark = ",", scientific = FALSE, trim = TRUE
    ),
    las = 1, col.axis = "grey62", col = "grey62", cex.axis = 0.8
  )
  axis(
    side = 4,
    at = tick_at,
    labels = paste0(c(0, round(chart_data$cum_freq[shown] * 100)), "%"),
    las = 1, col.axis = "cyan4", col = "cyan4", cex.axis = 0.8
  )
  invisible(bar_mid)
}

# Running totals and shares. `cum_freq` is derived from the exact running
# total rather than by summing the rounded `freq` values, so rounding error
# does not accumulate down the table.
fatality_data <- arrange(fatality_data, desc(total)) %>%
  mutate(
    cumsum = cumsum(total),
    freq = round(total / sum(total), 3),
    cum_freq = cumsum / sum(total)
  )

how_many_events <- 8
stretch <- 1.4
# Two panels side by side, with wide margins for the rotated event names.
par(mar = c(7, 5, 4, 5), mgp = c(4, 1, .5), mfrow = c(1, 2))
pc <- draw_pareto_chart(
  fatality_data, fatality_data$total, how_many_events, stretch,
  y_max = 1.1 * max(fatality_data$cumsum, na.rm = TRUE),
  y_label = "Cumulative Counts",
  title = "Fatality Pareto Chart"
)

injury_data <- arrange(injury_data, desc(total)) %>%
  mutate(
    cumsum = cumsum(total),
    freq = round(total / sum(total), 3),
    cum_freq = cumsum / sum(total)
  )

how_many_events <- 6
stretch <- 1.2
pc <- draw_pareto_chart(
  injury_data, injury_data$total, how_many_events, stretch,
  y_max = 1.1 * max(injury_data$cumsum, na.rm = TRUE),
  y_label = "Cumulative Counts",
  title = "Injury Pareto Chart"
)

# Caption placed in the outer margin, shared by both panels.
mtext("Figure 1: Public Health", side = 1, line = -25, outer = TRUE)

So tornadoes were the cause of 37% of the country’s weather-related fatalities, and the top eight worst weather events collectively accounted for 77% of the deaths. For injuries, tornadoes were again the most dangerous events, accounting for 65% of weather-related injuries. Also, the top six events alone accounted for 85% of the injuries. Excessive heat, heat, floods, and lightning were also significant contributors to both lists.
Create the property related Pareto charts
# Figure 2: Pareto charts for property and crop damage ----

# Draw one Pareto panel on the current graphics device.
#
# chart_data  Data frame sorted largest-first with columns `event_type`,
#             `cumsum` (running total) and `cum_freq` (running share, 0-1).
# totals      Per-event totals, in the same row order as `chart_data`.
# how_many    Number of leading events to draw.
# stretch     Factor applied to every drawn height (bars, line, tick
#             positions); the tick labels still show the unstretched values.
# y_max       Upper limit of the y axis.
# y_label     Label for the left axis.
# title       Panel title.
#
# Returns the bar midpoints from barplot(), invisibly.
draw_pareto_chart <- function(chart_data, totals, how_many, stretch, y_max,
                              y_label, title) {
  shown <- seq_len(min(how_many, nrow(chart_data)))
  cum_shown <- chart_data$cumsum[shown]

  bar_mid <- barplot(
    totals[shown] * stretch,
    width = 1, space = 0.2, border = NA, axes = FALSE,
    ylim = c(0, y_max),
    ylab = y_label, cex.names = 0.7,
    names.arg = chart_data$event_type[shown],
    las = 2,
    main = title
  )
  lines(
    bar_mid, cum_shown * stretch,
    type = "b", cex = 0.7, pch = 19, col = "cyan4"
  )
  box(col = "grey62")

  # Both axes share the stretched tick positions. The left axis is labelled
  # with the real running totals; without explicit labels axis() would print
  # the stretched positions, overstating every value by the stretch factor.
  tick_at <- c(0, cum_shown) * stretch
  axis(
    side = 2,
    at = tick_at,
    labels = format(
      round(c(0, cum_shown)),
      big.mark = ",", scientific = FALSE, trim = TRUE
    ),
    las = 1, col.axis = "grey62", col = "grey62", cex.axis = 0.8
  )
  axis(
    side = 4,
    at = tick_at,
    labels = paste0(c(0, round(chart_data$cum_freq[shown] * 100)), "%"),
    las = 1, col.axis = "cyan4", col = "cyan4", cex.axis = 0.8
  )
  invisible(bar_mid)
}

# Running totals and shares. `cum_freq` is derived from the exact running
# total rather than by summing the rounded `freq` values, so rounding error
# does not accumulate down the table.
property_damage <- mutate(
  property_damage,
  cumsum = cumsum(total_property_cost),
  freq = round(total_property_cost / sum(total_property_cost), 3),
  cum_freq = cumsum / sum(total_property_cost)
)

how_many_events <- 4
stretch <- 1.1
# Two panels side by side, with wide margins for the rotated event names.
par(mar = c(11, 7, 4, 5), mgp = c(6, 1, .5), mfrow = c(1, 2))
# The left axis shows cumulative cost, so it is no longer labelled "Counts".
# NOTE(review): add the currency unit to the label once confirmed.
pc <- draw_pareto_chart(
  property_damage, property_damage$total_property_cost,
  how_many_events, stretch,
  y_max = max(property_damage$cumsum, na.rm = TRUE),
  y_label = "Cumulative Cost",
  title = "Property Damage Pareto Chart"
)

crop_damage <- mutate(
  crop_damage,
  cumsum = cumsum(total_crop_cost),
  freq = round(total_crop_cost / sum(total_crop_cost), 3),
  cum_freq = cumsum / sum(total_crop_cost)
)

how_many_events <- 5
stretch <- 1.11
pc <- draw_pareto_chart(
  crop_damage, crop_damage$total_crop_cost,
  how_many_events, stretch,
  y_max = max(crop_damage$cumsum, na.rm = TRUE),
  y_label = "Cumulative Cost",
  title = "Crop Damage Pareto Chart"
)

# Caption placed in the outer margin, shared by both panels.
mtext("Figure 2: Economic Consequence", side = 1, line = -25, outer = TRUE)

Once more tornadoes are a leading cause of damage, this time to property. More than half of property damage came from tornadoes, and more than 80% of property damage was caused by the top four types of weather events alone. For crops, excessive wetness and cold and wet conditions were the greatest factors, with the top five worst events causing 86% of all crop damage.