In this report we aim to investigate which types of storm events are the most harmful with respect to population health. We are also interested in which types of storm events have the greatest economic consequences.
For this analysis, we will utilize the dataset from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. However, the dataset contains records with invalid fields, so some cleaning has to be done before we can use it for the analysis.
Global Settings
# NOTE(review): this statement only binds a variable named `echo`. If the
# intent is to make knitr display every code chunk, that presumably needs
# knitr::opts_chunk$set(echo = TRUE) -- confirm against the original .Rmd.
echo = TRUE # Always make code visible
# scipen is a penalty applied when R chooses between fixed and scientific
# notation for printing; a positive value biases output towards fixed notation.
options(scipen = 1) # Turn off scientific notations for numbers
library(ggplot2)
library(RColorBrewer)
library(stringr)
library(scales)
suppressMessages(library(dplyr))
suppressMessages(library(R.utils))
Download and extract the storm dataset file. The code has been commented out because it only needs to be run once to download and uncompress the dataset. It is included so that others can reproduce the same result.
# download.file("http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
# destfile="storm.csv.bz2")
# bunzip2("storm.csv.bz2", overwrite=T, remove=F)
Load the storm dataset file into memory
# Read the uncompressed CSV into a data frame. "storm.csv" is resolved against
# the working directory; presumably it is the file the commented-out bunzip2
# step produces -- verify the file exists before running.
storm_data <- read.csv("storm.csv", sep = ",")
From the information available in the code table, we know that the fields INJURIES, FATALITIES and EVTYPE are required for the analysis.
dim(storm_data)
## [1] 902297 37
# Keep every record that reports at least one injury OR at least one fatality.
# Requiring both at once (`&`) would silently drop events that only injured or
# only killed people, understating the totals of both measures.
storm <- filter(storm_data, INJURIES > 0 | FATALITIES > 0)
# Convert EVTYPE to upper case as some records mix lower and upper case.
storm$EVTYPE <- toupper(storm$EVTYPE)
# Keep only the required columns and tag each row with the measure it carries,
# so both measures share the layout (EVTYPE, total, type) and can be stacked.
injuries <- select(storm, EVTYPE, total = INJURIES)
injuries <- mutate(injuries, type = "Injuries")
fatalities <- select(storm, EVTYPE, total = FATALITIES)
fatalities <- mutate(fatalities, type = "Fatalities")
# Top 5 event types by total injuries. top_n() is given no weighting column,
# so it ranks by the last column (total), as the echoed message confirms.
injuries_grp <- summarise(group_by(injuries, type, EVTYPE), total = sum(total))
injuries_grp <- arrange(injuries_grp, desc(total))
injuries_grp <- top_n(injuries_grp, 5)
## Selecting by total
# Top 5 event types by total fatalities (same procedure as for injuries).
fatalities_grp <- summarise(group_by(fatalities, type, EVTYPE), total = sum(total))
fatalities_grp <- arrange(fatalities_grp, desc(total))
fatalities_grp <- top_n(fatalities_grp, 5)
## Selecting by total
# Combine the top 5 injuries and top 5 fatalities into one table for plotting.
health <- rbind(injuries_grp, fatalities_grp)
# Helper function to calculate the cash value of damage by applying the
# multiplier code to the recorded amount.
# Property damage in cash value = "PROPDMG" multiplied by "PROPDMGEXP"
# Crop damage in cash value = "CROPDMG" multiplied by "CROPDMGEXP"
#
# Arguments:
#   value      numeric vector of recorded damage amounts.
#   multiplier character (or factor) vector of exponent codes, matched
#              case-insensitively: H = hundreds, K = thousands,
#              M = millions, B = billions.
# Returns:
#   numeric vector of value scaled by its multiplier, element by element.
#   Any other code (blank, digit, symbol, NA) yields 0.
#
# The lookup is vectorized on purpose: this function is called on whole
# columns inside mutate(), so each row must be scaled by its own code rather
# than by the code of the first row.
cashValue <- function(value, multiplier) {
  factors <- c(H = 100, K = 1000, M = 1000000, B = 1000000000)
  # Indexing the named vector by code returns NA for anything unrecognised
  scale <- unname(factors[toupper(as.character(multiplier))])
  scale[is.na(scale)] <- 0
  value * scale
}
# Property damage: keep records with a positive damage amount, normalise the
# event type to upper case, strip whitespace from the multiplier code, and
# derive the cash value of the damage as a new column.
property <- storm_data %>%
  filter(PROPDMG > 0) %>%
  mutate(EVTYPE = toupper(EVTYPE), PROPDMGEXP = str_trim(PROPDMGEXP)) %>%
  select(EVTYPE, PROPDMG, PROPDMGEXP) %>%
  mutate(type = "Property", propdmgcash = cashValue(PROPDMG, PROPDMGEXP))
# Top 5 event types by total property damage in monetary value
property_grp <- property %>%
  group_by(type, EVTYPE) %>%
  summarise(total = sum(propdmgcash)) %>%
  arrange(desc(total)) %>%
  top_n(5)
## Selecting by total
# Crop damage: keep records with a positive damage amount, normalise the
# event type to upper case, strip whitespace from the multiplier code, and
# derive the cash value of the damage as a new column.
crop <- storm_data %>%
  filter(CROPDMG > 0) %>%
  mutate(EVTYPE = toupper(EVTYPE), CROPDMGEXP = str_trim(CROPDMGEXP)) %>%
  select(EVTYPE, CROPDMG, CROPDMGEXP) %>%
  mutate(type = "Crop", cropdmgcash = cashValue(CROPDMG, CROPDMGEXP))
# Top 5 event types by total crop damage in monetary value
crop_grp <- crop %>%
  group_by(type, EVTYPE) %>%
  summarise(total = sum(cropdmgcash)) %>%
  arrange(desc(total)) %>%
  top_n(5)
## Selecting by total
# Stack the top 5 property and top 5 crop damage rows into one table
damages <- rbind(property_grp, crop_grp)
# Bar chart of the five worst event types for each health measure.
# One facet per measure, each with its own axis ranges (scales = "free").
ggplot(health, aes(EVTYPE, total, fill = EVTYPE)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ type, scales = "free") +
  scale_y_continuous(name = "Number of Injuries/Fatalities") +
  scale_fill_brewer(palette = "Dark2") +
  labs(title = "Top 5 Injuries and Fatalities by Severe Weather\n Events in the U.S from 1950 - 2011") +
  # Rotate the event-type labels so long names do not overlap
  theme(axis.text.x = element_text(angle = 90))
TORNADO is the most harmful with respect to population health
# Bar chart of the five costliest event types for property and for crops.
# One facet per damage type, each with its own axis ranges (scales = "free").
# The axis label and title spellings are corrected ("monetary", "damages")
# because they are rendered in the published figure.
ggplot(data = damages, aes(x = EVTYPE, y = total, fill = EVTYPE)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ type, scales = "free") +
  # Rotate the event-type labels so long names do not overlap
  theme(axis.text.x = element_text(angle = 90)) +
  scale_fill_brewer(palette = "Dark2") +
  # `comma` (from scales) prints large amounts with thousands separators
  scale_y_continuous(name = "Property/Crop damages in monetary value", labels = comma) +
  ggtitle("Top 5 Property and Crop damages by Severe Weather\n Events in the U.S from 1950 - 2011")
TORNADO has the greatest economic consequences on property
HAIL has the greatest economic consequences on crop.
End of Report