This is an analysis of the NOAA storm data, which covers storm events that occurred from 1950 through November 2011. Harm to human health from each type of storm event is measured by the number of deaths and injuries resulting from the event. The economic consequences of storm events are measured through property damage and crop damage. Both the human health and the economic impacts are assessed, and the storm event types with the greatest impact are ranked by relative damage.
From the NOAA website shown below, we download the data. A description of the data can be found in the Storm Data Documentation.
## Download the data (skipped when the archive is already present locally)
if (!file.exists("StormData.csv.bz2")) {
  fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  ## mode = "wb" keeps the compressed archive byte-exact on every platform.
  ## R's default download method is used so that no external curl binary has
  ## to be installed for the script to run.
  download.file(fileUrl, destfile = "StormData.csv.bz2", mode = "wb")
  ## Record the provenance of the download (time, URL, file info, checksum).
  ## capture.output() writes the printed output to the file and restores the
  ## normal output stream even if one of the calls fails, which a bare
  ## sink()/sink() pair does not guarantee.
  capture.output(
    print("Download date:"),
    print(Sys.time()),
    print("Download URL:"),
    print(fileUrl),
    print("Downloaded file Information"),
    print(file.info("StormData.csv.bz2")),
    print("Downloaded file md5 Checksum"),
    ## tools::md5sum is called by namespace so that the package is not
    ## attached only on the runs where a download actually happens.
    print(tools::md5sum("StormData.csv.bz2")),
    file = "download_metadata3.txt"
  )
}
## Read the data directly from the compressed archive
StormData <- read.csv("StormData.csv.bz2")
## Keep only columns 2-8 and 23-28 and give them simpler names.
## NOTE(review): "begin.data" is probably meant to be "begin.date"; the name
## is kept unchanged in case other code refers to it.
StormData <- StormData[c(2:8, 23:28)]
names(StormData) <- c(
  "begin.data", "begin.time", "time.zone", "county", "county.name",
  "state", "event.type", "fatalities", "injuries", "prop.damage",
  "pd.exp", "crop.damage", "cd.exp"
)
## Narrow down the data to records with some impact > 0.
## Each measure is compared with 0 explicitly: in `a | b | c | d > 0` only the
## last term is compared, and the others are merely coerced to logical.
StormData <- subset(
  StormData,
  fatalities > 0 | injuries > 0 | crop.damage > 0 | prop.damage > 0
)
Assess how much missing data there is:
## Share of all cells in StormData that are NA, expressed as a percentage
PCTNA <- 100 * mean(is.na(StormData))
The percentage of the data set that is “NA” is 0% so no adjustments need to be made to the data.
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
## Group Data by Event Type
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Work on a data frame copy of the storm data, grouped by event type
StormData.df <- data.frame(StormData)
StormEvent <- StormData.df %>% group_by(event.type)
## Total fatalities per event type, and the event type with the largest total
SumsFatal <- StormEvent %>% summarise(fatalities = sum(fatalities))
MaxFatal <- which.max(SumsFatal$fatalities)
MaxEventFatal <- SumsFatal[MaxFatal, 1]
## Total injuries per event type, and the event type with the largest total
SumsInjury <- StormEvent %>% summarise(injuries = sum(injuries))
MaxInjury <- which.max(SumsInjury$injuries)
MaxEventInjury <- SumsInjury[MaxInjury, 1]
The storm event with the most fatalities across the US is TORNADO, and the storm event with the most injuries is also TORNADO.
The following plot displays the relative harmfulness to humans of the various Storm Event types.
## Combined health indicator per record: fatalities plus injuries
StormData.df$Death.and.Injury <- with(StormData.df, fatalities + injuries)
## Total the indicator per event type over the whole data set and keep the
## ten event types with the largest totals.
StormEvent <- StormData.df %>% group_by(event.type)
SumsImpact <- StormEvent %>% summarise(Death.and.Injury = sum(Death.and.Injury))
sorteddata <- SumsImpact[order(-SumsImpact$Death.and.Injury), ]
TopTen <- head(sorteddata, 10)
## Fix the factor levels in ranking order so the bars are drawn from the
## largest total to the smallest instead of alphabetically.
TopTen$event.type <- factor(TopTen$event.type, levels = TopTen$event.type)
## Bar chart of the ten totals
library(ggplot2)
g <- ggplot(TopTen, aes(x = factor(event.type), y = Death.and.Injury))
g +
  geom_bar(stat = "identity") +
  labs(
    x = "Storm Event",
    y = "Death + Injury",
    title = "Impact to Public Health"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
Next, we perform a similar analysis for the storm events that cause the highest combined property damage and crop damage.
## Apply the multiplier provided in the data set to calculate property and
## crop damage in US$. The "pd.exp" and "cd.exp" columns contain a symbol that
## represents the multiplier.
## List the multiplier symbols that occur for property damage.
unique(StormData$pd.exp)
## [1] K M B m + 0 5 6 4 h 2 7 3 H -
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M

## Translate a vector of exponent symbols into numeric multipliers using a
## named lookup table. Symbols that are not in the table, including the blank
## symbol "", get a multiplier of 0, so those records contribute no damage
## instead of NA. The earlier sapply(..., switch, ...) form returned NULL for
## such symbols, which became NA and turned the summed cost of every affected
## event type into NA.
exp.to.multiplier <- function(symbols, lookup) {
  position <- match(as.character(symbols), names(lookup))
  multiplier <- unname(lookup[position])
  multiplier[is.na(position)] <- 0
  multiplier
}

## Multipliers for the property damage symbols
pd.lookup <- c(
  "0" = 10, "1" = 10, "2" = 10, "3" = 10, "4" = 10,
  "5" = 10, "6" = 10, "7" = 10, "8" = 10,
  "H" = 100, "h" = 100,
  "K" = 1000,
  "M" = 1000000, "m" = 1000000,
  "B" = 1000000000,
  "+" = 1, "-" = 0, "?" = 0
)
## Multipliers for the crop damage symbols
cd.lookup <- c(
  "0" = 10,
  "K" = 1000, "k" = 1000,
  "M" = 1000000, "m" = 1000000,
  "B" = 1000000000,
  "?" = 0, "2" = 0
)

StormData$pd.exp <- exp.to.multiplier(StormData$pd.exp, pd.lookup)
StormData$cd.exp <- exp.to.multiplier(StormData$cd.exp, cd.lookup)

## Make sure the damage amounts are numeric; going through character keeps the
## printed values (not the level codes) if a column was read as a factor.
StormData$prop.damage <- as.numeric(as.character(StormData$prop.damage))
StormData$crop.damage <- as.numeric(as.character(StormData$crop.damage))

## Damage in US$ = multiplier * reported amount
StormData$PropCost <- StormData$pd.exp * StormData$prop.damage
StormData$CropCost <- StormData$cd.exp * StormData$crop.damage
## Overall damage in US$ per record: property cost plus crop cost
StormData$Cost <- StormData$PropCost + StormData$CropCost
## Total the cost per event type and find the event type with the largest total
StormData.df <- data.frame(StormData)
StormEvent <- StormData.df %>% group_by(event.type)
SumsCost <- StormEvent %>% summarise(Cost = sum(Cost))
MaxCost <- which.max(SumsCost$Cost)
MaxEventCost <- SumsCost[MaxCost, 1]
## Keep the ten costliest event types, with factor levels fixed in ranking
## order so the bars are drawn from the largest total to the smallest.
sorteddata <- SumsCost[order(-SumsCost$Cost), ]
Ranking <- head(sorteddata, 10)
Ranking$event.type <- factor(Ranking$event.type, levels = Ranking$event.type)
## Bar chart of the ten totals; scipen = 999 and comma labels keep the large
## dollar amounts out of scientific notation.
library(ggplot2)
library(scales)
options(scipen = 999)
g <- ggplot(Ranking, aes(x = factor(event.type), y = Cost))
g +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Storm Events",
    y = "Property plus Crop Damage ($)",
    title = "Storm Events with Highest Costs"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))