The goal of the current project is to explore the NOAA Storm Database and answer the following questions:
Analyzing data contains observations from January 03, 1951 to November 30, 2011.
The found answer to first question is that the most harmful event to public health is TORDNADO:
The found answer to second question is that the most harmful event to economy is FLOOD:
We start from downloading data and read it into data frame.
# load libraries
library(ggplot2)
library(gridExtra)
# get the data
fileName <- "repdata-data-StormData.csv.bz2"
if (!file.exists(fileName)){
download.file(
"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
destfile=fileName)
}
# load the data
data <- read.csv(bzfile(fileName), header=TRUE, sep=",", stringsAsFactors = FALSE)
dim(data)
## [1] 902297 37
dates <- as.Date(data$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
print("Dates of observations:"); print(c("Date from"=min(dates), "Date to"=max(dates)))
## [1] "Dates of observations:"
## Date from Date to
## "1950-01-03" "2011-11-30"
Our data have 902297 rows (observations) and 37 columns (variables).
Due to the Storm Data Documentation for our project we select the next variables:
PROPDMGEXP and CROPDMGEXP contain alphabetical characters explained in documentations as:
“Alphabetical characters used to signify magnitude include “K” for thousands, “M” for millions, and “B” for billions.” Page 12
Property Damage (PROPDMG) and Crop Damage (CROPDMG) will be used to measure economical damage.
Number of Fatalities (FATALITIES) and number of Injuries (INJURIES) will be used to measure health damage.
Data transformation:
# remove unnecessary variables
data.tidy <- data[, c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
"PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
# set type to upper case
# to union names like "Wintry mix" and "Wintry Mix"
data.tidy$EVTYPE = toupper(data.tidy$EVTYPE)
# remove old variable from memory
rm(data)
# function to convert damage to dollars
calculate_amount <- function (sum, exp){
exp <- toupper(exp)
if (exp == "H") ## hunderds
sum <- sum * 10^2
else if (exp == "K") # thousands
sum <- sum * 10^3
else if (exp == "M") # millions
sum <- sum * 10^6
else if (exp == "B") # billions
sum <- sum * 10^9
else if (exp %in% c("+","-","?"))
sum <- sum
else if (!is.na(as.numeric(exp))) # defined as number
sum <- sum * 10^as.numeric(exp)
sum
}
# Convert Crop and Property Damage to dollars
data.tidy$PROPDMG <- mapply(calculate_amount, data.tidy$PROPDMG, data.tidy$PROPDMGEXP)
data.tidy$CROPDMG <- mapply(calculate_amount, data.tidy$CROPDMG, data.tidy$CROPDMGEXP)
# select year from date
data.tidy$YEAR <- as.numeric(format(
as.Date(data.tidy$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"), "%Y"))
# remove unused columns
data.tidy$PROPDMGEXP <- NULL
data.tidy$CROPDMGEXP <- NULL
data.tidy$BGN_DATE <- NULL
# rename columns
colnames(data.tidy) <- c("Event", "Fatalities", "Injuries", "PropertyDamage", "CropDamage", "Year")
# now data is tidy
print("Preview piece of data");print(head(data.tidy), row.names = F)
## [1] "Preview piece of data"
## Event Fatalities Injuries PropertyDamage CropDamage Year
## TORNADO 0 15 25000 0 1950
## TORNADO 0 0 2500 0 1950
## TORNADO 0 2 25000 0 1951
## TORNADO 0 2 2500 0 1951
## TORNADO 0 2 2500 0 1951
## TORNADO 0 6 2500 0 1951
Find top Event’s Types by Fatalities and by Injuries.
harmfulEvents <- aggregate(cbind(Fatalities, Injuries) ~ Event, data.tidy, FUN=sum)
topFatalities <- harmfulEvents[head(order(harmfulEvents$Fatalities, decreasing = TRUE), 10), 1:2]
topInjuries <- harmfulEvents[head(order(harmfulEvents$Injuries, decreasing = TRUE), 10), c(1, 3)]
# Show top 10 by Fatalities
print("Top 10 by Fatalities");print(topFatalities, row.names = F)
## [1] "Top 10 by Fatalities"
## Event Fatalities
## TORNADO 5633
## EXCESSIVE HEAT 1903
## FLASH FLOOD 978
## HEAT 937
## LIGHTNING 816
## TSTM WIND 504
## FLOOD 470
## RIP CURRENT 368
## HIGH WIND 248
## AVALANCHE 224
# Show top 10 by Injuries
print("Top 10 by Injuries");print(topInjuries, row.names = F)
## [1] "Top 10 by Injuries"
## Event Injuries
## TORNADO 91346
## TSTM WIND 6957
## FLOOD 6789
## EXCESSIVE HEAT 6525
## LIGHTNING 5230
## HEAT 2100
## ICE STORM 1975
## FLASH FLOOD 1777
## THUNDERSTORM WIND 1488
## HAIL 1361
Now lets’s show top events in plots:
plot1 <-
ggplot(topFatalities,
aes(x=reorder(Event, Fatalities, function(x) x*-1), y=Fatalities)) +
geom_bar(stat = "identity", position = "stack", aes(fill = Fatalities)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title="Top 10 Events by Fatalities in 1950-2011") +
labs(x="", y="Total Fatalities") +
scale_fill_continuous(low="darkorange", high="darkred",
limits=c(0, topFatalities$Fatalities[1]))
plot2 <-
ggplot(topInjuries,
aes(x=reorder(Event, Injuries, function(x) x*-1), y=Injuries)) +
geom_bar(stat = "identity", position = "stack", aes(fill = Injuries)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title="Top 10 Events by Injuries in 1950-2011") +
labs(x="", y="Total Injuries") +
scale_fill_continuous(low="darkorange", high="darkred",
limits=c(0, topInjuries$Injuries[1]))
grid.arrange(plot1, plot2, nrow = 2)
RESULTS:
Add Property Damage to Crop Damage and find top 10 events with the greates economic consequences.
damage <- aggregate(PropertyDamage + CropDamage ~ Event, data.tidy, FUN=sum)
colnames(damage)[2] <- "TotalDamage"
topDamage <- damage[head(order(damage$TotalDamage, decreasing = TRUE), 10), ]
Top 10 events with economic damage in millions of dollars.
topDamage$TotalDamage <- topDamage$TotalDamage/1e6
# Show top 10 events
print("Top 10 by financial damage");print(topDamage, row.names = F)
## [1] "Top 10 by financial damage"
## Event TotalDamage
## FLOOD 150319.678
## HURRICANE/TYPHOON 71913.713
## TORNADO 57362.334
## STORM SURGE 43323.541
## HAIL 18761.222
## FLASH FLOOD 18243.991
## DROUGHT 15018.672
## HURRICANE 14610.229
## RIVER FLOOD 10148.405
## ICE STORM 8967.041
Now let’s show the plot.
update_geom_defaults("bar", list(fill = "red"))
plot1 <- ggplot(topDamage,
aes(x=reorder(Event, TotalDamage, function(x) x*-1),
y=TotalDamage))
plot1 <- plot1 + geom_bar(stat = "identity", position = "stack", aes(fill=TotalDamage))
plot1 <- plot1 + theme(axis.text.x = element_text(angle = 45, hjust = 1))
plot1 <- plot1 + labs(title="Top 10 Events by Economic Damage in 1950-2011")
plot1 <- plot1 + labs(x="", y="Total Damage in million dollars")
plot1 <- plot1 + scale_fill_continuous(low="darkorange", high="darkred",
limits=c(0, topDamage$TotalDamage[1]), name="Total Damage")
plot1
RESULTS: