The dataset is loaded from the data repository provided by Coursera as input for this study in the course Reproducible Research. The downloaded file is compressed, but read.table() can read compressed files directly, so we read it as-is and then convert the result to a data.table for further processing.
library(R.utils)
## Warning: package 'R.utils' was built under R version 3.1.1
library(data.table)
library(ggplot2)
library(gridExtra)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.1
library(quantmod)
## Warning: package 'quantmod' was built under R version 3.1.1
# Fetch the compressed storm data once; later runs reuse the local copy.
storm_archive <- "idaho_horepdata_data_StormData.csv.bz2"
if (!file.exists(storm_archive)) {
  fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  download.file(url = fileUrl, destfile = storm_archive, method = "curl")
}
# Note kept from the original author: read.csv() on this file loaded only
# around 32k rows of data, hence read.table() with an explicit separator.
#data <- read.csv("idaho_horepdata_data_StormData.csv.bz2")
data <- read.table(storm_archive, sep = ",", header = TRUE)
# Work on a data.table copy and drop the intermediate data.frame.
dataset <- data.table(data)
rm(data)
We then check the dimensions of the data to confirm that all 902297 rows were loaded.
# Report the row and column counts of the loaded table.
c(nrow(dataset), ncol(dataset))
## [1] 902297 37
Because recent years are better documented, we base this study on the years in which the number of recorded events is substantial. A plot of the number of events per year shows, qualitatively, that considerably more events are recorded from 1995 onwards.
# Derive the calendar year of every event from its BGN_DATE stamp.
dataset$year <- as.numeric(
  format(as.Date(dataset$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"), "%Y")
)
# Number of recorded events per year (one row per year, column "count").
stormsByYear <- dataset[, list(count = .N), by = year]
ggplot(stormsByYear, aes(x = year, y = count)) +
  geom_line() +
  theme_bw()
# Keep only the events that began in 1995 or later.
dataset <- dataset[year >= 1995]
dim(dataset)
## [1] 681500 38
The property and crop damage features are converted to numeric form so that they can be analysed statistically. The conversion is based on the descriptions in the Storm Data Documentation by the National Oceanic & Atmospheric Administration - National Weather Service.
# Convert a damage-magnitude code column into a numeric power of ten.
#
# code: vector of magnitude codes (factor or character).
# Returns a numeric vector of the same length: the letters B, M, K and H
# (in any case) become 9, 6, 3 and 2; entries that already parse as numbers
# are kept; everything else (including empty strings) becomes 0.
magnitude_power <- function(code) {
  code <- as.character(code)
  letter_powers <- c(B = "9", M = "6", K = "3", H = "2")
  is_letter <- toupper(code) %in% names(letter_powers)
  code[is_letter] <- unname(letter_powers[toupper(code[is_letter])])
  power <- as.numeric(code)
  power[is.na(power)] <- 0
  power
}

# Property damage: mantissa (PROPDMG) times 10^exponent.
dataset$PROPDMGEXP <- magnitude_power(dataset$PROPDMGEXP)
dataset$PropertyDamage <- dataset$PROPDMG * 10^dataset$PROPDMGEXP
#summary(dataset$PropertyDamage)

# Crop damage: same encoding as property damage.
dataset$CROPDMGEXP <- magnitude_power(dataset$CROPDMGEXP)
dataset$CropDamage <- dataset$CROPDMG * 10^dataset$CROPDMGEXP
#summary(dataset$CropDamage)

# Combined economic indicator.
dataset$TotalDamage <- dataset$PropertyDamage + dataset$CropDamage
#summary(dataset$TotalDamage)
We have considered above that property damage is as critical as crop damage and sum them up into a Total Damage indicator.
To keep the number of event type names manageable, we group similar labels under a single canonical name; for example, “HEAVY RAIN/SEVERE WEATHER”, “EXCESSIVE RAINFALL”, “UNSEASONAL RAIN” and “HEAVY RAINS” are all relabelled as “HEAVY RAIN”.
# Collapse spelling variants of EVTYPE into one canonical label per group.
#
# Each list name is the canonical label; its value holds the raw labels that
# are relabelled to it. Matching is case-insensitive: both the column and the
# aliases are upper-cased before comparison. (Previously the mixed-case
# aliases "Early Frost ", "Unseasonable Cold" and "Erosion/Cstl Flood" were
# compared against upper-cased data and therefore never matched.)
evtype_aliases <- list(
  "HURRICANE" = c(
    "HURRICANE/TYPHOON", "HURRICANE OPAL", "HURRICANE OPAL/HIGH WINDS",
    "HURRICANE EMILY", "TYPHOON", "HURRICANE ERIN"
  ),
  "THUNDERSTORM WIND" = c(
    "TSTM WIND", " TSTM WIND", "SEVERE THUNDERSTORM WINDS",
    "THUNDERSTORM WINDS"
  ),
  "HEAVY RAIN" = c(
    "HEAVY RAIN/SEVERE WEATHER", "EXCESSIVE RAINFALL", "UNSEASONAL RAIN",
    "HEAVY RAINS"
  ),
  "STORM SURGE" = c("STORM SURGE/TIDE"),
  "WILDFIRE" = c("WILD/FOREST FIRE", "WILDFIRES", "WILD FIRES"),
  "HEAT" = c(
    "EXCESSIVE HEAT", "HEAT WAVE", "EXTREME HEAT", "UNSEASONABLY WARM",
    "RECORD/EXCESSIVE HEAT", "RECORD HEAT"
  ),
  "COLD" = c(
    "EXTREME COLD", "FROST/FREEZE", "FROST", "EARLY FROST ",
    "DAMAGING FREEZE", "RECORD COLD", "COLD/WIND CHILL",
    "EXTREME COLD/WIND CHILL", "UNSEASONABLY COLD", "UNSEASONABLE COLD",
    "HARD FREEZE", "FREEZE"
  ),
  "WIND" = c(
    "HIGH WINDS", "HIGH WIND", "BLOWING WIND", "STRONG WINDS", "STRONG WIND"
  ),
  "FLASH FLOOD" = c("FLASH FLOODING", "FLASH FLOOD/FLOOD", "FLOOD/FLASH FLOOD"),
  "HAIL" = c("SMALL HAIL"),
  "RIVER FLOOD" = c("RIVER FLOODING"),
  "FLOOD" = c("FLOODING", "MAJOR FLOOD"),
  "COASTAL FLOOD" = c(
    "COASTAL FLOODING", "COASTAL FLOODING/EROSION", "EROSION/CSTL FLOOD",
    "COASTAL FLOOD"
  )
)

# Work on a plain character vector: this avoids one whole-table assignment
# per group and avoids factor-level problems if EVTYPE was read as a factor.
evtype_clean <- as.character(dataset$EVTYPE)
# No canonical label is an alias of another group, so the upper-cased key
# can be computed once from the raw labels.
evtype_upper <- toupper(evtype_clean)
for (canonical in names(evtype_aliases)) {
  is_alias <- evtype_upper %in% toupper(evtype_aliases[[canonical]])
  evtype_clean[is_alias] <- canonical
}
dataset$EVTYPE <- evtype_clean
rm(evtype_aliases, evtype_clean, evtype_upper, is_alias, canonical)
We sum all injuries and fatalities into a Total People feature.
# People affected per event: injuries plus fatalities.
dataset$TotalPeople <- dataset$INJURIES + dataset$FATALITIES

# Further EVTYPE groupings: each name is the canonical label, each value the
# upper-case raw labels that are relabelled to it (applied in this order).
more_evtype_aliases <- list(
  "TROPICAL STORM" = c("TROPICAL STORM GORDON", "TROPICAL STORM JERRY"),
  "FOG" = c("DENSE FOG"),
  "RIP CURRENT" = c("RIP CURRENTS"),
  "HIGH SURF" = c("HEAVY SURF", "HEAVY SURF/HIGH SURF"),
  "WATERSPOUT" = c("WATERSPOUT/TORNADO"),
  "WINTER WEATHER" = c(
    "WINTRY MIX", "WINTER WEATHER MIX", "WINTER WEATHER/MIX"
  ),
  "WINTER STORM" = c("WINTER STORMS"),
  "MARINE THUNDERSTORM WIND" = c("MARINE TSTM WIND")
)
for (canonical in names(more_evtype_aliases)) {
  dataset$EVTYPE[
    toupper(dataset$EVTYPE) %in% more_evtype_aliases[[canonical]]
  ] <- canonical
}
rm(more_evtype_aliases, canonical)
We want to address the question:
Based on our previous definition of Total People Feature which summarizes all injuries and fatalities, we investigate which are the 20 Event Types with highest impact to this feature, which we associate to Human Damage (Affecting Population’s Health) concept:
# Total injuries + fatalities per event type; keep the 20 largest totals.
# sort() is ascending, so tail() returns the top 20 with the largest last,
# and the resulting factor levels keep the bars in that order.
harmful <- as.data.frame.table(
  tail(sort(tapply(dataset$TotalPeople, dataset$EVTYPE, sum)), n = 20)
)
colnames(harmful) <- c("EventType", "TotalPeople")

# The title is set on the plot itself rather than through
# grid.arrange(main = ...): `main` is not an argument of grid.arrange() in
# gridExtra >= 2.0 (it was renamed `top`), so the old call fails there.
# The x-axis label is now set once instead of being set and then overridden.
p1 <- ggplot(data = harmful, aes(x = EventType, y = TotalPeople)) +
  theme_bw() +
  theme(
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 60, hjust = 1)
  ) +
  geom_bar(stat = "identity") +
  labs(
    title = "Harmfulness vs Event Type",
    x = "Weather Disaster Event Type",
    y = "Quantity of People Killed and Injured"
  )
print(p1)
From these results we can see that the 5 Event Types with highest impact in fatalities and injuries, or Human Damage as we called, are in order of impact:
From the list above, the conclusion that answers our first question is: Tornado and Heat are the event types with the greatest effect on the population’s health.
We want to address the question:
As mentioned before, we have considered above that property damage is as critical as crop damage and sum them up into a Total Damage indicator. From this perspective we investigate which are the 20 Event Types with highest impact to this indicator or feature.
# Total property + crop damage per event type; keep the 20 largest totals.
# sort() is ascending, so tail() returns the top 20 with the largest last,
# and the resulting factor levels keep the bars in that order.
destruction <- as.data.frame.table(
  tail(sort(tapply(dataset$TotalDamage, dataset$EVTYPE, sum)), n = 20)
)
colnames(destruction) <- c("EventType", "TotalDamage")

# The title is set on the plot itself rather than through
# grid.arrange(main = ...): `main` is not an argument of grid.arrange() in
# gridExtra >= 2.0 (it was renamed `top`), so the old call fails there.
# The x-axis label is now set once instead of being set and then overridden.
p1 <- ggplot(data = destruction, aes(x = EventType, y = TotalDamage)) +
  theme_bw() +
  theme(
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 60, hjust = 1)
  ) +
  geom_bar(stat = "identity") +
  labs(
    title = "Destruction Level vs Event Type",
    x = "Weather Disaster Event Type",
    y = "Destruction on Property and Crops (USD)"
  )
print(p1)
From these results we can see that the 5 Event Types with highest impact in property and crop (Highest Economic Consequences), are in order of impact:
From the list above, the conclusion that answers our second question is: Flood and Hurricane are the event types with the greatest effect on property and crops, i.e. the greatest economic consequences.
From the two previous sections, observing the individual graphics, we can conclude that from an Overall Perspective, considering People’s Health and Economic Consequences, the most destructive Weather Disaster Event Types are:
These two Weather Disaster Event Types are both in the Top 5 lists of People’s Health and Economic Consequences.
From this study it is clear that public and private measures must be taken to protect people’s health and to limit the economic consequences of Tornados and Floods. For these types of events, society and public institutions must be prepared, with proactive measures to avoid damage and with reactive platforms to respond to, and recover from, their destructive effects.