The aim of this analysis it to assess the major severe weather events that are most harmful against the U.S population, as well as those most impactful from an economic point of view.
The analysis is conduct using the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, and higlights the fatalities, injuries, damages to properties and crops detected during the period 1950-2011.
As a first step we should clean the environment, and set the working directory.
#rm(list=ls())
#setwd(<set working path here>)
For this analysis, we used the following libraries that should be installed if not availables.
# install.packages("knitr")
# install.packages("stringr")
# install.packages("lubridate")
# install.packages("ggplot2")
library(knitr)
library(stringr)
library(lubridate)
library (ggplot2)
findDecadefindDecade <- function(yearIn, startYear, endYear){
startYear <- as.numeric(substr(startYear,1,3))*10
for (i in seq(startYear, endYear, 10)){
if (yearIn %in% seq(i, i+9, 1))
{decade <- paste(i , "s", sep = "")}
}
return(decade)
}
multiplotThe function is a courtesy of Winston Chang, and available at this link.
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
We can then download the dataset from the following link storm data, unzip and load it into the rawData dataframe in R. To speed up the load we set the class of every column.
url <- "./repdata-data-StormData.csv"
headersData <- read.csv(url, nrows = 1)
classesData <- sapply(headersData, class)
classesData[1:37] <- "character"
classesData[1] <- "numeric"
classesData[2] <- "character"
classesData[5] <- "numeric"
classesData[23:24] <- "numeric"
classesData[19:20] <- "numeric"
classesData[32:35] <- "numeric"
classesData[37] <- "numeric"
rawData <- read.csv(url, colClasses = classesData)
We then partitioned the rawData in four subsets to analyze the:
dataFataldataInjurdataPropdataCropdataFatal <- rawData[(as.numeric(rawData$FATALITIES) != 0), c(2,8,23)]
dataInjur <- rawData[(as.numeric(rawData$INJURIES) != 0), c(2,8,24)]
dataProp <- rawData[(as.numeric(rawData$PROPDMG) != 0)
& (rawData$PROPDMGEXP %in% c("K","M","B")), c(2,8,25,26)]
dataCrop <- rawData[(as.numeric(rawData$CROPDMG) != 0)
& (rawData$CROPDMGEXP %in% c("K","M","B")), c(2,8,27,28)]
For each of the working subsets we add new columns to perform the analysis.
DATE_NORMdataFatalNorm <- cbind(
dataFatal,
mdy(substr(dataFatal[,1], 1, nchar(dataFatal[,1])-8))
)
dataInjurNorm <- cbind(
dataInjur,
mdy(substr(dataInjur[,1], 1, nchar(dataInjur[,1])-8))
)
dataPropNorm <- cbind(
dataProp,
mdy(substr(dataProp[,1], 1, nchar(dataProp[,1])-8))
)
dataCropNorm <- cbind(
dataCrop,
mdy(substr(dataCrop[,1], 1, nchar(dataCrop[,1])-8))
)
colnames(dataFatalNorm)[4] <- "DATE_NORM"
colnames(dataInjurNorm)[4] <- "DATE_NORM"
colnames(dataPropNorm)[5] <- "DATE_NORM"
colnames(dataCropNorm)[5] <- "DATE_NORM"
PROPDMG_NORM and CROPDMG_NORMFor the economic consequences we do a further step by normalizing the damages values expressed by the following fields: PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP. The only admissibiles values are K, M and B for the multipliers, respectively for the thousands, millions and billions.
dataPropNorm$PROPDMG_NORM <- ifelse(dataPropNorm$PROPDMGEXP == "K",as.numeric(dataPropNorm$PROPDMG)*1e+3,"")
dataPropNorm$PROPDMG_NORM <- ifelse(dataPropNorm$PROPDMGEXP == "M",as.numeric(dataPropNorm$PROPDMG)*1e+6,dataPropNorm$PROPDMG_NORM)
dataPropNorm$PROPDMG_NORM <- ifelse(dataPropNorm$PROPDMGEXP == "B",as.numeric(dataPropNorm$PROPDMG)*1e+9,dataPropNorm$PROPDMG_NORM)
dataCropNorm$CROPDMG_NORM <- ifelse(dataCropNorm$CROPDMGEXP == "K",as.numeric(dataCropNorm$CROPDMG)*1e+3,"")
dataCropNorm$CROPDMG_NORM <- ifelse(dataCropNorm$CROPDMGEXP == "M",as.numeric(dataCropNorm$CROPDMG)*1e+6,dataCropNorm$CROPDMG_NORM)
dataCropNorm$CROPDMG_NORM <- ifelse(dataCropNorm$CROPDMGEXP == "B",as.numeric(dataCropNorm$CROPDMG)*1e+9,dataCropNorm$CROPDMG_NORM)
DECADEWe then use a function called findDecade that takes 3 arguments in in put and return the decade of a year in an range (the function is reported in appendix).
We first retrieve, for each working subset, the interval of years (starting and ending) of the event.
sYearFat <- year(min(dataFatalNorm$DATE_NORM))
eYearFat <- year(max(dataFatalNorm$DATE_NORM))
sYearInj <- year(min(dataInjurNorm$DATE_NORM))
eYearInj <- year(max(dataInjurNorm$DATE_NORM))
sYearProp <- year(min(dataPropNorm$DATE_NORM))
eYearProp <- year(max(dataPropNorm$DATE_NORM))
sYearCrop <- year(min(dataCropNorm$DATE_NORM))
eYearCrop <- year(max(dataCropNorm$DATE_NORM))
Then we apply the function the decade.
for(i in 1:nrow(dataFatalNorm)) {
dataFatalNorm[i,5] <- findDecade(
year(dataFatalNorm[i,4]),
sYearFat,
eYearFat
)
}
for(i in 1:nrow(dataInjurNorm)) {
dataInjurNorm[i,5] <- findDecade(
year(dataInjurNorm[i,4]),
sYearInj,
eYearInj
)
}
for(i in 1:nrow(dataPropNorm)) {
dataPropNorm[i,7] <- findDecade(
year(dataPropNorm[i,5]),
sYearProp,
eYearProp
)
}
for(i in 1:nrow(dataCropNorm)) {
dataCropNorm[i,7] <- findDecade(
year(dataCropNorm[i,5]),
sYearCrop,
eYearCrop
)
}
And then we rename each of the new columns created
colnames(dataFatalNorm)[5] <- "DECADE"
colnames(dataInjurNorm)[5] <- "DECADE"
colnames(dataPropNorm)[7] <- "DECADE"
colnames(dataCropNorm)[7] <- "DECADE"
GROUPSince in the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records, we divided the analysis in two groups: the first of the pre 1990s decades and the second of the post 1990s ones.
dataFatalNorm$GROUP <- factor(ifelse(dataFatalNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))
dataInjurNorm$GROUP <- factor(ifelse(dataInjurNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))
dataPropNorm$GROUP <- factor(ifelse(dataPropNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))
dataCropNorm$GROUP <- factor(ifelse(dataCropNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))
For each of the working datasets we made an aggregation of data by DECADE, EVTYPE and GROUP, to obtain 4 new aggregated working datasets.
aggFat <- aggregate(FATALITIES~DECADE+EVTYPE+GROUP,
data=dataFatalNorm,
FUN="sum"
)
aggInj <- aggregate(INJURIES~DECADE+EVTYPE+GROUP,
data=dataInjurNorm,
FUN="sum"
)
aggProp <- aggregate(as.numeric(PROPDMG_NORM)~DECADE+EVTYPE+GROUP,
data=dataPropNorm,
FUN="sum"
)
aggCrop <- aggregate(as.numeric(CROPDMG_NORM)~DECADE+EVTYPE+GROUP,
data=dataCropNorm,
FUN="sum"
)
colnames(aggProp)[4] <- "PROPDMG_NORM"
colnames(aggCrop)[4] <- "CROPDMG_NORM"
Since a full representation would not be feasible, for each of the aggregated working subsets we choose only the top values, based on the coverage percentage of at least 70%. Specifically we took a number of aggregated records needed to cover at least 70% of the harmful events or damaged value.
round(sum(aggFat[head(order(-aggFat[,4]),20),4])
/sum(aggFat$FATALITIES)*100,2)
## [1] 72.39
round(sum(aggInj[head(order(-aggInj[,4]),20),4])
/sum(aggInj$INJURIES)*100,2)
## [1] 86.71
round(sum(aggProp[head(order(-aggProp[,4]),20),4])
/sum(aggProp$PROPDMG_NORM)*100,2)
## [1] 86.7
round(sum(aggCrop[head(order(-aggCrop[,4]),20),4])
/sum(aggCrop$CROPDMG_NORM)*100,2)
## [1] 86.78
From those we created the top aggregated datasets:
aggFat20 <- aggFat[head(order(-aggFat[,4]),20),]
aggInj20 <- aggInj[head(order(-aggInj[,4]),20),]
aggProp20 <- aggProp[head(order(-aggProp[,4]),20),]
aggCrop20 <- aggCrop[head(order(-aggCrop[,4]),20),]
For representing the results, we used the ggplot2 package and the function multiplot to use the panels feature with the ggplot graphics.
aggFat20$EVTYPE <- as.factor(aggFat20$EVTYPE)
g1 <- ggplot(aggFat20, aes(x=EVTYPE, y=FATALITIES , fill = as.factor(DECADE)))
g1 <- g1 + geom_bar(stat = "identity") + coord_flip()
g1 <- g1 + facet_grid(. ~ DECADE)
g1 <- g1 + labs(fill = "Decades")
g1 <- g1 + labs( x = "Harmful event type", y = "Number of fatalities")
g1 <- g1 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g1 <- g1 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g1 <- g1 + scale_x_discrete(limits = rev(levels(aggFat20$EVTYPE)))
aggInj20$EVTYPE <- as.factor(aggInj20$EVTYPE)
g2 <- ggplot(aggInj20, aes(x=EVTYPE, y=INJURIES/1000 , fill = as.factor(DECADE)))
g2 <- g2 + geom_bar(stat = "identity") + coord_flip()
g2 <- g2 + facet_grid(. ~ DECADE)
g2 <- g2 + labs(fill = "Decades")
g2 <- g2 + labs( x = "Harmful event type", y = "Thousands of injuries")
g2 <- g2 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g2 <- g2 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g2 <- g2 + scale_x_discrete(limits = rev(levels(aggInj20$EVTYPE)))
multiplot(g1, g2, layout=matrix(c(1,2), nrow=2, byrow=TRUE))
aggProp20$EVTYPE <- as.factor(aggProp20$EVTYPE)
g3 <- ggplot(aggProp20, aes(x=EVTYPE, y=PROPDMG_NORM/1e+9 , fill = as.factor(DECADE)))
g3 <- g3 + geom_bar(stat = "identity") + coord_flip()
g3 <- g3 + facet_grid(. ~ DECADE)
g3 <- g3 + labs(fill = "Decades")
g3 <- g3 + labs( x = "Property damage event type", y = "Billions of $")
g3<- g3 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g3 <- g3 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g3 <- g3 + scale_x_discrete(limits = rev(levels(aggProp20$EVTYPE)))
aggCrop20$EVTYPE <- as.factor(aggCrop20$EVTYPE)
g4 <- ggplot(aggCrop20, aes(x=EVTYPE, y=CROPDMG_NORM/1e+9 , fill = as.factor(DECADE)))
g4 <- g4 + geom_bar(stat = "identity") + coord_flip()
g4 <- g4 + facet_grid(. ~ DECADE)
g4 <- g4 + labs(fill = "Decades")
g4 <- g4 + labs( x = "Crop damage event type", y = "Billions of $")
g4<- g4 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g4 <- g4 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g4 <- g4 + scale_x_discrete(limits = rev(levels(aggCrop20$EVTYPE)))
multiplot(g3, g4, layout=matrix(c(1,2), nrow=2, byrow=TRUE))
As it can be seen the tornadoes are the most harmful with respect to population health, while flood have the greatest economic consequences for the properties and drought for the crops.