Synopsis

The aim of this analysis it to assess the major severe weather events that are most harmful against the U.S population, as well as those most impactful from an economic point of view.

The analysis is conduct using the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, and higlights the fatalities, injuries, damages to properties and crops detected during the period 1950-2011.

Data Processing

Preliminary steps

As a first step we should clean the environment, and set the working directory.

#rm(list=ls())
#setwd(<set working path here>)

Install/load libraries

For this analysis, we used the following libraries that should be installed if not availables.

# install.packages("knitr")
# install.packages("stringr")
# install.packages("lubridate")
# install.packages("ggplot2")
library(knitr)
library(stringr)
library(lubridate)
library (ggplot2)

Support functions

Function findDecade

findDecade <- function(yearIn, startYear, endYear){
        
        startYear <- as.numeric(substr(startYear,1,3))*10
        
        for (i in seq(startYear, endYear, 10)){
                if (yearIn %in% seq(i, i+9, 1))
                {decade <- paste(i , "s", sep = "")}                             
        }
        
        return(decade)
}

Function multiplot

The function is a courtesy of Winston Chang, and available at this link.

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
        library(grid)
        
        # Make a list from the ... arguments and plotlist
        plots <- c(list(...), plotlist)
        
        numPlots = length(plots)
        
        # If layout is NULL, then use 'cols' to determine layout
        if (is.null(layout)) {
                # Make the panel
                # ncol: Number of columns of plots
                # nrow: Number of rows needed, calculated from # of cols
                layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                                 ncol = cols, nrow = ceiling(numPlots/cols))
        }
        
        if (numPlots==1) {
                print(plots[[1]])
                
        } else {
                # Set up the page
                grid.newpage()
                pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
                
                # Make each plot, in the correct location
                for (i in 1:numPlots) {
                        # Get the i,j matrix positions of the regions that contain this subplot
                        matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
                        
                        print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                                        layout.pos.col = matchidx$col))
                }
        }
}

Clean the dataset

Load of the dataset

We can then download the dataset from the following link storm data, unzip and load it into the rawData dataframe in R. To speed up the load we set the class of every column.

url <- "./repdata-data-StormData.csv"

headersData <- read.csv(url, nrows = 1)
classesData <- sapply(headersData, class)
classesData[1:37] <- "character"
classesData[1] <- "numeric"
classesData[2] <- "character"
classesData[5] <- "numeric"
classesData[23:24] <- "numeric"
classesData[19:20] <- "numeric"
classesData[32:35] <- "numeric"
classesData[37] <- "numeric"

rawData <- read.csv(url, colClasses = classesData)

Subset the main dataset into the working subsets

We then partitioned the rawData in four subsets to analyze the:

  1. Fatalities: dataFatal
  2. Injuries: dataInjur
  3. Property damages: dataProp
  4. Crop damages: dataCrop
dataFatal <- rawData[(as.numeric(rawData$FATALITIES) != 0), c(2,8,23)]
dataInjur <- rawData[(as.numeric(rawData$INJURIES) != 0), c(2,8,24)]
dataProp <- rawData[(as.numeric(rawData$PROPDMG) != 0)
                & (rawData$PROPDMGEXP %in% c("K","M","B")), c(2,8,25,26)]
dataCrop <- rawData[(as.numeric(rawData$CROPDMG) != 0)
                & (rawData$CROPDMGEXP %in% c("K","M","B")), c(2,8,27,28)]

Clean the working subsets

For each of the working subsets we add new columns to perform the analysis.

Column DATE_NORM

dataFatalNorm <- cbind(
                dataFatal,
                mdy(substr(dataFatal[,1], 1, nchar(dataFatal[,1])-8))
                )

dataInjurNorm <- cbind(
        dataInjur,
        mdy(substr(dataInjur[,1], 1, nchar(dataInjur[,1])-8))
)

dataPropNorm <- cbind(
        dataProp,
        mdy(substr(dataProp[,1], 1, nchar(dataProp[,1])-8))
)

dataCropNorm <- cbind(
        dataCrop,
        mdy(substr(dataCrop[,1], 1, nchar(dataCrop[,1])-8))
)

colnames(dataFatalNorm)[4] <- "DATE_NORM"
colnames(dataInjurNorm)[4] <- "DATE_NORM"
colnames(dataPropNorm)[5] <- "DATE_NORM"
colnames(dataCropNorm)[5] <- "DATE_NORM"

Columns PROPDMG_NORM and CROPDMG_NORM

For the economic consequences we do a further step by normalizing the damages values expressed by the following fields: PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP. The only admissibiles values are K, M and B for the multipliers, respectively for the thousands, millions and billions.

dataPropNorm$PROPDMG_NORM <- ifelse(dataPropNorm$PROPDMGEXP == "K",as.numeric(dataPropNorm$PROPDMG)*1e+3,"")
dataPropNorm$PROPDMG_NORM <- ifelse(dataPropNorm$PROPDMGEXP == "M",as.numeric(dataPropNorm$PROPDMG)*1e+6,dataPropNorm$PROPDMG_NORM) 
dataPropNorm$PROPDMG_NORM <- ifelse(dataPropNorm$PROPDMGEXP == "B",as.numeric(dataPropNorm$PROPDMG)*1e+9,dataPropNorm$PROPDMG_NORM) 

dataCropNorm$CROPDMG_NORM <- ifelse(dataCropNorm$CROPDMGEXP == "K",as.numeric(dataCropNorm$CROPDMG)*1e+3,"")
dataCropNorm$CROPDMG_NORM <- ifelse(dataCropNorm$CROPDMGEXP == "M",as.numeric(dataCropNorm$CROPDMG)*1e+6,dataCropNorm$CROPDMG_NORM) 
dataCropNorm$CROPDMG_NORM <- ifelse(dataCropNorm$CROPDMGEXP == "B",as.numeric(dataCropNorm$CROPDMG)*1e+9,dataCropNorm$CROPDMG_NORM) 

Column DECADE

We then use a function called findDecade that takes 3 arguments in in put and return the decade of a year in an range (the function is reported in appendix).

We first retrieve, for each working subset, the interval of years (starting and ending) of the event.

sYearFat <- year(min(dataFatalNorm$DATE_NORM))
eYearFat <- year(max(dataFatalNorm$DATE_NORM))

sYearInj <- year(min(dataInjurNorm$DATE_NORM))
eYearInj <- year(max(dataInjurNorm$DATE_NORM))

sYearProp <- year(min(dataPropNorm$DATE_NORM))
eYearProp <- year(max(dataPropNorm$DATE_NORM))

sYearCrop <- year(min(dataCropNorm$DATE_NORM))
eYearCrop <- year(max(dataCropNorm$DATE_NORM))

Then we apply the function the decade.

for(i in 1:nrow(dataFatalNorm)) {
        dataFatalNorm[i,5] <- findDecade(
                                year(dataFatalNorm[i,4]),
                                sYearFat,
                                eYearFat
                                )        
}

for(i in 1:nrow(dataInjurNorm)) {
        dataInjurNorm[i,5] <- findDecade(
                year(dataInjurNorm[i,4]),
                sYearInj,
                eYearInj
        )        
}

for(i in 1:nrow(dataPropNorm)) {
        dataPropNorm[i,7] <- findDecade(
                year(dataPropNorm[i,5]),
                sYearProp,
                eYearProp
        )        
}

for(i in 1:nrow(dataCropNorm)) {
        dataCropNorm[i,7] <- findDecade(
                year(dataCropNorm[i,5]),
                sYearCrop,
                eYearCrop
        )        
}

And then we rename each of the new columns created

colnames(dataFatalNorm)[5] <- "DECADE"
colnames(dataInjurNorm)[5] <- "DECADE"
colnames(dataPropNorm)[7] <- "DECADE"
colnames(dataCropNorm)[7] <- "DECADE"

Column GROUP

Since in the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records, we divided the analysis in two groups: the first of the pre 1990s decades and the second of the post 1990s ones.

dataFatalNorm$GROUP <- factor(ifelse(dataFatalNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))
dataInjurNorm$GROUP <- factor(ifelse(dataInjurNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))
dataPropNorm$GROUP <- factor(ifelse(dataPropNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))
dataCropNorm$GROUP <- factor(ifelse(dataCropNorm$DECADE >= "1990s","post-1990s", "pre-1990s"))

Aggregation and selection of the data

Aggregation

For each of the working datasets we made an aggregation of data by DECADE, EVTYPE and GROUP, to obtain 4 new aggregated working datasets.

aggFat <- aggregate(FATALITIES~DECADE+EVTYPE+GROUP,
                    data=dataFatalNorm,
                    FUN="sum"
                )

aggInj <- aggregate(INJURIES~DECADE+EVTYPE+GROUP,
                    data=dataInjurNorm,
                    FUN="sum"
)

aggProp <- aggregate(as.numeric(PROPDMG_NORM)~DECADE+EVTYPE+GROUP,
                    data=dataPropNorm,
                    FUN="sum"
)

aggCrop <- aggregate(as.numeric(CROPDMG_NORM)~DECADE+EVTYPE+GROUP,
                     data=dataCropNorm,
                     FUN="sum"
)

colnames(aggProp)[4] <- "PROPDMG_NORM"
colnames(aggCrop)[4] <- "CROPDMG_NORM"

Selection

Since a full representation would not be feasible, for each of the aggregated working subsets we choose only the top values, based on the coverage percentage of at least 70%. Specifically we took a number of aggregated records needed to cover at least 70% of the harmful events or damaged value.

round(sum(aggFat[head(order(-aggFat[,4]),20),4])
      /sum(aggFat$FATALITIES)*100,2)
## [1] 72.39
round(sum(aggInj[head(order(-aggInj[,4]),20),4])
      /sum(aggInj$INJURIES)*100,2)
## [1] 86.71
round(sum(aggProp[head(order(-aggProp[,4]),20),4])
      /sum(aggProp$PROPDMG_NORM)*100,2)
## [1] 86.7
round(sum(aggCrop[head(order(-aggCrop[,4]),20),4])
      /sum(aggCrop$CROPDMG_NORM)*100,2)
## [1] 86.78

From those we created the top aggregated datasets:

aggFat20 <- aggFat[head(order(-aggFat[,4]),20),]
aggInj20 <- aggInj[head(order(-aggInj[,4]),20),]
aggProp20 <- aggProp[head(order(-aggProp[,4]),20),]
aggCrop20 <- aggCrop[head(order(-aggCrop[,4]),20),]

Results

For representing the results, we used the ggplot2 package and the function multiplot to use the panels feature with the ggplot graphics.

Population harmful events (fatalities & injuries)

aggFat20$EVTYPE <- as.factor(aggFat20$EVTYPE)

g1 <- ggplot(aggFat20, aes(x=EVTYPE, y=FATALITIES , fill = as.factor(DECADE)))
g1 <- g1 + geom_bar(stat = "identity") + coord_flip()
g1 <- g1 + facet_grid(. ~ DECADE)
g1 <- g1 + labs(fill = "Decades") 
g1 <- g1 + labs( x = "Harmful event type", y = "Number of fatalities")
g1 <- g1 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g1 <- g1 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g1 <- g1 + scale_x_discrete(limits = rev(levels(aggFat20$EVTYPE)))
aggInj20$EVTYPE <- as.factor(aggInj20$EVTYPE)

g2 <- ggplot(aggInj20, aes(x=EVTYPE, y=INJURIES/1000 , fill = as.factor(DECADE)))
g2 <- g2 + geom_bar(stat = "identity") + coord_flip()
g2 <- g2 + facet_grid(. ~ DECADE)
g2 <- g2 + labs(fill = "Decades") 
g2 <- g2 + labs( x = "Harmful event type", y = "Thousands of injuries")
g2 <- g2 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g2 <- g2 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g2 <- g2 + scale_x_discrete(limits = rev(levels(aggInj20$EVTYPE)))
multiplot(g1, g2, layout=matrix(c(1,2), nrow=2, byrow=TRUE))

Economic damages (properties & crops)

aggProp20$EVTYPE <- as.factor(aggProp20$EVTYPE)

g3 <- ggplot(aggProp20, aes(x=EVTYPE, y=PROPDMG_NORM/1e+9 , fill = as.factor(DECADE)))
g3 <- g3 + geom_bar(stat = "identity") + coord_flip()
g3 <- g3 + facet_grid(. ~ DECADE)
g3 <- g3 + labs(fill = "Decades") 
g3 <- g3 + labs( x = "Property damage event type", y = "Billions of $")
g3<- g3 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g3 <- g3 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g3 <- g3 + scale_x_discrete(limits = rev(levels(aggProp20$EVTYPE)))
aggCrop20$EVTYPE <- as.factor(aggCrop20$EVTYPE)

g4 <- ggplot(aggCrop20, aes(x=EVTYPE, y=CROPDMG_NORM/1e+9 , fill = as.factor(DECADE)))
g4 <- g4 + geom_bar(stat = "identity") + coord_flip()
g4 <- g4 + facet_grid(. ~ DECADE)
g4 <- g4 + labs(fill = "Decades") 
g4 <- g4 + labs( x = "Crop damage event type", y = "Billions of $")
g4<- g4 + theme(legend.position="none", strip.text.x = element_text(size = 16), axis.text=element_text(size=6))
g4 <- g4 + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
g4 <- g4 + scale_x_discrete(limits = rev(levels(aggCrop20$EVTYPE)))
multiplot(g3, g4, layout=matrix(c(1,2), nrow=2, byrow=TRUE))

Conclusions

As it can be seen the tornadoes are the most harmful with respect to population health, while flood have the greatest economic consequences for the properties and drought for the crops.