Synopsis

In this study we will explore NOAA Storm Database to analyze weather events and its highest impact on people’s lifes. During this approach we will, mostly, study it’s impacts on population’s health and economics.

Data Processing

Loading libraries to process data (installing also, if they’re not installed already):

if("date" %in% rownames(installed.packages()) == FALSE) {
  install.packages("date")
}

if("plyr" %in% rownames(installed.packages()) == FALSE) {
  install.packages("plyr")
}

if("ggplot2" %in% rownames(installed.packages()) == FALSE) {
  install.packages("ggplot2")
}

if("scales" %in% rownames(installed.packages()) == FALSE) {
  install.packages("scales")
}

library(date)
library(plyr)
library(ggplot2)
require(scales)
## Loading required package: scales
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Loading the NOAA Storm Database:

data <- read.csv("stormData.csv")

Formatting fields:

data$BGN_DATE <- as.Date(data$BGN_DATE, "%m/%d/%Y 0:00:00")
data$END_DATE <- as.Date(data$END_DATE, "%m/%d/%Y 0:00:00")
data$year <- as.numeric(format(data$END_DATE, "%Y"))
data$Events <- data$EVTYPE

data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "B" | data$PROPDMGEXP == "b"),]$PROPDMGEXP <- "9"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("9", "9", "9", "9", "9",
## : invalid factor level, NA generated
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "M" | data$PROPDMGEXP == "m"),]$PROPDMGEXP <- "6"
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "K" | data$PROPDMGEXP == "k"),]$PROPDMGEXP <- "3"
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "H" | data$PROPDMGEXP == "h"),]$PROPDMGEXP <- "2"
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "" | data$PROPDMGEXP == "+" | data$PROPDMGEXP == "-" | data$PROPDMGEXP == "?"),]$PROPDMGEXP <- "0"

data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "B" | data$CROPDMGEXP == "b"),]$CROPDMGEXP <- "9"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("9", "9", "9", "9", "9",
## : invalid factor level, NA generated
data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "M" | data$CROPDMGEXP == "m"),]$CROPDMGEXP <- "6"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("6", "6", "6", "6", "6",
## : invalid factor level, NA generated
data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "K" | data$CROPDMGEXP == "k"),]$CROPDMGEXP <- "3"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("3", "3", "3", "3", "3",
## : invalid factor level, NA generated
data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "" | data$CROPDMGEXP == "+" | data$CROPDMGEXP == "-" | data$CROPDMGEXP == "?"),]$CROPDMGEXP <- "0"

data$PROPDMG <- data$PROPDMG * 10^as.numeric(data$PROPDMGEXP)
data$CROPDMG <- data$CROPDMG * 10^as.numeric(data$CROPDMGEXP)

Checking the top 10 events that most of the people die or get hurt:

fatalitiesByEvents <- aggregate(FATALITIES~Events, data, sum)
top10FatalitiesEvents <- fatalitiesByEvents[order(fatalitiesByEvents$FATALITIES, decreasing = TRUE),][1:10,]

injuriesByEvents <- aggregate(INJURIES~Events, data, sum)
top10InjuriesEvents <- injuriesByEvents[order(injuriesByEvents$INJURIES, decreasing = TRUE),][1:10,]

Checking the top 10 events that were more costly:

propDmgByEvents <- aggregate(PROPDMG~Events, data, sum)
top10PropDmgEvents <- propDmgByEvents[order(propDmgByEvents$PROPDMG, decreasing = TRUE),][1:10,]

cropDmgByEvents <- aggregate(CROPDMG~Events, data, sum)
top10CropDmgEvents <- cropDmgByEvents[order(cropDmgByEvents$CROPDMG, decreasing = TRUE),][1:10,]

Grouping summarized data to plot health analysis:

fatalitiesSummarized <- ddply(data, c("year", "Events"), summarise, quantity=sum(FATALITIES))
fatalitiesSummarized <- fatalitiesSummarized[which(fatalitiesSummarized$Events %in% top10FatalitiesEvents$Events),]

injuriesSummarized <- ddply(data, c("year", "Events"), summarise, quantity=sum(INJURIES))
injuriesSummarized <- injuriesSummarized[which(injuriesSummarized$Events %in% top10InjuriesEvents$Events),]

Grouping summarized data to plot costs analysis:

propDmgSummarized <- ddply(data, c("year", "Events"), summarise, quantity=(sum(PROPDMG)))
propDmgSummarized <- propDmgSummarized[which(propDmgSummarized$Events %in% top10PropDmgEvents$Events),]

cropDmgSummarized <- ddply(data, c("year", "Events"), summarise, quantity=(sum(CROPDMG)))
cropDmgSummarized <- cropDmgSummarized[which(cropDmgSummarized$Events %in% top10CropDmgEvents$Events),]

Results

p1 <- ggplot(data=fatalitiesSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Fatalities") + ggtitle("Top fatalities by time, grouped by weather event")

p2 <- ggplot(data=injuriesSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Injuries") + ggtitle("Top injuries by time, grouped by weather event")

p3 <- ggplot(data=top10FatalitiesEvents, aes(x=Events, y=FATALITIES)) + geom_bar(fill="red", stat="identity") + xlab("Events") + ylab("Amount of Fatalities") + ggtitle("Top fatalities and their amount, by weather event")

p4 <- ggplot(data=top10InjuriesEvents, aes(x=Events, y=INJURIES)) + geom_bar(fill="orange", stat="identity") + xlab("Events") + ylab("Amount of Injuries") + ggtitle("Top injuries and their amount, by weather event")

multiplot(p1, p2, p3, p4, cols=1)
## Warning: Removed 10 rows containing missing values (position_stack).
## Warning: Removed 10 rows containing missing values (position_stack).

p5 <- ggplot(data=propDmgSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Property Damage") + ggtitle("Top events that did more damage by time in properties") + scale_y_continuous(labels = comma)

p6 <- ggplot(data=top10PropDmgEvents, aes(x=Events, y=PROPDMG)) + geom_bar(fill="red", stat="identity") + xlab("Events") + ylab("Amount of Property Damage") + ggtitle("Top events that did more damage by time in properties") + scale_y_continuous(labels = comma)

#p7 <- ggplot(data=cropDmgSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Crop Damage") + ggtitle("Top events that did more damage by time in crops") + scale_y_continuous(labels = comma)

p8 <- ggplot(data=top10CropDmgEvents, aes(x=Events, y=CROPDMG)) + geom_bar(fill="orange", stat="identity") + xlab("Events") + ylab("Amount of Crop Damage") + ggtitle("Top events that did more damage by time in crops") + scale_y_continuous(labels = comma)

multiplot(p5, p6, p8, cols=1)
## Warning: Removed 23 rows containing missing values (position_stack).