In this study we will explore NOAA Storm Database to analyze weather events and its highest impact on people’s lifes. During this approach we will, mostly, study it’s impacts on population’s health and economics.
Loading libraries to process data (installing also, if they’re not installed already):
if("date" %in% rownames(installed.packages()) == FALSE) {
install.packages("date")
}
if("plyr" %in% rownames(installed.packages()) == FALSE) {
install.packages("plyr")
}
if("ggplot2" %in% rownames(installed.packages()) == FALSE) {
install.packages("ggplot2")
}
if("scales" %in% rownames(installed.packages()) == FALSE) {
install.packages("scales")
}
library(date)
library(plyr)
library(ggplot2)
require(scales)
## Loading required package: scales
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
Loading the NOAA Storm Database:
data <- read.csv("stormData.csv")
Formatting fields:
data$BGN_DATE <- as.Date(data$BGN_DATE, "%m/%d/%Y 0:00:00")
data$END_DATE <- as.Date(data$END_DATE, "%m/%d/%Y 0:00:00")
data$year <- as.numeric(format(data$END_DATE, "%Y"))
data$Events <- data$EVTYPE
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "B" | data$PROPDMGEXP == "b"),]$PROPDMGEXP <- "9"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("9", "9", "9", "9", "9",
## : invalid factor level, NA generated
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "M" | data$PROPDMGEXP == "m"),]$PROPDMGEXP <- "6"
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "K" | data$PROPDMGEXP == "k"),]$PROPDMGEXP <- "3"
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "H" | data$PROPDMGEXP == "h"),]$PROPDMGEXP <- "2"
data[!is.na(data$PROPDMGEXP) & (data$PROPDMGEXP == "" | data$PROPDMGEXP == "+" | data$PROPDMGEXP == "-" | data$PROPDMGEXP == "?"),]$PROPDMGEXP <- "0"
data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "B" | data$CROPDMGEXP == "b"),]$CROPDMGEXP <- "9"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("9", "9", "9", "9", "9",
## : invalid factor level, NA generated
data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "M" | data$CROPDMGEXP == "m"),]$CROPDMGEXP <- "6"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("6", "6", "6", "6", "6",
## : invalid factor level, NA generated
data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "K" | data$CROPDMGEXP == "k"),]$CROPDMGEXP <- "3"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("3", "3", "3", "3", "3",
## : invalid factor level, NA generated
data[!is.na(data$CROPDMGEXP) & (data$CROPDMGEXP == "" | data$CROPDMGEXP == "+" | data$CROPDMGEXP == "-" | data$CROPDMGEXP == "?"),]$CROPDMGEXP <- "0"
data$PROPDMG <- data$PROPDMG * 10^as.numeric(data$PROPDMGEXP)
data$CROPDMG <- data$CROPDMG * 10^as.numeric(data$CROPDMGEXP)
Checking the top 10 events that most of the people die or get hurt:
fatalitiesByEvents <- aggregate(FATALITIES~Events, data, sum)
top10FatalitiesEvents <- fatalitiesByEvents[order(fatalitiesByEvents$FATALITIES, decreasing = TRUE),][1:10,]
injuriesByEvents <- aggregate(INJURIES~Events, data, sum)
top10InjuriesEvents <- injuriesByEvents[order(injuriesByEvents$INJURIES, decreasing = TRUE),][1:10,]
Checking the top 10 events that were more costly:
propDmgByEvents <- aggregate(PROPDMG~Events, data, sum)
top10PropDmgEvents <- propDmgByEvents[order(propDmgByEvents$PROPDMG, decreasing = TRUE),][1:10,]
cropDmgByEvents <- aggregate(CROPDMG~Events, data, sum)
top10CropDmgEvents <- cropDmgByEvents[order(cropDmgByEvents$CROPDMG, decreasing = TRUE),][1:10,]
Grouping summarized data to plot health analysis:
fatalitiesSummarized <- ddply(data, c("year", "Events"), summarise, quantity=sum(FATALITIES))
fatalitiesSummarized <- fatalitiesSummarized[which(fatalitiesSummarized$Events %in% top10FatalitiesEvents$Events),]
injuriesSummarized <- ddply(data, c("year", "Events"), summarise, quantity=sum(INJURIES))
injuriesSummarized <- injuriesSummarized[which(injuriesSummarized$Events %in% top10InjuriesEvents$Events),]
Grouping summarized data to plot costs analysis:
propDmgSummarized <- ddply(data, c("year", "Events"), summarise, quantity=(sum(PROPDMG)))
propDmgSummarized <- propDmgSummarized[which(propDmgSummarized$Events %in% top10PropDmgEvents$Events),]
cropDmgSummarized <- ddply(data, c("year", "Events"), summarise, quantity=(sum(CROPDMG)))
cropDmgSummarized <- cropDmgSummarized[which(cropDmgSummarized$Events %in% top10CropDmgEvents$Events),]
p1 <- ggplot(data=fatalitiesSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Fatalities") + ggtitle("Top fatalities by time, grouped by weather event")
p2 <- ggplot(data=injuriesSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Injuries") + ggtitle("Top injuries by time, grouped by weather event")
p3 <- ggplot(data=top10FatalitiesEvents, aes(x=Events, y=FATALITIES)) + geom_bar(fill="red", stat="identity") + xlab("Events") + ylab("Amount of Fatalities") + ggtitle("Top fatalities and their amount, by weather event")
p4 <- ggplot(data=top10InjuriesEvents, aes(x=Events, y=INJURIES)) + geom_bar(fill="orange", stat="identity") + xlab("Events") + ylab("Amount of Injuries") + ggtitle("Top injuries and their amount, by weather event")
multiplot(p1, p2, p3, p4, cols=1)
## Warning: Removed 10 rows containing missing values (position_stack).
## Warning: Removed 10 rows containing missing values (position_stack).
p5 <- ggplot(data=propDmgSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Property Damage") + ggtitle("Top events that did more damage by time in properties") + scale_y_continuous(labels = comma)
p6 <- ggplot(data=top10PropDmgEvents, aes(x=Events, y=PROPDMG)) + geom_bar(fill="red", stat="identity") + xlab("Events") + ylab("Amount of Property Damage") + ggtitle("Top events that did more damage by time in properties") + scale_y_continuous(labels = comma)
#p7 <- ggplot(data=cropDmgSummarized, aes(x=year, y=quantity, fill=Events)) + geom_bar(colour="black", stat="identity") + xlab("Years") + ylab("Amount of Crop Damage") + ggtitle("Top events that did more damage by time in crops") + scale_y_continuous(labels = comma)
p8 <- ggplot(data=top10CropDmgEvents, aes(x=Events, y=CROPDMG)) + geom_bar(fill="orange", stat="identity") + xlab("Events") + ylab("Amount of Crop Damage") + ggtitle("Top events that did more damage by time in crops") + scale_y_continuous(labels = comma)
multiplot(p5, p6, p8, cols=1)
## Warning: Removed 23 rows containing missing values (position_stack).