This study analyses the storm data set collected by the U.S. National Oceanic and Atmospheric Administration (NOAA) from 1950 to 2011 in order to identify the events that are most harmful to population health and those that have the highest economic impact.
The code below first checks whether the compressed data file already exists and, if it does not, downloads it from the web. The data are then read either from the compressed CSV file with read.csv or, when an RDS copy is available, with readRDS. The required libraries are loaded as well:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
library(plyr)
library(gridExtra)
# Work inside the author's project folder when it is available. On any other
# machine the folder does not exist; instead of aborting the whole analysis,
# report it and carry on in the current working directory.
tryCatch(
  setwd("~/Desktop/Reproducible Research/Projects/RepData_PeerAssessment2/"),
  error = function(e) {
    message("Project directory not found; working in ", getwd())
  }
)

# Remote source, local compressed copy, and local RDS cache of the data.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_zip <- "StormData.csv.bz2"
data_rds <- "StormData.RDS"

# Download the compressed CSV only when no local copy exists yet.
if (!file.exists(data_zip)) {
  download.file(url = data_url, destfile = data_zip, method = "curl")
}

# Prefer the RDS cache; otherwise parse the compressed CSV and write the cache
# so that later runs can take the readRDS() branch.
if (file.exists(data_rds)) {
  data <- readRDS(data_rds)
  RDSloaded <- TRUE
} else {
  data <- read.csv(file = bzfile(data_zip), strip.white = TRUE)
  saveRDS(data, file = data_rds)
}
Restricting the dataset to the observations recorded in one of the 50 U.S. states (rows whose STATE code appears in state.abb):
# Keep only the records whose STATE code appears in the built-in `state.abb`
# vector of U.S. state abbreviations.
in_us_state <- data$STATE %in% state.abb
data_US <- data[in_us_state, , drop = FALSE]
In the following, the event types are normalised:
# Collapse related event labels into a single category each.
# Every label containing "WIND" becomes "THUNDERSTORM WIND"; this runs first,
# so a label mentioning both wind and heat ends up in the wind category.
# NOTE(review): the pattern matches any wind-related label, thunderstorm or
# not - confirm that this is intended.
is_wind <- grepl("WIND", data_US$EVTYPE)
data_US$EVTYPE[is_wind] <- "THUNDERSTORM WIND"

# Remaining labels containing "HEAT" or "WARM" are merged into "HEAT".
is_heat <- grepl("HEAT|WARM", data_US$EVTYPE)
data_US$EVTYPE[is_heat] <- "HEAT"
Two separate datasets are then created: one for the analysis of population health, the other for the analysis of economic damage:
# Split the data into two narrower tables, each keyed by event type:
# casualty counts for the health analysis, and damage amounts with their
# exponent codes for the economic analysis.
health_cols <- c("EVTYPE", "FATALITIES", "INJURIES")
economy_cols <- c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")

health.df <- data_US[health_cols]
economy.df <- data_US[economy_cols]
There are no missing values in the columns FATALITIES, INJURIES, PROPDMG and CROPDMG:
# Count the missing values in each numeric column used by the analysis and
# print the four counts together (fatalities, injuries, property, crop).
count_na <- function(column) sum(is.na(column))

na_fatalities <- count_na(health.df$FATALITIES)
na_injuries <- count_na(health.df$INJURIES)
na_propdmg <- count_na(economy.df$PROPDMG)
na_cropdmg <- count_na(economy.df$CROPDMG)

print(c(na_fatalities, na_injuries, na_propdmg, na_cropdmg))
## [1] 0 0 0 0
However, the fields PROPDMGEXP and CROPDMGEXP have to be cleaned because they contain a few invalid entries, as can be seen in the following:
# Tabulate the exponent codes used for property damage. as.factor() keeps the
# per-code counts when the column was read as character (the read.csv default
# since R 4.0.0); for a column that is already a factor it changes nothing.
print(summary(as.factor(economy.df$PROPDMGEXP)))
## - ? + 0 1 2 3 4 5
## 456190 1 8 5 216 25 13 4 4 28
## 6 7 8 B h H K m M
## 4 5 1 39 1 6 415375 7 11254
# Tabulate the exponent codes used for crop damage. as.factor() keeps the
# per-code counts when the column was read as character (the read.csv default
# since R 4.0.0); for a column that is already a factor it changes nothing.
print(summary(as.factor(economy.df$CROPDMGEXP)))
## ? 0 2 B k K m M
## 607897 7 19 1 9 21 273262 1 1969
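The cleaning strategy is the following: the exponent codes H, K, M and B (in upper or lower case) are translated into the multipliers 100, 1,000, 1,000,000 and 1,000,000,000 respectively, and every other entry is treated as a multiplier of 1. The damage value is then multiplied by its multiplier: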
# Convert damage figures and their exponent codes into absolute amounts.
#
# dmg     Numeric vector of damage values (PROPDMG or CROPDMG).
# dmgexp  Exponent codes (character or factor), one per value or a single
#         code; matched case-insensitively: H = 100, K = 1000,
#         M = 1000000, B = 1000000000. Any other code leaves the value
#         unscaled (multiplier 1).
#
# Returns a numeric vector: dmg multiplied by the matching multiplier.
# The lookup is vectorised, so whole columns are converted in one call
# instead of one function call per row.
calc_amount <- function(dmg, dmgexp) {
  codes <- c("H", "K", "M", "B")
  multipliers <- c(100, 1000, 1000000, 1000000000)
  scale <- multipliers[match(toupper(as.character(dmgexp)), codes)]
  scale[is.na(scale)] <- 1  # unrecognised code: keep the raw value
  dmg * scale
}

# Absolute property and crop damage for every record.
economy.df$PDMG <- calc_amount(economy.df$PROPDMG, economy.df$PROPDMGEXP)
economy.df$CDMG <- calc_amount(economy.df$CROPDMG, economy.df$CROPDMGEXP)
Fatalities and injuries are aggregated by event type, summing the number of fatalities and the number of injuries for each type; the ten event types with the highest totals are then retained.
# Total fatalities per event type, then the ten types with the highest totals.
# Passing one-column data frames to aggregate() names the result columns
# EVTYPE and FATALITIES directly.
fatalities <- aggregate(
  health.df["FATALITIES"],
  by = health.df["EVTYPE"],
  FUN = sum
)
top10fatalities <- head(
  arrange(fatalities, FATALITIES, decreasing = TRUE),
  n = 10
)

# Same computation for injuries.
injuries <- aggregate(
  health.df["INJURIES"],
  by = health.df["EVTYPE"],
  FUN = sum
)
top10injuries <- head(
  arrange(injuries, INJURIES, decreasing = TRUE),
  n = 10
)
The results are:
# Show the ten event types with the highest fatality totals, largest first.
top10fatalities
## EVTYPE FATALITIES
## 1 TORNADO 5633
## 2 HEAT 3156
## 3 THUNDERSTORM WIND 1408
## 4 FLASH FLOOD 939
## 5 LIGHTNING 806
## 6 FLOOD 464
## 7 RIP CURRENT 343
## 8 AVALANCHE 224
## 9 WINTER STORM 205
## 10 RIP CURRENTS 175
and
# Show the ten event types with the highest injury totals, largest first.
top10injuries
## EVTYPE INJURIES
## 1 TORNADO 91346
## 2 THUNDERSTORM WIND 11373
## 3 HEAT 8856
## 4 FLOOD 6786
## 5 LIGHTNING 5212
## 6 ICE STORM 1975
## 7 FLASH FLOOD 1767
## 8 HAIL 1361
## 9 WINTER STORM 1321
## 10 HEAVY SNOW 1021
These results can be visualized in the following plots:
# Bar charts of the ten event types with the most fatalities and the most
# injuries, drawn side by side. Bars are ordered from the highest to the
# lowest total so that the ranking can be read directly from the plot.
plot_fatalities <- ggplot(
  top10fatalities,
  aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous("Number of fatalities") +
  xlab("Event Types") +
  ggtitle("Fatalities caused by\n the top 10 natural events")

plot_injuries <- ggplot(
  top10injuries,
  aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous("Number of injuries") +
  xlab("Event Types") +
  ggtitle("Injuries caused by\n the top 10 natural events")

grid.arrange(plot_fatalities, plot_injuries, ncol = 2)
The same steps as above are repeated to analyse the economic impact:
# Total property damage per event type, then the ten types with the highest
# totals.
properties <- aggregate(economy.df$PDMG, list(economy.df$EVTYPE), sum)
names(properties) <- c("EVTYPE", "propDamage")
top10propdmg <- head(
  arrange(properties, propDamage, decreasing = TRUE),
  n = 10
)

# Same computation for crop damage.
crops <- aggregate(economy.df$CDMG, list(economy.df$EVTYPE), sum)
names(crops) <- c("EVTYPE", "cropDamage")
top10cropdmg <- head(
  arrange(crops, cropDamage, decreasing = TRUE),
  n = 10
)
The results are:
# Show the ten event types with the highest property damage, largest first.
top10propdmg
## EVTYPE propDamage
## 1 FLOOD 144531318807
## 2 HURRICANE/TYPHOON 69033100000
## 3 TORNADO 56936688779
## 4 STORM SURGE 43323461000
## 5 THUNDERSTORM WIND 17710742880
## 6 FLASH FLOOD 15868170417
## 7 HAIL 15732260543
## 8 HURRICANE 9913998010
## 9 TROPICAL STORM 7475121550
## 10 WINTER STORM 6688492251
and
# Show the ten event types with the highest crop damage, largest first.
top10cropdmg
## EVTYPE cropDamage
## 1 DROUGHT 13972361000
## 2 FLOOD 5613968450
## 3 RIVER FLOOD 5029459000
## 4 ICE STORM 5022113500
## 5 HAIL 3025954473
## 6 HURRICANE/TYPHOON 2603500800
## 7 HURRICANE 2189930000
## 8 THUNDERSTORM WIND 2158958538
## 9 FLASH FLOOD 1406905100
## 10 EXTREME COLD 1292973000
The results are shown in the following plots:
# Bar charts of the ten event types with the highest property damage and the
# highest crop damage, drawn side by side. Bars are ordered from the highest
# to the lowest total so that the ranking can be read directly from the plot.
plot_propdmg <- ggplot(
  top10propdmg,
  aes(x = reorder(EVTYPE, -propDamage), y = propDamage)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous("Property damages [USD]") +
  xlab("Event Types") +
  ggtitle("Total property damages\n caused by top 10 natural events")

plot_cropdmg <- ggplot(
  top10cropdmg,
  aes(x = reorder(EVTYPE, -cropDamage), y = cropDamage)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous("Crop damages [USD]") +
  xlab("Event Types") +
  ggtitle("Total crop damages\n caused by top 10 natural events")

grid.arrange(plot_propdmg, plot_cropdmg, ncol = 2)
The conclusions of this study are: