This analysis uses data from the Storm database of the NOAA (National Oceanic and Atmospheric Administration of the USA - https://www.ncdc.noaa.gov/stormevents/). The aim of the analysis is to examine the data of weather events between 1950 and 2011 to answer two questions: (1) which types of weather events are the most harmful with respect to population health? and (2) which types of weather events have the greatest economic consequences?
Examine the environment of the local machine that is being used for the analysis.
# Report the R version, platform, locale and loaded packages of this session.
print(sessionInfo())
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] backports_1.0.5 magrittr_1.5 rprojroot_1.2 tools_3.3.2
## [5] htmltools_0.3.5 Rcpp_0.12.9 stringi_1.1.2 rmarkdown_1.3
## [9] knitr_1.15.1 stringr_1.1.0 digest_0.6.12 evaluate_0.10
Libraries needed for the analysis
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.3.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.3.3
Download and read the data
The NOAA storm data is available at: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2. The National Weather Service documentation for this database is available at: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
# Download the compressed storm data (once) and read it into a data frame.
dataUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
dataFile <- "StormData.csv.bz2"
# Skip the download when the archive is already present in the working
# directory, so re-running the analysis does not fetch the file again.
if (!file.exists(dataFile)) {
  # mode = "wb" writes the archive in binary mode so that it is not corrupted
  # by text-mode translation on Windows.
  download.file(dataUrl, destfile = dataFile, mode = "wb")
}
# read.csv() reads the bzip2-compressed file directly; text columns are kept
# as character vectors rather than factors.
stormData <- read.csv(dataFile, header = TRUE, stringsAsFactors = FALSE)
Subset the data to extract relevant data for this analysis
# Derive the calendar year of each event from its begin date so that the
# records can be analysed across the 1950 - 2011 time period.
stormData$year <- as.numeric(
  format(
    as.Date(stormData$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"),
    "%Y"
  )
)
# Keep only the columns that this analysis needs:
#   EVTYPE                - weather event type
#   FATALITIES, INJURIES  - population health impact
#   PROPDMG, PROPDMGEXP   - property damage value and its exponent code
#   CROPDMG, CROPDMGEXP   - crop damage value and its exponent code
#   year                  - year derived above
# The exponent codes are used for the economic analysis: they allow the crop
# and property damage values to be converted to one consistent numeric scale,
# which makes the values directly comparable.
setcol <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP",
  "year"
)
selectedData2 <- stormData[, setcol]
Plot events recorded for each year.
# Histogram of the number of recorded weather events per year (one bin per year).
eventsplot <- ggplot(selectedData2, aes(x = year)) +
  geom_histogram(binwidth = 1, fill = "blue", colour = "white") +
  labs(
    x = "Year",
    y = " Recorded Weather Events",
    title = "Weather Events Recorded Per Year"
  ) +
  theme(plot.title = element_text(size = 10))
print(eventsplot)
The plot indicates that relatively few events were recorded from 1950 to the early 1990s. This suggests patchy data collection, probably due to technology limitations. The analysis will therefore focus on data from 1993 onwards, when the number of recorded events increased significantly.
# Keep only the events recorded from 1993 onwards.
# which() converts the logical test into row positions and drops any NA result,
# so a record whose year could not be derived is excluded instead of being
# returned as a row filled entirely with NA.
selectedData3 <- selectedData2[which(selectedData2$year >= 1993), ]
Convert the character codes in the exponent variables to their numeric equivalents
# Translate a vector of damage exponent codes into numeric powers of ten.
#   code: character vector of exponent codes (letters, symbols, digits or "").
# Returns a numeric vector of the same length as `code`. A code that is neither
# one of the symbols listed below nor a number becomes NA, and as.numeric()
# warns about the coercion.
expToPower <- function(code) {
  # One shared table for both damage columns. Letter codes are matched
  # case-insensitively; "+", "-" and "?" are treated as 10^0.
  symbolPower <- c("H" = 2, "K" = 3, "M" = 6, "B" = 9, "+" = 0, "-" = 0, "?" = 0)
  code <- toupper(code)
  # An empty code means that no multiplier applies.
  code[code %in% ""] <- "0"
  power <- unname(symbolPower[code])
  # Anything that is not a known symbol is read as a literal exponent.
  isOther <- is.na(power)
  power[isOther] <- as.numeric(code[isOther])
  power
}

# Numeric exponent for the property damage and crop damage columns.
selectedData3$propExp <- expToPower(selectedData3$PROPDMGEXP)
selectedData3$cropExp <- expToPower(selectedData3$CROPDMGEXP)

# Scale each damage value by its power of ten, then express it in billions
# by dividing by 1e9.
selectedData3$TOTALPROPDMG <- selectedData3$PROPDMG * 10^selectedData3$propExp / 1e9
selectedData3$TOTALCROPDMG <- selectedData3$CROPDMG * 10^selectedData3$cropExp / 1e9
Question 1 - Population Health
Identify the top 8 weather events with the largest number of injuries; and also the top 8 weather events with the largest number of fatalities.
# Sum the injuries for every weather event type, then keep the 8 event types
# with the largest totals, ordered from highest to lowest.
injuries <- aggregate(INJURIES ~ EVTYPE, data = selectedData3, FUN = sum)
injuries <- injuries[order(-injuries$INJURIES)[1:8], ]
# Show the resulting table
print(injuries)
## EVTYPE INJURIES
## 834 TORNADO 23310
## 170 FLOOD 6789
## 130 EXCESSIVE HEAT 6525
## 464 LIGHTNING 5230
## 856 TSTM WIND 3631
## 275 HEAT 2100
## 427 ICE STORM 1975
## 153 FLASH FLOOD 1777
# Sum the fatalities for every weather event type, then keep the 8 event types
# with the largest totals, ordered from highest to lowest.
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = selectedData3, FUN = sum)
fatalities <- fatalities[order(-fatalities$FATALITIES)[1:8], ]
# Show the resulting table
print(fatalities)
## EVTYPE FATALITIES
## 130 EXCESSIVE HEAT 1903
## 834 TORNADO 1621
## 153 FLASH FLOOD 978
## 275 HEAT 937
## 464 LIGHTNING 816
## 170 FLOOD 470
## 585 RIP CURRENT 368
## 359 HIGH WIND 248
Plot the results
# Bar chart of the 8 weather event types with the largest number of injuries.
# reorder() sorts the bars from the highest to the lowest total; without it
# the event types are drawn in alphabetical order and the ranking is hard to read.
injuriesPlot <- ggplot(injuries, aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)) +
  geom_bar(stat = "identity", fill = "red", colour = "white") +
  xlab("Weather Event Type") +
  ylab("No. of Injuries") +
  ggtitle("Top 8 Weather Events - Injuries") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(size = 10))
# Bar chart of the 8 weather event types with the largest number of fatalities,
# sorted the same way.
fatalitiesPlot <- ggplot(fatalities, aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)) +
  geom_bar(stat = "identity", fill = "green", colour = "white") +
  xlab("Weather Event Type") +
  ylab("No. of Fatalities") +
  ggtitle("Top 8 Weather Events - Fatalities") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(size = 10))
# Arrange both charts side by side in the same plot grid.
grid.arrange(injuriesPlot, fatalitiesPlot, ncol = 2)
Tornado is the weather event type generating the largest number of injuries across the USA during the 1993 - 2011 time period. Excessive Heat is the weather event type generating the largest number of fatalities across the USA during the same period, with Tornado a close second.
Question 2 - greatest economic consequences.
Identify the top 8 weather events with the greatest cost values for crop damage; and also the top 8 weather events with the greatest cost values for property damage.
# Sum the property damage for every weather event type, then keep the 8 event
# types with the largest totals, ordered from highest to lowest.
agrProperty <- aggregate(TOTALPROPDMG ~ EVTYPE, data = selectedData3, FUN = sum)
property <- agrProperty[order(-agrProperty$TOTALPROPDMG)[1:8], ]
print(property)
## EVTYPE TOTALPROPDMG
## 170 FLOOD 144.657710
## 411 HURRICANE/TYPHOON 69.305840
## 670 STORM SURGE 43.323536
## 834 TORNADO 26.349182
## 153 FLASH FLOOD 16.822674
## 244 HAIL 15.735268
## 402 HURRICANE 11.868319
## 848 TROPICAL STORM 7.703891
# Sum the crop damage for every weather event type, then keep the 8 event
# types with the largest totals, ordered from highest to lowest.
agrCrop <- aggregate(TOTALCROPDMG ~ EVTYPE, data = selectedData3, FUN = sum)
crop <- agrCrop[order(-agrCrop$TOTALCROPDMG)[1:8], ]
print(crop)
## EVTYPE TOTALCROPDMG
## 95 DROUGHT 13.972566
## 170 FLOOD 5.661968
## 590 RIVER FLOOD 5.029459
## 427 ICE STORM 5.022113
## 244 HAIL 3.025954
## 402 HURRICANE 2.741910
## 411 HURRICANE/TYPHOON 2.607873
## 153 FLASH FLOOD 1.421317
Plot the results
# Bar chart of the 8 weather event types with the greatest property damage.
# reorder() sorts the bars from the highest to the lowest total; without it
# the event types are drawn in alphabetical order and the ranking is hard to read.
propPlot <- ggplot(property, aes(x = reorder(EVTYPE, -TOTALPROPDMG), y = TOTALPROPDMG)) +
  geom_bar(stat = "identity", fill = "pink", colour = "white") +
  xlab("Weather Event Type") +
  ylab("Property Damage - $ Billions") +
  ggtitle("Top 8 Weather Events - Property Damage") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(size = 10))
# Bar chart of the 8 weather event types with the greatest crop damage,
# sorted the same way.
cropPlot <- ggplot(crop, aes(x = reorder(EVTYPE, -TOTALCROPDMG), y = TOTALCROPDMG)) +
  geom_bar(stat = "identity", fill = "salmon", colour = "white") +
  xlab("Weather Event Type") +
  ylab("Crop Damage - $ Billions") +
  ggtitle("Top 8 Weather Events - Crop Damage") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(size = 10))
# Arrange both charts side by side in the same plot grid.
grid.arrange(propPlot, cropPlot, ncol = 2)
Flood is the weather event type generating the greatest economic consequence for property. Drought is the weather event type generating the greatest economic consequence for crops.