The task here is to analyse the NOAA storm database across the U.S. in order to identify the types of events that: 1. are most harmful with respect to population health, and 2. have the greatest economic consequences.
From the Storm data, we will require the variables associated with population health and economic consequences in order to address the questions above.
library(knitr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Check whether the file exists and read it into R. If it doesn't exist, download it first and then read the data.
# Fetch the compressed NOAA storm data only when no local copy exists, then
# read it exactly once (the read is no longer duplicated in both branches).
# The .bz2 file is handed straight to read.table(), as before.
storm_file <- "./storm data.csv.bz2"
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists(storm_file)) {
  # mode = "wb" requests a binary transfer so the archive is written unchanged.
  download.file(storm_url, destfile = storm_file, mode = "wb")
}
storm <- read.table(storm_file, header = TRUE, sep = ",", quote = "\"")
subset the columns with information on health and economic variables
# Keep only the event type plus the casualty and damage columns.
keep_cols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
storm <- storm[, keep_cols]
take a look
# Show the first six rows of the reduced data set.
head(storm, n = 6L)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
From the storm data documentation, alphabetical characters used to signify the magnitude of damage include “K” for thousands, “M” for millions, and “B” for billions. Using these, we can express the PROPDMG and CROPDMG variables in actual dollar amounts.
What are the values in PROPDMGEXP?
# List the distinct exponent codes recorded for property damage.
unique(storm[["PROPDMGEXP"]])
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
PROPDMG as actual dollar amounts
# Convert PROPDMG to dollar amounts by multiplying each value by the power of
# ten encoded in PROPDMGEXP. This is done for the whole column at once; the
# previous row-by-row loop modified the data frame once per row, which is very
# slow on a large table and breaks on an empty table or a missing code.
#
# Mapping: K = 10^3, M/m = 10^6, B = 10^9, H/h = 10^2, and a digit d in 1..8
# means 10^d. Any other code (including blank, "-", "?", "+", "0") leaves the
# value unchanged, i.e. a multiplier of 1.
prop_multiplier <- c(
  K = 10^3, M = 10^6, m = 10^6, B = 10^9, h = 10^2, H = 10^2,
  "1" = 10^1, "2" = 10^2, "3" = 10^3, "4" = 10^4,
  "5" = 10^5, "6" = 10^6, "7" = 10^7, "8" = 10^8
)
# as.character() so a factor column is looked up by label, not integer code.
prop_scale <- unname(prop_multiplier[as.character(storm$PROPDMGEXP)])
# Codes absent from the table come back as NA; treat them as "no scaling".
prop_scale[is.na(prop_scale)] <- 1
storm$PROPDMG <- storm$PROPDMG * prop_scale
what are the values in CROPDMGEXP
# List the distinct exponent codes recorded for crop damage.
unique(storm[["CROPDMGEXP"]])
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
CROPDMG as actual dollar amounts
# Convert CROPDMG to dollar amounts by multiplying each value by the power of
# ten encoded in CROPDMGEXP. This is done for the whole column at once; the
# previous row-by-row loop modified the data frame once per row, which is very
# slow on a large table and breaks on an empty table or a missing code.
#
# Mapping: K/k = 10^3, M/m = 10^6, B = 10^9, "2" = 10^2. Any other code
# (including blank, "?", "0") leaves the value unchanged (multiplier of 1).
crop_multiplier <- c(
  K = 10^3, k = 10^3, M = 10^6, m = 10^6, B = 10^9, "2" = 10^2
)
# as.character() so a factor column is looked up by label, not integer code.
crop_scale <- unname(crop_multiplier[as.character(storm$CROPDMGEXP)])
# Codes absent from the table come back as NA; treat them as "no scaling".
crop_scale[is.na(crop_scale)] <- 1
storm$CROPDMG <- storm$CROPDMG * crop_scale
Considering fatalities and injuries together as harm to population health, and property and crop damage together as economic consequences, let us create one variable that contains the sum of fatalities and injuries and another that contains the sum of crop and property damage.
# Per-event totals, computed on whole columns rather than one row at a time:
#   health   = fatalities + injuries
#   economic = property damage + crop damage
storm$health <- storm$FATALITIES + storm$INJURIES
storm$economic <- storm$PROPDMG + storm$CROPDMG
Since the two variables we just created are all we need to answer the required questions, let us subset the dataset to keep only these variables (along with the event type).
# Retain just the event type and the two derived totals.
storm <- storm[, c("EVTYPE", "health", "economic")]
Since the analysis involves examining the outcomes by the type of event, let us group the data by event type variable EVTYPE
# Sum the health and economic totals within each event type.
storm2 <- aggregate(
  storm[c("health", "economic")],
  by = list(evtype = storm$EVTYPE),
  FUN = sum,
  na.rm = TRUE
)
# Rank event types by economic loss, largest first; ties fall back to evtype.
econ_rank <- order(-storm2$economic, storm2$evtype)
top_econ_impact <- storm2[econ_rank, c("evtype", "economic")]
The table below shows the event types with the 10 highest economic impact
# Keep the ten costliest event types and print them. head() returns only the
# rows that exist, whereas indexing with 1:10 pads the result with NA rows
# when fewer than ten event types are present.
top_econ_impact <- head(top_econ_impact, 10L)
top_econ_impact
## evtype economic
## 170 FLOOD 150319678257
## 411 HURRICANE/TYPHOON 71913712800
## 834 TORNADO 57362333947
## 670 STORM SURGE 43323541000
## 244 HAIL 18761221986
## 153 FLASH FLOOD 18243991079
## 95 DROUGHT 15018672000
## 402 HURRICANE 14610229010
## 590 RIVER FLOOD 10148404500
## 427 ICE STORM 8967041360
# Bar chart of the ten costliest event types, in millions of USD.
# The rendered output reports qplot()'s `stat` and `position` arguments as
# deprecated, so the bar layer is requested explicitly with ggplot() and
# geom_bar(stat = "identity"). Bars are ordered from highest to lowest impact.
ggplot(
  top_econ_impact,
  aes(x = reorder(evtype, -economic), y = economic / 1000000)
) +
  geom_bar(stat = "identity") +
  xlab("Event type") +
  ylab("Total economic impact in millions (USD)") +
  ggtitle("10 events with highest economic impact") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: `stat` is deprecated
## Warning: `position` is deprecated
Based on the table and graph, we see that floods cause the greatest economic loss (sum of total crop and property damage). The second and third most costly events are hurricane/typhoon and tornado, respectively.
# Rank event types by combined fatalities and injuries, largest first;
# ties fall back to evtype.
health_rank <- order(-storm2$health, storm2$evtype)
top_health_impact <- storm2[health_rank, c("evtype", "health")]
The table below shows the event types with the 10 highest health impact
# Keep the ten most harmful event types and print them. head() returns only
# the rows that exist, whereas indexing with 1:10 pads the result with NA rows
# when fewer than ten event types are present.
top_health_impact <- head(top_health_impact, 10L)
top_health_impact
## evtype health
## 834 TORNADO 96979
## 130 EXCESSIVE HEAT 8428
## 856 TSTM WIND 7461
## 170 FLOOD 7259
## 464 LIGHTNING 6046
## 275 HEAT 3037
## 153 FLASH FLOOD 2755
## 427 ICE STORM 2064
## 760 THUNDERSTORM WIND 1621
## 972 WINTER STORM 1527
# Bar chart of the ten event types with the most fatalities plus injuries.
# The rendered output reports qplot()'s `stat` and `position` arguments as
# deprecated, so the bar layer is requested explicitly with ggplot() and
# geom_bar(stat = "identity"). Bars are ordered from highest to lowest impact.
ggplot(
  top_health_impact,
  aes(x = reorder(evtype, -health), y = health)
) +
  geom_bar(stat = "identity") +
  xlab("Event type") +
  ylab("Total health") +
  ggtitle("10 events with highest health impact") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: `stat` is deprecated
## Warning: `position` is deprecated
Based on the table and graph, we see that tornadoes cause the greatest harm to population health (sum of total injuries and fatalities) compared to the other event types. The second and third most harmful events are excessive heat and TSTM WIND, respectively.