We analyze the storm database from the U.S. National Oceanic and Atmospheric Administration's (NOAA) in order to identify (1) which types of events are most harmful with respect to population health and (2) which types of events have the greatest economic consequences. To achieve our goal we consider the economic indicators as corp and properties cost in million of dollars and the sum of fatalities and injuries. We conclude that the weather event that more health impact has is the “tornado”, followed by “wind”, “heat”, “floods” and “lightning”. We also conclude that the most expensive weather event, in economics terms, are the “flood”, “hurricane”.
Load the data
# filepath<-'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
# # 2014-06-19 download.file(filepath,
# destfile=paste(getwd(),'/','Storm_Data.bz2',sep=''), method='wget') con <-
# unz(Storm_Data.bz2, 'Storm_Data.cvs') rawdata <-
# read.csv(bzfile('Storm_Data.bz2')) the download and unzip takes to much
# time so we place in the working directory the unziped csv file
df <- read.csv("repdata_data_StormData.csv")
Recode the even types to avoid case problems.
df$EVTYPE <- factor(toupper(df$EVTYPE))
df$PROPDMGEXP <- factor(toupper(df$PROPDMGEXP))
df$CROPDMGEXP <- factor(toupper(df$CROPDMGEXP))
The dataset is very big and there are only a few variables that have some interest for our purposes so we will filter it to get a smaller set of variables. We will use to filter the information we found in the document NCDC Storm Events-FAQ Page.pdf.
df2 <- df[, c(8, 23:28)]
str(df2)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : Factor w/ 898 levels "?","ABNORMALLY DRY",..: 754 754 754 754 754 754 754 754 754 754 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 17 levels "","-","?","+",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 7 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
names(df2)
## [1] "EVTYPE" "FATALITIES" "INJURIES" "PROPDMG" "PROPDMGEXP"
## [6] "CROPDMG" "CROPDMGEXP"
Some of the variables are relative to health: FATALITIES, INJURIES, and some to Crops and properties damages: PROPDMG (property damage), PROPDMGEXP, CROPDMG (Crops damages), CROPDMGEXP.
The variables 'PROPDMGEXP,PROPDMGEXP, are the exponet of the PROPDMG and CROPDMG respectively.
table(df2$PROPDMGEXP)
##
## - ? + 0 1 2 3 4 5
## 465934 1 8 5 216 25 13 4 4 28
## 6 7 8 B H K M
## 4 5 1 40 7 424665 11337
table(df2$CROPDMGEXP)
##
## ? 0 2 B K M
## 618413 7 19 1 9 281853 1995
We understand that some of the exponents have no meaning or are bad coded, so we will discard them: “-”, “?”. We just take those observations with valid exponents.
validexp <- c("", "0", "H", "K", "M", "B")
df3 <- subset(df2, (CROPDMGEXP %in% validexp) & (PROPDMGEXP %in% validexp))
df3$PROPDMGEXP <- factor(df3$PROPDMGEXP)
df3$CROPDMGEXP <- factor(df3$CROPDMGEXP)
Create new variables that represnet the PROPDMG and CROPDMG in dollars.
table(df3$PROPDMGEXP)
##
## 0 B H K M
## 465931 216 40 7 424661 11336
table(df3$CROPDMGEXP)
##
## 0 B K M
## 618318 19 9 281851 1994
# 'H'=10^2, 'K'=10^3,'M'=10^6, 'B'=10^9
# PROP
df3$PROPDMG_m <- 1
df3$PROPDMG_m[df3$PROPDMGEXP == "H"] <- 10^2
df3$PROPDMG_m[df3$PROPDMGEXP == "K"] <- 10^3
df3$PROPDMG_m[df3$PROPDMGEXP == "M"] <- 10^6
df3$PROPDMG_m[df3$PROPDMGEXP == "B"] <- 10^9
# crop
df3$CROPDMG_m <- 1
df3$CROPDMG_m[df3$CROPDMGEXP == "H"] <- 10^2
df3$CROPDMG_m[df3$CROPDMGEXP == "K"] <- 10^3
df3$CROPDMG_m[df3$CROPDMGEXP == "M"] <- 10^6
df3$CROPDMG_m[df3$CROPDMGEXP == "B"] <- 10^9
df3 <- transform(df3, PROPDMG_d = PROPDMG * PROPDMG_m, CROPDMG_d = CROPDMG *
CROPDMG_m)
# head(df3)
df3 <- df3[, -c(4, 5, 6, 7, 8, 9)] # drop unusefull variables 4 clearness
This is clear that we should do some recoding in the variable EVTYPE because there are very much some codes that means the same thing, as for example “winter wheater” and “winter mix”. So by means of some exploration we consider to recode some of them.
df3 <- cbind(df3, evtype2 = df3$EVTYPE) # create a new variable for the new evtype codes, this must be a character to use grep
df3$evtype2 <- tolower(as.character(df3$evtype2))
# str(df3)
# table(df3$evtype2[grep('wint', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("wint", df3$evtype2, ignore.case = T)] <- "winter wh"
# table(df3$evtype2[grep('heat', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("heat", df3$evtype2, ignore.case = T)] <- "heat"
# table(df3$evtype2[grep('summary', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("summary", df3$evtype2, ignore.case = T)] <- "summary"
# table(df3$evtype2[grep('fire', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("fire", df3$evtype2, ignore.case = T)] <- "fire"
df3$evtype2[grep("hurricane", df3$evtype2, ignore.case = T)] <- "hurricane"
df3$evtype2[grep("volcan", df3$evtype2, ignore.case = T)] <- "volcanic"
df3$evtype2[grep("tornado", df3$evtype2, ignore.case = T)] <- "tornado"
df3$evtype2[grep("torndao", df3$evtype2, ignore.case = T)] <- "tornado"
df3$evtype2[grep("tornado", df3$evtype2, ignore.case = T)] <- "hurricane" # hurricana are the same that tornado
df3$evtype2[grep("wind", df3$evtype2, ignore.case = T)] <- "wind"
df3$evtype2[grep("rain", df3$evtype2, ignore.case = T)] <- "rain"
df3$evtype2[grep("snow", df3$evtype2, ignore.case = T)] <- "snow"
df3$evtype2[grep("flood", df3$evtype2, ignore.case = T)] <- "flood"
df3$evtype2[grep("stream", df3$evtype2, ignore.case = T)] <- "flood"
# table(df3$evtype2[grep('fld', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("fld", df3$evtype2, ignore.case = T)] <- "flood"
# table(df3$evtype2[grep('tropic', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("tropic", df3$evtype2, ignore.case = T)] <- "tropical"
# table(df3$evtype2[grep('warm', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("warm", df3$evtype2, ignore.case = T)] <- "warm wh"
df3$evtype2[grep("hot", df3$evtype2, ignore.case = T)] <- "warm wh"
# table(df3$evtype2[grep('cold', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("cold", df3$evtype2, ignore.case = T)] <- "cold"
# table(df3$evtype2[grep('water', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("water", df3$evtype2, ignore.case = T)] <- "water spout"
# table(df3$evtype2[grep('hail', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("hail", df3$evtype2, ignore.case = T)] <- "hail"
# table(df3$evtype2[grep('ice', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("ice", df3$evtype2, ignore.case = T)] <- "ice"
# table(df3$evtype2[grep('dry', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("dry", df3$evtype2, ignore.case = T)] <- "dryness"
# table(df3$evtype2[grep('burst', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("burst", df3$evtype2, ignore.case = T)] <- "burst"
df3$evtype2[grep("thunder", df3$evtype2, ignore.case = T)] <- "thunderstormw"
df3$evtype2[grep("slide", df3$evtype2, ignore.case = T)] <- "slide"
# table(df3$evtype2[grep('temperature', df3$evtype2, ignore.case = T)])
After this recoding we will focus in the most frequent event types. Here there are the events that occurs more than 500 times.
t <- table(df3$evtype2)
# table(df3$evtype2)
tev <- as.data.frame(t)
names(tev) <- c("event", "times")
tev$rank <- rank(-tev$times, ties.method = "min")
tev <- tev[order(tev$rank, decreasing = F), ]
tev <- tev[tev$times > 500, ]
# xtev
xtev <- xtable(tev)
print(xtev, type = "html")
| event | times | rank | |
|---|---|---|---|
| 180 | wind | 364848 | 1 |
| 66 | hail | 289256 | 2 |
| 47 | flood | 86081 | 3 |
| 90 | hurricane | 60978 | 4 |
| 181 | winter wh | 19698 | 5 |
| 150 | snow | 17636 | 6 |
| 101 | lightning | 15745 | 7 |
| 124 | rain | 12216 | 8 |
| 58 | funnel cloud | 6844 | 9 |
| 44 | fire | 4240 | 10 |
| 175 | water spout | 3855 | 11 |
| 12 | blizzard | 2719 | 12 |
| 69 | heat | 2648 | 13 |
| 31 | drought | 2488 | 14 |
| 94 | ice | 2101 | 15 |
| 55 | frost/freeze | 1343 | 16 |
| 28 | dense fog | 1293 | 17 |
| 22 | cold | 899 | 18 |
| 157 | tropical | 757 | 19 |
| 81 | high surf | 734 | 20 |
| 148 | slide | 646 | 21 |
| 48 | fog | 538 | 22 |
tev <- tev[order(tev$times, decreasing = T), ]
tevs <- tev[1:7, ]
barplot(tevs$times, names.arg = tevs$event, ylab = "Number of occurrencies",
main = "Most frequent wheater events")
We create a table (data.frame) that summarize the health info (fatalities and injures).
# summary health info
df3HA <- tapply(df3$FATALITIES, df3$evtype2, sum)
df3HA <- data.frame(evtype2 = row.names(df3HA), FATALITIES = as.numeric(df3HA))
df3HB <- tapply(df3$INJURIES, df3$evtype2, sum)
df3HB <- data.frame(evtype2 = row.names(df3HB), INJURIES = as.numeric(df3HB))
# merge
df3H <- merge(df3HA, df3HB, by = "evtype2")
df3H <- data.frame(df3H, totalhealthindexT = df3H$FATALITIES + df3H$INJURIES)
df3H <- df3H[order(df3H$totalhealthindexT, df3H$FATALITIES, decreasing = T),
]
rm(df3HA, df3HB) # delete auxiliar dataframes
# head(df3H)
By the same way we crate a table (data.frame) thats summarize the economics costs (crops and properties)
# Summary of economic costs: Corps (C) and Properties (P)
df3EC <- tapply(df3$CROPDMG_d, df3$evtype2, sum)
df3EC <- data.frame(evtype2 = row.names(df3EC), CROPDMG_d = as.numeric(df3EC))
df3EP <- tapply(df3$PROPDMG_d, df3$evtype2, sum)
df3EP <- data.frame(evtype2 = row.names(df3EP), PROPDMG_d = as.numeric(df3EP))
# merge
df3E <- merge(df3EC, df3EP, by = "evtype2")
# order it
df3E <- data.frame(df3E, totalcost = df3E$CROPDMG_d + df3E$PROPDMG_d)
df3E <- df3E[order(df3E$totalcost, df3E$PROPDMG_d, decreasing = T), ]
rm(Df3EC, df3EP) # delete aux objects
## Warning: objeto 'Df3EC' no encontrado
# a table to summarize all costs: health and economics
df4 <- merge(df3H, df3E, by = "evtype2")
df4 <- df4[order(df4$totalhealthindexT, df4$totalcost, decreasing = T), ]
For graphing purposes we will convert the dollars in millions of dollars by dividing by 1,000,000.
df3E$CROPDMG_d <- df3E$CROPDMG_d/1e+06
df3E$PROPDMG_d <- df3E$PROPDMG_d/1e+06
df3E$totalcost <- df3E$totalcost/1e+06
# names(df3H) 'evtype2' 'FATALITIES' 'INJURIES' 'totalhealthindexT'
# names(df3E) 'evtype2' 'CROPDMG_d' 'PROPDMG_d' 'totalcost' names(df4)
# 'evtype2' 'FATALITIES' 'INJURIES' 'totalhealthindexT' 'CROPDMG_d'
# 'PROPDMG_d' 'totalcost'
ne <- 10
df3H <- df3H[1:ne, ]
df3E <- df3E[1:ne, ]
We found that the events that are most harmful with respect to population health are the “tornado”, followed by “wind”, “heat”, “floods” and “lightning”.
Here we offer a more detailed information of the 10 events with higher health cost
# df3H
xdf3H <- xtable(df3H)
print(xdf3H, type = "html")
| evtype2 | FATALITIES | INJURIES | totalhealthindexT | |
|---|---|---|---|---|
| 90 | hurricane | 5796.00 | 92734.00 | 98530.00 |
| 180 | wind | 1421.00 | 11483.00 | 12904.00 |
| 69 | heat | 3138.00 | 9224.00 | 12362.00 |
| 47 | flood | 1552.00 | 8683.00 | 10235.00 |
| 101 | lightning | 816.00 | 5230.00 | 6046.00 |
| 94 | ice | 97.00 | 2152.00 | 2249.00 |
| 181 | winter wh | 279.00 | 1968.00 | 2247.00 |
| 44 | fire | 90.00 | 1608.00 | 1698.00 |
| 66 | hail | 15.00 | 1371.00 | 1386.00 |
| 150 | snow | 157.00 | 1121.00 | 1278.00 |
We also conclude that the most expensive weather event, in economics terms, are the “flood”, “hurricane”.
Here we offer a more detailed information of the 10 events with higher Economic cost
# df3E
xdf3E <- xtable(df3E)
print(xdf3E, type = "html")
| evtype2 | CROPDMG_d | PROPDMG_d | totalcost | |
|---|---|---|---|---|
| 47 | flood | 12274.96 | 167560.11 | 179835.07 |
| 90 | hurricane | 5932.75 | 143349.28 | 149282.03 |
| 152 | storm surge | 0.01 | 43323.54 | 43323.54 |
| 66 | hail | 3021.89 | 15974.05 | 18995.94 |
| 180 | wind | 2141.80 | 15982.50 | 18124.30 |
| 31 | drought | 13972.57 | 1046.11 | 15018.67 |
| 94 | ice | 5022.11 | 3958.50 | 8980.62 |
| 44 | fire | 403.28 | 8501.63 | 8904.91 |
| 157 | tropical | 694.90 | 7716.13 | 8411.02 |
| 181 | winter wh | 47.44 | 6777.31 | 6824.75 |
layout(matrix(c(1, 2, 3, 3), 2, 2, byrow = TRUE))
plot(x = log(df3H$FATALITIES), log(df3H$INJURIES), asp = F, type = "p", pch = "*",
main = "Relation of Fatities vs Injures.\n Main 10 Weather Event.", xlab = "Fatalities (log scale)",
ylab = "Injures (log scale)")
for (i in 1:10) {
text(log(df3H$FATALITIES[i]), log(df3H$INJURIES[i]), df3H$evtype2[i], cex = 0.8,
pos = 3)
}
# plotting-crop
plot(df3E$PROPDMG_d, df3E$CROPDMG_d, asp = F, type = "p", pch = "*", main = "Relation of Property Damage vs Crop Damage ($).\n Main 10 Weather Event.",
xlab = "Property Damage (Million of $)", ylab = "Crop Damage (Million of $)")
for (i in 1:10) {
text(df3E$PROPDMG_d[i], df3E$CROPDMG_d[i], df3E$evtype2[i], cex = 0.8, pos = 3)
}
# plotting-all
df4 <- df4[1:10, ]
plot(log(df4$totalhealthindexT), df4$totalcost, asp = F, type = "p", pch = "*",
main = "Relation of Health impact vs Total econommic cost.\n Main 10 Weather Event.",
xlab = "Health Impact (log scale)", ylab = "Total Cost in Millions of $")
for (i in 1:10) {
text(log(df4$totalhealthindexT[i]), df4$totalcost[i], df4$evtype2[i], cex = 0.8,
pos = 3)
}
par(mfrow = c(2, 1))
df4 <- df4[order(df4$totalcost, decreasing = T), ]
df4s <- df4[1:7, ]
barplot(df4s$totalcost, names.arg = df4s$evtype2, ylab = "Total Cost in millions od $",
main = "Even types with higher Economic cost")
# health
df4 <- df4[order(df4$totalhealthindexT, decreasing = T), ]
df4s <- df4[1:7, ]
barplot(df4s$totalhealthindexT, names.arg = df4s$evtype2, ylab = "Healt impact",
main = "Even types with higher health total impact")
sessionInfo()
## R version 3.1.0 beta (2014-03-28 r65330)
## Platform: i686-pc-linux-gnu (32-bit)
##
## locale:
## [1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_ES.UTF-8 LC_COLLATE=es_ES.UTF-8
## [5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=es_ES.UTF-8
## [7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] xtable_1.7-3 knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.4 evaluate_0.5.3 formatR_0.10 stringr_0.6.2
## [5] tools_3.1.0