An analysis of the consequences of weather events on Health and Economics costs across United States.

Synopsis

We analyze the storm database from the U.S. National Oceanic and Atmospheric Administration's (NOAA) in order to identify (1) which types of events are most harmful with respect to population health and (2) which types of events have the greatest economic consequences. To achieve our goal we consider the economic indicators as corp and properties cost in million of dollars and the sum of fatalities and injuries. We conclude that the weather event that more health impact has is the “tornado”, followed by “wind”, “heat”, “floods” and “lightning”. We also conclude that the most expensive weather event, in economics terms, are the “flood”, “hurricane”.

Data Processing

Load the data

# filepath<-'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
# # 2014-06-19 download.file(filepath,
# destfile=paste(getwd(),'/','Storm_Data.bz2',sep=''), method='wget') con <-
# unz(Storm_Data.bz2, 'Storm_Data.cvs') rawdata <-
# read.csv(bzfile('Storm_Data.bz2')) the download and unzip takes to much
# time so we place in the working directory the unziped csv file
df <- read.csv("repdata_data_StormData.csv")

Cleaning and exploration of dataset

Recode the even types to avoid case problems.

df$EVTYPE <- factor(toupper(df$EVTYPE))
df$PROPDMGEXP <- factor(toupper(df$PROPDMGEXP))
df$CROPDMGEXP <- factor(toupper(df$CROPDMGEXP))

The dataset is very big and there are only a few variables that have some interest for our purposes so we will filter it to get a smaller set of variables. We will use to filter the information we found in the document NCDC Storm Events-FAQ Page.pdf.

df2 <- df[, c(8, 23:28)]
str(df2)
## 'data.frame':    902297 obs. of  7 variables:
##  $ EVTYPE    : Factor w/ 898 levels "?","ABNORMALLY DRY",..: 754 754 754 754 754 754 754 754 754 754 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 17 levels "","-","?","+",..: 16 16 16 16 16 16 16 16 16 16 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 7 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
names(df2)
## [1] "EVTYPE"     "FATALITIES" "INJURIES"   "PROPDMG"    "PROPDMGEXP"
## [6] "CROPDMG"    "CROPDMGEXP"

Some of the variables are relative to health: FATALITIES, INJURIES, and some to Crops and properties damages: PROPDMG (property damage), PROPDMGEXP, CROPDMG (Crops damages), CROPDMGEXP. The variables 'PROPDMGEXP,PROPDMGEXP, are the exponet of the PROPDMG and CROPDMG respectively.

table(df2$PROPDMGEXP)
## 
##             -      ?      +      0      1      2      3      4      5 
## 465934      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      H      K      M 
##      4      5      1     40      7 424665  11337
table(df2$CROPDMGEXP)
## 
##             ?      0      2      B      K      M 
## 618413      7     19      1      9 281853   1995

We understand that some of the exponents have no meaning or are bad coded, so we will discard them: “-”, “?”. We just take those observations with valid exponents.

validexp <- c("", "0", "H", "K", "M", "B")
df3 <- subset(df2, (CROPDMGEXP %in% validexp) & (PROPDMGEXP %in% validexp))
df3$PROPDMGEXP <- factor(df3$PROPDMGEXP)
df3$CROPDMGEXP <- factor(df3$CROPDMGEXP)

Create new variables that represnet the PROPDMG and CROPDMG in dollars.

table(df3$PROPDMGEXP)
## 
##             0      B      H      K      M 
## 465931    216     40      7 424661  11336
table(df3$CROPDMGEXP)
## 
##             0      B      K      M 
## 618318     19      9 281851   1994
# 'H'=10^2, 'K'=10^3,'M'=10^6, 'B'=10^9

# PROP
df3$PROPDMG_m <- 1
df3$PROPDMG_m[df3$PROPDMGEXP == "H"] <- 10^2
df3$PROPDMG_m[df3$PROPDMGEXP == "K"] <- 10^3
df3$PROPDMG_m[df3$PROPDMGEXP == "M"] <- 10^6
df3$PROPDMG_m[df3$PROPDMGEXP == "B"] <- 10^9
# crop
df3$CROPDMG_m <- 1
df3$CROPDMG_m[df3$CROPDMGEXP == "H"] <- 10^2
df3$CROPDMG_m[df3$CROPDMGEXP == "K"] <- 10^3
df3$CROPDMG_m[df3$CROPDMGEXP == "M"] <- 10^6
df3$CROPDMG_m[df3$CROPDMGEXP == "B"] <- 10^9

df3 <- transform(df3, PROPDMG_d = PROPDMG * PROPDMG_m, CROPDMG_d = CROPDMG * 
    CROPDMG_m)
# head(df3)
df3 <- df3[, -c(4, 5, 6, 7, 8, 9)]  # drop unusefull variables 4 clearness

Analysis

This is clear that we should do some recoding in the variable EVTYPE because there are very much some codes that means the same thing, as for example “winter wheater” and “winter mix”. So by means of some exploration we consider to recode some of them.

df3 <- cbind(df3, evtype2 = df3$EVTYPE)  # create a new variable for the new evtype codes, this must be a character to use grep
df3$evtype2 <- tolower(as.character(df3$evtype2))
# str(df3)

# table(df3$evtype2[grep('wint', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("wint", df3$evtype2, ignore.case = T)] <- "winter wh"

# table(df3$evtype2[grep('heat', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("heat", df3$evtype2, ignore.case = T)] <- "heat"

# table(df3$evtype2[grep('summary', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("summary", df3$evtype2, ignore.case = T)] <- "summary"

# table(df3$evtype2[grep('fire', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("fire", df3$evtype2, ignore.case = T)] <- "fire"

df3$evtype2[grep("hurricane", df3$evtype2, ignore.case = T)] <- "hurricane"


df3$evtype2[grep("volcan", df3$evtype2, ignore.case = T)] <- "volcanic"

df3$evtype2[grep("tornado", df3$evtype2, ignore.case = T)] <- "tornado"
df3$evtype2[grep("torndao", df3$evtype2, ignore.case = T)] <- "tornado"
df3$evtype2[grep("tornado", df3$evtype2, ignore.case = T)] <- "hurricane"  # hurricana are the same that tornado

df3$evtype2[grep("wind", df3$evtype2, ignore.case = T)] <- "wind"
df3$evtype2[grep("rain", df3$evtype2, ignore.case = T)] <- "rain"
df3$evtype2[grep("snow", df3$evtype2, ignore.case = T)] <- "snow"
df3$evtype2[grep("flood", df3$evtype2, ignore.case = T)] <- "flood"
df3$evtype2[grep("stream", df3$evtype2, ignore.case = T)] <- "flood"

# table(df3$evtype2[grep('fld', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("fld", df3$evtype2, ignore.case = T)] <- "flood"

# table(df3$evtype2[grep('tropic', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("tropic", df3$evtype2, ignore.case = T)] <- "tropical"

# table(df3$evtype2[grep('warm', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("warm", df3$evtype2, ignore.case = T)] <- "warm wh"
df3$evtype2[grep("hot", df3$evtype2, ignore.case = T)] <- "warm wh"

# table(df3$evtype2[grep('cold', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("cold", df3$evtype2, ignore.case = T)] <- "cold"

# table(df3$evtype2[grep('water', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("water", df3$evtype2, ignore.case = T)] <- "water spout"

# table(df3$evtype2[grep('hail', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("hail", df3$evtype2, ignore.case = T)] <- "hail"

# table(df3$evtype2[grep('ice', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("ice", df3$evtype2, ignore.case = T)] <- "ice"

# table(df3$evtype2[grep('dry', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("dry", df3$evtype2, ignore.case = T)] <- "dryness"

# table(df3$evtype2[grep('burst', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("burst", df3$evtype2, ignore.case = T)] <- "burst"

df3$evtype2[grep("thunder", df3$evtype2, ignore.case = T)] <- "thunderstormw"
df3$evtype2[grep("slide", df3$evtype2, ignore.case = T)] <- "slide"

# table(df3$evtype2[grep('temperature', df3$evtype2, ignore.case = T)])

After this recoding we will focus in the most frequent event types. Here there are the events that occurs more than 500 times.

t <- table(df3$evtype2)
# table(df3$evtype2)

tev <- as.data.frame(t)
names(tev) <- c("event", "times")
tev$rank <- rank(-tev$times, ties.method = "min")
tev <- tev[order(tev$rank, decreasing = F), ]
tev <- tev[tev$times > 500, ]

# xtev
xtev <- xtable(tev)
print(xtev, type = "html")
event times rank
180 wind 364848 1
66 hail 289256 2
47 flood 86081 3
90 hurricane 60978 4
181 winter wh 19698 5
150 snow 17636 6
101 lightning 15745 7
124 rain 12216 8
58 funnel cloud 6844 9
44 fire 4240 10
175 water spout 3855 11
12 blizzard 2719 12
69 heat 2648 13
31 drought 2488 14
94 ice 2101 15
55 frost/freeze 1343 16
28 dense fog 1293 17
22 cold 899 18
157 tropical 757 19
81 high surf 734 20
148 slide 646 21
48 fog 538 22
tev <- tev[order(tev$times, decreasing = T), ]
tevs <- tev[1:7, ]
barplot(tevs$times, names.arg = tevs$event, ylab = "Number of occurrencies", 
    main = "Most frequent wheater events")

plot of chunk plott-tev

We create a table (data.frame) that summarize the health info (fatalities and injures).

# summary health info
df3HA <- tapply(df3$FATALITIES, df3$evtype2, sum)
df3HA <- data.frame(evtype2 = row.names(df3HA), FATALITIES = as.numeric(df3HA))

df3HB <- tapply(df3$INJURIES, df3$evtype2, sum)
df3HB <- data.frame(evtype2 = row.names(df3HB), INJURIES = as.numeric(df3HB))

# merge
df3H <- merge(df3HA, df3HB, by = "evtype2")

df3H <- data.frame(df3H, totalhealthindexT = df3H$FATALITIES + df3H$INJURIES)
df3H <- df3H[order(df3H$totalhealthindexT, df3H$FATALITIES, decreasing = T), 
    ]

rm(df3HA, df3HB)  # delete auxiliar dataframes
# head(df3H)

By the same way we crate a table (data.frame) thats summarize the economics costs (crops and properties)

# Summary of economic costs: Corps (C) and Properties (P)
df3EC <- tapply(df3$CROPDMG_d, df3$evtype2, sum)
df3EC <- data.frame(evtype2 = row.names(df3EC), CROPDMG_d = as.numeric(df3EC))

df3EP <- tapply(df3$PROPDMG_d, df3$evtype2, sum)
df3EP <- data.frame(evtype2 = row.names(df3EP), PROPDMG_d = as.numeric(df3EP))

# merge
df3E <- merge(df3EC, df3EP, by = "evtype2")

# order it
df3E <- data.frame(df3E, totalcost = df3E$CROPDMG_d + df3E$PROPDMG_d)
df3E <- df3E[order(df3E$totalcost, df3E$PROPDMG_d, decreasing = T), ]

rm(Df3EC, df3EP)  # delete aux objects
## Warning: objeto 'Df3EC' no encontrado

# a table to summarize all costs: health and economics
df4 <- merge(df3H, df3E, by = "evtype2")
df4 <- df4[order(df4$totalhealthindexT, df4$totalcost, decreasing = T), ]

For graphing purposes we will convert the dollars in millions of dollars by dividing by 1,000,000.

df3E$CROPDMG_d <- df3E$CROPDMG_d/1e+06
df3E$PROPDMG_d <- df3E$PROPDMG_d/1e+06
df3E$totalcost <- df3E$totalcost/1e+06
# names(df3H) 'evtype2' 'FATALITIES' 'INJURIES' 'totalhealthindexT'
# names(df3E) 'evtype2' 'CROPDMG_d' 'PROPDMG_d' 'totalcost' names(df4)
# 'evtype2' 'FATALITIES' 'INJURIES' 'totalhealthindexT' 'CROPDMG_d'
# 'PROPDMG_d' 'totalcost'
ne <- 10
df3H <- df3H[1:ne, ]
df3E <- df3E[1:ne, ]

Results

We found that the events that are most harmful with respect to population health are the “tornado”, followed by “wind”, “heat”, “floods” and “lightning”.

Here we offer a more detailed information of the 10 events with higher health cost

# df3H
xdf3H <- xtable(df3H)
print(xdf3H, type = "html")
evtype2 FATALITIES INJURIES totalhealthindexT
90 hurricane 5796.00 92734.00 98530.00
180 wind 1421.00 11483.00 12904.00
69 heat 3138.00 9224.00 12362.00
47 flood 1552.00 8683.00 10235.00
101 lightning 816.00 5230.00 6046.00
94 ice 97.00 2152.00 2249.00
181 winter wh 279.00 1968.00 2247.00
44 fire 90.00 1608.00 1698.00
66 hail 15.00 1371.00 1386.00
150 snow 157.00 1121.00 1278.00

We also conclude that the most expensive weather event, in economics terms, are the “flood”, “hurricane”.

Here we offer a more detailed information of the 10 events with higher Economic cost

# df3E
xdf3E <- xtable(df3E)
print(xdf3E, type = "html")
evtype2 CROPDMG_d PROPDMG_d totalcost
47 flood 12274.96 167560.11 179835.07
90 hurricane 5932.75 143349.28 149282.03
152 storm surge 0.01 43323.54 43323.54
66 hail 3021.89 15974.05 18995.94
180 wind 2141.80 15982.50 18124.30
31 drought 13972.57 1046.11 15018.67
94 ice 5022.11 3958.50 8980.62
44 fire 403.28 8501.63 8904.91
157 tropical 694.90 7716.13 8411.02
181 winter wh 47.44 6777.31 6824.75
layout(matrix(c(1, 2, 3, 3), 2, 2, byrow = TRUE))
plot(x = log(df3H$FATALITIES), log(df3H$INJURIES), asp = F, type = "p", pch = "*", 
    main = "Relation of Fatities vs Injures.\n Main 10 Weather Event.", xlab = "Fatalities (log scale)", 
    ylab = "Injures (log scale)")
for (i in 1:10) {
    text(log(df3H$FATALITIES[i]), log(df3H$INJURIES[i]), df3H$evtype2[i], cex = 0.8, 
        pos = 3)
}
# plotting-crop
plot(df3E$PROPDMG_d, df3E$CROPDMG_d, asp = F, type = "p", pch = "*", main = "Relation of Property Damage vs Crop Damage ($).\n Main 10 Weather Event.", 
    xlab = "Property Damage (Million of $)", ylab = "Crop Damage (Million of $)")
for (i in 1:10) {
    text(df3E$PROPDMG_d[i], df3E$CROPDMG_d[i], df3E$evtype2[i], cex = 0.8, pos = 3)
}
# plotting-all
df4 <- df4[1:10, ]
plot(log(df4$totalhealthindexT), df4$totalcost, asp = F, type = "p", pch = "*", 
    main = "Relation of Health impact vs Total econommic cost.\n Main 10 Weather Event.", 
    xlab = "Health Impact (log scale)", ylab = "Total Cost in Millions of $")
for (i in 1:10) {
    text(log(df4$totalhealthindexT[i]), df4$totalcost[i], df4$evtype2[i], cex = 0.8, 
        pos = 3)
}

plot of chunk plotting-health-crop

par(mfrow = c(2, 1))
df4 <- df4[order(df4$totalcost, decreasing = T), ]
df4s <- df4[1:7, ]
barplot(df4s$totalcost, names.arg = df4s$evtype2, ylab = "Total Cost in millions od $", 
    main = "Even types with higher Economic cost")
# health
df4 <- df4[order(df4$totalhealthindexT, decreasing = T), ]
df4s <- df4[1:7, ]
barplot(df4s$totalhealthindexT, names.arg = df4s$evtype2, ylab = "Healt impact", 
    main = "Even types with higher health total impact")

plot of chunk barplots

sessionInfo()
## R version 3.1.0 beta (2014-03-28 r65330)
## Platform: i686-pc-linux-gnu (32-bit)
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=es_ES.UTF-8    
##  [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=es_ES.UTF-8   
##  [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] xtable_1.7-3 knitr_1.5   
## 
## loaded via a namespace (and not attached):
## [1] digest_0.6.4   evaluate_0.5.3 formatR_0.10   stringr_0.6.2 
## [5] tools_3.1.0