An analysis of the consequences of weather events on Health and Economics costs across United States.

Synopsis

We analyze the storm database from the U.S. National Oceanic and Atmospheric Administration's (NOAA) in order to identify (1) which types of events are most harmful with respect to population health and (2) which types of events have the greatest economic consequences. To achieve our goal we consider the economic indicators as corp and properties cost in million of dollars and the sum of fatalities and injuries. We conclude that the weather event that more health impact has is the “tornado”, followed by “wind”, “heat”, “floods” and “lightning”. We also conclude that the most expensive weather event, in economics terms, are the “flood”, “hurricane”.

Data Processing

Load the data

# filepath<-'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
# # 2014-06-19 download.file(filepath,
# destfile=paste(getwd(),'/','Storm_Data.bz2',sep=''), method='wget') con <-
# unz(Storm_Data.bz2, 'Storm_Data.cvs') rawdata <-
# read.csv(bzfile('Storm_Data.bz2')) the download and unzip takes to much
# time so we place in the working directory the unziped csv file
df <- read.csv("repdata_data_StormData.csv")

Cleaning and exploration of dataset

Recode the even types to avoid case problems.

df$EVTYPE <- factor(toupper(df$EVTYPE))
df$PROPDMGEXP <- factor(toupper(df$PROPDMGEXP))
df$CROPDMGEXP <- factor(toupper(df$CROPDMGEXP))

The dataset is very big and there are only a few variables that have some interest for our purposes so we will filter it to get a smaller set of variables. We will use to filter the information we found in the document NCDC Storm Events-FAQ Page.pdf.

df2 <- df[, c(8, 23:28)]
str(df2)

## 'data.frame':    902297 obs. of  7 variables:
##  $ EVTYPE    : Factor w/ 898 levels "?","ABNORMALLY DRY",..: 754 754 754 754 754 754 754 754 754 754 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 17 levels "","-","?","+",..: 16 16 16 16 16 16 16 16 16 16 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 7 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...

names(df2)

## [1] "EVTYPE"     "FATALITIES" "INJURIES"   "PROPDMG"    "PROPDMGEXP"
## [6] "CROPDMG"    "CROPDMGEXP"

Some of the variables are relative to health: FATALITIES, INJURIES, and some to Crops and properties damages: PROPDMG (property damage), PROPDMGEXP, CROPDMG (Crops damages), CROPDMGEXP. The variables 'PROPDMGEXP,PROPDMGEXP, are the exponet of the PROPDMG and CROPDMG respectively.

table(df2$PROPDMGEXP)

## 
##             -      ?      +      0      1      2      3      4      5 
## 465934      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      H      K      M 
##      4      5      1     40      7 424665  11337

table(df2$CROPDMGEXP)

## 
##             ?      0      2      B      K      M 
## 618413      7     19      1      9 281853   1995

We understand that some of the exponents have no meaning or are bad coded, so we will discard them: “-”, “?”. We just take those observations with valid exponents.

validexp <- c("", "0", "H", "K", "M", "B")
df3 <- subset(df2, (CROPDMGEXP %in% validexp) & (PROPDMGEXP %in% validexp))
df3$PROPDMGEXP <- factor(df3$PROPDMGEXP)
df3$CROPDMGEXP <- factor(df3$CROPDMGEXP)

Create new variables that represnet the PROPDMG and CROPDMG in dollars.

table(df3$PROPDMGEXP)

## 
##             0      B      H      K      M 
## 465931    216     40      7 424661  11336

table(df3$CROPDMGEXP)

## 
##             0      B      K      M 
## 618318     19      9 281851   1994

# 'H'=10^2, 'K'=10^3,'M'=10^6, 'B'=10^9

# PROP
df3$PROPDMG_m <- 1
df3$PROPDMG_m[df3$PROPDMGEXP == "H"] <- 10^2
df3$PROPDMG_m[df3$PROPDMGEXP == "K"] <- 10^3
df3$PROPDMG_m[df3$PROPDMGEXP == "M"] <- 10^6
df3$PROPDMG_m[df3$PROPDMGEXP == "B"] <- 10^9
# crop
df3$CROPDMG_m <- 1
df3$CROPDMG_m[df3$CROPDMGEXP == "H"] <- 10^2
df3$CROPDMG_m[df3$CROPDMGEXP == "K"] <- 10^3
df3$CROPDMG_m[df3$CROPDMGEXP == "M"] <- 10^6
df3$CROPDMG_m[df3$CROPDMGEXP == "B"] <- 10^9

df3 <- transform(df3, PROPDMG_d = PROPDMG * PROPDMG_m, CROPDMG_d = CROPDMG * 
    CROPDMG_m)
# head(df3)
df3 <- df3[, -c(4, 5, 6, 7, 8, 9)]  # drop unusefull variables 4 clearness

Analysis

This is clear that we should do some recoding in the variable EVTYPE because there are very much some codes that means the same thing, as for example “winter wheater” and “winter mix”. So by means of some exploration we consider to recode some of them.

df3 <- cbind(df3, evtype2 = df3$EVTYPE)  # create a new variable for the new evtype codes, this must be a character to use grep
df3$evtype2 <- tolower(as.character(df3$evtype2))
# str(df3)

# table(df3$evtype2[grep('wint', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("wint", df3$evtype2, ignore.case = T)] <- "winter wh"

# table(df3$evtype2[grep('heat', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("heat", df3$evtype2, ignore.case = T)] <- "heat"

# table(df3$evtype2[grep('summary', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("summary", df3$evtype2, ignore.case = T)] <- "summary"

# table(df3$evtype2[grep('fire', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("fire", df3$evtype2, ignore.case = T)] <- "fire"

df3$evtype2[grep("hurricane", df3$evtype2, ignore.case = T)] <- "hurricane"


df3$evtype2[grep("volcan", df3$evtype2, ignore.case = T)] <- "volcanic"

df3$evtype2[grep("tornado", df3$evtype2, ignore.case = T)] <- "tornado"
df3$evtype2[grep("torndao", df3$evtype2, ignore.case = T)] <- "tornado"
df3$evtype2[grep("tornado", df3$evtype2, ignore.case = T)] <- "hurricane"  # hurricana are the same that tornado

df3$evtype2[grep("wind", df3$evtype2, ignore.case = T)] <- "wind"
df3$evtype2[grep("rain", df3$evtype2, ignore.case = T)] <- "rain"
df3$evtype2[grep("snow", df3$evtype2, ignore.case = T)] <- "snow"
df3$evtype2[grep("flood", df3$evtype2, ignore.case = T)] <- "flood"
df3$evtype2[grep("stream", df3$evtype2, ignore.case = T)] <- "flood"

# table(df3$evtype2[grep('fld', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("fld", df3$evtype2, ignore.case = T)] <- "flood"

# table(df3$evtype2[grep('tropic', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("tropic", df3$evtype2, ignore.case = T)] <- "tropical"

# table(df3$evtype2[grep('warm', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("warm", df3$evtype2, ignore.case = T)] <- "warm wh"
df3$evtype2[grep("hot", df3$evtype2, ignore.case = T)] <- "warm wh"

# table(df3$evtype2[grep('cold', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("cold", df3$evtype2, ignore.case = T)] <- "cold"

# table(df3$evtype2[grep('water', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("water", df3$evtype2, ignore.case = T)] <- "water spout"

# table(df3$evtype2[grep('hail', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("hail", df3$evtype2, ignore.case = T)] <- "hail"

# table(df3$evtype2[grep('ice', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("ice", df3$evtype2, ignore.case = T)] <- "ice"

# table(df3$evtype2[grep('dry', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("dry", df3$evtype2, ignore.case = T)] <- "dryness"

# table(df3$evtype2[grep('burst', df3$evtype2, ignore.case = T)])
df3$evtype2[grep("burst", df3$evtype2, ignore.case = T)] <- "burst"

df3$evtype2[grep("thunder", df3$evtype2, ignore.case = T)] <- "thunderstormw"
df3$evtype2[grep("slide", df3$evtype2, ignore.case = T)] <- "slide"

# table(df3$evtype2[grep('temperature', df3$evtype2, ignore.case = T)])

After this recoding we will focus in the most frequent event types. Here there are the events that occurs more than 500 times.

t <- table(df3$evtype2)
# table(df3$evtype2)

tev <- as.data.frame(t)
names(tev) <- c("event", "times")
tev$rank <- rank(-tev$times, ties.method = "min")
tev <- tev[order(tev$rank, decreasing = F), ]
tev <- tev[tev$times > 500, ]

# xtev
xtev <- xtable(tev)
print(xtev, type = "html")

	event	times	rank
180	wind	364848	1
66	hail	289256	2
47	flood	86081	3
90	hurricane	60978	4
181	winter wh	19698	5
150	snow	17636	6
101	lightning	15745	7
124	rain	12216	8
58	funnel cloud	6844	9
44	fire	4240	10
175	water spout	3855	11
12	blizzard	2719	12
69	heat	2648	13
31	drought	2488	14
94	ice	2101	15
55	frost/freeze	1343	16
28	dense fog	1293	17
22	cold	899	18
157	tropical	757	19
81	high surf	734	20
148	slide	646	21
48	fog	538	22

tev <- tev[order(tev$times, decreasing = T), ]
tevs <- tev[1:7, ]
barplot(tevs$times, names.arg = tevs$event, ylab = "Number of occurrencies", 
    main = "Most frequent wheater events")

plot of chunk plott-tev

We create a table (data.frame) that summarize the health info (fatalities and injures).

# summary health info
df3HA <- tapply(df3$FATALITIES, df3$evtype2, sum)
df3HA <- data.frame(evtype2 = row.names(df3HA), FATALITIES = as.numeric(df3HA))

df3HB <- tapply(df3$INJURIES, df3$evtype2, sum)
df3HB <- data.frame(evtype2 = row.names(df3HB), INJURIES = as.numeric(df3HB))

# merge
df3H <- merge(df3HA, df3HB, by = "evtype2")

df3H <- data.frame(df3H, totalhealthindexT = df3H$FATALITIES + df3H$INJURIES)
df3H <- df3H[order(df3H$totalhealthindexT, df3H$FATALITIES, decreasing = T), 
    ]

rm(df3HA, df3HB)  # delete auxiliar dataframes
# head(df3H)

By the same way we crate a table (data.frame) thats summarize the economics costs (crops and properties)

# Summary of economic costs: Corps (C) and Properties (P)
df3EC <- tapply(df3$CROPDMG_d, df3$evtype2, sum)
df3EC <- data.frame(evtype2 = row.names(df3EC), CROPDMG_d = as.numeric(df3EC))

df3EP <- tapply(df3$PROPDMG_d, df3$evtype2, sum)
df3EP <- data.frame(evtype2 = row.names(df3EP), PROPDMG_d = as.numeric(df3EP))

# merge
df3E <- merge(df3EC, df3EP, by = "evtype2")

# order it
df3E <- data.frame(df3E, totalcost = df3E$CROPDMG_d + df3E$PROPDMG_d)
df3E <- df3E[order(df3E$totalcost, df3E$PROPDMG_d, decreasing = T), ]

rm(Df3EC, df3EP)  # delete aux objects

## Warning: objeto 'Df3EC' no encontrado


# a table to summarize all costs: health and economics
df4 <- merge(df3H, df3E, by = "evtype2")
df4 <- df4[order(df4$totalhealthindexT, df4$totalcost, decreasing = T), ]

For graphing purposes we will convert the dollars in millions of dollars by dividing by 1,000,000.

df3E$CROPDMG_d <- df3E$CROPDMG_d/1e+06
df3E$PROPDMG_d <- df3E$PROPDMG_d/1e+06
df3E$totalcost <- df3E$totalcost/1e+06
# names(df3H) 'evtype2' 'FATALITIES' 'INJURIES' 'totalhealthindexT'
# names(df3E) 'evtype2' 'CROPDMG_d' 'PROPDMG_d' 'totalcost' names(df4)
# 'evtype2' 'FATALITIES' 'INJURIES' 'totalhealthindexT' 'CROPDMG_d'
# 'PROPDMG_d' 'totalcost'

ne <- 10
df3H <- df3H[1:ne, ]
df3E <- df3E[1:ne, ]

Results

We found that the events that are most harmful with respect to population health are the “tornado”, followed by “wind”, “heat”, “floods” and “lightning”.

Here we offer a more detailed information of the 10 events with higher health cost

# df3H
xdf3H <- xtable(df3H)
print(xdf3H, type = "html")

	evtype2	FATALITIES	INJURIES	totalhealthindexT
90	hurricane	5796.00	92734.00	98530.00
180	wind	1421.00	11483.00	12904.00
69	heat	3138.00	9224.00	12362.00
47	flood	1552.00	8683.00	10235.00
101	lightning	816.00	5230.00	6046.00
94	ice	97.00	2152.00	2249.00
181	winter wh	279.00	1968.00	2247.00
44	fire	90.00	1608.00	1698.00
66	hail	15.00	1371.00	1386.00
150	snow	157.00	1121.00	1278.00

We also conclude that the most expensive weather event, in economics terms, are the “flood”, “hurricane”.

Here we offer a more detailed information of the 10 events with higher Economic cost

# df3E
xdf3E <- xtable(df3E)
print(xdf3E, type = "html")

	evtype2	CROPDMG_d	PROPDMG_d	totalcost
47	flood	12274.96	167560.11	179835.07
90	hurricane	5932.75	143349.28	149282.03
152	storm surge	0.01	43323.54	43323.54
66	hail	3021.89	15974.05	18995.94
180	wind	2141.80	15982.50	18124.30
31	drought	13972.57	1046.11	15018.67
94	ice	5022.11	3958.50	8980.62
44	fire	403.28	8501.63	8904.91
157	tropical	694.90	7716.13	8411.02
181	winter wh	47.44	6777.31	6824.75

layout(matrix(c(1, 2, 3, 3), 2, 2, byrow = TRUE))
plot(x = log(df3H$FATALITIES), log(df3H$INJURIES), asp = F, type = "p", pch = "*", 
    main = "Relation of Fatities vs Injures.\n Main 10 Weather Event.", xlab = "Fatalities (log scale)", 
    ylab = "Injures (log scale)")
for (i in 1:10) {
    text(log(df3H$FATALITIES[i]), log(df3H$INJURIES[i]), df3H$evtype2[i], cex = 0.8, 
        pos = 3)
}
# plotting-crop
plot(df3E$PROPDMG_d, df3E$CROPDMG_d, asp = F, type = "p", pch = "*", main = "Relation of Property Damage vs Crop Damage ($).\n Main 10 Weather Event.", 
    xlab = "Property Damage (Million of $)", ylab = "Crop Damage (Million of $)")
for (i in 1:10) {
    text(df3E$PROPDMG_d[i], df3E$CROPDMG_d[i], df3E$evtype2[i], cex = 0.8, pos = 3)
}
# plotting-all
df4 <- df4[1:10, ]
plot(log(df4$totalhealthindexT), df4$totalcost, asp = F, type = "p", pch = "*", 
    main = "Relation of Health impact vs Total econommic cost.\n Main 10 Weather Event.", 
    xlab = "Health Impact (log scale)", ylab = "Total Cost in Millions of $")
for (i in 1:10) {
    text(log(df4$totalhealthindexT[i]), df4$totalcost[i], df4$evtype2[i], cex = 0.8, 
        pos = 3)
}

plot of chunk plotting-health-crop

par(mfrow = c(2, 1))
df4 <- df4[order(df4$totalcost, decreasing = T), ]
df4s <- df4[1:7, ]
barplot(df4s$totalcost, names.arg = df4s$evtype2, ylab = "Total Cost in millions od $", 
    main = "Even types with higher Economic cost")
# health
df4 <- df4[order(df4$totalhealthindexT, decreasing = T), ]
df4s <- df4[1:7, ]
barplot(df4s$totalhealthindexT, names.arg = df4s$evtype2, ylab = "Healt impact", 
    main = "Even types with higher health total impact")

plot of chunk barplots

sessionInfo()

## R version 3.1.0 beta (2014-03-28 r65330)
## Platform: i686-pc-linux-gnu (32-bit)
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=es_ES.UTF-8    
##  [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=es_ES.UTF-8   
##  [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] xtable_1.7-3 knitr_1.5   
## 
## loaded via a namespace (and not attached):
## [1] digest_0.6.4   evaluate_0.5.3 formatR_0.10   stringr_0.6.2 
## [5] tools_3.1.0