Introduction

This is Course Project 2: the economic and human cost of severe weather. The goal is to determine which types of weather events cause the greatest human and economic damage.

Since the data is hosted online, we first download it (if it is not already present locally) and then load it:

# Download the storm data archive (only if it is not already present) and
# load it into `data`.
#
# The file is stored relative to the current working directory; no setwd()
# call is made, so the report runs on any machine.
URL <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- "./repdata-data-StormData.csv.bz2"

if (!file.exists(destfile)) {
  # mode = "wb" keeps the bzip2 archive byte-exact on Windows, where the
  # default text mode would corrupt a binary download.
  status <- tryCatch(
    download.file(URL, destfile, mode = "wb"),
    error = function(e) {
      # Remove any partial file so the next run retries the download instead
      # of trying to read a truncated archive.
      unlink(destfile)
      stop(e)
    }
  )
  if (status != 0) {
    unlink(destfile)
    stop("Download of ", URL, " failed with status ", status, call. = FALSE)
  }
}

# read.csv decompresses on the fly through the bzfile connection.
data <- read.csv(bzfile(destfile), header = TRUE, sep = ",")

Transforming data

The data set has many columns, and the damage amounts are stored as a value plus a separate multiplier code (for example K, M or B) that must be converted to a number before amounts can be compared. We also add injuries and fatalities together into a single variable (`harmed`), and crop and property damage together into a total damage value:

# Keep only the columns needed for the human- and economic-cost analysis.
data_tmp <- data[c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP",
                   "CROPDMG", "CROPDMGEXP")]

# Convert a vector of damage-exponent codes into numeric multipliers.
#
# exp_code: character or factor vector of PROPDMGEXP / CROPDMGEXP codes.
# Returns a numeric vector of the same length:
#   K/k -> 1e3, M/m -> 1e6, B/b -> 1e9;
#   "", "-", "?", "+", H/h and the digits 0-9 -> 0 (this analysis treats
#     these codes as carrying no usable multiplier);
#   any other value (including NA) -> NA.
exp_to_multiplier <- function(exp_code) {
  code <- toupper(as.character(exp_code))
  multiplier <- rep(NA_real_, length(code))
  # %in% never yields NA, so NA codes simply stay NA in the result.
  multiplier[code %in% c("", "-", "?", "+", "H", as.character(0:9))] <- 0
  multiplier[code %in% "K"] <- 1e3
  multiplier[code %in% "M"] <- 1e6
  multiplier[code %in% "B"] <- 1e9
  multiplier
}

# Replace the exponent codes with their numeric multipliers and compute the
# damage values in dollars.
data_tmp$PROPDMGEXP <- exp_to_multiplier(data_tmp$PROPDMGEXP)
data_tmp$PROPDMGVAL <- data_tmp$PROPDMG * data_tmp$PROPDMGEXP

data_tmp$CROPDMGEXP <- exp_to_multiplier(data_tmp$CROPDMGEXP)
data_tmp$CROPDMGVAL <- data_tmp$CROPDMG * data_tmp$CROPDMGEXP

# Total economic damage = crop damage + property damage.
data_tmp$TOTDMGVAL <- data_tmp$CROPDMGVAL + data_tmp$PROPDMGVAL

# Total human cost = injuries + fatalities.
data_tmp$harmed <- data_tmp$INJURIES + data_tmp$FATALITIES

Now we sum every numeric column by event type, which gives a much smaller data frame with one row per event type:

library(plyr)
# Collapse to one row per EVTYPE, summing every numeric column.
data2 <- ddply(
  .data = data_tmp,
  .variables = "EVTYPE",
  .fun = numcolwise(sum)
)

Results

Now we analyze some of the results. Which types of events cause the most harm to people?

# First six event types when ranked by fatalities, highest first.
head(
  data2[order(-data2$FATALITIES),
        c("EVTYPE", "FATALITIES", "INJURIES", "harmed")]
)
##             EVTYPE FATALITIES INJURIES harmed
## 834        TORNADO       5633    91346  96979
## 130 EXCESSIVE HEAT       1903     6525   8428
## 153    FLASH FLOOD        978     1777   2755
## 275           HEAT        937     2100   3037
## 464      LIGHTNING        816     5230   6046
## 856      TSTM WIND        504     6957   7461
# First six event types when ranked by injuries, highest first.
head(
  data2[order(-data2$INJURIES),
        c("EVTYPE", "FATALITIES", "INJURIES", "harmed")]
)
##             EVTYPE FATALITIES INJURIES harmed
## 834        TORNADO       5633    91346  96979
## 856      TSTM WIND        504     6957   7461
## 170          FLOOD        470     6789   7259
## 130 EXCESSIVE HEAT       1903     6525   8428
## 464      LIGHTNING        816     5230   6046
## 275           HEAT        937     2100   3037
# First six event types when ranked by injuries + fatalities, highest first.
head(
  data2[order(-data2$harmed),
        c("EVTYPE", "FATALITIES", "INJURIES", "harmed")]
)
##             EVTYPE FATALITIES INJURIES harmed
## 834        TORNADO       5633    91346  96979
## 130 EXCESSIVE HEAT       1903     6525   8428
## 856      TSTM WIND        504     6957   7461
## 170          FLOOD        470     6789   7259
## 464      LIGHTNING        816     5230   6046
## 275           HEAT        937     2100   3037

The ordering of the other events changes between the three rankings, but tornadoes come first in all of them.

Now we analyze the economic cost:

# First six event types when ranked by total (crop + property) damage.
head(
  data2[order(-data2$TOTDMGVAL),
        c("EVTYPE", "TOTDMGVAL", "CROPDMGVAL", "PROPDMGVAL")]
)
##                EVTYPE    TOTDMGVAL CROPDMGVAL   PROPDMGVAL
## 170             FLOOD 150319678250 5661968450 144657709800
## 411 HURRICANE/TYPHOON  71913712800 2607872800  69305840000
## 834           TORNADO  57352113590  414953110  56937160480
## 670       STORM SURGE  43323541000       5000  43323536000
## 244              HAIL  18758221170 3025954450  15732266720
## 153       FLASH FLOOD  17562128610 1421317100  16140811510
# First six event types when ranked by crop damage.
head(
  data2[order(-data2$CROPDMGVAL),
        c("EVTYPE", "TOTDMGVAL", "CROPDMGVAL", "PROPDMGVAL")]
)
##          EVTYPE    TOTDMGVAL  CROPDMGVAL   PROPDMGVAL
## 95      DROUGHT  15018672000 13972566000   1046106000
## 170       FLOOD 150319678250  5661968450 144657709800
## 590 RIVER FLOOD  10148404500  5029459000   5118945500
## 427   ICE STORM   8967041310  5022113500   3944927810
## 244        HAIL  18758221170  3025954450  15732266720
## 402   HURRICANE  14610229010  2741910000  11868319010
# First six event types when ranked by property damage.
head(
  data2[order(-data2$PROPDMGVAL),
        c("EVTYPE", "TOTDMGVAL", "CROPDMGVAL", "PROPDMGVAL")]
)
##                EVTYPE    TOTDMGVAL CROPDMGVAL   PROPDMGVAL
## 170             FLOOD 150319678250 5661968450 144657709800
## 411 HURRICANE/TYPHOON  71913712800 2607872800  69305840000
## 834           TORNADO  57352113590  414953110  56937160480
## 670       STORM SURGE  43323541000       5000  43323536000
## 153       FLASH FLOOD  17562128610 1421317100  16140811510
## 244              HAIL  18758221170 3025954450  15732266720

Here we see that floods cause the most total damage and the most property damage, while drought causes the most crop damage.

Let’s plot the Human costs:

library(tidyr)
# Re-sort the per-event totals so the rows with the largest `harmed` come first.
data2 <- data2[order(-data2$harmed), ]
# Stack the harmed, INJURIES and FATALITIES columns into long form:
# `condition` names the source column (as a factor, in the order listed) and
# `harmed` holds the corresponding count.
data_long <- gather(
  data2,
  condition,
  harmed,
  c(harmed, INJURIES, FATALITIES),
  factor_key = TRUE
)

library(ggplot2)
# Dodged bar chart of the human cost per event type, one bar per measure in
# `condition`. Event types are ordered along the x axis by decreasing value
# via reorder(), and coord_cartesian() restricts the view to the first ten
# x positions.
ggplot(data = data_long, aes(x = reorder(EVTYPE, -harmed), y = harmed)) +
  # geom_col() always plots the values as given; it has no `stat` parameter,
  # so passing stat = "identity" only produced an "unknown parameters" warning.
  geom_col(aes(fill = condition), position = position_dodge()) +
  coord_cartesian(xlim = c(1, 10)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3)) +
  guides(fill = guide_legend(title = "events")) +
  xlab("Event type") + ylab("Fatalities + Injuries") +
  ggtitle("Total human cost by event type")

  #scale_x_continuous(limits=c(0,100))