This is Course Project 2: Economic and Human Cost of Severe Weather. I analyze which types of weather events have the greatest human and economic cost.
Since the data is online, first we have to download it and load it
# Download the compressed storm data set (only when it is not already present)
# and load it into `data`.
# All paths are relative to the current working directory. The former
# hard-coded setwd() to a machine-specific folder was dropped so that the
# analysis can be reproduced on any machine.
URL <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- "./repdata-data-StormData.csv.bz2"
if (!file.exists(destfile)) {
  # No file.create() placeholder: an empty file left behind by a failed
  # download would make file.exists() TRUE and skip the download on every
  # later run. For the same reason, whatever was written is removed on error.
  tryCatch(
    # mode = "wb" requests a binary transfer so the archive is not altered.
    download.file(URL, destfile, mode = "wb"),
    error = function(e) {
      unlink(destfile)
      stop("Could not download ", URL, ": ", conditionMessage(e),
           call. = FALSE)
    }
  )
}
# Read the CSV through a bzip2 connection.
data <- read.csv(bzfile(destfile), header = TRUE, sep = ",")
The data set has many columns, so we keep only the ones we need. Damage amounts are stored as a value plus a magnitude code (K, M, B, ...) that must be converted to a numeric multiplier before amounts can be compared. We also add injuries and fatalities into a single total, and do the same for crop and property damage:
# Keep only the columns used in the analysis.
data_tmp <- data[c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

# Translate a vector of magnitude codes into numeric multipliers.
# Steps, in order: the first "-", "?", "+", "H"/"h" or digit 1-9 found in a
# code is replaced by "0"; an empty code becomes "0"; a code that is exactly
# "K", "M" or "B" (any case) becomes 1000, 1000000 or 1000000000. The result
# is then converted with as.numeric().
# NOTE(review): a single-character "H" or digit code therefore ends up with a
# multiplier of 0, so damage recorded with those codes adds nothing to the
# totals - confirm this is intended.
exp_to_multiplier <- function(exp_code) {
  multiplier <- sub("[-?+H123456789]", "0", exp_code, ignore.case = TRUE)
  multiplier <- sub("^$", "0", multiplier)
  magnitude <- c("^K$" = "1000", "^M$" = "1000000", "^B$" = "1000000000")
  for (pattern in names(magnitude)) {
    multiplier <- sub(pattern, magnitude[[pattern]], multiplier,
                      ignore.case = TRUE)
  }
  as.numeric(multiplier)
}

# Property damage: replace the code column by its multiplier, then scale.
data_tmp[["PROPDMGEXP"]] <- exp_to_multiplier(data_tmp[["PROPDMGEXP"]])
data_tmp[["PROPDMGVAL"]] <- data_tmp[["PROPDMG"]] * data_tmp[["PROPDMGEXP"]]

# Crop damage: same treatment.
data_tmp[["CROPDMGEXP"]] <- exp_to_multiplier(data_tmp[["CROPDMGEXP"]])
data_tmp[["CROPDMGVAL"]] <- data_tmp[["CROPDMG"]] * data_tmp[["CROPDMGEXP"]]

# Combined economic damage and combined number of people harmed.
data_tmp[["TOTDMGVAL"]] <- data_tmp[["CROPDMGVAL"]] + data_tmp[["PROPDMGVAL"]]
data_tmp[["harmed"]] <- data_tmp[["INJURIES"]] + data_tmp[["FATALITIES"]]
Now we will sum columnwise by type of event, getting a much smaller data frame
library(plyr)
# One row per event type: every numeric column of data_tmp is summed within
# each EVTYPE group.
data2 <- ddply(
  .data = data_tmp,
  .variables = "EVTYPE",
  .fun = numcolwise(sum)
)
Now we will analyze some of the results. Which type of events cause more human damage?
# First six event types when sorted by fatalities, largest first.
head(
  data2[order(-data2[["FATALITIES"]]),
        c("EVTYPE", "FATALITIES", "INJURIES", "harmed")],
  n = 6L
)
## EVTYPE FATALITIES INJURIES harmed
## 834 TORNADO 5633 91346 96979
## 130 EXCESSIVE HEAT 1903 6525 8428
## 153 FLASH FLOOD 978 1777 2755
## 275 HEAT 937 2100 3037
## 464 LIGHTNING 816 5230 6046
## 856 TSTM WIND 504 6957 7461
# First six event types when sorted by injuries, largest first.
head(
  data2[order(-data2[["INJURIES"]]),
        c("EVTYPE", "FATALITIES", "INJURIES", "harmed")],
  n = 6L
)
## EVTYPE FATALITIES INJURIES harmed
## 834 TORNADO 5633 91346 96979
## 856 TSTM WIND 504 6957 7461
## 170 FLOOD 470 6789 7259
## 130 EXCESSIVE HEAT 1903 6525 8428
## 464 LIGHTNING 816 5230 6046
## 275 HEAT 937 2100 3037
# First six event types when sorted by injuries plus fatalities, largest first.
head(
  data2[order(-data2[["harmed"]]),
        c("EVTYPE", "FATALITIES", "INJURIES", "harmed")],
  n = 6L
)
## EVTYPE FATALITIES INJURIES harmed
## 834 TORNADO 5633 91346 96979
## 130 EXCESSIVE HEAT 1903 6525 8428
## 856 TSTM WIND 504 6957 7461
## 170 FLOOD 470 6789 7259
## 464 LIGHTNING 816 5230 6046
## 275 HEAT 937 2100 3037
The ranking of the other events changes, but tornadoes come first in all three cases.
Now we analyze the economic cost:
# First six event types when sorted by total (crop + property) damage,
# largest first.
head(
  data2[order(-data2[["TOTDMGVAL"]]),
        c("EVTYPE", "TOTDMGVAL", "CROPDMGVAL", "PROPDMGVAL")],
  n = 6L
)
## EVTYPE TOTDMGVAL CROPDMGVAL PROPDMGVAL
## 170 FLOOD 150319678250 5661968450 144657709800
## 411 HURRICANE/TYPHOON 71913712800 2607872800 69305840000
## 834 TORNADO 57352113590 414953110 56937160480
## 670 STORM SURGE 43323541000 5000 43323536000
## 244 HAIL 18758221170 3025954450 15732266720
## 153 FLASH FLOOD 17562128610 1421317100 16140811510
# First six event types when sorted by crop damage, largest first.
head(
  data2[order(-data2[["CROPDMGVAL"]]),
        c("EVTYPE", "TOTDMGVAL", "CROPDMGVAL", "PROPDMGVAL")],
  n = 6L
)
## EVTYPE TOTDMGVAL CROPDMGVAL PROPDMGVAL
## 95 DROUGHT 15018672000 13972566000 1046106000
## 170 FLOOD 150319678250 5661968450 144657709800
## 590 RIVER FLOOD 10148404500 5029459000 5118945500
## 427 ICE STORM 8967041310 5022113500 3944927810
## 244 HAIL 18758221170 3025954450 15732266720
## 402 HURRICANE 14610229010 2741910000 11868319010
# First six event types when sorted by property damage, largest first.
head(
  data2[order(-data2[["PROPDMGVAL"]]),
        c("EVTYPE", "TOTDMGVAL", "CROPDMGVAL", "PROPDMGVAL")],
  n = 6L
)
## EVTYPE TOTDMGVAL CROPDMGVAL PROPDMGVAL
## 170 FLOOD 150319678250 5661968450 144657709800
## 411 HURRICANE/TYPHOON 71913712800 2607872800 69305840000
## 834 TORNADO 57352113590 414953110 56937160480
## 670 STORM SURGE 43323541000 5000 43323536000
## 153 FLASH FLOOD 17562128610 1421317100 16140811510
## 244 HAIL 18758221170 3025954450 15732266720
Here we see that floods cause the most total damage and the most property damage, while drought causes the most crop damage.
Let’s plot the Human costs:
library(tidyr)
# Sort the per-event totals so the most harmful event types come first.
data2 <- data2[order(-data2[["harmed"]]), ]
# Reshape to long format: the harmed, INJURIES and FATALITIES columns are
# stacked into a `condition` key column (kept as a factor) and a `harmed`
# value column.
data_long <- gather(
  data2,
  key = condition,
  value = harmed,
  c(harmed, INJURIES, FATALITIES),
  factor_key = TRUE
)
library(ggplot2)
# Dodged bar chart of the human cost per event type.
# - reorder(EVTYPE, -harmed) orders the x axis by decreasing value.
# - geom_col() takes the bar heights directly from y; it has no `stat`
#   argument, so the former stat = "identity" (which only produced an
#   "Ignoring unknown parameters: stat" warning) is dropped.
# - coord_cartesian(xlim = c(1, 10)) zooms on the first ten x positions
#   without removing any rows from the data.
ggplot(
  data = data_long,
  aes(x = reorder(EVTYPE, -harmed), y = harmed)
) +
  geom_col(aes(fill = condition), position = position_dodge()) +
  coord_cartesian(xlim = c(1, 10)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3)) +
  guides(fill = guide_legend(title = "events")) +
  xlab("Event type") +
  ylab("Fatalities + Injuries") +
  ggtitle("Total human cost by event type")
## Warning: Ignoring unknown parameters: stat
#scale_x_continuous(limits=c(0,100))