# Analysis of Harmful and Damaging Events in Storm Data
First, we need to read the data from the data file and set up the plotting environment by loading the "ggplot2" library.
library(ggplot2)
# Work from the project directory so that the relative path below resolves.
setwd("~/Desktop/my_documents/coursera/fall_2015/reproducible_research/project_2")
# Load the comma-separated storm data file (first row holds the column names)
# into the data frame used by the rest of the analysis.
data <- read.csv(
  "./repdata-data-StormData.csv",
  header = TRUE,
  sep = ","
)
Since the data contain no explicit dollar amounts for property and crop damage, we need to add two new columns to the data that hold this information.
# Translate the magnitude codes stored in the *DMGEXP columns into numeric
# multipliers.
#
#   code: vector (character or factor) of magnitude codes.
#
# Returns a numeric vector, the same length as `code`:
#   ""  -> 1, "K" -> 1e3, "M" -> 1e6, "B" -> 1e9 (matched case-insensitively,
#   so "k"/"m"/"b" are treated like their upper-case forms).
# Any other code (including NA) maps to 0, so a record whose magnitude cannot
# be determined adds nothing to the totals. It never yields a negative damage
# amount, which would distort the sums and produce NaN on a log10 axis.
exp.multiplier <- function(code) {
  known.codes <- c("", "K", "M", "B")
  known.values <- c(1, 1e+03, 1e+06, 1e+09)
  # match() returns NA for codes that are not in the lookup table.
  multiplier <- known.values[match(toupper(as.character(code)), known.codes)]
  multiplier[is.na(multiplier)] <- 0
  multiplier
}

# Damage amounts: the reported mantissa scaled by its magnitude code.
data$prop.damage <- data$PROPDMG * exp.multiplier(data$PROPDMGEXP)
data$crop.damage <- data$CROPDMG * exp.multiplier(data$CROPDMGEXP)
In this section we analyze the events that are most harmful to population health. Based on the information provided by the data, we divide the analysis into two parts: an analysis of fatalities and an analysis of injuries.
# Fatality statistics per event type, ranked by total fatalities.

# Group the per-record fatality counts by event type.
s.fatal <- split(data$FATALITIES, as.factor(data$EVTYPE))

# Total and average fatalities for every event type.
fatal.sum <- sapply(s.fatal, sum)
fatal.mean <- sapply(s.fatal, mean)

# One ranking (largest total first) is applied to both statistics so the
# columns of the summary stay aligned.
fatal.rank <- order(fatal.sum, decreasing = TRUE)
fatal.sum.sorted <- fatal.sum[fatal.rank]
fatal.mean.sorted <- fatal.mean[fatal.rank]
event.fatal.sorted <- names(fatal.sum.sorted)

summary.fatal <- data.frame(
  event.type = event.fatal.sorted,
  total.fatal.number = fatal.sum.sorted,
  fatal.per.event = fatal.mean.sorted,
  row.names = NULL
)
Using the code above, we can see the ten most harmful events in terms of fatalities in the table below:
# Show the first ten rows of the ranked fatality summary.
print(summary.fatal[seq_len(10), ])
## event.type total.fatal.number fatal.per.event
## 1 TORNADO 5633 0.092874
## 2 EXCESSIVE HEAT 1903 1.134088
## 3 FLASH FLOOD 978 0.018019
## 4 HEAT 937 1.221643
## 5 LIGHTNING 816 0.051796
## 6 TSTM WIND 504 0.002292
## 7 FLOOD 470 0.018558
## 8 RIP CURRENT 368 0.782979
## 9 HIGH WIND 248 0.012270
## 10 AVALANCHE 224 0.580311
We can also plot the distribution:
# Keep only the records whose event type is one of the first ten rows of the
# fatality summary.
data.fatal.top10 <- data[data$EVTYPE %in% summary.fatal[1:10, 1], ]

# Box plot of per-record fatalities by event type on a log10 y axis.
# Records with zero fatalities have no finite log10 value, so ggplot2 drops
# them and emits a warning.
ggplot(data.fatal.top10, aes(x = as.factor(EVTYPE), y = FATALITIES)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Fatalities for the most Harmful Events",
    y = "log10 of the number of fatals",
    x = "Event Type"
  ) +
  scale_y_log10() +
  theme_bw()
## Warning: Removed 394320 rows containing non-finite values (stat_boxplot).
# Injury statistics per event type, ranked by total injuries.

# Group the per-record injury counts by event type.
s.injur <- split(data$INJURIES, as.factor(data$EVTYPE))

# Total and average injuries for every event type.
injur.sum <- sapply(s.injur, sum)
injur.mean <- sapply(s.injur, mean)

# Rank by total injuries (largest first) and reuse that ranking for the
# means so both columns describe the same event type in each row.
injur.rank <- order(injur.sum, decreasing = TRUE)
injur.sum.sorted <- injur.sum[injur.rank]
injur.mean.sorted <- injur.mean[injur.rank]
event.injur.sorted <- names(injur.sum.sorted)

summary.injur <- data.frame(
  event.type = event.injur.sorted,
  total.injury.number = injur.sum.sorted,
  injuries.per.event = injur.mean.sorted,
  row.names = NULL
)
Using the code above, we can see the ten most harmful events in terms of injuries in the table below:
# Show the first ten rows of the ranked injury summary.
print(summary.injur[seq_len(10), ])
## event.type total.injury.number injuries.per.event
## 1 TORNADO 91346 1.506067
## 2 TSTM WIND 6957 0.031631
## 3 FLOOD 6789 0.268064
## 4 EXCESSIVE HEAT 6525 3.888558
## 5 LIGHTNING 5230 0.331979
## 6 HEAT 2100 2.737940
## 7 ICE STORM 1975 0.984546
## 8 FLASH FLOOD 1777 0.032739
## 9 THUNDERSTORM WIND 1488 0.018023
## 10 HAIL 1361 0.004715
We can also plot the distribution:
# Keep only the records whose event type is one of the first ten rows of the
# injury summary.
data.injur.top10 <- data[data$EVTYPE %in% summary.injur[1:10, 1], ]

# Box plot of per-record injuries by event type on a log10 y axis.
# Records with zero injuries have no finite log10 value, so ggplot2 drops
# them and emits a warning.
ggplot(data.injur.top10, aes(x = as.factor(EVTYPE), y = INJURIES)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Injuries for the most Harmful Events",
    y = "log10 of the number of injuries",
    x = "Event Type"
  ) +
  scale_y_log10() +
  theme_bw()
## Warning: Removed 736740 rows containing non-finite values (stat_boxplot).
In this section we analyze the events that cause the greatest economic damage. Based on the information provided by the data, we divide the analysis into two parts: an analysis of property damage and an analysis of crop damage.
# Property damage statistics per event type, ranked by total damage.

# Group the per-record property damage amounts by event type.
s.prop <- split(data$prop.damage, as.factor(data$EVTYPE))

# Total and average property damage for every event type.
prop.sum <- sapply(s.prop, sum)
prop.mean <- sapply(s.prop, mean)

# Rank by total damage (largest first); the means follow the same ranking.
prop.rank <- order(prop.sum, decreasing = TRUE)
prop.sum.sorted <- prop.sum[prop.rank]
prop.mean.sorted <- prop.mean[prop.rank]
event.prop.sorted <- names(prop.sum.sorted)

summary.prop <- data.frame(
  event.type = event.prop.sorted,
  total.prop.damage.number = prop.sum.sorted,
  prop.damage.per.event = prop.mean.sorted,
  row.names = NULL
)
By the code above, we can see the ten most harmful events according to property damage in the table below:
# Show the first ten rows of the ranked property damage summary.
print(summary.prop[seq_len(10), ])
## event.type total.prop.damage.number prop.damage.per.event
## 1 FLOOD 1.447e+11 5711826
## 2 HURRICANE/TYPHOON 6.931e+10 787566364
## 3 TORNADO 5.693e+10 938562
## 4 STORM SURGE 4.332e+10 165990559
## 5 FLASH FLOOD 1.614e+10 297378
## 6 HAIL 1.573e+10 54484
## 7 HURRICANE 1.187e+10 68208730
## 8 TROPICAL STORM 7.704e+09 11165059
## 9 WINTER STORM 6.688e+09 585017
## 10 HIGH WIND 5.270e+09 260738
We can also plot the distribution:
# Keep only the records whose event type is one of the first ten rows of the
# property damage summary.
data.prop.top10 <- data[data$EVTYPE %in% summary.prop[1:10, 1], ]

# Box plot of per-record property damage by event type on a log10 y axis.
# Values without a finite log10 (zero or negative amounts) are dropped by
# ggplot2 with a warning.
ggplot(data.prop.top10, aes(x = as.factor(EVTYPE), y = prop.damage)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Properties Damage for the most Harmful Events",
    y = "log10 of the number of properties damage",
    x = "Event Type"
  ) +
  theme_bw() +
  scale_y_log10()
## Warning: NaNs produced
## Warning: Removed 361979 rows containing non-finite values (stat_boxplot).
# Crop damage statistics per event type, ranked by total damage.

# Group the per-record crop damage amounts by event type.
s.crop <- split(data$crop.damage, as.factor(data$EVTYPE))

# Total and average crop damage for every event type.
crop.sum <- sapply(s.crop, sum)
crop.mean <- sapply(s.crop, mean)

# Rank by total damage (largest first); the means follow the same ranking.
crop.rank <- order(crop.sum, decreasing = TRUE)
crop.sum.sorted <- crop.sum[crop.rank]
crop.mean.sorted <- crop.mean[crop.rank]
event.crop.sorted <- names(crop.sum.sorted)

summary.crop <- data.frame(
  event.type = event.crop.sorted,
  total.crop.damage.number = crop.sum.sorted,
  crop.damage.per.event = crop.mean.sorted,
  row.names = NULL
)
Using the code above, we can see the ten most harmful events in terms of crop damage in the table below:
# Show the first ten rows of the ranked crop damage summary.
print(summary.crop[seq_len(10), ])
## event.type total.crop.damage.number crop.damage.per.event
## 1 DROUGHT 1.397e+10 5615983
## 2 FLOOD 5.662e+09 223563
## 3 RIVER FLOOD 5.029e+09 29072017
## 4 ICE STORM 5.022e+09 2503546
## 5 HAIL 3.026e+09 10481
## 6 HURRICANE 2.742e+09 15758103
## 7 HURRICANE/TYPHOON 2.608e+09 29634918
## 8 FLASH FLOOD 1.421e+09 26186
## 9 EXTREME COLD 1.293e+09 1974005
## 10 FROST/FREEZE 1.094e+09 815265
We can also plot the distribution:
# Keep only the records whose event type is one of the first ten rows of the
# crop damage summary.
data.crop.top10 <- data[data$EVTYPE %in% summary.crop[1:10, 1], ]

# Box plot of per-record crop damage by event type on a log10 y axis.
# Values without a finite log10 (zero or negative amounts) are dropped by
# ggplot2 with a warning.
ggplot(data.crop.top10, aes(x = as.factor(EVTYPE), y = crop.damage)) +
  geom_boxplot() +
  labs(
    title = "Distribution of crops Damage for the most Harmful Events",
    y = "log10 of the number of crops damage",
    x = "Event Type"
  ) +
  theme_bw() +
  scale_y_log10()
## Warning: NaNs produced
## Warning: Removed 361433 rows containing non-finite values (stat_boxplot).