We analyzed data from the NOAA Storm Database to investigate which kinds of events are most harmful with respect to (1) population health and (2) economic damage. We begin the data processing by computing the true dollar value of property and crop losses, tidying up the event names, and then summarizing by event type. In brief, tornadoes cause the most total harm to population health, while heat waves are the most harmful per event. In terms of economic damage, ice storms cause the most total economic damage in the US, while hurricanes are the most economically harmful per event.
We read in the data from the CSV file and select only the columns needed for the analysis: EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, and CROPDMGEXP.
suppressPackageStartupMessages(library(dplyr))
# Load the raw storm records and keep only the columns used by the analysis:
# the event type, the casualty counts, and the damage amounts together with
# their exponent codes.
# stringsAsFactors = FALSE makes the text columns plain character vectors on
# every R version (the default changed in R 4.0.0).
storm <- read.csv("StormData.csv", stringsAsFactors = FALSE)
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
storm <- as_tibble(storm)
subset <- select(
  storm,
  EVTYPE, FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP,
  CROPDMG, CROPDMGEXP
)
# Translate damage-exponent codes into numeric multipliers.
#
# exp_code: vector (character or factor) of exponent codes.
# Returns a numeric vector of the same length:
#   * blank, all-zero, or a single non-alphanumeric symbol -> 1
#   * a single digit d in 1..9                             -> 10^d
#   * H/h -> 100, K/k -> 1e3, M/m -> 1e6, B/b -> 1e9
#   * anything else (including NA)                         -> NA
exp_to_multiplier <- function(exp_code) {
  code <- toupper(as.character(exp_code))
  multiplier <- rep(NA_real_, length(code))

  # Missing or symbolic exponents carry no scaling information.
  no_scale <- grepl("^0*$", code) | grepl("^[^0-9A-Za-z]$", code)
  multiplier[no_scale] <- 1

  # A digit is a power of ten. This also covers "1" (-> 10), which must not
  # be confused with the multiplier 1 used for blank/symbolic codes.
  is_digit <- grepl("^[1-9]$", code)
  multiplier[is_digit] <- 10^as.numeric(code[is_digit])

  # Letter codes; H means hundreds (100).
  letter_scale <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  is_letter <- code %in% names(letter_scale)
  multiplier[is_letter] <- unname(letter_scale[code[is_letter]])

  multiplier
}

# Compute the dollar value of property and crop damage. Both columns go
# through the same code table so they are scaled consistently. The exponent
# columns are kept as character codes; only PROPVALUE and CROPVALUE carry the
# scaled amounts.
subset <- mutate(
  subset,
  PROPDMGEXP = as.character(PROPDMGEXP),
  CROPDMGEXP = as.character(CROPDMGEXP),
  PROPVALUE = exp_to_multiplier(PROPDMGEXP) * PROPDMG,
  CROPVALUE = exp_to_multiplier(CROPDMGEXP) * CROPDMG
)
# Collapse the free-text event labels into canonical event names.
# Each rule is pattern = replacement; the rules are applied one after another
# in the order listed, so a later rule sees the output of the earlier ones.
subset$EVTYPE <- as.character(subset$EVTYPE)

evtype_rules <- c(
  "^.*DROUGHT.*$"          = "DROUGHT",
  "^.*FLASH.*FLOOD.*"      = "FLASH FLOOD",
  "^ICE JAM.*"             = "FLASH FLOOD",
  "^SNOWMELT FLOODING$"    = "FLASH FLOOD",
  "^URBAN.*$"              = "FLOOD",
  "^RIVER.*$"              = "FLOOD",
  ".*HAIL.*"               = "HAIL",
  ".*HEAVY.*RAIN.*"        = "HEAVY RAIN",
  ".*HIGH.*WIND.*"         = "HIGH WIND",
  ".*HURRICANE.*"          = "HURRICANE (TYPHOON)",
  ".*SURGE.*"              = "STORM SURGE/TIDE",
  ".*TORNADO.*"            = "TORNADO",
  "^.*TROPICAL.*STORM.*$"  = "TROPICAL STORM",
  "^.*THUNDER.*WIND.*$"    = "THUNDERSTORM WIND",
  ".*TSTM.*"               = "THUNDERSTORM WIND",
  "^.*WINTER STORM.*$"     = "WINTER STORM",
  "^.*FIRE.*$"             = "WILDFIRE"
)

# Fold the rules over the label vector, left to right.
subset$EVTYPE <- Reduce(
  function(labels, pattern) gsub(pattern, evtype_rules[[pattern]], labels),
  names(evtype_rules),
  subset$EVTYPE
)
suppressPackageStartupMessages(library(dplyr))
# Group the records by event type; every summary below is per EVTYPE.
subset <- subset %>%
  mutate(EVTYPE = as.factor(EVTYPE)) %>%
  group_by(EVTYPE)

# Average casualties per recorded event, ranked by fatalities plus injuries.
fatality <- summarize(subset, FATALITIES = mean(FATALITIES))
injuries <- summarize(subset, INJURIES = mean(INJURIES))
avg_population <- as.data.frame(fatality)
avg_population$INJURIES <- injuries$INJURIES
avg_population <- arrange(avg_population, desc(FATALITIES + INJURIES))

# Total casualties per event type, ranked the same way.
fatality_total <- summarize(subset, FATALITIES = sum(FATALITIES))
injuries_total <- summarize(subset, INJURIES = sum(INJURIES))
total_population <- as.data.frame(fatality_total)
total_population$INJURIES <- injuries_total$INJURIES
total_population <- arrange(total_population, desc(FATALITIES + INJURIES))

# Average economic loss (property + crop) per recorded event, largest first.
econoloss <- subset %>%
  summarize(ECONOLOSS = mean(PROPVALUE + CROPVALUE)) %>%
  arrange(desc(ECONOLOSS))

# Total economic loss (property + crop) per event type, largest first.
econoloss_total <- subset %>%
  summarize(ECONOLOSS = sum(PROPVALUE + CROPVALUE)) %>%
  arrange(desc(ECONOLOSS))
We draw bar plots of the five events most harmful to population health: the left plot shows the total casualties by event type, and the right plot shows the average casualties per event. We can see that tornadoes cause the most total harm to population health, while heat waves are the most harmful per event. If we take fatalities into consideration, cold and snow might be the most harmful per event.
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
# Bar plots of the five event types with the highest casualties:
# totals on the left, per-event averages on the right.

# head() keeps at most five rows, so no NA rows are padded in when fewer than
# five event types exist.
draw_total_population <- head(total_population, 5)
# Fix the factor levels in ranking order so bars are drawn from the most to
# the least harmful event type.
draw_total_population$EVTYPE <- factor(
  as.character(draw_total_population$EVTYPE),
  levels = as.character(draw_total_population$EVTYPE)
)
# Long format: one row per (event type, casualty kind). Columns are selected
# by name and id.vars is explicit, so melt() has nothing to guess.
draw_total_population <- melt(
  draw_total_population[, c("EVTYPE", "FATALITIES", "INJURIES")],
  id.vars = "EVTYPE"
)
# ggplot() + geom_bar(stat = "identity") replaces qplot(..., stat = "identity"),
# whose stat argument is no longer supported since ggplot2 2.0.0.
plot1 <- ggplot(
  draw_total_population,
  aes(x = EVTYPE, y = value, fill = variable)
) +
  geom_bar(stat = "identity") +
  ylab("count") +
  ggtitle("total casualty by event")

draw_avg_population <- head(avg_population, 5)
draw_avg_population$EVTYPE <- factor(
  as.character(draw_avg_population$EVTYPE),
  levels = as.character(draw_avg_population$EVTYPE)
)
draw_avg_population <- melt(
  draw_avg_population[, c("EVTYPE", "FATALITIES", "INJURIES")],
  id.vars = "EVTYPE"
)
plot2 <- ggplot(
  draw_avg_population,
  aes(x = EVTYPE, y = value, fill = variable)
) +
  geom_bar(stat = "identity") +
  ylab("count") +
  ggtitle("average casualties per event")

# library() fails loudly if gridExtra is missing; require() would only
# return FALSE and let grid.arrange() fail later with a less clear error.
suppressPackageStartupMessages(library(gridExtra))
grid.arrange(plot1, plot2, ncol = 2)
We also draw bar plots of the total economic loss by event type and the average economic loss per event. We can see that ice storms cause the most total economic damage in the US, while hurricanes are the most economically harmful per event.
# Bar plots of the five event types with the highest economic loss:
# totals on the left, per-event averages on the right.

# head() keeps at most five rows, so no NA rows are padded in when fewer than
# five event types exist.
draw_total_economy <- as.data.frame(head(econoloss_total, 5))
# Fix the factor levels in ranking order so bars are drawn from the largest to
# the smallest loss.
draw_total_economy$EVTYPE <- factor(
  as.character(draw_total_economy$EVTYPE),
  levels = as.character(draw_total_economy$EVTYPE)
)
# Melt only the ECONOLOSS column. Selecting columns 1:3 here would also pull
# in a helper rank column, and its values would be stacked onto the bars.
draw_total_economy <- melt(
  draw_total_economy[, c("EVTYPE", "ECONOLOSS")],
  id.vars = "EVTYPE"
)
# ggplot() + geom_bar(stat = "identity") replaces qplot(..., stat = "identity"),
# whose stat argument is no longer supported since ggplot2 2.0.0.
plot1 <- ggplot(
  draw_total_economy,
  aes(x = EVTYPE, y = value, fill = EVTYPE)
) +
  geom_bar(stat = "identity") +
  ylab("US dollar") +
  ggtitle("total econological loss by event")

draw_avg_economy <- as.data.frame(head(econoloss, 5))
draw_avg_economy$EVTYPE <- factor(
  as.character(draw_avg_economy$EVTYPE),
  levels = as.character(draw_avg_economy$EVTYPE)
)
draw_avg_economy <- melt(
  draw_avg_economy[, c("EVTYPE", "ECONOLOSS")],
  id.vars = "EVTYPE"
)
plot2 <- ggplot(
  draw_avg_economy,
  aes(x = EVTYPE, y = value, fill = EVTYPE)
) +
  geom_bar(stat = "identity") +
  ylab("US dollar") +
  ggtitle("average econological lose per event")

# library() fails loudly if gridExtra is missing; require() would only
# return FALSE and let grid.arrange() fail later with a less clear error.
suppressPackageStartupMessages(library(gridExtra))
grid.arrange(plot1, plot2, ncol = 2)