In this report, I aim to describe the situation that different types of Storms and other severe weather events caused both public health and economic problems (both property and corp costs) for communities and municipalities from January 1950 to November 2011. My overall hypothesis is that tornado should be the most harmful event and cause the greatest econimic influence, because U.S.A is surrounded by the Pacific Ocean, Atlantic Ocean and the gulf of Mexico, wihch means there are loads of vapor flock to this land and easily generate the thunder clouds, forming severe tornado. To investigate this hypothesis, I fetch the NOAA storm database (1950 - 2011). From this dataset, I found that the tornado was indeed the most harmful event, because it caused highest fatalities (5633 times) and injuries (91346 times). However, tornado did not caused the greatest economic consequences ($57340613590, ranked the 4th), the flood was the event that caused the greatest financial costs ($150319678250, ranked 1st).
First, download the data from: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2 (use download.file() in R or just directly download it on your machine)
Then, zip the file you just dowloaded, and read the .csv file, the “StormData.csv”:
library(dplyr)
Sys.setlocale("LC_ALL","English")
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
datafile <- bzfile(description = "g:/RAssignment/StormData.csv.bz2","r")
stormdata <- read.csv(datafile,sep=",",stringsAsFactors = FALSE,header = TRUE)
close(datafile)
StormData <- tbl_df( data = stormdata)
dim(StormData)
## [1] 902297 37
head(StormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
mean(is.na(StormData))
## [1] 0.05229737
The NA value seems won’t bother the process of analysis, so just ingored it.
fatality <- StormData %>% group_by(EVTYPE) %>% summarise(FATALITIES_SUM = sum(FATALITIES,na.rm=TRUE))
injury <- StormData %>% group_by(EVTYPE) %>% summarise(INJURIES_SUM = sum(INJURIES,na.rm=TRUE))
HealthData <- data.frame(EVTYPE=injury$EVTYPE,FATALITIES = fatality$FATALITIES_SUM,INJURIES = injury$INJURIES_SUM)
HealthData <- HealthData %>% arrange( desc(FATALITIES),desc(INJURIES),EVTYPE)
PROPDMG_k <- StormData %>% filter(PROPDMGEXP == "K")%>% group_by(EVTYPE) %>% mutate(NEWPROPDMG = PROPDMG * 1000)
PROPDMG_M <- StormData %>% filter(PROPDMGEXP == "M")%>% group_by(EVTYPE) %>% mutate(NEWPROPDMG = PROPDMG * 1000000)
PROPDMG_B <- StormData %>% filter(PROPDMGEXP == "B")%>% group_by(EVTYPE) %>% mutate(NEWPROPDMG = PROPDMG * 1000000000)
PROPDMG <- rbind.data.frame(PROPDMG_k,PROPDMG_M,PROPDMG_B)
PROPDMGCost <- PROPDMG[,c(8,38)]%>%group_by(EVTYPE)%>%summarise(Cost = sum(NEWPROPDMG,na.rm=TRUE))%>%arrange(desc(Cost))
CROPDMG_k <- StormData %>% filter(CROPDMGEXP == "K")%>% group_by(EVTYPE) %>% mutate(NEWCROPDMG = CROPDMG * 1000)
CROPDMG_M <- StormData %>% filter(CROPDMGEXP == "M")%>% group_by(EVTYPE) %>% mutate(NEWCROPDMG = CROPDMG * 1000000)
CROPDMG_B <- StormData %>% filter(CROPDMGEXP == "B")%>% group_by(EVTYPE) %>% mutate(NEWCROPDMG = CROPDMG * 1000000000)
CROPDMG <- rbind.data.frame(CROPDMG_k,CROPDMG_M,CROPDMG_B)
CROPDMGCost <- CROPDMG[,c(8,38)]%>%group_by(EVTYPE)%>%summarise(Cost = sum(NEWCROPDMG,na.rm=TRUE))%>%arrange(desc(Cost))
ECONOMICDATA <-merge(PROPDMGCost,CROPDMGCost,by="EVTYPE",all = TRUE)
ECONOMICDATA[is.na(ECONOMICDATA)] <- 0
ECONOMICDATA <- mutate(ECONOMICDATA,SumCost = Cost.x +Cost.y)
fECONOMICDATA <- data.frame(EVTYPE = ECONOMICDATA$EVTYPE,SumCost = ECONOMICDATA$SumCost)
afECONOMICADATA <- arrange(fECONOMICDATA,desc(SumCost))
Plot a TOP 7 list of most harmful types of events:
library(ggplot2)
head(HealthData,10)
## EVTYPE FATALITIES INJURIES
## 1 TORNADO 5633 91346
## 2 EXCESSIVE HEAT 1903 6525
## 3 FLASH FLOOD 978 1777
## 4 HEAT 937 2100
## 5 LIGHTNING 816 5230
## 6 TSTM WIND 504 6957
## 7 FLOOD 470 6789
## 8 RIP CURRENT 368 232
## 9 HIGH WIND 248 1137
## 10 AVALANCHE 224 170
topseven <- HealthData[1:7,]
topseven <- transform(topseven,EVTYPE = as.factor(EVTYPE))
gtopseven <- ggplot(data=topseven,mapping = aes(EVTYPE,FATALITIES,fill=INJURIES))
gtopseven +geom_bar(stat = "identity")+theme(axis.text.x = element_text(angle=30, vjust=0.5))+labs(title="Top 7 Most Harmful Types of Events")
From the plot above, obviously the “TORNADO” was the most harmful event respect to popluation health because it caused the most number of fatalities and injuries.
Plot a TOP 7 list of types of events have the greatest economic consequences:
head(afECONOMICADATA,10)
## EVTYPE SumCost
## 1 FLOOD 150319678250
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57340613590
## 4 STORM SURGE 43323541000
## 5 HAIL 18752904170
## 6 FLASH FLOOD 17562128610
## 7 DROUGHT 15018672000
## 8 HURRICANE 14610229010
## 9 RIVER FLOOD 10148404500
## 10 ICE STORM 8967041310
ggplot(afECONOMICADATA[1:7,],aes(x=EVTYPE,y=SumCost,fill=SumCost))+geom_bar(stat = "identity")+theme(axis.text.x = element_text(angle=30, vjust=0.5))+labs(title="TOP 7 Events Have the Greatest Economic Consequences")
From the plot above, obviously the “FLOOD” has casued the greatest economic consequences because it cost the greates amount of money($150319678250).