This report tries to answer two questions: 1. Across the United States, which types of events are most harmful with respect to population health? 2. Across the United States, which types of events have the greatest economic consequences? The data used to address these questions come from the U.S. National Oceanic and Atmospheric Administration (NOAA) storm database.
The data were downloaded from the "Storm Data" link. The file loaded here is a 535 MB CSV file that was extracted from a 47 MB BZ2 file.
# Switch every locale category to the portable "C" locale before loading data.
Sys.setlocale("LC_ALL", "C")
## [1] "C"
# ggplot2 is attached here; the plots visible in this report use base barplot().
library("ggplot2")
# Load the storm-event table; the first row of the CSV holds the column names.
data <- read.csv(file = "stormData.csv", header = TRUE)
Check the column names.
# List the column names of the loaded table (for a data frame, names() returns
# the same character vector as colnames()).
names(data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Extract the date, event type, fatalities, injuries, property damage, and crop damage columns. Convert the date to a tidy format.
# Keep only the columns needed for the analysis. They are selected by name
# (same columns, same order as positions 2, 8, 23:28 of the raw table) so the
# extraction does not silently pick the wrong fields if the column order of
# the CSV ever differs.
Edata <- data[, c(
  "BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

# BGN_DATE is parsed as month/day/year followed by a time of day; as.Date()
# keeps only the calendar date.
Edata$BGN_DATE <- as.Date(Edata$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")

# Force the count and damage columns to numeric. Going through as.character()
# first makes the conversion use the printed value even if a column was read
# in as a factor.
for (numeric_col in c("FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")) {
  Edata[[numeric_col]] <- as.numeric(as.character(Edata[[numeric_col]]))
}

# Event type is used as a plain text grouping key.
Edata$EVTYPE <- as.character(Edata$EVTYPE)
Calculate the total casualties by event type (EVTYPE).
# Per-event-type totals: one table for deaths, one for injuries.
tmp1 <- aggregate(FATALITIES ~ EVTYPE, Edata, sum)
tmp2 <- aggregate(INJURIES ~ EVTYPE, Edata, sum)

# Join the two tables on the event type and add a combined casualty count
# (fatalities plus injuries).
datapopsum <- merge(tmp1, tmp2, by = "EVTYPE")
datapopsum$CASUALTIES <- with(datapopsum, FATALITIES + INJURIES)
Order datapopsum by casualties and extract the top 10.
# Rank event types from most to fewest casualties. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
datapopsum <- datapopsum[order(datapopsum$CASUALTIES, decreasing = TRUE), ]

# head() returns at most 10 rows, so a table with fewer than 10 event types
# is not padded with all-NA rows the way datapopsum[1:10, ] would be.
datapopsumtop10 <- head(datapopsum, 10)
Create a bar chart of casualties by event type. We conclude that tornadoes are the most harmful event type with respect to population health.
# Bar chart of the ten event types with the highest casualty totals:
# one bar per event type, bar height = fatalities + injuries.
barplot(
  height = datapopsumtop10[["CASUALTIES"]],
  names.arg = datapopsumtop10[["EVTYPE"]],
  xlab = "Events",
  ylab = "Casualties"
)
Extract the data about economic damage.
# Economic-damage subset of Edata: date, event type, and the property/crop
# damage values with their magnitude codes. Columns are picked by name (same
# columns and order as positions 1, 2, 5:8) so the selection does not depend
# on where they sit in Edata.
dataeconomy <- Edata[, c(
  "BGN_DATE", "EVTYPE",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]
In columns “PROPDMGEXP” and “CROPDMGEXP”, “K” stands for thousand, “M” stands for million, and “B” stands for billion. We focus on events with damage in the millions or more, i.e. those coded “M” or “B”. Extract those data.
# Keep only records whose property OR crop damage is coded in millions ("M")
# or billions ("B"). %in% never yields NA, so a missing code simply fails the
# test instead of producing an all-NA row in the subset as `==` would.
dataeconomy <- dataeconomy[
  dataeconomy$PROPDMGEXP %in% c("M", "B") |
    dataeconomy$CROPDMGEXP %in% c("M", "B"),
]
Calculate the actual values of PROPDMG and CROPDMG by applying the multipliers given in the EXP columns.
# Convert PROPDMG and CROPDMG from "value + magnitude code" to plain numbers.
# A record is kept for this analysis when just ONE of its two codes is "M" or
# "B", so the other column can still carry a "K"; that column is scaled too,
# otherwise a thousands figure would later be added to the total as if it
# were already in units. Codes are matched case-insensitively. Any other code
# (blank, digits, symbols) leaves the value unchanged, as before.
dmg_codes <- c("K", "M", "B")
dmg_multipliers <- c(1e3, 1e6, 1e9)

# Vectorised lookup instead of row-by-row loops: same result for "M"/"B",
# far faster on a large table, and safe when dataeconomy has zero rows (the
# previous 1:nrow() loops would have iterated over c(1, 0) and failed).
prop_mult <- dmg_multipliers[
  match(toupper(as.character(dataeconomy$PROPDMGEXP)), dmg_codes)
]
prop_mult[is.na(prop_mult)] <- 1 # unrecognised or missing code: no scaling
dataeconomy$PROPDMG <- dataeconomy$PROPDMG * prop_mult

crop_mult <- dmg_multipliers[
  match(toupper(as.character(dataeconomy$CROPDMGEXP)), dmg_codes)
]
crop_mult[is.na(crop_mult)] <- 1 # unrecognised or missing code: no scaling
dataeconomy$CROPDMG <- dataeconomy$CROPDMG * crop_mult
Calculate the total damage (property plus crop) for each record.
# Per-record total damage: property damage plus crop damage.
dataeconomy[["alldmg"]] <- with(dataeconomy, PROPDMG + CROPDMG)
Calculate the total damage by event type (EVTYPE).
# Sum the per-record total damage within each event type.
dataecosum <- aggregate(alldmg ~ EVTYPE, dataeconomy, sum)
Order dataecosum by alldmg and extract the top 10.
# Rank event types from highest to lowest total damage. TRUE is spelled out
# because the shorthand T is an ordinary variable that can be reassigned.
dataecosum <- dataecosum[order(dataecosum$alldmg, decreasing = TRUE), ]

# head() returns at most 10 rows, so a table with fewer than 10 event types
# is not padded with all-NA rows the way dataecosum[1:10, ] would be.
dataecosumtop10 <- head(dataecosum, 10)
Create a bar chart of total damage (alldmg) by event type. We conclude that floods have the greatest economic consequences.
# Bar chart of the ten costliest event types. alldmg is the sum of property
# and crop damage, so the y-axis is labelled as total damage rather than
# property damage only.
barplot(
  dataecosumtop10$alldmg,
  names.arg = dataecosumtop10$EVTYPE,
  xlab = "Events",
  ylab = "Total Damage (Property + Crop)"
)