This report tries to answer two questions: 1. Across the United States, which types of events are most harmful with respect to population health? 2. Across the United States, which types of events have the greatest economic consequences? The data used to address these questions come from the U.S. National Oceanic and Atmospheric Administration (NOAA).

Data Processing

The data can be downloaded from the Storm Data link. The file loaded here is a 535 MB CSV file that was extracted from a 47 MB BZ2 file.

# Switch every locale category to "C" (presumably so that parsing and sorting
# do not depend on the machine's regional settings).
Sys.setlocale("LC_ALL", "C")
## [1] "C"
library(ggplot2)
# Load the raw storm records; the first line of the CSV holds the column names.
data <- read.csv(file = "stormData.csv", header = TRUE)

Check the column names.

# List the column names of the raw data (console output echoed below).
names(data)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Extract the date, event type, fatalities, injuries, property damage, and crop damage columns. Convert the date to a tidy format.

# Working subset of the raw data: begin date, event type, the two casualty
# counts, and the damage amounts with their exponent codes. The column order
# matches positions 2, 8 and 23:28 of `data`.
keep_cols <- c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
               "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
Edata <- data[, keep_cols]

# The begin date is stored as "month/day/year hour:minute:second" text;
# keep only the calendar date.
Edata$BGN_DATE <- as.Date(Edata$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")

# Force the count and amount columns to plain numerics. Going through
# as.character() first also gives the right values if a column was read
# as a factor.
for (num_col in c("FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")) {
  Edata[[num_col]] <- as.numeric(as.character(Edata[[num_col]]))
}

# Event type is kept as plain text.
Edata$EVTYPE <- as.character(Edata$EVTYPE)

Storm Events and Population Health

Calculate the total casualties by event type.

# Total fatalities and total injuries per event type, computed separately ...
tmp1 <- aggregate(FATALITIES ~ EVTYPE, data = Edata, FUN = sum)
tmp2 <- aggregate(INJURIES ~ EVTYPE, data = Edata, FUN = sum)

# ... then joined on the event type into a single table.
datapopsum <- merge(tmp1, tmp2, by = "EVTYPE")

# Casualties = fatalities + injuries.
datapopsum[["CASUALTIES"]] <- with(datapopsum, FATALITIES + INJURIES)

Order datapopsum by casualties and extract the top 10.

# Rank event types from most to fewest total casualties.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
datapopsum <- datapopsum[order(datapopsum$CASUALTIES, decreasing = TRUE), ]

# head() returns at most 10 rows; `[1:10, ]` would pad the result with
# all-NA rows whenever fewer than 10 event types are present.
datapopsumtop10 <- head(datapopsum, 10)

Create a bar chart of casualties by event type. We draw the conclusion that tornadoes are the most harmful with respect to population health.

# Bar chart of total casualties for the ten highest-ranked event types.
with(
  datapopsumtop10,
  barplot(
    height = CASUALTIES,
    names.arg = EVTYPE,
    xlab = "Events",
    ylab = "Casualties"
  )
)

Storm Events and Economic Health

Extract the data about economic damage.

# Keep the date, the event type, and the four damage columns
# (positions 1, 2 and 5:8 of Edata), dropping the casualty counts.
dataeconomy <- Edata[, c("BGN_DATE", "EVTYPE",
                         "PROPDMG", "PROPDMGEXP",
                         "CROPDMG", "CROPDMGEXP")]

In the columns “PROPDMGEXP” and “CROPDMGEXP”, “K” stands for thousand, “M” stands for million, and “B” stands for billion. We focus on events with damage in the millions or more, i.e. those marked “M” or “B”. Extract those records.

# Keep only events whose property or crop damage is recorded in millions ("M")
# or billions ("B").
# %in% is used instead of ==: it yields FALSE (never NA) for a missing
# exponent, so the subset cannot pick up all-NA rows that would break the
# later per-row comparisons.
large_exp <- c("M", "B")
dataeconomy <- dataeconomy[dataeconomy$PROPDMGEXP %in% large_exp |
                             dataeconomy$CROPDMGEXP %in% large_exp, ]

Calculate the actual values of PROPDMG and CROPDMG by applying the exponent codes.

# Convert an exponent code into the numeric factor the amount must be
# multiplied by: "K" = thousand, "M" = million, "B" = billion (matched
# case-insensitively). Any other code, including blank or NA, maps to 1,
# i.e. the amount is left as recorded.
damage_multiplier <- function(exponent) {
  known <- c(K = 1e3, M = 1e6, B = 1e9)
  # Character indexing returns NA for codes that are not among the names.
  mult <- unname(known[toupper(as.character(exponent))])
  mult[is.na(mult)] <- 1
  mult
}

# Scale both damage columns in one vectorised step each. This replaces the
# row-by-row loops, which were slow on a data frame and iterated over
# c(1, 0) when the table had no rows. "K" is handled as well, because a row
# kept for one column being "M"/"B" may carry the other column in thousands;
# leaving that unscaled would add mismatched units in the total.
dataeconomy$PROPDMG <- dataeconomy$PROPDMG *
  damage_multiplier(dataeconomy$PROPDMGEXP)
dataeconomy$CROPDMG <- dataeconomy$CROPDMG *
  damage_multiplier(dataeconomy$CROPDMGEXP)

Calculate the total damage (property plus crop) for each event.

# Total damage per record: property damage plus crop damage.
dataeconomy[["alldmg"]] <- with(dataeconomy, PROPDMG + CROPDMG)

Calculate the total damage by event type.

# Sum the combined damage over all records of each event type.
dataecosum <- aggregate(
  alldmg ~ EVTYPE,
  data = dataeconomy,
  FUN = sum
)

Order dataecosum by alldmg and extract the top 10.

# Rank event types from highest to lowest total damage.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
dataecosum <- dataecosum[order(dataecosum$alldmg, decreasing = TRUE), ]

# head() returns at most 10 rows; `[1:10, ]` would pad the result with
# all-NA rows whenever fewer than 10 event types are present.
dataecosumtop10 <- head(dataecosum, 10)

Create a bar chart of alldmg by event type. We draw the conclusion that floods have the greatest economic consequences.

# Bar chart of total damage for the ten highest-ranked event types.
# The y-axis label now says "Total Damage": the plotted value is `alldmg`
# (property + crop damage), not property damage alone.
barplot(
  dataecosumtop10$alldmg,
  names.arg = dataecosumtop10$EVTYPE,
  xlab = "Events",
  ylab = "Total Damage (Property + Crop)"
)