Synopsis

In this project, we analyze the storm data set from the U.S. National Oceanic and Atmospheric Administration (NOAA). We use exploratory analysis to find which types of events are most harmful with respect to population health and which have the greatest economic consequences.

Data Processing

We first download the data from DATA and use the bunzip2 function in the R.utils library to unzip it.

# One-time setup, kept commented out so the archive is not downloaded and
# unpacked again on every run. Uncomment on first use to create NOAA.csv.
# library(R.utils)
# download.file('https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2', 'NOAA.csv.bz2', method = 'curl')
# bunzip2('NOAA.csv.bz2')

# Load the unpacked storm data; the first row of the file holds column names.
NOAA_Data <- read.csv(file = "NOAA.csv", header = TRUE)

Most Causes of Injuries

The following code sums the injuries for each event type, sorts the totals in decreasing order, and then draws a bar plot of the top 5 causes of injuries. The table and the plot show that tornadoes are responsible for by far the most injuries.

# Total injuries per event type, largest total first.
injuries <- aggregate(
  NOAA_Data$INJURIES,
  by = list(Evtype = NOAA_Data$EVTYPE),
  FUN = sum
)
injury_rank <- order(injuries$x, decreasing = TRUE)
injuries <- injuries[injury_rank, ]
head(injuries, n = 5)
##             Evtype     x
## 834        TORNADO 91346
## 856      TSTM WIND  6957
## 170          FLOOD  6789
## 130 EXCESSIVE HEAT  6525
## 464      LIGHTNING  5230
# Load ggplot2 for the bar charts. Warnings are silenced for the package
# load only (presumably the purpose of the former global
# `options(warn = -1)` -- confirm), so that later code still reports
# genuine warnings instead of hiding them for the rest of the session.
suppressWarnings(library(ggplot2))

# Bar chart of the five event types with the highest injury totals.
g <- ggplot(injuries[1:5, ], aes(Evtype, y = x))
g + geom_bar(stat = "identity") + xlab("Event Type") +
  ylab("Number of Injuries") + ggtitle("Injuries by Event Type")

Most Causes of Fatalities

Similarly, we sum the fatalities for each event type, sort the totals in decreasing order, and then draw a bar plot of the top 5 causes of fatalities.
The table and the plot show that tornadoes are also responsible for the most fatalities.

# Total fatalities per event type, largest total first.
fatalitie <- aggregate(
  NOAA_Data$FATALITIES,
  by = list(Evtype = NOAA_Data$EVTYPE),
  FUN = sum
)
fatality_rank <- order(fatalitie$x, decreasing = TRUE)
fatalitie <- fatalitie[fatality_rank, ]

head(fatalitie, n = 5)
##             Evtype    x
## 834        TORNADO 5633
## 130 EXCESSIVE HEAT 1903
## 153    FLASH FLOOD  978
## 275           HEAT  937
## 464      LIGHTNING  816
# Bar chart of the five event types with the highest fatality totals.
g <- ggplot(fatalitie[seq_len(5), ], aes(Evtype, y = x))
g +
  geom_bar(stat = "identity") +
  labs(
    x = "Event Type",
    y = "Number of Fatalities",
    title = "Fatalities by Event Type"
  )

Most Causes of Economic Damage

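We now look at what causes the most economic damage. Before that, we note that each damage amount comes with a separate exponent code (a \(10^x\) multiplier), and these codes need some cleaning. To make this explicit, we convert the codes to upper case and list the distinct values: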

# Upper-case both exponent columns so that lower- and upper-case spellings
# of the same code are treated alike, then list every distinct code found
# in either column.
exp_cols <- c("PROPDMGEXP", "CROPDMGEXP")
NOAA_Data[exp_cols] <- lapply(NOAA_Data[exp_cols], toupper)
unique(c(NOAA_Data[["PROPDMGEXP"]], NOAA_Data[["CROPDMGEXP"]]))
##  [1] "K" "M" ""  "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"

We first replace the empty values and the signs (+, -, ?) with zero.

# Keep only the columns needed for the damage analysis.
damage_cols <- c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
Sub_NOAAdata <- NOAA_Data[, damage_cols]

# Blank codes and the symbols +, -, ? are mapped to exponent "0".
no_exponent <- c("", "+", "-", "?")
Sub_NOAAdata$PROPDMGEXP[Sub_NOAAdata$PROPDMGEXP %in% no_exponent] <- "0"
Sub_NOAAdata$CROPDMGEXP[Sub_NOAAdata$CROPDMGEXP %in% no_exponent] <- "0"
unique(c(Sub_NOAAdata[["PROPDMGEXP"]], Sub_NOAAdata[["CROPDMGEXP"]]))
##  [1] "K" "M" "0" "B" "5" "6" "4" "2" "3" "H" "7" "1" "8"

In addition, we replace the letters (B, M, K, H), which stand for (Billion, Million, Thousand, Hundred), with the corresponding exponents (9, 6, 3, 2).

# Letter code -> power-of-ten exponent (kept as text; converted to numbers
# in a later step).
letter_exponent <- c(B = "9", M = "6", K = "3", H = "2")

# Apply every letter replacement to both exponent columns.
for (exp_col in c("PROPDMGEXP", "CROPDMGEXP")) {
  for (letter in names(letter_exponent)) {
    is_letter <- Sub_NOAAdata[[exp_col]] == letter
    Sub_NOAAdata[is_letter, exp_col] <- letter_exponent[[letter]]
  }
}
unique(c(Sub_NOAAdata[["PROPDMGEXP"]], Sub_NOAAdata[["CROPDMGEXP"]]))
##  [1] "3" "6" "0" "9" "5" "4" "2" "7" "1" "8"
# Turn each exponent code into its numeric multiplier, 10^code.
for (exp_col in c("PROPDMGEXP", "CROPDMGEXP")) {
  Sub_NOAAdata[[exp_col]] <- 10^as.numeric(Sub_NOAAdata[[exp_col]])
}

# Missing damage amounts count as zero.
Sub_NOAAdata$PROPDMG[is.na(Sub_NOAAdata$PROPDMG)] <- 0
Sub_NOAAdata$CROPDMG[is.na(Sub_NOAAdata$CROPDMG)] <- 0

## calculate the total damage
# Total = property damage + crop damage, each scaled by its multiplier.
Sub_NOAAdata$Total_dmg <- with(
  Sub_NOAAdata,
  PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP
)

# Sum the total damage per event type, largest total first.
DamageByType <- aggregate(
  Sub_NOAAdata$Total_dmg,
  by = list(Evtype = Sub_NOAAdata$EVTYPE),
  FUN = sum
)
damage_rank <- order(DamageByType$x, decreasing = TRUE)
DamageByType <- DamageByType[damage_rank, ]

The code above sums the total damage for each event type and sorts the totals in decreasing order; the following code shows the table and the bar plot of the top 5 causes of economic damage. As we see, this time floods are responsible for the most economic damage, followed by hurricanes/typhoons and tornadoes.

# Show the five event types with the largest total damage.
head(DamageByType, n = 5)
##                Evtype            x
## 170             FLOOD 150319678257
## 411 HURRICANE/TYPHOON  71913712800
## 834           TORNADO  57362333946
## 670       STORM SURGE  43323541000
## 244              HAIL  18761221986
# Bar chart of the five event types with the largest total damage.
Sub_Damage <- DamageByType[seq_len(5), ]
g <- ggplot(Sub_Damage, aes(Evtype, y = x))
g +
  geom_bar(stat = "identity") +
  labs(
    x = "Event Type",
    y = "Damage in Dollars",
    title = "Economic Damage by Event Type"
  )

Results

In summary, tornadoes cause by far the most injuries and fatalities, while floods are responsible for the most economic damage, followed by hurricanes/typhoons and tornadoes.