In this project, we analyze the storm data set from the U.S. National Oceanic and Atmospheric Administration (NOAA). We use an exploratory analysis to find which types of events are most harmful with respect to population health and which have the greatest economic consequences.
We first download the data from DATA (the URL appears in the code below) and use the bunzip2 function from the R.utils package to unzip it.
# One-time data retrieval, left disabled so the file is not downloaded again
# on every run. Uncomment to fetch the compressed file and decompress it
# (bunzip2() is provided by the R.utils package).
# library(R.utils)
# download.file('https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2', 'NOAA.csv.bz2', method = 'curl')
# bunzip2('NOAA.csv.bz2')

# Read the storm records; the first row of the file holds the column names.
NOAA_Data <- read.csv(file = "NOAA.csv", header = TRUE)
The following code sums the injuries by event type, sorts the totals in decreasing order, and then draws a bar plot of the five leading causes of injuries. The table and the plot show that tornadoes are responsible for the most injuries.
# Total injuries per event type; aggregate() stores the sums in a column
# named `x` next to the grouping column `Evtype`.
injuries <- aggregate(
  x = NOAA_Data$INJURIES,
  by = list(Evtype = NOAA_Data$EVTYPE),
  FUN = sum
)
# Rank the event types from most to fewest injuries and show the top five.
injuries <- injuries[with(injuries, order(x, decreasing = TRUE)), ]
head(injuries, n = 5)
## Evtype x
## 834 TORNADO 91346
## 856 TSTM WIND 6957
## 170 FLOOD 6789
## 130 EXCESSIVE HEAT 6525
## 464 LIGHTNING 5230
# Warnings are deliberately left enabled: switching them off for the whole
# session with options(warn = -1) would also hide problems in later steps.
library(ggplot2)

# Bar chart of the five event types with the most injuries.
# - head() returns at most five rows, so no all-NA rows are introduced when
#   fewer than five event types exist.
# - reorder(Evtype, -x) ranks the bars from the largest count to the
#   smallest; otherwise ggplot2 arranges a discrete axis alphabetically,
#   which hides the ranking the chart is meant to show.
g <- ggplot(head(injuries, 5), aes(x = reorder(Evtype, -x), y = x))
g + geom_bar(stat = "identity") +
  xlab("Event Type") +
  ylab("Number of Injuries") +
  ggtitle("Injuries by Event Type")
Similarly, we sum the fatalities by event type, sort the totals in decreasing order, and then draw a bar plot of the five leading causes of fatalities.
The table and the plot show that tornadoes are also responsible for the most fatalities.
# Total fatalities per event type; aggregate() stores the sums in a column
# named `x` next to the grouping column `Evtype`.
fatalitie <- aggregate(
  x = NOAA_Data$FATALITIES,
  by = list(Evtype = NOAA_Data$EVTYPE),
  FUN = sum
)
# Rank the event types from most to fewest fatalities and show the top five.
fatalitie <- fatalitie[with(fatalitie, order(x, decreasing = TRUE)), ]
head(fatalitie, n = 5)
## Evtype x
## 834 TORNADO 5633
## 130 EXCESSIVE HEAT 1903
## 153 FLASH FLOOD 978
## 275 HEAT 937
## 464 LIGHTNING 816
# Bar chart of the five event types with the most fatalities.
# - head() returns at most five rows, so no all-NA rows are introduced when
#   fewer than five event types exist.
# - reorder(Evtype, -x) ranks the bars from the largest count to the
#   smallest; otherwise ggplot2 arranges a discrete axis alphabetically,
#   which hides the ranking the chart is meant to show.
g <- ggplot(head(fatalitie, 5), aes(x = reorder(Evtype, -x), y = x))
g + geom_bar(stat = "identity") +
  xlab("Event Type") +
  ylab("Number of Fatalities") +
  ggtitle("Fatalities by Event Type")
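We now examine which event types cause the most economic damage. Before that, we note that the damage amounts are stored together with an exponent code (the amount is to be multiplied by \(10^x\)) and that these codes need some cleaning. To make this explicit, we list the distinct exponent codes that occur in the data.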
# Upper-case both exponent columns so that codes differing only in letter
# case are treated as the same code.
NOAA_Data[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  NOAA_Data[c("PROPDMGEXP", "CROPDMGEXP")],
  toupper
)
# List every distinct exponent code found across the two columns.
unique(unlist(NOAA_Data[c("PROPDMGEXP", "CROPDMGEXP")], use.names = FALSE))
## [1] "K" "M" "" "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
We first replace the signs (+, -, ?) and the empty codes with zero.
# Keep only the event type and the four damage-related columns.
Sub_NOAAdata <- NOAA_Data[
  c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
]
# Empty codes and the symbols +, -, ? carry no exponent; set them to "0".
Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")],
  function(exp_code) {
    replace(exp_code, exp_code %in% c("", "+", "-", "?"), "0")
  }
)
# Show which codes remain after this step.
unique(unlist(Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")], use.names = FALSE))
## [1] "K" "M" "0" "B" "5" "6" "4" "2" "3" "H" "7" "1" "8"
In addition, we replace the letter codes (B, M, K, H), which stand for (billion, million, thousand, hundred), with the corresponding exponents (9, 6, 3, 2).
# Translate the letter codes into their exponents in both columns:
# B -> "9", M -> "6", K -> "3", H -> "2". Any other code is left as it is.
Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")],
  function(exp_code) {
    letter_exponent <- c(B = "9", M = "6", K = "3", H = "2")
    is_letter <- exp_code %in% names(letter_exponent)
    exp_code[is_letter] <- unname(letter_exponent[exp_code[is_letter]])
    exp_code
  }
)
# Show which codes remain after this step.
unique(unlist(Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")], use.names = FALSE))
## [1] "3" "6" "0" "9" "5" "4" "2" "7" "1" "8"
# Turn each exponent string into its numeric multiplier 10^exponent.
Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  Sub_NOAAdata[c("PROPDMGEXP", "CROPDMGEXP")],
  function(exponent) 10^as.numeric(exponent)
)
# Count a missing damage amount as zero damage.
Sub_NOAAdata$PROPDMG[is.na(Sub_NOAAdata$PROPDMG)] <- 0
Sub_NOAAdata$CROPDMG[is.na(Sub_NOAAdata$CROPDMG)] <- 0
# Total damage per record: property damage plus crop damage, each scaled
# by its multiplier.
Sub_NOAAdata$Total_dmg <- with(
  Sub_NOAAdata,
  PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP
)
# Sum the total damage per event type (the sums land in column `x`) and
# rank the event types from most to least damage.
DamageByType <- aggregate(
  x = Sub_NOAAdata$Total_dmg,
  by = list(Evtype = Sub_NOAAdata$EVTYPE),
  FUN = sum
)
DamageByType <- DamageByType[with(DamageByType, order(x, decreasing = TRUE)), ]
Having aggregated the total economic damage by event type, we now create the table and the bar plot of the five leading causes of economic damage. As we see, this time floods are responsible for the most economic damage, followed by hurricanes/typhoons and tornadoes.
# Show the five event types with the largest total damage.
head(DamageByType, n = 5)
## Evtype x
## 170 FLOOD 150319678257
## 411 HURRICANE/TYPHOON 71913712800
## 834 TORNADO 57362333946
## 670 STORM SURGE 43323541000
## 244 HAIL 18761221986
# Five event types with the largest total damage. head() returns at most
# five rows, so no all-NA rows are introduced when fewer than five event
# types exist.
Sub_Damage <- head(DamageByType, 5)
# reorder(Evtype, -x) ranks the bars from the largest total to the smallest;
# otherwise ggplot2 arranges a discrete axis alphabetically, which hides the
# ranking the chart is meant to show.
g <- ggplot(Sub_Damage, aes(x = reorder(Evtype, -x), y = x))
g + geom_bar(stat = "identity") +
  xlab("Event Type") +
  ylab("Damage in Dollars") +
  ggtitle("Economic Damage by Event Type")
In summary, tornadoes cause by far the most injuries and fatalities, while floods are responsible for the most economic damage, followed by hurricanes/typhoons and tornadoes.