This project involves exploring the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database, which tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
The data analysis aims to find out which types of events are most harmful with respect to population health (using the fatalities and injuries measures) and which have the greatest economic consequences (using the property damage and crop damage measures).
From the analysis, we found that: (A) The top 3 events that cause the most fatalities (in decreasing order) are Tornado, Excessive Heat and Flash Flood. (B) The top 3 events that cause the most injuries (in decreasing order) are Tornado, Thunderstorm Wind and Flash Flood. (C) The top 3 events that cost the most (in decreasing order) are Flood, Hurricane/Typhoon and Tornado.
In conclusion, resources should be directed toward reducing the impacts of Tornado, Flood, Hurricane/Typhoon and Thunderstorm Wind events.
Data is downloaded from the Coursera website. https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
Documentation of the database is available at: National Weather Service Storm Data Documentation: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
National Climatic Data Center Storm Events FAQ: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf
The data are read via the following code:
# Load the bzip2-compressed storm CSV (expected in the working directory)
# into the data frame `data`.
data <- read.csv(
  file = bzfile(description = "repdata-data-StormData.csv.bz2")
)
Transform Fatalities to numeric value:
# Coerce the fatality counts to a numeric column so they can be summed.
data[["FATALITIES"]] <- as.numeric(data[["FATALITIES"]])
Transform Property Damage to a numeric value in USD
# Convert the property-damage magnitude code (PROPDMGEXP) into a numeric
# multiplier and compute the property damage in USD (PROPDMG_USD).
#
# Codes are matched case-insensitively:
#   "", "+", "?", "-"  -> 1 (no usable magnitude given)
#   H / K / M / B      -> 1e2 / 1e3 / 1e6 / 1e9
#   single digit d     -> 10^d
# Previously a digit code was used as the multiplier itself, so "0" wiped
# out the reported damage and "5" multiplied it by five.
# NOTE(review): digit codes are read here as a power-of-ten exponent;
# confirm this against the storm data documentation.
# Any other code yields NA, which the later aggregation drops.
prop_code <- toupper(as.character(data$PROPDMGEXP))
prop_mult <- rep(NA_real_, length(prop_code))

prop_mult[prop_code %in% c("", "+", "?", "-")] <- 1

prop_letters <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
prop_is_letter <- prop_code %in% names(prop_letters)
prop_mult[prop_is_letter] <- unname(prop_letters[prop_code[prop_is_letter]])

prop_is_digit <- grepl("^[0-9]$", prop_code)
prop_mult[prop_is_digit] <- 10^as.numeric(prop_code[prop_is_digit])

# PROPDMGEXP is overwritten with the numeric multiplier, as before.
data$PROPDMGEXP <- prop_mult
data$PROPDMG_USD <- data$PROPDMG * data$PROPDMGEXP
Transform Crop Damage to a numeric value in USD
# Convert the crop-damage magnitude code (CROPDMGEXP) into a numeric
# multiplier and compute the crop damage in USD (CROPDMG_USD).
#
# Codes are matched case-insensitively, using the same rules as the
# property-damage conversion:
#   "", "+", "?", "-"  -> 1 (no usable magnitude given)
#   H / K / M / B      -> 1e2 / 1e3 / 1e6 / 1e9
#   single digit d     -> 10^d
# Previously a digit code was used as the multiplier itself (so "0" wiped
# out the reported damage), and the "" / "?" replacement was applied twice.
# NOTE(review): digit codes are read here as a power-of-ten exponent;
# confirm this against the storm data documentation.
# Any other code yields NA, which the later aggregation drops.
crop_code <- toupper(as.character(data$CROPDMGEXP))
crop_mult <- rep(NA_real_, length(crop_code))

crop_mult[crop_code %in% c("", "+", "?", "-")] <- 1

crop_letters <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
crop_is_letter <- crop_code %in% names(crop_letters)
crop_mult[crop_is_letter] <- unname(crop_letters[crop_code[crop_is_letter]])

crop_is_digit <- grepl("^[0-9]$", crop_code)
crop_mult[crop_is_digit] <- 10^as.numeric(crop_code[crop_is_digit])

# CROPDMGEXP is overwritten with the numeric multiplier, as before.
data$CROPDMGEXP <- crop_mult
data$CROPDMG_USD <- data$CROPDMG * data$CROPDMGEXP
Aggregate the data by looking at Event Type (EVTYPE), Fatalities, Injuries, Cost of Property Damage (in USD) & Cost of Crop Damage (in USD). Omit the values that are missing.
# Sum fatalities, injuries and both USD damage columns per event type.
# Rows with a missing value in any of these columns are dropped first.
consequence_formula <-
  cbind(FATALITIES, INJURIES, PROPDMG_USD, CROPDMG_USD) ~ EVTYPE
Consequence <- aggregate(
  consequence_formula,
  data = data,
  FUN = sum,
  na.action = na.omit
)
Plot the data to present the results.
# library() fails loudly if ggplot2 is missing; require() only returns FALSE.
library(ggplot2)

# Ten event types with the highest total fatalities, selected by column
# name rather than by position.
fatal_rank <- order(Consequence[["FATALITIES"]], decreasing = TRUE)
Fatal_Data <- head(Consequence[fatal_rank, c("EVTYPE", "FATALITIES")], 10)

# Bars are ordered by decreasing fatalities so the ranking can be read off
# the chart. Axis labels are passed to labs() as named arguments, because
# wrapping them in list() is not honoured by current ggplot2 releases.
ggplot(
  data = Fatal_Data,
  aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)
) +
  geom_bar(size = 1, colour = "black", fill = "dark red", stat = "identity") +
  labs(x = "Event", y = "Fatalities") +
  # Rotate the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
As indicated by the graph, the top 3 events that cause the most fatalities (in decreasing order) are Tornado, Excessive Heat and Flash Flood.
# library() fails loudly if ggplot2 is missing; require() only returns FALSE.
library(ggplot2)

# Ten event types with the highest total injuries, selected by column
# name rather than by position.
injury_rank <- order(Consequence[["INJURIES"]], decreasing = TRUE)
Injury_Data <- head(Consequence[injury_rank, c("EVTYPE", "INJURIES")], 10)

# Bars are ordered by decreasing injuries so the ranking can be read off
# the chart. Axis labels are passed to labs() as named arguments, because
# wrapping them in list() is not honoured by current ggplot2 releases.
ggplot(
  data = Injury_Data,
  aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)
) +
  geom_bar(size = 1, colour = "black", fill = "red", stat = "identity") +
  labs(x = "Event", y = "Injuries") +
  # Rotate the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
As indicated by the graph, the top 3 events that cause the most injuries (in decreasing order) are Tornado, Thunderstorm Wind and Flash Flood.
Combine the expenses for both Property Damage and Crop Damage
# Total economic cost per event type: property damage plus crop damage (USD).
Consequence[["TotalExp_USD"]] <- with(Consequence, PROPDMG_USD + CROPDMG_USD)
Plot total expenses against event types
# library() fails loudly if ggplot2 is missing; require() only returns FALSE.
library(ggplot2)

# Ten event types with the highest combined property and crop damage.
# Columns are selected by name, so the result does not depend on
# TotalExp_USD being the sixth column of Consequence.
expense_rank <- order(Consequence[["TotalExp_USD"]], decreasing = TRUE)
Expenses <- head(Consequence[expense_rank, c("EVTYPE", "TotalExp_USD")], 10)

# Costs are shown in billions of USD. Bars are ordered by decreasing cost
# so the ranking can be read off the chart. Axis labels are passed to
# labs() as named arguments, because wrapping them in list() is not
# honoured by current ggplot2 releases.
ggplot(
  data = Expenses,
  aes(x = reorder(EVTYPE, -TotalExp_USD), y = TotalExp_USD / (10^9))
) +
  geom_bar(size = 1, colour = "black", fill = "blue", stat = "identity") +
  labs(x = "Event", y = "Economic consequences in billion USD") +
  # Rotate the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
As indicated by the graph, the top 3 events that cost the most (in decreasing order) are Flood, Hurricane/Typhoon and Tornado.