This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
The events in the database start in the year 1950 and end in November 2011. The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events. The data analysis must address the following questions:
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
Source: “https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2”. We use a loader function to download the data (if necessary) and load it into the session.
- The raw data is transformed for our analysis by summarizing it by event type.
- Following the Pareto principle, only the top 10 events causing the most damage are considered in the analysis.
- Further refinement can be done by deep-diving into each event type, its trend, and its impact over the years.
#' Load the NOAA storm data set, caching the parsed result on disk.
#'
#' If a parsed copy exists as "raw_data.rds" in the working directory it is
#' read and returned immediately. Otherwise the compressed CSV is downloaded
#' (only when "StormData.csv.bz2" is not already present), parsed with
#' `read.csv()`, saved to "raw_data.rds" for later calls, and returned.
#'
#' @return The data frame produced by `read.csv()` on the storm CSV.
loadData <- function() {
  cache_file <- "raw_data.rds"
  archive_file <- "StormData.csv.bz2"

  # Fast path: reuse the parsed copy written by an earlier call.
  if (file.exists(cache_file)) {
    return(readRDS(cache_file))
  }

  # Only hit the network when the compressed CSV is not already on disk.
  if (!file.exists(archive_file)) {
    download.file(
      "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
      archive_file,
      method = "curl"
    )
  }

  # read.csv() decompresses the .bz2 file transparently.
  storm_data <- read.csv(archive_file)
  saveRDS(storm_data, cache_file)
  storm_data
}
# Load the storm data set via the loader function.
raw_data <- loadData()
library(dplyr)

# ---- Population health: fatalities ----
# Total fatalities per event type; missing counts are ignored by the sum.
Data_fatalities_aggregate <- aggregate(
  list(FATALITIES = raw_data$FATALITIES),
  by = list(EVTYPE = raw_data$EVTYPE),
  sum, na.rm = TRUE
)
# Order event types from most to fewest fatalities.
Data_fatalities_descending <- arrange(Data_fatalities_aggregate, desc(FATALITIES))
# sqldf is no longer needed by this section (the filter below is plain
# subsetting); the library() call is kept so the file's dependencies are
# unchanged.
library(sqldf)
# Drop event types with no recorded fatalities. No attach() is used, so the
# search path is not polluted with the data frame's columns.
Data_fatalities_aggregate_nonzero <- Data_fatalities_descending[
  Data_fatalities_descending$FATALITIES > 0, ,
  drop = FALSE
]
# Ten event types with the most fatalities.
Data_fatalities_aggregate_nonzero_top10 <- head(Data_fatalities_aggregate_nonzero, n = 10)

# ---- Population health: injuries ----
# Total injuries per event type; missing counts are ignored by the sum.
Data_injuries_aggregate <- aggregate(
  list(INJURIES = raw_data$INJURIES),
  by = list(EVTYPE = raw_data$EVTYPE),
  sum, na.rm = TRUE
)
# Order event types from most to fewest injuries.
Data_injuries_descending <- arrange(Data_injuries_aggregate, desc(INJURIES))
# Drop event types with no recorded injuries.
Data_injuries_aggregate_nonzero <- Data_injuries_descending[
  Data_injuries_descending$INJURIES > 0, ,
  drop = FALSE
]
# Ten event types with the most injuries.
Data_injuries_aggregate_nonzero_top10 <- head(Data_injuries_aggregate_nonzero, n = 10)
library(plyr)
library(dplyr)
library(ggplot2)
library(gridExtra)
# ---- Economic impact: property damage ----
# Keep only the columns needed for the damage analysis.
Data_economic_impact <- select(raw_data, COUNTYNAME, STATE, EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
# Keep events that reported property damage. The explicit is.na() guard stops
# a missing PROPDMG from producing an all-NA row in the subset.
propdmg <- Data_economic_impact[!is.na(raw_data$PROPDMG) & raw_data$PROPDMG > 0, ]

# Multiplier for each recognised exponent code (case-insensitive):
# H = hundreds, K = thousands, M = millions, B = billions.
prop_exp_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
# Vectorised conversion to an absolute amount. Any other exponent code (blank,
# digits, symbols) matches no name and yields NA, which the na.rm = TRUE sum
# below skips. The previous row-by-row assignment into a column that did not
# exist yet let R recycle the first computed value over every row, so rows
# with an unrecognised code silently kept another event's damage figure.
propdmg$actualpropdmg <- propdmg$PROPDMG *
  unname(prop_exp_multiplier[toupper(as.character(propdmg$PROPDMGEXP))])

# Total property damage per event type.
propdmgSum <- aggregate(
  list(actualpropdmg = propdmg$actualpropdmg),
  by = list(EVTYPE = propdmg$EVTYPE),
  sum, na.rm = TRUE
)
# Arrange data in descending order
Propdmgsum_descending <- arrange(propdmgSum, desc(actualpropdmg))
# Select top 10 events with highest property damage
top_propdmg <- head(Propdmgsum_descending, 10)
# ---- Economic impact: crop damage ----
# Keep events that reported crop damage. The explicit is.na() guard stops a
# missing CROPDMG from producing an all-NA row in the subset.
cropdmg <- Data_economic_impact[!is.na(raw_data$CROPDMG) & raw_data$CROPDMG > 0, ]

# Multiplier for each recognised exponent code (case-insensitive):
# H = hundreds, K = thousands, M = millions, B = billions.
crop_exp_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
# Vectorised conversion to an absolute amount. Any other exponent code (blank,
# digits, symbols) matches no name and yields NA, which the na.rm = TRUE sum
# below skips. The previous row-by-row assignment into a column that did not
# exist yet let R recycle the first computed value over every row, so rows
# with an unrecognised code silently kept another event's damage figure.
cropdmg$actualcropdmg <- cropdmg$CROPDMG *
  unname(crop_exp_multiplier[toupper(as.character(cropdmg$CROPDMGEXP))])

library(plyr)
# Total crop damage per event type.
cropdmgSum <- aggregate(
  list(actualcropdmg = cropdmg$actualcropdmg),
  by = list(EVTYPE = cropdmg$EVTYPE),
  sum, na.rm = TRUE
)
# Arrange data in descending order
cropdmgsum_descending <- arrange(cropdmgSum, desc(actualcropdmg))
# Select top 10 events with highest crop damage
top_cropdmg <- head(cropdmgsum_descending, 10)
library(ggplot2)
library(gridExtra)
# Population-health plots: one bar chart per measure, drawn side by side.

# Top-10 event types by fatalities, bars ordered from highest to lowest.
Fatalities_plot <- ggplot(
  data = Data_fatalities_aggregate_nonzero_top10,
  mapping = aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)
) +
  geom_bar(stat = "identity") +
  labs(title = "Fatalities by Event Type", x = "Event Type") +
  # Rotate the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Top-10 event types by injuries, bars ordered from highest to lowest.
Injuries_plot <- ggplot(
  data = Data_injuries_aggregate_nonzero_top10,
  mapping = aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)
) +
  geom_bar(stat = "identity") +
  labs(title = "Injuries by Event Type", x = "Event Type") +
  # Rotate the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Show both charts in a single two-column figure.
grid.arrange(Fatalities_plot, Injuries_plot, ncol = 2)
# Bar chart of the event types with the highest total property damage,
# ordered from highest to lowest. actualpropdmg holds absolute amounts, so it
# is divided by 1e9 to match the "in Billions" axis label.
ggplot(top_propdmg, aes(x = reorder(EVTYPE, -actualpropdmg), y = actualpropdmg / 1e9)) +
  geom_bar(stat = "identity") +
  # Rotate the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "Property Damage by Event Type") +
  labs(x = "Event Type") +
  labs(y = "Property Damage (in Billions)")
# Bar chart of the event types with the highest total crop damage, ordered
# from highest to lowest. actualcropdmg holds absolute amounts, so it is
# divided by 1e9 to match the "in Billions" axis label.
ggplot(top_cropdmg, aes(x = reorder(EVTYPE, -actualcropdmg), y = actualcropdmg / 1e9)) +
  geom_bar(stat = "identity") +
  # Rotate the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "Crop Damage by Event Type") +
  labs(x = "Event Type") +
  labs(y = "Crop Damage (in Billions)")
The raw weather event types contain patterns that could be used to reduce the number of events to the official forty-eight categories enumerated in the National Weather Service document provided with the course project instructions. We found that FLOOD, HURRICANE and TORNADO had the greatest economic impact in terms of property damage, and that DROUGHT had the greatest impact on crop damage. Further analysis can be done by individually studying the trend and impact of each event type.