This analysis of the US NOAA Storm Database covers the years 1993 to 2011 and addresses two questions: which types of weather events are most harmful to population health, and which have the greatest economic consequences. The analysis suggests that tsunamis caused the highest average number of fatalities and injuries per event, while the average cost of damages was highest for hurricanes (typhoons), exceeding 400 million dollars.
# Adding required packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tidyr)
library(ggplot2)
# Download and read StormDB.csv
# The archive is large, so it is fetched only when no local copy exists yet.
# NOTE(review): the URL points at a bzip2 archive although the local file is
# named "StormDB.csv"; read.csv() is handed that file unchanged, as before.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
dest_file <- "StormDB.csv"
if (!file.exists(dest_file)) {
  download.file(url, destfile = dest_file, mode = "wb")
}
stormData <- read.csv(dest_file)
# Understand the structure of Storm dataset
str(stormData)
head(stormData)
colnames(stormData)
unique(stormData$EVTYPE)
Select the columns required to find the most harmful events with respect to population health and damage costs.
Rename the raw column names to more readable ones for convenience.
Replace the date with only the year, stored as an integer.
Remove data before 1993 to avoid skewing the analysis, because very few event types were recorded in the years up to 1992.
# Keep only the columns needed for the health and damage questions and give
# them readable names. as_tibble() replaces the deprecated tbl_df().
stormDataClean <- stormData %>%
  as_tibble() %>%
  select(
    begin_date = BGN_DATE,
    event_type = EVTYPE,
    fatalities = FATALITIES,
    injuries = INJURIES,
    prop_dmg = PROPDMG,
    prop_dmg_exp = PROPDMGEXP,
    crop_dmg = CROPDMG,
    crop_dmg_exp = CROPDMGEXP
  )
# Parse the begin timestamp (month/day/year hour:minute:second) and keep only
# the year as an integer.
stormDataClean$begin_date <- parse_date_time(
  stormDataClean$begin_date,
  orders = "mdy HMS"
)
stormDataClean$begin_date <- as.integer(format(stormDataClean$begin_date, "%Y"))
stormDataClean <- rename(stormDataClean, year = begin_date)
# Ignore data before 1993 to avoid skewness in analysis
# Printed for inspection: number of distinct event types recorded per year.
stormDataClean %>%
  group_by(year) %>%
  summarise(count_event_type = n_distinct(event_type))
stormDataClean <- filter(stormDataClean, year > 1992) # unique(stormDataClean$year)
In the raw dataset, many more event types have been recorded than the 48 listed in the documentation. EVTYPE therefore needs to be cleaned so that it contains only the 48 valid event types.
# Fold the raw event labels to lower case so that the pattern matching applied
# afterwards does not depend on capitalisation.
stormDataClean <- mutate(stormDataClean, event_type = tolower(event_type))
unique(stormDataClean[["event_type"]]) # more than 48
# Map one raw, lower-cased event label onto a canonical event name.
#
# The label is tested against a fixed sequence of regular expressions and the
# FIRST matching rule wins, so the order of the branches is significant (for
# example "hail" is tested before "wind", so a label containing both is
# classified as hail).
#
# Args:
#   event_type: a single character string (callers lower-case it first; the
#     patterns are all lower case).
#
# Returns:
#   One of the 48 canonical event names as a character string, or NA when no
#   rule matches. Labels that reach the "snow" or "wind" branch without a
#   recognised qualifier (lake/heavy, thunder/tstm/strong/high) also yield NA.
clean.eventType <- function(event_type) {
  # TRUE when the label contains the given pattern
  has <- function(pattern) grepl(pattern, event_type)

  event <- NA # default to be returned
  if (has("low tide")) {
    event <- "astronomical low tide"
  } else if (has("avalan")) {
    event <- "avalanche"
  } else if (has("blizz")) {
    event <- "blizzard"
  } else if (has("chill|cold")) {
    # cold/wind chill events
    event <- if (has("extr")) "extreme cold/wind chill" else "cold/wind chill"
  } else if (has("debris flow|slide")) {
    event <- "debris flow"
  } else if (has("dense smoke")) {
    event <- "dense smoke"
  } else if (has("drought")) {
    event <- "drought"
  } else if (has("dust")) {
    # dust events
    event <- if (has("devil")) "dust devil" else "dust storm"
  } else if (has("fire")) {
    event <- "wildfire"
  } else if (has("flood")) {
    # flood events
    if (has("coast")) {
      event <- "coastal flood"
    } else if (has("flash")) {
      event <- "flash flood"
    } else if (has("lake")) {
      event <- "lakeshore flood"
    } else {
      event <- "flood"
    }
  } else if (has("fog")) {
    # fog events
    event <- if (has("freez")) "freezing fog" else "dense fog"
  } else if (has("frost|freeze")) {
    event <- "frost/freeze"
  } else if (has("funnel")) {
    event <- "funnel cloud"
  } else if (has("hail")) {
    # hail events
    event <- if (has("marine")) "marine hail" else "hail"
  } else if (has("heat")) {
    # heat events
    event <- if (has("excess")) "excessive heat" else "heat"
  } else if (has("hurricane|typhoon")) {
    event <- "hurricane (typhoon)"
  } else if (has("ice|icy")) {
    event <- "ice storm"
  } else if (has("lightning")) {
    event <- "lightning"
  } else if (has("rain")) {
    event <- "heavy rain"
  } else if (has("rip")) {
    event <- "rip current"
  } else if (has("seiche")) {
    event <- "seiche"
  } else if (has("sleet")) {
    event <- "sleet"
  } else if (has("snow")) {
    # snow events; other snow labels stay NA
    if (has("lake")) {
      event <- "lake-effect snow"
    } else if (has("heavy")) {
      event <- "heavy snow"
    }
  } else if (has("spout")) {
    event <- "waterspout"
  } else if (has("surf")) {
    event <- "high surf"
  } else if (has("surge")) {
    event <- "storm surge/tide"
  } else if (has("torn")) {
    event <- "tornado"
  } else if (has("tropical")) {
    # tropical events
    event <- if (has("depress")) "tropical depression" else "tropical storm"
  } else if (has("tsun")) {
    event <- "tsunami"
  } else if (has("volcan")) {
    event <- "volcanic ash"
  } else if (has("wind")) {
    # wind events
    if (has("marine")) {
      # marine wind events; "tstm" is accepted here exactly as in the
      # non-marine branch, otherwise "marine tstm wind" would be lost as NA
      if (has("thunder|tstm")) {
        event <- "marine thunderstorm wind"
      } else if (has("strong")) {
        event <- "marine strong wind"
      } else if (has("high")) {
        event <- "marine high wind"
      }
    } else if (has("thunder|tstm")) {
      event <- "thunderstorm wind"
    } else if (has("strong")) {
      event <- "strong wind"
    } else if (has("high")) {
      event <- "high wind"
    }
  } else if (has("wint")) {
    # winter events
    event <- if (has("storm")) "winter storm" else "winter weather"
  }
  event
}
#unique(stormDataClean$event_type)
# Classify every distinct raw label once, then map the result back onto the
# rows. This avoids calling clean.eventType() for each row, and vapply()
# always yields a plain, unnamed character vector (unmatched labels become NA).
raw_labels <- unique(stormDataClean$event_type)
clean_labels <- vapply(
  raw_labels,
  function(label) as.character(clean.eventType(label)),
  character(1),
  USE.NAMES = FALSE
)
stormDataClean$event_type <- clean_labels[
  match(stormDataClean$event_type, raw_labels)
]
sort(unique(stormDataClean$event_type)) # displays the valid event types now present in the dataset
# The events causing maximum no. of fatalities as well as injuries are being considered most dangerous
# Keep only the records that harmed at least one person.
stormData1 <- stormDataClean %>%
  select(year, event_type, fatalities, injuries) %>%
  # desc() accepts a single column, so each sort key is wrapped separately
  arrange(desc(fatalities), desc(injuries)) %>%
  filter(injuries > 0 | fatalities > 0)

# Mean fatalities per event type; head() keeps at most the top five and does
# not pad with NA rows when fewer groups exist.
fatalities.data <- stormData1 %>%
  group_by(event_type) %>%
  summarise(fatalities = mean(fatalities, na.rm = TRUE)) %>%
  arrange(desc(fatalities)) %>%
  head(5)

# Mean injuries per event type, top five.
injuries.data <- stormData1 %>%
  group_by(event_type) %>%
  summarise(injuries = mean(injuries, na.rm = TRUE)) %>%
  arrange(desc(injuries)) %>%
  head(5)
# Bar chart of the average fatalities for the selected event types; bar colour
# runs from yellow (low) to red (high).
g.fatalities <- ggplot(fatalities.data, aes(x = event_type, y = fatalities)) +
  geom_bar(stat = "identity", aes(fill = fatalities)) +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(y = "Average fatalities", title = "Average fatalities per weather event")
g.fatalities
# Same chart for injuries. The bars map the `fill` aesthetic, so the gradient
# must be set with scale_fill_gradient(); a colour scale would have no effect.
g.injuries <- ggplot(injuries.data, aes(x = event_type, y = injuries)) +
  geom_bar(stat = "identity", aes(fill = injuries)) +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(y = "Average Injuries", title = "Average injuries per weather event")
g.injuries
The crop and property damage exponent codes are converted into numeric multipliers and multiplied by the corresponding damage amounts. The total damage caused by an event is the sum of its crop damage and its property damage.
# Keep the damage columns and only the records that report some damage.
stormData2 <- stormDataClean %>%
  select(year, event_type, prop_dmg, prop_dmg_exp, crop_dmg, crop_dmg_exp) %>%
  filter(prop_dmg > 0 | crop_dmg > 0)

# Translate damage exponent codes into numeric multipliers (vectorised).
# A code containing "h" means hundreds, "k" thousands, "m" millions and
# "b" billions, matched case-insensitively. Any other code (blank, digits,
# symbols) gets a multiplier of 1, so the amount is taken at face value
# instead of being turned negative by a -1 sentinel.
exp_multiplier <- function(exp_code) {
  code <- tolower(as.character(exp_code))
  multiplier <- rep(1, length(code))
  # Assigned in reverse priority so that h wins over k, k over m, m over b,
  # matching the order in which the codes were originally tested.
  multiplier[grepl("b", code)] <- 1e9
  multiplier[grepl("m", code)] <- 1e6
  multiplier[grepl("k", code)] <- 1e3
  multiplier[grepl("h", code)] <- 1e2
  multiplier
}

# Total damage per record = property damage + crop damage, in millions.
stormData2 <- mutate(
  stormData2,
  prop_dmg_exp = exp_multiplier(prop_dmg_exp),
  crop_dmg_exp = exp_multiplier(crop_dmg_exp),
  total.damage = (prop_dmg_exp * prop_dmg + crop_dmg_exp * crop_dmg) / 1000000
)
# Average total damage (already scaled to millions) for each event type,
# ordered from the most to the least costly.
damages.data <- stormData2 %>%
  group_by(event_type) %>%
  summarise(total.damage = mean(total.damage, na.rm = TRUE)) %>%
  arrange(desc(total.damage))
damages.data <- damages.data[1:5, 1:2] # take only top 5 damages.data

# Bar chart of those averages; bar colour runs from yellow (low) to red (high).
g.damages <- ggplot(
  data = damages.data,
  mapping = aes(x = event_type, y = total.damage)
) +
  geom_bar(mapping = aes(fill = total.damage), stat = "identity") +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    y = "Average damages",
    title = "Average damages(in millions) per weather event"
  )
g.damages