Synopsis

This analysis of the US NOAA Storm Database covers the years 1993 to 2011 and answers two questions: which types of weather events are most harmful to population health, and which have the greatest economic consequences. The analysis suggests that tsunamis caused the highest average number of fatalities and injuries per event. However, the average cost of damage was highest for hurricanes (typhoons), exceeding 400 million dollars.

Data Processing

# Adding required packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(tidyr)
library(ggplot2) 
# Download the Storm dataset once and read it.
# The download is skipped when the file is already present, so re-running the
# analysis does not fetch the archive again.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "StormDB.csv"
if (!file.exists(storm_file)) {
  download.file(url, destfile = storm_file, mode = "wb")
}
# The downloaded file is the bzip2 archive named in the URL, saved under a
# ".csv" name; read.csv() reads it directly. Text columns are kept as
# character regardless of the R version (the default changed in R 4.0).
stormData <- read.csv(storm_file, stringsAsFactors = FALSE)
# Understand the structure of Storm dataset
str(stormData)
head(stormData)
colnames(stormData)
unique(stormData$EVTYPE)

Cleaning Dataset:

  1. Select the columns required for finding Most Harmful Events w.r.t. population health and Damage Costs

  2. Change the raw column names to more readable names for convenience.

  3. Replace Date with only Year as integer

  4. Remove data before 1993 to avoid skewing the analysis, because very few event types were recorded in the years up to 1992

# Keep only the columns needed for the health and damage analyses and give
# them readable names in the same step. as_tibble() replaces the deprecated
# tbl_df(). The begin date is selected under its final name, `year`, so the
# column keeps its leading position.
stormDataClean <- stormData %>%
  as_tibble() %>%
  select(
    year = BGN_DATE,
    event_type = EVTYPE,
    fatalities = FATALITIES,
    injuries = INJURIES,
    prop_dmg = PROPDMG,
    prop_dmg_exp = PROPDMGEXP,
    crop_dmg = CROPDMG,
    crop_dmg_exp = CROPDMGEXP
  )

# Parse the begin date (month-day-year hour-minute-second order) and keep only
# the year, stored as an integer.
stormDataClean <- stormDataClean %>%
  mutate(
    year = as.integer(format(parse_date_time(year, orders = "mdy HMS"), "%Y"))
  )

# Ignore data before 1993 to avoid skewness in analysis.
# First print the number of distinct event types recorded in each year.
stormDataClean %>%
  group_by(year) %>%
  summarise(count_event_type = n_distinct(event_type))
stormDataClean <- filter(stormDataClean, year > 1992) #unique(stormDataClean$year)

The raw dataset contains many more event type labels than the 48 official event types listed in the documentation, so EVTYPE must be cleaned until it contains only the 48 valid event types.

# Put every event type label in lower case before pattern matching
stormDataClean[["event_type"]] <- tolower(stormDataClean[["event_type"]])
# Show the distinct labels present in the data (more than 48)
unique(stormDataClean[["event_type"]])
# Map one raw, lower-cased storm event label onto a standard event type name.
#
# Args:
#   event_type: a single character string in lower case, e.g. "tstm wind".
#
# Returns:
#   The standard event type name as a character string, or NA when the label
#   matches none of the rules below.
#
# The rules are tried from top to bottom and the first matching rule decides
# the result, so their order matters (e.g. "ice jam flooding" is a flood
# because the flood rule comes before the ice rule). Once a rule with
# sub-cases matches (snow, wind), no later rule is tried even if none of its
# sub-cases applies; the result is then NA.
clean.eventType <- function(event_type) {
  has <- function(pattern) grepl(pattern, event_type)

  if (has("low tide")) return("astronomical low tide")
  if (has("avalan")) return("avalanche")
  if (has("blizz")) return("blizzard")

  # cold/wind chill events
  if (has("chill|cold")) {
    if (has("extr")) return("extreme cold/wind chill")
    return("cold/wind chill")
  }

  if (has("debris flow|slide")) return("debris flow")
  if (has("dense smoke")) return("dense smoke")
  if (has("drought")) return("drought")

  # dust events
  if (has("dust")) {
    if (has("devil")) return("dust devil")
    return("dust storm")
  }

  if (has("fire")) return("wildfire")

  # flood events
  if (has("flood")) {
    if (has("coast")) return("coastal flood")
    if (has("flash")) return("flash flood")
    if (has("lake")) return("lakeshore flood")
    return("flood")
  }

  # fog events
  if (has("fog")) {
    if (has("freez")) return("freezing fog")
    return("dense fog")
  }

  if (has("frost|freeze")) return("frost/freeze")
  if (has("funnel")) return("funnel cloud")

  # hail events
  if (has("hail")) {
    if (has("marine")) return("marine hail")
    return("hail")
  }

  # heat events
  if (has("heat")) {
    if (has("excess")) return("excessive heat")
    return("heat")
  }

  if (has("hurricane|typhoon")) return("hurricane (typhoon)")
  if (has("ice|icy")) return("ice storm")
  if (has("lightning")) return("lightning")
  if (has("rain")) return("heavy rain")

  # Match "rip" only as a whole word: a plain "rip" also matches inside
  # "precipitation" and would label those events as rip currents.
  if (has("\\brip\\b")) return("rip current")

  if (has("seiche")) return("seiche")
  if (has("sleet")) return("sleet")

  # snow events; snow labels that are neither lake-effect nor heavy stay NA
  if (has("snow")) {
    if (has("lake")) return("lake-effect snow")
    if (has("heavy")) return("heavy snow")
    return(NA)
  }

  if (has("spout")) return("waterspout")
  if (has("surf")) return("high surf")
  if (has("surge")) return("storm surge/tide")
  if (has("torn")) return("tornado")

  # tropical events
  if (has("tropical")) {
    if (has("depress")) return("tropical depression")
    return("tropical storm")
  }

  if (has("tsun")) return("tsunami")
  if (has("volcan")) return("volcanic ash")

  # wind events
  if (has("wind")) {
    if (has("marine")) {
      # "tstm" is accepted here as well, in line with the non-marine rule
      # below, so that "marine tstm wind" is classified.
      if (has("thunder|tstm")) return("marine thunderstorm wind")
      if (has("strong")) return("marine strong wind")
      if (has("high")) return("marine high wind")
      return(NA)
    }
    if (has("thunder|tstm")) return("thunderstorm wind")
    if (has("strong")) return("strong wind")
    if (has("high")) return("high wind")
    return(NA)
  }

  # winter events
  if (has("wint")) {
    if (has("storm")) return("winter storm")
    return("winter weather")
  }

  NA # no rule matched
}
#unique(stormDataClean$event_type)
# Classify each distinct raw label only once, then map the results back onto
# the rows with match(). This avoids running the whole rule chain for every
# row and yields a plain (unnamed) character vector.
raw_event_types <- unique(stormDataClean$event_type)
clean_event_types <- vapply(
  raw_event_types,
  # as.character() turns the logical NA returned for unmatched labels into a
  # character NA, so every result has the type vapply() checks for
  function(label) as.character(clean.eventType(label)),
  character(1),
  USE.NAMES = FALSE
)
stormDataClean$event_type <- clean_event_types[
  match(stormDataClean$event_type, raw_event_types)
]
sort(unique(stormDataClean$event_type)) # displays 48 valid event types replaced in dataset

Results

Analysis Task1: Finding most harmful events w.r.t. population health

# The events causing maximum no. of fatalities as well as injuries are being considered most dangerous

# Keep only events with at least one fatality or injury, ordered by
# fatalities and then injuries (desc() takes one column at a time).
stormData1 <- stormDataClean %>%
  select(year, event_type, fatalities, injuries) %>%
  filter(injuries > 0 | fatalities > 0) %>%
  arrange(desc(fatalities), desc(injuries))

# Mean fatalities per event type; keep the five highest.
# head() returns fewer rows instead of NA rows when fewer than five exist.
fatalities.data <- stormData1 %>%
  group_by(event_type) %>%
  summarise(fatalities = mean(fatalities, na.rm = TRUE)) %>%
  arrange(desc(fatalities)) %>%
  head(5)

# Mean injuries per event type; keep the five highest.
injuries.data <- stormData1 %>%
  group_by(event_type) %>%
  summarise(injuries = mean(injuries, na.rm = TRUE)) %>%
  arrange(desc(injuries)) %>%
  head(5)

g.fatalities <- ggplot(fatalities.data, aes(x = event_type, y = fatalities)) +
  geom_bar(stat = "identity", aes(fill = fatalities)) +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(y = "Average fatalities", title = "Average fatalities per weather event")
g.fatalities

# The bars are coloured through the fill aesthetic, so the gradient must be a
# fill scale (a colour scale would leave the default fill in place).
g.injuries <- ggplot(injuries.data, aes(x = event_type, y = injuries)) +
  geom_bar(stat = "identity", aes(fill = injuries)) +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(y = "Average Injuries", title = "Average injuries per weather event")
g.injuries

Analysis Task2: Finding types of events having greatest economic consequences

The crop and property damage exponent codes (H, K, M, B) are converted into numeric multipliers and multiplied by the recorded damage amounts. The total damage caused by an event is the sum of its crop damage and its property damage, expressed in millions of dollars.

# Translate damage exponent codes into numeric multipliers (vectorised).
#
# Args:
#   exp_code: vector of exponent codes; matching is case-insensitive.
#
# Returns:
#   A numeric vector of the same length: 100 for codes containing "h",
#   1000 for "k", 1e6 for "m", 1e9 for "b". Any other code gets 1, so the
#   recorded amount is used unscaled rather than being turned negative.
damage_multiplier <- function(exp_code) {
  code <- tolower(as.character(exp_code))
  multiplier <- rep(1, length(code))
  # Assigned from lowest to highest priority so that, if a code contained
  # several letters, "h" wins over "k", "k" over "m" and "m" over "b".
  multiplier[grepl("b", code, fixed = TRUE)] <- 1e9
  multiplier[grepl("m", code, fixed = TRUE)] <- 1e6
  multiplier[grepl("k", code, fixed = TRUE)] <- 1e3
  multiplier[grepl("h", code, fixed = TRUE)] <- 1e2
  multiplier
}

# Keep events with some property or crop damage and compute the total damage
# (property + crop) in millions.
stormData2 <- stormDataClean %>%
  select(year, event_type, prop_dmg, prop_dmg_exp, crop_dmg, crop_dmg_exp) %>%
  filter(prop_dmg > 0 | crop_dmg > 0) %>%
  mutate(
    prop_dmg_exp = damage_multiplier(prop_dmg_exp),
    crop_dmg_exp = damage_multiplier(crop_dmg_exp),
    total.damage = (prop_dmg_exp * prop_dmg + crop_dmg_exp * crop_dmg) / 1e6
  )

# Mean total damage per event type; keep the five highest.
# head() returns fewer rows instead of NA rows when fewer than five exist.
damages.data <- stormData2 %>%
  group_by(event_type) %>%
  summarise(total.damage = mean(total.damage, na.rm = TRUE)) %>%
  arrange(desc(total.damage)) %>%
  head(5)

g.damages <- ggplot(damages.data, aes(x = event_type, y = total.damage)) +
  geom_bar(stat = "identity", aes(fill = total.damage)) +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(y = "Average damages", title = "Average damages(in millions) per weather event")
g.damages