Synopsis

This is an exploratory analysis based on a data set provided by the National Weather Service. The data set consists of historical severe weather events with their timeline, duration, fatalities, injuries, etc. This analysis focuses on the population health and economic consequences by event type and identifies which events are the most harmful.

Data Processing

This analysis uses the following packages: dplyr to mutate, aggregate, filter and arrange data; stringr to parse strings and split them into arrays; and ggplot2 to draw the charts. Please make sure these packages are installed before running the code.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)

The original file is unzipped and placed in the current working directory. The data set contains more fields than the scope of this analysis can cover. To keep the data table small and clean, only the fields of interest are kept.

## Load the full storm CSV, then keep only the columns this analysis uses
storm_dataset <- read.csv(
  file = "repdata-data-StormData.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
storm_dataset <- storm_dataset[, c(
  "BGN_DATE", "END_DATE", "STATE", "EVTYPE",
  "FATALITIES", "INJURIES", "PROPDMG", "CROPDMG"
)]

The analysis is divided into two categories: the most harmful weather events by population health and by economic consequences. For the population health analysis, the numbers of fatalities and injuries are the key metrics. For the economic analysis, the amounts of crop damage and property damage are taken into account. On top of these category-specific metrics, the number of states in which the event has occurred, the number of years, and the number of months (out of 12) in which it has occurred are also considered to come up with the severity score.

Clean up the date format, etc. Also add month and year fields to the data set with the mutate function in dplyr. The distinct numbers of years, months and states are captured, which tell the scope of a given event type by timeline as well as by geographic location.

## Drop everything after the first space in BGN_DATE / END_DATE and
## lower-case the event type names
storm_dataset <- storm_dataset %>%
  mutate(
    BGN_DATE = str_split_fixed(BGN_DATE, " ", 2)[, 1],
    END_DATE = str_split_fixed(END_DATE, " ", 2)[, 1],
    EVTYPE = tolower(EVTYPE)
  ) %>%
  ## The third and first "/"-separated pieces of the cleaned BGN_DATE are
  ## stored as YEAR and MONTH respectively
  mutate(
    YEAR = as.numeric(str_split_fixed(BGN_DATE, "/", 3)[, 3]),
    MONTH = as.numeric(str_split_fixed(BGN_DATE, "/", 3)[, 1])
  )

## Dataset-wide distinct counts of years, months and states (as doubles)
total_num_year <- as.numeric(n_distinct(storm_dataset$YEAR))
total_num_month <- as.numeric(n_distinct(storm_dataset$MONTH))
total_states <- as.numeric(n_distinct(storm_dataset$STATE))

Code to come up with the severity score for population health:

## Dataset-wide totals, used to express each event type's share of harm
total_fatal <- as.numeric(sum(storm_dataset$FATALITIES))
total_injur <- as.numeric(sum(storm_dataset$INJURIES))

## Rank event types by a population-health severity score.
##
## Only records with at least one fatality or injury are considered. For each
## event type the score adds up five percentages:
##   - share of all fatalities, weighted x3
##   - share of all injuries, weighted x2
##   - share of the dataset's distinct years in which the event occurred
##   - share of the dataset's distinct months in which the event occurred
##   - share of the dataset's distinct states in which the event occurred
## The year and month denominators come from total_num_year / total_num_month
## (computed from the data) rather than the literals 62 and 12, so the score
## stays correct if the time span of the input changes.
hlth_by_event_type <- storm_dataset %>%
  filter(FATALITIES > 0 | INJURIES > 0) %>%
  group_by(EVTYPE) %>%
  summarize(
    event_count = n(),
    fatal = sum(FATALITIES),
    injur = sum(INJURIES),
    num_year = n_distinct(YEAR),
    num_month = n_distinct(MONTH),
    num_states = n_distinct(STATE),
    severity_score =
      (3 * fatal / total_fatal) * 100 +
      (2 * injur / total_injur) * 100 +
      (num_year / total_num_year) * 100 +
      (num_month / total_num_month) * 100 +
      (num_states / total_states) * 100
  ) %>%
  ## Most harmful event types first
  arrange(desc(severity_score))

Similarly, the severity score related to economic consequences is calculated by the following code chunk.

## Dataset-wide damage totals ("corp" in the names below stands for crop)
## NOTE(review): PROPDMG / CROPDMG are summed as-is; if the raw data carries
## separate magnitude columns for these figures they are not applied here --
## TODO confirm against the dataset documentation.
total_prop_dmg <- as.numeric(sum(storm_dataset$PROPDMG))
total_corp_dmg <- as.numeric(sum(storm_dataset$CROPDMG))

## Rank event types by an economic severity score.
##
## Only records with some crop or property damage are considered. For each
## event type the score adds up five percentages:
##   - share of all property damage
##   - share of all crop damage
##   - share of the dataset's distinct years in which the event occurred
##   - share of the dataset's distinct months in which the event occurred
##   - share of the dataset's distinct states in which the event occurred
## The year and month denominators come from total_num_year / total_num_month
## (computed from the data) rather than the literals 62 and 12, so the score
## stays correct if the time span of the input changes.
econ_by_event_type <- storm_dataset %>%
  filter(CROPDMG > 0 | PROPDMG > 0) %>%
  group_by(EVTYPE) %>%
  summarize(
    event_count = n(),
    prop_damage = sum(PROPDMG),
    corp_damage = sum(CROPDMG),
    num_year = n_distinct(YEAR),
    num_month = n_distinct(MONTH),
    num_states = n_distinct(STATE),
    severity_score =
      (prop_damage / total_prop_dmg) * 100 +
      (corp_damage / total_corp_dmg) * 100 +
      (num_year / total_num_year) * 100 +
      (num_month / total_num_month) * 100 +
      (num_states / total_states) * 100
  ) %>%
  ## Most harmful event types first
  arrange(desc(severity_score))

Results

First, let’s preview the final aggregated result set with the calculated severity score. The result set is ordered by severity score in decreasing order. The higher the severity score, the more harmful the event type is.

## Preview the first rows of the health ranking (sorted by descending severity score)
head(hlth_by_event_type)
## Source: local data frame [6 x 8]
## 
##        EVTYPE event_count fatal injur num_year num_month num_states
##         (chr)       (int) (dbl) (dbl)    (int)     (int)      (int)
## 1     tornado        7928  5633 91346       62        12         49
## 2   lightning        3305   816  5230       19        12         55
## 3   tstm wind        2930   504  6957       23        12         49
## 4 flash flood         931   978  1777       19        12         50
## 5       flood         410   470  6789       19        12         48
## 6   high wind         525   248  1137       19        12         51
## Variables not shown: severity_score (dbl)
## Preview the first rows of the economic ranking (sorted by descending severity score)
head(econ_by_event_type)
## Source: local data frame [6 x 8]
## 
##        EVTYPE event_count prop_damage corp_damage num_year num_month
##         (chr)       (int)       (dbl)       (dbl)    (int)     (int)
## 1     tornado       39361   3212258.2   100018.52       62        12
## 2        hail       25969    688693.4   579596.28       19        12
## 3 flash flood       20659   1420124.6   179200.46       19        12
## 4       flood       10058    899938.5   168037.88       19        12
## 5   tstm wind       61476   1335995.6   109202.60       14        12
## 6   lightning       10360    603351.8     3580.61       19        12
## Variables not shown: num_states (int), severity_score (dbl)

As seen in the rankings above, tornado is the most harmful event type with respect to both population health and the economy. For economic consequences, hail ranks second, whereas it ranks ninth in terms of population health consequences. This makes sense, as hail is usually not a life-threatening event but it does cause a tremendous amount of economic damage.

The findings are presented in the following two graphs. Only the top 10 events are presented to keep the charts clean. I took a chunk of code from the internet to draw multiple plots on one page.

## Taken from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/

## Draw several plot objects on one page using a grid layout.
##
## Args:
##   ...:      plot objects to draw.
##   plotlist: optional list of further plot objects, appended after `...`.
##   file:     accepted for compatibility; never used by this function.
##   cols:     number of columns in the auto-generated layout; ignored when
##             `layout` is supplied.
##   layout:   optional matrix of plot indices; plot i is printed into the
##             cell(s) whose value equals i.
##
## When no plots are supplied nothing is drawn and NULL is returned invisibly
## (previously the 1:numPlots loop ran over c(1, 0) and failed on plots[[1]]).
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw: bail out before opening a page or touching grid
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # Kept as in the original: attaches grid so the viewport helpers resolve
  library(grid)

  if (is.null(layout)) {
    # Fill a cols-wide matrix column by column with plot indices, with as
    # many rows as needed to hold every plot
    num_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * num_rows), ncol = cols, nrow = num_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in seq_len(num_plots)) {
      # Row/column positions of the layout cells assigned to plot i
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Viz: Impact by different weather events by Year

## Yearly totals per event type for the event types shown in the charts.
##
## For every (YEAR, EVTYPE) pair this sums property damage, crop damage
## ("corp_damage"), fatalities and injuries; severity_score here is simply
## the sum of those four totals.
##
## "strong wind" replaces the previous "string wind", which could never match
## an event type and silently dropped those records.
## NOTE(review): "flood" ranks in the top six of both tables above but is not
## in this list -- confirm whether it should be plotted as well.
final_summary_for_graph <- storm_dataset %>%
  filter(
    EVTYPE %in% c(
      "tornado", "flash flood", "hail", "tstm wind", "lightning",
      "high wind", "heavy rain", "excessive heat", "strong wind",
      "thunderstorm wind"
    )
  ) %>%
  group_by(YEAR, EVTYPE) %>%
  summarize(
    prop_damage = sum(PROPDMG),
    corp_damage = sum(CROPDMG),
    fatal = sum(FATALITIES),
    injur = sum(INJURIES),
    severity_score = (prop_damage + corp_damage + fatal + injur)
  ) %>%
  ## Drop the leftover YEAR grouping so the sort below is global
  ungroup() %>%
  arrange(desc(severity_score))

## One stacked area chart per yearly metric, outlined and filled by event type
p1 <- ggplot(final_summary_for_graph, aes(YEAR, fatal)) +
  geom_area(aes(colour = EVTYPE, fill = EVTYPE), position = "stack")
p2 <- ggplot(final_summary_for_graph, aes(YEAR, injur)) +
  geom_area(aes(colour = EVTYPE, fill = EVTYPE), position = "stack")
p3 <- ggplot(final_summary_for_graph, aes(YEAR, corp_damage)) +
  geom_area(aes(colour = EVTYPE, fill = EVTYPE), position = "stack")
p4 <- ggplot(final_summary_for_graph, aes(YEAR, prop_damage)) +
  geom_area(aes(colour = EVTYPE, fill = EVTYPE), position = "stack")

## Fatalities and injuries by year, stacked in a single column
multiplot(plotlist = list(p1, p2), cols = 1)

## Crop and property damage by year, stacked in a single column
multiplot(plotlist = list(p3, p4), cols = 1)