# NOTE: leave message and warning output switched on while developing. Set them
# to FALSE only once the analysis runs cleanly; with them hidden, problems in
# the code become very difficult to troubleshoot.
knitr::opts_chunk$set(
  echo = TRUE,
  cache = TRUE,
  message = TRUE,
  warning = TRUE
)

Reproducible Research Project 2

This is an R Markdown document created to submit Peer Graded Project 2 in the Coursera Reproducible Research course.
This project has the following list of requirements:
1. Has either a (1) valid RPubs URL pointing to a data analysis document for this assignment been submitted; or (2) a complete PDF file presenting the data analysis been uploaded?
2. Is the document written in English?
3. Does the analysis include description and justification for any data transformations?
4. Does the document have a title that briefly summarizes the data analysis?
5. Does the document have a synopsis that describes and summarizes the data analysis in less than 10 sentences?
6. Is there a section titled “Data Processing” that describes how the data were loaded into R and processed for analysis?
7. Is there a section titled “Results” where the main results are presented?
8. Is there at least one figure in the document that contains a plot?
9. Are there at most 3 figures in this document?
10. Does the analysis start from the raw data file (i.e. the original .csv.bz2 file)?
11. Does the analysis address the question of which types of events are most harmful to population health?
12. Does the analysis address the question of which types of events have the greatest economic consequences?
13. Do all the results of the analysis (i.e. figures, tables, numerical summaries) appear to be reproducible?
14. Do the figure(s) have descriptive captions (i.e. there is a description near the figure of what is happening in the figure)
15. As far as you can determine, does it appear that the work submitted for this project is the work of the student who submitted it?

Synopsis

The Coursera Reproducible Research Project 2 asked students to use the R programming language to examine a climate data set provided by the National Weather Service and answer two questions.

  1. Across the United States, which types of events (as indicated in the “EVTYPE” variable) are most harmful with respect to population health?

  2. Across the United States, which types of events have the greatest economic consequences?

Across the United States, Tornado events are most harmful to population health. Tornadoes were the leading cause of both weather-related fatalities and injuries (5,633 and 91,346, respectively).

With regard to weather events having the greatest economic consequences across the United States, Flooding was the largest contributor. Roughly 150.3 billion USD worth of damages were attributed to flooding, with Hurricanes/Typhoons coming in second at over 71.9 billion USD.

Data Processing

Download Storm Data File.

The data file was provided by following the URL https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2. The data file was in the form of a comma-separated-value file compressed via the bzip2 algorithm. I wasn’t familiar with this compression type and had to review the help information on the download.file() function to get the proper syntax so that I could figure out how to extract data from a .BZ2 file. See below.

library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library(tidyverse)

# Download the raw data file, but only when it is not already on disk, so that
# re-knitting the document does not fetch the whole archive again.
URL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm.file <- "repdata_data_StormData.csv.bz2"

if (!file.exists(storm.file)) {
  # mode = "wb" writes the compressed archive as binary (matters on Windows).
  # The default download method is used so no external curl program is needed.
  download.file(URL, destfile = storm.file, mode = "wb")
}

# Reading the file is slow, so skip it when storm.data is already in the
# session. bzfile() opens the bzip2-compressed CSV for read.csv().
if (!exists("storm.data")) {
  storm.data <- read.csv(bzfile(storm.file), header = TRUE)
}

The data file is approximately 47MB. It contains 902,297 rows and 37 variables. I reviewed the contents of the file using the names(), str(), and head() functions.

# List the column names to see which variables the raw file provides.
names(storm.data)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
# Show the type of every column together with its first few values.
str(storm.data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
# Print the first rows of the raw data.
head(storm.data)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO
##   BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1         0                                               0         NA
## 2         0                                               0         NA
## 3         0                                               0         NA
## 4         0                                               0         NA
## 5         0                                               0         NA
## 6         0                                               0         NA
##   END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1         0                      14.0   100 3   0          0       15    25.0
## 2         0                       2.0   150 2   0          0        0     2.5
## 3         0                       0.1   123 2   0          0        2    25.0
## 4         0                       0.0   100 2   0          0        2     2.5
## 5         0                       0.0   150 2   0          0        2     2.5
## 6         0                       1.5   177 2   0          0        6     2.5
##   PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1          K       0                                         3040      8812
## 2          K       0                                         3042      8755
## 3          K       0                                         3340      8742
## 4          K       0                                         3458      8626
## 5          K       0                                         3412      8642
## 6          K       0                                         3450      8748
##   LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1       3051       8806              1
## 2          0          0              2
## 3          0          0              3
## 4          0          0              4
## 5          0          0              5
## 6          0          0              6

I also reviewed the National Weather Service Storm Data Documentation and the National Climatic Data Center Storm Events FAQ to learn more about the data file and how the variables were defined. Based on the requirements for the assignment, I deduced that many of the columns provided weren’t necessary. I used the select() function to create a smaller version of the data set that contained the information that I needed. I chose to focus on the following variables: “EVTYPE”, “FATALITIES”, “INJURIES”, “PROPDMG”, “PROPDMGEXP”, “CROPDMG”, and “CROPDMGEXP”.

# Keep only the event type plus the casualty and damage columns that the two
# assignment questions depend on.
sub.storm.data <- storm.data %>%
  select(
    EVTYPE,
    FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP
  )
head(sub.storm.data)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0          K       0           
## 2 TORNADO          0        0     2.5          K       0           
## 3 TORNADO          0        2    25.0          K       0           
## 4 TORNADO          0        2     2.5          K       0           
## 5 TORNADO          0        2     2.5          K       0           
## 6 TORNADO          0        6     2.5          K       0

An interesting find was that in estimating the total financial cost of the damage to property and/or crops, the National Weather Service didn’t put the total cost into one column. Instead they put the base dollar figure (rounded to three significant digits) in the “PROPDMG” and “CROPDMG” columns and a character symbol representing a multiplier in the “PROPDMGEXP” and “CROPDMGEXP” columns. Examples of multipliers are: K = Thousand, M = Million, or B = Billion. This made it necessary to convert the character to a number and compute the total dollar cost for each observation. For example, PROPDMG = 25.5 with a corresponding multiplier of “K” equates to property damage of $25,500.

The data provided gives us information on fatalities, injuries, property damage, and crop damage as it relates to weather events. The assignment calls for no more than three plots. Taking this into consideration, I took steps to combine the values of the property damage and crop damage into one variable denoting total financial damage “TOTFINDMG”. The unit measure for total financial damage is in U.S. dollars. See formula below:

TOTFINDMG = (PROPDMG * PROPDMGEXP) + (CROPDMG * CROPDMGEXP)

# Translate damage-exponent symbols into numeric multipliers.
#
# exp.symbol: vector of PROPDMGEXP / CROPDMGEXP codes.
# Returns a numeric vector of the same length as exp.symbol.
#
# Matching ignores case, so "k" and "K" both give 1e3. A digit d is read as
# 10^d, and a blank is read as 1 (the base figure is used as-is). The symbols
# "+", "-" and "?" get a multiplier of 0, which drops that damage figure; this
# is an analysis choice rather than something the data file defines. Any other
# symbol yields NA.
exp.to.multiplier <- function(exp.symbol) {
  known.symbols <- c("", "+", "-", "?", "H", "K", "M", "B", as.character(0:9))
  multipliers <- c(1, 0, 0, 0, 1e2, 1e3, 1e6, 1e9, 10^(0:9))
  multipliers[match(toupper(exp.symbol), known.symbols)]
}

# Find all the unique multiplier symbols used in PROPDMGEXP.
unique(sub.storm.data$PROPDMGEXP)
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
# Replace the multiplier symbols with their numeric values.
sub.storm.data$PROPDMGEXP <- exp.to.multiplier(sub.storm.data$PROPDMGEXP)

## Repeat exercise for CROPDMGEXP  ##
unique(sub.storm.data$CROPDMGEXP)
## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"
# Replace the multiplier symbols with their numeric values.
sub.storm.data$CROPDMGEXP <- exp.to.multiplier(sub.storm.data$CROPDMGEXP)

# Add a column for Total Financial Damage = Property Damage + Crop Damage (USD).
sub.storm.data <- mutate(
  sub.storm.data,
  TOTFINDMG = (PROPDMG * PROPDMGEXP) + (CROPDMG * CROPDMGEXP)
)
str(sub.storm.data)
## 'data.frame':    902297 obs. of  8 variables:
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: num  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ TOTFINDMG : num  25000 2500 25000 2500 2500 2500 2500 2500 25000 25000 ...

Breakdown Of Fatalities, Injuries, And Economic Impact Data

Now the data is set up with the necessary variables to address the questions asked in the assignment. We can now measure fatalities, injuries, and economic impact as they pertain to weather-related events. I looked into the data and discovered that there are 985 different weather event types in the data set. While interesting, that is too many to present. I decided that I could best answer the assigned questions based on the Top 10 weather events with regard to fatalities, injuries, and economic impact.

The code below shows the steps taken to break the data down into the Top 10 events for each category.

# ===================== FATALITIES BREAKDOWN ===========================
# Sum the recorded fatalities for each event type.
aggFATALITIES <- aggregate(FATALITIES ~ EVTYPE, data = sub.storm.data, FUN = sum)

# One row per event type: 985 of them, which is too many to present,
# so only the Top 10 are kept.
dim(aggFATALITIES)
## [1] 985   2
# Sort from most to fewest fatalities and keep the first ten rows.
# The same approach is repeated for all three indicators.
aggFATALITIES <- aggFATALITIES[
  order(aggFATALITIES$FATALITIES, decreasing = TRUE),
][seq_len(10), ]
aggFATALITIES
##             EVTYPE FATALITIES
## 834        TORNADO       5633
## 130 EXCESSIVE HEAT       1903
## 153    FLASH FLOOD        978
## 275           HEAT        937
## 464      LIGHTNING        816
## 856      TSTM WIND        504
## 170          FLOOD        470
## 585    RIP CURRENT        368
## 359      HIGH WIND        248
## 19       AVALANCHE        224
# ===================== INJURIES BREAKDOWN ==================================
# Sum the recorded injuries for each event type.
aggINJURIES <- aggregate(INJURIES ~ EVTYPE, data = sub.storm.data, FUN = sum)

# Sort from most to fewest injuries and keep the Top 10 weather events.
aggINJURIES <- aggINJURIES[
  order(aggINJURIES$INJURIES, decreasing = TRUE),
][seq_len(10), ]
aggINJURIES
##                EVTYPE INJURIES
## 834           TORNADO    91346
## 856         TSTM WIND     6957
## 170             FLOOD     6789
## 130    EXCESSIVE HEAT     6525
## 464         LIGHTNING     5230
## 275              HEAT     2100
## 427         ICE STORM     1975
## 153       FLASH FLOOD     1777
## 760 THUNDERSTORM WIND     1488
## 244              HAIL     1361
# =================== ECONOMIC IMPACT =======================================
# Sum the total financial damage (property + crop) for each event type.
aggFINIMPACT <- aggregate(TOTFINDMG ~ EVTYPE, data = sub.storm.data, FUN = sum)

# Sort from largest to smallest total and keep the Top 10 weather events.
# The totals are plain USD amounts (they are not rescaled to billions).
aggFINIMPACT <- aggFINIMPACT[
  order(aggFINIMPACT$TOTFINDMG, decreasing = TRUE),
][seq_len(10), ]
aggFINIMPACT
##                EVTYPE    TOTFINDMG
## 170             FLOOD 150319678257
## 411 HURRICANE/TYPHOON  71913712800
## 834           TORNADO  57362333887
## 670       STORM SURGE  43323541000
## 244              HAIL  18761221986
## 153       FLASH FLOOD  18243991079
## 95            DROUGHT  15018672000
## 402         HURRICANE  14610229010
## 590       RIVER FLOOD  10148404500
## 427         ICE STORM   8967041360

Results

Based on the data provided, Tornadoes were responsible for the largest weather-related death toll: 5,633 deaths were attributed to tornadoes. See the plot below.

# Bar chart of the ten event types with the most fatalities, ordered by count.
# coord_flip() puts the event names on the vertical axis so they are easier
# to read.
wthr.fatalities.plot <- ggplot(
  data = aggFATALITIES,
  mapping = aes(x = reorder(EVTYPE, FATALITIES), y = FATALITIES, fill = EVTYPE)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Fatalities by Weather Event",
    x = "Weather Event Type",
    y = "Number of Fatalities"
  ) +
  theme_bw()

wthr.fatalities.plot

Based on the above, it is not hard to surmise that Tornadoes would provide the highest volume of injuries also. Tornadoes accounted for 91,346 injuries according to the data provided. See the plot below.

# Bar chart of the ten event types with the most injuries, ordered by count.
# coord_flip() puts the event names on the vertical axis so they are easier
# to read.
wthr.injuries.plot <- ggplot(
  data = aggINJURIES,
  mapping = aes(x = reorder(EVTYPE, INJURIES), y = INJURIES, fill = EVTYPE)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Injuries by Weather Event",
    x = "Weather Event Type",
    y = "Number of Injuries"
  ) +
  theme_bw()

wthr.injuries.plot

As for the economic consequences of weather damage, Floods led the pack with over 150 billion USD in damage, followed by hurricanes/typhoons with over 71 billion USD. See the plot below.

# Bar chart of the ten event types with the largest total financial damage
# (property + crop, in USD), ordered by amount.
# aggFINIMPACT already holds one summed total per event type, so the bars are
# drawn straight from those values with geom_col(); no counting or summing
# stat is applied. This matches how the fatalities and injuries plots are drawn.
# coord_flip() puts the event names on the vertical axis so they are easier
# to read.
wthr.econdmg.plot <- ggplot(
  aggFINIMPACT,
  aes(x = reorder(EVTYPE, TOTFINDMG), y = TOTFINDMG, fill = EVTYPE)
) +
  geom_col() +
  labs(
    x = "Weather Event Type",
    y = "Total Financial Impact (USD)",
    title = "Financial Impact of Weather Damage to Property and Crops"
  ) +
  coord_flip() +
  theme_bw()

wthr.econdmg.plot