### Setup
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)

Synopsis / Abstract

Objective

The objective of this report is to look through our history of weather events from the last 60 years to identify the types of weather events that cause the greatest degree of harm to the population, and result in significant economic damages.

Questions Addressed

In short, this analsis addresses two questions:

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

  2. Across the United States, which types of events have the greatest economic consequences?

Conclusion

The detail of the report identifies the top 10 types of environmental events resulting in harm to the population, and the top 10 events resulting in economic damages.

Tornados are the most significant risk to personal safety, fatalaties and injuries from Tornados are an order of magnitude greater than the next categories of Heat, Thunderstorm Winds, and Flooding.

In order, Floods, Hurricanes and Tornados are the weather events that cause the greatest economic damage. With the majority of the damage caused to personal property.


Data Processing

This report uses data supplied by the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The dataset was pre-processed by the wonderful people at John Hopkins University in the Biostatistics Department, and is available on their server.

Retrieve File from website

fname <- "StormData.csv.bz2"
if (!file.exists(fname)){
      furl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
      download.file(furl, destfile = fname, method="curl")
      # date of download noted.
      StormDataDownloadDate <- date()
      StormDataDownloadDate
}
## [1] "Sat Jan 14 15:13:26 2017"

Load data

  • Conversion to factors is suppressed for all columns. EVTYPE is kept as character until consolodation, then turned into a factor.
data1 <- read.csv(fname, as.is=TRUE)
str(data1)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Preprocessing Data Modifications

Remove Unnecessary Columns

From the initial exploration of the data with respect to the questions of concern, it is apparent that a number of the columns in the dataset will not be required. These are removed to improve computer performance. The resulting dataset contains the 7 columns required to address the report’s objectives.

data1 <- select(data1, EVTYPE, FATALITIES, INJURIES, PROPDMG,
                         PROPDMGEXP, CROPDMG, CROPDMGEXP) 
str(data1)
## 'data.frame':    902297 obs. of  7 variables:
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...

Grouping / Factoring of Event types

From the exploratory analysis it became apparent that a number of the event types are split into multiple groups resulting from variations in the labeling applied. Many of these consolidations arise from alternate abbreviations, upper/lower case usage, spelling mistakes. The initial file contained 985 EVTYPE factors. The following groups of event types have been combined. Refer to the R code in the ‘consolidateEvents’ code chunk for specific details.

  • Tornado
  • Heat
  • Thunderstorm wind
  • Hail
  • Extreme Cold
  • Hurricane
  • Tropical Storm
  • Rip Current
  • Lightning

In the event that a ranking deeper than the top 10 is required, further effort is required to group and consolidate the event types.

# Combine Tornado events
data1$EVTYPE <- sub("(.*)TORNADO(.*)", "TORNADO", data1$EVTYPE)

# Combine Heat events
gr1 <- c('HEAT', 'EXCESSIVE HEAT', 'HEAT WAVE', 'EXTREME HEAT', 'Heat Wave', 'RECORD HEAT', 'UNSEASONABLY WARM AND DRY', 'UNSEASONABLY WARM', 'HEAT WAVE DROUGHT', 'RECORD/EXCESSIVE HEAT', 'HEAT WAVES', 'DROUGHT/EXCESSIVE HEAT', 'WARM WEATHER', 'Record Heat', 'RECORD HEAT WAVE', 'Record High', 'RECORD HIGH', 'RECORD HIGH TEMPERATURE', 'RECORD HIGH TEMPERATURES', 'Record temperature', 'RECORD TEMPERATURE', 'Record Temperatures', 'RECORD TEMPERATURES', 'RECORD WARM', 'RECORD WARM TEMPS.', 'Record Warmth', 'RECORD WARMTH', 'VERY WARM')
data1$EVTYPE[data1$EVTYPE %in% gr1] <- gr1[1]

# Combine Thunderstorm wind events
gr2 <- c('THUNDERSTORM WIND', 'TSTM WIND', 'THUNDERSTORM WINDS', 'THUNDERSTORMW', 'THUNDERSTORM WINDS', 'THUNDERSTORM WINDSS', "LIGHTNING AND THUNDERSTORM WIN", "THUNDERSTORM WIND (G40)", "THUNDERSTORM WIND G52", "THUNDERSTORM WINDS 13", "THUNDERSTORM WINDS/HAIL", "THUNDERSTORMS WINDS", "THUNDERTORM WINDS", "TSTM WIND (G35)", "TSTM WIND (G40)", "TSTM WIND", "TSTM WIND (G45)", "GUSTY THUNDERSTORM WIND", "GUSTY THUNDERSTORM WINDS", "LIGHTNING THUNDERSTORM WINDS", "LIGHTNING THUNDERSTORM WINDSS", "SEVERE THUNDERSTORM", "SEVERE THUNDERSTORM WINDS", "SEVERE THUNDERSTORMS", "THUDERSTORM WINDS", "THUNDEERSTORM WINDS", "THUNDERESTORM WINDS", "THUNDERSTORM DAMAGE", "THUNDERSTORM DAMAGE TO", "THUNDERSTORM W INDS", "Thunderstorm Wind", "THUNERSTORM WINDS", "TUNDERSTORM WIND")
data1$EVTYPE[data1$EVTYPE %in% gr2] <- gr2[1]

# Combine HAIL events
data1$EVTYPE <- sub("(.*)HAIL(.*)", "HAIL", data1$EVTYPE)

# Second pass to combine combine additional THUNDERSTORM and TSTM events
data1$EVTYPE <- sub("(.*)THUNDERSTORM(.*)", "THUNDERSTORM WIND", data1$EVTYPE)
data1$EVTYPE <- sub("(.*)TSTM(.*)", "THUNDERSTORM WIND", data1$EVTYPE)

# Extreme Cold events
gr3 <- c("EXTREME COLD", "EXTREME COLD/WIND CHILL", "COLD/WIND CHILL", "COLD", "EXTREME WINDCHILL", "COLD WEATHER", "Cold", "COLD WAVE", "Cold Temperature", "Extreme Cold", "UNSEASONABLY COLD", "Extended Cold", "RECORD COLD", "SNOW/ BITTER COLD", "EXTREME/RECORD COLD")
data1$EVTYPE[data1$EVTYPE %in% gr3] <- gr3[1]

# Combine Hurricane events
data1$EVTYPE <- sub("(.*)HURRICANE(.*)", "HURRICANE", data1$EVTYPE)

# Tropical Storm events
data1$EVTYPE <- sub("(.*)TROPICAL(.*)", "TROPICAL STORM", data1$EVTYPE)

# Rip Current events
data1$EVTYPE <- sub("(.*)RIP(.*)", "RIP CURRENT", data1$EVTYPE)

# Lightning events
data1$EVTYPE <- sub("(.*)LIGHTN(.*)", "LIGHTNING", data1$EVTYPE)

# Fire events
data1$EVTYPE <- sub("(.*)FIRE(.*)", "WILD/FOREST FIRE", data1$EVTYPE)

# Set EVTYPE as factor
data1$EVTYPE <- as.factor(data1$EVTYPE)

str(data1$EVTYPE)
##  Factor w/ 745 levels "   HIGH SURF ADVISORY",..: 646 646 646 646 646 646 646 646 646 646 ...

Applying cost factor multipliers

From the dataset codebook, section 2.7 “Damage,”" we find that the property damage estimates PROPDMG need to be multiplied by PROPDMGEXP to arrive at the correct dollar figure. A similar multiplication is required for the crop damage estimates.

The multipliers used in the dataset are as follows (definitions are in the codebook):

levels(as.factor(data1$PROPDMGEXP))
##  [1] ""  "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
levels(as.factor(data1$CROPDMGEXP))
## [1] ""  "?" "0" "2" "B" "k" "K" "m" "M"

In the following code chunk, the exponential multipliers are set up and then applied to the damage cost estimates to provide the correct economic damages for analysis.

# Convert the PROPDMGEXP and CROPDMGEXP columns to numerical exponents of 10's
data1$PROPDMGEXP <- sub("[H|h]", "2", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[K|k]", "3", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[M|m]", "6", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[B|b]", "9", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[-|+|?]", "0", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("", "0", data1$PROPDMGEXP)
data1$PROPDMGEXP <- as.numeric(data1$PROPDMGEXP)

data1$CROPDMGEXP <- sub("[H|h]", "2", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[K|k]", "3", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[M|m]", "6", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[B|b]", "9", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[-|+|?]", "0", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("", "0", data1$CROPDMGEXP)
data1$CROPDMGEXP <- as.numeric(data1$CROPDMGEXP)

data1 <- data1 %>%
         mutate(cost_property=PROPDMG*10**PROPDMGEXP,
                cost_crops=CROPDMG*10**CROPDMGEXP)

# Remove unnecessary columns
data1$PROPDMG <- NULL
data1$CROPDMG <- NULL
data1$PROPDMGEXP <- NULL
data1$CROPDMGEXP <- NULL

Results

Health Effects of Severe Weather Events

Objective 1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

For the purposes of this investigation, fatalities are combined with reportable injuries, giving a total casualty summation that is used for identifying the most harmful weather events. The total represents casualties over the entire reported period, 1950 through 2011.

health_ev <- group_by(data1, EVTYPE) %>%
             summarize(Tot_fatal=sum(FATALITIES, na.rm=TRUE),
                       Max_fatal=max(FATALITIES, na.rm=TRUE),
                       Mean_fatal=mean(FATALITIES, na.rm=TRUE),
                       Tot_inj=sum(INJURIES, na.rm=TRUE),
                       Max_inj=max(INJURIES, na.rm=TRUE),                       
                       Mean_inj=mean(INJURIES, na.rm=TRUE) ) %>%
             mutate(Tot_casualty=Tot_fatal + Tot_inj) %>%
             arrange(desc(Tot_casualty))

# Adjust column ordering
health_ev <- select(health_ev, c(1,8,2:7))

#Create tables for examining most harmful weather events
health_comb <- select(health_ev, c(1,2))
health_fatal <- arrange(health_ev, desc(Tot_fatal)) %>% select(c(1,3:5))
health_injury <- arrange(health_ev, desc(Tot_inj)) %>% select(c(1,6:8))

Top Weather Events Causing Population Harm Over Study Period

  • Total Casualties is the combination of fatalities and injuries
head(health_comb, 10)
## # A tibble: 10 × 2
##               EVTYPE Tot_casualty
##               <fctr>        <dbl>
## 1            TORNADO        97068
## 2               HEAT        12421
## 3  THUNDERSTORM WIND        10174
## 4              FLOOD         7259
## 5          LIGHTNING         6048
## 6        FLASH FLOOD         2755
## 7          ICE STORM         2064
## 8   WILD/FOREST FIRE         1698
## 9       WINTER STORM         1527
## 10              HAIL         1486

Top Weather Events Causing Fatalities

head(health_fatal, 10)
## # A tibble: 10 × 4
##               EVTYPE Tot_fatal Max_fatal  Mean_fatal
##               <fctr>     <dbl>     <dbl>       <dbl>
## 1            TORNADO      5661       158 0.093261944
## 2               HEAT      3178       583 1.068953919
## 3        FLASH FLOOD       978        20 0.018018682
## 4          LIGHTNING       817         5 0.051826947
## 5  THUNDERSTORM WIND       725        11 0.002159525
## 6        RIP CURRENT       577         6 0.742599743
## 7              FLOOD       470        15 0.018558004
## 8       EXTREME COLD       452        10 0.174787316
## 9          HIGH WIND       248         8 0.012269939
## 10         AVALANCHE       224         6 0.580310881

Top Weather Events Causing Injuries

head(health_injury, 10)
## # A tibble: 10 × 4
##               EVTYPE Tot_inj Max_inj    Mean_inj
##               <fctr>   <dbl>   <dbl>       <dbl>
## 1            TORNADO   91407    1700 1.505881384
## 2  THUNDERSTORM WIND    9449      70 0.028145311
## 3               HEAT    9243     519 3.108980827
## 4              FLOOD    6789     800 0.268064440
## 5          LIGHTNING    5231      51 0.331832022
## 6          ICE STORM    1975    1568 0.984546361
## 7        FLASH FLOOD    1777     150 0.032739466
## 8   WILD/FOREST FIRE    1608     150 0.379334749
## 9               HAIL    1466     109 0.005048748
## 10         HURRICANE    1326     780 4.636363636

Summary Chart of Most Harmful Types of Weather Events

# melt down the top 10 rows of health_ev
health_top <- health_ev[1:10,]
health_top <- melt(health_top, id="EVTYPE", measure.vars=c("Tot_fatal", "Tot_inj"))

# plot
g <- ggplot(health_top, aes(x= reorder(EVTYPE, -value), y=value, fill=variable))
g <- g + geom_bar(position="stack", stat='identity')
g <- g + theme(axis.text.x=element_text(angle = 60, hjust = 1))
g <- g + labs(title="Population Health Affected By Weather Events",
              x="Type of Weather Event",
              y="Casualties")
#g <- g + scale_y_log10()
g <- g + scale_fill_manual(name="Health Effect",
                           values=c("red","skyblue"),
                           labels=c("Fatalities","Injuries"))
print(g)


Financial Effects of Severe Weather Events

Objective 2. Across the United States, which types of events have the greatest economic consequences?

For the purposes of this analysis, property damage is combined with agricultural damages for the total economic losses. The summary chart does show the relative magnitudes of property vs agricultural damages.

financial_ev <- group_by(data1, EVTYPE) %>%
                summarize(Tot_prop=sum(cost_property, na.rm=TRUE),
                          Tot_crop=sum(cost_crops, na.rm=TRUE) ) %>%
                mutate(Tot_cost=Tot_prop + Tot_crop) %>%
                arrange(desc(Tot_cost))
# Fix column ordering
financial_ev <- select(financial_ev, c(1,4,2,3))

head(financial_ev,10)
## # A tibble: 10 × 4
##               EVTYPE     Tot_cost     Tot_prop    Tot_crop
##               <fctr>        <dbl>        <dbl>       <dbl>
## 1              FLOOD 150319678257 144657709807  5661968450
## 2          HURRICANE  90271472810  84756180010  5515292800
## 3            TORNADO  59020779447  58603317927   417461520
## 4        STORM SURGE  43323541000  43323536000        5000
## 5               HAIL  19134309410  16022626537  3111682873
## 6        FLASH FLOOD  18243991079  16822673979  1421317100
## 7            DROUGHT  15018672000   1046106000 13972566000
## 8  THUNDERSTORM WIND  12347318414  11140399676  1206918738
## 9        RIVER FLOOD  10148404500   5118945500  5029459000
## 10         ICE STORM   8967041360   3944927860  5022113500
# melt down the top 10 rows of health_ev
financial_top <- financial_ev[1:10,]
financial_top <- melt(financial_top, id="EVTYPE", 
                      measure.vars=c("Tot_prop", "Tot_crop"))

# plot
g <- ggplot(financial_top, aes(x= reorder(EVTYPE, -value), y=value, fill=variable))
g <- g + geom_bar(position="stack", stat='identity')
g <- g + theme(axis.text.x=element_text(angle = 60, hjust = 1))
g <- g + labs(title="Damage Caused By Weather Events",
              x="Type of Weather Event",
              y="Cummulative Cost")
g <- g + scale_fill_manual(name="Damages",
                           values=c("darkgreen","royalblue"),
                           labels=c("Property","Crops"))
print(g)