Storm Data Analysis

Reproducible Research: Peer Assessment 2

Jenina Halitsky

August 24, 2014

=========================================================================================================================

Synopsis

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage. Preventing such outcomes to the extent possible is a key concern. This report explores the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database to determine which types of events are most harmful with respect to population health, as well as which types of events have the greatest economic consequences.

Data Processing

The data for this analysis comes in the form of a comma-separated-value (CSV) file compressed via the bzip2 algorithm to reduce its size. The storm data file was downloaded from the Coursera Reproducible Research website on August 20, 2014. The events in the database start in the year 1950 and end in November 2011. In the earlier years, there are generally fewer events recorded due to a lack of good records. More recent years should be more complete.

Setup Configurations & Libraries

        # Load knitr and set the package-level and chunk-level options used for
        # the whole report.
        library(knitr)
        opts_knit$set(verbose = TRUE, progress = FALSE)
        opts_chunk$set(
            echo = TRUE,
            message = FALSE,
            tidy = TRUE,
            comment = NA,
            fig.path = "figure/",
            fig.keep = "high",
            fig.width = 10,
            fig.height = 6,
            fig.align = "center"
        )

Load needed libraries.

# Attach the packages used by the analysis. library() stops with an error when
# a package is not installed, whereas require() only warns and returns FALSE,
# which would defer the failure to the first ggplot() call.
library(plyr)
library(ggplot2)

Load Data

# Download the bzip2-compressed storm data and read it into `stormdata`.
storm_url <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_bz2 <- "~/Data/repdata_data_StormData.csv.bz2"

# download.file() does not create missing directories, so make sure the target
# directory exists first.
dir.create(dirname(storm_bz2), showWarnings = FALSE, recursive = TRUE)

# mode = "wb": the archive is binary and must not be written in text mode.
download.file(storm_url, storm_bz2, mode = "wb")

# The second argument of bzfile() is the open mode, not an output file name;
# open the connection explicitly for reading so it can be closed below.
csv <- bzfile(storm_bz2, open = "r")

# read.csv2() uses "," as the decimal mark, so with sep = "," the columns
# written with a decimal point (e.g. "1.00") are kept as character strings;
# they are converted to numeric further down where needed.
stormdata <- read.csv2(csv, sep = ",", stringsAsFactors = FALSE)
close(csv)

# The original unlink(csv) call was dropped: it was passed the connection
# object rather than a file path, so it never removed the downloaded archive.

=========================================================================================================================

Question 1: Across the United States, which types of events are most harmful with respect to population health?

Data Variables

str(stormdata)
'data.frame':   902297 obs. of  37 variables:
 $ STATE__   : chr  "1.00" "1.00" "1.00" "1.00" ...
 $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
 $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
 $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
 $ COUNTY    : chr  "97.00" "3.00" "57.00" "89.00" ...
 $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
 $ STATE     : chr  "AL" "AL" "AL" "AL" ...
 $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
 $ BGN_RANGE : chr  "0.00" "0.00" "0.00" "0.00" ...
 $ BGN_AZI   : chr  "" "" "" "" ...
 $ BGN_LOCATI: chr  "" "" "" "" ...
 $ END_DATE  : chr  "" "" "" "" ...
 $ END_TIME  : chr  "" "" "" "" ...
 $ COUNTY_END: chr  "0.00" "0.00" "0.00" "0.00" ...
 $ COUNTYENDN: logi  NA NA NA NA NA NA ...
 $ END_RANGE : chr  "0.00" "0.00" "0.00" "0.00" ...
 $ END_AZI   : chr  "" "" "" "" ...
 $ END_LOCATI: chr  "" "" "" "" ...
 $ LENGTH    : chr  "14.00" "2.00" "0.10" "0.00" ...
 $ WIDTH     : chr  "100.00" "150.00" "123.00" "100.00" ...
 $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
 $ MAG       : chr  "0.00" "0.00" "0.00" "0.00" ...
 $ FATALITIES: chr  "0.00" "0.00" "0.00" "0.00" ...
 $ INJURIES  : chr  "15.00" "0.00" "2.00" "2.00" ...
 $ PROPDMG   : chr  "25.00" "2.50" "25.00" "2.50" ...
 $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
 $ CROPDMG   : chr  "0.00" "0.00" "0.00" "0.00" ...
 $ CROPDMGEXP: chr  "" "" "" "" ...
 $ WFO       : chr  "" "" "" "" ...
 $ STATEOFFIC: chr  "" "" "" "" ...
 $ ZONENAMES : chr  "" "" "" "" ...
 $ LATITUDE  : chr  "3040.00" "3042.00" "3340.00" "3458.00" ...
 $ LONGITUDE : chr  "8812.00" "8755.00" "8742.00" "8626.00" ...
 $ LATITUDE_E: chr  "3051.00" "0.00" "0.00" "0.00" ...
 $ LONGITUDE_: chr  "8806.00" "0.00" "0.00" "0.00" ...
 $ REMARKS   : chr  "" "" "" "" ...
 $ REFNUM    : chr  "1.00" "2.00" "3.00" "4.00" ...

Show First 10 Rows of Data.

head(stormdata, 10)
   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
1     1.00  4/18/1950 0:00:00     0130       CST  97.00     MOBILE    AL
2     1.00  4/18/1950 0:00:00     0145       CST   3.00    BALDWIN    AL
3     1.00  2/20/1951 0:00:00     1600       CST  57.00    FAYETTE    AL
4     1.00   6/8/1951 0:00:00     0900       CST  89.00    MADISON    AL
5     1.00 11/15/1951 0:00:00     1500       CST  43.00    CULLMAN    AL
6     1.00 11/15/1951 0:00:00     2000       CST  77.00 LAUDERDALE    AL
7     1.00 11/16/1951 0:00:00     0100       CST   9.00     BLOUNT    AL
8     1.00  1/22/1952 0:00:00     0900       CST 123.00 TALLAPOOSA    AL
9     1.00  2/13/1952 0:00:00     2000       CST 125.00 TUSCALOOSA    AL
10    1.00  2/13/1952 0:00:00     2000       CST  57.00    FAYETTE    AL
    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
1  TORNADO      0.00                                            0.00
2  TORNADO      0.00                                            0.00
3  TORNADO      0.00                                            0.00
4  TORNADO      0.00                                            0.00
5  TORNADO      0.00                                            0.00
6  TORNADO      0.00                                            0.00
7  TORNADO      0.00                                            0.00
8  TORNADO      0.00                                            0.00
9  TORNADO      0.00                                            0.00
10 TORNADO      0.00                                            0.00
   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH  WIDTH F  MAG FATALITIES
1          NA      0.00                     14.00 100.00 3 0.00       0.00
2          NA      0.00                      2.00 150.00 2 0.00       0.00
3          NA      0.00                      0.10 123.00 2 0.00       0.00
4          NA      0.00                      0.00 100.00 2 0.00       0.00
5          NA      0.00                      0.00 150.00 2 0.00       0.00
6          NA      0.00                      1.50 177.00 2 0.00       0.00
7          NA      0.00                      1.50  33.00 2 0.00       0.00
8          NA      0.00                      0.00  33.00 1 0.00       0.00
9          NA      0.00                      3.30 100.00 3 0.00       1.00
10         NA      0.00                      2.30 100.00 3 0.00       0.00
   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
1     15.00   25.00          K    0.00                                    
2      0.00    2.50          K    0.00                                    
3      2.00   25.00          K    0.00                                    
4      2.00    2.50          K    0.00                                    
5      2.00    2.50          K    0.00                                    
6      6.00    2.50          K    0.00                                    
7      1.00    2.50          K    0.00                                    
8      0.00    2.50          K    0.00                                    
9     14.00   25.00          K    0.00                                    
10     0.00   25.00          K    0.00                                    
   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
1   3040.00   8812.00    3051.00    8806.00           1.00
2   3042.00   8755.00       0.00       0.00           2.00
3   3340.00   8742.00       0.00       0.00           3.00
4   3458.00   8626.00       0.00       0.00           4.00
5   3412.00   8642.00       0.00       0.00           5.00
6   3450.00   8748.00       0.00       0.00           6.00
7   3405.00   8631.00       0.00       0.00           7.00
8   3255.00   8558.00       0.00       0.00           8.00
9   3334.00   8740.00    3336.00    8738.00           9.00
10  3336.00   8738.00    3337.00    8737.00          10.00

Convert Data

Convert Fatalities and Injury Fields to Numeric

# Coerce the casualty counts, which were read in as character strings, to
# numeric so they can be summed.
stormdata[c("FATALITIES", "INJURIES")] <- lapply(
    stormdata[c("FATALITIES", "INJURIES")],
    as.numeric
)

Show First 10 Rows of Data.

head(stormdata$FATALITIES, 10)
 [1] 0 0 0 0 0 0 0 0 1 0
# [1] 0 0 0 0 0 0 0 0 1 0
head(stormdata$INJURIES, 10)
 [1] 15  0  2  2  2  6  1  0 14  0
# [1] 15 0 2 2 2 6 1 0 14 0

Convert Event Type to Factor

# Upper-case the event type labels so differently-cased spellings collapse,
# keep the sorted distinct labels in `eventtype`, then store the column as a
# factor.
stormdata[["EVTYPE"]] <- toupper(stormdata[["EVTYPE"]])
eventtype <- sort(unique(stormdata[["EVTYPE"]]))
stormdata[["EVTYPE"]] <- factor(stormdata[["EVTYPE"]])
eventtype[1:10]
 [1] "   HIGH SURF ADVISORY" " COASTAL FLOOD"       
 [3] " FLASH FLOOD"          " LIGHTNING"           
 [5] " TSTM WIND"            " TSTM WIND (G45)"     
 [7] " WATERSPOUT"           " WIND"                
 [9] "?"                     "ABNORMAL WARMTH"      
# Note there are 898 event types

Consolidate Fatalities and Injuries for Graphs

# Total injuries per event type, largest first; keep the 20 largest.
injuries <- aggregate(
    stormdata[["INJURIES"]],
    by = stormdata["EVTYPE"],
    FUN = sum
)
injuries <- injuries[order(injuries[["x"]], decreasing = TRUE), ]
top20injuries <- injuries[seq_len(20), ]

# Total fatalities per event type, largest first; keep the 20 largest.
fatalities <- aggregate(
    stormdata[["FATALITIES"]],
    by = stormdata["EVTYPE"],
    FUN = sum
)
fatalities <- fatalities[order(fatalities[["x"]], decreasing = TRUE), ]
top20fatalities <- fatalities[seq_len(20), ]

Results

The graphs indicate that the type of event most harmful with respect to population health is the tornado.

# Horizontal bar chart of the 20 event types with the most injuries.
# reorder() sorts the bars by injury count; mapping the bare factor would draw
# them in alphabetical level order, which hides the ranking.
ggplot(top20injuries, aes(x = reorder(EVTYPE, x), y = x)) +
    geom_bar(stat = "identity", fill = "red") +
    xlab("Event Type") +
    ylab("Number of Injuries") +
    ggtitle("Top 20 Injuries by Event Type") +
    coord_flip() +
    theme(legend.position = "none")

plot of chunk injuries graph

# Horizontal bar chart of the 20 event types with the most fatalities.
# reorder() sorts the bars by fatality count; mapping the bare factor would
# draw them in alphabetical level order, which hides the ranking.
ggplot(top20fatalities, aes(x = reorder(EVTYPE, x), y = x)) +
    geom_bar(stat = "identity", fill = "blue") +
    xlab("Event Type") +
    ylab("Number of Fatalities") +
    ggtitle("Top 20 Fatalities by Event Type") +
    coord_flip() +
    theme(legend.position = "none")

plot of chunk fatalities graph

=========================================================================================================================

Question 2: Across the United States, which types of events have the greatest economic consequences?

Data Variables

unique(stormdata$PROPDMGEXP)
 [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
[18] "1" "8"
# [1] 'K' 'M' '' 'B' 'm' '+' '0' '5' '6' '?' '4' '2' '3' 'h' '7' 'H' '-' '1'
# '8'
unique(stormdata$CROPDMGEXP)
[1] ""  "M" "K" "m" "B" "?" "0" "k" "2"
# [1] '' 'M' 'K' 'm' 'B' '?' '0' 'k' '2'

Convert Data

Convert PROPDMGEXP and CROPDMGEXP to All Upper Case Letters

# Upper-case both damage exponent columns so that codes such as "m" and "M"
# are treated as the same value.
stormdata[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
    stormdata[c("PROPDMGEXP", "CROPDMGEXP")],
    toupper
)
unique(c(stormdata$PROPDMGEXP, stormdata$CROPDMGEXP))
 [1] "K" "M" ""  "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
# [1] 'K' 'M' '' 'B' '+' '0' '5' '6' '?' '4' '2' '3' 'H' '7' '-' '1' '8'

Convert PROPDMGEXP and CROPDMGEXP to Numeric Using Exponential Multiplier

# Keep only the columns needed for the damage analysis.
stormdatasub <- stormdata[c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

# Blank and symbol exponent codes are replaced by "0" (i.e. a multiplier of
# 10^0 once the exponents are applied).
stormdatasub$PROPDMGEXP[stormdatasub$PROPDMGEXP %in% c("", "+", "-", "?")] <- "0"
stormdatasub$CROPDMGEXP[stormdatasub$CROPDMGEXP %in% c("", "+", "-", "?")] <- "0"
unique(c(stormdatasub$PROPDMGEXP, stormdatasub$CROPDMGEXP))
 [1] "K" "M" "0" "B" "5" "6" "4" "2" "3" "H" "7" "1" "8"
# [1] 'K' 'M' '0' 'B' '5' '6' '4' '2' '3' 'H' '7' '1' '8'
# Replace the letter exponent codes by their power of ten
# (B -> 9, M -> 6, K -> 3, H -> 2) in both exponent columns. The columns stay
# character at this point; digit codes are left untouched.
stormdatasub[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
    stormdatasub[c("PROPDMGEXP", "CROPDMGEXP")],
    function(codes) {
        letter_exponent <- c(B = "9", M = "6", K = "3", H = "2")
        is_letter <- codes %in% names(letter_exponent)
        codes[is_letter] <- unname(letter_exponent[codes[is_letter]])
        codes
    }
)
unique(c(stormdatasub$PROPDMGEXP, stormdatasub$CROPDMGEXP))
 [1] "3" "6" "0" "9" "5" "4" "2" "7" "1" "8"
# [1] '3' '6' '0' '9' '5' '4' '2' '7' '1' '8'

Combine Exponent and Value

# Turn the exponent codes into numeric multipliers (10^exponent).
stormdatasub[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
    stormdatasub[c("PROPDMGEXP", "CROPDMGEXP")],
    function(exponent) 10^as.numeric(exponent)
)

# Convert the damage amounts to numeric, treating unparseable values as zero.
stormdatasub[c("PROPDMG", "CROPDMG")] <- lapply(
    stormdatasub[c("PROPDMG", "CROPDMG")],
    function(amount) {
        amount <- as.numeric(amount)
        amount[is.na(amount)] <- 0
        amount
    }
)

Calculate the Total Storm Damage

# Total damage per record: property plus crop damage, each scaled by its
# multiplier.
stormdatasub$TOTALDMG <- with(
    stormdatasub,
    PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP
)

# Sum the total damage per event type and rank from largest to smallest.
damagetype <- aggregate(
    stormdatasub[["TOTALDMG"]],
    by = stormdatasub["EVTYPE"],
    FUN = sum
)
damagetype <- damagetype[order(damagetype[["x"]], decreasing = TRUE), ]
head(damagetype, 10)
               EVTYPE         x
154             FLOOD 1.503e+11
372 HURRICANE/TYPHOON 7.191e+10
758           TORNADO 5.736e+10
599       STORM SURGE 4.332e+10
212              HAIL 1.876e+10
138       FLASH FLOOD 1.824e+10
84            DROUGHT 1.502e+10
363         HURRICANE 1.461e+10
529       RIVER FLOOD 1.015e+10
387         ICE STORM 8.967e+09
# EVTYPE x 154 FLOOD 150319678257 372 HURRICANE/TYPHOON 71913712800 758
# TORNADO 57362333946 599 STORM SURGE 43323541000 212 HAIL 18761221986 138
# FLASH FLOOD 18243991078 84 DROUGHT 15018672000 363 HURRICANE 14610229010
# 529 RIVER FLOOD 10148404500 387 ICE STORM 8967041360

Results

The graph indicates that the type of event with the greatest economic consequences is the flood.

# Bar chart of the 20 event types with the largest total damage.
damagesub <- damagetype[1:20, ]

# reorder(EVTYPE, -x) sorts the bars from largest to smallest damage; mapping
# the bare factor would draw them in alphabetical level order, which hides the
# ranking.
ggplot(damagesub, aes(x = reorder(EVTYPE, -x), y = x)) +
    geom_bar(stat = "identity", fill = "orange") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    xlab("Event Type") +
    ylab("Damage in Dollars") +
    ggtitle("Top 20 Greatest Economical Consequences by Event Type") +
    theme(legend.position = "none")

plot of chunk damage graph