by Chan Chee-Foong on 20 Apr 2016
This analysis involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
The data used in this analysis can be downloaded from here.
This analysis will address the following questions:
Q1) Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Q2) Across the United States, which types of events have the greatest economic consequences?
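The analysis relies on a handful of add-on packages. The setup chunk is not reproduced here, so the list below is inferred from the functions called throughout the report:
# Packages inferred from the function calls used below (assumed setup chunk)
library(R.utils)     # bunzip2()
library(dplyr)       # select(), filter(), group_by(), summarise(), joins
library(tidyr)       # complete()
library(reshape2)    # melt()
library(lubridate)   # mdy(), year(), month()
library(ggplot2)     # bar charts and heatmap
library(grid)        # unit() used in the theme settings
library(DT)          # datatable()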
Download the compressed data file if it is not already found in the data directory, and decompress it for analysis.
datadir <- "./stormdata"
datafile <- "repdata_data_StormData.csv"
zipfile <- "repdata_data_StormData.csv.bz2"
datadirfile <- paste(datadir, datafile, sep="/")
zipdirfile <- paste(datadir, zipfile, sep="/")
if (!file.exists(datadirfile)) {
    if (!file.exists(datadir)) {dir.create(datadir)}
    url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
    download.file(url, destfile = zipdirfile, mode = 'wb')
    # bunzip2() from the R.utils package writes the decompressed csv next to the
    # .bz2 file, i.e. to datadirfile
    bunzip2(filename = zipdirfile, remove = FALSE, skip = TRUE)
}
Load the csv file into the myStormData data frame.
myStormData <- read.csv(datadirfile, header = TRUE, na.strings = 'NA')
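read.csv is slow on a file of this size (just over 900,000 rows). As an optional alternative of my own, not used in the rest of this report, data.table::fread reads the decompressed csv considerably faster, though it returns character columns rather than factors, so it is not a drop-in replacement:
# Optional faster read (assumed alternative; kept commented out so read.csv remains the source of myStormData)
# library(data.table)
# myStormData <- as.data.frame(fread(datadirfile, na.strings = 'NA'))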
dim(myStormData)
## [1] 902297 37
str(myStormData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436774 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
The following fields will be used for the purpose of this analysis: EVTYPE, BGN_DATE, END_DATE, COUNTYNAME, STATE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP.
Observe that the base-10 exponent indicators (PROPDMGEXP and CROPDMGEXP) will need some cleaning up. In the summaries below, the large unlabelled first count corresponds to the empty-string level.
summary(myStormData$PROPDMGEXP)
##             -      ?      +      0      1      2      3      4      5
## 465934      1      8      5    216     25     13      4      4     28
##      6      7      8      B      h      H      K      m      M
##      4      5      1     40      1      6 424665      7  11330
summary(myStormData$CROPDMGEXP)
##             ?      0      2      B      k      K      m      M
## 618413      7     19      1      9     21 281832      1   1994
Subsetting the whole database to only the required fields, then preparing the data for analysis by creating new fields and setting all fields to the correct classes for ease of use.
myAnalysisData <- select(myStormData, EVTYPE, BGN_DATE, END_DATE, COUNTYNAME, STATE,
                         FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
Processing the base-10 exponent indicators: 'H'/'h' map to 10^2, 'K'/'k' to 10^3, 'M'/'m' to 10^6, 'B'/'b' to 10^9, the digits 0-9 map to the corresponding power of 10, and any other value (blank, '?', '+', '-') is treated as 10^0.
myAnalysisData$PROPDMGEXP <- as.character(myAnalysisData$PROPDMGEXP)
myAnalysisData$CROPDMGEXP <- as.character(myAnalysisData$CROPDMGEXP)
EXPCHAR <- data.frame(abbr = as.character(c('H','h','K','k','M','m','B','b')),
                      value = c(2,2,3,3,6,6,9,9), stringsAsFactors = FALSE)
EXPNUM <- data.frame(abbr = as.character(c(0:9)), value = c(0:9), stringsAsFactors = FALSE)
EXP <- rbind(EXPCHAR, EXPNUM)
myAnalysisData <- left_join(x=myAnalysisData, y=EXP, by = c('PROPDMGEXP' = 'abbr'))
colnames(myAnalysisData)[which(names(myAnalysisData) == 'value')] <- 'PROPDMGEXPCLEAN'
myAnalysisData <- left_join(x=myAnalysisData, y=EXP, by = c('CROPDMGEXP' = 'abbr'))
colnames(myAnalysisData)[which(names(myAnalysisData) == 'value')] <- 'CROPDMGEXPCLEAN'
myAnalysisData[is.na(myAnalysisData$PROPDMGEXPCLEAN),'PROPDMGEXPCLEAN'] <- 0
myAnalysisData[is.na(myAnalysisData$CROPDMGEXPCLEAN),'CROPDMGEXPCLEAN'] <- 0
myAnalysisData$PROPDMGEXPCLEAN <- 10^as.numeric(myAnalysisData$PROPDMGEXPCLEAN)
myAnalysisData$CROPDMGEXPCLEAN <- 10^as.numeric(myAnalysisData$CROPDMGEXPCLEAN)
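As a quick sanity check of the conversion, an amount of 2.5 recorded with exponent 'K' should resolve to 2,500, 1.5 with 'B' to 1,500,000,000, and a blank exponent should leave the amount unchanged. The helper and values below are purely illustrative and not part of the original analysis:
# Illustrative check of the exponent mapping (hypothetical helper, not in the original)
checkDmg <- function(dmg, exp) {
    e <- EXP$value[match(exp, EXP$abbr)]
    e[is.na(e)] <- 0                     # blank, '?', '+', '-' are treated as 10^0
    dmg * 10^e
}
checkDmg(c(2.5, 1.5, 25), c('K', 'B', ''))   # returns 2500, 1.5e+09, 25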
Calculate the property, crop and total damage values, and the total number of lives affected (fatalities + injuries).
myAnalysisData$PROPDMGVALUE <- (myAnalysisData$PROPDMG * myAnalysisData$PROPDMGEXPCLEAN)
myAnalysisData$CROPDMGVALUE <- (myAnalysisData$CROPDMG * myAnalysisData$CROPDMGEXPCLEAN)
myAnalysisData$TOTALDMGVALUE <- myAnalysisData$PROPDMGVALUE + myAnalysisData$CROPDMGVALUE
myAnalysisData$TOTALLIFEAFFECTED <- myAnalysisData$FATALITIES + myAnalysisData$INJURIES
Determine the Month and Year for each event recorded
# BGN_DATE has the form 'm/d/yyyy h:mm:ss'; keep only the date part before parsing with mdy()
myAnalysisData$DATE <- mdy(sapply(strsplit(as.character(myAnalysisData$BGN_DATE), ' '), function(x) {x[1]}))
myAnalysisData$YEAR <- as.factor(year(myAnalysisData$DATE))
myAnalysisData$MONTH <- as.factor(lubridate::month(myAnalysisData$DATE, label = TRUE, abbr = TRUE))
For this analysis, I chose to analyse data from 2005 up to 2011, the latest year available, which gives roughly the last seven years of data. Data from the earlier years were found to be less clean; for example, 'EVTYPE' contained invalid and inconsistent entries. Entries in the earlier years could also be biased, giving more emphasis to the events that happened to be reported or observed; only 'TORNADO', 'TSTM WIND' and 'HAIL' are recorded before 1990.
Choosing the most recent data is also more reflective of current environmental changes and of the technological advances that allow data to be captured more consistently. Events like 'Excessive Heat' and 'Heat' are more pronounced in the recent years.
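This can be checked directly. The sketch below is a diagnostic of my own rather than part of the original report; it counts the distinct event types recorded each year, and the early decades show far fewer distinct types than the recent years:
# Distinct EVTYPE values recorded per year (diagnostic sketch, not in the original)
myAnalysisData %>%
    group_by(YEAR) %>%
    summarise(nEventTypes = n_distinct(EVTYPE), nRecords = n())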
However, the analysis can be repeated by setting the start and end year in the code below.
## Set Start Year and End Year for Analysis
startYear <- 2005
endYear <- 2011
Grouping Data by Event Type
myDatabyGrp <- myAnalysisData %>%
    filter(as.numeric(as.character(YEAR)) >= startYear & as.numeric(as.character(YEAR)) <= endYear) %>%
    group_by(EVTYPE)
Calculating the fatalities and injuries by Event Type
mostLifeAffected <- myDatabyGrp %>% summarise(NumOfLifeAffected = sum(TOTALLIFEAFFECTED)) %>%
    arrange(desc(NumOfLifeAffected))
mostFatalities <- myDatabyGrp %>% summarise(NumOfFatalities = sum(FATALITIES)) %>%
    arrange(desc(NumOfFatalities))
mostInjuries <- myDatabyGrp %>% summarise(NumOfInjuries = sum(INJURIES)) %>%
    arrange(desc(NumOfInjuries))
mostLifeAffected <- full_join(mostLifeAffected, mostFatalities, by = 'EVTYPE')
mostLifeAffected <- full_join(mostLifeAffected, mostInjuries, by = 'EVTYPE')
datatable(mostLifeAffected,
          options = list(pageLength = 5),
          colnames = c('EVENTS', 'TOTAL', 'FATALITIES', 'INJURIES'),
          caption = 'Table 1: Top Harmful Events resulting in Fatalities and Injuries.')
Calculating total damage by value by Event Type
mostDMG <- myDatabyGrp %>% summarise(TotalDmgValue = sum(TOTALDMGVALUE)) %>%
    arrange(desc(TotalDmgValue))
mostPROPDMG <- myDatabyGrp %>% summarise(PropDmgValue = sum(PROPDMGVALUE)) %>%
    arrange(desc(PropDmgValue))
mostCROPDMG <- myDatabyGrp %>% summarise(CropDmgValue = sum(CROPDMGVALUE)) %>%
    arrange(desc(CropDmgValue))
mostDMG <- full_join(mostDMG, mostPROPDMG, by = 'EVTYPE')
mostDMG <- full_join(mostDMG, mostCROPDMG, by = 'EVTYPE')
datatable(mostDMG,
          options = list(pageLength = 5),
          colnames = c('EVENTS', 'TOTAL DAMAGE', 'PROPERTY DAMAGE', 'CROP DAMAGE'),
          caption = 'Table 2: Top Harmful Events resulting in Damage by Value.')
Set the number of top most harmful events for analysis
topNumEVENT <- 5
Q1) Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
A1) Tornado is the most harmful event, causing the most deaths and injuries in recent years. It is followed by excessive heat and lightning.
dataPlot1 <- arrange(mostLifeAffected, desc(NumOfLifeAffected))[c(1:topNumEVENT),]
dataPlot1 <- dataPlot1[,c(1,3,4)]
dataPlot1 <- melt(dataPlot1, id=c("EVTYPE"), measure.vars = c("NumOfFatalities", "NumOfInjuries"))
colnames(dataPlot1) <- c('EVENT.TYPE','TYPE','COUNT')
dataPlot1$TYPE <- as.character(dataPlot1$TYPE)
dataPlot1[dataPlot1$TYPE == 'NumOfFatalities',]$TYPE <- 'FATALITIES'
dataPlot1[dataPlot1$TYPE == 'NumOfInjuries',]$TYPE <- 'INJURIES'
g1 <- ggplot(data=dataPlot1, aes(x=reorder(EVENT.TYPE,COUNT), y=COUNT, fill=TYPE)) +
geom_bar(stat="identity") +
ylab('Number of Lives Affected (Fatalities + Injuries)') +
xlab('') +
labs(title = 'Top events resulting in the most fatalities and injuries') +
theme(legend.title = element_blank()) +
theme(legend.position = 'top') +
scale_fill_brewer(palette="Paired") +
coord_flip()
g1
Q2) Across the United States, which types of events have the greatest economic consequences?
A2) Flood causes the most damage by value, with a total of more than $100 billion in recent years. It is followed by hurricane/typhoon and storm surge.
dataPlot2 <- arrange(mostDMG, desc(TotalDmgValue))[c(1:topNumEVENT),]
dataPlot2 <- dataPlot2[,c(1,3,4)]
dataPlot2 <- melt(dataPlot2, id=c("EVTYPE"), measure.vars = c("PropDmgValue", "CropDmgValue"))
colnames(dataPlot2) <- c('EVENT.TYPE','TYPE','VALUE')
dataPlot2$TYPE <- as.character(dataPlot2$TYPE)
dataPlot2[dataPlot2$TYPE == 'PropDmgValue',]$TYPE <- 'PROPERTY DAMAGE'
dataPlot2[dataPlot2$TYPE == 'CropDmgValue',]$TYPE <- 'CROP DAMAGE'
g2 <- ggplot(data=dataPlot2, aes(x=reorder(EVENT.TYPE,VALUE), y=VALUE/10^9, fill=TYPE)) +
geom_bar(stat="identity") +
ylab('Damage Value (Billions)') +
xlab('') +
labs(title = 'Top events resulting in the most damage by value') +
theme(legend.title = element_blank()) +
theme(legend.position = 'top') +
scale_fill_brewer(palette="Paired") +
coord_flip()
g2
Since tornado is one of the most harmful events, this analysis also attempts to study the frequency of its occurrence by month in the top 20 states most prone to tornadoes.
topEVENT <- c('TORNADO')
numOfState <- 20
Grouping the data by event type, month and state, and counting the number of occurrences. The complete() call below fills in a zero count for any month/state combination with no recorded events, so that the heatmap has no missing tiles.
myDatabyGrp2 <- myAnalysisData %>% filter(EVTYPE %in% topEVENT) %>% group_by(EVTYPE, MONTH, STATE)
dataPlot3 <- myDatabyGrp2 %>% summarise(FREQUENCY = n()) %>% arrange(desc(FREQUENCY))
dataPlot4 <- dataPlot3 %>% ungroup()
dataPlot4$EVTYPE <- as.factor(as.character(dataPlot4$EVTYPE))
dataPlot4$STATE <- as.factor(as.character(dataPlot4$STATE))
dataPlot4 <- complete(dataPlot4, EVTYPE, MONTH, STATE, fill = list(FREQUENCY=0))
Determine the top risky states, i.e. those with the most frequent occurrences of the top harmful event.
myDatabyGrp3 <- myAnalysisData %>% filter(EVTYPE %in% topEVENT) %>% group_by(STATE)
riskyState <- myDatabyGrp3 %>% summarise(FREQUENCY = n()) %>% arrange(desc(FREQUENCY))
riskyState <- as.character(riskyState$STATE[1:numOfState])
Plotting the results in a heatmap, it is observed that tornadoes occur most frequently in Texas, Kansas and Oklahoma, in the months of April to June. For most states, tornadoes seem to occur most often in the late spring and summer months.
gg1 <- ggplot(filter(dataPlot4, STATE %in% riskyState),
              aes(x=reorder(STATE, desc(FREQUENCY)), y=MONTH, fill=FREQUENCY))
gg1 <- gg1 + geom_tile(color="white", size=0.1)
gg1 <- gg1 + scale_x_discrete(expand=c(0,0))
gg1 <- gg1 + scale_y_discrete(expand=c(0,0))
gg1 <- gg1 + scale_fill_gradient(low = "lightyellow", high = "red")
gg1 <- gg1 + coord_equal()
gg1 <- gg1 + labs(x='', y='', title='Number of Tornado Occurance by Month in Tornado-prone States')
gg1 <- gg1 + theme(axis.ticks=element_blank())
gg1 <- gg1 + theme(axis.text=element_text(size=8))
gg1 <- gg1 + theme(panel.border=element_blank())
gg1 <- gg1 + theme(plot.title=element_text(hjust=0.5, size=12))
gg1 <- gg1 + theme(panel.margin.x=unit(0.5, "cm"))
gg1 <- gg1 + theme(panel.margin.y=unit(0.5, "cm"))
gg1 <- gg1 + theme(legend.title=element_text(size=8))
gg1 <- gg1 + theme(legend.title.align=1)
gg1 <- gg1 + theme(legend.text=element_text(size=8))
gg1 <- gg1 + theme(legend.position="bottom")
gg1 <- gg1 + theme(legend.key.size=unit(0.2, "cm"))
gg1 <- gg1 + theme(legend.key.width=unit(1, "cm"))
gg1