Analysis of NOAA Storm Database

This document aims to assist in the decision making for the preparation for severe weather events and the prioritization of resources for different types of events. In a nutshell we are looking to answer these questions:

Across the United States, which types of events are most harmful with respect to population health? Across the United States, which types of events have the greatest economic consequences? This analysis is based on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

Data Processing

Load and read the database:

setwd('D:\\Coursera\\Data_Science_Specialization\\05 - Reproducible Research\\Course Project 2')
zname = "StormData.csv.bz2"
url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# download data 
if(!file.exists(zname))
{
    download.file(url, zname)
}
# read the data
if(!exists('dd'))
{
    dd = read.csv(file=zname)
}

Show the dataset structure:

# show the data structure 

str(dd)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Data Processing for Major causes of Fatalities and Injuries

Extracting the count of FATALITIES and INJURIES for each type of event, and find the total count depending on the type of event. Extract from the resulting table the data for some most harmful event type and figure out the most harmful event types.

 # the most harmful data is the count of FATALITIES and INJURIES

# extract the data for FATALITIES and INJURIES into new dataframe

harmful = (dd[, c("EVTYPE", "FATALITIES", "INJURIES")])

# show the summary of the harmful data

summary(harmful)
##                EVTYPE         FATALITIES          INJURIES        
##  HAIL             :288661   Min.   :  0.0000   Min.   :   0.0000  
##  TSTM WIND        :219940   1st Qu.:  0.0000   1st Qu.:   0.0000  
##  THUNDERSTORM WIND: 82563   Median :  0.0000   Median :   0.0000  
##  TORNADO          : 60652   Mean   :  0.0168   Mean   :   0.1557  
##  FLASH FLOOD      : 54277   3rd Qu.:  0.0000   3rd Qu.:   0.0000  
##  FLOOD            : 25326   Max.   :583.0000   Max.   :1700.0000  
##  (Other)          :170878
# extract the data with nonzero count of the FATALITIES and INJURIES
harmful = harmful[harmful$FATALITIES > 0 | harmful$INJURIES > 0, ]

# find the sum of FATALITIES and INJURIES grouped by the event type, 
# store the sum of FATALITIES and INJURIES in the new column HARMFUL_CNT, and arrange the resulting dataset 
# in descending order by the new HARMFUL_CNT variable

library(dplyr)

fatalityCnt <- dd %>% group_by(EVTYPE) %>%
    summarise_at(vars(FATALITIES, INJURIES), funs(sum(., na.rm=TRUE))) %>% 
    mutate(HARMFUL_CNT = FATALITIES + INJURIES, ) %>% 
    arrange(desc(HARMFUL_CNT))

Results

# set the count of the most harmful events to investigate
topHarmfulEventCnt = 5

# extract the top topHarmfulEventCnt events from the resultin fatalities dataframe

topHarmful = head(fatalityCnt, topHarmfulEventCnt)

# Now show the figure of the most harmful event types

par(mgp=c(5,1,0))
par(mar=c(15, 10, 5, 5))


barplot(topHarmful$HARMFUL_CNT, width=2, space=2, topHarmful$EVTYPE, cex.axis=1, cex.names=1,
        las=2,main=paste0("Top ", as.character(topHarmfulEventCnt), " Event type by Total of FATALITIES and INJURIES"), 
        ylab= 'Total of FATALITIES and INJURIES')

Data Processing for Major causes of Economical Damages caused by various event types.

For the economical consequences the relevant data columns are EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG. Create the table for these and show its summary:

 # for the economical consequences the relevant data columns are EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG
# create the datatable for economical consequences

ec <- dd[,c("EVTYPE","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")]

summary(ec)
##                EVTYPE          PROPDMG          PROPDMGEXP    
##  HAIL             :288661   Min.   :   0.00          :465934  
##  TSTM WIND        :219940   1st Qu.:   0.00   K      :424665  
##  THUNDERSTORM WIND: 82563   Median :   0.00   M      : 11330  
##  TORNADO          : 60652   Mean   :  12.06   0      :   216  
##  FLASH FLOOD      : 54277   3rd Qu.:   0.50   B      :    40  
##  FLOOD            : 25326   Max.   :5000.00   5      :    28  
##  (Other)          :170878                     (Other):    84  
##     CROPDMG          CROPDMGEXP    
##  Min.   :  0.000          :618413  
##  1st Qu.:  0.000   K      :281832  
##  Median :  0.000   M      :  1994  
##  Mean   :  1.527   k      :    21  
##  3rd Qu.:  0.000   0      :    19  
##  Max.   :990.000   B      :     9  
##                    (Other):     9
##Get the table for Major causes of Economical Damages grouped by various event types.

##```{r, cache=FALSE}
# function to transform the symbols like 'K', 'B', etc... to exponent

mult=function(x)
{
    m=0; 
    if(x=="") {m=1}
    else if(x=="k"| x=="K") {m=1000} 
    else if(x=="M" | x=="m") {m=1000000} 
    else if(x=="B") {m=1000000000} 
    else if(x=="0"){m=1}
    else if(x=="1"){m=10}
    else if(x=="2"){m=100}
    else if(x=="3"){m=1000}
    else if(x=="4"){m=10000}
    else if(x=="5"){m=100000}
    else if(x=="6"){m=1000000}
    else if(x=="7"){m=10000000}
    else if(x=="8"){m=100000000}
    m
}

# transform the data in the ec dataframe to get the damage costs

ec$PROPDMG_MULT <- sapply(ec$PROPDMGEXP, mult)
ec$CROPDMG_MULT <- sapply(ec$CROPDMGEXP, mult)

# get the total economical damage costs arranged by the total economical damage costs:

ecTot <- ec %>% mutate(COST_TOTAL = PROPDMG_MULT * PROPDMG + CROPDMG_MULT * CROPDMG) %>% 
    group_by(EVTYPE) %>%
    summarise_at(vars(COST_TOTAL), funs(sum(., na.rm=TRUE))) %>% 
    arrange(desc(COST_TOTAL))

Show the summary of the total economical damage costs

# show summary of the total economical damage costs
summary(ecTot)
##                    EVTYPE      COST_TOTAL       
##     HIGH SURF ADVISORY:  1   Min.   :0.000e+00  
##   COASTAL FLOOD       :  1   1st Qu.:0.000e+00  
##   FLASH FLOOD         :  1   Median :0.000e+00  
##   LIGHTNING           :  1   Mean   :4.846e+08  
##   TSTM WIND           :  1   3rd Qu.:8.500e+04  
##   TSTM WIND (G45)     :  1   Max.   :1.503e+11  
##  (Other)              :979

Results:

  # Now show the figure of the most economical damaging event types

cntToShow = 10

topEcTot = head(ecTot, cntToShow)
par(mgp=c(5,1,0))
par(mar=c(15, 10, 5, 5))


barplot(topEcTot$COST_TOTAL, width=2, space=2, topEcTot$EVTYPE, cex.axis=1, cex.names=1,
        las=2,main=paste0("Top ", as.character(cntToShow), " Event type by total economical damage costs"), 
        ylab= 'Total economical damage costs')