Synopsis

The goal of this report is to explore the NOAA Storm Database and identify the most harmful weather events affecting the United States. It is intended to help a government or municipal manager who is responsible for preparing for severe weather events and needs to prioritize resources among different event types. The storm dataset is analyzed to identify the weather events that have the largest impact on population health and the largest economic consequences. Population health impact is measured by fatalities and injuries, and economic impact by damage to property and crops.

Load Library

library(ggplot2)        
## Warning: package 'ggplot2' was built under R version 3.2.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Data Processing

# Remote location of the compressed storm data and the local file name it is
# cached under
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "repdata-data-StormData.csv.bz2"

# Download the archive only when no local copy exists yet
if (!file.exists(storm_file)) {
  download.file(storm_url, storm_file)
}

# Read the CSV through a bzip2 connection into `data`, keeping text columns
# as character, then release the connection
storm_con <- bzfile(storm_file, open = "r")
data <- read.csv(storm_con, stringsAsFactors = FALSE)
close(storm_con)

Select data that are useful for analysis and format accordingly

# Keep only the columns the analysis needs. PROPDMGEXP and CROPDMGEXP hold the
# magnitude codes for PROPDMG and CROPDMG and must be retained: if they are
# dropped here, the exponent lookup below matches nothing and every damage
# figure is summed unscaled.
# NOTE: console output recorded elsewhere in this report was produced before
# these columns were kept and should be regenerated.
cols <- c("BGN_DATE", "EVTYPE", "INJURIES", "FATALITIES",
          "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
data <- data[, cols]

# Keep only events with at least one positive harm or damage value
data <- subset(data, FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)

# Create new column holding the year of the event
data$YEAR <- as.integer(format(as.Date(data$BGN_DATE, "%m/%d/%Y 0:00:00"), "%Y"))

# Upper-case the event type so that case variants are counted together
data$EVTYPE <- toupper(data$EVTYPE)

# Translate a vector of damage magnitude codes into powers of ten.
#   code: character vector of raw *DMGEXP values.
# Returns a numeric vector of the same length: H -> 2, K -> 3, M -> 6, B -> 9
# (case-insensitive), an all-digit code -> that number, anything else
# (blank, NA, "+", "-", "?") -> 0.
# NOTE(review): digit codes are treated as powers of ten, which is what the
# original (ineffective) numeric conversion appears to have intended --
# confirm against the NOAA documentation.
exp_to_power <- function(code) {
  code <- toupper(as.character(code))
  power <- rep(0, length(code))

  letter_powers <- c(H = 2, K = 3, M = 6, B = 9)
  is_letter <- code %in% names(letter_powers)
  power[is_letter] <- letter_powers[code[is_letter]]

  is_digit <- grepl("^[0-9]+$", code)
  power[is_digit] <- as.numeric(code[is_digit])

  power
}

# Scale property and crop damage by their magnitude codes
data$nPROPDMGEXP <- exp_to_power(data$PROPDMGEXP)
data$totalPROPDMG <- data$PROPDMG * 10^data$nPROPDMGEXP

data$nCROPDMGEXP <- exp_to_power(data$CROPDMGEXP)
data$totalCROPDMG <- data$CROPDMG * 10^data$nCROPDMGEXP

# Economic damage = property damage + crop damage
data$ECONOMYDMG <- data$totalPROPDMG + data$totalCROPDMG

# Population damage = fatalities + injuries
data$POPDMG <- data$FATALITIES + data$INJURIES

# check dataset
head(data)
##             BGN_DATE  EVTYPE INJURIES FATALITIES PROPDMG CROPDMG YEAR
## 1  4/18/1950 0:00:00 TORNADO       15          0    25.0       0 1950
## 2  4/18/1950 0:00:00 TORNADO        0          0     2.5       0 1950
## 3  2/20/1951 0:00:00 TORNADO        2          0    25.0       0 1951
## 4   6/8/1951 0:00:00 TORNADO        2          0     2.5       0 1951
## 5 11/15/1951 0:00:00 TORNADO        2          0     2.5       0 1951
## 6 11/15/1951 0:00:00 TORNADO        6          0     2.5       0 1951
##   nPROPDMGEXP totalPROPDMG nCROPDMGEXP totalCROPDMG ECONOMYDMG POPDMG
## 1           0         25.0           0            0       25.0     15
## 2           0          2.5           0            0        2.5      0
## 3           0         25.0           0            0       25.0      2
## 4           0          2.5           0            0        2.5      2
## 5           0          2.5           0            0        2.5      2
## 6           0          2.5           0            0        2.5      6

Results

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Aggregate the population damage by events

# Sum fatalities + injuries per event type; naming the grouping list and
# passing a one-column data frame yields the EVENT / POPDMG columns directly
populationDMG <- aggregate(data["POPDMG"], by = list(EVENT = data$EVTYPE), FUN = sum)

# Rank event types from most to least harmful and keep the first ten rows
totalPOPDMG <- arrange(populationDMG, desc(POPDMG))
totalPOPDMG <- totalPOPDMG[seq_len(10), ]

# Fix the factor levels in ranked order so the plot keeps this ordering
totalPOPDMG$EVENT <- factor(totalPOPDMG$EVENT, levels = totalPOPDMG$EVENT)

# Show the top 10 events
totalPOPDMG
##                EVENT POPDMG
## 1            TORNADO  96979
## 2     EXCESSIVE HEAT   8428
## 3          TSTM WIND   7461
## 4              FLOOD   7259
## 5          LIGHTNING   6046
## 6               HEAT   3037
## 7        FLASH FLOOD   2755
## 8          ICE STORM   2064
## 9  THUNDERSTORM WIND   1621
## 10      WINTER STORM   1527
# Bar chart of the ten event types with the highest fatalities + injuries
g <- ggplot(totalPOPDMG, aes(x = EVENT, y = POPDMG))
g +
  geom_bar(stat = "identity", fill = "red") +
  labs(
    x = "Event Type",
    y = "Fatalities + Injuries",
    title = "Total Damage to population by each event type"
  ) +
  # Rotate the event labels so long names do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Across the United States, which types of events have the greatest economic consequences?

# Sum the combined property + crop damage per event type; naming the grouping
# list and passing a one-column data frame yields EVENT / ECONOMYDMG directly
propertyDMG <- aggregate(data["ECONOMYDMG"], by = list(EVENT = data$EVTYPE), FUN = sum)

# Rank event types from most to least costly and keep the first ten rows
totalECONOMYDMG <- arrange(propertyDMG, desc(ECONOMYDMG))
totalECONOMYDMG <- totalECONOMYDMG[seq_len(10), ]

# Fix the factor levels in ranked order so the plot keeps this ordering
totalECONOMYDMG$EVENT <- factor(totalECONOMYDMG$EVENT, levels = totalECONOMYDMG$EVENT)

# Show the top 10 events
totalECONOMYDMG
##                 EVENT ECONOMYDMG
## 1             TORNADO  3312276.7
## 2         FLASH FLOOD  1599325.1
## 3           TSTM WIND  1445198.2
## 4                HAIL  1268289.7
## 5               FLOOD  1067976.4
## 6   THUNDERSTORM WIND   943635.6
## 7           LIGHTNING   606932.4
## 8  THUNDERSTORM WINDS   464978.1
## 9           HIGH WIND   342014.8
## 10       WINTER STORM   134699.6
# Bar chart of the ten event types with the highest crop + property damage
g <- ggplot(totalECONOMYDMG, aes(x = EVENT, y = ECONOMYDMG))
g +
  geom_bar(stat = "identity", fill = "red") +
  labs(
    x = "Event Type",
    y = "Crop and Property Damage",
    title = "Total Economic Damage by each event type"
  ) +
  # Rotate the event labels so long names do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Conclusion

Based on the totals shown above, tornadoes are the most harmful event type, causing the greatest damage to both population health and the economy.