The goal of this report is to explore the NOAA Storm Database and identify the most harmful weather events affecting the United States. It is intended to help a government or municipal manager who is responsible for preparing for severe weather and needs to prioritize resources across different types of events. The storm dataset is analyzed to identify the event types with the largest impact on population health and the largest economic consequences. Population health impact is measured by fatalities and injuries; economic impact is measured by damage to property and crops.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Remote location of the compressed NOAA storm data and its local file name
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
localFile <- "repdata-data-StormData.csv.bz2"
# Fetch the archive only when no local copy exists yet
if (!file.exists(localFile)) {
  download.file(url, destfile = localFile)
}
# Read the CSV through a bzip2 connection (strings stay character),
# then release the connection
bz <- bzfile(localFile, open = "r")
data <- read.csv(bz, stringsAsFactors = FALSE)
close(bz)
# Keep only the columns needed for the analysis.
# PROPDMGEXP and CROPDMGEXP must be retained: they carry the magnitude code
# for PROPDMG / CROPDMG. Without them data$PROPDMGEXP is NULL, every mask
# below is empty, and all exponents silently stay 0.
cols <- c("BGN_DATE", "EVTYPE", "INJURIES", "FATALITIES",
          "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
data <- data[, cols]
# Keep only events that caused at least some harm or damage
data <- subset(
  data,
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
)
# Create new column to have year of event
data$YEAR <- as.integer(
  format(as.Date(data$BGN_DATE, "%m/%d/%Y 0:00:00"), "%Y")
)
# Convert event type to upper case to make it consistent
data$EVTYPE <- toupper(data$EVTYPE)

# Translate a vector of damage-exponent codes into powers of ten.
#   code: character vector of raw *DMGEXP values.
# Returns a numeric vector of the same length:
#   H -> 2, K -> 3, M -> 6, B -> 9 (case-insensitive),
#   a single digit -> that digit (treated as the power of ten, which is
#   what the original numeric-conversion step appears to have intended),
#   anything else (blank, "+", "-", "?", NA) -> 0, i.e. no scaling.
expToPower <- function(code) {
  code <- toupper(trimws(as.character(code)))
  power <- rep(0, length(code))
  letterPowers <- c(H = 2, K = 3, M = 6, B = 9)
  # %in% never yields NA, so NA codes cannot break the sub-assignment
  isLetter <- code %in% names(letterPowers)
  power[isLetter] <- unname(letterPowers[code[isLetter]])
  isDigit <- grepl("^[0-9]$", code)
  power[isDigit] <- as.numeric(code[isDigit])
  power
}

# Scale property and crop damage by their exponents and combine them
# into a single economic damage figure
data$nPROPDMGEXP <- expToPower(data$PROPDMGEXP)
data$totalPROPDMG <- data$PROPDMG * 10^data$nPROPDMGEXP
data$nCROPDMGEXP <- expToPower(data$CROPDMGEXP)
data$totalCROPDMG <- data$CROPDMG * 10^data$nCROPDMGEXP
data$ECONOMYDMG <- data$totalPROPDMG + data$totalCROPDMG
# Population damage is the number of people killed or injured
data$POPDMG <- data$FATALITIES + data$INJURIES
# check dataset
# NOTE: the console output recorded in this report was produced before the
# exponent columns were retained (all exponents were 0); re-knit the report
# to refresh the printed tables, plots and conclusion.
head(data)
## BGN_DATE EVTYPE INJURIES FATALITIES PROPDMG CROPDMG YEAR
## 1 4/18/1950 0:00:00 TORNADO 15 0 25.0 0 1950
## 2 4/18/1950 0:00:00 TORNADO 0 0 2.5 0 1950
## 3 2/20/1951 0:00:00 TORNADO 2 0 25.0 0 1951
## 4 6/8/1951 0:00:00 TORNADO 2 0 2.5 0 1951
## 5 11/15/1951 0:00:00 TORNADO 2 0 2.5 0 1951
## 6 11/15/1951 0:00:00 TORNADO 6 0 2.5 0 1951
## nPROPDMGEXP totalPROPDMG nCROPDMGEXP totalCROPDMG ECONOMYDMG POPDMG
## 1 0 25.0 0 0 25.0 15
## 2 0 2.5 0 0 2.5 0
## 3 0 25.0 0 0 25.0 2
## 4 0 2.5 0 0 2.5 2
## 5 0 2.5 0 0 2.5 2
## 6 0 2.5 0 0 2.5 6
# Sum casualties (fatalities + injuries) per event type; the named lists
# give the result its EVENT / POPDMG column names directly
populationDMG <- aggregate(
  list(POPDMG = data$POPDMG),
  by = list(EVENT = data$EVTYPE),
  FUN = sum
)
# Rank event types from most to least harmful and keep the first ten rows
totalPOPDMG <- arrange(populationDMG, desc(POPDMG))[1:10, ]
# Freeze the ranking order as factor levels so plots keep this ordering
totalPOPDMG[["EVENT"]] <- factor(
  totalPOPDMG[["EVENT"]],
  levels = totalPOPDMG[["EVENT"]]
)
# Show the ten most harmful event types
totalPOPDMG
## EVENT POPDMG
## 1 TORNADO 96979
## 2 EXCESSIVE HEAT 8428
## 3 TSTM WIND 7461
## 4 FLOOD 7259
## 5 LIGHTNING 6046
## 6 HEAT 3037
## 7 FLASH FLOOD 2755
## 8 ICE STORM 2064
## 9 THUNDERSTORM WIND 1621
## 10 WINTER STORM 1527
# Bar chart of casualties for the ten most harmful event types
g <- ggplot(totalPOPDMG, aes(x = EVENT, y = POPDMG))
g +
  geom_bar(stat = "identity", fill = "red") +
  # Rotate the event labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Event Type",
    y = "Fatalities + Injuries",
    title = "Total Damage to population by each event type"
  )
# Sum economic damage (property + crop) per event type; the named lists
# give the result its EVENT / ECONOMYDMG column names directly
propertyDMG <- aggregate(
  list(ECONOMYDMG = data$ECONOMYDMG),
  by = list(EVENT = data$EVTYPE),
  FUN = sum
)
# Rank event types from most to least costly and keep the first ten rows
totalECONOMYDMG <- arrange(propertyDMG, desc(ECONOMYDMG))[1:10, ]
# Freeze the ranking order as factor levels so plots keep this ordering
totalECONOMYDMG[["EVENT"]] <- factor(
  totalECONOMYDMG[["EVENT"]],
  levels = totalECONOMYDMG[["EVENT"]]
)
# Show the ten most costly event types
totalECONOMYDMG
## EVENT ECONOMYDMG
## 1 TORNADO 3312276.7
## 2 FLASH FLOOD 1599325.1
## 3 TSTM WIND 1445198.2
## 4 HAIL 1268289.7
## 5 FLOOD 1067976.4
## 6 THUNDERSTORM WIND 943635.6
## 7 LIGHTNING 606932.4
## 8 THUNDERSTORM WINDS 464978.1
## 9 HIGH WIND 342014.8
## 10 WINTER STORM 134699.6
# Bar chart of economic damage for the ten most costly event types
g <- ggplot(totalECONOMYDMG, aes(x = EVENT, y = ECONOMYDMG))
g +
  geom_bar(stat = "identity", fill = "red") +
  # Rotate the event labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Event Type",
    y = "Crop and Property Damage",
    title = "Total Economic Damage by each event type"
  )
According to the totals above, tornadoes are the most harmful event type, causing the greatest damage to both population health and the economy.