Exploratory data analysis using the NOAA Storm Database

Reproducible research_Coursera R course

Week 4, Project 2

Instructions can be found here:

https://www.coursera.org/learn/reproducible-research/peer/OMZ37/course-project-2

About the project

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

Aim of the exploratory data analysis

Question 1

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Question 2

Across the United States, which types of events have the greatest economic consequences?

Report Structure

The report contains two major sections: (1) Data Processing and (2) Results.

Loading the data

# Load the raw storm data set.
# The data directory is joined to the file name with file.path() instead of
# calling setwd(), so reading the file no longer changes the session's working
# directory. To run the report on another machine, edit storm_dir only.
storm_dir <- "C:/Users/Xing/Documents/Coursera_R/ReproResearch_Project2"
storm_csv <- file.path(storm_dir, "repdata_data_StormData.csv")

# Fail early with a clear message if the file is not where it is expected
if (!file.exists(storm_csv)) {
  stop("Storm data file not found: ", storm_csv, call. = FALSE)
}

# Empty fields are read as NA
data <- read.csv(storm_csv, header = TRUE, na.strings = "")

Section 1

Data Processing

# Keep only the event type and the two population-health columns
data1 <- data[, c("EVTYPE", "FATALITIES", "INJURIES")]

# Total fatalities per event type; naming the grouping list and passing a
# one-column data frame makes aggregate() label the result columns directly
data1_fatal <- aggregate(
  data1["FATALITIES"],
  by = list(EVTYPE = data1$EVTYPE),
  FUN = sum
)

# Total injuries per event type
data1_inj <- aggregate(
  data1["INJURIES"],
  by = list(EVTYPE = data1$EVTYPE),
  FUN = sum
)

# Build the property- and crop-damage tables (EVTYPE, amount, exponent code).
# Rows containing any NA are dropped first; after that, rows are kept only when
# the damage amount is non-zero and the exponent code is not "+", "-" or "?".
bad_exp <- c("+", "-", "?")

data2_prop <- data[, c("EVTYPE", "PROPDMG", "PROPDMGEXP")]
data2_prop <- data2_prop[complete.cases(data2_prop), ]
keep_prop <- data2_prop$PROPDMG != 0 & !(data2_prop$PROPDMGEXP %in% bad_exp)
data2_prop <- data2_prop[keep_prop, ]

data2_crop <- data[, c("EVTYPE", "CROPDMG", "CROPDMGEXP")]
data2_crop <- data2_crop[complete.cases(data2_crop), ]
keep_crop <- data2_crop$CROPDMG != 0 & !(data2_crop$CROPDMGEXP %in% bad_exp)
data2_crop <- data2_crop[keep_crop, ]

# Convert the exponent codes in PROPDMGEXP/CROPDMGEXP into numeric multipliers
# so they can be multiplied with PROPDMG/CROPDMG:
#   digit d (0-9) = 10^d   (0 = 1e+00, 1 = 1e+01, 2 = 1e+02, etc)
#   h/H = 1e+02
#   k/K = 1e+03
#   m/M = 1e+06
#   b/B = 1e+09
#
# A single named lookup vector handles both columns. This avoids substituting
# numbers into strings and parsing them back, and it does not depend on the
# order in which the codes are processed.
#
# code: vector of exponent codes (character or factor)
# returns: numeric vector of multipliers, NA for any code not listed above
exp_to_multiplier <- function(code) {
  lookup <- c(
    setNames(10^(0:9), 0:9),
    h = 1e+02, k = 1e+03, m = 1e+06, b = 1e+09
  )
  # tolower() makes the letter codes case-insensitive; unname() drops the
  # lookup names so a plain numeric column is stored
  unname(lookup[tolower(as.character(code))])
}

# prop data set
data2_prop$PROPDMGEXP <- exp_to_multiplier(data2_prop$PROPDMGEXP)

# crop data set
data2_crop$CROPDMGEXP <- exp_to_multiplier(data2_crop$CROPDMGEXP)

# plyr is still attached here so the set of loaded packages is unchanged
library(plyr)

# Cost of each record = damage amount x numeric multiplier
data2_prop$cost <- data2_prop$PROPDMG * data2_prop$PROPDMGEXP
data2_crop$cost <- data2_crop$CROPDMG * data2_crop$CROPDMGEXP

# Total cost per event type, with the result columns named EVTYPE and Cost
data2_prop <- setNames(
  aggregate(data2_prop["cost"], by = list(data2_prop$EVTYPE), FUN = sum),
  c("EVTYPE", "Cost")
)
data2_crop <- setNames(
  aggregate(data2_crop["cost"], by = list(data2_crop$EVTYPE), FUN = sum),
  c("EVTYPE", "Cost")
)

# Order both tables from the most to the least costly event type
data2_prop <- data2_prop[with(data2_prop, order(-Cost)), ]
data2_crop <- data2_crop[with(data2_crop, order(-Cost)), ]

Section 2

Results

# Sort both health tables from the highest to the lowest total
data1_fatal <- data1_fatal[order(-data1_fatal$FATALITIES), ]
data1_inj <- data1_inj[order(-data1_inj$INJURIES), ]

# Show the top 5 causes of fatalities.
# kable() is called through its namespace so this chunk works whether or not
# knitr has been attached with library().
knitr::kable(head(data1_fatal, 5))

	EVTYPE	FATALITIES
834	TORNADO	5633
130	EXCESSIVE HEAT	1903
153	FLASH FLOOD	978
275	HEAT	937
464	LIGHTNING	816

# Show the top 5 causes of injuries.
# kable() is called through its namespace so this chunk works whether or not
# knitr has been attached with library().
knitr::kable(head(data1_inj, 5))

	EVTYPE	INJURIES
834	TORNADO	91346
856	TSTM WIND	6957
170	FLOOD	6789
130	EXCESSIVE HEAT	6525
464	LIGHTNING	5230

# Bar charts of the five event types with the most fatalities and injuries.
# - head() is used instead of [1:5, ] so a table with fewer than five rows does
#   not produce NA rows.
# - reorder() draws the bars in descending order of the plotted total; a bare
#   EVTYPE axis would be arranged alphabetically and hide the ranking.
# - labs() restores the plain "EVTYPE" axis title that reorder() would replace.
# - ggplot2/gridExtra functions are namespace-qualified so the chunk does not
#   depend on those packages having been attached with library().
top_fatal <- head(data1_fatal, 5)
p1_fatal <- ggplot2::ggplot(
  top_fatal,
  ggplot2::aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)
) +
  ggplot2::geom_col() +
  ggplot2::labs(x = "EVTYPE")

top_inj <- head(data1_inj, 5)
p1_inj <- ggplot2::ggplot(
  top_inj,
  ggplot2::aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)
) +
  ggplot2::geom_col() +
  ggplot2::labs(x = "EVTYPE")

# Stack the two plots vertically
gridExtra::grid.arrange(p1_fatal, p1_inj, nrow = 2)

# Sort both damage tables from the highest to the lowest total cost
data2_prop <- data2_prop[order(-data2_prop$Cost), ]
data2_crop <- data2_crop[order(-data2_crop$Cost), ]

# Show the top 5 causes of property damage.
# kable() is called through its namespace so this chunk works whether or not
# knitr has been attached with library().
knitr::kable(head(data2_prop, 5))

	EVTYPE	Cost
62	FLOOD	144657709800
179	HURRICANE/TYPHOON	69305840000
331	TORNADO	56947380614
279	STORM SURGE	43323536000
50	FLASH FLOOD	16822673772

# Bar chart of the five costliest event types for property damage.
# head() avoids NA rows when the table has fewer than five rows; reorder()
# draws the bars in descending order of Cost instead of alphabetically, and
# labs() keeps the plain "EVTYPE" axis title. ggplot2 functions are
# namespace-qualified so the chunk does not rely on library(ggplot2).
top_prop <- head(data2_prop, 5)
p2_prop <- ggplot2::ggplot(
  top_prop,
  ggplot2::aes(x = reorder(EVTYPE, -Cost), y = Cost)
) +
  ggplot2::geom_col() +
  ggplot2::labs(x = "EVTYPE")

# Show the top 5 causes of crop damage.
# kable() is called through its namespace so this chunk works whether or not
# knitr has been attached with library().
knitr::kable(head(data2_crop, 5))

	EVTYPE	Cost
10	DROUGHT	13972566000
27	FLOOD	5661968450
78	RIVER FLOOD	5029459000
72	ICE STORM	5022113500
42	HAIL	3025954470

# Bar chart of the five costliest event types for crop damage.
# head() avoids NA rows when the table has fewer than five rows; reorder()
# draws the bars in descending order of Cost instead of alphabetically, and
# labs() keeps the plain "EVTYPE" axis title. ggplot2/gridExtra functions are
# namespace-qualified so the chunk does not rely on library() calls.
top_crop <- head(data2_crop, 5)
p2_crop <- ggplot2::ggplot(
  top_crop,
  ggplot2::aes(x = reorder(EVTYPE, -Cost), y = Cost)
) +
  ggplot2::geom_col() +
  ggplot2::labs(x = "EVTYPE")

# Stack the property and crop plots vertically
gridExtra::grid.arrange(p2_prop, p2_crop, nrow = 2)