Reproducible research_Coursera R course

Week 4, Project 2

Instructions can be found here:

https://www.coursera.org/learn/reproducible-research/peer/OMZ37/course-project-2

About the project

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

Aim of the exploratory data analysis

Question 1

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Question 2

Across the United States, which types of events have the greatest economic consequences?

Report Structure

The report contains two major sections: 1. Data Processing 2. Results

Loading the data

# Switch to the project folder and read the raw NOAA storm data.
# Empty fields in the CSV are read in as NA.
# NOTE(review): the absolute path below only exists on the author's machine;
# adjust it (or run from the project folder) before re-running the report.
setwd("C:/Users/Xing/Documents/Coursera_R/ReproResearch_Project2")
data <- read.csv(
  file = "repdata_data_StormData.csv",
  header = TRUE,
  na.strings = ""
)

Section 1

Data Processing

# Keep only the event type and the two population-health columns
data1 <- data[, c("EVTYPE", "FATALITIES", "INJURIES")]

# Total fatalities per event type
data1_fatal <- setNames(
  aggregate(data1$FATALITIES, by = list(data1$EVTYPE), FUN = sum),
  c("EVTYPE", "FATALITIES")
)

# Total injuries per event type
data1_inj <- setNames(
  aggregate(data1$INJURIES, by = list(data1$EVTYPE), FUN = sum),
  c("EVTYPE", "INJURIES")
)
# Build one table per damage type (property, crop) holding the event type,
# the damage amount and its exponent code.
# Rows with NA in any of those columns are dropped, as are rows whose damage
# amount is 0 or whose exponent code is one of "+", "-", "?".
data2_prop <- data[, c("EVTYPE", "PROPDMG", "PROPDMGEXP")]
data2_prop <- data2_prop[complete.cases(data2_prop), ]
data2_prop <- data2_prop[
  data2_prop$PROPDMG != 0 &
    !(data2_prop$PROPDMGEXP %in% c("+", "-", "?")),
]

data2_crop <- data[, c("EVTYPE", "CROPDMG", "CROPDMGEXP")]
data2_crop <- data2_crop[complete.cases(data2_crop), ]
data2_crop <- data2_crop[
  data2_crop$CROPDMG != 0 &
    !(data2_crop$CROPDMGEXP %in% c("+", "-", "?")),
]

# Convert the exponent codes in PROPDMGEXP/CROPDMGEXP into the numeric
# multipliers that PROPDMG/CROPDMG have to be scaled by:
#   0   = 1e+00
#   1   = 1e+01
#   2   = 1e+02
#   ... (a digit d means 10^d, up to 9 = 1e+09)
#   h/H = 1e+02
#   k/K = 1e+03
#   m/M = 1e+06
#   b/B = 1e+09

# Translate a vector of exponent codes into numeric multipliers.
#
# exp_code: character (or factor) vector of single-character codes.
# Returns an unnamed numeric vector of the same length. Letters are matched
# case-insensitively; any code that is not in the table above (including NA)
# yields NA.
#
# A single explicit lookup table replaces the earlier chain of gsub() calls
# plus a 9:0 loop. That version depended on numeric replacements being
# coerced to strings such as "1e+06", and on the loop order to avoid
# converting an already-converted value a second time.
exp_to_multiplier <- function(exp_code) {
  multipliers <- c(
    setNames(10^(0:9), as.character(0:9)),
    H = 1e+02,
    K = 1e+03,
    M = 1e+06,
    B = 1e+09
  )
  unname(multipliers[toupper(as.character(exp_code))])
}

# prop data set
data2_prop$PROPDMGEXP <- exp_to_multiplier(data2_prop$PROPDMGEXP)

# crop data set
data2_crop$CROPDMGEXP <- exp_to_multiplier(data2_crop$CROPDMGEXP)

# Damage cost = reported amount (PROPDMG/CROPDMG) times its multiplier
# (PROPDMGEXP/CROPDMGEXP), stored in a new `cost` column.
# plyr is still attached at this point, exactly as before.
library(plyr)
data2_prop$cost <- data2_prop$PROPDMG * data2_prop$PROPDMGEXP
data2_crop$cost <- data2_crop$CROPDMG * data2_crop$CROPDMGEXP

# Collapse each damage table to one row per EVTYPE holding the summed cost
data2_prop <- setNames(
  aggregate(data2_prop$cost, by = list(data2_prop$EVTYPE), FUN = sum),
  c("EVTYPE", "Cost")
)
data2_crop <- setNames(
  aggregate(data2_crop$cost, by = list(data2_crop$EVTYPE), FUN = sum),
  c("EVTYPE", "Cost")
)

# Order both damage tables from the most to the least costly event type
data2_prop <- data2_prop[order(data2_prop$Cost, decreasing = TRUE), ]
data2_crop <- data2_crop[order(data2_crop$Cost, decreasing = TRUE), ]

Section 2

Results

# Order the health tables from the highest to the lowest count
data1_fatal <- data1_fatal[order(data1_fatal$FATALITIES, decreasing = TRUE), ]
data1_inj <- data1_inj[order(data1_inj$INJURIES, decreasing = TRUE), ]

# Table of the five event types with the most fatalities
kable(head(data1_fatal, n = 5))
EVTYPE FATALITIES
834 TORNADO 5633
130 EXCESSIVE HEAT 1903
153 FLASH FLOOD 978
275 HEAT 937
464 LIGHTNING 816
# Table of the five event types with the most injuries
kable(head(data1_inj, n = 5))
EVTYPE INJURIES
834 TORNADO 91346
856 TSTM WIND 6957
170 FLOOD 6789
130 EXCESSIVE HEAT 6525
464 LIGHTNING 5230
# Bar charts of the first five rows of each health table
p1_fatal <- ggplot(
  data = data1_fatal[seq_len(5), ],
  mapping = aes(x = EVTYPE, y = FATALITIES)
) +
  geom_col()

p1_inj <- ggplot(
  data = data1_inj[seq_len(5), ],
  mapping = aes(x = EVTYPE, y = INJURIES)
) +
  geom_col()

# Stack the two charts: fatalities on top, injuries below
grid.arrange(p1_fatal, p1_inj, nrow = 2)

# Order both damage tables by descending cost
data2_prop <- data2_prop[order(data2_prop$Cost, decreasing = TRUE), ]
data2_crop <- data2_crop[order(data2_crop$Cost, decreasing = TRUE), ]

# Table of the five event types with the highest property damage
kable(head(data2_prop, n = 5))
EVTYPE Cost
62 FLOOD 144657709800
179 HURRICANE/TYPHOON 69305840000
331 TORNADO 56947380614
279 STORM SURGE 43323536000
50 FLASH FLOOD 16822673772
# Bar chart of the first five rows of the property-damage table
p2_prop <- ggplot(
  data = data2_prop[seq_len(5), ],
  mapping = aes(x = EVTYPE, y = Cost)
) +
  geom_col()

# Table of the five event types with the highest crop damage
kable(head(data2_crop, n = 5))
EVTYPE Cost
10 DROUGHT 13972566000
27 FLOOD 5661968450
78 RIVER FLOOD 5029459000
72 ICE STORM 5022113500
42 HAIL 3025954470
# Bar chart of the first five rows of the crop-damage table
p2_crop <- ggplot(
  data = data2_crop[seq_len(5), ],
  mapping = aes(x = EVTYPE, y = Cost)
) +
  geom_col()

# Stack the two charts: property damage on top, crop damage below
grid.arrange(p2_prop, p2_crop, nrow = 2)