Week 4, Project 2
https://www.coursera.org/learn/reproducible-research/peer/OMZ37/course-project-2
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
This report has two major sections: (1) Data Processing and (2) Results.
# Attach the packages whose functions are called unqualified further down
# (kable, ggplot/aes/geom_col, grid.arrange). Attaching a package that is
# already attached is harmless.
library(knitr)
library(ggplot2)
library(gridExtra)

# Locate the raw NOAA storm data without calling setwd(): a hard-coded,
# user-specific working directory makes the script abort on every other
# machine and silently changes session state. The original project folder is
# still used when it exists; otherwise the CSV is looked up in the current
# working directory.
project_dir <- "C:/Users/Xing/Documents/Coursera_R/ReproResearch_Project2"
storm_csv <- "repdata_data_StormData.csv"
if (dir.exists(project_dir)) {
  storm_csv <- file.path(project_dir, storm_csv)
}
if (!file.exists(storm_csv)) {
  stop("Storm data file not found: ", storm_csv, call. = FALSE)
}

# Empty fields are read as NA (na.strings = "").
data <- read.csv(storm_csv, header = TRUE, na.strings = "")
# Population-health subset: event type plus the two casualty counts
data1 <- data[, c("EVTYPE", "FATALITIES", "INJURIES")]

# Total fatalities per event type (columns: EVTYPE, FATALITIES)
data1_fatal <- aggregate(
  data1["FATALITIES"],
  by = list(EVTYPE = data1$EVTYPE),
  FUN = sum
)

# Total injuries per event type (columns: EVTYPE, INJURIES)
data1_inj <- aggregate(
  data1["INJURIES"],
  by = list(EVTYPE = data1$EVTYPE),
  FUN = sum
)
# Damage subsets: event type plus the damage figure and its exponent code.
# A row is dropped when any of its three fields is NA, when the damage figure
# is 0, or when the exponent code is one of "+", "-", "?".
unusable_exp <- c("+", "-", "?")

# Property damage
data2_prop <- data[, c("EVTYPE", "PROPDMG", "PROPDMGEXP")]
data2_prop <- data2_prop[complete.cases(data2_prop), ]
data2_prop <- data2_prop[
  data2_prop$PROPDMG != 0 & !(data2_prop$PROPDMGEXP %in% unusable_exp),
]

# Crop damage
data2_crop <- data[, c("EVTYPE", "CROPDMG", "CROPDMGEXP")]
data2_crop <- data2_crop[complete.cases(data2_crop), ]
data2_crop <- data2_crop[
  data2_crop$CROPDMG != 0 & !(data2_crop$CROPDMGEXP %in% unusable_exp),
]
# Convert the exponent codes in PROPDMGEXP/CROPDMGEXP into numeric multipliers
#   digit d (0-9) = 10^d   (0 = 1e+00, 1 = 1e+01, 2 = 1e+02, ...)
#   h/H = 1e+02
#   k/K = 1e+03
#   m/M = 1e+06
#   b/B = 1e+09
#
# exp_to_multiplier() maps each code as a whole value through a lookup table.
# The previous chain of substring gsub() calls relied on implicit
# number-to-string coercion and would turn a compound code (e.g. "Km") into a
# bogus number; such codes now become NA and are reported in a warning.
#
# exp_code: character (or factor) vector of exponent codes.
# Returns:  numeric vector of the same length; NA where the code is unknown.
exp_to_multiplier <- function(exp_code) {
  code <- trimws(as.character(exp_code))

  lookup <- c(
    setNames(10^(0:9), as.character(0:9)),
    h = 1e+02, H = 1e+02,
    k = 1e+03, K = 1e+03,
    m = 1e+06, M = 1e+06,
    b = 1e+09, B = 1e+09
  )
  multiplier <- unname(lookup[match(code, names(lookup))])

  unknown <- unique(code[is.na(multiplier) & !is.na(code)])
  if (length(unknown) > 0) {
    warning(
      "Unrecognised damage exponent code(s): ",
      paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
  multiplier
}

# prop data set
data2_prop$PROPDMGEXP <- exp_to_multiplier(data2_prop$PROPDMGEXP)
# crop data set
data2_crop$CROPDMGEXP <- exp_to_multiplier(data2_crop$CROPDMGEXP)
# mutate() below comes from plyr
library(plyr)

# Property damage: cost per record (damage figure x multiplier), summed per
# event type, then ranked from the most to the least costly event type
data2_prop <- mutate(data2_prop, cost = PROPDMG * PROPDMGEXP)
data2_prop <- setNames(
  aggregate(data2_prop["cost"], by = list(data2_prop$EVTYPE), FUN = sum),
  c("EVTYPE", "Cost")
)
data2_prop <- data2_prop[order(data2_prop$Cost, decreasing = TRUE), ]

# Crop damage: same three steps
data2_crop <- mutate(data2_crop, cost = CROPDMG * CROPDMGEXP)
data2_crop <- setNames(
  aggregate(data2_crop["cost"], by = list(data2_crop$EVTYPE), FUN = sum),
  c("EVTYPE", "Cost")
)
data2_crop <- data2_crop[order(data2_crop$Cost, decreasing = TRUE), ]
# Rank event types from the highest to the lowest casualty totals
data1_fatal <- data1_fatal[order(data1_fatal$FATALITIES, decreasing = TRUE), ]
data1_inj <- data1_inj[order(data1_inj$INJURIES, decreasing = TRUE), ]
# Table of the five event types with the most fatalities
kable(head(data1_fatal, n = 5L))
| | EVTYPE | FATALITIES |
---|---|---|
834 | TORNADO | 5633 |
130 | EXCESSIVE HEAT | 1903 |
153 | FLASH FLOOD | 978 |
275 | HEAT | 937 |
464 | LIGHTNING | 816 |
# Table of the five event types with the most injuries
kable(head(data1_inj, n = 5L))
| | EVTYPE | INJURIES |
---|---|---|
834 | TORNADO | 91346 |
856 | TSTM WIND | 6957 |
170 | FLOOD | 6789 |
130 | EXCESSIVE HEAT | 6525 |
464 | LIGHTNING | 5230 |
# Bar charts of the five event types with the most fatalities / injuries.
# reorder() pins the bars to descending totals; on its own ggplot lays out a
# discrete x axis alphabetically, which would discard the ranking of the
# sorted tables. head() replaces [1:5, ] so that a table with fewer than five
# rows does not produce all-NA rows. Axis labels are set explicitly because
# the reorder() expression would otherwise become the x-axis title.
p1_fatal <- ggplot(
  head(data1_fatal, 5),
  aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)
) +
  geom_col() +
  labs(x = "EVTYPE", y = "FATALITIES")

p1_inj <- ggplot(
  head(data1_inj, 5),
  aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)
) +
  geom_col() +
  labs(x = "EVTYPE", y = "INJURIES")

# Stack both charts in one figure: fatalities on top, injuries below
grid.arrange(p1_fatal, p1_inj, nrow = 2)
# Rank event types by total cost, most expensive first
data2_prop <- data2_prop[order(data2_prop$Cost, decreasing = TRUE), ]
data2_crop <- data2_crop[order(data2_crop$Cost, decreasing = TRUE), ]
# Table of the five event types with the highest property damage
kable(head(data2_prop, n = 5L))
| | EVTYPE | Cost |
---|---|---|
62 | FLOOD | 144657709800 |
179 | HURRICANE/TYPHOON | 69305840000 |
331 | TORNADO | 56947380614 |
279 | STORM SURGE | 43323536000 |
50 | FLASH FLOOD | 16822673772 |
# Bar chart of the five event types with the highest property damage.
# reorder() keeps the bars in descending-cost order (ggplot would otherwise
# sort a discrete x axis alphabetically); head() avoids the all-NA rows that
# [1:5, ] yields for tables with fewer than five rows. The x-axis label is set
# explicitly so the reorder() expression does not become the axis title.
p2_prop <- ggplot(
  head(data2_prop, 5),
  aes(x = reorder(EVTYPE, -Cost), y = Cost)
) +
  geom_col() +
  labs(x = "EVTYPE", y = "Cost")
# Table of the five event types with the highest crop damage
kable(head(data2_crop, n = 5L))
| | EVTYPE | Cost |
---|---|---|
10 | DROUGHT | 13972566000 |
27 | FLOOD | 5661968450 |
78 | RIVER FLOOD | 5029459000 |
72 | ICE STORM | 5022113500 |
42 | HAIL | 3025954470 |
# Bar chart of the five event types with the highest crop damage.
# reorder() keeps the bars in descending-cost order (ggplot would otherwise
# sort a discrete x axis alphabetically); head() avoids the all-NA rows that
# [1:5, ] yields for tables with fewer than five rows. The x-axis label is set
# explicitly so the reorder() expression does not become the axis title.
p2_crop <- ggplot(
  head(data2_crop, 5),
  aes(x = reorder(EVTYPE, -Cost), y = Cost)
) +
  geom_col() +
  labs(x = "EVTYPE", y = "Cost")

# Stack both charts in one figure: property damage on top, crop damage below
grid.arrange(p2_prop, p2_crop, nrow = 2)