Course Project 2 for Coursera’s Data Science Specialization by Johns Hopkins University. Given the provided dataset of storm data in the United States, the student was tasked with using the course materials and lessons to identify which event type(s) had the greatest impact on both population health and the economy. To do this, one must load the data, subset it, aggregate injuries and fatalities (health) by event type, and plot the results, then carry out a similar analysis for economic damage. The economic analysis proved more challenging, as the alphabetic magnitude codes (K, M, B) had to be converted to numeric multipliers before the damage figures could be summed. This lesson also taught how to publish to RPubs directly from RStudio.
Load libraries and get files and import to R:
library("ggplot2")
library("gridExtra")
# Fetch the compressed storm data set and read it into `stormData`.
# The archive is cached as tmp.bz2 in the working directory, so re-running the
# script does not download it again.
storm_archive <- "tmp.bz2"
if (!file.exists(storm_archive)) {
  # `destfile` is spelled out (no partial argument matching), the transfer
  # method is left to R's default instead of requiring an external curl
  # binary, and mode = "wb" makes sure the archive is written as binary.
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = storm_archive,
    mode = "wb"
  )
}
# read.csv() decompresses through the bzfile() connection while parsing.
stormData <- read.csv(
  bzfile(storm_archive),
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
Subset data:
# Required columns only
stormEvent <- stormData[, c(
  "BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

# healthEvent variable for first question.
# A record is kept when it reports fatalities OR injuries. Requiring both to be
# non-zero would discard every event that killed people without injuring
# anyone (and vice versa) and so undercount the per-event totals.
healthEvent <- subset(
  stormEvent,
  FATALITIES != 0 | INJURIES != 0,
  select = c(EVTYPE, FATALITIES, INJURIES)
)

# economicEvent variable for second question.
# Same reasoning: a record counts if it has property OR crop damage.
economicEvent <- subset(
  stormEvent,
  PROPDMG != 0 | CROPDMG != 0,
  select = c(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
)

# Total fatalities per event type
healthEvent_fatal <- aggregate(
  healthEvent$FATALITIES,
  by = list(healthEvent$EVTYPE),
  FUN = sum
)
colnames(healthEvent_fatal) <- c("EVENTTYPE", "FATALITIES")

# Total injuries per event type
healthEvent_nonfatal <- aggregate(
  healthEvent$INJURIES,
  by = list(healthEvent$EVTYPE),
  FUN = sum
)
colnames(healthEvent_nonfatal) <- c("EVENTTYPE", "INJURIES")

# Sort descending and keep the top 5 (the rest is not needed to answer the
# question). head() returns fewer rows instead of padding with NA rows when
# there are fewer than five event types.
healthEvent_fatal <- head(
  healthEvent_fatal[order(healthEvent_fatal$FATALITIES, decreasing = TRUE), ],
  5
)
healthEvent_nonfatal <- head(
  healthEvent_nonfatal[order(healthEvent_nonfatal$INJURIES, decreasing = TRUE), ],
  5
)
Now, we plot these:
# Plot fatalities.
# reorder() on the negated count puts the bars in descending order so the
# ranking reads left to right; each event type gets its own fill colour and
# the legend is hidden because the x axis already names the events.
fatalities_plot <- ggplot(
  healthEvent_fatal,
  aes(x = reorder(EVENTTYPE, -FATALITIES), y = FATALITIES, fill = EVENTTYPE)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Weather Events") +
  ylab("# of Fatalities") +
  ggtitle("Top 5 Fatal Weather Events") +
  # Tilt the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Plot injuries (same layout as the fatalities plot).
nonfatal_plot <- ggplot(
  healthEvent_nonfatal,
  aes(x = reorder(EVENTTYPE, -INJURIES), y = INJURIES, fill = EVENTTYPE)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Weather Events") +
  ylab("No. of Injuries") +
  ggtitle("Top 5 Injury Prone Weather Events") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Side by side. grid.arrange() draws the figure immediately; the arranged
# layout object it returns is kept in question1.
question1 <- grid.arrange(fatalities_plot, nonfatal_plot, ncol = 2)
# Convert the damage amounts to absolute numbers and total them per event type.
# PROPDMGEXP / CROPDMGEXP hold a magnitude code for the matching amount; only
# K, M and B (either case) are translated, as a named lookup vector.
damage_multiplier <- c(K = 1e+03, M = 1e+06, B = 1e+09)

# Look up the multiplier for every row; any other code (including blank)
# yields NA.
prop_multiplier <- unname(damage_multiplier[toupper(economicEvent$PROPDMGEXP)])
crop_multiplier <- unname(damage_multiplier[toupper(economicEvent$CROPDMGEXP)])

# A zero amount contributes nothing whatever its code says, so it must not
# cause the whole record (and its other, valid component) to be dropped.
prop_multiplier[economicEvent$PROPDMG %in% 0] <- 0
crop_multiplier[economicEvent$CROPDMG %in% 0] <- 0

economicEvent$TOTALDMG <- (economicEvent$CROPDMG * crop_multiplier) +
  (economicEvent$PROPDMG * prop_multiplier)

# Rows whose non-zero amount carries an unrecognised code are excluded, as
# their magnitude cannot be determined.
economicEvent <- economicEvent[!is.na(economicEvent$TOTALDMG), ]

# Aggregate damage by event type
economicEvent <- aggregate(
  economicEvent$TOTALDMG,
  by = list(economicEvent$EVTYPE),
  FUN = sum
)
colnames(economicEvent) <- c("EVTYPE", "TOTALDMG")

# Again, sort and only care about the top 5. head() does not pad with NA rows
# when fewer than five event types remain.
economicEvent <- head(
  economicEvent[order(economicEvent$TOTALDMG, decreasing = TRUE), ],
  5
)
Plot:
# Bar chart of the five event types with the largest total damage.
# reorder() on the negated total sorts the bars in descending order; the
# legend is hidden because the x axis already names the events.
question2 <- ggplot(
  economicEvent,
  aes(x = reorder(EVTYPE, -TOTALDMG), y = TOTALDMG, fill = EVTYPE)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Event Type") +
  ylab("Total Damage") +
  ggtitle("Top 5 Weather Events by Total Damage") +
  # Tilt the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Render the side-by-side health figure. question1 is the layout object
# returned by grid.arrange(); print() on it only writes a text summary
# ("TableGrob (1 x 2) ...") rather than the figure, so it is drawn with
# grid.draw() on a fresh page instead.
grid::grid.newpage()
grid::grid.draw(question1)
# Render the economic-damage figure.
print(question2)
Across the United States, which types of events are most harmful with respect to population health? Answer: Tornadoes are the most harmful events with respect to population health.
Across the United States, which types of events have the greatest economic consequences? Answer: Floods have the greatest economic consequences across the United States.