In this report, we will be using the NOAA Storm database to analyze the most harmful natural weather events from 1950 to 2011.
Our goal is to answer two questions:
Answers:
Tornados are most harmful to population health
## Import required libraries
library ("dplyr")
library(ggplot2)
## Check if the file exists before downloading another copy
if(!file.exists("StormData.csv.bz2")){
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
destfile = "StormData.csv.bz2")
}
## Read in the data from the bz2 archive
data <- read.csv("StormData.csv.bz2", stringsAsFactors = FALSE,
na.strings = "", header = TRUE)
## Character to date conversion on BGN_DATE
data$date <- as.Date(data$BGN_DATE, format="%m/%d/%Y")
## Subset the dataset, keeping only the columns needed for the analysis
keep <- c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",
"PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
data <- data[keep]
## Remove events where no damages was recorded
data <- subset(data, FATALITIES>0 | INJURIES>0 | PROPDMG>0 | CROPDMG>0)
## Remove misc categories that don't fit the model
data$PROPDMGEXP[data$PROPDMGEXP == "1"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "2"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "3"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "4"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "5"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "6"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "7"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "8"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "+"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "0"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "?"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "-"] <- ""
data$CROPDMGEXP[data$CROPDMGEXP == "2"] <- ""
data$CROPDMGEXP[data$CROPDMGEXP == "?"] <- ""
data$CROPDMGEXP[data$CROPDMGEXP == "0"] <- ""
## Fix lower/uppercase issues
data$PROPDMGEXP <- gsub("m", "M", data$PROPDMGEXP)
data$PROPDMGEXP <- gsub("h", "H", data$PROPDMGEXP)
data$CROPDMGEXP <- gsub("m", "M", data$CROPDMGEXP)
data$CROPDMGEXP <- gsub("k", "K", data$CROPDMGEXP)
## Convert character to numeric values to assist damage calculations
data$PROPDMGEXP[data$PROPDMGEXP =="H"] <- as.numeric(100)
data$PROPDMGEXP[data$PROPDMGEXP =="K"] <- as.numeric(1000)
data$PROPDMGEXP[data$PROPDMGEXP =="M"] <- as.numeric(1000000)
data$PROPDMGEXP[data$PROPDMGEXP =="B"] <- as.numeric(1000000000)
data$CROPDMGEXP[data$CROPDMGEXP =="K"] <- as.numeric(1000)
data$CROPDMGEXP[data$CROPDMGEXP =="M"] <- as.numeric(1000000)
data$CROPDMGEXP[data$CROPDMGEXP =="B"] <- as.numeric(1000000000)
## Make the PROD/CROP DMGEXP fields numeric
data$PROPDMGEXP <- as.numeric(data$PROPDMGEXP)
data$CROPDMGEXP <- as.numeric(data$CROPDMGEXP)
## Crate a new dataset by summing fatalities + injuries and returning top 10
health <- data%>%group_by (EVTYPE)%>%summarise (EVNTS = sum (FATALITIES + INJURIES))%>%top_n (10)
## Selecting by EVNTS
## Order by event count
health <- health[order(-health$EVNTS),]
## Plot the results
ggplot (health, aes (EVTYPE, EVNTS)) +
geom_bar (stat="identity",fill="red") +
theme (axis.text.x = element_text(angle = 25, hjust = 1)) +
ggtitle ("Top 10 Most Dangerous Events") +
xlab ("Event Type") + ylab ("Casualties (Injuries + Fatalities)")
## Create two new datasets to distinguish crop damage and property damage
property <- subset(data, select = c("EVTYPE", "PROPDMG", "PROPDMGEXP"))
crops <- subset(data, select = c("EVTYPE", "CROPDMG", "CROPDMGEXP"))
## Calculate total damage by multiplying damage with multiplier
property$TOTALDMG <- property$PROPDMG * property$PROPDMGEXP
crops$TOTALDMG <- crops$CROPDMG * crops$CROPDMGEXP
## Remove all NAs from both datasets
property <- na.omit(property)
crops <- na.omit(crops)
## Create aggregate datasets for each type to graph
property_agg <- aggregate(property$TOTALDMG ~ property$EVTYPE, data = property, sum)
crops_agg <- aggregate(crops$TOTALDMG ~ crops$EVTYPE, data = crops, sum)
## Order by TOTALDMG
property_agg <- property_agg[order(-property_agg$`property$TOTALDMG`),]
crops_agg <- crops_agg[order(-crops_agg$`crops$TOTALDMG`),]
## Take only top 10
property_agg <- subset(property_agg[1:10,])
crops_agg <- subset(crops_agg[1:10,])
## Rename columns since ggplot gets confused
colnames(property_agg) <- c("EVENT", "DMG")
colnames(crops_agg) <- c("EVENT", "DMG")
## Plot property data
ggplot(data = property_agg,
aes(x=EVENT, y=DMG, fill=EVENT)) +
geom_bar(stat = "identity") +
labs(title="Ten 10 Causes of Property Damage") +
labs(x = "Type of event", y = "Property Damage") +
theme(axis.text.x = element_text(size=0)) +
scale_fill_discrete(name = "Event Type")
## Plot crops data
ggplot(data = crops_agg,
aes(x=EVENT, y=DMG, fill=EVENT)) +
geom_bar(stat = "identity") +
labs(title="Top 10 Causes of Crop Damage") +
labs(x = "Event Type", y = "Crop Damage") +
theme(axis.text.x = element_text(size=0)) +
scale_fill_discrete(name = "Event Type")