This project is aimed at exploring the NOAA Storm Database and attempting to answer questions regarding that data.
Q1: Across the United States, which types of events are most harmful with respect to population health?
Q2: Across the United States, which types of events have the greatest economic consequences?
# Download Data
# Remote archive and the local file name it is saved under.
stormDataUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
stormDataFile <- "stormData.csv.bz2"
# Only fetch the archive when it is not already on disk, so re-running the
# analysis does not repeat the download (delete the local file to force a
# fresh copy). mode = "wb" writes the compressed file as binary.
if (!file.exists(stormDataFile)) {
  download.file(stormDataUrl, stormDataFile, mode = "wb")
}
# Read Data
# bzfile() opens the bzip2 archive as a connection that read.csv() parses.
data <- read.csv(bzfile(stormDataFile))
# Fatalities and Injuries look like the only columns related to health. We will subset them.
evHealthData <- data[, c("EVTYPE", "FATALITIES", "INJURIES")]
str(evHealthData)
## 'data.frame': 902297 obs. of 3 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
# Exploratory
summary(evHealthData)
## EVTYPE FATALITIES INJURIES
## Length:902297 Min. : 0.0000 Min. : 0.0000
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000
## Mode :character Median : 0.0000 Median : 0.0000
## Mean : 0.0168 Mean : 0.1557
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :583.0000 Max. :1700.0000
# All distinct event types (kept for reference; not used by the steps below).
uniqueEvents <- unique(evHealthData$EVTYPE)
# Goal: find the event types that are dangerous in terms of BOTH fatalities
# and injuries, i.e. the intersection of the two "above average" sets.
# Event types with at least one record whose fatality count exceeds the
# overall mean fatality count.
aboveAvgFatalities <- unique(
  subset(evHealthData$EVTYPE,
         evHealthData$FATALITIES > mean(evHealthData$FATALITIES))
)
# Same again for injuries.
aboveAvgInjuries <- unique(
  subset(evHealthData$EVTYPE,
         evHealthData$INJURIES > mean(evHealthData$INJURIES))
)
# Event types present in both sets.
aboveAvgEvents <- intersect(aboveAvgFatalities, aboveAvgInjuries)
# Keep only the records belonging to those event types.
aboveAvgData <- subset(evHealthData, evHealthData$EVTYPE %in% aboveAvgEvents)
# The averages are no longer needed, so drop records with neither a fatality
# nor an injury.
filteredData <- aboveAvgData[aboveAvgData$FATALITIES != 0 | aboveAvgData$INJURIES != 0, ]
# Many records remain, so keep only those at or above the 99th percentile of
# BOTH measures. Both cut-offs are computed before filtering so that each
# refers to the same set of records.
fatalityCutoff <- quantile(filteredData$FATALITIES, probs = 0.99)
injuryCutoff <- quantile(filteredData$INJURIES, probs = 0.99)
filteredData <- filteredData[filteredData$FATALITIES >= fatalityCutoff &
                               filteredData$INJURIES >= injuryCutoff, ]
# Join the names with ", " because several event types contain spaces
# ("FLASH FLOOD", "EXCESSIVE HEAT"); a space-separated list is ambiguous.
cat("These are the events that are at the top 1% most harmful to health in regards to both indirect Fatalities and Injuries: ",
    paste(unique(filteredData$EVTYPE), collapse = ", "))
## These are the events that are at the top 1% most harmful to health in regards to both indirect Fatalities and Injuries: TORNADO, HEAT, FLASH FLOOD, FLOOD, EXCESSIVE HEAT, HURRICANE/TYPHOON, TSUNAMI
# For this, we will use property damage and crop damage: PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
#
# PROPDMG and CROPDMG are only comparable across records once they are scaled
# by the magnitude code stored in the matching *EXP column. Comparing the raw
# numbers would rank 5000 thousand above 100 billion.
#
# NOTE: pasted console output from earlier runs is omitted from this section
# because it showed the unscaled figures; re-run to regenerate it. The closing
# summary at the end of this file also quotes the old unscaled result
# (FLASH FLOOD, $5,950,000) and should be refreshed after re-running.

# Translate damage magnitude codes into numeric multipliers.
#   expCode: vector of codes (character or factor), e.g. "K", "m", "B", "".
#   Returns: numeric vector of the same length as expCode.
# "K", "M" and "B" (thousands, millions, billions) are matched
# case-insensitively. "H" is assumed to mean hundreds - TODO confirm.
# Every other value (blank, punctuation, digits, NA) is treated as
# "no scaling" (multiplier 1) - an assumption to revisit if those rows matter.
damageMultiplier <- function(expCode) {
  knownCodes <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  multiplier <- unname(knownCodes[toupper(trimws(as.character(expCode)))])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

evEconData <- data[, c("EVTYPE", "PROPDMG", "CROPDMG")]
# From here on PROPDMG and CROPDMG are in US dollars.
evEconData$PROPDMG <- evEconData$PROPDMG * damageMultiplier(data$PROPDMGEXP)
evEconData$CROPDMG <- evEconData$CROPDMG * damageMultiplier(data$CROPDMGEXP)
str(evEconData)
summary(evEconData)
# Largest single-record property and crop damage for each event type.
maxPropDmgPerEvent <- aggregate(PROPDMG ~ EVTYPE, data = evEconData, FUN = max)
maxCropDmgPerEvent <- aggregate(CROPDMG ~ EVTYPE, data = evEconData, FUN = max)
# Join on EVTYPE instead of cbind(): this does not depend on the two results
# having identical row order and avoids the duplicated EVTYPE column.
maxDmgPerEvent <- merge(maxCropDmgPerEvent, maxPropDmgPerEvent, by = "EVTYPE")
# Drop event types that never caused any recorded damage.
maxDmgPerEvent <- maxDmgPerEvent[maxDmgPerEvent$CROPDMG != 0 | maxDmgPerEvent$PROPDMG != 0, ]
# Combined figure: per-type maximum crop damage plus maximum property damage.
maxDmgPerEvent$totalDamage <- maxDmgPerEvent$CROPDMG + maxDmgPerEvent$PROPDMG
# Row(s) with the largest combined figure (more than one row only on a tie).
damageResult <- maxDmgPerEvent[maxDmgPerEvent$totalDamage == max(maxDmgPerEvent$totalDamage), ]
print(damageResult)
# The amount is computed from the data rather than typed in by hand, so the
# message cannot drift away from the result printed above.
cat("The combined maximum damage between crops and property were caused by ",
    paste(damageResult$EVTYPE, collapse = ", "),
    " with the total damage as",
    paste0("$", format(damageResult$totalDamage[1], big.mark = ",",
                       scientific = FALSE, trim = TRUE)))
These are the events that are at the top 1% most harmful to health in regards to both indirect Fatalities and Injuries: TORNADO, HEAT, FLASH FLOOD, FLOOD, EXCESSIVE HEAT, HURRICANE/TYPHOON, TSUNAMI. The combined maximum damage between crops and property was caused by FLASH FLOOD, with a total of 5,950 in raw PROPDMG + CROPDMG units (that is, $5,950,000 only if both figures are recorded in thousands of dollars).