The storm data used in this analysis is from the link https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2. More information about the data can be found at https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf and https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf.
To load data into R, I used utils’ download.file() to download the compressed data from the internet, then unzip the downloaded .bz2 file with R.utils’ bunzip2() to achieve the uncompressed .csv file that is read into R with utils’ read.csv()
library(utils)
library(R.utils)
if (!file.exists("StormData.csv")) {
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "StormData.csv.bz2")
bunzip2("StormData.csv.bz2", remove = FALSE)
}
StormData = read.csv("StormData.csv")
This block cleans up the data a little bit. Many columns unnecessary for the analysis are removed. StormData$BGN_DATE is converted to a date, which is used to extract the year with lubridate. With this, all data before 1996 is removed, as that was the point when data for all current environment types (StormData$EVTYPE) were being recorded. This is a crucial step to avoid a batch effect caused by time. Then, StormData$EVTYPE has to be cleaned up, as there are over 1000 unique values, though official types are only around 50. It begins by removing data with StormData$EVTYPE == “OTHERS”, as they cannot be attributed to any particular event. All events are set to capitilise, then leading and trailing whitespaces are removed, then trailing S’s are removed.
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
StormData = StormData[,c(2,7,8,23:28,36)]
StormData$BGN_DATE = mdy_hms(StormData$BGN_DATE)
StormData = StormData[year(StormData$BGN_DATE) >= 1996,]
StormData$EVTYPE = toupper(StormData$EVTYPE)
StormData = StormData[!grepl("^OTHER$",StormData$EVTYPE),]
StormData$EVTYPE = trimws(StormData$EVTYPE)
StormData$EVTYPE = sub("S$","",StormData$EVTYPE)
This block involves replacing as many EVTYPES with the official types as possible.
StormData$EVTYPE[grepl("FREEZE", StormData$EVTYPE)] = "FROST/FREEZE"
StormData$EVTYPE[grepl("FREEZING RAIN|FREEZING DRIZZLE", StormData$EVTYPE)] = "WINTER STORM"
StormData$EVTYPE[grepl("TSTM", StormData$EVTYPE)] = "MARINE THUNDERSTORM WIND"
StormData$EVTYPE[grepl("EXTREME", StormData$EVTYPE)] = "EXTREME COLD/WIND CHILL"
StormData$EVTYPE[grepl("ROUGH SURF", StormData$EVTYPE)] = "RIP CURRENT"
StormData$EVTYPE[grepl("URBAN/SML STREAM FLD", StormData$EVTYPE)] = "HEAVY RAIN"
StormData$EVTYPE[grepl("[F/f]reez",StormData$REMARKS[grepl("^FOG$",StormData$EVTYPE)])] = "FREEZING FOG"
StormData$EVTYPE[grepl("^FOG",StormData$EVTYPE)] = "DENSE FOG"
#River flooding may be included in the Flood category, page 37
StormData$EVTYPE[grepl("RIVER FLOOD",StormData$EVTYPE)] = "FLOOD"
StormData$EVTYPE[grepl("COASTAL FLOOD|CSTL FLOOD",StormData$EVTYPE)] = "COASTAL FLOOD"
StormData$EVTYPE[grepl("TIDAL FLOODING|STORM SURGE",StormData$EVTYPE)] = "STORM SURGE/TIDE"
StormData$EVTYPE[grepl("MARINE ACCIDENT",StormData$EVTYPE)] = "STORM SURGE/TIDE"
StormData$EVTYPE[grepl("RECORD HEAT",StormData$EVTYPE)] = "EXCESSIVE HEAT"
StormData$EVTYPE[grepl("HEAT WAVE",StormData$EVTYPE)] = "HEAT"
StormData$EVTYPE[grepl("GLAZE|ICE.*ROAD|BLACK ICE|SNOW.*ICE|FROST|WINTRY MIX",StormData$EVTYPE)] = "WINTER WEATHER"
StormData$EVTYPE[grepl("^GUSTY|^STRONG",StormData$EVTYPE)] = "STRONG WIND"
StormData$EVTYPE[grepl("WINTER WEATHER",StormData$EVTYPE)] = "WINTER WEATHER"
StormData$EVTYPE[grepl("HURRICANE|TYPHOON",StormData$EVTYPE)] = "HURRICANE(TYPHOON)"
StormData$EVTYPE[grepl("FIRE",StormData$EVTYPE)] = "WILDFIRE"
StormData$EVTYPE[grepl("WARM",StormData$EVTYPE)] = "HEAT"
StormData$EVTYPE[!grepl("EXTREME",StormData$EVTYPE) & grepl("COLD",StormData$EVTYPE)] = "COLD/WIND CHILL"
StormData$EVTYPE[grepl("RAIN",StormData$EVTYPE)] = "HEAVY RAIN"
StormData$EVTYPE[grepl("SEA",StormData$EVTYPE)] = "STORM SURGE/TIDE"
StormData$EVTYPE[grepl("SNOW",StormData$EVTYPE)] = "HEAVY SNOW"
StormData$EVTYPE[grepl("SURF",StormData$EVTYPE)] = "HIGH SURF"
StormData$EVTYPE[grepl("WHIRLWIND|LANDSPOUT",StormData$EVTYPE)] = "TORNADO"
StormData$EVTYPE[grepl("COAST.*STORM",StormData$EVTYPE)] = "TROPICAL STORM"
StormData$EVTYPE[grepl("SLIDE$|LANDSLUMP",StormData$EVTYPE)] = "DEBRIS FLOW"
StormData$EVTYPE[grepl("^THUNDERSTORM|BURST",StormData$EVTYPE)] = "THUNDERSTORM WIND"
StormData$EVTYPE[grepl("NON-SEVERE WIND DAMAGE|^WIND$|HIGH WIND",StormData$EVTYPE)] = "HIGH WIND"
StormData$EVTYPE[grepl("HAIL",StormData$EVTYPE)] = "HAIL"
StormData$EVTYPE[grepl("^HIGH WATER$|ASTRONOMICAL HIGH TIDE",StormData$EVTYPE)] = "HIGH SURF"
StormData$EVTYPE[grepl("COASTAL.*EROSION",StormData$EVTYPE)] = "COASTAL FLOOD"
StormData$EVTYPE[grepl("FLASH.*FLOOD",StormData$EVTYPE)] = "FLASH FLOOD"
StormData$EVTYPE[grepl("BLOWING DUST",StormData$EVTYPE)] = "DUST STORM"
StormData$EVTYPE[grepl("LAKESHORE FLOOD",StormData$EVTYPE)] = "FLOOD"
StormData$EVTYPE[grepl("GRADIENT WIND|WIND DAMAGE",StormData$EVTYPE)] = "STRONG WIND"
StormData$EVTYPE[grepl("ICE JAM FLOOD",StormData$EVTYPE)] = "FLOOD"
StormData$EVTYPE[grepl("ICY ROAD",StormData$EVTYPE)] = "WINTER WEATHER"
This block first trims off a few columns that are unrelated to population health. Then, it removes data points that are irrelevant to population health i.e. 0 deaths and 0 injuries. Then, the sums of fatalities and injuries by environment types are evaluated.
PopHealthData = StormData[-c(6:9)]
PopHealthData = PopHealthData[!(PopHealthData$FATALITIES == 0 & PopHealthData$INJURIES == 0),]
SumFatalitiesByEV = tapply(PopHealthData$FATALITIES,PopHealthData$EVTYPE,sum)
head(sort(SumFatalitiesByEV,decreasing = TRUE),12)
## EXCESSIVE HEAT TORNADO FLASH FLOOD
## 1688 1430 861
## LIGHTNING RIP CURRENT FREEZING FOG
## 613 523 443
## FLOOD MARINE THUNDERSTORM WIND EXTREME COLD/WIND CHILL
## 405 253 248
## HIGH WIND HEAT AVALANCHE
## 238 225 217
SumInjuriesByEV = tapply(PopHealthData$INJURIES,PopHealthData$EVTYPE,sum)
head(sort(SumInjuriesByEV,decreasing = TRUE),12)
## TORNADO FLOOD EXCESSIVE HEAT
## 19871 5900 5746
## LIGHTNING FREEZING FOG MARINE THUNDERSTORM WIND
## 3930 3644 3584
## FLASH FLOOD WILDFIRE THUNDERSTORM WIND
## 1592 1379 1328
## HEAT HURRICANE(TYPHOON) WINTER STORM
## 1308 1214 1118
As can be seen from the results, there is a significant difference in the distribution of the affected population between deaths and injuries.
par(mar = c(8,4,4,0.5))
barplot(SumInjuriesByEV,col=rgb(0,0,1,0.25), las = 2 ,cex.names = 0.5,ylim = c(0,25000),)
barplot(SumFatalitiesByEV,col=rgb(1,0,0,0.25), las = 2 ,cex.names = 0.5,add = TRUE)
legend("topright",legend = c("Sums of Injuries","Sums of Fatalities"),col = c(rgb(0,0,1,0.25),rgb(1,0,0,0.25)),pch = 15)
title(main = "Overlapped Graphs of Sums of Injuries and Fatalities by Event Type")
It can also be noted that fatalities were always significantly lower than injuries. In an attempt to coerce these two metrics together into one, a factor of 3 is arbitrarily chosen to be multiplied by fatalities before being added to injuries,creating the metric PopHealthImpact
PopHealthImpact = sort(3*SumFatalitiesByEV+SumInjuriesByEV,decreasing = TRUE)
barplot(PopHealthImpact,las = 2,cex.names = 0.5)
title(main = "Weighted Population Health Impact by Environment Type")
As can be seen with this new metric, population health is most affected by the first 7 events before a steep drop off. Hence, the answer to the question can be given by:
print(PopHealthImpact[1:7])
## TORNADO EXCESSIVE HEAT FLOOD
## 24161 10810 7115
## LIGHTNING FREEZING FOG MARINE THUNDERSTORM WIND
## 5769 4973 4343
## FLASH FLOOD
## 4175
This block first trims off a few columns that are unrelated to economic damage. Then, it removes data points that are irrelevant to economic damage ie $PROPDMG == 0 and $CROPDMG == 0. Then, exponents are multiplied to their base values (for the respective economic damages) to give 2 vectors that are bound to the list as columns. Then, they are summed by EVTYPE
EconData = StormData[-c(4,5)]
EconData = EconData[!(EconData$PROPDMG == 0 & EconData$CROPDMG == 0),]
PROP = numeric(nrow(EconData))
for (i in 1:nrow(EconData)){
if(EconData$PROPDMGEXP[i] == "B"){
PROP[i] = EconData$PROPDMG[i] * 10**9
}
else if(EconData$PROPDMGEXP[i] == "M"){
PROP[i] = EconData$PROPDMG[i] * 10**6
}
else if(EconData$PROPDMGEXP[i] == "K"){
PROP[i] = EconData$PROPDMG[i] * 10**3
}
else{
PROP[i] = EconData$PROPDMG[i]
}
}
EconData = cbind(EconData,PROP)
SumPROPByEV = tapply(EconData$PROP,EconData$EVTYPE,sum)
CROP = numeric(nrow(EconData))
for (i in 1:nrow(EconData)){
if(EconData$CROPDMGEXP[i] == "B"){
CROP[i] = EconData$CROPDMG[i] * 10**9
}
else if(EconData$CROPDMGEXP[i] == "M"){
CROP[i] = EconData$CROPDMG[i] * 10**6
}
else if(EconData$CROPDMGEXP[i] == "K"){
CROP[i] = EconData$CROPDMG[i] * 10**3
}
else{
CROP[i] = EconData$CROPDMG[i]
}
}
EconData = cbind(EconData,CROP)
SumCROPByEV = tapply(EconData$CROP,EconData$EVTYPE,sum)
The graph below shows the results
invisible(library(reshape2))
## Warning: package 'reshape2' was built under R version 4.4.2
invisible(library(ggplot2))
## Warning: package 'ggplot2' was built under R version 4.4.2
df = data.frame(SumPROPByEV,SumCROPByEV,EVTYPE = rownames(SumPROPByEV))
df = melt(df,id.vars = "EVTYPE")
ggplot(df,aes(x=EVTYPE,y=value,fill = variable))+geom_bar(position = "stack", stat = "identity")
As can be seen from the graph, there are 3 very large values, and arguably one more, so our answer to the question will be the top 4 environment types. Hence, the answer can be given by:
sort(SumCROPByEV+SumPROPByEV,decreasing = TRUE)[1:4]
## FLOOD HURRICANE(TYPHOON) FREEZING FOG TORNADO
## 146503124250 66638949810 61469327970 24007107620
Answering Question 1:
PopHealthImpact[1:7]
## TORNADO EXCESSIVE HEAT FLOOD
## 24161 10810 7115
## LIGHTNING FREEZING FOG MARINE THUNDERSTORM WIND
## 5769 4973 4343
## FLASH FLOOD
## 4175
Answering Question 2:
sort(SumCROPByEV+SumPROPByEV,decreasing = TRUE)[1:4]
## FLOOD HURRICANE(TYPHOON) FREEZING FOG TORNADO
## 146503124250 66638949810 61469327970 24007107620