Synopsis

The storm data used in this analysis is from the link https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2. More information about the data can be found at https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf and https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf.

Data Processing

To load data into R, I used utils’ download.file() to download the compressed data from the internet, then unzip the downloaded .bz2 file with R.utils’ bunzip2() to achieve the uncompressed .csv file that is read into R with utils’ read.csv()

library(utils)
library(R.utils)
if (!file.exists("StormData.csv")) {
  download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "StormData.csv.bz2")
  bunzip2("StormData.csv.bz2", remove = FALSE)
}
StormData = read.csv("StormData.csv")

This block cleans up the data a little bit. Many columns unnecessary for the analysis are removed. StormData$BGN_DATE is converted to a date, which is used to extract the year with lubridate. With this, all data before 1996 is removed, as that was the point when data for all current environment types (StormData$EVTYPE) were being recorded. This is a crucial step to avoid a batch effect caused by time. Then, StormData$EVTYPE has to be cleaned up, as there are over 1000 unique values, though official types are only around 50. It begins by removing data with StormData$EVTYPE == “OTHERS”, as they cannot be attributed to any particular event. All events are set to capitilise, then leading and trailing whitespaces are removed, then trailing S’s are removed.

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
StormData = StormData[,c(2,7,8,23:28,36)]
StormData$BGN_DATE = mdy_hms(StormData$BGN_DATE)
StormData = StormData[year(StormData$BGN_DATE) >= 1996,]
StormData$EVTYPE = toupper(StormData$EVTYPE)
StormData = StormData[!grepl("^OTHER$",StormData$EVTYPE),]
StormData$EVTYPE = trimws(StormData$EVTYPE)
StormData$EVTYPE = sub("S$","",StormData$EVTYPE)

This block involves replacing as many EVTYPES with the official types as possible.

StormData$EVTYPE[grepl("FREEZE", StormData$EVTYPE)] = "FROST/FREEZE"
StormData$EVTYPE[grepl("FREEZING RAIN|FREEZING DRIZZLE", StormData$EVTYPE)] = "WINTER STORM"
StormData$EVTYPE[grepl("TSTM", StormData$EVTYPE)] = "MARINE THUNDERSTORM WIND"
StormData$EVTYPE[grepl("EXTREME", StormData$EVTYPE)] = "EXTREME COLD/WIND CHILL"
StormData$EVTYPE[grepl("ROUGH SURF", StormData$EVTYPE)] = "RIP CURRENT"
StormData$EVTYPE[grepl("URBAN/SML STREAM FLD", StormData$EVTYPE)] = "HEAVY RAIN"
StormData$EVTYPE[grepl("[F/f]reez",StormData$REMARKS[grepl("^FOG$",StormData$EVTYPE)])] = "FREEZING FOG"
StormData$EVTYPE[grepl("^FOG",StormData$EVTYPE)] = "DENSE FOG"
#River flooding may be included in the Flood category, page 37
StormData$EVTYPE[grepl("RIVER FLOOD",StormData$EVTYPE)] = "FLOOD"
StormData$EVTYPE[grepl("COASTAL FLOOD|CSTL FLOOD",StormData$EVTYPE)] = "COASTAL FLOOD"
StormData$EVTYPE[grepl("TIDAL FLOODING|STORM SURGE",StormData$EVTYPE)] = "STORM SURGE/TIDE"
StormData$EVTYPE[grepl("MARINE ACCIDENT",StormData$EVTYPE)] = "STORM SURGE/TIDE"
StormData$EVTYPE[grepl("RECORD HEAT",StormData$EVTYPE)] = "EXCESSIVE HEAT"
StormData$EVTYPE[grepl("HEAT WAVE",StormData$EVTYPE)] = "HEAT"
StormData$EVTYPE[grepl("GLAZE|ICE.*ROAD|BLACK ICE|SNOW.*ICE|FROST|WINTRY MIX",StormData$EVTYPE)] = "WINTER WEATHER"
StormData$EVTYPE[grepl("^GUSTY|^STRONG",StormData$EVTYPE)] = "STRONG WIND"
StormData$EVTYPE[grepl("WINTER WEATHER",StormData$EVTYPE)] = "WINTER WEATHER"
StormData$EVTYPE[grepl("HURRICANE|TYPHOON",StormData$EVTYPE)] = "HURRICANE(TYPHOON)"
StormData$EVTYPE[grepl("FIRE",StormData$EVTYPE)] = "WILDFIRE"
StormData$EVTYPE[grepl("WARM",StormData$EVTYPE)] = "HEAT"
StormData$EVTYPE[!grepl("EXTREME",StormData$EVTYPE) & grepl("COLD",StormData$EVTYPE)] = "COLD/WIND CHILL"
StormData$EVTYPE[grepl("RAIN",StormData$EVTYPE)] = "HEAVY RAIN"
StormData$EVTYPE[grepl("SEA",StormData$EVTYPE)] = "STORM SURGE/TIDE"
StormData$EVTYPE[grepl("SNOW",StormData$EVTYPE)] = "HEAVY SNOW"
StormData$EVTYPE[grepl("SURF",StormData$EVTYPE)] = "HIGH SURF"
StormData$EVTYPE[grepl("WHIRLWIND|LANDSPOUT",StormData$EVTYPE)] = "TORNADO"
StormData$EVTYPE[grepl("COAST.*STORM",StormData$EVTYPE)] = "TROPICAL STORM"
StormData$EVTYPE[grepl("SLIDE$|LANDSLUMP",StormData$EVTYPE)] = "DEBRIS FLOW"
StormData$EVTYPE[grepl("^THUNDERSTORM|BURST",StormData$EVTYPE)] = "THUNDERSTORM WIND"
StormData$EVTYPE[grepl("NON-SEVERE WIND DAMAGE|^WIND$|HIGH WIND",StormData$EVTYPE)] = "HIGH WIND"
StormData$EVTYPE[grepl("HAIL",StormData$EVTYPE)] = "HAIL"
StormData$EVTYPE[grepl("^HIGH WATER$|ASTRONOMICAL HIGH TIDE",StormData$EVTYPE)] = "HIGH SURF"
StormData$EVTYPE[grepl("COASTAL.*EROSION",StormData$EVTYPE)] = "COASTAL FLOOD"
StormData$EVTYPE[grepl("FLASH.*FLOOD",StormData$EVTYPE)] = "FLASH FLOOD"
StormData$EVTYPE[grepl("BLOWING DUST",StormData$EVTYPE)] = "DUST STORM"
StormData$EVTYPE[grepl("LAKESHORE FLOOD",StormData$EVTYPE)] = "FLOOD"
StormData$EVTYPE[grepl("GRADIENT WIND|WIND DAMAGE",StormData$EVTYPE)] = "STRONG WIND"
StormData$EVTYPE[grepl("ICE JAM FLOOD",StormData$EVTYPE)] = "FLOOD"
StormData$EVTYPE[grepl("ICY ROAD",StormData$EVTYPE)] = "WINTER WEATHER"

1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

This block first trims off a few columns that are unrelated to population health. Then, it removes data points that are irrelevant to population health i.e. 0 deaths and 0 injuries. Then, the sums of fatalities and injuries by environment types are evaluated.

PopHealthData = StormData[-c(6:9)]
PopHealthData = PopHealthData[!(PopHealthData$FATALITIES == 0 & PopHealthData$INJURIES == 0),]

SumFatalitiesByEV = tapply(PopHealthData$FATALITIES,PopHealthData$EVTYPE,sum)
head(sort(SumFatalitiesByEV,decreasing = TRUE),12)
##           EXCESSIVE HEAT                  TORNADO              FLASH FLOOD 
##                     1688                     1430                      861 
##                LIGHTNING              RIP CURRENT             FREEZING FOG 
##                      613                      523                      443 
##                    FLOOD MARINE THUNDERSTORM WIND  EXTREME COLD/WIND CHILL 
##                      405                      253                      248 
##                HIGH WIND                     HEAT                AVALANCHE 
##                      238                      225                      217
SumInjuriesByEV = tapply(PopHealthData$INJURIES,PopHealthData$EVTYPE,sum)
head(sort(SumInjuriesByEV,decreasing = TRUE),12)
##                  TORNADO                    FLOOD           EXCESSIVE HEAT 
##                    19871                     5900                     5746 
##                LIGHTNING             FREEZING FOG MARINE THUNDERSTORM WIND 
##                     3930                     3644                     3584 
##              FLASH FLOOD                 WILDFIRE        THUNDERSTORM WIND 
##                     1592                     1379                     1328 
##                     HEAT       HURRICANE(TYPHOON)             WINTER STORM 
##                     1308                     1214                     1118

As can be seen from the results, there is a significant difference in the distribution of the affected population between deaths and injuries.

par(mar = c(8,4,4,0.5))
barplot(SumInjuriesByEV,col=rgb(0,0,1,0.25), las = 2 ,cex.names = 0.5,ylim = c(0,25000),)
barplot(SumFatalitiesByEV,col=rgb(1,0,0,0.25), las = 2 ,cex.names = 0.5,add = TRUE)
legend("topright",legend = c("Sums of Injuries","Sums of Fatalities"),col = c(rgb(0,0,1,0.25),rgb(1,0,0,0.25)),pch = 15)
title(main = "Overlapped Graphs of Sums of Injuries and Fatalities by Event Type")

It can also be noted that fatalities were always significantly lower than injuries. In an attempt to coerce these two metrics together into one, a factor of 3 is arbitrarily chosen to be multiplied by fatalities before being added to injuries,creating the metric PopHealthImpact

PopHealthImpact = sort(3*SumFatalitiesByEV+SumInjuriesByEV,decreasing = TRUE)
barplot(PopHealthImpact,las = 2,cex.names = 0.5)
title(main = "Weighted Population Health Impact by Environment Type")

As can be seen with this new metric, population health is most affected by the first 7 events before a steep drop off. Hence, the answer to the question can be given by:

print(PopHealthImpact[1:7])
##                  TORNADO           EXCESSIVE HEAT                    FLOOD 
##                    24161                    10810                     7115 
##                LIGHTNING             FREEZING FOG MARINE THUNDERSTORM WIND 
##                     5769                     4973                     4343 
##              FLASH FLOOD 
##                     4175

2. Across the United States, which types of events have the greatest economic consequences?

This block first trims off a few columns that are unrelated to economic damage. Then, it removes data points that are irrelevant to economic damage ie $PROPDMG == 0 and $CROPDMG == 0. Then, exponents are multiplied to their base values (for the respective economic damages) to give 2 vectors that are bound to the list as columns. Then, they are summed by EVTYPE

EconData = StormData[-c(4,5)]
EconData = EconData[!(EconData$PROPDMG == 0 & EconData$CROPDMG == 0),]
PROP = numeric(nrow(EconData))
for (i in 1:nrow(EconData)){
  if(EconData$PROPDMGEXP[i] == "B"){
    PROP[i] = EconData$PROPDMG[i] * 10**9
  }
  else if(EconData$PROPDMGEXP[i] == "M"){
    PROP[i] = EconData$PROPDMG[i] * 10**6
  }
  else if(EconData$PROPDMGEXP[i] == "K"){
    PROP[i] = EconData$PROPDMG[i] * 10**3
  }
  else{
    PROP[i] = EconData$PROPDMG[i]
  }
}
EconData = cbind(EconData,PROP)
SumPROPByEV = tapply(EconData$PROP,EconData$EVTYPE,sum)



CROP = numeric(nrow(EconData))
for (i in 1:nrow(EconData)){
  if(EconData$CROPDMGEXP[i] == "B"){
    CROP[i] = EconData$CROPDMG[i] * 10**9
  }
  else if(EconData$CROPDMGEXP[i] == "M"){
    CROP[i] = EconData$CROPDMG[i] * 10**6
  }
  else if(EconData$CROPDMGEXP[i] == "K"){
    CROP[i] = EconData$CROPDMG[i] * 10**3
  }
  else{
    CROP[i] = EconData$CROPDMG[i]
  }
}
EconData = cbind(EconData,CROP)
SumCROPByEV = tapply(EconData$CROP,EconData$EVTYPE,sum)

The graph below shows the results

invisible(library(reshape2))
## Warning: package 'reshape2' was built under R version 4.4.2
invisible(library(ggplot2))
## Warning: package 'ggplot2' was built under R version 4.4.2
df = data.frame(SumPROPByEV,SumCROPByEV,EVTYPE = rownames(SumPROPByEV))
df = melt(df,id.vars = "EVTYPE")
ggplot(df,aes(x=EVTYPE,y=value,fill = variable))+geom_bar(position = "stack", stat = "identity")

As can be seen from the graph, there are 3 very large values, and arguably one more, so our answer to the question will be the top 4 environment types. Hence, the answer can be given by:

sort(SumCROPByEV+SumPROPByEV,decreasing = TRUE)[1:4]
##              FLOOD HURRICANE(TYPHOON)       FREEZING FOG            TORNADO 
##       146503124250        66638949810        61469327970        24007107620

Results

Answering Question 1:

PopHealthImpact[1:7]
##                  TORNADO           EXCESSIVE HEAT                    FLOOD 
##                    24161                    10810                     7115 
##                LIGHTNING             FREEZING FOG MARINE THUNDERSTORM WIND 
##                     5769                     4973                     4343 
##              FLASH FLOOD 
##                     4175

Answering Question 2:

sort(SumCROPByEV+SumPROPByEV,decreasing = TRUE)[1:4]
##              FLOOD HURRICANE(TYPHOON)       FREEZING FOG            TORNADO 
##       146503124250        66638949810        61469327970        24007107620