library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyr)
library(readr)
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.27.1 (2025-05-02 21:00:05 UTC) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
##
## throw
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
## R.utils v2.13.0 (2025-02-24 21:20:02 UTC) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, isOpen, nullfile, parse, use, warnings
library(stringdist)
##
## Attaching package: 'stringdist'
## The following object is masked from 'package:R.utils':
##
## extract
## The following object is masked from 'package:tidyr':
##
## extract
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
fileurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- "StormData.csv.bz2"
# Download the file if it doesn't already exist
if (!file.exists(destfile)) {
download.file(url, destfile, mode = "wb")
}
# Save download date
download_date <- Sys.Date()
list.files() # check file
## [1] "Assign2.html"
## [2] "Assign2.md"
## [3] "Assign2.Rmd"
## [4] "Assign2_cache"
## [5] "Assign2_files"
## [6] "baggerly.pdf"
## [7] "column headers.docx"
## [8] "Commentaries.txt"
## [9] "EVT_names.csv"
## [10] "repdata_peer2_doc_NCDC Storm Events-FAQ Page.pdf"
## [11] "repdata_peer2_doc_pd01016005curr.pdf"
## [12] "RRCaseStudy.pdf"
## [13] "StormData.csv.bz2"
# reading the file as csv file and replacing blank cells (NULL) with NAs
# as operations are not feasible with NULL
storm <- read.csv("StormData.csv.bz2", na.strings = c("", "NA"))
# understand data
dim(storm) # check data download
## [1] 902297 37
#str(storm)
#names(storm)
#head(storm)
#unique(storm$EVTYPE) # turned off internally
# select the required columns
storm1 <- storm %>% select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
storm1$EVTYPE <- as.character(storm1$EVTYPE) # to make sure this is set as character to be able to match with official event list
head(storm1, n = 20) # to see some data
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0 <NA>
## 2 TORNADO 0 0 2.5 K 0 <NA>
## 3 TORNADO 0 2 25.0 K 0 <NA>
## 4 TORNADO 0 2 2.5 K 0 <NA>
## 5 TORNADO 0 2 2.5 K 0 <NA>
## 6 TORNADO 0 6 2.5 K 0 <NA>
## 7 TORNADO 0 1 2.5 K 0 <NA>
## 8 TORNADO 0 0 2.5 K 0 <NA>
## 9 TORNADO 1 14 25.0 K 0 <NA>
## 10 TORNADO 0 0 25.0 K 0 <NA>
## 11 TORNADO 0 3 2.5 M 0 <NA>
## 12 TORNADO 0 3 2.5 M 0 <NA>
## 13 TORNADO 1 26 250.0 K 0 <NA>
## 14 TORNADO 0 12 0.0 K 0 <NA>
## 15 TORNADO 0 6 25.0 K 0 <NA>
## 16 TORNADO 4 50 25.0 K 0 <NA>
## 17 TORNADO 0 2 25.0 K 0 <NA>
## 18 TORNADO 0 0 25.0 K 0 <NA>
## 19 TORNADO 0 0 25.0 K 0 <NA>
## 20 TORNADO 0 0 25.0 K 0 <NA>
# official event list created manually from documentation PDF file
# located here: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
# copy and paste table of official events to csv file
# read correct names of events
EVT_names <- read.csv("EVT_names.csv", sep=",", header=TRUE)
EVT_names$EVT_Name <- as.character(EVT_names$EVT_Name) # to make sure this is set as character to be able to match with names in EVTYPE
EVT_names
## EVT_Name
## 1 Astronomical Low Tide
## 2 Avalanche
## 3 Blizzard
## 4 Coastal Flood
## 5 Cold/Wind Chill
## 6 Debris Flow
## 7 Dense Fog
## 8 Dense Smoke
## 9 Drought
## 10 Dust Devil
## 11 Dust Storm
## 12 Excessive Heat
## 13 Extreme Cold/Wind Chill
## 14 Flash Flood
## 15 Flood
## 16 Frost/Freeeze
## 17 Funnel Cloud
## 18 Freezing Fog
## 19 Hail
## 20 Heat
## 21 Heavy Rain
## 22 Heavy Snow
## 23 High Surf
## 24 High Wind
## 25 Hurricane (Typhoon)
## 26 Ice Storm
## 27 Lake-Effect Snow
## 28 Lakeshore Flood
## 29 Lightning
## 30 Marine Hail
## 31 Marine High Wind
## 32 Marine Strong Wind
## 33 Marine Thunderstorm Wind
## 34 Rip Current
## 35 Seiche
## 36 Sleet
## 37 Storm Surge/Tide
## 38 Strong Wind
## 39 Thunderstorm Wind
## 40 Tornado
## 41 Tropical Depression
## 42 Tropical Storm
## 43 Tsunami
## 44 Volcanic Ash
## 45 Waterspout
## 46 Wildfire
## 47 Winter Storm
## 48 Winter Weather
# new column with correct type of event with closest match
# had to try multiple maxDist values to get all the original event names to get closet match index (ie no more NAs in index values)
match_index <- amatch(storm1$EVTYPE, EVT_names$EVT_Name, method = "osa", maxDist = 20)
# include matched values as a new column "Event"
storm1$Event <- EVT_names$EVT_Name[match_index]
head(storm1, n=20)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP Event
## 1 TORNADO 0 15 25.0 K 0 <NA> Flood
## 2 TORNADO 0 0 2.5 K 0 <NA> Flood
## 3 TORNADO 0 2 25.0 K 0 <NA> Flood
## 4 TORNADO 0 2 2.5 K 0 <NA> Flood
## 5 TORNADO 0 2 2.5 K 0 <NA> Flood
## 6 TORNADO 0 6 2.5 K 0 <NA> Flood
## 7 TORNADO 0 1 2.5 K 0 <NA> Flood
## 8 TORNADO 0 0 2.5 K 0 <NA> Flood
## 9 TORNADO 1 14 25.0 K 0 <NA> Flood
## 10 TORNADO 0 0 25.0 K 0 <NA> Flood
## 11 TORNADO 0 3 2.5 M 0 <NA> Flood
## 12 TORNADO 0 3 2.5 M 0 <NA> Flood
## 13 TORNADO 1 26 250.0 K 0 <NA> Flood
## 14 TORNADO 0 12 0.0 K 0 <NA> Flood
## 15 TORNADO 0 6 25.0 K 0 <NA> Flood
## 16 TORNADO 4 50 25.0 K 0 <NA> Flood
## 17 TORNADO 0 2 25.0 K 0 <NA> Flood
## 18 TORNADO 0 0 25.0 K 0 <NA> Flood
## 19 TORNADO 0 0 25.0 K 0 <NA> Flood
## 20 TORNADO 0 0 25.0 K 0 <NA> Flood
# Assumption here is that "most harmful" means events involving maximum no of fatalities based on mean value from the type of event
storm1_summary <- storm1 %>%
group_by(Event) %>%
summarise(
total_fatalities = sum(FATALITIES, na.rm = TRUE),
) %>%
arrange(desc(total_fatalities))
head(storm1_summary)
## # A tibble: 6 × 2
## Event total_fatalities
## <chr> <dbl>
## 1 "Flood " 6188
## 2 "Hail " 3186
## 3 "Excessive Heat " 1903
## 4 "Dense Fog " 1206
## 5 "Rip Current " 580
## 6 "Heavy Snow " 479
storm1_summary %>% slice_max(total_fatalities, n = 1) # which event causes most fatalities
## # A tibble: 1 × 2
## Event total_fatalities
## <chr> <dbl>
## 1 "Flood " 6188
## create a new column to multiply with exponents for both property and crop damages and add them to get total damage by event
# create a column with all possible exponents
exp_list <- c("B" = 10^9, "b" = 10^9, "H" = 10^2, "h" = 10^2, "K" = 10^3, "k" = 10^3, "M" = 10^6, "m" = 10^6, "0" = 1, "1" = 10, "2" = 100, "3" = 10^3, "4" = 10^4, "5" = 10^5, "6" = 10^6, "7" = 10^7, "8" = 10^8, "+" = 1, "-" = 0, "?" = 0)
# add a column for total property damage after calculation in $$
storm2 <- storm1 %>%
mutate(
exp_factor = exp_list[as.character(PROPDMGEXP)],
total_prop_dmg = PROPDMG * ifelse(is.na(exp_factor), 0, exp_factor)
)
head(storm2)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP Event
## 1 TORNADO 0 15 25.0 K 0 <NA> Flood
## 2 TORNADO 0 0 2.5 K 0 <NA> Flood
## 3 TORNADO 0 2 25.0 K 0 <NA> Flood
## 4 TORNADO 0 2 2.5 K 0 <NA> Flood
## 5 TORNADO 0 2 2.5 K 0 <NA> Flood
## 6 TORNADO 0 6 2.5 K 0 <NA> Flood
## exp_factor total_prop_dmg
## 1 1000 25000
## 2 1000 2500
## 3 1000 25000
## 4 1000 2500
## 5 1000 2500
## 6 1000 2500
# add a column for total crop damage after calculation in $$
storm3 <- storm2 %>%
mutate(
exp_factor = exp_list[as.character(CROPDMGEXP)],
total_crop_dmg = CROPDMG * ifelse(is.na(exp_factor), 0, exp_factor)
)
head(storm3)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP Event
## 1 TORNADO 0 15 25.0 K 0 <NA> Flood
## 2 TORNADO 0 0 2.5 K 0 <NA> Flood
## 3 TORNADO 0 2 25.0 K 0 <NA> Flood
## 4 TORNADO 0 2 2.5 K 0 <NA> Flood
## 5 TORNADO 0 2 2.5 K 0 <NA> Flood
## 6 TORNADO 0 6 2.5 K 0 <NA> Flood
## exp_factor total_prop_dmg total_crop_dmg
## 1 NA 25000 0
## 2 NA 2500 0
## 3 NA 25000 0
## 4 NA 2500 0
## 5 NA 2500 0
## 6 NA 2500 0
# calculate total damages from combined property and crop damages
storm_final <- storm3 %>%
mutate(
total_damages = total_prop_dmg + total_crop_dmg,
) %>%
arrange(desc(total_damages)) # arrange data in descending order
# head(storm_final) added to check intermediate result
head(storm_final)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 FLOOD 0 0 115.00 B 32.5 M
## 2 STORM SURGE 0 0 31.30 B 0.0 <NA>
## 3 HURRICANE/TYPHOON 0 0 16.93 B 0.0 <NA>
## 4 STORM SURGE 0 0 11.26 B 0.0 <NA>
## 5 RIVER FLOOD 0 0 5.00 B 5.0 B
## 6 HURRICANE/TYPHOON 5 0 10.00 B 0.0 <NA>
## Event exp_factor total_prop_dmg total_crop_dmg total_damages
## 1 Flood 1e+06 1.150e+11 3.25e+07 115032500000
## 2 Heavy Snow NA 3.130e+10 0.00e+00 31300000000
## 3 Avalanche NA 1.693e+10 0.00e+00 16930000000
## 4 Heavy Snow NA 1.126e+10 0.00e+00 11260000000
## 5 Dense Fog 1e+09 5.000e+09 5.00e+09 10000000000
## 6 Avalanche NA 1.000e+10 0.00e+00 10000000000
top_dmg <- storm_final %>%
select(Event, total_damages) %>%
slice_max(total_damages, n = 20)
head(top_dmg, n=20)
## Event total_damages
## 1 Flood 115032500000
## 2 Heavy Snow 31300000000
## 3 Avalanche 16930000000
## 4 Heavy Snow 11260000000
## 5 Dense Fog 10000000000
## 6 Avalanche 10000000000
## 7 Avalanche 7390000000
## 8 Avalanche 7350000000
## 9 Avalanche 5705000000
## 10 Heavy Snow 5150000000
## 11 Ice Storm 5000500000
## 12 Heavy Snow 5000000000
## 13 Avalanche 4923200000
## 14 Avalanche 4025000000
## 15 Avalanche 4000000000
## 16 Storm Surge/Tide 4000000000
## 17 Hail 3500000000
## 18 Flood 3000000000
## 19 Flood 2800000000
## 20 Avalanche 2525000000
# select top 10 events
topevents <- storm1_summary %>% filter(total_fatalities > 120)
head(topevents, n = 10)
## # A tibble: 10 × 2
## Event total_fatalities
## <chr> <dbl>
## 1 "Flood " 6188
## 2 "Hail " 3186
## 3 "Excessive Heat " 1903
## 4 "Dense Fog " 1206
## 5 "Rip Current " 580
## 6 "Heavy Snow " 479
## 7 "Avalanche " 289
## 8 "Blizzard" 223
## 9 "Extreme Cold/Wind Chill " 125
## 10 "Ice Storm " 123
# create a plot to the events with higher fatalities to answer Q1
ggplot(topevents, aes(x = reorder(Event, -total_fatalities), y = total_fatalities)) +
geom_col(fill = "blue") +
labs(
title = "Figure 1. Top 10 Impactful Events on Population Health\nAcross the USA",
x = "Event Type",
y = "Total Fatalities"
) +
scale_y_continuous(limits=c(0, 8000)) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
### Figure 2
ggplot(top_dmg, aes(x = Event, y = total_damages)) +
geom_col(fill = "orange") +
scale_y_continuous(
labels = label_dollar(scale = 1e-9, suffix = "B", accuracy = 1),
expand = expansion(mult = c(0, 0.05))
) +
labs(
title = "Top Storm Events by Economic Damage",
x = "Event Type",
y = "Total Damages (USD Billions)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))