Set up dir

load libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyr)
library(readr)
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.27.1 (2025-05-02 21:00:05 UTC) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
## 
##     throw
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save
## R.utils v2.13.0 (2025-02-24 21:20:02 UTC) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, isOpen, nullfile, parse, use, warnings
library(stringdist)
## 
## Attaching package: 'stringdist'
## The following object is masked from 'package:R.utils':
## 
##     extract
## The following object is masked from 'package:tidyr':
## 
##     extract
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor

Data Processing

download and read data

fileurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- "StormData.csv.bz2"

# Download the file if it doesn't already exist
if (!file.exists(destfile)) {
  download.file(url, destfile, mode = "wb")
}
# Save download date
download_date <- Sys.Date()
list.files() # check file
##  [1] "Assign2.html"                                    
##  [2] "Assign2.md"                                      
##  [3] "Assign2.Rmd"                                     
##  [4] "Assign2_cache"                                   
##  [5] "Assign2_files"                                   
##  [6] "baggerly.pdf"                                    
##  [7] "column headers.docx"                             
##  [8] "Commentaries.txt"                                
##  [9] "EVT_names.csv"                                   
## [10] "repdata_peer2_doc_NCDC Storm Events-FAQ Page.pdf"
## [11] "repdata_peer2_doc_pd01016005curr.pdf"            
## [12] "RRCaseStudy.pdf"                                 
## [13] "StormData.csv.bz2"
# reading the file as csv file and replacing blank cells (NULL) with NAs
# as operations are not feasible with NULL

storm <- read.csv("StormData.csv.bz2", na.strings = c("", "NA"))
# understand data
dim(storm) # check data download
## [1] 902297     37
#str(storm)
#names(storm)
#head(storm)
#unique(storm$EVTYPE) # turned off internally

keep required columns

# select the required columns

storm1 <- storm %>% select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

storm1$EVTYPE <- as.character(storm1$EVTYPE) # to make sure this is set as character to be able to match with official event list

head(storm1, n = 20) # to see some data
##     EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1  TORNADO          0       15    25.0          K       0       <NA>
## 2  TORNADO          0        0     2.5          K       0       <NA>
## 3  TORNADO          0        2    25.0          K       0       <NA>
## 4  TORNADO          0        2     2.5          K       0       <NA>
## 5  TORNADO          0        2     2.5          K       0       <NA>
## 6  TORNADO          0        6     2.5          K       0       <NA>
## 7  TORNADO          0        1     2.5          K       0       <NA>
## 8  TORNADO          0        0     2.5          K       0       <NA>
## 9  TORNADO          1       14    25.0          K       0       <NA>
## 10 TORNADO          0        0    25.0          K       0       <NA>
## 11 TORNADO          0        3     2.5          M       0       <NA>
## 12 TORNADO          0        3     2.5          M       0       <NA>
## 13 TORNADO          1       26   250.0          K       0       <NA>
## 14 TORNADO          0       12     0.0          K       0       <NA>
## 15 TORNADO          0        6    25.0          K       0       <NA>
## 16 TORNADO          4       50    25.0          K       0       <NA>
## 17 TORNADO          0        2    25.0          K       0       <NA>
## 18 TORNADO          0        0    25.0          K       0       <NA>
## 19 TORNADO          0        0    25.0          K       0       <NA>
## 20 TORNADO          0        0    25.0          K       0       <NA>

tidy up the event names to 48 types and do fuzzy matching

# official event list created manually from documentation PDF file 
# located here: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
# copy and paste table of official events to csv file
# read correct names of events

EVT_names <- read.csv("EVT_names.csv", sep=",", header=TRUE)

EVT_names$EVT_Name <- as.character(EVT_names$EVT_Name) # to make sure this is set as character to be able to match with names in EVTYPE

EVT_names
##                    EVT_Name
## 1    Astronomical Low Tide 
## 2                Avalanche 
## 3                  Blizzard
## 4            Coastal Flood 
## 5           Cold/Wind Chill
## 6              Debris Flow 
## 7                Dense Fog 
## 8              Dense Smoke 
## 9                  Drought 
## 10              Dust Devil 
## 11              Dust Storm 
## 12          Excessive Heat 
## 13 Extreme Cold/Wind Chill 
## 14             Flash Flood 
## 15                   Flood 
## 16            Frost/Freeeze
## 17             Funnel Cloud
## 18            Freezing Fog 
## 19                    Hail 
## 20                    Heat 
## 21              Heavy Rain 
## 22              Heavy Snow 
## 23               High Surf 
## 24               High Wind 
## 25     Hurricane (Typhoon) 
## 26               Ice Storm 
## 27        Lake-Effect Snow 
## 28         Lakeshore Flood 
## 29               Lightning 
## 30              Marine Hail
## 31         Marine High Wind
## 32       Marine Strong Wind
## 33 Marine Thunderstorm Wind
## 34             Rip Current 
## 35                  Seiche 
## 36                   Sleet 
## 37        Storm Surge/Tide 
## 38             Strong Wind 
## 39       Thunderstorm Wind 
## 40                 Tornado 
## 41     Tropical Depression 
## 42          Tropical Storm 
## 43                 Tsunami 
## 44            Volcanic Ash 
## 45               Waterspout
## 46                Wildfire 
## 47            Winter Storm 
## 48          Winter Weather
# new column with correct type of event with closest match 
# had to try multiple maxDist values to get all the original event names to get closet match index (ie no more NAs in index values)

match_index <- amatch(storm1$EVTYPE, EVT_names$EVT_Name, method = "osa", maxDist = 20)

# include matched values as a new column "Event"

storm1$Event <- EVT_names$EVT_Name[match_index]

head(storm1, n=20)
##     EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  Event
## 1  TORNADO          0       15    25.0          K       0       <NA> Flood 
## 2  TORNADO          0        0     2.5          K       0       <NA> Flood 
## 3  TORNADO          0        2    25.0          K       0       <NA> Flood 
## 4  TORNADO          0        2     2.5          K       0       <NA> Flood 
## 5  TORNADO          0        2     2.5          K       0       <NA> Flood 
## 6  TORNADO          0        6     2.5          K       0       <NA> Flood 
## 7  TORNADO          0        1     2.5          K       0       <NA> Flood 
## 8  TORNADO          0        0     2.5          K       0       <NA> Flood 
## 9  TORNADO          1       14    25.0          K       0       <NA> Flood 
## 10 TORNADO          0        0    25.0          K       0       <NA> Flood 
## 11 TORNADO          0        3     2.5          M       0       <NA> Flood 
## 12 TORNADO          0        3     2.5          M       0       <NA> Flood 
## 13 TORNADO          1       26   250.0          K       0       <NA> Flood 
## 14 TORNADO          0       12     0.0          K       0       <NA> Flood 
## 15 TORNADO          0        6    25.0          K       0       <NA> Flood 
## 16 TORNADO          4       50    25.0          K       0       <NA> Flood 
## 17 TORNADO          0        2    25.0          K       0       <NA> Flood 
## 18 TORNADO          0        0    25.0          K       0       <NA> Flood 
## 19 TORNADO          0        0    25.0          K       0       <NA> Flood 
## 20 TORNADO          0        0    25.0          K       0       <NA> Flood

Process the data for Q1

# Assumption here is that "most harmful" means events involving maximum no of fatalities based on mean value from the type of event

storm1_summary <- storm1 %>%
  group_by(Event) %>%
  summarise(
    total_fatalities = sum(FATALITIES, na.rm = TRUE),
  ) %>%
arrange(desc(total_fatalities)) 

head(storm1_summary)
## # A tibble: 6 × 2
##   Event             total_fatalities
##   <chr>                        <dbl>
## 1 "Flood "                      6188
## 2 "Hail "                       3186
## 3 "Excessive Heat "             1903
## 4 "Dense Fog "                  1206
## 5 "Rip Current "                 580
## 6 "Heavy Snow "                  479
storm1_summary %>% slice_max(total_fatalities, n = 1) # which event causes most fatalities
## # A tibble: 1 × 2
##   Event    total_fatalities
##   <chr>               <dbl>
## 1 "Flood "             6188

Process the data to answer Q2

## create a new column to multiply with exponents for both property and crop damages and add them to get total damage by event

# create a column with all possible exponents 

exp_list <- c("B" = 10^9, "b" = 10^9, "H" = 10^2, "h" = 10^2, "K" = 10^3, "k" = 10^3, "M" = 10^6, "m" = 10^6, "0" = 1, "1" = 10, "2" = 100, "3" = 10^3, "4" = 10^4, "5" = 10^5, "6" = 10^6, "7" = 10^7, "8" = 10^8, "+" = 1, "-" = 0, "?" = 0)

# add a column for total property damage after calculation in $$
storm2 <- storm1 %>%
  mutate(
    exp_factor = exp_list[as.character(PROPDMGEXP)],
   total_prop_dmg = PROPDMG * ifelse(is.na(exp_factor), 0, exp_factor)
  )

head(storm2)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  Event
## 1 TORNADO          0       15    25.0          K       0       <NA> Flood 
## 2 TORNADO          0        0     2.5          K       0       <NA> Flood 
## 3 TORNADO          0        2    25.0          K       0       <NA> Flood 
## 4 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 5 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 6 TORNADO          0        6     2.5          K       0       <NA> Flood 
##   exp_factor total_prop_dmg
## 1       1000          25000
## 2       1000           2500
## 3       1000          25000
## 4       1000           2500
## 5       1000           2500
## 6       1000           2500
# add a column for total crop damage after calculation in $$

storm3 <- storm2 %>%
  mutate(
    exp_factor = exp_list[as.character(CROPDMGEXP)],
   total_crop_dmg = CROPDMG * ifelse(is.na(exp_factor), 0, exp_factor)
  )

head(storm3)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  Event
## 1 TORNADO          0       15    25.0          K       0       <NA> Flood 
## 2 TORNADO          0        0     2.5          K       0       <NA> Flood 
## 3 TORNADO          0        2    25.0          K       0       <NA> Flood 
## 4 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 5 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 6 TORNADO          0        6     2.5          K       0       <NA> Flood 
##   exp_factor total_prop_dmg total_crop_dmg
## 1         NA          25000              0
## 2         NA           2500              0
## 3         NA          25000              0
## 4         NA           2500              0
## 5         NA           2500              0
## 6         NA           2500              0
# calculate total damages from combined property and crop damages
storm_final <- storm3 %>%
  mutate(
    total_damages = total_prop_dmg + total_crop_dmg,
  ) %>% 
  arrange(desc(total_damages)) # arrange data in descending order

# head(storm_final) added to check intermediate result

head(storm_final)
##              EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1             FLOOD          0        0  115.00          B    32.5          M
## 2       STORM SURGE          0        0   31.30          B     0.0       <NA>
## 3 HURRICANE/TYPHOON          0        0   16.93          B     0.0       <NA>
## 4       STORM SURGE          0        0   11.26          B     0.0       <NA>
## 5       RIVER FLOOD          0        0    5.00          B     5.0          B
## 6 HURRICANE/TYPHOON          5        0   10.00          B     0.0       <NA>
##         Event exp_factor total_prop_dmg total_crop_dmg total_damages
## 1      Flood       1e+06      1.150e+11       3.25e+07  115032500000
## 2 Heavy Snow          NA      3.130e+10       0.00e+00   31300000000
## 3  Avalanche          NA      1.693e+10       0.00e+00   16930000000
## 4 Heavy Snow          NA      1.126e+10       0.00e+00   11260000000
## 5  Dense Fog       1e+09      5.000e+09       5.00e+09   10000000000
## 6  Avalanche          NA      1.000e+10       0.00e+00   10000000000
top_dmg <- storm_final %>% 
  select(Event, total_damages) %>%
  slice_max(total_damages, n = 20)
head(top_dmg, n=20)
##                Event total_damages
## 1             Flood   115032500000
## 2        Heavy Snow    31300000000
## 3         Avalanche    16930000000
## 4        Heavy Snow    11260000000
## 5         Dense Fog    10000000000
## 6         Avalanche    10000000000
## 7         Avalanche     7390000000
## 8         Avalanche     7350000000
## 9         Avalanche     5705000000
## 10       Heavy Snow     5150000000
## 11        Ice Storm     5000500000
## 12       Heavy Snow     5000000000
## 13        Avalanche     4923200000
## 14        Avalanche     4025000000
## 15        Avalanche     4000000000
## 16 Storm Surge/Tide     4000000000
## 17             Hail     3500000000
## 18            Flood     3000000000
## 19            Flood     2800000000
## 20        Avalanche     2525000000

RESULTS

Impact of storm events on population health especially fatalities

# select top 10 events

topevents <- storm1_summary %>% filter(total_fatalities > 120)
head(topevents, n = 10)
## # A tibble: 10 × 2
##    Event                      total_fatalities
##    <chr>                                 <dbl>
##  1 "Flood "                               6188
##  2 "Hail "                                3186
##  3 "Excessive Heat "                      1903
##  4 "Dense Fog "                           1206
##  5 "Rip Current "                          580
##  6 "Heavy Snow "                           479
##  7 "Avalanche "                            289
##  8 "Blizzard"                              223
##  9 "Extreme Cold/Wind Chill "              125
## 10 "Ice Storm "                            123
# create a plot to the events with higher fatalities to answer Q1

ggplot(topevents, aes(x = reorder(Event, -total_fatalities), y = total_fatalities)) +
  geom_col(fill = "blue") +
  labs(
    title = "Figure 1. Top 10 Impactful Events on Population Health\nAcross the USA",
    x = "Event Type",
    y = "Total Fatalities"
  ) +
  scale_y_continuous(limits=c(0, 8000)) + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

### Figure 2

ggplot(top_dmg, aes(x = Event, y = total_damages)) +
  geom_col(fill = "orange") +
  scale_y_continuous(
    labels = label_dollar(scale = 1e-9, suffix = "B", accuracy = 1),
    expand = expansion(mult = c(0, 0.05))
  ) +
  labs(
    title = "Top Storm Events by Economic Damage",
    x = "Event Type",
    y = "Total Damages (USD Billions)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))