Set up dir

load libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(tidyr)
library(readr)
library(R.utils)

## Loading required package: R.oo

## Loading required package: R.methodsS3

## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.

## R.oo v1.27.1 (2025-05-02 21:00:05 UTC) successfully loaded. See ?R.oo for help.

## 
## Attaching package: 'R.oo'

## The following object is masked from 'package:R.methodsS3':
## 
##     throw

## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods

## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save

## R.utils v2.13.0 (2025-02-24 21:20:02 UTC) successfully loaded. See ?R.utils for help.

## 
## Attaching package: 'R.utils'

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:utils':
## 
##     timestamp

## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, isOpen, nullfile, parse, use, warnings

library(stringdist)

## 
## Attaching package: 'stringdist'

## The following object is masked from 'package:R.utils':
## 
##     extract

## The following object is masked from 'package:tidyr':
## 
##     extract

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

Data Processing

download and read data

fileurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- "StormData.csv.bz2"

# Download the file if it doesn't already exist
if (!file.exists(destfile)) {
  download.file(url, destfile, mode = "wb")
}
# Save download date
download_date <- Sys.Date()
list.files() # check file

##  [1] "Assign2.html"                                    
##  [2] "Assign2.md"                                      
##  [3] "Assign2.Rmd"                                     
##  [4] "Assign2_cache"                                   
##  [5] "Assign2_files"                                   
##  [6] "baggerly.pdf"                                    
##  [7] "column headers.docx"                             
##  [8] "Commentaries.txt"                                
##  [9] "EVT_names.csv"                                   
## [10] "repdata_peer2_doc_NCDC Storm Events-FAQ Page.pdf"
## [11] "repdata_peer2_doc_pd01016005curr.pdf"            
## [12] "RRCaseStudy.pdf"                                 
## [13] "StormData.csv.bz2"

# reading the file as csv file and replacing blank cells (NULL) with NAs
# as operations are not feasible with NULL

storm <- read.csv("StormData.csv.bz2", na.strings = c("", "NA"))
# understand data
dim(storm) # check data download

## [1] 902297     37

#str(storm)
#names(storm)
#head(storm)
#unique(storm$EVTYPE) # turned off internally

keep required columns

# select the required columns

storm1 <- storm %>% select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

storm1$EVTYPE <- as.character(storm1$EVTYPE) # to make sure this is set as character to be able to match with official event list

head(storm1, n = 20) # to see some data

##     EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1  TORNADO          0       15    25.0          K       0       <NA>
## 2  TORNADO          0        0     2.5          K       0       <NA>
## 3  TORNADO          0        2    25.0          K       0       <NA>
## 4  TORNADO          0        2     2.5          K       0       <NA>
## 5  TORNADO          0        2     2.5          K       0       <NA>
## 6  TORNADO          0        6     2.5          K       0       <NA>
## 7  TORNADO          0        1     2.5          K       0       <NA>
## 8  TORNADO          0        0     2.5          K       0       <NA>
## 9  TORNADO          1       14    25.0          K       0       <NA>
## 10 TORNADO          0        0    25.0          K       0       <NA>
## 11 TORNADO          0        3     2.5          M       0       <NA>
## 12 TORNADO          0        3     2.5          M       0       <NA>
## 13 TORNADO          1       26   250.0          K       0       <NA>
## 14 TORNADO          0       12     0.0          K       0       <NA>
## 15 TORNADO          0        6    25.0          K       0       <NA>
## 16 TORNADO          4       50    25.0          K       0       <NA>
## 17 TORNADO          0        2    25.0          K       0       <NA>
## 18 TORNADO          0        0    25.0          K       0       <NA>
## 19 TORNADO          0        0    25.0          K       0       <NA>
## 20 TORNADO          0        0    25.0          K       0       <NA>

tidy up the event names to 48 types and do fuzzy matching

# official event list created manually from documentation PDF file 
# located here: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
# copy and paste table of official events to csv file
# read correct names of events

EVT_names <- read.csv("EVT_names.csv", sep=",", header=TRUE)

EVT_names$EVT_Name <- as.character(EVT_names$EVT_Name) # to make sure this is set as character to be able to match with names in EVTYPE

EVT_names

##                    EVT_Name
## 1    Astronomical Low Tide 
## 2                Avalanche 
## 3                  Blizzard
## 4            Coastal Flood 
## 5           Cold/Wind Chill
## 6              Debris Flow 
## 7                Dense Fog 
## 8              Dense Smoke 
## 9                  Drought 
## 10              Dust Devil 
## 11              Dust Storm 
## 12          Excessive Heat 
## 13 Extreme Cold/Wind Chill 
## 14             Flash Flood 
## 15                   Flood 
## 16            Frost/Freeeze
## 17             Funnel Cloud
## 18            Freezing Fog 
## 19                    Hail 
## 20                    Heat 
## 21              Heavy Rain 
## 22              Heavy Snow 
## 23               High Surf 
## 24               High Wind 
## 25     Hurricane (Typhoon) 
## 26               Ice Storm 
## 27        Lake-Effect Snow 
## 28         Lakeshore Flood 
## 29               Lightning 
## 30              Marine Hail
## 31         Marine High Wind
## 32       Marine Strong Wind
## 33 Marine Thunderstorm Wind
## 34             Rip Current 
## 35                  Seiche 
## 36                   Sleet 
## 37        Storm Surge/Tide 
## 38             Strong Wind 
## 39       Thunderstorm Wind 
## 40                 Tornado 
## 41     Tropical Depression 
## 42          Tropical Storm 
## 43                 Tsunami 
## 44            Volcanic Ash 
## 45               Waterspout
## 46                Wildfire 
## 47            Winter Storm 
## 48          Winter Weather

# new column with correct type of event with closest match 
# had to try multiple maxDist values to get all the original event names to get closet match index (ie no more NAs in index values)

match_index <- amatch(storm1$EVTYPE, EVT_names$EVT_Name, method = "osa", maxDist = 20)

# include matched values as a new column "Event"

storm1$Event <- EVT_names$EVT_Name[match_index]

head(storm1, n=20)

##     EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  Event
## 1  TORNADO          0       15    25.0          K       0       <NA> Flood 
## 2  TORNADO          0        0     2.5          K       0       <NA> Flood 
## 3  TORNADO          0        2    25.0          K       0       <NA> Flood 
## 4  TORNADO          0        2     2.5          K       0       <NA> Flood 
## 5  TORNADO          0        2     2.5          K       0       <NA> Flood 
## 6  TORNADO          0        6     2.5          K       0       <NA> Flood 
## 7  TORNADO          0        1     2.5          K       0       <NA> Flood 
## 8  TORNADO          0        0     2.5          K       0       <NA> Flood 
## 9  TORNADO          1       14    25.0          K       0       <NA> Flood 
## 10 TORNADO          0        0    25.0          K       0       <NA> Flood 
## 11 TORNADO          0        3     2.5          M       0       <NA> Flood 
## 12 TORNADO          0        3     2.5          M       0       <NA> Flood 
## 13 TORNADO          1       26   250.0          K       0       <NA> Flood 
## 14 TORNADO          0       12     0.0          K       0       <NA> Flood 
## 15 TORNADO          0        6    25.0          K       0       <NA> Flood 
## 16 TORNADO          4       50    25.0          K       0       <NA> Flood 
## 17 TORNADO          0        2    25.0          K       0       <NA> Flood 
## 18 TORNADO          0        0    25.0          K       0       <NA> Flood 
## 19 TORNADO          0        0    25.0          K       0       <NA> Flood 
## 20 TORNADO          0        0    25.0          K       0       <NA> Flood

Process the data for Q1

# Assumption here is that "most harmful" means events involving maximum no of fatalities based on mean value from the type of event

storm1_summary <- storm1 %>%
  group_by(Event) %>%
  summarise(
    total_fatalities = sum(FATALITIES, na.rm = TRUE),
  ) %>%
arrange(desc(total_fatalities)) 

head(storm1_summary)

## # A tibble: 6 × 2
##   Event             total_fatalities
##   <chr>                        <dbl>
## 1 "Flood "                      6188
## 2 "Hail "                       3186
## 3 "Excessive Heat "             1903
## 4 "Dense Fog "                  1206
## 5 "Rip Current "                 580
## 6 "Heavy Snow "                  479

storm1_summary %>% slice_max(total_fatalities, n = 1) # which event causes most fatalities

## # A tibble: 1 × 2
##   Event    total_fatalities
##   <chr>               <dbl>
## 1 "Flood "             6188

Process the data to answer Q2

## create a new column to multiply with exponents for both property and crop damages and add them to get total damage by event

# create a column with all possible exponents 

exp_list <- c("B" = 10^9, "b" = 10^9, "H" = 10^2, "h" = 10^2, "K" = 10^3, "k" = 10^3, "M" = 10^6, "m" = 10^6, "0" = 1, "1" = 10, "2" = 100, "3" = 10^3, "4" = 10^4, "5" = 10^5, "6" = 10^6, "7" = 10^7, "8" = 10^8, "+" = 1, "-" = 0, "?" = 0)

# add a column for total property damage after calculation in $$
storm2 <- storm1 %>%
  mutate(
    exp_factor = exp_list[as.character(PROPDMGEXP)],
   total_prop_dmg = PROPDMG * ifelse(is.na(exp_factor), 0, exp_factor)
  )

head(storm2)

##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  Event
## 1 TORNADO          0       15    25.0          K       0       <NA> Flood 
## 2 TORNADO          0        0     2.5          K       0       <NA> Flood 
## 3 TORNADO          0        2    25.0          K       0       <NA> Flood 
## 4 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 5 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 6 TORNADO          0        6     2.5          K       0       <NA> Flood 
##   exp_factor total_prop_dmg
## 1       1000          25000
## 2       1000           2500
## 3       1000          25000
## 4       1000           2500
## 5       1000           2500
## 6       1000           2500

# add a column for total crop damage after calculation in $$

storm3 <- storm2 %>%
  mutate(
    exp_factor = exp_list[as.character(CROPDMGEXP)],
   total_crop_dmg = CROPDMG * ifelse(is.na(exp_factor), 0, exp_factor)
  )

head(storm3)

##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  Event
## 1 TORNADO          0       15    25.0          K       0       <NA> Flood 
## 2 TORNADO          0        0     2.5          K       0       <NA> Flood 
## 3 TORNADO          0        2    25.0          K       0       <NA> Flood 
## 4 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 5 TORNADO          0        2     2.5          K       0       <NA> Flood 
## 6 TORNADO          0        6     2.5          K       0       <NA> Flood 
##   exp_factor total_prop_dmg total_crop_dmg
## 1         NA          25000              0
## 2         NA           2500              0
## 3         NA          25000              0
## 4         NA           2500              0
## 5         NA           2500              0
## 6         NA           2500              0

# calculate total damages from combined property and crop damages
storm_final <- storm3 %>%
  mutate(
    total_damages = total_prop_dmg + total_crop_dmg,
  ) %>% 
  arrange(desc(total_damages)) # arrange data in descending order

# head(storm_final) added to check intermediate result

head(storm_final)

##              EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1             FLOOD          0        0  115.00          B    32.5          M
## 2       STORM SURGE          0        0   31.30          B     0.0       <NA>
## 3 HURRICANE/TYPHOON          0        0   16.93          B     0.0       <NA>
## 4       STORM SURGE          0        0   11.26          B     0.0       <NA>
## 5       RIVER FLOOD          0        0    5.00          B     5.0          B
## 6 HURRICANE/TYPHOON          5        0   10.00          B     0.0       <NA>
##         Event exp_factor total_prop_dmg total_crop_dmg total_damages
## 1      Flood       1e+06      1.150e+11       3.25e+07  115032500000
## 2 Heavy Snow          NA      3.130e+10       0.00e+00   31300000000
## 3  Avalanche          NA      1.693e+10       0.00e+00   16930000000
## 4 Heavy Snow          NA      1.126e+10       0.00e+00   11260000000
## 5  Dense Fog       1e+09      5.000e+09       5.00e+09   10000000000
## 6  Avalanche          NA      1.000e+10       0.00e+00   10000000000

top_dmg <- storm_final %>% 
  select(Event, total_damages) %>%
  slice_max(total_damages, n = 20)
head(top_dmg, n=20)

##                Event total_damages
## 1             Flood   115032500000
## 2        Heavy Snow    31300000000
## 3         Avalanche    16930000000
## 4        Heavy Snow    11260000000
## 5         Dense Fog    10000000000
## 6         Avalanche    10000000000
## 7         Avalanche     7390000000
## 8         Avalanche     7350000000
## 9         Avalanche     5705000000
## 10       Heavy Snow     5150000000
## 11        Ice Storm     5000500000
## 12       Heavy Snow     5000000000
## 13        Avalanche     4923200000
## 14        Avalanche     4025000000
## 15        Avalanche     4000000000
## 16 Storm Surge/Tide     4000000000
## 17             Hail     3500000000
## 18            Flood     3000000000
## 19            Flood     2800000000
## 20        Avalanche     2525000000

RESULTS

Impact of storm events on population health especially fatalities

# select top 10 events

topevents <- storm1_summary %>% filter(total_fatalities > 120)
head(topevents, n = 10)

## # A tibble: 10 × 2
##    Event                      total_fatalities
##    <chr>                                 <dbl>
##  1 "Flood "                               6188
##  2 "Hail "                                3186
##  3 "Excessive Heat "                      1903
##  4 "Dense Fog "                           1206
##  5 "Rip Current "                          580
##  6 "Heavy Snow "                           479
##  7 "Avalanche "                            289
##  8 "Blizzard"                              223
##  9 "Extreme Cold/Wind Chill "              125
## 10 "Ice Storm "                            123

# create a plot to the events with higher fatalities to answer Q1

ggplot(topevents, aes(x = reorder(Event, -total_fatalities), y = total_fatalities)) +
  geom_col(fill = "blue") +
  labs(
    title = "Figure 1. Top 10 Impactful Events on Population Health\nAcross the USA",
    x = "Event Type",
    y = "Total Fatalities"
  ) +
  scale_y_continuous(limits=c(0, 8000)) + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

### Figure 2

ggplot(top_dmg, aes(x = Event, y = total_damages)) +
  geom_col(fill = "orange") +
  scale_y_continuous(
    labels = label_dollar(scale = 1e-9, suffix = "B", accuracy = 1),
    expand = expansion(mult = c(0, 0.05))
  ) +
  labs(
    title = "Top Storm Events by Economic Damage",
    x = "Event Type",
    y = "Total Damages (USD Billions)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Impact of Storms on Public Health and Economies

VM

2025-06-24

Set up dir

load libraries

Data Processing

download and read data

keep required columns

tidy up the event names to 48 types and do fuzzy matching

Process the data for Q1

Process the data to answer Q2

RESULTS

Impact of storm events on population health especially fatalities