Introduction

This study analyses storm data, focusing on event type and different kinds of damage, specifically population healt in the form of fatalities and injuries and economic consecuenses in the form of property and crop damage.

Data Loading

We upload the .csv data, located in the same directory as the script and create a subset with the health and economic data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(stringdist)

# Reading the data
storm_data <- read.csv(file = bzfile("repdata_data_StormData.csv.bz2"))

Data Processing

For this part we will create a list with the official event names and correct the dataframe to match it.

#First we create a list with the EVTYPEs from the official list
# Create a vector containing the events
events <- c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood", 
            "Cold/Wind Chill", "Debris Flow", "Dense Fog", "Dense Smoke", 
            "Drought", "Dust Devil", "Dust Storm", "Excessive Heat", 
            "Extreme Cold/Wind Chill", "Flash Flood", "Flood", "Frost/Freeze", 
            "Funnel Cloud", "Freezing Fog", "Hail", "Heat", "Heavy Rain", 
            "Heavy Snow", "High Surf", "High Wind", "Hurricane (Typhoon)", 
            "Ice Storm", "Lake-Effect Snow", "Lakeshore Flood", "Lightning", 
            "Marine Hail", "Marine High Wind", "Marine Strong Wind", 
            "Marine Thunderstorm Wind", "Rip Current", "Seiche", "Sleet", 
            "Storm Surge/Tide", "Strong Wind", "Thunderstorm Wind", "Tornado", 
            "Tropical Depression", "Tropical Storm", "Tsunami", "Volcanic Ash", 
            "Waterspout", "Wildfire", "Winter Storm", "Winter Weather")

# Create a data frame with one column named 'EVENT'
event_table <- data.frame(EVENT = events)

# Now 

# Use amatch to find the closest match for each event in EVTYPE
# The maximum distance can be adjusted according to your needs. A lower value means a closer match.
closest_matches <- sapply(storm_data$EVTYPE, function(x) {
  match_index <- amatch(x, events, maxDist = 30)
  if (!is.na(match_index)) {
    return(events[match_index])
  } else {
    return(NA) # Return NA if no close match is found
  }
})

# You can then replace the EVTYPE column with these matches or create a new column
storm_data$EVTYPE_closest_match <- closest_matches

Data Summirizing

We will sumarize the data, considering the exponential number for economic dammage

# Summarizing the data

storm_data$PROPDMGEXP[storm_data$REFNUM == "605943"] <- "M"

library(dplyr)

storm_data <- storm_data %>%
  mutate(PROPDMGEXP = case_when(
    PROPDMGEXP %in% c("", "+", "-", "?") ~ "1",
    PROPDMGEXP %in% as.character(1:9) ~ "10",
    toupper(PROPDMGEXP) == "H" ~ "100",
    toupper(PROPDMGEXP) == "K" ~ "1000",
    toupper(PROPDMGEXP) == "M" ~ "1000000",
    toupper(PROPDMGEXP) == "B" ~ "1000000000",
    TRUE ~ PROPDMGEXP
  )) %>%
  mutate(CROPDMGEXP = case_when(
    CROPDMGEXP %in% c("", "+", "-", "?") ~ "1",
    CROPDMGEXP %in% as.character(1:9) ~ "10",
    toupper(CROPDMGEXP) == "H" ~ "100",
    toupper(CROPDMGEXP) == "K" ~ "1000",
    toupper(CROPDMGEXP) == "M" ~ "1000000",
    toupper(CROPDMGEXP) == "B" ~ "1000000000",
    TRUE ~ CROPDMGEXP
  )) %>%
  mutate_at(vars(PROPDMGEXP, CROPDMGEXP), as.numeric) %>%
  mutate(
    PROPERTY = PROPDMG * PROPDMGEXP / 1000000000,
    CROP = CROPDMG * CROPDMGEXP / 1000000000
  )

summarized_data <- storm_data %>%
  group_by(EVTYPE_closest_match) %>%
  summarise(
    Sum_Fatalities = sum(FATALITIES),
    Sum_Injuries = sum(INJURIES),
    Sum_PropDmg = sum(PROPERTY),
    Sum_CropDmg = sum(CROP)
  )

For the analysis itself, we are going to identify the top 10 event types on each of the damage categories, spliting the study by health and economy from now on.

Population’s Health Damage

We identify the top 10 causes on each category now

# Top 10 EVTYPEs by Fatalities
top_fatalities <- summarized_data %>%
  arrange(desc(Sum_Fatalities)) %>%
  slice(1:10)

# Top 10 EVTYPEs by Injuries
top_injuries <- summarized_data %>%
  arrange(desc(Sum_Injuries)) %>%
  slice(1:10)

# Print Top 10 EVTYPEs by Fatalities
print("Top 10 EVTYPEs by Fatalities:")
## [1] "Top 10 EVTYPEs by Fatalities:"
print(top_fatalities[, c("EVTYPE_closest_match", "Sum_Fatalities")])
## # A tibble: 10 × 2
##    EVTYPE_closest_match Sum_Fatalities
##    <chr>                         <dbl>
##  1 Tornado                        5633
##  2 Excessive Heat                 1924
##  3 Hail                           1899
##  4 High Wind                      1181
##  5 Flash Flood                    1019
##  6 Rip Current                     575
##  7 Flood                           551
##  8 Ice Storm                       420
##  9 Avalanche                       294
## 10 Debris Flow                     192
# Print Top 10 EVTYPEs by Injuries
print("Top 10 EVTYPEs by Injuries:")
## [1] "Top 10 EVTYPEs by Injuries:"
print(top_injuries[,c("EVTYPE_closest_match", "Sum_Injuries")])
## # A tibble: 10 × 2
##    EVTYPE_closest_match Sum_Injuries
##    <chr>                       <dbl>
##  1 Tornado                     91346
##  2 High Wind                   11168
##  3 Hail                         9043
##  4 Flood                        7751
##  5 Excessive Heat               6527
##  6 Ice Storm                    3946
##  7 Flash Flood                  1786
##  8 Avalanche                    1475
##  9 Heavy Snow                   1161
## 10 Wildfire                      913

With the most harmful event types identified, we plot them to visualize them

# Plot and save the plot for Top 10 EVTYPEs by Fatalities
fatalities_plot <- ggplot(top_fatalities, aes(x = reorder(EVTYPE_closest_match, Sum_Fatalities), y = Sum_Fatalities)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 event types by Fatalities", x = "EVTYPE_closest_match", y = "Sum of Fatalities")
print(fatalities_plot)

# Plot and save the plot for Top 10 EVTYPEs by Injuries
injuries_plot <- ggplot(top_injuries, aes(x = reorder(EVTYPE_closest_match, Sum_Injuries), y = Sum_Injuries)) +
  geom_bar(stat = "identity", fill = "darkred") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 event types by Injuries", x = "EVTYPE_closest_match", y = "Sum of Injuries")
print(injuries_plot)

With this information, we can assess that Tornado is the most harmful event in both categories by a wide margin.

To explore further, we are going to compare the sum of this event type with the sum of the remaining 9.

# Value for the top EVTYPE
top_value_fatalities <- top_fatalities$Sum_Fatalities[1]

# Sum of the rest
sum_rest_fatalities <- sum(top_fatalities$Sum_Fatalities[-1])

# Comparison
times_bigger_fatalities <- top_value_fatalities / sum_rest_fatalities

# Print the result
print(paste(top_fatalities$EVTYPE_closest_match[1] ," by fatalities is", times_bigger_fatalities, "times bigger than the sum of the rest in the top 10."))
## [1] "Tornado  by fatalities is 0.699317194289261 times bigger than the sum of the rest in the top 10."
# Value for the top EVTYPE
top_value_injuries <- top_injuries$Sum_Injuries[1]

# Sum of the rest
sum_rest_injuries <- sum(top_injuries$Sum_Injuries[-1])

# Comparison
times_bigger_injuries <- top_value_injuries / sum_rest_injuries

# Print the result
print(paste(top_injuries$EVTYPE_closest_match[1]," by injuries is", times_bigger_injuries, "times bigger than the sum of the rest in the top 10."))
## [1] "Tornado  by injuries is 2.08695453506968 times bigger than the sum of the rest in the top 10."

Economic Damage

We identify the top 10 causes on each category now

# Top 10 EVTYPEs by Property Damage
top_prop <- summarized_data %>%
  arrange(desc(Sum_PropDmg)) %>%
  slice(1:10)

# Top 10 EVTYPEs by Crop Damage
top_crop <- summarized_data %>%
  arrange(desc(Sum_CropDmg)) %>%
  slice(1:10)

# Print Top 10 EVTYPEs by Property Damage
print("Top 10 EVTYPEs by Property Damage:")
## [1] "Top 10 EVTYPEs by Property Damage:"
print(top_prop[, c("EVTYPE_closest_match", "Sum_PropDmg")])
## # A tibble: 10 × 2
##    EVTYPE_closest_match Sum_PropDmg
##    <chr>                      <dbl>
##  1 Avalanche                  69.3 
##  2 Tornado                    56.9 
##  3 Dense Smoke                44.9 
##  4 Flood                      30.0 
##  5 Hail                       29.4 
##  6 Ice Storm                  19.3 
##  7 Flash Flood                16.8 
##  8 High Wind                  15.6 
##  9 Heavy Rain                  6.75
## 10 Storm Surge/Tide            5.90
# Print Top 10 EVTYPEs by Crop Damage
print("Top 10 EVTYPEs by Crop Damage:")
## [1] "Top 10 EVTYPEs by Crop Damage:"
print(top_crop[,c("EVTYPE_closest_match", "Sum_CropDmg")])
## # A tibble: 10 × 2
##    EVTYPE_closest_match Sum_CropDmg
##    <chr>                      <dbl>
##  1 Drought                    14.0 
##  2 Flood                       6.19
##  3 Hail                        6.18
##  4 Ice Storm                   5.76
##  5 Dense Fog                   5.06
##  6 Avalanche                   2.61
##  7 High Wind                   1.84
##  8 Debris Flow                 1.80
##  9 Flash Flood                 1.47
## 10 Frost/Freeze                1.24

With the most harmful event types identified, we plot them to visualize them

# Plot and save the plot for Top 10 EVTYPEs by Property Damage
prop_plot <- ggplot(top_prop, aes(x = reorder(EVTYPE_closest_match, Sum_PropDmg), y = Sum_PropDmg)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 event types by Property Damage", x = "EVTYPE_closest_match", y = "Sum of Property Damage")
print(prop_plot)

# Plot and save the plot for Top 10 EVTYPEs by Crop Damage
crop_plot <- ggplot(top_crop, aes(x = reorder(EVTYPE_closest_match, Sum_CropDmg), y = Sum_CropDmg)) +
  geom_bar(stat = "identity", fill = "darkred") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 event types by Crop Damage", x = "EVTYPE_closest_match", y = "Sum of Crop Damage")
print(crop_plot)

With this information, we can assess that Avalanche is the most harmful event on property and Drought is the most harmful on crops by wide margin.

To explore further, we are going to compare the sum of this event types with the sum of the remaining 9.

# Value for the top EVTYPE
top_value_prop <- top_prop$Sum_PropDmg[1]

# Sum of the rest
sum_rest_prop <- sum(top_prop$Sum_PropDmg[-1])

# Comparison
times_bigger_prop <- top_value_prop / sum_rest_prop

# Print the result
print(paste(top_prop$EVTYPE_closest_match[1] ," by Property Damage is", times_bigger_prop, "times bigger than the sum of the rest in the top 10."))
## [1] "Avalanche  by Property Damage is 0.307148521696779 times bigger than the sum of the rest in the top 10."
# Value for the top EVTYPE
top_value_crop <- top_crop$Sum_CropDmg[1]

# Sum of the rest
sum_rest_crop <- sum(top_crop$Sum_CropDmg[-1])

# Comparison
times_bigger_crop <- top_value_crop / sum_rest_crop

# Print the result
print(paste(top_crop$EVTYPE_closest_match[1]," by crop damage is", times_bigger_crop, "times bigger than the sum of the rest in the top 10."))
## [1] "Drought  by crop damage is 0.43458659745176 times bigger than the sum of the rest in the top 10."