This study analyses storm data, focusing on event type and different kinds of damage, specifically population healt in the form of fatalities and injuries and economic consecuenses in the form of property and crop damage.
We upload the .csv data, located in the same directory as the script and create a subset with the health and economic data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(stringdist)
# Reading the data
storm_data <- read.csv(file = bzfile("repdata_data_StormData.csv.bz2"))
For this part we will create a list with the official event names and correct the dataframe to match it.
#First we create a list with the EVTYPEs from the official list
# Create a vector containing the events
events <- c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood",
"Cold/Wind Chill", "Debris Flow", "Dense Fog", "Dense Smoke",
"Drought", "Dust Devil", "Dust Storm", "Excessive Heat",
"Extreme Cold/Wind Chill", "Flash Flood", "Flood", "Frost/Freeze",
"Funnel Cloud", "Freezing Fog", "Hail", "Heat", "Heavy Rain",
"Heavy Snow", "High Surf", "High Wind", "Hurricane (Typhoon)",
"Ice Storm", "Lake-Effect Snow", "Lakeshore Flood", "Lightning",
"Marine Hail", "Marine High Wind", "Marine Strong Wind",
"Marine Thunderstorm Wind", "Rip Current", "Seiche", "Sleet",
"Storm Surge/Tide", "Strong Wind", "Thunderstorm Wind", "Tornado",
"Tropical Depression", "Tropical Storm", "Tsunami", "Volcanic Ash",
"Waterspout", "Wildfire", "Winter Storm", "Winter Weather")
# Create a data frame with one column named 'EVENT'
event_table <- data.frame(EVENT = events)
# Now
# Use amatch to find the closest match for each event in EVTYPE
# The maximum distance can be adjusted according to your needs. A lower value means a closer match.
closest_matches <- sapply(storm_data$EVTYPE, function(x) {
match_index <- amatch(x, events, maxDist = 30)
if (!is.na(match_index)) {
return(events[match_index])
} else {
return(NA) # Return NA if no close match is found
}
})
# You can then replace the EVTYPE column with these matches or create a new column
storm_data$EVTYPE_closest_match <- closest_matches
We will sumarize the data, considering the exponential number for economic dammage
# Summarizing the data
storm_data$PROPDMGEXP[storm_data$REFNUM == "605943"] <- "M"
library(dplyr)
storm_data <- storm_data %>%
mutate(PROPDMGEXP = case_when(
PROPDMGEXP %in% c("", "+", "-", "?") ~ "1",
PROPDMGEXP %in% as.character(1:9) ~ "10",
toupper(PROPDMGEXP) == "H" ~ "100",
toupper(PROPDMGEXP) == "K" ~ "1000",
toupper(PROPDMGEXP) == "M" ~ "1000000",
toupper(PROPDMGEXP) == "B" ~ "1000000000",
TRUE ~ PROPDMGEXP
)) %>%
mutate(CROPDMGEXP = case_when(
CROPDMGEXP %in% c("", "+", "-", "?") ~ "1",
CROPDMGEXP %in% as.character(1:9) ~ "10",
toupper(CROPDMGEXP) == "H" ~ "100",
toupper(CROPDMGEXP) == "K" ~ "1000",
toupper(CROPDMGEXP) == "M" ~ "1000000",
toupper(CROPDMGEXP) == "B" ~ "1000000000",
TRUE ~ CROPDMGEXP
)) %>%
mutate_at(vars(PROPDMGEXP, CROPDMGEXP), as.numeric) %>%
mutate(
PROPERTY = PROPDMG * PROPDMGEXP / 1000000000,
CROP = CROPDMG * CROPDMGEXP / 1000000000
)
summarized_data <- storm_data %>%
group_by(EVTYPE_closest_match) %>%
summarise(
Sum_Fatalities = sum(FATALITIES),
Sum_Injuries = sum(INJURIES),
Sum_PropDmg = sum(PROPERTY),
Sum_CropDmg = sum(CROP)
)
For the analysis itself, we are going to identify the top 10 event types on each of the damage categories, spliting the study by health and economy from now on.
We identify the top 10 causes on each category now
# Top 10 EVTYPEs by Fatalities
top_fatalities <- summarized_data %>%
arrange(desc(Sum_Fatalities)) %>%
slice(1:10)
# Top 10 EVTYPEs by Injuries
top_injuries <- summarized_data %>%
arrange(desc(Sum_Injuries)) %>%
slice(1:10)
# Print Top 10 EVTYPEs by Fatalities
print("Top 10 EVTYPEs by Fatalities:")
## [1] "Top 10 EVTYPEs by Fatalities:"
print(top_fatalities[, c("EVTYPE_closest_match", "Sum_Fatalities")])
## # A tibble: 10 × 2
## EVTYPE_closest_match Sum_Fatalities
## <chr> <dbl>
## 1 Tornado 5633
## 2 Excessive Heat 1924
## 3 Hail 1899
## 4 High Wind 1181
## 5 Flash Flood 1019
## 6 Rip Current 575
## 7 Flood 551
## 8 Ice Storm 420
## 9 Avalanche 294
## 10 Debris Flow 192
# Print Top 10 EVTYPEs by Injuries
print("Top 10 EVTYPEs by Injuries:")
## [1] "Top 10 EVTYPEs by Injuries:"
print(top_injuries[,c("EVTYPE_closest_match", "Sum_Injuries")])
## # A tibble: 10 × 2
## EVTYPE_closest_match Sum_Injuries
## <chr> <dbl>
## 1 Tornado 91346
## 2 High Wind 11168
## 3 Hail 9043
## 4 Flood 7751
## 5 Excessive Heat 6527
## 6 Ice Storm 3946
## 7 Flash Flood 1786
## 8 Avalanche 1475
## 9 Heavy Snow 1161
## 10 Wildfire 913
With the most harmful event types identified, we plot them to visualize them
# Plot and save the plot for Top 10 EVTYPEs by Fatalities
fatalities_plot <- ggplot(top_fatalities, aes(x = reorder(EVTYPE_closest_match, Sum_Fatalities), y = Sum_Fatalities)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Top 10 event types by Fatalities", x = "EVTYPE_closest_match", y = "Sum of Fatalities")
print(fatalities_plot)
# Plot and save the plot for Top 10 EVTYPEs by Injuries
injuries_plot <- ggplot(top_injuries, aes(x = reorder(EVTYPE_closest_match, Sum_Injuries), y = Sum_Injuries)) +
geom_bar(stat = "identity", fill = "darkred") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Top 10 event types by Injuries", x = "EVTYPE_closest_match", y = "Sum of Injuries")
print(injuries_plot)
With this information, we can assess that Tornado is the most harmful event in both categories by a wide margin.
To explore further, we are going to compare the sum of this event type with the sum of the remaining 9.
# Value for the top EVTYPE
top_value_fatalities <- top_fatalities$Sum_Fatalities[1]
# Sum of the rest
sum_rest_fatalities <- sum(top_fatalities$Sum_Fatalities[-1])
# Comparison
times_bigger_fatalities <- top_value_fatalities / sum_rest_fatalities
# Print the result
print(paste(top_fatalities$EVTYPE_closest_match[1] ," by fatalities is", times_bigger_fatalities, "times bigger than the sum of the rest in the top 10."))
## [1] "Tornado by fatalities is 0.699317194289261 times bigger than the sum of the rest in the top 10."
# Value for the top EVTYPE
top_value_injuries <- top_injuries$Sum_Injuries[1]
# Sum of the rest
sum_rest_injuries <- sum(top_injuries$Sum_Injuries[-1])
# Comparison
times_bigger_injuries <- top_value_injuries / sum_rest_injuries
# Print the result
print(paste(top_injuries$EVTYPE_closest_match[1]," by injuries is", times_bigger_injuries, "times bigger than the sum of the rest in the top 10."))
## [1] "Tornado by injuries is 2.08695453506968 times bigger than the sum of the rest in the top 10."
We identify the top 10 causes on each category now
# Top 10 EVTYPEs by Property Damage
top_prop <- summarized_data %>%
arrange(desc(Sum_PropDmg)) %>%
slice(1:10)
# Top 10 EVTYPEs by Crop Damage
top_crop <- summarized_data %>%
arrange(desc(Sum_CropDmg)) %>%
slice(1:10)
# Print Top 10 EVTYPEs by Property Damage
print("Top 10 EVTYPEs by Property Damage:")
## [1] "Top 10 EVTYPEs by Property Damage:"
print(top_prop[, c("EVTYPE_closest_match", "Sum_PropDmg")])
## # A tibble: 10 × 2
## EVTYPE_closest_match Sum_PropDmg
## <chr> <dbl>
## 1 Avalanche 69.3
## 2 Tornado 56.9
## 3 Dense Smoke 44.9
## 4 Flood 30.0
## 5 Hail 29.4
## 6 Ice Storm 19.3
## 7 Flash Flood 16.8
## 8 High Wind 15.6
## 9 Heavy Rain 6.75
## 10 Storm Surge/Tide 5.90
# Print Top 10 EVTYPEs by Crop Damage
print("Top 10 EVTYPEs by Crop Damage:")
## [1] "Top 10 EVTYPEs by Crop Damage:"
print(top_crop[,c("EVTYPE_closest_match", "Sum_CropDmg")])
## # A tibble: 10 × 2
## EVTYPE_closest_match Sum_CropDmg
## <chr> <dbl>
## 1 Drought 14.0
## 2 Flood 6.19
## 3 Hail 6.18
## 4 Ice Storm 5.76
## 5 Dense Fog 5.06
## 6 Avalanche 2.61
## 7 High Wind 1.84
## 8 Debris Flow 1.80
## 9 Flash Flood 1.47
## 10 Frost/Freeze 1.24
With the most harmful event types identified, we plot them to visualize them
# Plot and save the plot for Top 10 EVTYPEs by Property Damage
prop_plot <- ggplot(top_prop, aes(x = reorder(EVTYPE_closest_match, Sum_PropDmg), y = Sum_PropDmg)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Top 10 event types by Property Damage", x = "EVTYPE_closest_match", y = "Sum of Property Damage")
print(prop_plot)
# Plot and save the plot for Top 10 EVTYPEs by Crop Damage
crop_plot <- ggplot(top_crop, aes(x = reorder(EVTYPE_closest_match, Sum_CropDmg), y = Sum_CropDmg)) +
geom_bar(stat = "identity", fill = "darkred") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Top 10 event types by Crop Damage", x = "EVTYPE_closest_match", y = "Sum of Crop Damage")
print(crop_plot)
With this information, we can assess that Avalanche is the most harmful event on property and Drought is the most harmful on crops by wide margin.
To explore further, we are going to compare the sum of this event types with the sum of the remaining 9.
# Value for the top EVTYPE
top_value_prop <- top_prop$Sum_PropDmg[1]
# Sum of the rest
sum_rest_prop <- sum(top_prop$Sum_PropDmg[-1])
# Comparison
times_bigger_prop <- top_value_prop / sum_rest_prop
# Print the result
print(paste(top_prop$EVTYPE_closest_match[1] ," by Property Damage is", times_bigger_prop, "times bigger than the sum of the rest in the top 10."))
## [1] "Avalanche by Property Damage is 0.307148521696779 times bigger than the sum of the rest in the top 10."
# Value for the top EVTYPE
top_value_crop <- top_crop$Sum_CropDmg[1]
# Sum of the rest
sum_rest_crop <- sum(top_crop$Sum_CropDmg[-1])
# Comparison
times_bigger_crop <- top_value_crop / sum_rest_crop
# Print the result
print(paste(top_crop$EVTYPE_closest_match[1]," by crop damage is", times_bigger_crop, "times bigger than the sum of the rest in the top 10."))
## [1] "Drought by crop damage is 0.43458659745176 times bigger than the sum of the rest in the top 10."