# Load packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(knitr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(RColorBrewer)
# Theme
theme_set(theme_minimal(base_size = 12) +
            theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                  plot.subtitle = element_text(hjust = 0.5),
                  legend.position = "bottom"))
# Set working directory (machine-specific path; point this at the folder containing the CSV files)
setwd("/Users/payplu/Downloads")
# Read the CSV files
details <- read_csv("StormEvents_details-ftp_v1.0_d2025_c20251204.csv",
                    show_col_types = FALSE)
fatalities <- read_csv("StormEvents_fatalities-ftp_v1.0_d2025_c20251204.csv",
                       show_col_types = FALSE)
locations <- read_csv("StormEvents_locations-ftp_v1.0_d2025_c20251204.csv",
                      show_col_types = FALSE)
# Check the data
cat("Details dataset dimensions:", dim(details), "\n")
## Details dataset dimensions: 55633 51
cat("Fatalities dataset dimensions:", dim(fatalities), "\n")
## Fatalities dataset dimensions: 710 11
cat("Locations dataset dimensions:", dim(locations), "\n")
## Locations dataset dimensions: 42106 11
# Check what years are in the data
cat("\n=== CHECKING YEAR INFORMATION ===\n")
##
## === CHECKING YEAR INFORMATION ===
cat("Unique years in details:", unique(details$YEAR), "\n")
## Unique years in details: 2025
# Check the range of dates
cat("\nChecking date range from BEGIN_DATE_TIME...\n")
##
## Checking date range from BEGIN_DATE_TIME...
# Extract year from BEGIN_DATE_TIME
details <- details %>%
  mutate(
    BEGIN_DATE = as.Date(BEGIN_DATE_TIME, format = "%d-%b-%y"),
    YEAR_FROM_DATE = year(BEGIN_DATE)
  )
cat("Years from BEGIN_DATE:", unique(details$YEAR_FROM_DATE), "\n")
## Years from BEGIN_DATE: 2025
cat("Year column values:", unique(details$YEAR), "\n")
## Year column values: 2025
# Check if there's a mismatch between YEAR column and actual dates
if (any(details$YEAR != details$YEAR_FROM_DATE, na.rm = TRUE)) {
  cat("\nWARNING: YEAR column doesn't match dates in BEGIN_DATE_TIME!\n")
  cat("Using dates from BEGIN_DATE_TIME for analysis.\n")
}
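For reference, the format string "%d-%b-%y" used above reads a two-digit day, an abbreviated month name, and a two-digit year, and as.Date() simply ignores the trailing time-of-day text. A quick standalone check (the sample string is illustrative and assumes an English locale for month abbreviations):
# Illustrative only: parse one BEGIN_DATE_TIME-style string
as.Date("01-JAN-25 00:00:00", format = "%d-%b-%y")
# Expected value: "2025-01-01"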
# 1. DATA CLEANING
# Aggregate fatalities by EVENT_ID
fatalities_summary <- fatalities %>%
  group_by(EVENT_ID) %>%
  summarize(
    FATALITIES = n(), # Count rows as number of fatalities
    FATALITY_TYPES = paste(unique(FATALITY_TYPE), collapse = ", ")
  )
# Clean and prepare the data
storm_clean <- details %>%
  # Convert date-time columns to proper format
  mutate(
    # Parse BEGIN_DATE_TIME (format: "01-JAN-25 00:00:00")
    BEGIN_DATE = as.Date(BEGIN_DATE_TIME, format = "%d-%b-%y"),
    # Parse END_DATE_TIME
    END_DATE = as.Date(END_DATE_TIME, format = "%d-%b-%y"),
    # Extract month and year from the date
    MONTH = month(BEGIN_DATE, label = TRUE, abbr = TRUE),
    YEAR = year(BEGIN_DATE), # Use year from actual date, not YEAR column
    # Handle missing values in EVENT_TYPE
    EVENT_TYPE = ifelse(is.na(EVENT_TYPE) | EVENT_TYPE == "", "UNKNOWN", EVENT_TYPE),
    # Clean STATE names
    STATE = toupper(STATE), # Convert to uppercase for consistency
    # Combine direct and indirect injuries (treat missing values as 0)
    INJURIES_TOTAL = coalesce(INJURIES_DIRECT, 0) + coalesce(INJURIES_INDIRECT, 0),
    # Combine direct and indirect deaths (treat missing values as 0)
    DEATHS_TOTAL = coalesce(DEATHS_DIRECT, 0) + coalesce(DEATHS_INDIRECT, 0)
  ) %>%
  # Merge with fatalities summary
  left_join(fatalities_summary, by = "EVENT_ID") %>%
  # Handle missing values after merge
  mutate(
    # Use fatalities from summary or default to 0
    FATALITIES = ifelse(is.na(FATALITIES), 0, FATALITIES),
    # The fatality-file rows and the DEATHS_* columns describe the same deaths,
    # so take the larger of the two counts rather than summing (summing would double-count)
    TOTAL_FATALITIES = pmax(FATALITIES, DEATHS_TOTAL),
    TOTAL_INJURIES = ifelse(is.na(INJURIES_TOTAL), 0, INJURIES_TOTAL)
  ) %>%
  # IMPORTANT: keep only the most recent year available in the dataset
  filter(YEAR == max(YEAR, na.rm = TRUE)) %>%
  # Select relevant columns
  select(EVENT_ID, EVENT_TYPE, STATE, MONTH, YEAR,
         FATALITIES = TOTAL_FATALITIES,
         INJURIES = TOTAL_INJURIES,
         BEGIN_DATE, END_DATE)
# Check cleaned data
cat("\n=== CLEANED DATA SUMMARY ===\n")
##
## === CLEANED DATA SUMMARY ===
cat("Dimensions:", dim(storm_clean), "\n")
## Dimensions: 55633 9
cat("Year analyzed:", unique(storm_clean$YEAR), "\n")
## Year analyzed: 2025
cat("Date range:",
    as.character(min(storm_clean$BEGIN_DATE, na.rm = TRUE)), "to",
    as.character(max(storm_clean$BEGIN_DATE, na.rm = TRUE)), "\n")
## Date range: 2025-01-01 to 2025-07-31
cat("Number of unique event types:", length(unique(storm_clean$EVENT_TYPE)), "\n")
## Number of unique event types: 46
cat("Total fatalities:", sum(storm_clean$FATALITIES), "\n")
## Total fatalities: 709
cat("Total injuries:", sum(storm_clean$INJURIES), "\n")
## Total injuries: 0
# Show a sample of the cleaned data
cat("\n=== SAMPLE OF CLEANED DATA ===\n")
##
## === SAMPLE OF CLEANED DATA ===
print(head(storm_clean, 5))
## # A tibble: 5 × 9
## EVENT_ID EVENT_TYPE STATE MONTH YEAR FATALITIES INJURIES BEGIN_DATE
## <dbl> <chr> <chr> <ord> <dbl> <dbl> <dbl> <date>
## 1 1252415 Thunderstorm Wind GEORGIA Mar 2025 0 0 2025-03-31
## 2 1241136 Tornado MICHIGAN Mar 2025 0 0 2025-03-30
## 3 1222851 Winter Storm VIRGINIA Jan 2025 0 0 2025-01-05
## 4 1223112 Winter Weather MARYLAND Jan 2025 0 0 2025-01-03
## 5 1223113 Winter Weather MARYLAND Jan 2025 0 0 2025-01-03
## # ℹ 1 more variable: END_DATE <date>
# Check if we have data
if (nrow(storm_clean) == 0) {
  cat("\nWARNING: No data after filtering! Using all available data instead.\n")
  # Use all data without year filtering
  storm_clean <- details %>%
    mutate(
      BEGIN_DATE = as.Date(BEGIN_DATE_TIME, format = "%d-%b-%y"),
      END_DATE = as.Date(END_DATE_TIME, format = "%d-%b-%y"),
      MONTH = month(BEGIN_DATE, label = TRUE, abbr = TRUE),
      YEAR = year(BEGIN_DATE),
      EVENT_TYPE = ifelse(is.na(EVENT_TYPE) | EVENT_TYPE == "", "UNKNOWN", EVENT_TYPE),
      STATE = toupper(STATE)
    ) %>%
    left_join(fatalities_summary, by = "EVENT_ID") %>%
    mutate(
      FATALITIES = ifelse(is.na(FATALITIES), 0, FATALITIES),
      TOTAL_INJURIES = ifelse(is.na(INJURIES_DIRECT), 0, INJURIES_DIRECT) +
        ifelse(is.na(INJURIES_INDIRECT), 0, INJURIES_INDIRECT)
    ) %>%
    select(EVENT_ID, EVENT_TYPE, STATE, MONTH, YEAR,
           FATALITIES,
           INJURIES = TOTAL_INJURIES,
           BEGIN_DATE, END_DATE)
  cat("Now using data from years:", unique(storm_clean$YEAR), "\n")
}
This R Markdown report examines severe weather and other hazardous events recorded across the United States in 2025. It shows which event types are reported most often, which cause the most harm to population health, in which months each event type peaks, and how dangerous a typical event of each type is. The results below show that Flash Floods account for the most fatalities overall (188), that Thunderstorm Wind is reported far more often than any other high-impact event type (18,658 events), and that Rip Currents are the deadliest on a per-event basis, with the highest fatality rate among event types that occurred at least ten times. Patterns like these help explain why so many states invest in storm drainage and flood preparedness. The summary tables that follow, and the bar charts built from them (a minimal sketch of one such chart appears after the health-impact table below), show which events occur most often and which do the most damage.
health_impact <- storm_clean %>%
  group_by(EVENT_TYPE) %>%
  summarize(
    Total_Fatalities = sum(FATALITIES, na.rm = TRUE),
    Total_Injuries = sum(INJURIES, na.rm = TRUE),
    Total_Harm = Total_Fatalities + Total_Injuries,
    Event_Count = n(),
    Avg_Harm_per_Event = round(Total_Harm / Event_Count, 3)
  ) %>%
  arrange(desc(Total_Harm)) %>%
  head(10) # Top 10 most harmful events
cat("\n=== QUESTION 1 RESULTS ===\n")
##
## === QUESTION 1 RESULTS ===
cat("Top 10 most harmful event types to population health:\n")
## Top 10 most harmful event types to population health:
print(health_impact)
## # A tibble: 10 × 6
## EVENT_TYPE Total_Fatalities Total_Injuries Total_Harm Event_Count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Flash Flood 188 0 188 4258
## 2 Heat 117 0 117 1864
## 3 Wildfire 63 0 63 214
## 4 Excessive Heat 61 0 61 1153
## 5 Tornado 60 0 60 1404
## 6 Thunderstorm Wind 37 0 37 18658
## 7 Rip Current 21 0 21 44
## 8 Winter Storm 21 0 21 2111
## 9 Cold/Wind Chill 20 0 20 972
## 10 Lightning 19 0 19 216
## # ℹ 1 more variable: Avg_Harm_per_Event <dbl>
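To visualize this table as one of the bar charts mentioned in the introduction, a minimal ggplot2 sketch (illustrative, not part of the original output) could look like this:
# Illustrative sketch: total harm (fatalities + injuries) for the top 10 event types
ggplot(health_impact, aes(x = reorder(EVENT_TYPE, Total_Harm), y = Total_Harm)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Most Harmful Event Types",
       x = "Event type", y = "Total fatalities + injuries")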
state_events <- storm_clean %>%
  filter(!is.na(STATE), STATE != "") %>%
  group_by(STATE, EVENT_TYPE) %>%
  summarize(Event_Count = n(), .groups = 'drop') %>%
  group_by(STATE) %>%
  arrange(desc(Event_Count)) %>%
  slice(1) %>% # Get the top event for each state
  rename(Most_Common_Event = EVENT_TYPE) %>%
  ungroup() %>%
  arrange(desc(Event_Count))
top_state_events <- head(state_events, 15)
cat("\n=== QUESTION 2 RESULTS ===\n")
##
## === QUESTION 2 RESULTS ===
cat("Most common event type in each state (top 15 by frequency):\n")
## Most common event type in each state (top 15 by frequency):
print(top_state_events)
## # A tibble: 15 × 3
## STATE Most_Common_Event Event_Count
## <chr> <chr> <int>
## 1 ALABAMA Thunderstorm Wind 1376
## 2 TEXAS Hail 1328
## 3 VIRGINIA Thunderstorm Wind 1060
## 4 GEORGIA Thunderstorm Wind 987
## 5 PENNSYLVANIA Thunderstorm Wind 958
## 6 ILLINOIS Thunderstorm Wind 806
## 7 MISSOURI Thunderstorm Wind 742
## 8 OKLAHOMA Hail 731
## 9 OHIO Thunderstorm Wind 685
## 10 NORTH CAROLINA Thunderstorm Wind 679
## 11 INDIANA Thunderstorm Wind 657
## 12 MISSISSIPPI Thunderstorm Wind 650
## 13 WEST VIRGINIA Thunderstorm Wind 644
## 14 KANSAS Thunderstorm Wind 631
## 15 KENTUCKY Thunderstorm Wind 615
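Because knitr is already loaded, the same summary can be rendered as a formatted table instead of raw tibble output; a minimal example (illustrative):
# Illustrative: present the state summary with kable()
kable(top_state_events,
      col.names = c("State", "Most Common Event", "Event Count"),
      caption = "Most common event type per state (top 15 by frequency)")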
monthly_patterns <- storm_clean %>%
  filter(!is.na(MONTH)) %>%
  group_by(EVENT_TYPE, MONTH) %>%
  summarize(Event_Count = n(), .groups = 'drop') %>%
  group_by(EVENT_TYPE) %>%
  mutate(
    Total_Events = sum(Event_Count),
    Percent_of_Total = round(Event_Count / Total_Events * 100, 1)
  ) %>%
  arrange(EVENT_TYPE, MONTH)
# Get top 8 events by total frequency for visualization
top8_events <- storm_clean %>%
  count(EVENT_TYPE, sort = TRUE) %>%
  head(8) %>%
  pull(EVENT_TYPE)
monthly_top_events <- monthly_patterns %>%
  filter(EVENT_TYPE %in% top8_events)
# Find peak month for each event type
dominant_months <- monthly_patterns %>%
  group_by(EVENT_TYPE) %>%
  arrange(desc(Event_Count)) %>%
  slice(1) %>%
  select(EVENT_TYPE, Peak_Month = MONTH,
         Events_in_Peak_Month = Event_Count,
         Percent_in_Peak_Month = Percent_of_Total)
cat("\n=== QUESTION 3 RESULTS ===\n")
##
## === QUESTION 3 RESULTS ===
cat("Peak month for top event types:\n")
## Peak month for top event types:
print(head(dominant_months, 10))
## # A tibble: 10 × 4
## # Groups: EVENT_TYPE [10]
## EVENT_TYPE Peak_Month Events_in_Peak_Month Percent_in_Peak_Month
## <chr> <ord> <int> <dbl>
## 1 Astronomical Low Tide Mar 8 53.3
## 2 Avalanche Feb 15 55.6
## 3 Blizzard Mar 266 70
## 4 Coastal Flood Jan 10 45.5
## 5 Cold/Wind Chill Jan 643 66.2
## 6 Debris Flow Apr 34 37
## 7 Dense Fog May 100 41.8
## 8 Drought Apr 294 18.7
## 9 Dust Storm Mar 123 49
## 10 Excessive Heat Jul 731 63.4
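The monthly_top_events table built above lends itself to a seasonal line chart; a minimal sketch (illustrative only):
# Illustrative sketch: monthly counts for the 8 most frequent event types
ggplot(monthly_top_events,
       aes(x = MONTH, y = Event_Count, color = EVENT_TYPE, group = EVENT_TYPE)) +
  geom_line() +
  geom_point() +
  labs(title = "Monthly Pattern of the 8 Most Frequent Event Types",
       x = "Month", y = "Number of events", color = "Event type")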
custom_analysis <- storm_clean %>%
  group_by(EVENT_TYPE) %>%
  summarize(
    Total_Events = n(),
    Total_Fatalities = sum(FATALITIES),
    Total_Injuries = sum(INJURIES),
    Fatality_Rate = round(Total_Fatalities / Total_Events * 100, 2), # Fatalities per 100 events
    Injury_Rate = round(Total_Injuries / Total_Events * 100, 2), # Injuries per 100 events
    Events_with_Fatalities = sum(FATALITIES > 0),
    Events_with_Injuries = sum(INJURIES > 0)
  ) %>%
  filter(Total_Events >= 10) %>% # Only events with at least 10 occurrences
  arrange(desc(Fatality_Rate)) %>%
  head(10)
cat("\n=== QUESTION 4 RESULTS (CUSTOM) ===\n")
##
## === QUESTION 4 RESULTS (CUSTOM) ===
cat("Top 10 events by fatality rate (fatalities per 100 events):\n")
## Top 10 events by fatality rate (fatalities per 100 events):
print(custom_analysis)
## # A tibble: 10 × 8
## EVENT_TYPE Total_Events Total_Fatalities Total_Injuries Fatality_Rate
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Rip Current 44 21 0 47.7
## 2 Avalanche 27 8 0 29.6
## 3 Wildfire 214 63 0 29.4
## 4 High Surf 38 6 0 15.8
## 5 Lightning 216 19 0 8.8
## 6 Dust Storm 251 17 0 6.77
## 7 Heat 1864 117 0 6.28
## 8 Excessive Heat 1153 61 0 5.29
## 9 Flash Flood 4258 188 0 4.42
## 10 Tornado 1404 60 0 4.27
## # ℹ 3 more variables: Injury_Rate <dbl>, Events_with_Fatalities <int>,
## # Events_with_Injuries <int>
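As with the total-harm table, the fatality-rate comparison could be drawn as a bar chart; a minimal sketch (illustrative only):
# Illustrative sketch: fatality rate (fatalities per 100 events) for the top 10 event types
ggplot(custom_analysis, aes(x = reorder(EVENT_TYPE, Fatality_Rate), y = Fatality_Rate)) +
  geom_col(fill = "firebrick") +
  coord_flip() +
  labs(title = "Deadliest Event Types per 100 Events",
       x = "Event type", y = "Fatalities per 100 events")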