# Load packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readr)
library(knitr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(RColorBrewer)
# Theme
theme_set(theme_minimal(base_size = 12) +
            theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                  plot.subtitle = element_text(hjust = 0.5),
                  legend.position = "bottom"))

# Set working directory
setwd("/Users/payplu/Downloads")

# Read the CSV files
details <- read_csv("StormEvents_details-ftp_v1.0_d2025_c20251204.csv", 
                    show_col_types = FALSE)
fatalities <- read_csv("StormEvents_fatalities-ftp_v1.0_d2025_c20251204.csv", 
                       show_col_types = FALSE)
locations <- read_csv("StormEvents_locations-ftp_v1.0_d2025_c20251204.csv", 
                      show_col_types = FALSE)

# Check the data
cat("Details dataset dimensions:", dim(details), "\n")
## Details dataset dimensions: 55633 51
cat("Fatalities dataset dimensions:", dim(fatalities), "\n")
## Fatalities dataset dimensions: 710 11
cat("Locations dataset dimensions:", dim(locations), "\n")
## Locations dataset dimensions: 42106 11
# Check what years are in the data
cat("\n=== CHECKING YEAR INFORMATION ===\n")
## 
## === CHECKING YEAR INFORMATION ===
cat("Unique years in details:", unique(details$YEAR), "\n")
## Unique years in details: 2025
# Check the range of dates
cat("\nChecking date range from BEGIN_DATE_TIME...\n")
## 
## Checking date range from BEGIN_DATE_TIME...
# Extract year from BEGIN_DATE_TIME
details <- details %>%
  mutate(
    BEGIN_DATE = as.Date(BEGIN_DATE_TIME, format = "%d-%b-%y"),
    YEAR_FROM_DATE = year(BEGIN_DATE)
  )

cat("Years from BEGIN_DATE:", unique(details$YEAR_FROM_DATE), "\n")
## Years from BEGIN_DATE: 2025
cat("Year column values:", unique(details$YEAR), "\n")
## Year column values: 2025
# Check if there's a mismatch between YEAR column and actual dates
if (any(details$YEAR != details$YEAR_FROM_DATE, na.rm = TRUE)) {
  cat("\nWARNING: YEAR column doesn't match dates in BEGIN_DATE_TIME!\n")
  cat("Using dates from BEGIN_DATE_TIME for analysis.\n")
}


# 1. DATA CLEANING 
# Aggregate fatalities by EVENT_ID
fatalities_summary <- fatalities %>%
  group_by(EVENT_ID) %>%
  summarize(
    FATALITIES = n(),  # Count rows as number of fatalities
    FATALITY_TYPES = paste(unique(FATALITY_TYPE), collapse = ", ")
  )

# clean and prepare the data
storm_clean <- details %>%
  # Convert date-time columns to proper format
  mutate(
    # Parse BEGIN_DATE_TIME (format: "01-JAN-24 00:00:00")
    BEGIN_DATE = as.Date(BEGIN_DATE_TIME, format = "%d-%b-%y"),
    # Parse END_DATE_TIME
    END_DATE = as.Date(END_DATE_TIME, format = "%d-%b-%y"),
    # Extract month and year from the date
    MONTH = month(BEGIN_DATE, label = TRUE, abbr = TRUE),
    YEAR = year(BEGIN_DATE),  # Use year from actual date, not YEAR column
    # Handle missing values in EVENT_TYPE
    EVENT_TYPE = ifelse(is.na(EVENT_TYPE) | EVENT_TYPE == "", "UNKNOWN", EVENT_TYPE),
    # Clean STATE names
    STATE = toupper(STATE),  # Convert to uppercase for consistency
    # Combine injury columns (if they exist)
    INJURIES_TOTAL = ifelse(
      exists("INJURIES_DIRECT") & exists("INJURIES_INDIRECT"),
      INJURIES_DIRECT + INJURIES_INDIRECT,
      ifelse(exists("INJURIES_DIRECT"), INJURIES_DIRECT, 0)
    ),
    # Combine death columns (if they exist)
    DEATHS_TOTAL = ifelse(
      exists("DEATHS_DIRECT") & exists("DEATHS_INDIRECT"),
      DEATHS_DIRECT + DEATHS_INDIRECT,
      ifelse(exists("DEATHS_DIRECT"), DEATHS_DIRECT, 0)
    )
  ) %>%
  # Merge with fatalities summary
  left_join(fatalities_summary, by = "EVENT_ID") %>%
  # Handle missing values after merge
  mutate(
    # Use fatalities from summary or default to 0
    FATALITIES = ifelse(is.na(FATALITIES), 0, FATALITIES),
    # Combine fatalities from both sources if needed
    TOTAL_FATALITIES = FATALITIES + ifelse(is.na(DEATHS_TOTAL), 0, DEATHS_TOTAL),
    TOTAL_INJURIES = ifelse(is.na(INJURIES_TOTAL), 0, INJURIES_TOTAL)
  ) %>%
  # IMPORTANT: Use the most recent year available
  # Filter for the most recent year in the dataset
  filter(YEAR == max(YEAR, na.rm = TRUE)) %>%
  # Select relevant columns
  select(EVENT_ID, EVENT_TYPE, STATE, MONTH, YEAR, 
         FATALITIES = TOTAL_FATALITIES, 
         INJURIES = TOTAL_INJURIES, 
         BEGIN_DATE, END_DATE)

# Check cleaned data
cat("\n=== CLEANED DATA SUMMARY ===\n")
## 
## === CLEANED DATA SUMMARY ===
cat("Dimensions:", dim(storm_clean), "\n")
## Dimensions: 55633 9
cat("Year analyzed:", unique(storm_clean$YEAR), "\n")
## Year analyzed: 2025
cat("Date range:", 
    as.character(min(storm_clean$BEGIN_DATE, na.rm = TRUE)), "to",
    as.character(max(storm_clean$BEGIN_DATE, na.rm = TRUE)), "\n")
## Date range: 2025-01-01 to 2025-07-31
cat("Number of unique event types:", length(unique(storm_clean$EVENT_TYPE)), "\n")
## Number of unique event types: 46
cat("Total fatalities:", sum(storm_clean$FATALITIES), "\n")
## Total fatalities: 709
cat("Total injuries:", sum(storm_clean$INJURIES), "\n")
## Total injuries: 0
# Show a sample of the cleaned data
cat("\n=== SAMPLE OF CLEANED DATA ===\n")
## 
## === SAMPLE OF CLEANED DATA ===
print(head(storm_clean, 5))
## # A tibble: 5 × 9
##   EVENT_ID EVENT_TYPE        STATE    MONTH  YEAR FATALITIES INJURIES BEGIN_DATE
##      <dbl> <chr>             <chr>    <ord> <dbl>      <dbl>    <dbl> <date>    
## 1  1252415 Thunderstorm Wind GEORGIA  Mar    2025          0        0 2025-03-31
## 2  1241136 Tornado           MICHIGAN Mar    2025          0        0 2025-03-30
## 3  1222851 Winter Storm      VIRGINIA Jan    2025          0        0 2025-01-05
## 4  1223112 Winter Weather    MARYLAND Jan    2025          0        0 2025-01-03
## 5  1223113 Winter Weather    MARYLAND Jan    2025          0        0 2025-01-03
## # ℹ 1 more variable: END_DATE <date>
# Check if we have data
if (nrow(storm_clean) == 0) {
  cat("\nWARNING: No data after filtering! Using all available data instead.\n")
  
  # Use all data without year filtering
  storm_clean <- details %>%
    mutate(
      BEGIN_DATE = as.Date(BEGIN_DATE_TIME, format = "%d-%b-%y"),
      END_DATE = as.Date(END_DATE_TIME, format = "%d-%b-%y"),
      MONTH = month(BEGIN_DATE, label = TRUE, abbr = TRUE),
      YEAR = year(BEGIN_DATE),
      EVENT_TYPE = ifelse(is.na(EVENT_TYPE) | EVENT_TYPE == "", "UNKNOWN", EVENT_TYPE),
      STATE = toupper(STATE)
    ) %>%
    left_join(fatalities_summary, by = "EVENT_ID") %>%
    mutate(
      FATALITIES = ifelse(is.na(FATALITIES), 0, FATALITIES),
      TOTAL_INJURIES = ifelse(is.na(INJURIES_DIRECT), 0, INJURIES_DIRECT) + 
        ifelse(is.na(INJURIES_INDIRECT), 0, INJURIES_INDIRECT)
    ) %>%
    select(EVENT_ID, EVENT_TYPE, STATE, MONTH, YEAR, 
           FATALITIES, 
           INJURIES = TOTAL_INJURIES, 
           BEGIN_DATE, END_DATE)
  
  cat("Now using data from years:", unique(storm_clean$YEAR), "\n")
}

Intro

This R markdown file will show you natural disasters that occur across the United States. It will show you what events occur the most, which occurs the least, what months these events occur, and how catastrophic these events are. From our analysis, you will find that Rip Currents are the most terrifying events that could occur which dominates in damaging the health of the population across the US. In our analysis, you will also find Flash Floods to occur the most, showing why so many states have to have a sewage system and a preparation for this event. The bar charts and graphs of these events will perfectly show you a visual of which occurs most often and which does the most damage.

health_impact <- storm_clean %>%
  group_by(EVENT_TYPE) %>%
  summarize(
    Total_Fatalities = sum(FATALITIES, na.rm = TRUE),
    Total_Injuries = sum(INJURIES, na.rm = TRUE),
    Total_Harm = Total_Fatalities + Total_Injuries,
    Event_Count = n(),
    Avg_Harm_per_Event = round(Total_Harm / Event_Count, 3)
  ) %>%
  arrange(desc(Total_Harm)) %>%
  head(10)  # Top 10 most harmful events

cat("\n=== QUESTION 1 RESULTS ===\n")
## 
## === QUESTION 1 RESULTS ===
cat("Top 10 most harmful event types to population health:\n")
## Top 10 most harmful event types to population health:
print(health_impact)
## # A tibble: 10 × 6
##    EVENT_TYPE        Total_Fatalities Total_Injuries Total_Harm Event_Count
##    <chr>                        <dbl>          <dbl>      <dbl>       <int>
##  1 Flash Flood                    188              0        188        4258
##  2 Heat                           117              0        117        1864
##  3 Wildfire                        63              0         63         214
##  4 Excessive Heat                  61              0         61        1153
##  5 Tornado                         60              0         60        1404
##  6 Thunderstorm Wind               37              0         37       18658
##  7 Rip Current                     21              0         21          44
##  8 Winter Storm                    21              0         21        2111
##  9 Cold/Wind Chill                 20              0         20         972
## 10 Lightning                       19              0         19         216
## # ℹ 1 more variable: Avg_Harm_per_Event <dbl>

state_events <- storm_clean %>%
  filter(!is.na(STATE), STATE != "") %>%
  group_by(STATE, EVENT_TYPE) %>%
  summarize(Event_Count = n(), .groups = 'drop') %>%
  group_by(STATE) %>%
  arrange(desc(Event_Count)) %>%
  slice(1) %>%  # Get the top event for each state
  rename(Most_Common_Event = EVENT_TYPE) %>%
  ungroup() %>%
  arrange(desc(Event_Count))
top_state_events <- head(state_events, 15)

cat("\n=== QUESTION 2 RESULTS ===\n")
## 
## === QUESTION 2 RESULTS ===
cat("Most common event type in each state (top 15 by frequency):\n")
## Most common event type in each state (top 15 by frequency):
print(top_state_events)
## # A tibble: 15 × 3
##    STATE          Most_Common_Event Event_Count
##    <chr>          <chr>                   <int>
##  1 ALABAMA        Thunderstorm Wind        1376
##  2 TEXAS          Hail                     1328
##  3 VIRGINIA       Thunderstorm Wind        1060
##  4 GEORGIA        Thunderstorm Wind         987
##  5 PENNSYLVANIA   Thunderstorm Wind         958
##  6 ILLINOIS       Thunderstorm Wind         806
##  7 MISSOURI       Thunderstorm Wind         742
##  8 OKLAHOMA       Hail                      731
##  9 OHIO           Thunderstorm Wind         685
## 10 NORTH CAROLINA Thunderstorm Wind         679
## 11 INDIANA        Thunderstorm Wind         657
## 12 MISSISSIPPI    Thunderstorm Wind         650
## 13 WEST VIRGINIA  Thunderstorm Wind         644
## 14 KANSAS         Thunderstorm Wind         631
## 15 KENTUCKY       Thunderstorm Wind         615

monthly_patterns <- storm_clean %>%
  filter(!is.na(MONTH)) %>%
  group_by(EVENT_TYPE, MONTH) %>%
  summarize(Event_Count = n(), .groups = 'drop') %>%
  group_by(EVENT_TYPE) %>%
  mutate(
    Total_Events = sum(Event_Count),
    Percent_of_Total = round(Event_Count / Total_Events * 100, 1)
  ) %>%
  arrange(EVENT_TYPE, MONTH)

# Get top 8 events by total frequency for visualization
top8_events <- storm_clean %>%
  count(EVENT_TYPE, sort = TRUE) %>%
  head(8) %>%
  pull(EVENT_TYPE)

monthly_top_events <- monthly_patterns %>%
  filter(EVENT_TYPE %in% top8_events)

# Find peak month for each event type
dominant_months <- monthly_patterns %>%
  group_by(EVENT_TYPE) %>%
  arrange(desc(Event_Count)) %>%
  slice(1) %>%
  select(EVENT_TYPE, Peak_Month = MONTH, 
         Events_in_Peak_Month = Event_Count,
         Percent_in_Peak_Month = Percent_of_Total)

cat("\n=== QUESTION 3 RESULTS ===\n")
## 
## === QUESTION 3 RESULTS ===
cat("Peak month for top event types:\n")
## Peak month for top event types:
print(head(dominant_months, 10))
## # A tibble: 10 × 4
## # Groups:   EVENT_TYPE [10]
##    EVENT_TYPE            Peak_Month Events_in_Peak_Month Percent_in_Peak_Month
##    <chr>                 <ord>                     <int>                 <dbl>
##  1 Astronomical Low Tide Mar                           8                  53.3
##  2 Avalanche             Feb                          15                  55.6
##  3 Blizzard              Mar                         266                  70  
##  4 Coastal Flood         Jan                          10                  45.5
##  5 Cold/Wind Chill       Jan                         643                  66.2
##  6 Debris Flow           Apr                          34                  37  
##  7 Dense Fog             May                         100                  41.8
##  8 Drought               Apr                         294                  18.7
##  9 Dust Storm            Mar                         123                  49  
## 10 Excessive Heat        Jul                         731                  63.4

custom_analysis <- storm_clean %>%
  group_by(EVENT_TYPE) %>%
  summarize(
    Total_Events = n(),
    Total_Fatalities = sum(FATALITIES),
    Total_Injuries = sum(INJURIES),
    Fatality_Rate = round(Total_Fatalities / Total_Events * 100, 2),  # Fatalities per 100 events
    Injury_Rate = round(Total_Injuries / Total_Events * 100, 2),      # Injuries per 100 events
    Events_with_Fatalities = sum(FATALITIES > 0),
    Events_with_Injuries = sum(INJURIES > 0)
  ) %>%
  filter(Total_Events >= 10) %>%  # Only events with at least 10 occurrences
  arrange(desc(Fatality_Rate)) %>%
  head(10)

cat("\n=== QUESTION 4 RESULTS (CUSTOM) ===\n")
## 
## === QUESTION 4 RESULTS (CUSTOM) ===
cat("Top 10 events by fatality rate (fatalities per 100 events):\n")
## Top 10 events by fatality rate (fatalities per 100 events):
print(custom_analysis)
## # A tibble: 10 × 8
##    EVENT_TYPE     Total_Events Total_Fatalities Total_Injuries Fatality_Rate
##    <chr>                 <int>            <dbl>          <dbl>         <dbl>
##  1 Rip Current              44               21              0         47.7 
##  2 Avalanche                27                8              0         29.6 
##  3 Wildfire                214               63              0         29.4 
##  4 High Surf                38                6              0         15.8 
##  5 Lightning               216               19              0          8.8 
##  6 Dust Storm              251               17              0          6.77
##  7 Heat                   1864              117              0          6.28
##  8 Excessive Heat         1153               61              0          5.29
##  9 Flash Flood            4258              188              0          4.42
## 10 Tornado                1404               60              0          4.27
## # ℹ 3 more variables: Injury_Rate <dbl>, Events_with_Fatalities <int>,
## #   Events_with_Injuries <int>