##Data Preparation

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Set folder path
folder_path <- "C:/Users/libguest/Desktop"

#Defining file paths
details_file <- file.path(folder_path, "StormEvents_details-ftp_v1.0_d1950_c20250401.csv")
fatalities_file <- file.path(folder_path, "StormEvents_fatalities-ftp_v1.0_d1950_c20250401.csv")
locations_file <- file.path(folder_path, "StormEvents_fatalities-ftp_v1.0_d1950_c20250401.csv")

#Load CSVs
details <- read_csv(details_file)
## Rows: 223 Columns: 51
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): STATE, MONTH_NAME, EVENT_TYPE, CZ_TYPE, CZ_NAME, BEGIN_DATE_TIME, ...
## dbl (24): BEGIN_YEARMONTH, BEGIN_DAY, BEGIN_TIME, END_YEARMONTH, END_DAY, EN...
## lgl (16): EPISODE_ID, WFO, SOURCE, MAGNITUDE_TYPE, FLOOD_CAUSE, CATEGORY, TO...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fatalities <- read_csv(fatalities_file)
## Rows: 23 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): FATALITY_TYPE, FATALITY_DATE
## dbl (6): FAT_YEARMONTH, FAT_DAY, FAT_TIME, FATALITY_ID, EVENT_ID, EVENT_YEAR...
## lgl (3): FATALITY_AGE, FATALITY_SEX, FATALITY_LOCATION
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
locations <- read_csv(locations_file)
## Rows: 23 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): FATALITY_TYPE, FATALITY_DATE
## dbl (6): FAT_YEARMONTH, FAT_DAY, FAT_TIME, FATALITY_ID, EVENT_ID, EVENT_YEAR...
## lgl (3): FATALITY_AGE, FATALITY_SEX, FATALITY_LOCATION
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Joining datasets by EVENT_ID
joined_data <- details %>%
  left_join(locations, by = "EVENT_ID") %>%
  left_join(fatalities, by = "EVENT_ID")

#Save joined data
output_file <- file.path(folder_path, "StormEvents_joined_data.csv")
write_csv(joined_data, output_file)

#Preview
message("✅ Joined data saved to: ", output_file)
## ✅ Joined data saved to: C:/Users/libguest/Desktop/StormEvents_joined_data.csv
head(joined_data)
## # A tibble: 6 × 71
##   BEGIN_YEARMONTH BEGIN_DAY BEGIN_TIME END_YEARMONTH END_DAY END_TIME EPISODE_ID
##             <dbl>     <dbl>      <dbl>         <dbl>   <dbl>    <dbl> <lgl>     
## 1          195004        28       1445        195004      28     1445 NA        
## 2          195004        29       1530        195004      29     1530 NA        
## 3          195007         5       1800        195007       5     1800 NA        
## 4          195007         5       1830        195007       5     1830 NA        
## 5          195007        24       1440        195007      24     1440 NA        
## 6          195008        29       1600        195008      29     1600 NA        
## # ℹ 64 more variables: EVENT_ID <dbl>, STATE <chr>, STATE_FIPS <dbl>,
## #   YEAR <dbl>, MONTH_NAME <chr>, EVENT_TYPE <chr>, CZ_TYPE <chr>,
## #   CZ_FIPS <dbl>, CZ_NAME <chr>, WFO <lgl>, BEGIN_DATE_TIME <chr>,
## #   CZ_TIMEZONE <chr>, END_DATE_TIME <chr>, INJURIES_DIRECT <dbl>,
## #   INJURIES_INDIRECT <dbl>, DEATHS_DIRECT <dbl>, DEATHS_INDIRECT <dbl>,
## #   DAMAGE_PROPERTY <chr>, DAMAGE_CROPS <dbl>, SOURCE <lgl>, MAGNITUDE <dbl>,
## #   MAGNITUDE_TYPE <lgl>, FLOOD_CAUSE <lgl>, CATEGORY <lgl>, …

##Q1: Most Harmful Events to Population Health

#Summarize total injuries and deaths (direct + indirect)
health_impact <- joined_data %>%
  group_by(EVENT_TYPE) %>%
  summarise(
    total_injuries = sum(INJURIES_DIRECT, na.rm = TRUE) + sum(INJURIES_INDIRECT, na.rm = TRUE),
    total_deaths = sum(DEATHS_DIRECT, na.rm = TRUE) + sum(DEATHS_INDIRECT, na.rm = TRUE),
    total_harm = total_injuries + total_deaths,
    .groups = "drop"
  ) %>%
  arrange(desc(total_harm))

head(health_impact, 10)
## # A tibble: 1 × 4
##   EVENT_TYPE total_injuries total_deaths total_harm
##   <chr>               <dbl>        <dbl>      <dbl>
## 1 Tornado               659           70        729
top10_health <- health_impact %>% slice_max(total_harm, n = 10)

ggplot(top10_health, aes(x = reorder(EVENT_TYPE, total_harm), y = total_harm)) +
  geom_bar(stat = "identity", fill = "tomato") +
  coord_flip() +
  labs(
    title = "Top 10 Most Harmful Event Types (Health Impact)",
    x = "Event Type",
    y = "Total Harm (Injuries + Deaths)"
  )


##Q2:Most Frequent Events by State

#Count number of events by state and type
event_counts <- joined_data %>%
  group_by(STATE, EVENT_TYPE) %>%
  summarise(total_events = n(), .groups = "drop") %>%
  arrange(desc(total_events))

head(event_counts, 10)
## # A tibble: 10 × 3
##    STATE          EVENT_TYPE total_events
##    <chr>          <chr>             <int>
##  1 KANSAS         Tornado              33
##  2 LOUISIANA      Tornado              28
##  3 OKLAHOMA       Tornado              25
##  4 TEXAS          Tornado              20
##  5 MISSISSIPPI    Tornado              16
##  6 ARKANSAS       Tornado              13
##  7 ILLINOIS       Tornado              11
##  8 NORTH CAROLINA Tornado               9
##  9 FLORIDA        Tornado               6
## 10 MISSOURI       Tornado               6
#Top 5 states with most events
top_states <- joined_data %>%
  count(STATE) %>%
  arrange(desc(n)) %>%
  slice_max(n, n = 5) %>%
  pull(STATE)

#Filter event counts
top_state_events <- event_counts %>%
  filter(STATE %in% top_states)

#Plot
ggplot(top_state_events, aes(x = reorder(EVENT_TYPE, total_events), y = total_events, fill = STATE)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ STATE, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Most Frequent Event Types in Top 5 States",
    x = "Event Type",
    y = "Number of Events"
  )


##Q3:Most Common Events by Month

monthly_counts <- joined_data %>%
  group_by(MONTH_NAME, EVENT_TYPE) %>%
  summarise(total_events = n(), .groups = "drop") %>%
  arrange(match(MONTH_NAME, month.name))  # To sort Jan–Dec

head(monthly_counts, 10)
## # A tibble: 10 × 3
##    MONTH_NAME EVENT_TYPE total_events
##    <chr>      <chr>             <int>
##  1 January    Tornado               8
##  2 February   Tornado              27
##  3 March      Tornado              24
##  4 April      Tornado              17
##  5 May        Tornado              64
##  6 June       Tornado              30
##  7 July       Tornado              23
##  8 August     Tornado              14
##  9 September  Tornado               3
## 10 October    Tornado               2
#Plot top event types for each month
ggplot(monthly_counts, aes(x = reorder(EVENT_TYPE, total_events), y = total_events, fill = MONTH_NAME)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ MONTH_NAME, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Most Common Event Types by Month",
    x = "Event Type",
    y = "Number of Events"
  )


##Q4:Which Event Types Caused the Most Property Damage?

In this final section, we explore which types of weather events led to the highest economic losses in terms of property damage. This is based on the DAMAGE_PROPERTY column in the NOAA data, which includes values like “1.5K”, “2M”, etc. We’ll clean and convert these values before summarizing.

#Function to convert damage values like "1.5K", "2M", "3B" to numeric
convert_damage <- function(x) {
  x <- toupper(x)
  as.numeric(gsub("[KMB]", "", x)) *
    ifelse(grepl("K", x), 1e3,
    ifelse(grepl("M", x), 1e6,
    ifelse(grepl("B", x), 1e9, 1)))
}

#Convert DAMAGE_PROPERTY to numeric
joined_data$damage_clean <- convert_damage(joined_data$DAMAGE_PROPERTY)

#Summarize total property damage per event type
damage_summary <- joined_data %>%
  group_by(EVENT_TYPE) %>%
  summarise(total_damage = sum(damage_clean, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_damage))

head(damage_summary, 10)
## # A tibble: 1 × 2
##   EVENT_TYPE total_damage
##   <chr>             <dbl>
## 1 Tornado        34481650
top10_damage <- damage_summary %>% slice_max(total_damage, n = 10)

ggplot(top10_damage, aes(x = reorder(EVENT_TYPE, total_damage), y = total_damage)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Property Damage",
    x = "Event Type",
    y = "Total Property Damage (USD)"
  )