##Data Preparation
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Set folder path
folder_path <- "C:/Users/libguest/Desktop"
#Defining file paths
details_file <- file.path(folder_path, "StormEvents_details-ftp_v1.0_d1950_c20250401.csv")
fatalities_file <- file.path(folder_path, "StormEvents_fatalities-ftp_v1.0_d1950_c20250401.csv")
locations_file <- file.path(folder_path, "StormEvents_fatalities-ftp_v1.0_d1950_c20250401.csv")
#Load CSVs
details <- read_csv(details_file)
## Rows: 223 Columns: 51
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): STATE, MONTH_NAME, EVENT_TYPE, CZ_TYPE, CZ_NAME, BEGIN_DATE_TIME, ...
## dbl (24): BEGIN_YEARMONTH, BEGIN_DAY, BEGIN_TIME, END_YEARMONTH, END_DAY, EN...
## lgl (16): EPISODE_ID, WFO, SOURCE, MAGNITUDE_TYPE, FLOOD_CAUSE, CATEGORY, TO...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fatalities <- read_csv(fatalities_file)
## Rows: 23 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): FATALITY_TYPE, FATALITY_DATE
## dbl (6): FAT_YEARMONTH, FAT_DAY, FAT_TIME, FATALITY_ID, EVENT_ID, EVENT_YEAR...
## lgl (3): FATALITY_AGE, FATALITY_SEX, FATALITY_LOCATION
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
locations <- read_csv(locations_file)
## Rows: 23 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): FATALITY_TYPE, FATALITY_DATE
## dbl (6): FAT_YEARMONTH, FAT_DAY, FAT_TIME, FATALITY_ID, EVENT_ID, EVENT_YEAR...
## lgl (3): FATALITY_AGE, FATALITY_SEX, FATALITY_LOCATION
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Joining datasets by EVENT_ID
joined_data <- details %>%
left_join(locations, by = "EVENT_ID") %>%
left_join(fatalities, by = "EVENT_ID")
#Save joined data
output_file <- file.path(folder_path, "StormEvents_joined_data.csv")
write_csv(joined_data, output_file)
#Preview
message("✅ Joined data saved to: ", output_file)
## ✅ Joined data saved to: C:/Users/libguest/Desktop/StormEvents_joined_data.csv
head(joined_data)
## # A tibble: 6 × 71
## BEGIN_YEARMONTH BEGIN_DAY BEGIN_TIME END_YEARMONTH END_DAY END_TIME EPISODE_ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 195004 28 1445 195004 28 1445 NA
## 2 195004 29 1530 195004 29 1530 NA
## 3 195007 5 1800 195007 5 1800 NA
## 4 195007 5 1830 195007 5 1830 NA
## 5 195007 24 1440 195007 24 1440 NA
## 6 195008 29 1600 195008 29 1600 NA
## # ℹ 64 more variables: EVENT_ID <dbl>, STATE <chr>, STATE_FIPS <dbl>,
## # YEAR <dbl>, MONTH_NAME <chr>, EVENT_TYPE <chr>, CZ_TYPE <chr>,
## # CZ_FIPS <dbl>, CZ_NAME <chr>, WFO <lgl>, BEGIN_DATE_TIME <chr>,
## # CZ_TIMEZONE <chr>, END_DATE_TIME <chr>, INJURIES_DIRECT <dbl>,
## # INJURIES_INDIRECT <dbl>, DEATHS_DIRECT <dbl>, DEATHS_INDIRECT <dbl>,
## # DAMAGE_PROPERTY <chr>, DAMAGE_CROPS <dbl>, SOURCE <lgl>, MAGNITUDE <dbl>,
## # MAGNITUDE_TYPE <lgl>, FLOOD_CAUSE <lgl>, CATEGORY <lgl>, …
##Q1: Most Harmful Events to Population Health
#Summarize total injuries and deaths (direct + indirect)
health_impact <- joined_data %>%
group_by(EVENT_TYPE) %>%
summarise(
total_injuries = sum(INJURIES_DIRECT, na.rm = TRUE) + sum(INJURIES_INDIRECT, na.rm = TRUE),
total_deaths = sum(DEATHS_DIRECT, na.rm = TRUE) + sum(DEATHS_INDIRECT, na.rm = TRUE),
total_harm = total_injuries + total_deaths,
.groups = "drop"
) %>%
arrange(desc(total_harm))
head(health_impact, 10)
## # A tibble: 1 × 4
## EVENT_TYPE total_injuries total_deaths total_harm
## <chr> <dbl> <dbl> <dbl>
## 1 Tornado 659 70 729
top10_health <- health_impact %>% slice_max(total_harm, n = 10)
ggplot(top10_health, aes(x = reorder(EVENT_TYPE, total_harm), y = total_harm)) +
geom_bar(stat = "identity", fill = "tomato") +
coord_flip() +
labs(
title = "Top 10 Most Harmful Event Types (Health Impact)",
x = "Event Type",
y = "Total Harm (Injuries + Deaths)"
)
##Q2:Most Frequent Events by State
#Count number of events by state and type
event_counts <- joined_data %>%
group_by(STATE, EVENT_TYPE) %>%
summarise(total_events = n(), .groups = "drop") %>%
arrange(desc(total_events))
head(event_counts, 10)
## # A tibble: 10 × 3
## STATE EVENT_TYPE total_events
## <chr> <chr> <int>
## 1 KANSAS Tornado 33
## 2 LOUISIANA Tornado 28
## 3 OKLAHOMA Tornado 25
## 4 TEXAS Tornado 20
## 5 MISSISSIPPI Tornado 16
## 6 ARKANSAS Tornado 13
## 7 ILLINOIS Tornado 11
## 8 NORTH CAROLINA Tornado 9
## 9 FLORIDA Tornado 6
## 10 MISSOURI Tornado 6
#Top 5 states with most events
top_states <- joined_data %>%
count(STATE) %>%
arrange(desc(n)) %>%
slice_max(n, n = 5) %>%
pull(STATE)
#Filter event counts
top_state_events <- event_counts %>%
filter(STATE %in% top_states)
#Plot
ggplot(top_state_events, aes(x = reorder(EVENT_TYPE, total_events), y = total_events, fill = STATE)) +
geom_bar(stat = "identity") +
facet_wrap(~ STATE, scales = "free_y") +
coord_flip() +
labs(
title = "Most Frequent Event Types in Top 5 States",
x = "Event Type",
y = "Number of Events"
)
##Q3:Most Common Events by Month
monthly_counts <- joined_data %>%
group_by(MONTH_NAME, EVENT_TYPE) %>%
summarise(total_events = n(), .groups = "drop") %>%
arrange(match(MONTH_NAME, month.name)) # To sort Jan–Dec
head(monthly_counts, 10)
## # A tibble: 10 × 3
## MONTH_NAME EVENT_TYPE total_events
## <chr> <chr> <int>
## 1 January Tornado 8
## 2 February Tornado 27
## 3 March Tornado 24
## 4 April Tornado 17
## 5 May Tornado 64
## 6 June Tornado 30
## 7 July Tornado 23
## 8 August Tornado 14
## 9 September Tornado 3
## 10 October Tornado 2
#Plot top event types for each month
ggplot(monthly_counts, aes(x = reorder(EVENT_TYPE, total_events), y = total_events, fill = MONTH_NAME)) +
geom_bar(stat = "identity") +
facet_wrap(~ MONTH_NAME, scales = "free_y") +
coord_flip() +
labs(
title = "Most Common Event Types by Month",
x = "Event Type",
y = "Number of Events"
)
##Q4:Which Event Types Caused the Most Property Damage?
In this final section, we explore which types of weather events led
to the highest economic losses in terms of property damage. This is
based on the DAMAGE_PROPERTY column in the NOAA data, which
includes values like “1.5K”, “2M”, etc. We’ll clean and convert these
values before summarizing.
#Function to convert damage values like "1.5K", "2M", "3B" to numeric
convert_damage <- function(x) {
x <- toupper(x)
as.numeric(gsub("[KMB]", "", x)) *
ifelse(grepl("K", x), 1e3,
ifelse(grepl("M", x), 1e6,
ifelse(grepl("B", x), 1e9, 1)))
}
#Convert DAMAGE_PROPERTY to numeric
joined_data$damage_clean <- convert_damage(joined_data$DAMAGE_PROPERTY)
#Summarize total property damage per event type
damage_summary <- joined_data %>%
group_by(EVENT_TYPE) %>%
summarise(total_damage = sum(damage_clean, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_damage))
head(damage_summary, 10)
## # A tibble: 1 × 2
## EVENT_TYPE total_damage
## <chr> <dbl>
## 1 Tornado 34481650
top10_damage <- damage_summary %>% slice_max(total_damage, n = 10)
ggplot(top10_damage, aes(x = reorder(EVENT_TYPE, total_damage), y = total_damage)) +
geom_bar(stat = "identity", fill = "darkblue") +
coord_flip() +
labs(
title = "Top 10 Event Types by Property Damage",
x = "Event Type",
y = "Total Property Damage (USD)"
)