Stat220 Project 4s

Author

Marcel Pierre-Louis

This project reads and combines multiple yearly CSV files containing U.S. billion-dollar disaster data into a single cleaned dataset. It standardizes variable names and types, parses dates, and extracts the year from each file name. It then creates a summary that counts the number of disasters by type each year for a stacked bar chart, and a summary that calculates the total CPI-adjusted damages per year. The second summary computes annual total damages, confidence interval bounds, and a 5-year moving average, which are visualized in a time-series plot showing overall damage trends over time.

library(tidyverse)
library(lubridate)
library(dplyr)
library(DT)
library(plotly)

#' Read one yearly disaster CSV file into a cleaned tibble
#'
#' Skips the first three lines of the file, applies fixed column names and
#' column types, and adds three derived columns.
#'
#' @param path Path to a single CSV file. The year is taken from the first
#'   run of digits in the file name (e.g. "events-US-1980.csv" gives 1980).
#' @return A tibble with the columns `name`, `disaster_type`, `start_date`,
#'   `end_date`, `damages_adjusted`, `damages_unadjusted`, `deaths`,
#'   `duration` (days from start to end, counting both endpoints) and `year`
#'   (numeric; `NA` with a warning if the file name contains no digits).
read_disaster_data <- function(path) {
  # readr::parse_number() treats a "-" or "." ahead of the digits as the start
  # of the number, so a hyphenated file name can produce a missing or negative
  # year. Pulling out the first run of digits avoids that.
  file_year <- as.numeric(stringr::str_extract(basename(path), "\\d+"))
  if (is.na(file_year)) {
    warning("No year found in file name: ", basename(path), call. = FALSE)
  }

  readr::read_csv(
    file = path,
    # The first three lines are not data (NOTE(review): presumably title and
    # header rows; confirm against the raw files). Names are supplied below.
    skip = 3,
    col_names = c(
      "name",
      "disaster_type",
      "start_date",
      "end_date",
      "damages_adjusted",
      "damages_unadjusted",
      "deaths"
    ),
    col_types = readr::cols(
      name = readr::col_character(),
      disaster_type = readr::col_factor(),
      # Dates are stored as compact digits, e.g. 19800410.
      start_date = readr::col_date(format = "%Y%m%d"),
      end_date = readr::col_date(format = "%Y%m%d"),
      damages_adjusted = readr::col_double(),
      damages_unadjusted = readr::col_double(),
      deaths = readr::col_integer()
    ),
    show_col_types = FALSE
  ) |>
    dplyr::mutate(
      # Drop the parenthesised text at the end of the event name.
      name = stringr::str_remove(name, "\\s*\\(.*\\)$"),
      # + 1 so that an event starting and ending on the same day lasts 1 day.
      duration = as.integer(end_date - start_date) + 1,
      year = file_year
    )
}

# Find every CSV file in the data/ folder.
files <- list.files("data", pattern = "\\.csv$", full.names = TRUE)

# Stop early with a clear message: with no input files the combined table has
# no columns, and the later table and plot steps fail with confusing errors.
if (length(files) == 0) {
  stop("No .csv files found in the 'data' directory.", call. = FALSE)
}

# Read each file and stack the results into one tibble. map() + bind_rows()
# is what the superseded purrr::map_dfr() did internally.
all_disasters <- purrr::map(files, read_disaster_data) |>
  dplyr::bind_rows()

# Save the combined data set to the working directory.
readr::write_csv(all_disasters, "all_disasters.csv")

# Interactive table of all events, sorted by year (newest first) and then by
# adjusted damages (largest first).
all_disasters |>
  select(year, name, disaster_type, start_date, end_date,
         damages_adjusted, deaths, duration) |>
  arrange(desc(year), desc(damages_adjusted)) |>
  # Shown as text such as "$2.7B": values are divided by 1000 and given a "B"
  # suffix (NOTE(review): presumably the raw values are millions of dollars;
  # confirm). As text, this column sorts and filters as strings in the table.
  mutate(damages_adjusted = scales::dollar(damages_adjusted, scale = 1/1000,
                                           suffix = "B", accuracy = 0.1)) |>
  datatable(
    # Hide the row-name column. When it is shown it is column 0, so the
    # `order` option below would sort by row number instead of by Year.
    rownames = FALSE,
    filter = "top",           # per-column filters
    extensions = "Buttons",
    options = list(
      dom = "Bfrtip",
      buttons = c("csv", "excel"),   # download buttons
      pageLength = 15,
      order = list(list(0, "desc"))  # column 0 = Year (indices start at 0)
    ),
    colnames = c("Year", "Event Name", "Type", "Start", "End",
                 "Damages (Adj.)", "Deaths", "Duration (days)")
  )

# Number of events for each combination of year and disaster type.
disaster_counts <- all_disasters |>
  group_by(year, disaster_type) |>
  summarise(n_disasters = n(), .groups = "drop")

# Stacked bar chart: one bar per year, split by disaster type. The `text`
# aesthetic is not drawn by ggplot2; it only supplies the plotly tooltip.
p1 <- ggplot(
  data = disaster_counts,
  mapping = aes(
    x = year,
    y = n_disasters,
    fill = disaster_type,
    text = paste0(
      "Year: ", year,
      "<br>Type: ", disaster_type,
      "<br>Count: ", n_disasters
    )
  )
) +
  geom_col(color = "grey20", linewidth = 0.1, width = 0.85) +
  # Label every fourth year, starting from the first year in the data.
  scale_x_continuous(
    breaks = seq(
      from = min(disaster_counts$year),
      to = max(disaster_counts$year),
      by = 4
    )
  ) +
  labs(
    title = "Billion-Dollar Disasters by Type and Year",
    x = "Year",
    y = "Number of Disasters",
    fill = "Disaster Type"
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "bottom", panel.grid.minor = element_blank())

# Convert to an interactive plot that shows only the custom tooltip text.
ggplotly(p1, tooltip = "text")

# One row per year: total adjusted damages, interval bounds, and a trailing
# 5-year moving average. Columns ending in `_bill` are the values divided by
# 1000.
yearly_damages <- all_disasters |>
  group_by(year) |>
  summarise(
    total_damages_adjusted = sum(damages_adjusted, na.rm = TRUE),
    # Divide by the number of non-missing values so the denominator matches
    # the observations that sd(..., na.rm = TRUE) actually used; n() would
    # also count rows with missing damages.
    # NOTE(review): sd / sqrt(n) is the standard error of the mean damage per
    # event, but it is applied to the yearly total below. Confirm that this
    # is the intended interval.
    se = sd(damages_adjusted, na.rm = TRUE) /
      sqrt(sum(!is.na(damages_adjusted))),
    .groups = "drop"
  ) |>
  mutate(
    total_damages_bill = total_damages_adjusted / 1000,
    # The lower bound is floored at zero because damages cannot be negative.
    ci_low_bill = pmax((total_damages_adjusted - 1.96 * se) / 1000, 0),
    ci_high_bill = (total_damages_adjusted + 1.96 * se) / 1000,
    # sides = 1 averages the current row and the four rows before it, so the
    # first four values are NA. stats:: is spelled out because dplyr masks
    # filter(). NOTE(review): this assumes the rows are consecutive years; a
    # year with no events would be missing from the table.
    avg_5yr_bill = as.numeric(stats::filter(total_damages_bill, rep(1/5, 5), sides = 1))
  )

# Time-series plot of yearly totals (navy line and points), the lower and
# upper interval bounds (dashed gray), and the 5-year moving average (red).
ggplot(yearly_damages, aes(x = year)) +
  geom_line(aes(y = total_damages_bill), color = "navy", linewidth = 1) +
  geom_point(aes(y = total_damages_bill), color = "navy", size = 2) +

  geom_line(aes(y = ci_low_bill), color = "gray45", linetype = "dashed") +
  geom_line(aes(y = ci_high_bill), color = "gray45", linetype = "dashed") +

  # na.rm = TRUE silences the warning about the leading NA values of the
  # moving average.
  geom_line(aes(y = avg_5yr_bill), color = "tomato", linewidth = 1, na.rm = TRUE) +

  # Label every fourth year, starting from the first year in the data.
  scale_x_continuous(
    breaks = seq(min(yearly_damages$year), max(yearly_damages$year), by = 4)
  ) +
  scale_y_continuous(
    labels = scales::label_number(accuracy = 0.1, suffix = "B")
  ) +
  labs(
    title = "Total CPI-Adjusted Disaster Damages by Year",
    x = "Year",
    y = "Total Damages (Billions USD)"
  ) +
  # A single theme call: a second theme_minimal() would replace this one.
  theme_minimal(base_size = 11)

I chose to use plotly to make my bar plot interactive so that the viewer could receive more information when going through my report. I thought it would be nice for the viewer to be able to hover over a given bar and understand the specifics of what it’s showing without having to connect the legend to the x- and y-axes. Also, because the y-axis doesn’t show the specific count, plotly lets the user see the exact count of disasters. I also chose to create a data table. Because I worked with a lot of long and complicated data sets for this project, I thought it would be helpful to the viewer if it was easy for them to navigate the data set for any additional information they were looking for.