library(tidyverse)
library(lubridate) # Date formatting
library(mosaic) # favstats()
library(plotly) # Interactive line charts1
library(viridis) # Gradient scaling pallettes
library(ggridges) # For volatility analysis

# Load data
df <- read.csv("construction_spending.csv", stringsAsFactors = FALSE)
# Problem: The value ('val') is missing where 'dt_desc' == 'Monthly Percent Change for X Construction"
# Solution: Re-compute monthly percent change values for each 'cat_desc'

# IMPORTANT: Exclude "Annual Rate" categories as we are only interested in monthly change - will add back later
df_monthly <- df %>%
  filter(!grepl("Annual Rate", cat_desc))

# Convert date column to correct date format
df_monthly$date <- as.Date(df_monthly$per_name, format = "%m/%d/%y")

# Filter for only the total construction values (not the monthly percent change rows)
total_values <- df_monthly %>%
  filter(dt_desc %in% c("Total Construction", 
                        "Total Private Construction", 
                        "Total Public Construction"),
         is_adj == 0)  # Only non-adjusted values

# Calculate monthly percent changes for each category using a function
calculate_monthly_pct_change <- function(data) {
  data %>%
    arrange(cat_code, dt_code, date) %>%
    group_by(cat_code, cat_desc, dt_code, dt_desc) %>%
    mutate(
      # Calculate percent change from previous month
      pct_change = (val - lag(val)) / lag(val) * 100, # lag() gets prev val
      # First month in each group will be NA (no prev val)
      pct_change = round(pct_change, 2)
    ) %>%
    ungroup()
}

# CALCULATE VALUES
total_values_with_pct <- calculate_monthly_pct_change(total_values)

# Check math
cat("Sample of calculated percent changes:\n")
## Sample of calculated percent changes:
check_math <- total_values_with_pct %>%
  filter(!is.na(pct_change), cat_desc == "Total Construction") %>%
  select(per_name,
         cat_desc,
         dt_desc, val,
         pct_change) %>%
  head(10)
print(check_math)
## # A tibble: 10 × 5
##    per_name cat_desc           dt_desc                     val pct_change
##    <chr>    <chr>              <chr>                     <int>      <dbl>
##  1 2/1/02   Total Construction Total Public Construction 14113      -0.91
##  2 3/1/02   Total Construction Total Public Construction 14386       1.93
##  3 4/1/02   Total Construction Total Public Construction 16221      12.8 
##  4 5/1/02   Total Construction Total Public Construction 18375      13.3 
##  5 6/1/02   Total Construction Total Public Construction 19381       5.47
##  6 7/1/02   Total Construction Total Public Construction 20369       5.1 
##  7 8/1/02   Total Construction Total Public Construction 21507       5.59
##  8 9/1/02   Total Construction Total Public Construction 21385      -0.57
##  9 10/1/02  Total Construction Total Public Construction 19687      -7.94
## 10 11/1/02  Total Construction Total Public Construction 17810      -9.53
# Get the percent change rows that need updating
pct_change_rows_to_update <- df_monthly %>%
  filter(grepl("Monthly Percent Change", dt_desc),
         is_adj == 1)  # Percent change rows have is_adj = 1

cat("\nNumber of percent change rows to update:", nrow(pct_change_rows_to_update), "\n")
## 
## Number of percent change rows to update: 8448
# Make a lookup table for the new calculated percent change values
pct_lookup <- total_values_with_pct %>%
  filter(!is.na(pct_change)) %>%
  mutate(
    # Map dt_desc to match percent change descriptions
    pct_dt_desc = case_when(
      dt_desc == "Total Construction" ~ "Monthly Percent Change for Total Construction",
      dt_desc == "Total Private Construction" ~ "Monthly Percent Change for Private Construction",
      dt_desc == "Total Public Construction" ~ "Monthly Percent Change for Public Construction"
    ),
    # Map dt_code
    pct_dt_code = case_when(
      dt_desc == "Total Construction" ~ "MPCT",
      dt_desc == "Total Private Construction" ~ "MPCV",
      dt_desc == "Total Public Construction" ~ "MPCP"
    )
  ) %>%
  select(per_name, cat_code, geo_code, pct_dt_desc, pct_dt_code, pct_change)

# Update the percent change rows
df_updated <- df_monthly %>%
  left_join(
    pct_lookup,
    by = c("per_name" = "per_name",
           "cat_code" = "cat_code",
           "geo_code" = "geo_code",
           "dt_desc" = "pct_dt_desc",
           "dt_code" = "pct_dt_code")
  ) %>%
  mutate(
    val = case_when(
      !is.na(pct_change) ~ pct_change,  # Update with percent change
      TRUE ~ val  # Keep original value
    )
  ) %>%
  select(-pct_change) # Remove extra column

# Merge back with the annual rate data to get back to complete dataset
df_annual_rates <- df %>%
  filter(grepl("Annual Rate", cat_desc))

df_complete <- bind_rows(df_updated, df_annual_rates) %>%
  arrange(per_idx, serialid)

# VERIFY DATA INTEGRITY
# We'll check if df and df_complete have the same serialid after all of that data wrangling
cat("df # of rows:", nrow(df), "\n")
## df # of rows: 50208
cat("df_complete # of rows:", nrow(df_complete), "\n")
## df_complete # of rows: 50208
cat("Difference:", nrow(df_complete) - nrow(df), "\n")
## Difference: 0
original_serials <- sort(df$serialid)
complete_serials <- sort(df_complete$serialid)

missing_serials <- setdiff(original_serials, complete_serials)
extra_serials <- setdiff(complete_serials, original_serials)

missing_serials
## integer(0)
extra_serials
## integer(0)
# Save the updated data in case needed later
write.csv(df_complete, "construction_spending_with_pct_changes.csv", row.names = FALSE)

# FINALLY, REMOVE EXTRANEOUS COLUMNS
df_final <- df_complete %>% 
  select(
    -cat_indent,  # Placeholder
    -et_idx,      # Unkown type (empty)
    -et_code,     # Unkown type (empty)
    -et_desc,     # Unkown type (empty)
    -et_unit,     # Unkown type (empty)
    -geo_idx,     # All data is US
    -geo_code,
    -geo_desc,
    
  )
# ----- Dataframe housekeeping -----
# Add a log calculation for 'val'
df_final <- df_final %>% 
  mutate(val_log = log(val))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `val_log = log(val)`.
## Caused by warning in `log()`:
## ! NaNs produced
# Filter out annual values, split up cost and percent change
plot_monthly <- df_final %>%
  filter(!grepl("Annual Rate", cat_desc))

plot_monthly_cost <- plot_monthly %>% 
  filter(dt_unit == "MLN$")
plot_monthly_pc <- plot_monthly %>% 
  filter(dt_unit == "PCT")

# Annual values get their own plotting df
plot_annual <- df_final %>% 
  filter(grepl("Annual Rate", cat_desc))

# Drop 'Total Construction' for cases where not needed
plot_monthly_cost_no_total <- plot_monthly_cost %>% 
  filter(cat_desc != "Total Construction")
plot_monthly_pc_no_total <- plot_monthly_pc %>% 
  filter(cat_desc != "Total Construction")

# Asked Claude Opus 4 for a pallete that can effectively distinguish 19 cats
tableau_19 <- c(
  "#1F77B4", "#AEC7E8", "#FF7F0E", "#FFBB78", "#2CA02C",
  "#98DF8A", "#D62728", "#FF9896", "#9467BD", "#C5B0D5",
  "#8C564B", "#C49C94", "#E377C2", "#F7B6D2", "#7F7F7F",
  "#C7C7C7", "#BCBD22", "#DBDB8D", "#17BECF"
)

# ===== PLOT 1: Overview Line Chart =====

p1 <- ggplot(data = plot_monthly_cost, aes(x = date, y = val, color = cat_desc)) +
  geom_line(size = 1) +
  scale_y_log10() +
  facet_wrap(~dt_desc, nrow = 1, scales = "fixed") +
  scale_color_manual(values = tableau_19) +
  labs(
    title = "Construction Spending Trends: Total, Private, and Public",
    subtitle = "Monthly spending values by major construction categories",
    x = "Date",
    y = "Spending (Million $)"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    strip.background = element_rect(fill = "grey90", color = NA),
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 9),
    plot.title = element_text(size = 14, face = "bold")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Convert to interactive
p1_interactive <- ggplotly(p1)
p1_interactive