library(tidyverse)
library(lubridate) # Date formatting
library(mosaic) # favstats()
library(plotly) # Interactive line charts1
library(viridis) # Gradient scaling pallettes
library(ggridges) # For volatility analysis
# Load data
df <- read.csv("construction_spending.csv", stringsAsFactors = FALSE)
# Problem: The value ('val') is missing where 'dt_desc' == 'Monthly Percent Change for X Construction"
# Solution: Re-compute monthly percent change values for each 'cat_desc'
# IMPORTANT: Exclude "Annual Rate" categories as we are only interested in monthly change - will add back later
df_monthly <- df %>%
filter(!grepl("Annual Rate", cat_desc))
# Convert date column to correct date format
df_monthly$date <- as.Date(df_monthly$per_name, format = "%m/%d/%y")
# Filter for only the total construction values (not the monthly percent change rows)
total_values <- df_monthly %>%
filter(dt_desc %in% c("Total Construction",
"Total Private Construction",
"Total Public Construction"),
is_adj == 0) # Only non-adjusted values
# Calculate monthly percent changes for each category using a function
calculate_monthly_pct_change <- function(data) {
data %>%
arrange(cat_code, dt_code, date) %>%
group_by(cat_code, cat_desc, dt_code, dt_desc) %>%
mutate(
# Calculate percent change from previous month
pct_change = (val - lag(val)) / lag(val) * 100, # lag() gets prev val
# First month in each group will be NA (no prev val)
pct_change = round(pct_change, 2)
) %>%
ungroup()
}
# CALCULATE VALUES
total_values_with_pct <- calculate_monthly_pct_change(total_values)
# Check math
cat("Sample of calculated percent changes:\n")
## Sample of calculated percent changes:
check_math <- total_values_with_pct %>%
filter(!is.na(pct_change), cat_desc == "Total Construction") %>%
select(per_name,
cat_desc,
dt_desc, val,
pct_change) %>%
head(10)
print(check_math)
## # A tibble: 10 × 5
## per_name cat_desc dt_desc val pct_change
## <chr> <chr> <chr> <int> <dbl>
## 1 2/1/02 Total Construction Total Public Construction 14113 -0.91
## 2 3/1/02 Total Construction Total Public Construction 14386 1.93
## 3 4/1/02 Total Construction Total Public Construction 16221 12.8
## 4 5/1/02 Total Construction Total Public Construction 18375 13.3
## 5 6/1/02 Total Construction Total Public Construction 19381 5.47
## 6 7/1/02 Total Construction Total Public Construction 20369 5.1
## 7 8/1/02 Total Construction Total Public Construction 21507 5.59
## 8 9/1/02 Total Construction Total Public Construction 21385 -0.57
## 9 10/1/02 Total Construction Total Public Construction 19687 -7.94
## 10 11/1/02 Total Construction Total Public Construction 17810 -9.53
# Get the percent change rows that need updating
pct_change_rows_to_update <- df_monthly %>%
filter(grepl("Monthly Percent Change", dt_desc),
is_adj == 1) # Percent change rows have is_adj = 1
cat("\nNumber of percent change rows to update:", nrow(pct_change_rows_to_update), "\n")
##
## Number of percent change rows to update: 8448
# Make a lookup table for the new calculated percent change values
pct_lookup <- total_values_with_pct %>%
filter(!is.na(pct_change)) %>%
mutate(
# Map dt_desc to match percent change descriptions
pct_dt_desc = case_when(
dt_desc == "Total Construction" ~ "Monthly Percent Change for Total Construction",
dt_desc == "Total Private Construction" ~ "Monthly Percent Change for Private Construction",
dt_desc == "Total Public Construction" ~ "Monthly Percent Change for Public Construction"
),
# Map dt_code
pct_dt_code = case_when(
dt_desc == "Total Construction" ~ "MPCT",
dt_desc == "Total Private Construction" ~ "MPCV",
dt_desc == "Total Public Construction" ~ "MPCP"
)
) %>%
select(per_name, cat_code, geo_code, pct_dt_desc, pct_dt_code, pct_change)
# Update the percent change rows
df_updated <- df_monthly %>%
left_join(
pct_lookup,
by = c("per_name" = "per_name",
"cat_code" = "cat_code",
"geo_code" = "geo_code",
"dt_desc" = "pct_dt_desc",
"dt_code" = "pct_dt_code")
) %>%
mutate(
val = case_when(
!is.na(pct_change) ~ pct_change, # Update with percent change
TRUE ~ val # Keep original value
)
) %>%
select(-pct_change) # Remove extra column
# Merge back with the annual rate data to get back to complete dataset
df_annual_rates <- df %>%
filter(grepl("Annual Rate", cat_desc))
df_complete <- bind_rows(df_updated, df_annual_rates) %>%
arrange(per_idx, serialid)
# VERIFY DATA INTEGRITY
# We'll check if df and df_complete have the same serialid after all of that data wrangling
cat("df # of rows:", nrow(df), "\n")
## df # of rows: 50208
cat("df_complete # of rows:", nrow(df_complete), "\n")
## df_complete # of rows: 50208
cat("Difference:", nrow(df_complete) - nrow(df), "\n")
## Difference: 0
original_serials <- sort(df$serialid)
complete_serials <- sort(df_complete$serialid)
missing_serials <- setdiff(original_serials, complete_serials)
extra_serials <- setdiff(complete_serials, original_serials)
missing_serials
## integer(0)
extra_serials
## integer(0)
# Save the updated data in case needed later
write.csv(df_complete, "construction_spending_with_pct_changes.csv", row.names = FALSE)
# FINALLY, REMOVE EXTRANEOUS COLUMNS
df_final <- df_complete %>%
select(
-cat_indent, # Placeholder
-et_idx, # Unkown type (empty)
-et_code, # Unkown type (empty)
-et_desc, # Unkown type (empty)
-et_unit, # Unkown type (empty)
-geo_idx, # All data is US
-geo_code,
-geo_desc,
)
# ----- Dataframe housekeeping -----
# Add a log calculation for 'val'
df_final <- df_final %>%
mutate(val_log = log(val))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `val_log = log(val)`.
## Caused by warning in `log()`:
## ! NaNs produced
# Filter out annual values, split up cost and percent change
plot_monthly <- df_final %>%
filter(!grepl("Annual Rate", cat_desc))
plot_monthly_cost <- plot_monthly %>%
filter(dt_unit == "MLN$")
plot_monthly_pc <- plot_monthly %>%
filter(dt_unit == "PCT")
# Annual values get their own plotting df
plot_annual <- df_final %>%
filter(grepl("Annual Rate", cat_desc))
# Drop 'Total Construction' for cases where not needed
plot_monthly_cost_no_total <- plot_monthly_cost %>%
filter(cat_desc != "Total Construction")
plot_monthly_pc_no_total <- plot_monthly_pc %>%
filter(cat_desc != "Total Construction")
# Asked Claude Opus 4 for a pallete that can effectively distinguish 19 cats
tableau_19 <- c(
"#1F77B4", "#AEC7E8", "#FF7F0E", "#FFBB78", "#2CA02C",
"#98DF8A", "#D62728", "#FF9896", "#9467BD", "#C5B0D5",
"#8C564B", "#C49C94", "#E377C2", "#F7B6D2", "#7F7F7F",
"#C7C7C7", "#BCBD22", "#DBDB8D", "#17BECF"
)
# ===== PLOT 1: Overview Line Chart =====
p1 <- ggplot(data = plot_monthly_cost, aes(x = date, y = val, color = cat_desc)) +
geom_line(size = 1) +
scale_y_log10() +
facet_wrap(~dt_desc, nrow = 1, scales = "fixed") +
scale_color_manual(values = tableau_19) +
labs(
title = "Construction Spending Trends: Total, Private, and Public",
subtitle = "Monthly spending values by major construction categories",
x = "Date",
y = "Spending (Million $)"
) +
theme_minimal() +
theme(
strip.text = element_text(size = 12, face = "bold"),
strip.background = element_rect(fill = "grey90", color = NA),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 9),
plot.title = element_text(size = 14, face = "bold")
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Convert to interactive
p1_interactive <- ggplotly(p1)
p1_interactive