diagnostic-plots-script.r

# CAMPAIGN FINANCE DIAGNOSTIC PLOTS
# Standalone script for comprehensive regression diagnostic visualizations

# Load necessary packages
if (!require("ggplot2")) install.packages("ggplot2")

## Loading required package: ggplot2

if (!require("gridExtra")) install.packages("gridExtra")

## Loading required package: gridExtra

if (!require("cowplot")) install.packages("cowplot")

## Loading required package: cowplot

if (!require("viridis")) install.packages("viridis")

## Loading required package: viridis

## Loading required package: viridisLite

if (!require("car")) install.packages("car")

## Loading required package: car

## Loading required package: carData

library(ggplot2)     # For creating plots
library(gridExtra)   # For arranging multiple plots
library(cowplot)     # For plot themes and grid arrangement
library(viridis)     # For better color palettes
library(car)         # For diagnostic functions

# Set global theme for all plots
theme_set(theme_minimal(base_size = 12) +
            theme(plot.title = element_text(face = "bold", size = 14),
                  plot.subtitle = element_text(size = 12, color = "gray30"),
                  axis.title = element_text(face = "bold"),
                  legend.title = element_text(face = "bold"),
                  panel.grid.major = element_line(color = "gray90"),
                  panel.grid.minor = element_line(color = "gray95"),
                  plot.background = element_rect(fill = "white", color = NA),
                  legend.background = element_rect(fill = "white", color = NA)))

# Create or load your data
# OPTION 1: Comment out this block if loading your own data
set.seed(123)
n <- 500
campaign_data <- data.frame(
  receipts = rlnorm(n, meanlog = 10, sdlog = 1),
  individual_contributions = rlnorm(n, meanlog = 9.5, sdlog = 1.2),
  disbursements = rlnorm(n, meanlog = 9.8, sdlog = 0.9),
  party = factor(sample(c("Democrat", "Republican"), n, replace = TRUE)),
  incumbency = factor(sample(c("Incumbent", "Challenger", "Open Seat"), n, replace = TRUE))
)

# OPTION 2: Uncomment to load your own data
# campaign_data <- read.csv("your_data_file.csv")

# Create necessary variables for analysis
campaign_data$log_receipts <- log(campaign_data$receipts + 1)  # Add 1 to handle zeros
campaign_data$log_individual_contributions <- log(campaign_data$individual_contributions + 1)
campaign_data$log_disbursements <- log(campaign_data$disbursements + 1)

# Create dummy variables for regression (if needed)
campaign_data$republican <- ifelse(campaign_data$party == "Republican", 1, 0)
campaign_data$challenger <- ifelse(campaign_data$incumbency == "Challenger", 1, 0)
campaign_data$open_seat <- ifelse(campaign_data$incumbency == "Open Seat", 1, 0)

# Create the regression models
model1 <- lm(log_receipts ~ party + incumbency + party:incumbency, data = campaign_data)
model2 <- lm(log_individual_contributions ~ party + incumbency + party:incumbency, data = campaign_data)
model3 <- lm(log_disbursements ~ party + incumbency + party:incumbency, data = campaign_data)

# Function to create enhanced diagnostic plots for a model
create_diagnostic_plots <- function(model, model_name, color_palette = "viridis") {
  # Extract diagnostic information
  model_data <- data.frame(
    fitted = fitted(model),
    residuals = residuals(model),
    std_resid = rstandard(model),
    stud_resid = rstudent(model),
    cooks_dist = cooks.distance(model),
    leverage = hatvalues(model),
    sqrt_abs_resid = sqrt(abs(rstudent(model))),
    index = 1:length(residuals(model))
  )
  
  # Add identifier for outliers and influential points
  model_data$outlier <- abs(model_data$stud_resid) > 3
  model_data$influential <- model_data$cooks_dist > (4/length(model_data$residuals))
  
  # Plot 1: Residuals vs Fitted with improved formatting
  p1 <- ggplot(model_data, aes(x = fitted, y = residuals)) +
    geom_point(aes(color = abs(residuals)), alpha = 0.7) +
    geom_smooth(method = "loess", se = FALSE, color = "#3366FF", linewidth = 1) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red", linewidth = 0.8) +
    scale_color_viridis_c(option = color_palette, end = 0.8) +
    labs(title = paste("Residuals vs Fitted Values -", model_name),
         subtitle = "Points should be randomly distributed around zero line",
         x = "Fitted values", 
         y = "Residuals",
         color = "|Residual|") +
    theme(legend.position = "right")
  
  # Plot 2: Enhanced Q-Q plot
  p2 <- ggplot(model_data, aes(sample = residuals)) +
    stat_qq(aes(color = abs(residuals)), alpha = 0.7) +
    stat_qq_line(linewidth = 0.8, color = "red") +
    scale_color_viridis_c(option = color_palette, end = 0.8) +
    labs(title = paste("Normal Q-Q Plot -", model_name),
         subtitle = "Points should follow the diagonal line",
         x = "Theoretical Quantiles", 
         y = "Sample Quantiles",
         color = "|Residual|") +
    theme(legend.position = "right")
  
  # Plot 3: Enhanced Scale-Location plot
  p3 <- ggplot(model_data, aes(x = fitted, y = sqrt_abs_resid)) +
    geom_point(aes(color = sqrt_abs_resid), alpha = 0.7) +
    geom_smooth(method = "loess", se = FALSE, color = "#3366FF", linewidth = 1) +
    scale_color_viridis_c(option = color_palette, end = 0.8) +
    labs(title = paste("Scale-Location Plot -", model_name),
         subtitle = "Checking homoscedasticity of residuals",
         x = "Fitted values", 
         y = "√|Studentized residuals|",
         color = "√|Residual|") +
    theme(legend.position = "right")
  
  # Plot 4: Enhanced Cook's distance plot with threshold line
  threshold <- 4/length(model_data$residuals)
  p4 <- ggplot(model_data, aes(x = index, y = cooks_dist)) +
    geom_segment(aes(x = index, xend = index, y = 0, yend = cooks_dist), 
                 color = "#3366FF", alpha = 0.6) +
    geom_point(aes(color = cooks_dist > threshold), size = 2) +
    geom_hline(yintercept = threshold, linetype = "dashed", color = "red", linewidth = 0.8) +
    scale_color_manual(values = c("FALSE" = "#3366FF", "TRUE" = "#FF0000"),
                       labels = c("Normal", "Influential")) +
    labs(title = paste("Cook's Distance -", model_name),
         subtitle = paste("Influential points above threshold:", threshold),
         x = "Observation Index", 
         y = "Cook's distance",
         color = "Influential") +
    theme(legend.position = "right")
  
  # Plot 5: Enhanced Residuals vs Leverage with contours
  p5 <- ggplot(model_data, aes(x = leverage, y = stud_resid)) +
    geom_point(aes(color = cooks_dist), size = 2, alpha = 0.7) +
    geom_hline(yintercept = c(-3, 0, 3), linetype = c("dashed", "solid", "dashed"), 
               color = c("red", "black", "red"), linewidth = c(0.8, 0.5, 0.8)) +
    scale_color_viridis_c(option = color_palette) +
    labs(title = paste("Residuals vs Leverage -", model_name),
         subtitle = "Identifying observations with high influence",
         x = "Leverage", 
         y = "Studentized Residuals",
         color = "Cook's Dist") +
    theme(legend.position = "right")
  
  # Residual histogram with density curve
  p6 <- ggplot(model_data, aes(x = residuals)) +
    geom_histogram(aes(y = ..density..), bins = 30, fill = "#3366FF", alpha = 0.7) +
    geom_density(color = "red", linewidth = 1) +
    labs(title = paste("Residual Distribution -", model_name),
         subtitle = "Should approximate a normal distribution",
         x = "Residuals", y = "Density") +
    theme(legend.position = "none")
  
  # Combine plots into a single grid
  combined <- plot_grid(
    p1, p2, p3, p4, p5, p6,
    ncol = 2, align = "hv", labels = "AUTO"
  )
  
  # Add overall title
  titled_plot <- plot_grid(
    ggdraw() + draw_label(paste("Diagnostic Plots:", model_name), 
                          fontface = "bold", size = 16),
    combined,
    ncol = 1,
    rel_heights = c(0.1, 1)
  )
  
  # Return list with individual plots and combined plot
  return(list(
    residuals_fitted = p1,
    qq_plot = p2,
    scale_location = p3,
    cooks_distance = p4,
    residuals_leverage = p5,
    residuals_histogram = p6,
    combined_plot = titled_plot
  ))
}

# Create residual pattern check plots
create_residual_pattern_plots <- function(model, model_name) {
  # Extract data
  model_data <- data.frame(
    fitted = fitted(model),
    residuals = residuals(model),
    party = campaign_data$party,
    incumbency = campaign_data$incumbency
  )
  
  # Residuals by party
  p1 <- ggplot(model_data, aes(x = party, y = residuals, fill = party)) +
    geom_boxplot(alpha = 0.8) +
    scale_fill_manual(values = c("Democrat" = "#3366FF", "Republican" = "#FF3333")) +
    labs(title = paste("Residuals by Party -", model_name),
         x = "Party", y = "Residuals") +
    theme(legend.position = "none")
  
  # Residuals by incumbency
  p2 <- ggplot(model_data, aes(x = incumbency, y = residuals, fill = incumbency)) +
    geom_boxplot(alpha = 0.8) +
    scale_fill_viridis_d() +
    labs(title = paste("Residuals by Incumbency Status -", model_name),
         x = "Incumbency Status", y = "Residuals") +
    theme(legend.position = "none")
  
  # Residuals by interaction
  p3 <- ggplot(model_data, aes(x = interaction(party, incumbency), y = residuals, fill = party)) +
    geom_boxplot(alpha = 0.8) +
    scale_fill_manual(values = c("Democrat" = "#3366FF", "Republican" = "#FF3333")) +
    labs(title = paste("Residuals by Party-Incumbency Interaction -", model_name),
         x = "Party-Incumbency Combination", y = "Residuals") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  # Combine plots
  combined <- grid.arrange(
    p1, p2, p3, 
    ncol = 2,
    top = paste("Residual Patterns:", model_name)
  )
  
  # Return the plots
  return(list(
    residuals_party = p1,
    residuals_incumbency = p2,
    residuals_interaction = p3,
    combined = combined
  ))
}

# Generate diagnostic plots for each model
model1_plots <- create_diagnostic_plots(model1, "Log Receipts Model", "viridis")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

model2_plots <- create_diagnostic_plots(model2, "Log Individual Contributions Model", "plasma")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'

model3_plots <- create_diagnostic_plots(model3, "Log Disbursements Model", "inferno")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'

# Generate residual pattern plots
model1_pattern_plots <- create_residual_pattern_plots(model1, "Log Receipts Model")

model2_pattern_plots <- create_residual_pattern_plots(model2, "Log Individual Contributions Model")

model3_pattern_plots <- create_residual_pattern_plots(model3, "Log Disbursements Model")

# Display plots
# Uncomment whichever plots you want to see
print(model1_plots$combined_plot)

# print(model2_plots$combined_plot)
# print(model3_plots$combined_plot)
# print(model1_pattern_plots$combined)
# print(model2_pattern_plots$combined)
# print(model3_pattern_plots$combined)

# Save plots to PDF
save_diagnostic_plots <- function(filename = "campaign_finance_diagnostic_plots.pdf") {
  pdf(filename, width = 11, height = 8.5)
  
  # Title page
  plot.new()
  title <- "Campaign Finance Regression Diagnostics"
  subtitle <- paste("Generated:", format(Sys.time(), "%B %d, %Y"))
  text(0.5, 0.6, title, cex = 2, font = 2)
  text(0.5, 0.5, subtitle, cex = 1.2)
  
  # Print each set of diagnostic plots
  print(model1_plots$combined_plot)
  print(model2_plots$combined_plot)
  print(model3_plots$combined_plot)
  
  # Print each set of residual pattern plots
  print(model1_pattern_plots$combined)
  print(model2_pattern_plots$combined)
  print(model3_pattern_plots$combined)
  
  dev.off()
  cat("Diagnostic plots saved to", filename, "\n")
}

# Uncomment to save plots to PDF
# save_diagnostic_plots()

diagnostic-plots-script.r

lemmi

2025-04-24