# CAMPAIGN FINANCE DIAGNOSTIC PLOTS
# Standalone script for comprehensive regression diagnostic visualizations
# Load necessary packages
if (!require("ggplot2")) install.packages("ggplot2")
## Loading required package: ggplot2
if (!require("gridExtra")) install.packages("gridExtra")
## Loading required package: gridExtra
if (!require("cowplot")) install.packages("cowplot")
## Loading required package: cowplot
if (!require("viridis")) install.packages("viridis")
## Loading required package: viridis
## Loading required package: viridisLite
if (!require("car")) install.packages("car")
## Loading required package: car
## Loading required package: carData
library(ggplot2) # For creating plots
library(gridExtra) # For arranging multiple plots
library(cowplot) # For plot themes and grid arrangement
library(viridis) # For better color palettes
library(car) # For diagnostic functions
# Set global theme for all plots
theme_set(theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 12, color = "gray30"),
axis.title = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
panel.grid.major = element_line(color = "gray90"),
panel.grid.minor = element_line(color = "gray95"),
plot.background = element_rect(fill = "white", color = NA),
legend.background = element_rect(fill = "white", color = NA)))
# Create or load your data
# OPTION 1: Comment out this block if loading your own data
set.seed(123)
n <- 500
campaign_data <- data.frame(
receipts = rlnorm(n, meanlog = 10, sdlog = 1),
individual_contributions = rlnorm(n, meanlog = 9.5, sdlog = 1.2),
disbursements = rlnorm(n, meanlog = 9.8, sdlog = 0.9),
party = factor(sample(c("Democrat", "Republican"), n, replace = TRUE)),
incumbency = factor(sample(c("Incumbent", "Challenger", "Open Seat"), n, replace = TRUE))
)
# OPTION 2: Uncomment to load your own data
# campaign_data <- read.csv("your_data_file.csv")
# Create necessary variables for analysis
campaign_data$log_receipts <- log(campaign_data$receipts + 1) # Add 1 to handle zeros
campaign_data$log_individual_contributions <- log(campaign_data$individual_contributions + 1)
campaign_data$log_disbursements <- log(campaign_data$disbursements + 1)
# Create dummy variables for regression (if needed)
campaign_data$republican <- ifelse(campaign_data$party == "Republican", 1, 0)
campaign_data$challenger <- ifelse(campaign_data$incumbency == "Challenger", 1, 0)
campaign_data$open_seat <- ifelse(campaign_data$incumbency == "Open Seat", 1, 0)
# Create the regression models
model1 <- lm(log_receipts ~ party + incumbency + party:incumbency, data = campaign_data)
model2 <- lm(log_individual_contributions ~ party + incumbency + party:incumbency, data = campaign_data)
model3 <- lm(log_disbursements ~ party + incumbency + party:incumbency, data = campaign_data)
# Function to create enhanced diagnostic plots for a model
create_diagnostic_plots <- function(model, model_name, color_palette = "viridis") {
# Extract diagnostic information
model_data <- data.frame(
fitted = fitted(model),
residuals = residuals(model),
std_resid = rstandard(model),
stud_resid = rstudent(model),
cooks_dist = cooks.distance(model),
leverage = hatvalues(model),
sqrt_abs_resid = sqrt(abs(rstudent(model))),
index = 1:length(residuals(model))
)
# Add identifier for outliers and influential points
model_data$outlier <- abs(model_data$stud_resid) > 3
model_data$influential <- model_data$cooks_dist > (4/length(model_data$residuals))
# Plot 1: Residuals vs Fitted with improved formatting
p1 <- ggplot(model_data, aes(x = fitted, y = residuals)) +
geom_point(aes(color = abs(residuals)), alpha = 0.7) +
geom_smooth(method = "loess", se = FALSE, color = "#3366FF", linewidth = 1) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red", linewidth = 0.8) +
scale_color_viridis_c(option = color_palette, end = 0.8) +
labs(title = paste("Residuals vs Fitted Values -", model_name),
subtitle = "Points should be randomly distributed around zero line",
x = "Fitted values",
y = "Residuals",
color = "|Residual|") +
theme(legend.position = "right")
# Plot 2: Enhanced Q-Q plot
p2 <- ggplot(model_data, aes(sample = residuals)) +
stat_qq(aes(color = abs(residuals)), alpha = 0.7) +
stat_qq_line(linewidth = 0.8, color = "red") +
scale_color_viridis_c(option = color_palette, end = 0.8) +
labs(title = paste("Normal Q-Q Plot -", model_name),
subtitle = "Points should follow the diagonal line",
x = "Theoretical Quantiles",
y = "Sample Quantiles",
color = "|Residual|") +
theme(legend.position = "right")
# Plot 3: Enhanced Scale-Location plot
p3 <- ggplot(model_data, aes(x = fitted, y = sqrt_abs_resid)) +
geom_point(aes(color = sqrt_abs_resid), alpha = 0.7) +
geom_smooth(method = "loess", se = FALSE, color = "#3366FF", linewidth = 1) +
scale_color_viridis_c(option = color_palette, end = 0.8) +
labs(title = paste("Scale-Location Plot -", model_name),
subtitle = "Checking homoscedasticity of residuals",
x = "Fitted values",
y = "√|Studentized residuals|",
color = "√|Residual|") +
theme(legend.position = "right")
# Plot 4: Enhanced Cook's distance plot with threshold line
threshold <- 4/length(model_data$residuals)
p4 <- ggplot(model_data, aes(x = index, y = cooks_dist)) +
geom_segment(aes(x = index, xend = index, y = 0, yend = cooks_dist),
color = "#3366FF", alpha = 0.6) +
geom_point(aes(color = cooks_dist > threshold), size = 2) +
geom_hline(yintercept = threshold, linetype = "dashed", color = "red", linewidth = 0.8) +
scale_color_manual(values = c("FALSE" = "#3366FF", "TRUE" = "#FF0000"),
labels = c("Normal", "Influential")) +
labs(title = paste("Cook's Distance -", model_name),
subtitle = paste("Influential points above threshold:", threshold),
x = "Observation Index",
y = "Cook's distance",
color = "Influential") +
theme(legend.position = "right")
# Plot 5: Enhanced Residuals vs Leverage with contours
p5 <- ggplot(model_data, aes(x = leverage, y = stud_resid)) +
geom_point(aes(color = cooks_dist), size = 2, alpha = 0.7) +
geom_hline(yintercept = c(-3, 0, 3), linetype = c("dashed", "solid", "dashed"),
color = c("red", "black", "red"), linewidth = c(0.8, 0.5, 0.8)) +
scale_color_viridis_c(option = color_palette) +
labs(title = paste("Residuals vs Leverage -", model_name),
subtitle = "Identifying observations with high influence",
x = "Leverage",
y = "Studentized Residuals",
color = "Cook's Dist") +
theme(legend.position = "right")
# Residual histogram with density curve
p6 <- ggplot(model_data, aes(x = residuals)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "#3366FF", alpha = 0.7) +
geom_density(color = "red", linewidth = 1) +
labs(title = paste("Residual Distribution -", model_name),
subtitle = "Should approximate a normal distribution",
x = "Residuals", y = "Density") +
theme(legend.position = "none")
# Combine plots into a single grid
combined <- plot_grid(
p1, p2, p3, p4, p5, p6,
ncol = 2, align = "hv", labels = "AUTO"
)
# Add overall title
titled_plot <- plot_grid(
ggdraw() + draw_label(paste("Diagnostic Plots:", model_name),
fontface = "bold", size = 16),
combined,
ncol = 1,
rel_heights = c(0.1, 1)
)
# Return list with individual plots and combined plot
return(list(
residuals_fitted = p1,
qq_plot = p2,
scale_location = p3,
cooks_distance = p4,
residuals_leverage = p5,
residuals_histogram = p6,
combined_plot = titled_plot
))
}
# Create residual pattern check plots
create_residual_pattern_plots <- function(model, model_name) {
# Extract data
model_data <- data.frame(
fitted = fitted(model),
residuals = residuals(model),
party = campaign_data$party,
incumbency = campaign_data$incumbency
)
# Residuals by party
p1 <- ggplot(model_data, aes(x = party, y = residuals, fill = party)) +
geom_boxplot(alpha = 0.8) +
scale_fill_manual(values = c("Democrat" = "#3366FF", "Republican" = "#FF3333")) +
labs(title = paste("Residuals by Party -", model_name),
x = "Party", y = "Residuals") +
theme(legend.position = "none")
# Residuals by incumbency
p2 <- ggplot(model_data, aes(x = incumbency, y = residuals, fill = incumbency)) +
geom_boxplot(alpha = 0.8) +
scale_fill_viridis_d() +
labs(title = paste("Residuals by Incumbency Status -", model_name),
x = "Incumbency Status", y = "Residuals") +
theme(legend.position = "none")
# Residuals by interaction
p3 <- ggplot(model_data, aes(x = interaction(party, incumbency), y = residuals, fill = party)) +
geom_boxplot(alpha = 0.8) +
scale_fill_manual(values = c("Democrat" = "#3366FF", "Republican" = "#FF3333")) +
labs(title = paste("Residuals by Party-Incumbency Interaction -", model_name),
x = "Party-Incumbency Combination", y = "Residuals") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Combine plots
combined <- grid.arrange(
p1, p2, p3,
ncol = 2,
top = paste("Residual Patterns:", model_name)
)
# Return the plots
return(list(
residuals_party = p1,
residuals_incumbency = p2,
residuals_interaction = p3,
combined = combined
))
}
# Generate diagnostic plots for each model
model1_plots <- create_diagnostic_plots(model1, "Log Receipts Model", "viridis")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
model2_plots <- create_diagnostic_plots(model2, "Log Individual Contributions Model", "plasma")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
model3_plots <- create_diagnostic_plots(model3, "Log Disbursements Model", "inferno")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
# Generate residual pattern plots
model1_pattern_plots <- create_residual_pattern_plots(model1, "Log Receipts Model")

model2_pattern_plots <- create_residual_pattern_plots(model2, "Log Individual Contributions Model")

model3_pattern_plots <- create_residual_pattern_plots(model3, "Log Disbursements Model")

# Display plots
# Uncomment whichever plots you want to see
print(model1_plots$combined_plot)

# print(model2_plots$combined_plot)
# print(model3_plots$combined_plot)
# print(model1_pattern_plots$combined)
# print(model2_pattern_plots$combined)
# print(model3_pattern_plots$combined)
# Save plots to PDF
save_diagnostic_plots <- function(filename = "campaign_finance_diagnostic_plots.pdf") {
pdf(filename, width = 11, height = 8.5)
# Title page
plot.new()
title <- "Campaign Finance Regression Diagnostics"
subtitle <- paste("Generated:", format(Sys.time(), "%B %d, %Y"))
text(0.5, 0.6, title, cex = 2, font = 2)
text(0.5, 0.5, subtitle, cex = 1.2)
# Print each set of diagnostic plots
print(model1_plots$combined_plot)
print(model2_plots$combined_plot)
print(model3_plots$combined_plot)
# Print each set of residual pattern plots
print(model1_pattern_plots$combined)
print(model2_pattern_plots$combined)
print(model3_pattern_plots$combined)
dev.off()
cat("Diagnostic plots saved to", filename, "\n")
}
# Uncomment to save plots to PDF
# save_diagnostic_plots()