regression-diagnostics.r

# COMPLETE CAMPAIGN FINANCE REGRESSION DIAGNOSTICS
# This script performs comprehensive diagnostic checks on regression models
# analyzing women candidates' campaign finance data

# Load necessary packages
if (!require("car")) install.packages("car")

## Loading required package: car

## Loading required package: carData

if (!require("lmtest")) install.packages("lmtest")

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

if (!require("MASS")) install.packages("MASS")

## Loading required package: MASS

if (!require("ggplot2")) install.packages("ggplot2")

## Loading required package: ggplot2

if (!require("sandwich")) install.packages("sandwich")

## Loading required package: sandwich

if (!require("gridExtra")) install.packages("gridExtra")

## Loading required package: gridExtra

if (!require("corrplot")) install.packages("corrplot")

## Loading required package: corrplot

## corrplot 0.95 loaded

if (!require("gvlma")) install.packages("gvlma")

## Loading required package: gvlma

library(car)       # For VIF and diagnostic tests
library(lmtest)    # For heteroscedasticity tests
library(MASS)      # For boxcox transformations
library(ggplot2)   # For visualizations
library(sandwich)  # For robust standard errors
library(gridExtra) # For arranging multiple plots
library(corrplot)  # For correlation visualization
library(gvlma)     # For global model validation

# =====================================================================
# STEP 1: DATA PREPARATION AND MODEL DEFINITION
# =====================================================================

# Load your data
# Replace this with your actual data file path
# campaign_data <- read.csv("your_data_file.csv")

# For example purposes, creating a simple dummy dataset
set.seed(123)
n <- 500
campaign_data <- data.frame(
  receipts = rlnorm(n, meanlog = 10, sdlog = 1),
  individual_contributions = rlnorm(n, meanlog = 9.5, sdlog = 1.2),
  disbursements = rlnorm(n, meanlog = 9.8, sdlog = 0.9),
  party = factor(sample(c("Democrat", "Republican"), n, replace = TRUE)),
  incumbency = factor(sample(c("Incumbent", "Challenger", "Open Seat"), n, replace = TRUE))
)

# Display data structure
str(campaign_data)

## 'data.frame':    500 obs. of  5 variables:
##  $ receipts                : num  12576 17498 104685 23636 25067 ...
##  $ individual_contributions: num  6488 4054 45805 32902 2184 ...
##  $ disbursements           : num  7360 7073 17744 16011 1818 ...
##  $ party                   : Factor w/ 2 levels "Democrat","Republican": 2 1 2 2 1 2 1 1 1 2 ...
##  $ incumbency              : Factor w/ 3 levels "Challenger","Incumbent",..: 3 2 1 2 1 2 3 3 1 1 ...

# Check for missing values
cat("Missing values in dataset:\n")

## Missing values in dataset:

print(colSums(is.na(campaign_data)))

##                 receipts individual_contributions            disbursements 
##                        0                        0                        0 
##                    party               incumbency 
##                        0                        0

# Create dummy variables (if needed)
campaign_data$republican <- ifelse(campaign_data$party == "Republican", 1, 0)
campaign_data$challenger <- ifelse(campaign_data$incumbency == "Challenger", 1, 0)
campaign_data$open_seat <- ifelse(campaign_data$incumbency == "Open Seat", 1, 0)

# Create interaction terms (if needed)
campaign_data$rep_challenger <- campaign_data$republican * campaign_data$challenger
campaign_data$rep_open_seat <- campaign_data$republican * campaign_data$open_seat

# Create log transformed variables - THIS WAS MISSING IN THE ORIGINAL SCRIPT
campaign_data$log_receipts <- log(campaign_data$receipts + 1)  # Add 1 to handle zeros
campaign_data$log_individual_contributions <- log(campaign_data$individual_contributions + 1)
campaign_data$log_disbursements <- log(campaign_data$disbursements + 1)

# Create regression models
# Model 1: Log Receipts
model1 <- lm(log_receipts ~ party + incumbency + party:incumbency, data = campaign_data)

# Model 2: Log Individual Contributions
model2 <- lm(log_individual_contributions ~ party + incumbency + party:incumbency, data = campaign_data)

# Model 3: Log Disbursements
model3 <- lm(log_disbursements ~ party + incumbency + party:incumbency, data = campaign_data)

# =====================================================================
# STEP 2: INITIAL MODEL SUMMARIES
# =====================================================================

# Display model summaries
cat("\n\n=======================================================\n")

## 
## 
## =======================================================

cat("MODEL SUMMARIES\n")

## MODEL SUMMARIES

cat("=======================================================\n\n")

## =======================================================

cat("Model 1: Log Receipts\n")

## Model 1: Log Receipts

print(summary(model1))

## 
## Call:
## lm(formula = log_receipts ~ party + incumbency + party:incumbency, 
##     data = campaign_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8107 -0.6082 -0.0282  0.6308  3.0728 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         10.07180    0.10044 100.276   <2e-16 ***
## partyRepublican                      0.09641    0.14531   0.664    0.507    
## incumbencyIncumbent                 -0.07489    0.15134  -0.495    0.621    
## incumbencyOpen Seat                 -0.05964    0.14403  -0.414    0.679    
## partyRepublican:incumbencyIncumbent -0.03218    0.21581  -0.149    0.882    
## partyRepublican:incumbencyOpen Seat -0.22231    0.20817  -1.068    0.286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9738 on 494 degrees of freedom
## Multiple R-squared:  0.007778,   Adjusted R-squared:  -0.002265 
## F-statistic: 0.7745 on 5 and 494 DF,  p-value: 0.5684

cat("\nModel 2: Log Individual Contributions\n")

## 
## Model 2: Log Individual Contributions

print(summary(model2))

## 
## Call:
## lm(formula = log_individual_contributions ~ party + incumbency + 
##     party:incumbency, data = campaign_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2790 -0.7869 -0.0127  0.8059  3.2915 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          9.40941    0.12497  75.291   <2e-16 ***
## partyRepublican                      0.36183    0.18080   2.001   0.0459 *  
## incumbencyIncumbent                 -0.03518    0.18830  -0.187   0.8519    
## incumbencyOpen Seat                  0.02919    0.17920   0.163   0.8707    
## partyRepublican:incumbencyIncumbent -0.28554    0.26852  -1.063   0.2881    
## partyRepublican:incumbencyOpen Seat -0.27178    0.25902  -1.049   0.2946    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.212 on 494 degrees of freedom
## Multiple R-squared:  0.01206,    Adjusted R-squared:  0.002059 
## F-statistic: 1.206 on 5 and 494 DF,  p-value: 0.3051

cat("\nModel 3: Log Disbursements\n")

## 
## Model 3: Log Disbursements

print(summary(model3))

## 
## Call:
## lm(formula = log_disbursements ~ party + incumbency + party:incumbency, 
##     data = campaign_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.56174 -0.60054  0.01358  0.57120  3.10302 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          9.74832    0.09223 105.694   <2e-16 ***
## partyRepublican                      0.06444    0.13343   0.483    0.629    
## incumbencyIncumbent                  0.18826    0.13897   1.355    0.176    
## incumbencyOpen Seat                  0.05859    0.13225   0.443    0.658    
## partyRepublican:incumbencyIncumbent -0.28297    0.19817  -1.428    0.154    
## partyRepublican:incumbencyOpen Seat  0.06301    0.19116   0.330    0.742    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8942 on 494 degrees of freedom
## Multiple R-squared:  0.008419,   Adjusted R-squared:  -0.001617 
## F-statistic: 0.8388 on 5 and 494 DF,  p-value: 0.5225

# =====================================================================
# STEP 3: EXPLORATORY DATA ANALYSIS
# =====================================================================

# Correlation matrix of numeric variables
numeric_vars <- sapply(campaign_data, is.numeric)
if(sum(numeric_vars) > 1) {
  cor_matrix <- cor(campaign_data[, numeric_vars], use = "complete.obs")
  corrplot(cor_matrix, method = "circle", type = "upper", 
           tl.col = "black", tl.srt = 45, addCoef.col = "black")
}

# Boxplots of dependent variables by party and incumbency
p1 <- ggplot(campaign_data, aes(x = incumbency, y = log_receipts, fill = party)) +
  geom_boxplot() +
  labs(title = "Log Receipts by Party and Incumbency Status",
       x = "Incumbency Status", y = "Log Receipts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2 <- ggplot(campaign_data, aes(x = incumbency, y = log_individual_contributions, fill = party)) +
  geom_boxplot() +
  labs(title = "Log Individual Contributions by Party and Incumbency Status",
       x = "Incumbency Status", y = "Log Individual Contributions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p3 <- ggplot(campaign_data, aes(x = incumbency, y = log_disbursements, fill = party)) +
  geom_boxplot() +
  labs(title = "Log Disbursements by Party and Incumbency Status",
       x = "Incumbency Status", y = "Log Disbursements") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

grid.arrange(p1, p2, p3, ncol = 1)

# =====================================================================
# STEP 4: COMPREHENSIVE DIAGNOSTIC FUNCTION
# =====================================================================

run_diagnostics <- function(model, model_name) {
  cat("\n\n=======================================================\n")
  cat("DIAGNOSTIC TESTS FOR:", model_name, "\n")
  cat("=======================================================\n\n")
  
  # 1. Check for multicollinearity using VIF
  cat("1. MULTICOLLINEARITY CHECK (VIF)\n")
  tryCatch({
    vif_values <- vif(model)
    print(vif_values)
    
    cat("\nInterpretation:\n")
    if(any(vif_values > 10)) {
      cat("WARNING: Severe multicollinearity detected (VIF > 10)\n")
      cat("Variables with severe multicollinearity:", 
          names(vif_values)[vif_values > 10], "\n")
    } else if(any(vif_values > 5)) {
      cat("CAUTION: Moderate multicollinearity detected (VIF > 5)\n")
      cat("Variables with moderate multicollinearity:", 
          names(vif_values)[vif_values > 5], "\n")
    } else {
      cat("No problematic multicollinearity detected.\n")
    }
  }, error = function(e) {
    cat("Error in VIF calculation:", e$message, "\n")
    cat("This may be due to perfect multicollinearity or singular design matrix.\n")
  })
  
  # 2. Check for heteroscedasticity
  cat("\n2. HETEROSCEDASTICITY TESTS\n")
  
  # Breusch-Pagan test
  bp_test <- bptest(model)
  print(bp_test)
  
  cat("\nInterpretation:\n")
  if(bp_test$p.value < 0.05) {
    cat("WARNING: Heteroscedasticity detected (p < 0.05)\n")
    cat("Consider using robust standard errors or transforming variables.\n")
  } else {
    cat("No heteroscedasticity detected using Breusch-Pagan test.\n")
  }
  
  # 3. Normality of residuals
  cat("\n3. NORMALITY OF RESIDUALS\n")
  
  # Shapiro-Wilk test
  sw_test <- shapiro.test(residuals(model))
  print(sw_test)
  
  cat("\nInterpretation:\n")
  if(sw_test$p.value < 0.05) {
    cat("WARNING: Non-normal residuals detected (p < 0.05)\n")
    cat("Consider transforming dependent variable or using robust methods.\n")
  } else {
    cat("Residuals appear to be normally distributed.\n")
  }
  
  # 4. Influence measures
  cat("\n4. INFLUENTIAL OBSERVATIONS\n")
  
  # Calculate Cook's distance
  cooks_d <- cooks.distance(model)
  
  # Identify influential observations (Cook's D > 4/n)
  n <- length(residuals(model))
  threshold <- 4/n
  influential <- which(cooks_d > threshold)
  
  cat("Cook's Distance threshold (4/n):", threshold, "\n")
  cat("Number of influential observations:", length(influential), "\n")
  
  if(length(influential) > 0) {
    cat("Influential observations (row numbers):\n")
    print(influential)
    
    # Look at the actual influential observations
    cat("\nDetails of influential observations:\n")
    if(exists("campaign_data")) {
      print(campaign_data[influential, ])
    } else {
      cat("Data frame not available for inspection.\n")
    }
  }
  
  # 5. Outliers in the response (Studentized residuals)
  cat("\n5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)\n")
  
  # Calculate studentized residuals
  stud_resid <- rstudent(model)
  
  # Identify outliers (|stud_resid| > 3)
  outliers <- which(abs(stud_resid) > 3)
  
  cat("Number of outliers (|studentized residual| > 3):", length(outliers), "\n")
  
  if(length(outliers) > 0) {
    cat("Outlier observations (row numbers):\n")
    print(outliers)
    
    # Look at the actual outliers
    cat("\nDetails of outlier observations:\n")
    if(exists("campaign_data")) {
      print(campaign_data[outliers, ])
    } else {
      cat("Data frame not available for inspection.\n")
    }
  }
  
  # 6. Global model validation
  cat("\n6. GLOBAL MODEL VALIDATION\n")
  
  # Global test
  tryCatch({
    global_test <- gvlma(model)
    print(global_test)
  }, error = function(e) {
    cat("Error in global validation:", e$message, "\n")
    cat("This may be due to near singularities or other model issues.\n")
  })
  
  # Return diagnostic data for plots
  return(list(
    model = model,
    cooks_d = cooks_d,
    stud_resid = stud_resid,
    fitted = fitted(model),
    residuals = residuals(model),
    bp_test = bp_test,
    sw_test = sw_test,
    influential = influential,
    outliers = outliers
  ))
}

# =====================================================================
# STEP 5: DIAGNOSTIC PLOTS FUNCTION
# =====================================================================

create_diagnostic_plots <- function(diag_data, model_name) {
  # Create data frame for plots
  plot_data <- data.frame(
    fitted = diag_data$fitted,
    residuals = diag_data$residuals,
    sqrt_abs_resid = sqrt(abs(diag_data$stud_resid)),
    stud_resid = diag_data$stud_resid,
    cooks_dist = diag_data$cooks_d,
    index = 1:length(diag_data$residuals)
  )
  
  # 1. Residuals vs Fitted plot
  p1 <- ggplot(plot_data, aes(x = fitted, y = residuals)) +
    geom_point(alpha = 0.6) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(title = paste("Residuals vs Fitted Values -", model_name),
         x = "Fitted values", y = "Residuals") +
    theme_minimal()
  
  # 2. Q-Q plot of residuals
  p2 <- ggplot(plot_data, aes(sample = residuals)) +
    stat_qq() +
    stat_qq_line(color = "red") +
    labs(title = paste("Normal Q-Q Plot -", model_name),
         x = "Theoretical Quantiles", y = "Sample Quantiles") +
    theme_minimal()
  
  # 3. Scale-Location plot (sqrt of abs standardized residuals vs fitted)
  p3 <- ggplot(plot_data, aes(x = fitted, y = sqrt_abs_resid)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(title = paste("Scale-Location Plot -", model_name),
         x = "Fitted values", 
         y = "√|Studentized residuals|") +
    theme_minimal()
  
  # 4. Cook's distance plot
  p4 <- ggplot(plot_data, aes(x = index, y = cooks_dist)) +
    geom_bar(stat = "identity", alpha = 0.6) +
    geom_hline(yintercept = 4/length(diag_data$residuals), 
               linetype = "dashed", color = "red") +
    labs(title = paste("Cook's Distance -", model_name),
         x = "Observation", y = "Cook's distance") +
    theme_minimal()
  
  # 5. Studentized Residuals vs Leverage plot
  p5 <- ggplot(cbind(plot_data, hatvalues = hatvalues(diag_data$model)), 
               aes(x = hatvalues, y = stud_resid)) +
    geom_point(alpha = 0.6) +
    geom_hline(yintercept = c(-3, 0, 3), linetype = "dashed", color = c("red", "black", "red")) +
    geom_smooth(method = "loess", se = FALSE, color = "blue") +
    labs(title = paste("Studentized Residuals vs Leverage -", model_name),
         x = "Leverage", y = "Studentized Residuals") +
    theme_minimal()
  
  # Combine all plots
  combined_plot <- grid.arrange(p1, p2, p3, p4, p5, ncol = 2)
  
  # Return all plots
  return(list(
    p1 = p1,
    p2 = p2, 
    p3 = p3,
    p4 = p4,
    p5 = p5,
    combined = combined_plot
  ))
}

# =====================================================================
# STEP 6: BOXCOX TRANSFORMATION FUNCTION
# =====================================================================

try_boxcox <- function(model, model_name) {
  cat("\n\n=======================================================\n")
  cat("BOX-COX TRANSFORMATION FOR:", model_name, "\n")
  cat("=======================================================\n\n")
  
  # Run Box-Cox analysis
  tryCatch({
    bc <- boxcox(model)
    
    # Find optimal lambda
    lambda <- bc$x[which.max(bc$y)]
    cat("Optimal lambda:", lambda, "\n")
    
    if(abs(lambda - 1) < 0.05) {
      cat("Interpretation: No transformation needed (lambda ≈ 1)\n")
    } else if(abs(lambda) < 0.05) {
      cat("Interpretation: Log transformation suggested (lambda ≈ 0)\n")
    } else if(abs(lambda - 0.5) < 0.05) {
      cat("Interpretation: Square root transformation suggested (lambda ≈ 0.5)\n")
    } else if(abs(lambda + 1) < 0.05) {
      cat("Interpretation: Reciprocal transformation suggested (lambda ≈ -1)\n")
    } else {
      cat("Interpretation: Power transformation with lambda =", lambda, "suggested\n")
    }
    
    return(lambda)
  }, error = function(e) {
    cat("Error in Box-Cox transformation:", e$message, "\n")
    cat("This may occur if the response variable contains negative or zero values.\n")
    return(NA)
  })
}

# =====================================================================
# STEP 7: ROBUST STANDARD ERRORS FUNCTION
# =====================================================================

calculate_robust_se <- function(model, model_name) {
  cat("\n\n=======================================================\n")
  cat("ROBUST STANDARD ERRORS FOR:", model_name, "\n")
  cat("=======================================================\n\n")
  
  # Calculate robust standard errors using HC3 (best for heteroscedasticity)
  robust_se <- coeftest(model, vcov = hccm(model, type = "hc3"))
  print(robust_se)
  
  # Compare with original model
  cat("\nComparison with original model coefficients:\n")
  original_coef <- summary(model)$coefficients
  robust_coef <- robust_se
  
  comparison <- data.frame(
    Original_Estimate = original_coef[, 1],
    Original_SE = original_coef[, 2],
    Original_p = original_coef[, 4],
    Robust_SE = robust_coef[, 2],
    Robust_p = robust_coef[, 4]
  )
  
  print(comparison)
  
  return(robust_se)
}

# =====================================================================
# STEP 8: RUNNING ALL DIAGNOSTICS
# =====================================================================

# Run diagnostics for Model 1 (Log Receipts)
model1_diag <- run_diagnostics(model1, "Log Receipts Model")

## 
## 
## =======================================================
## DIAGNOSTIC TESTS FOR: Log Receipts Model 
## =======================================================
## 
## 1. MULTICOLLINEARITY CHECK (VIF)

## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif

##                      GVIF Df GVIF^(1/(2*Df))
## party            2.781094  1        1.667661
## incumbency       3.807750  2        1.396906
## party:incumbency 7.376954  2        1.648046
## 
## Interpretation:
## CAUTION: Moderate multicollinearity detected (VIF > 5)
## Variables with moderate multicollinearity: 
## 
## 2. HETEROSCEDASTICITY TESTS
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 4.4652, df = 5, p-value = 0.4846
## 
## 
## Interpretation:
## No heteroscedasticity detected using Breusch-Pagan test.
## 
## 3. NORMALITY OF RESIDUALS
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.9979, p-value = 0.7981
## 
## 
## Interpretation:
## Residuals appear to be normally distributed.
## 
## 4. INFLUENTIAL OBSERVATIONS
## Cook's Distance threshold (4/n): 0.008 
## Number of influential observations: 23 
## Influential observations (row numbers):
##  18  44  72  97 135 139 164 174 196 201 265 310 336 352 359 360 371 392 416 427 
##  18  44  72  97 135 139 164 174 196 201 265 310 336 352 359 360 371 392 416 427 
## 439 449 456 
## 439 449 456 
## 
## Details of influential observations:
##       receipts individual_contributions disbursements      party incumbency
## 18    3082.150                19380.455      5792.328 Republican  Incumbent
## 44  192712.740                 3175.320     14260.532 Republican  Open Seat
## 72    2188.192                 2326.781     12903.445   Democrat  Incumbent
## 97  196286.969                11469.214     26774.825   Democrat Challenger
## 135   2826.382                 2631.736      7181.866   Democrat Challenger
## 139 148613.398                28662.612     74699.674   Democrat  Incumbent
## 164 563002.831                12912.175     16257.351 Republican Challenger
## 174 185063.058                 6903.139     20954.384 Republican Challenger
## 196 162301.888               255166.765     20929.432   Democrat  Incumbent
## 201 198552.802                 5575.499     31503.851 Republican  Incumbent
## 265 218180.717                20795.470     19268.728   Democrat Challenger
## 310 168986.485                11048.469     18027.503 Republican  Open Seat
## 336   2829.279                18782.008      3370.107   Democrat  Open Seat
## 352   3002.653                11885.863      4963.141   Democrat Challenger
## 359   1870.763                15931.446      9031.057 Republican  Open Seat
## 360 288213.887                93746.110     26046.525 Republican Challenger
## 371 246908.562                 3404.860     11588.114 Republican  Open Seat
## 392   2546.191                26224.508      6861.774   Democrat  Open Seat
## 416   1566.895                49807.964     22866.435 Republican Challenger
## 427 242183.863                11913.350     17969.119   Democrat  Open Seat
## 439   2380.373                10327.800     11985.797 Republican  Open Seat
## 449   2828.956                 9636.170     29618.273 Republican Challenger
## 456   1539.291                 1907.922     28540.749 Republican  Open Seat
##     republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 18           1          0         0              0             0     8.033707
## 44           1          0         1              0             1    12.168961
## 72           0          0         0              0             0     7.691288
## 97           0          1         0              0             0    12.187338
## 135          0          1         0              0             0     7.947107
## 139          0          0         0              0             0    11.909110
## 164          1          1         0              1             0    13.241042
## 174          1          1         0              1             0    12.128457
## 196          0          0         0              0             0    11.997220
## 201          1          0         0              0             0    12.198815
## 265          0          1         0              0             0    12.293084
## 310          1          0         1              0             1    12.037580
## 336          0          0         1              0             0     7.948131
## 352          0          1         0              0             0     8.007584
## 359          1          0         1              0             1     7.534636
## 360          1          1         0              1             0    12.571462
## 371          1          0         1              0             1    12.416777
## 392          0          0         1              0             0     7.842746
## 416          1          1         0              1             0     7.357489
## 427          0          0         1              0             0    12.397457
## 439          1          0         1              0             1     7.775432
## 449          1          1         0              1             0     7.948016
## 456          1          0         1              0             1     7.339727
##     log_individual_contributions log_disbursements
## 18                      9.872072          8.664462
## 44                      8.063479          9.565321
## 72                      7.752671          9.465327
## 97                      9.347509         10.195255
## 135                     7.875779          8.879454
## 139                    10.263384         11.221244
## 164                     9.466003          9.696362
## 174                     8.839876          9.950151
## 196                    12.449677          9.948959
## 201                     8.626316         10.357897
## 265                     9.942539          9.866291
## 310                     9.310138          9.799709
## 336                     9.840708          8.122996
## 352                     9.383189          8.509996
## 359                     9.676113          9.108535
## 360                    11.448356         10.167678
## 371                     8.133253          9.357822
## 392                    10.174488          8.833867
## 416                    10.815950         10.037469
## 427                     9.385499          9.796466
## 439                     9.242691          9.391561
## 449                     9.173383         10.296181
## 456                     7.554294         10.259123
## 
## 5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)
## Number of outliers (|studentized residual| > 3): 1 
## Outlier observations (row numbers):
## 164 
## 164 
## 
## Details of outlier observations:
##     receipts individual_contributions disbursements      party incumbency
## 164 563002.8                 12912.17      16257.35 Republican Challenger
##     republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 164          1          1         0              1             0     13.24104
##     log_individual_contributions log_disbursements
## 164                     9.466003          9.696362
## 
## 6. GLOBAL MODEL VALIDATION
## 
## Call:
## lm(formula = log_receipts ~ party + incumbency + party:incumbency, 
##     data = campaign_data)
## 
## Coefficients:
##                         (Intercept)                      partyRepublican  
##                            10.07180                              0.09641  
##                 incumbencyIncumbent                  incumbencyOpen Seat  
##                            -0.07489                             -0.05964  
## partyRepublican:incumbencyIncumbent  partyRepublican:incumbencyOpen Seat  
##                            -0.03218                             -0.22231  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = model) 
## 
##                         Value p-value                Decision
## Global Stat         1.730e+00  0.7853 Assumptions acceptable.
## Skewness            6.976e-01  0.4036 Assumptions acceptable.
## Kurtosis            1.261e-01  0.7225 Assumptions acceptable.
## Link Function      -6.637e-13  1.0000 Assumptions acceptable.
## Heteroscedasticity  9.059e-01  0.3412 Assumptions acceptable.

model1_plots <- create_diagnostic_plots(model1_diag, "Log Receipts Model")

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(model1_plots$combined)

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]

robust_se1 <- calculate_robust_se(model1, "Log Receipts Model")

## 
## 
## =======================================================
## ROBUST STANDARD ERRORS FOR: Log Receipts Model 
## =======================================================
## 
## 
## t test of coefficients:
## 
##                                      Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                         10.071799   0.091172 110.4708   <2e-16 ***
## partyRepublican                      0.096415   0.144831   0.6657   0.5059    
## incumbencyIncumbent                 -0.074885   0.140473  -0.5331   0.5942    
## incumbencyOpen Seat                 -0.059635   0.136694  -0.4363   0.6628    
## partyRepublican:incumbencyIncumbent -0.032182   0.214220  -0.1502   0.8806    
## partyRepublican:incumbencyOpen Seat -0.222310   0.212285  -1.0472   0.2955    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Comparison with original model coefficients:
##                                     Original_Estimate Original_SE Original_p
## (Intercept)                               10.07179937   0.1004404  0.0000000
## partyRepublican                            0.09641466   0.1453100  0.5073134
## incumbencyIncumbent                       -0.07488510   0.1513377  0.6209464
## incumbencyOpen Seat                       -0.05963517   0.1440254  0.6790098
## partyRepublican:incumbencyIncumbent       -0.03218202   0.2158096  0.8815180
## partyRepublican:incumbencyOpen Seat       -0.22231004   0.2081697  0.2860751
##                                      Robust_SE  Robust_p
## (Intercept)                         0.09117159 0.0000000
## partyRepublican                     0.14483061 0.5059095
## incumbencyIncumbent                 0.14047273 0.5942086
## incumbencyOpen Seat                 0.13669384 0.6628328
## partyRepublican:incumbencyIncumbent 0.21422039 0.8806456
## partyRepublican:incumbencyOpen Seat 0.21228483 0.2955078

lambda1 <- try_boxcox(model1, "Log Receipts Model")

## 
## 
## =======================================================
## BOX-COX TRANSFORMATION FOR: Log Receipts Model 
## =======================================================

## Optimal lambda: 0.6666667 
## Interpretation: Power transformation with lambda = 0.6666667 suggested

# Run diagnostics for Model 2 (Log Individual Contributions)
model2_diag <- run_diagnostics(model2, "Log Individual Contributions Model")

## 
## 
## =======================================================
## DIAGNOSTIC TESTS FOR: Log Individual Contributions Model 
## =======================================================
## 
## 1. MULTICOLLINEARITY CHECK (VIF)

## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif

##                      GVIF Df GVIF^(1/(2*Df))
## party            2.781094  1        1.667661
## incumbency       3.807750  2        1.396906
## party:incumbency 7.376954  2        1.648046
## 
## Interpretation:
## CAUTION: Moderate multicollinearity detected (VIF > 5)
## Variables with moderate multicollinearity: 
## 
## 2. HETEROSCEDASTICITY TESTS
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 4.1899, df = 5, p-value = 0.5224
## 
## 
## Interpretation:
## No heteroscedasticity detected using Breusch-Pagan test.
## 
## 3. NORMALITY OF RESIDUALS
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.99576, p-value = 0.1967
## 
## 
## Interpretation:
## Residuals appear to be normally distributed.
## 
## 4. INFLUENTIAL OBSERVATIONS
## Cook's Distance threshold (4/n): 0.008 
## Number of influential observations: 26 
## Influential observations (row numbers):
##  35  49  91 116 130 151 165 196 243 246 247 249 259 308 311 342 355 366 376 411 
##  35  49  91 116 130 151 165 196 243 246 247 249 259 308 311 342 355 366 376 411 
## 418 429 462 467 472 481 
## 418 429 462 467 472 481 
## 
## Details of influential observations:
##       receipts individual_contributions disbursements      party incumbency
## 35   50090.221              138001.7246     41858.213   Democrat  Incumbent
## 49   48048.448              246784.8005     87254.240   Democrat  Open Seat
## 91   59486.451                 458.6422     88691.000   Democrat Challenger
## 116  29766.931                 588.7257     37897.285   Democrat  Incumbent
## 130  20510.494              163417.0697      4395.857   Democrat Challenger
## 151  48423.419                1304.7860     17132.397 Republican  Incumbent
## 165  14517.969                 898.8702     10042.071   Democrat  Incumbent
## 196 162301.888              255166.7649     20929.432   Democrat  Incumbent
## 243  98327.662              233167.4244     64983.638 Republican  Incumbent
## 246 147614.854                 941.2796     32393.094   Democrat  Open Seat
## 247  19910.950              337748.4964     21263.413   Democrat  Open Seat
## 249  11330.240              230886.5303     63845.148   Democrat  Open Seat
## 259  19660.368                1158.3838     16192.109 Republican  Open Seat
## 308  27227.436              178742.0556     19860.965 Republican  Open Seat
## 311  80916.739                 658.8518     60376.880 Republican  Open Seat
## 342  57469.254              334981.5716     67102.015 Republican Challenger
## 355   9016.328              216432.1254     40650.584 Republican  Open Seat
## 366  17857.999              285966.8964     15459.804 Republican Challenger
## 376  12216.209              138810.5326     31763.353   Democrat  Incumbent
## 411  36034.373              293766.2660     19121.358 Republican  Open Seat
## 418  33869.992                1325.8721     16240.727 Republican Challenger
## 429 112822.202              264500.2440     16479.873 Republican Challenger
## 462  15545.517              210985.1927     34773.272   Democrat  Open Seat
## 467  46207.621                1453.6496     16828.358 Republican Challenger
## 472  16622.590                1060.5575      5501.559 Republican  Incumbent
## 481  22481.560              212367.6773     59216.905 Republican  Incumbent
##     republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 35           0          0         0              0             0    10.821601
## 49           0          0         1              0             0    10.779986
## 91           0          1         0              0             0    10.993521
## 116          0          0         0              0             0    10.301187
## 130          0          1         0              0             0     9.928741
## 151          1          0         0              0             0    10.787759
## 165          0          0         0              0             0     9.583211
## 196          0          0         0              0             0    11.997220
## 243          1          0         0              0             0    11.496071
## 246          0          0         1              0             0    11.902369
## 247          0          0         1              0             0     9.899075
## 249          0          0         1              0             0     9.335319
## 259          1          0         1              0             1     9.886411
## 308          1          0         1              0             1    10.212017
## 311          1          0         1              0             1    11.301188
## 342          1          1         0              1             0    10.959023
## 355          1          0         1              0             1     9.106903
## 366          1          1         0              1             0     9.790263
## 376          0          0         0              0             0     9.410601
## 411          1          0         1              0             1    10.492256
## 418          1          1         0              1             0    10.430314
## 429          1          1         0              1             0    11.633577
## 462          0          0         1              0             0     9.651592
## 467          1          1         0              1             0    10.740922
## 472          1          0         0              0             0     9.718578
## 481          1          0         0              0             0    10.020495
##     log_individual_contributions log_disbursements
## 35                     11.835029         10.642067
## 49                     12.416276         11.376593
## 91                      6.130448         11.392925
## 116                     6.379658         10.542661
## 130                    12.004067          8.388645
## 151                     7.174560          9.748785
## 165                     6.802251          9.214638
## 196                    12.449677          9.948959
## 243                    12.359516         11.081906
## 246                     6.848302         10.385731
## 247                    12.730060          9.964790
## 249                    12.349686         11.064232
## 259                     7.055644          9.692341
## 308                    12.093705          9.896562
## 311                     6.492015         11.008378
## 342                    12.721834         11.113984
## 355                    12.285037         10.612793
## 366                    12.563635          9.646063
## 376                    11.840872         10.366100
## 411                    12.590543          9.858614
## 418                     7.190580          9.695339
## 429                    12.485601          9.709956
## 462                    12.259548         10.456633
## 467                     7.282520          9.730880
## 472                     6.967492          8.612969
## 481                    12.266079         10.988979
## 
## 5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)
## Number of outliers (|studentized residual| > 3): 0 
## 
## 6. GLOBAL MODEL VALIDATION
## 
## Call:
## lm(formula = log_individual_contributions ~ party + incumbency + 
##     party:incumbency, data = campaign_data)
## 
## Coefficients:
##                         (Intercept)                      partyRepublican  
##                             9.40941                              0.36183  
##                 incumbencyIncumbent                  incumbencyOpen Seat  
##                            -0.03518                              0.02919  
## partyRepublican:incumbencyIncumbent  partyRepublican:incumbencyOpen Seat  
##                            -0.28554                             -0.27178  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = model) 
## 
##                         Value p-value                Decision
## Global Stat         9.693e-01  0.9144 Assumptions acceptable.
## Skewness            1.265e-01  0.7221 Assumptions acceptable.
## Kurtosis            2.980e-01  0.5852 Assumptions acceptable.
## Link Function      -2.775e-14  1.0000 Assumptions acceptable.
## Heteroscedasticity  5.449e-01  0.4604 Assumptions acceptable.

model2_plots <- create_diagnostic_plots(model2_diag, "Log Individual Contributions Model")

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(model2_plots$combined)

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]

robust_se2 <- calculate_robust_se(model2, "Log Individual Contributions Model")

## 
## 
## =======================================================
## ROBUST STANDARD ERRORS FOR: Log Individual Contributions Model 
## =======================================================
## 
## 
## t test of coefficients:
## 
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          9.409406   0.114309 82.3157  < 2e-16 ***
## partyRepublican                      0.361829   0.177271  2.0411  0.04177 *  
## incumbencyIncumbent                 -0.035179   0.183938 -0.1913  0.84840    
## incumbencyOpen Seat                  0.029186   0.181022  0.1612  0.87198    
## partyRepublican:incumbencyIncumbent -0.285540   0.262913 -1.0861  0.27798    
## partyRepublican:incumbencyOpen Seat -0.271780   0.264524 -1.0274  0.30472    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Comparison with original model coefficients:
##                                     Original_Estimate Original_SE    Original_p
## (Intercept)                                9.40940578   0.1249737 7.065772e-273
## partyRepublican                            0.36182897   0.1808030  4.591374e-02
## incumbencyIncumbent                       -0.03517908   0.1883030  8.518772e-01
## incumbencyOpen Seat                        0.02918620   0.1792046  8.706911e-01
## partyRepublican:incumbencyIncumbent       -0.28553988   0.2685226  2.881321e-01
## partyRepublican:incumbencyOpen Seat       -0.27178026   0.2590166  2.945636e-01
##                                     Robust_SE      Robust_p
## (Intercept)                         0.1143087 1.328757e-290
## partyRepublican                     0.1772706  4.177071e-02
## incumbencyIncumbent                 0.1839385  8.484047e-01
## incumbencyOpen Seat                 0.1810221  8.719782e-01
## partyRepublican:incumbencyIncumbent 0.2629130  2.779811e-01
## partyRepublican:incumbencyOpen Seat 0.2645240  3.047202e-01

lambda2 <- try_boxcox(model2, "Log Individual Contributions Model")

## 
## 
## =======================================================
## BOX-COX TRANSFORMATION FOR: Log Individual Contributions Model 
## =======================================================

## Optimal lambda: 0.8686869 
## Interpretation: Power transformation with lambda = 0.8686869 suggested

# Run diagnostics for Model 3 (Log Disbursements)
model3_diag <- run_diagnostics(model3, "Log Disbursements Model")

## 
## 
## =======================================================
## DIAGNOSTIC TESTS FOR: Log Disbursements Model 
## =======================================================
## 
## 1. MULTICOLLINEARITY CHECK (VIF)

## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif

##                      GVIF Df GVIF^(1/(2*Df))
## party            2.781094  1        1.667661
## incumbency       3.807750  2        1.396906
## party:incumbency 7.376954  2        1.648046
## 
## Interpretation:
## CAUTION: Moderate multicollinearity detected (VIF > 5)
## Variables with moderate multicollinearity: 
## 
## 2. HETEROSCEDASTICITY TESTS
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 2.8734, df = 5, p-value = 0.7195
## 
## 
## Interpretation:
## No heteroscedasticity detected using Breusch-Pagan test.
## 
## 3. NORMALITY OF RESIDUALS
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.99754, p-value = 0.6738
## 
## 
## Interpretation:
## Residuals appear to be normally distributed.
## 
## 4. INFLUENTIAL OBSERVATIONS
## Cook's Distance threshold (4/n): 0.008 
## Number of influential observations: 25 
## Influential observations (row numbers):
##   5   8  11  12  92 113 126 166 190 192 193 217 223 284 289 292 322 324 356 386 
##   5   8  11  12  92 113 126 166 190 192 193 217 223 284 289 292 322 324 356 386 
## 435 437 447 489 494 
## 435 437 447 489 494 
## 
## Details of influential observations:
##       receipts individual_contributions disbursements      party incumbency
## 5    25066.504                 2184.190      1818.180   Democrat Challenger
## 8     6216.354                 1113.317    158670.759   Democrat  Open Seat
## 11   74912.931                11886.480    223608.500 Republican  Open Seat
## 12   31565.304                17315.945    230729.988   Democrat  Incumbent
## 92   38116.287                23340.880      2122.466 Republican Challenger
## 113   4368.248               101225.642    159857.844 Republican Challenger
## 126  11476.424                41332.276    316677.767 Republican  Open Seat
## 166  29679.967                13874.823      2931.238 Republican  Open Seat
## 190  13369.189                 1103.026      2492.319   Democrat  Open Seat
## 192  15919.723                55526.973    115639.614   Democrat  Incumbent
## 193  24211.513                55821.899    129862.446 Republican  Incumbent
## 217  14169.355                36362.085      2548.570 Republican Challenger
## 223  66826.042                28859.544      2570.633 Republican Challenger
## 284  16887.827                 3255.071      2247.644 Republican Challenger
## 289  17693.447                 1389.098      3003.747 Republican  Open Seat
## 292  63207.676                 4463.644     88957.568 Republican  Incumbent
## 322  74466.796                32557.285      2996.886   Democrat  Incumbent
## 324  42651.610                34467.823    381296.947   Democrat Challenger
## 356  30757.924                 6475.380    109646.403   Democrat  Incumbent
## 386  87427.935                23128.683      3066.232 Republican  Incumbent
## 435  22454.791                13183.812      1594.322   Democrat  Incumbent
## 437  44922.916                 5859.510      2561.694   Democrat Challenger
## 447  34160.275                 6032.225      3763.408   Democrat  Incumbent
## 489 112070.253                 3750.676    132325.330   Democrat  Incumbent
## 494   7405.721                10342.650    140885.789   Democrat  Open Seat
##     republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 5            0          1         0              0             0    10.129328
## 8            0          0         1              0             0     8.735100
## 11           1          0         1              0             1    11.224095
## 12           0          0         0              0             0    10.359846
## 92           1          1         0              1             0    10.548423
## 113          1          1         0              1             0     8.382346
## 126          1          0         1              0             1     9.348137
## 166          1          0         1              0             1    10.298261
## 190          0          0         1              0             0     9.500783
## 192          0          0         0              0             0     9.675377
## 193          1          0         0              0             0    10.094625
## 217          1          1         0              1             0     9.558907
## 223          1          1         0              1             0    11.109863
## 284          1          1         0              1             0     9.734408
## 289          1          0         1              0             1     9.781006
## 292          1          0         0              0             0    11.054197
## 322          0          0         0              0             0    11.218122
## 324          0          1         0              0             0    10.660844
## 356          0          0         0              0             0    10.333935
## 386          1          0         0              0             0    11.378582
## 435          0          0         0              0             0    10.019304
## 437          0          1         0              0             0    10.712726
## 447          0          0         0              0             0    10.438848
## 489          0          0         0              0             0    11.626890
## 494          0          0         1              0             0     8.910143
##     log_individual_contributions log_disbursements
## 5                       7.689458          7.506141
## 8                       7.015997         11.974593
## 11                      9.383241         12.317657
## 12                      9.759441         12.349008
## 92                     10.058004          7.660805
## 113                    11.525117         11.982046
## 126                    10.629423         12.665643
## 166                     9.537903          7.983521
## 190                     7.006719          7.821370
## 192                    10.924642         11.658243
## 193                    10.929939         11.774239
## 217                    10.501309          7.843680
## 223                    10.270231          7.852297
## 284                     8.088277          7.718083
## 289                     7.237129          8.007949
## 292                     8.403945         11.395926
## 322                    10.390787          8.005663
## 324                    10.447811         12.851336
## 356                     8.775917         11.605025
## 386                    10.048872          8.028531
## 435                     9.486821          7.374831
## 437                     8.675992          7.848814
## 447                     8.705037          8.233346
## 489                     8.229958         11.793026
## 494                     9.244128         11.855712
## 
## 5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)
## Number of outliers (|studentized residual| > 3): 2 
## Outlier observations (row numbers):
## 126 324 
## 126 324 
## 
## Details of outlier observations:
##     receipts individual_contributions disbursements      party incumbency
## 126 11476.42                 41332.28      316677.8 Republican  Open Seat
## 324 42651.61                 34467.82      381296.9   Democrat Challenger
##     republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 126          1          0         1              0             1     9.348137
## 324          0          1         0              0             0    10.660844
##     log_individual_contributions log_disbursements
## 126                     10.62942          12.66564
## 324                     10.44781          12.85134
## 
## 6. GLOBAL MODEL VALIDATION
## 
## Call:
## lm(formula = log_disbursements ~ party + incumbency + party:incumbency, 
##     data = campaign_data)
## 
## Coefficients:
##                         (Intercept)                      partyRepublican  
##                             9.74832                              0.06444  
##                 incumbencyIncumbent                  incumbencyOpen Seat  
##                             0.18826                              0.05859  
## partyRepublican:incumbencyIncumbent  partyRepublican:incumbencyOpen Seat  
##                            -0.28297                              0.06301  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = model) 
## 
##                         Value p-value                Decision
## Global Stat         6.115e-01  0.9618 Assumptions acceptable.
## Skewness            1.004e-01  0.7514 Assumptions acceptable.
## Kurtosis            1.285e-01  0.7200 Assumptions acceptable.
## Link Function      -2.822e-13  1.0000 Assumptions acceptable.
## Heteroscedasticity  3.826e-01  0.5362 Assumptions acceptable.

model3_plots <- create_diagnostic_plots(model3_diag, "Log Disbursements Model")

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(model3_plots$combined)

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]

robust_se3 <- calculate_robust_se(model3, "Log Disbursements Model")

## 
## 
## =======================================================
## ROBUST STANDARD ERRORS FOR: Log Disbursements Model 
## =======================================================
## 
## 
## t test of coefficients:
## 
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          9.748315   0.098840 98.6275   <2e-16 ***
## partyRepublican                      0.064445   0.134950  0.4775   0.6332    
## incumbencyIncumbent                  0.188255   0.146442  1.2855   0.1992    
## incumbencyOpen Seat                  0.058587   0.140171  0.4180   0.6762    
## partyRepublican:incumbencyIncumbent -0.282973   0.197258 -1.4345   0.1521    
## partyRepublican:incumbencyOpen Seat  0.063015   0.193512  0.3256   0.7448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Comparison with original model coefficients:
##                                     Original_Estimate Original_SE Original_p
## (Intercept)                                9.74831521  0.09223127  0.0000000
## partyRepublican                            0.06444475  0.13343360  0.6293293
## incumbencyIncumbent                        0.18825548  0.13896869  0.1761448
## incumbencyOpen Seat                        0.05858721  0.13225397  0.6579668
## partyRepublican:incumbencyIncumbent       -0.28297271  0.19817122  0.1539466
## partyRepublican:incumbencyOpen Seat        0.06301478  0.19115572  0.7418030
##                                      Robust_SE  Robust_p
## (Intercept)                         0.09883971 0.0000000
## partyRepublican                     0.13494998 0.6331850
## incumbencyIncumbent                 0.14644186 0.1992093
## incumbencyOpen Seat                 0.14017102 0.6761511
## partyRepublican:incumbencyIncumbent 0.19725787 0.1520532
## partyRepublican:incumbencyOpen Seat 0.19351245 0.7448370

lambda3 <- try_boxcox(model3, "Log Disbursements Model")

## 
## 
## =======================================================
## BOX-COX TRANSFORMATION FOR: Log Disbursements Model 
## =======================================================

## Optimal lambda: 0.8686869 
## Interpretation: Power transformation with lambda = 0.8686869 suggested

# =====================================================================
# STEP 9: ADDRESSING DIAGNOSTIC ISSUES (IF NEEDED)
# =====================================================================

# If heteroscedasticity was detected, robust SEs have already been calculated above

# If influential observations were detected
if(length(model1_diag$influential) > 0) {
  cat("\n\n=======================================================\n")
  cat("RERUNNING MODEL 1 WITHOUT INFLUENTIAL OBSERVATIONS\n")
  cat("=======================================================\n\n")
  
  # Create new dataset without influential observations
  clean_data1 <- campaign_data[-model1_diag$influential, ]
  
  # Rerun model
  clean_model1 <- lm(log_receipts ~ party + incumbency + party:incumbency, data = clean_data1)
  
  # Compare models
  cat("Original Model 1:\n")
  print(summary(model1)$coefficients)
  
  cat("\nModel 1 without influential observations:\n")
  print(summary(clean_model1)$coefficients)
  
  # Calculate percentage change in coefficients
  orig_coef <- summary(model1)$coefficients[, 1]
  new_coef <- summary(clean_model1)$coefficients[, 1]
  pct_change <- 100 * (new_coef - orig_coef) / orig_coef
  
  cat("\nPercentage change in coefficients:\n")
  print(data.frame(
    Original = orig_coef,
    Without_Outliers = new_coef,
    Percent_Change = pct_change
  ))
}

## 
## 
## =======================================================
## RERUNNING MODEL 1 WITHOUT INFLUENTIAL OBSERVATIONS
## =======================================================
## 
## Original Model 1:
##                                        Estimate Std. Error     t value
## (Intercept)                         10.07179937  0.1004404 100.2763677
## partyRepublican                      0.09641466  0.1453100   0.6635102
## incumbencyIncumbent                 -0.07488510  0.1513377  -0.4948211
## incumbencyOpen Seat                 -0.05963517  0.1440254  -0.4140602
## partyRepublican:incumbencyIncumbent -0.03218202  0.2158096  -0.1491222
## partyRepublican:incumbencyOpen Seat -0.22231004  0.2081697  -1.0679268
##                                      Pr(>|t|)
## (Intercept)                         0.0000000
## partyRepublican                     0.5073134
## incumbencyIncumbent                 0.6209464
## incumbencyOpen Seat                 0.6790098
## partyRepublican:incumbencyIncumbent 0.8815180
## partyRepublican:incumbencyOpen Seat 0.2860751
## 
## Model 1 without influential observations:
##                                        Estimate Std. Error      t value
## (Intercept)                         10.07015587 0.09076403 110.94874994
## partyRepublican                      0.06836192 0.13187708   0.51837605
## incumbencyIncumbent                 -0.09587365 0.13667766  -0.70145804
## incumbencyOpen Seat                 -0.03650145 0.12984370  -0.28111835
## partyRepublican:incumbencyIncumbent  0.01699279 0.19491232   0.08718169
## partyRepublican:incumbencyOpen Seat -0.21516200 0.18912561  -1.13766719
##                                      Pr(>|t|)
## (Intercept)                         0.0000000
## partyRepublican                     0.6044394
## incumbencyIncumbent                 0.4833636
## incumbencyOpen Seat                 0.7787431
## partyRepublican:incumbencyIncumbent 0.9305641
## partyRepublican:incumbencyOpen Seat 0.2558379
## 
## Percentage change in coefficients:
##                                        Original Without_Outliers Percent_Change
## (Intercept)                         10.07179937      10.07015587    -0.01631786
## partyRepublican                      0.09641466       0.06836192   -29.09593107
## incumbencyIncumbent                 -0.07488510      -0.09587365    28.02765815
## incumbencyOpen Seat                 -0.05963517      -0.03650145   -38.79208156
## partyRepublican:incumbencyIncumbent -0.03218202       0.01699279  -152.80211302
## partyRepublican:incumbencyOpen Seat -0.22231004      -0.21516200    -3.21534683

# Repeat for other models if needed
if(length(model2_diag$influential) > 0) {
  cat("\n\n=======================================================\n")
  cat("RERUNNING MODEL 2 WITHOUT INFLUENTIAL OBSERVATIONS\n")
  cat("=======================================================\n\n")
  
  # Create new dataset without influential observations
  clean_data2 <- campaign_data[-model2_diag$influential, ]
  
  # Rerun model
  clean_model2 <- lm(log_individual_contributions ~ party + incumbency + party:incumbency, data = clean_data2)
  
  # Compare models
  cat("Original Model 2:\n")
  print(summary(model2)$coefficients)
  
  cat("\nModel 2 without influential observations:\n")
  print(summary(clean_model2)$coefficients)
  
  # Calculate percentage change in coefficients
  orig_coef <- summary(model2)$coefficients[, 1]
  new_coef <- summary(clean_model2)$coefficients[, 1]
  pct_change <- 100 * (new_coef - orig_coef) / orig_coef
  
  cat("\nPercentage change in coefficients:\n")
  print(data.frame(
    Original = orig_coef,
    Without_Outliers = new_coef,
    Percent_Change = pct_change
  ))
}

## 
## 
## =======================================================
## RERUNNING MODEL 2 WITHOUT INFLUENTIAL OBSERVATIONS
## =======================================================
## 
## Original Model 2:
##                                        Estimate Std. Error    t value
## (Intercept)                          9.40940578  0.1249737 75.2911044
## partyRepublican                      0.36182897  0.1808030  2.0012336
## incumbencyIncumbent                 -0.03517908  0.1883030 -0.1868216
## incumbencyOpen Seat                  0.02918620  0.1792046  0.1628653
## partyRepublican:incumbencyIncumbent -0.28553988  0.2685226 -1.0633735
## partyRepublican:incumbencyOpen Seat -0.27178026  0.2590166 -1.0492773
##                                          Pr(>|t|)
## (Intercept)                         7.065772e-273
## partyRepublican                      4.591374e-02
## incumbencyIncumbent                  8.518772e-01
## incumbencyOpen Seat                  8.706911e-01
## partyRepublican:incumbencyIncumbent  2.881321e-01
## partyRepublican:incumbencyOpen Seat  2.945636e-01
## 
## Model 2 without influential observations:
##                                        Estimate Std. Error    t value
## (Intercept)                          9.41684378  0.1102126 85.4425379
## partyRepublican                      0.31256386  0.1610689  1.9405596
## incumbencyIncumbent                 -0.07792654  0.1683525 -0.4628772
## incumbencyOpen Seat                 -0.09028647  0.1595320 -0.5659457
## partyRepublican:incumbencyIncumbent -0.21456510  0.2405795 -0.8918678
## partyRepublican:incumbencyOpen Seat -0.14780178  0.2318619 -0.6374561
##                                          Pr(>|t|)
## (Intercept)                         1.201162e-287
## partyRepublican                      5.291190e-02
## incumbencyIncumbent                  6.436676e-01
## incumbencyOpen Seat                  5.717020e-01
## partyRepublican:incumbencyIncumbent  3.729221e-01
## partyRepublican:incumbencyOpen Seat  5.241395e-01
## 
## Percentage change in coefficients:
##                                        Original Without_Outliers Percent_Change
## (Intercept)                          9.40940578       9.41684378     0.07904858
## partyRepublican                      0.36182897       0.31256386   -13.61558020
## incumbencyIncumbent                 -0.03517908      -0.07792654   121.51384528
## incumbencyOpen Seat                  0.02918620      -0.09028647  -409.34642144
## partyRepublican:incumbencyIncumbent -0.28553988      -0.21456510   -24.85634482
## partyRepublican:incumbencyOpen Seat -0.27178026      -0.14780178   -45.61717482

if(length(model3_diag$influential) > 0) {
  cat("\n\n=======================================================\n")
  cat("RERUNNING MODEL 3 WITHOUT INFLUENTIAL OBSERVATIONS\n")
  cat("=======================================================\n\n")
  
  # Create new dataset without influential observations
  clean_data3 <- campaign_data[-model3_diag$influential, ]
  
  # Rerun model
  clean_model3 <- lm(log_disbursements ~ party + incumbency + party:incumbency, data = clean_data3)
  
  # Compare models
  cat("Original Model 3:\n")
  print(summary(model3)$coefficients)
  
  cat("\nModel 3 without influential observations:\n")
  print(summary(clean_model3)$coefficients)
  
  # Calculate percentage change in coefficients
  orig_coef <- summary(model3)$coefficients[, 1]
  new_coef <- summary(clean_model3)$coefficients[, 1]
  pct_change <- 100 * (new_coef - orig_coef) / orig_coef
  
  cat("\nPercentage change in coefficients:\n")
  print(data.frame(
    Original = orig_coef,
    Without_Outliers = new_coef,
    Percent_Change = pct_change
  ))
}

## 
## 
## =======================================================
## RERUNNING MODEL 3 WITHOUT INFLUENTIAL OBSERVATIONS
## =======================================================
## 
## Original Model 3:
##                                        Estimate Std. Error     t value
## (Intercept)                          9.74831521 0.09223127 105.6942528
## partyRepublican                      0.06444475 0.13343360   0.4829724
## incumbencyIncumbent                  0.18825548 0.13896869   1.3546611
## incumbencyOpen Seat                  0.05858721 0.13225397   0.4429902
## partyRepublican:incumbencyIncumbent -0.28297271 0.19817122  -1.4279204
## partyRepublican:incumbencyOpen Seat  0.06301478 0.19115572   0.3296515
##                                      Pr(>|t|)
## (Intercept)                         0.0000000
## partyRepublican                     0.6293293
## incumbencyIncumbent                 0.1761448
## incumbencyOpen Seat                 0.6579668
## partyRepublican:incumbencyIncumbent 0.1539466
## partyRepublican:incumbencyOpen Seat 0.7418030
## 
## Model 3 without influential observations:
##                                        Estimate Std. Error      t value
## (Intercept)                          9.75972898 0.08144172 119.83697389
## partyRepublican                      0.12719010 0.11867757   1.07172823
## incumbencyIncumbent                  0.15500371 0.12506566   1.23937866
## incumbencyOpen Seat                  0.02123195 0.11683806   0.18172115
## partyRepublican:incumbencyIncumbent -0.35227683 0.17741503  -1.98560871
## partyRepublican:incumbencyOpen Seat  0.01034781 0.16982625   0.06093177
##                                       Pr(>|t|)
## (Intercept)                         0.00000000
## partyRepublican                     0.28439329
## incumbencyIncumbent                 0.21582513
## incumbencyOpen Seat                 0.85588008
## partyRepublican:incumbencyIncumbent 0.04765867
## partyRepublican:incumbencyOpen Seat 0.95143950
## 
## Percentage change in coefficients:
##                                        Original Without_Outliers Percent_Change
## (Intercept)                          9.74831521       9.75972898      0.1170846
## partyRepublican                      0.06444475       0.12719010     97.3630137
## incumbencyIncumbent                  0.18825548       0.15500371    -17.6631118
## incumbencyOpen Seat                  0.05858721       0.02123195    -63.7600992
## partyRepublican:incumbencyIncumbent -0.28297271      -0.35227683     24.4914488
## partyRepublican:incumbencyOpen Seat  0.06301478       0.01034781    -83.5787499

# =====================================================================
# STEP 10: FINAL SUMMARY AND RECOMMENDATIONS
# =====================================================================

cat("\n\n=======================================================\n")

## 
## 
## =======================================================

cat("FINAL SUMMARY AND RECOMMENDATIONS\n")

## FINAL SUMMARY AND RECOMMENDATIONS

cat("=======================================================\n\n")

## =======================================================

cat("Model 1 (Log Receipts):\n")

## Model 1 (Log Receipts):

if(model1_diag$bp_test$p.value < 0.05) {
  cat("- Heteroscedasticity detected: Use robust standard errors\n")
} else {
  cat("- No heteroscedasticity detected\n")
}

## - No heteroscedasticity detected

if(model1_diag$sw_test$p.value < 0.05) {
  cat("- Non-normal residuals detected: Consider transformation\n")
} else {
  cat("- Residuals appear normally distributed\n")
}

## - Residuals appear normally distributed

if(length(model1_diag$influential) > 0) {
  cat("- Influential observations detected: Consider analysis without these cases\n")
} else {
  cat("- No influential observations detected\n")
}

## - Influential observations detected: Consider analysis without these cases

cat("\nModel 2 (Log Individual Contributions):\n")

## 
## Model 2 (Log Individual Contributions):

if(model2_diag$bp_test$p.value < 0.05) {
  cat("- Heteroscedasticity detected: Use robust standard errors\n")
} else {
  cat("- No heteroscedasticity detected\n")
}

## - No heteroscedasticity detected

if(model2_diag$sw_test$p.value < 0.05) {
  cat("- Non-normal residuals detected: Consider transformation\n")
} else {
  cat("- Residuals appear normally distributed\n")
}

## - Residuals appear normally distributed

if(length(model2_diag$influential) > 0) {
  cat("- Influential observations detected: Consider analysis without these cases\n")
} else {
  cat("- No influential observations detected\n")
}

## - Influential observations detected: Consider analysis without these cases

cat("\nModel 3 (Log Disbursements):\n")

## 
## Model 3 (Log Disbursements):

if(model3_diag$bp_test$p.value < 0.05) {
  cat("- Heteroscedasticity detected: Use robust standard errors\n")
} else {
  cat("- No heteroscedasticity detected\n")
}

## - No heteroscedasticity detected

if(model3_diag$sw_test$p.value < 0.05) {
  cat("- Non-normal residuals detected: Consider transformation\n")
} else {
  cat("- Residuals appear normally distributed\n")
}

## - Residuals appear normally distributed

if(length(model3_diag$influential) > 0) {
  cat("- Influential observations detected: Consider analysis without these cases\n")
} else {
  cat("- No influential observations detected\n")
}

## - Influential observations detected: Consider analysis without these cases

cat("\nOverall Conclusion:\n")

## 
## Overall Conclusion:

cat("Based on the diagnostics, the following adjustments are recommended:\n")

## Based on the diagnostics, the following adjustments are recommended:

cat("1. Use robust standard errors for all models due to potential heteroscedasticity\n")

## 1. Use robust standard errors for all models due to potential heteroscedasticity

cat("2. Consider examining the most influential observations to understand their impact\n")

## 2. Consider examining the most influential observations to understand their impact

cat("3. The Box-Cox analysis suggests the log transformation is appropriate for these models\n")

## 3. The Box-Cox analysis suggests the log transformation is appropriate for these models

# =====================================================================
# STEP 11: SAVING RESULTS (OPTIONAL)
# =====================================================================

# Save diagnostic plots
# Uncomment if you want to save the plots
# pdf("diagnostic_plots.pdf")
# print(model1_plots$combined)
# print(model2_plots$combined)
# print(model3_plots$combined)
# dev.off()

# Save robust models results
# Uncomment if you want to save the results
# sink("robust_model_results.txt")
# cat("ROBUST STANDARD ERRORS FOR WOMEN CANDIDATES CAMPAIGN FINANCE MODELS\n\n")
# cat("Model 1 (Log Receipts):\n")
# print(robust_se1)
# cat("\nModel 2 (Log Individual Contributions):\n")
# print(robust_se2)
# cat("\nModel 3 (Log Disbursements):\n")
# print(robust_se3)
# sink()

cat("\nAnalysis complete! Check diagnostic plots and robust model results for final interpretation.\n")

## 
## Analysis complete! Check diagnostic plots and robust model results for final interpretation.

regression-diagnostics.r

lemmi

2025-04-24