# COMPLETE CAMPAIGN FINANCE REGRESSION DIAGNOSTICS
# This script performs comprehensive diagnostic checks on regression models
# analyzing women candidates' campaign finance data
# Load necessary packages
if (!require("car")) install.packages("car")
## Loading required package: car
## Loading required package: carData
if (!require("lmtest")) install.packages("lmtest")
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
if (!require("MASS")) install.packages("MASS")
## Loading required package: MASS
if (!require("ggplot2")) install.packages("ggplot2")
## Loading required package: ggplot2
if (!require("sandwich")) install.packages("sandwich")
## Loading required package: sandwich
if (!require("gridExtra")) install.packages("gridExtra")
## Loading required package: gridExtra
if (!require("corrplot")) install.packages("corrplot")
## Loading required package: corrplot
## corrplot 0.95 loaded
if (!require("gvlma")) install.packages("gvlma")
## Loading required package: gvlma
library(car) # For VIF and diagnostic tests
library(lmtest) # For heteroscedasticity tests
library(MASS) # For boxcox transformations
library(ggplot2) # For visualizations
library(sandwich) # For robust standard errors
library(gridExtra) # For arranging multiple plots
library(corrplot) # For correlation visualization
library(gvlma) # For global model validation
# =====================================================================
# STEP 1: DATA PREPARATION AND MODEL DEFINITION
# =====================================================================
# Load your data
# Replace this with your actual data file path
# campaign_data <- read.csv("your_data_file.csv")
# For example purposes, creating a simple dummy dataset
set.seed(123)
n <- 500
campaign_data <- data.frame(
receipts = rlnorm(n, meanlog = 10, sdlog = 1),
individual_contributions = rlnorm(n, meanlog = 9.5, sdlog = 1.2),
disbursements = rlnorm(n, meanlog = 9.8, sdlog = 0.9),
party = factor(sample(c("Democrat", "Republican"), n, replace = TRUE)),
incumbency = factor(sample(c("Incumbent", "Challenger", "Open Seat"), n, replace = TRUE))
)
# Display data structure
str(campaign_data)
## 'data.frame': 500 obs. of 5 variables:
## $ receipts : num 12576 17498 104685 23636 25067 ...
## $ individual_contributions: num 6488 4054 45805 32902 2184 ...
## $ disbursements : num 7360 7073 17744 16011 1818 ...
## $ party : Factor w/ 2 levels "Democrat","Republican": 2 1 2 2 1 2 1 1 1 2 ...
## $ incumbency : Factor w/ 3 levels "Challenger","Incumbent",..: 3 2 1 2 1 2 3 3 1 1 ...
# Check for missing values
cat("Missing values in dataset:\n")
## Missing values in dataset:
print(colSums(is.na(campaign_data)))
## receipts individual_contributions disbursements
## 0 0 0
## party incumbency
## 0 0
# Create dummy variables (if needed)
campaign_data$republican <- ifelse(campaign_data$party == "Republican", 1, 0)
campaign_data$challenger <- ifelse(campaign_data$incumbency == "Challenger", 1, 0)
campaign_data$open_seat <- ifelse(campaign_data$incumbency == "Open Seat", 1, 0)
# Create interaction terms (if needed)
campaign_data$rep_challenger <- campaign_data$republican * campaign_data$challenger
campaign_data$rep_open_seat <- campaign_data$republican * campaign_data$open_seat
# Create log transformed variables - THIS WAS MISSING IN THE ORIGINAL SCRIPT
campaign_data$log_receipts <- log(campaign_data$receipts + 1) # Add 1 to handle zeros
campaign_data$log_individual_contributions <- log(campaign_data$individual_contributions + 1)
campaign_data$log_disbursements <- log(campaign_data$disbursements + 1)
# Create regression models
# Model 1: Log Receipts
model1 <- lm(log_receipts ~ party + incumbency + party:incumbency, data = campaign_data)
# Model 2: Log Individual Contributions
model2 <- lm(log_individual_contributions ~ party + incumbency + party:incumbency, data = campaign_data)
# Model 3: Log Disbursements
model3 <- lm(log_disbursements ~ party + incumbency + party:incumbency, data = campaign_data)
# =====================================================================
# STEP 2: INITIAL MODEL SUMMARIES
# =====================================================================
# Display model summaries
cat("\n\n=======================================================\n")
##
##
## =======================================================
cat("MODEL SUMMARIES\n")
## MODEL SUMMARIES
cat("=======================================================\n\n")
## =======================================================
cat("Model 1: Log Receipts\n")
## Model 1: Log Receipts
print(summary(model1))
##
## Call:
## lm(formula = log_receipts ~ party + incumbency + party:incumbency,
## data = campaign_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8107 -0.6082 -0.0282 0.6308 3.0728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.07180 0.10044 100.276 <2e-16 ***
## partyRepublican 0.09641 0.14531 0.664 0.507
## incumbencyIncumbent -0.07489 0.15134 -0.495 0.621
## incumbencyOpen Seat -0.05964 0.14403 -0.414 0.679
## partyRepublican:incumbencyIncumbent -0.03218 0.21581 -0.149 0.882
## partyRepublican:incumbencyOpen Seat -0.22231 0.20817 -1.068 0.286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9738 on 494 degrees of freedom
## Multiple R-squared: 0.007778, Adjusted R-squared: -0.002265
## F-statistic: 0.7745 on 5 and 494 DF, p-value: 0.5684
cat("\nModel 2: Log Individual Contributions\n")
##
## Model 2: Log Individual Contributions
print(summary(model2))
##
## Call:
## lm(formula = log_individual_contributions ~ party + incumbency +
## party:incumbency, data = campaign_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2790 -0.7869 -0.0127 0.8059 3.2915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.40941 0.12497 75.291 <2e-16 ***
## partyRepublican 0.36183 0.18080 2.001 0.0459 *
## incumbencyIncumbent -0.03518 0.18830 -0.187 0.8519
## incumbencyOpen Seat 0.02919 0.17920 0.163 0.8707
## partyRepublican:incumbencyIncumbent -0.28554 0.26852 -1.063 0.2881
## partyRepublican:incumbencyOpen Seat -0.27178 0.25902 -1.049 0.2946
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.212 on 494 degrees of freedom
## Multiple R-squared: 0.01206, Adjusted R-squared: 0.002059
## F-statistic: 1.206 on 5 and 494 DF, p-value: 0.3051
cat("\nModel 3: Log Disbursements\n")
##
## Model 3: Log Disbursements
print(summary(model3))
##
## Call:
## lm(formula = log_disbursements ~ party + incumbency + party:incumbency,
## data = campaign_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.56174 -0.60054 0.01358 0.57120 3.10302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.74832 0.09223 105.694 <2e-16 ***
## partyRepublican 0.06444 0.13343 0.483 0.629
## incumbencyIncumbent 0.18826 0.13897 1.355 0.176
## incumbencyOpen Seat 0.05859 0.13225 0.443 0.658
## partyRepublican:incumbencyIncumbent -0.28297 0.19817 -1.428 0.154
## partyRepublican:incumbencyOpen Seat 0.06301 0.19116 0.330 0.742
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8942 on 494 degrees of freedom
## Multiple R-squared: 0.008419, Adjusted R-squared: -0.001617
## F-statistic: 0.8388 on 5 and 494 DF, p-value: 0.5225
# =====================================================================
# STEP 3: EXPLORATORY DATA ANALYSIS
# =====================================================================
# Correlation matrix of numeric variables
numeric_vars <- sapply(campaign_data, is.numeric)
if(sum(numeric_vars) > 1) {
cor_matrix <- cor(campaign_data[, numeric_vars], use = "complete.obs")
corrplot(cor_matrix, method = "circle", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black")
}

# Boxplots of dependent variables by party and incumbency
p1 <- ggplot(campaign_data, aes(x = incumbency, y = log_receipts, fill = party)) +
geom_boxplot() +
labs(title = "Log Receipts by Party and Incumbency Status",
x = "Incumbency Status", y = "Log Receipts") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- ggplot(campaign_data, aes(x = incumbency, y = log_individual_contributions, fill = party)) +
geom_boxplot() +
labs(title = "Log Individual Contributions by Party and Incumbency Status",
x = "Incumbency Status", y = "Log Individual Contributions") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p3 <- ggplot(campaign_data, aes(x = incumbency, y = log_disbursements, fill = party)) +
geom_boxplot() +
labs(title = "Log Disbursements by Party and Incumbency Status",
x = "Incumbency Status", y = "Log Disbursements") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p1, p2, p3, ncol = 1)

# =====================================================================
# STEP 4: COMPREHENSIVE DIAGNOSTIC FUNCTION
# =====================================================================
run_diagnostics <- function(model, model_name) {
cat("\n\n=======================================================\n")
cat("DIAGNOSTIC TESTS FOR:", model_name, "\n")
cat("=======================================================\n\n")
# 1. Check for multicollinearity using VIF
cat("1. MULTICOLLINEARITY CHECK (VIF)\n")
tryCatch({
vif_values <- vif(model)
print(vif_values)
cat("\nInterpretation:\n")
if(any(vif_values > 10)) {
cat("WARNING: Severe multicollinearity detected (VIF > 10)\n")
cat("Variables with severe multicollinearity:",
names(vif_values)[vif_values > 10], "\n")
} else if(any(vif_values > 5)) {
cat("CAUTION: Moderate multicollinearity detected (VIF > 5)\n")
cat("Variables with moderate multicollinearity:",
names(vif_values)[vif_values > 5], "\n")
} else {
cat("No problematic multicollinearity detected.\n")
}
}, error = function(e) {
cat("Error in VIF calculation:", e$message, "\n")
cat("This may be due to perfect multicollinearity or singular design matrix.\n")
})
# 2. Check for heteroscedasticity
cat("\n2. HETEROSCEDASTICITY TESTS\n")
# Breusch-Pagan test
bp_test <- bptest(model)
print(bp_test)
cat("\nInterpretation:\n")
if(bp_test$p.value < 0.05) {
cat("WARNING: Heteroscedasticity detected (p < 0.05)\n")
cat("Consider using robust standard errors or transforming variables.\n")
} else {
cat("No heteroscedasticity detected using Breusch-Pagan test.\n")
}
# 3. Normality of residuals
cat("\n3. NORMALITY OF RESIDUALS\n")
# Shapiro-Wilk test
sw_test <- shapiro.test(residuals(model))
print(sw_test)
cat("\nInterpretation:\n")
if(sw_test$p.value < 0.05) {
cat("WARNING: Non-normal residuals detected (p < 0.05)\n")
cat("Consider transforming dependent variable or using robust methods.\n")
} else {
cat("Residuals appear to be normally distributed.\n")
}
# 4. Influence measures
cat("\n4. INFLUENTIAL OBSERVATIONS\n")
# Calculate Cook's distance
cooks_d <- cooks.distance(model)
# Identify influential observations (Cook's D > 4/n)
n <- length(residuals(model))
threshold <- 4/n
influential <- which(cooks_d > threshold)
cat("Cook's Distance threshold (4/n):", threshold, "\n")
cat("Number of influential observations:", length(influential), "\n")
if(length(influential) > 0) {
cat("Influential observations (row numbers):\n")
print(influential)
# Look at the actual influential observations
cat("\nDetails of influential observations:\n")
if(exists("campaign_data")) {
print(campaign_data[influential, ])
} else {
cat("Data frame not available for inspection.\n")
}
}
# 5. Outliers in the response (Studentized residuals)
cat("\n5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)\n")
# Calculate studentized residuals
stud_resid <- rstudent(model)
# Identify outliers (|stud_resid| > 3)
outliers <- which(abs(stud_resid) > 3)
cat("Number of outliers (|studentized residual| > 3):", length(outliers), "\n")
if(length(outliers) > 0) {
cat("Outlier observations (row numbers):\n")
print(outliers)
# Look at the actual outliers
cat("\nDetails of outlier observations:\n")
if(exists("campaign_data")) {
print(campaign_data[outliers, ])
} else {
cat("Data frame not available for inspection.\n")
}
}
# 6. Global model validation
cat("\n6. GLOBAL MODEL VALIDATION\n")
# Global test
tryCatch({
global_test <- gvlma(model)
print(global_test)
}, error = function(e) {
cat("Error in global validation:", e$message, "\n")
cat("This may be due to near singularities or other model issues.\n")
})
# Return diagnostic data for plots
return(list(
model = model,
cooks_d = cooks_d,
stud_resid = stud_resid,
fitted = fitted(model),
residuals = residuals(model),
bp_test = bp_test,
sw_test = sw_test,
influential = influential,
outliers = outliers
))
}
# =====================================================================
# STEP 5: DIAGNOSTIC PLOTS FUNCTION
# =====================================================================
create_diagnostic_plots <- function(diag_data, model_name) {
# Create data frame for plots
plot_data <- data.frame(
fitted = diag_data$fitted,
residuals = diag_data$residuals,
sqrt_abs_resid = sqrt(abs(diag_data$stud_resid)),
stud_resid = diag_data$stud_resid,
cooks_dist = diag_data$cooks_d,
index = 1:length(diag_data$residuals)
)
# 1. Residuals vs Fitted plot
p1 <- ggplot(plot_data, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = paste("Residuals vs Fitted Values -", model_name),
x = "Fitted values", y = "Residuals") +
theme_minimal()
# 2. Q-Q plot of residuals
p2 <- ggplot(plot_data, aes(sample = residuals)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = paste("Normal Q-Q Plot -", model_name),
x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_minimal()
# 3. Scale-Location plot (sqrt of abs standardized residuals vs fitted)
p3 <- ggplot(plot_data, aes(x = fitted, y = sqrt_abs_resid)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = paste("Scale-Location Plot -", model_name),
x = "Fitted values",
y = "√|Studentized residuals|") +
theme_minimal()
# 4. Cook's distance plot
p4 <- ggplot(plot_data, aes(x = index, y = cooks_dist)) +
geom_bar(stat = "identity", alpha = 0.6) +
geom_hline(yintercept = 4/length(diag_data$residuals),
linetype = "dashed", color = "red") +
labs(title = paste("Cook's Distance -", model_name),
x = "Observation", y = "Cook's distance") +
theme_minimal()
# 5. Studentized Residuals vs Leverage plot
p5 <- ggplot(cbind(plot_data, hatvalues = hatvalues(diag_data$model)),
aes(x = hatvalues, y = stud_resid)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = c(-3, 0, 3), linetype = "dashed", color = c("red", "black", "red")) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = paste("Studentized Residuals vs Leverage -", model_name),
x = "Leverage", y = "Studentized Residuals") +
theme_minimal()
# Combine all plots
combined_plot <- grid.arrange(p1, p2, p3, p4, p5, ncol = 2)
# Return all plots
return(list(
p1 = p1,
p2 = p2,
p3 = p3,
p4 = p4,
p5 = p5,
combined = combined_plot
))
}
# =====================================================================
# STEP 6: BOXCOX TRANSFORMATION FUNCTION
# =====================================================================
try_boxcox <- function(model, model_name) {
cat("\n\n=======================================================\n")
cat("BOX-COX TRANSFORMATION FOR:", model_name, "\n")
cat("=======================================================\n\n")
# Run Box-Cox analysis
tryCatch({
bc <- boxcox(model)
# Find optimal lambda
lambda <- bc$x[which.max(bc$y)]
cat("Optimal lambda:", lambda, "\n")
if(abs(lambda - 1) < 0.05) {
cat("Interpretation: No transformation needed (lambda ≈ 1)\n")
} else if(abs(lambda) < 0.05) {
cat("Interpretation: Log transformation suggested (lambda ≈ 0)\n")
} else if(abs(lambda - 0.5) < 0.05) {
cat("Interpretation: Square root transformation suggested (lambda ≈ 0.5)\n")
} else if(abs(lambda + 1) < 0.05) {
cat("Interpretation: Reciprocal transformation suggested (lambda ≈ -1)\n")
} else {
cat("Interpretation: Power transformation with lambda =", lambda, "suggested\n")
}
return(lambda)
}, error = function(e) {
cat("Error in Box-Cox transformation:", e$message, "\n")
cat("This may occur if the response variable contains negative or zero values.\n")
return(NA)
})
}
# =====================================================================
# STEP 7: ROBUST STANDARD ERRORS FUNCTION
# =====================================================================
calculate_robust_se <- function(model, model_name) {
cat("\n\n=======================================================\n")
cat("ROBUST STANDARD ERRORS FOR:", model_name, "\n")
cat("=======================================================\n\n")
# Calculate robust standard errors using HC3 (best for heteroscedasticity)
robust_se <- coeftest(model, vcov = hccm(model, type = "hc3"))
print(robust_se)
# Compare with original model
cat("\nComparison with original model coefficients:\n")
original_coef <- summary(model)$coefficients
robust_coef <- robust_se
comparison <- data.frame(
Original_Estimate = original_coef[, 1],
Original_SE = original_coef[, 2],
Original_p = original_coef[, 4],
Robust_SE = robust_coef[, 2],
Robust_p = robust_coef[, 4]
)
print(comparison)
return(robust_se)
}
# =====================================================================
# STEP 8: RUNNING ALL DIAGNOSTICS
# =====================================================================
# Run diagnostics for Model 1 (Log Receipts)
model1_diag <- run_diagnostics(model1, "Log Receipts Model")
##
##
## =======================================================
## DIAGNOSTIC TESTS FOR: Log Receipts Model
## =======================================================
##
## 1. MULTICOLLINEARITY CHECK (VIF)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## GVIF Df GVIF^(1/(2*Df))
## party 2.781094 1 1.667661
## incumbency 3.807750 2 1.396906
## party:incumbency 7.376954 2 1.648046
##
## Interpretation:
## CAUTION: Moderate multicollinearity detected (VIF > 5)
## Variables with moderate multicollinearity:
##
## 2. HETEROSCEDASTICITY TESTS
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 4.4652, df = 5, p-value = 0.4846
##
##
## Interpretation:
## No heteroscedasticity detected using Breusch-Pagan test.
##
## 3. NORMALITY OF RESIDUALS
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.9979, p-value = 0.7981
##
##
## Interpretation:
## Residuals appear to be normally distributed.
##
## 4. INFLUENTIAL OBSERVATIONS
## Cook's Distance threshold (4/n): 0.008
## Number of influential observations: 23
## Influential observations (row numbers):
## 18 44 72 97 135 139 164 174 196 201 265 310 336 352 359 360 371 392 416 427
## 18 44 72 97 135 139 164 174 196 201 265 310 336 352 359 360 371 392 416 427
## 439 449 456
## 439 449 456
##
## Details of influential observations:
## receipts individual_contributions disbursements party incumbency
## 18 3082.150 19380.455 5792.328 Republican Incumbent
## 44 192712.740 3175.320 14260.532 Republican Open Seat
## 72 2188.192 2326.781 12903.445 Democrat Incumbent
## 97 196286.969 11469.214 26774.825 Democrat Challenger
## 135 2826.382 2631.736 7181.866 Democrat Challenger
## 139 148613.398 28662.612 74699.674 Democrat Incumbent
## 164 563002.831 12912.175 16257.351 Republican Challenger
## 174 185063.058 6903.139 20954.384 Republican Challenger
## 196 162301.888 255166.765 20929.432 Democrat Incumbent
## 201 198552.802 5575.499 31503.851 Republican Incumbent
## 265 218180.717 20795.470 19268.728 Democrat Challenger
## 310 168986.485 11048.469 18027.503 Republican Open Seat
## 336 2829.279 18782.008 3370.107 Democrat Open Seat
## 352 3002.653 11885.863 4963.141 Democrat Challenger
## 359 1870.763 15931.446 9031.057 Republican Open Seat
## 360 288213.887 93746.110 26046.525 Republican Challenger
## 371 246908.562 3404.860 11588.114 Republican Open Seat
## 392 2546.191 26224.508 6861.774 Democrat Open Seat
## 416 1566.895 49807.964 22866.435 Republican Challenger
## 427 242183.863 11913.350 17969.119 Democrat Open Seat
## 439 2380.373 10327.800 11985.797 Republican Open Seat
## 449 2828.956 9636.170 29618.273 Republican Challenger
## 456 1539.291 1907.922 28540.749 Republican Open Seat
## republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 18 1 0 0 0 0 8.033707
## 44 1 0 1 0 1 12.168961
## 72 0 0 0 0 0 7.691288
## 97 0 1 0 0 0 12.187338
## 135 0 1 0 0 0 7.947107
## 139 0 0 0 0 0 11.909110
## 164 1 1 0 1 0 13.241042
## 174 1 1 0 1 0 12.128457
## 196 0 0 0 0 0 11.997220
## 201 1 0 0 0 0 12.198815
## 265 0 1 0 0 0 12.293084
## 310 1 0 1 0 1 12.037580
## 336 0 0 1 0 0 7.948131
## 352 0 1 0 0 0 8.007584
## 359 1 0 1 0 1 7.534636
## 360 1 1 0 1 0 12.571462
## 371 1 0 1 0 1 12.416777
## 392 0 0 1 0 0 7.842746
## 416 1 1 0 1 0 7.357489
## 427 0 0 1 0 0 12.397457
## 439 1 0 1 0 1 7.775432
## 449 1 1 0 1 0 7.948016
## 456 1 0 1 0 1 7.339727
## log_individual_contributions log_disbursements
## 18 9.872072 8.664462
## 44 8.063479 9.565321
## 72 7.752671 9.465327
## 97 9.347509 10.195255
## 135 7.875779 8.879454
## 139 10.263384 11.221244
## 164 9.466003 9.696362
## 174 8.839876 9.950151
## 196 12.449677 9.948959
## 201 8.626316 10.357897
## 265 9.942539 9.866291
## 310 9.310138 9.799709
## 336 9.840708 8.122996
## 352 9.383189 8.509996
## 359 9.676113 9.108535
## 360 11.448356 10.167678
## 371 8.133253 9.357822
## 392 10.174488 8.833867
## 416 10.815950 10.037469
## 427 9.385499 9.796466
## 439 9.242691 9.391561
## 449 9.173383 10.296181
## 456 7.554294 10.259123
##
## 5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)
## Number of outliers (|studentized residual| > 3): 1
## Outlier observations (row numbers):
## 164
## 164
##
## Details of outlier observations:
## receipts individual_contributions disbursements party incumbency
## 164 563002.8 12912.17 16257.35 Republican Challenger
## republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 164 1 1 0 1 0 13.24104
## log_individual_contributions log_disbursements
## 164 9.466003 9.696362
##
## 6. GLOBAL MODEL VALIDATION
##
## Call:
## lm(formula = log_receipts ~ party + incumbency + party:incumbency,
## data = campaign_data)
##
## Coefficients:
## (Intercept) partyRepublican
## 10.07180 0.09641
## incumbencyIncumbent incumbencyOpen Seat
## -0.07489 -0.05964
## partyRepublican:incumbencyIncumbent partyRepublican:incumbencyOpen Seat
## -0.03218 -0.22231
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model)
##
## Value p-value Decision
## Global Stat 1.730e+00 0.7853 Assumptions acceptable.
## Skewness 6.976e-01 0.4036 Assumptions acceptable.
## Kurtosis 1.261e-01 0.7225 Assumptions acceptable.
## Link Function -6.637e-13 1.0000 Assumptions acceptable.
## Heteroscedasticity 9.059e-01 0.3412 Assumptions acceptable.
model1_plots <- create_diagnostic_plots(model1_diag, "Log Receipts Model")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(model1_plots$combined)
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
robust_se1 <- calculate_robust_se(model1, "Log Receipts Model")
##
##
## =======================================================
## ROBUST STANDARD ERRORS FOR: Log Receipts Model
## =======================================================
##
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.071799 0.091172 110.4708 <2e-16 ***
## partyRepublican 0.096415 0.144831 0.6657 0.5059
## incumbencyIncumbent -0.074885 0.140473 -0.5331 0.5942
## incumbencyOpen Seat -0.059635 0.136694 -0.4363 0.6628
## partyRepublican:incumbencyIncumbent -0.032182 0.214220 -0.1502 0.8806
## partyRepublican:incumbencyOpen Seat -0.222310 0.212285 -1.0472 0.2955
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Comparison with original model coefficients:
## Original_Estimate Original_SE Original_p
## (Intercept) 10.07179937 0.1004404 0.0000000
## partyRepublican 0.09641466 0.1453100 0.5073134
## incumbencyIncumbent -0.07488510 0.1513377 0.6209464
## incumbencyOpen Seat -0.05963517 0.1440254 0.6790098
## partyRepublican:incumbencyIncumbent -0.03218202 0.2158096 0.8815180
## partyRepublican:incumbencyOpen Seat -0.22231004 0.2081697 0.2860751
## Robust_SE Robust_p
## (Intercept) 0.09117159 0.0000000
## partyRepublican 0.14483061 0.5059095
## incumbencyIncumbent 0.14047273 0.5942086
## incumbencyOpen Seat 0.13669384 0.6628328
## partyRepublican:incumbencyIncumbent 0.21422039 0.8806456
## partyRepublican:incumbencyOpen Seat 0.21228483 0.2955078
lambda1 <- try_boxcox(model1, "Log Receipts Model")
##
##
## =======================================================
## BOX-COX TRANSFORMATION FOR: Log Receipts Model
## =======================================================

## Optimal lambda: 0.6666667
## Interpretation: Power transformation with lambda = 0.6666667 suggested
# Run diagnostics for Model 2 (Log Individual Contributions)
model2_diag <- run_diagnostics(model2, "Log Individual Contributions Model")
##
##
## =======================================================
## DIAGNOSTIC TESTS FOR: Log Individual Contributions Model
## =======================================================
##
## 1. MULTICOLLINEARITY CHECK (VIF)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## GVIF Df GVIF^(1/(2*Df))
## party 2.781094 1 1.667661
## incumbency 3.807750 2 1.396906
## party:incumbency 7.376954 2 1.648046
##
## Interpretation:
## CAUTION: Moderate multicollinearity detected (VIF > 5)
## Variables with moderate multicollinearity:
##
## 2. HETEROSCEDASTICITY TESTS
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 4.1899, df = 5, p-value = 0.5224
##
##
## Interpretation:
## No heteroscedasticity detected using Breusch-Pagan test.
##
## 3. NORMALITY OF RESIDUALS
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.99576, p-value = 0.1967
##
##
## Interpretation:
## Residuals appear to be normally distributed.
##
## 4. INFLUENTIAL OBSERVATIONS
## Cook's Distance threshold (4/n): 0.008
## Number of influential observations: 26
## Influential observations (row numbers):
## 35 49 91 116 130 151 165 196 243 246 247 249 259 308 311 342 355 366 376 411
## 35 49 91 116 130 151 165 196 243 246 247 249 259 308 311 342 355 366 376 411
## 418 429 462 467 472 481
## 418 429 462 467 472 481
##
## Details of influential observations:
## receipts individual_contributions disbursements party incumbency
## 35 50090.221 138001.7246 41858.213 Democrat Incumbent
## 49 48048.448 246784.8005 87254.240 Democrat Open Seat
## 91 59486.451 458.6422 88691.000 Democrat Challenger
## 116 29766.931 588.7257 37897.285 Democrat Incumbent
## 130 20510.494 163417.0697 4395.857 Democrat Challenger
## 151 48423.419 1304.7860 17132.397 Republican Incumbent
## 165 14517.969 898.8702 10042.071 Democrat Incumbent
## 196 162301.888 255166.7649 20929.432 Democrat Incumbent
## 243 98327.662 233167.4244 64983.638 Republican Incumbent
## 246 147614.854 941.2796 32393.094 Democrat Open Seat
## 247 19910.950 337748.4964 21263.413 Democrat Open Seat
## 249 11330.240 230886.5303 63845.148 Democrat Open Seat
## 259 19660.368 1158.3838 16192.109 Republican Open Seat
## 308 27227.436 178742.0556 19860.965 Republican Open Seat
## 311 80916.739 658.8518 60376.880 Republican Open Seat
## 342 57469.254 334981.5716 67102.015 Republican Challenger
## 355 9016.328 216432.1254 40650.584 Republican Open Seat
## 366 17857.999 285966.8964 15459.804 Republican Challenger
## 376 12216.209 138810.5326 31763.353 Democrat Incumbent
## 411 36034.373 293766.2660 19121.358 Republican Open Seat
## 418 33869.992 1325.8721 16240.727 Republican Challenger
## 429 112822.202 264500.2440 16479.873 Republican Challenger
## 462 15545.517 210985.1927 34773.272 Democrat Open Seat
## 467 46207.621 1453.6496 16828.358 Republican Challenger
## 472 16622.590 1060.5575 5501.559 Republican Incumbent
## 481 22481.560 212367.6773 59216.905 Republican Incumbent
## republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 35 0 0 0 0 0 10.821601
## 49 0 0 1 0 0 10.779986
## 91 0 1 0 0 0 10.993521
## 116 0 0 0 0 0 10.301187
## 130 0 1 0 0 0 9.928741
## 151 1 0 0 0 0 10.787759
## 165 0 0 0 0 0 9.583211
## 196 0 0 0 0 0 11.997220
## 243 1 0 0 0 0 11.496071
## 246 0 0 1 0 0 11.902369
## 247 0 0 1 0 0 9.899075
## 249 0 0 1 0 0 9.335319
## 259 1 0 1 0 1 9.886411
## 308 1 0 1 0 1 10.212017
## 311 1 0 1 0 1 11.301188
## 342 1 1 0 1 0 10.959023
## 355 1 0 1 0 1 9.106903
## 366 1 1 0 1 0 9.790263
## 376 0 0 0 0 0 9.410601
## 411 1 0 1 0 1 10.492256
## 418 1 1 0 1 0 10.430314
## 429 1 1 0 1 0 11.633577
## 462 0 0 1 0 0 9.651592
## 467 1 1 0 1 0 10.740922
## 472 1 0 0 0 0 9.718578
## 481 1 0 0 0 0 10.020495
## log_individual_contributions log_disbursements
## 35 11.835029 10.642067
## 49 12.416276 11.376593
## 91 6.130448 11.392925
## 116 6.379658 10.542661
## 130 12.004067 8.388645
## 151 7.174560 9.748785
## 165 6.802251 9.214638
## 196 12.449677 9.948959
## 243 12.359516 11.081906
## 246 6.848302 10.385731
## 247 12.730060 9.964790
## 249 12.349686 11.064232
## 259 7.055644 9.692341
## 308 12.093705 9.896562
## 311 6.492015 11.008378
## 342 12.721834 11.113984
## 355 12.285037 10.612793
## 366 12.563635 9.646063
## 376 11.840872 10.366100
## 411 12.590543 9.858614
## 418 7.190580 9.695339
## 429 12.485601 9.709956
## 462 12.259548 10.456633
## 467 7.282520 9.730880
## 472 6.967492 8.612969
## 481 12.266079 10.988979
##
## 5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)
## Number of outliers (|studentized residual| > 3): 0
##
## 6. GLOBAL MODEL VALIDATION
##
## Call:
## lm(formula = log_individual_contributions ~ party + incumbency +
## party:incumbency, data = campaign_data)
##
## Coefficients:
## (Intercept) partyRepublican
## 9.40941 0.36183
## incumbencyIncumbent incumbencyOpen Seat
## -0.03518 0.02919
## partyRepublican:incumbencyIncumbent partyRepublican:incumbencyOpen Seat
## -0.28554 -0.27178
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model)
##
## Value p-value Decision
## Global Stat 9.693e-01 0.9144 Assumptions acceptable.
## Skewness 1.265e-01 0.7221 Assumptions acceptable.
## Kurtosis 2.980e-01 0.5852 Assumptions acceptable.
## Link Function -2.775e-14 1.0000 Assumptions acceptable.
## Heteroscedasticity 5.449e-01 0.4604 Assumptions acceptable.
model2_plots <- create_diagnostic_plots(model2_diag, "Log Individual Contributions Model")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(model2_plots$combined)
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
robust_se2 <- calculate_robust_se(model2, "Log Individual Contributions Model")
##
##
## =======================================================
## ROBUST STANDARD ERRORS FOR: Log Individual Contributions Model
## =======================================================
##
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.409406 0.114309 82.3157 < 2e-16 ***
## partyRepublican 0.361829 0.177271 2.0411 0.04177 *
## incumbencyIncumbent -0.035179 0.183938 -0.1913 0.84840
## incumbencyOpen Seat 0.029186 0.181022 0.1612 0.87198
## partyRepublican:incumbencyIncumbent -0.285540 0.262913 -1.0861 0.27798
## partyRepublican:incumbencyOpen Seat -0.271780 0.264524 -1.0274 0.30472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Comparison with original model coefficients:
## Original_Estimate Original_SE Original_p
## (Intercept) 9.40940578 0.1249737 7.065772e-273
## partyRepublican 0.36182897 0.1808030 4.591374e-02
## incumbencyIncumbent -0.03517908 0.1883030 8.518772e-01
## incumbencyOpen Seat 0.02918620 0.1792046 8.706911e-01
## partyRepublican:incumbencyIncumbent -0.28553988 0.2685226 2.881321e-01
## partyRepublican:incumbencyOpen Seat -0.27178026 0.2590166 2.945636e-01
## Robust_SE Robust_p
## (Intercept) 0.1143087 1.328757e-290
## partyRepublican 0.1772706 4.177071e-02
## incumbencyIncumbent 0.1839385 8.484047e-01
## incumbencyOpen Seat 0.1810221 8.719782e-01
## partyRepublican:incumbencyIncumbent 0.2629130 2.779811e-01
## partyRepublican:incumbencyOpen Seat 0.2645240 3.047202e-01
lambda2 <- try_boxcox(model2, "Log Individual Contributions Model")
##
##
## =======================================================
## BOX-COX TRANSFORMATION FOR: Log Individual Contributions Model
## =======================================================

## Optimal lambda: 0.8686869
## Interpretation: Power transformation with lambda = 0.8686869 suggested
# Run diagnostics for Model 3 (Log Disbursements)
model3_diag <- run_diagnostics(model3, "Log Disbursements Model")
##
##
## =======================================================
## DIAGNOSTIC TESTS FOR: Log Disbursements Model
## =======================================================
##
## 1. MULTICOLLINEARITY CHECK (VIF)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## GVIF Df GVIF^(1/(2*Df))
## party 2.781094 1 1.667661
## incumbency 3.807750 2 1.396906
## party:incumbency 7.376954 2 1.648046
##
## Interpretation:
## CAUTION: Moderate multicollinearity detected (VIF > 5)
## Variables with moderate multicollinearity:
##
## 2. HETEROSCEDASTICITY TESTS
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 2.8734, df = 5, p-value = 0.7195
##
##
## Interpretation:
## No heteroscedasticity detected using Breusch-Pagan test.
##
## 3. NORMALITY OF RESIDUALS
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.99754, p-value = 0.6738
##
##
## Interpretation:
## Residuals appear to be normally distributed.
##
## 4. INFLUENTIAL OBSERVATIONS
## Cook's Distance threshold (4/n): 0.008
## Number of influential observations: 25
## Influential observations (row numbers):
## 5 8 11 12 92 113 126 166 190 192 193 217 223 284 289 292 322 324 356 386
## 5 8 11 12 92 113 126 166 190 192 193 217 223 284 289 292 322 324 356 386
## 435 437 447 489 494
## 435 437 447 489 494
##
## Details of influential observations:
## receipts individual_contributions disbursements party incumbency
## 5 25066.504 2184.190 1818.180 Democrat Challenger
## 8 6216.354 1113.317 158670.759 Democrat Open Seat
## 11 74912.931 11886.480 223608.500 Republican Open Seat
## 12 31565.304 17315.945 230729.988 Democrat Incumbent
## 92 38116.287 23340.880 2122.466 Republican Challenger
## 113 4368.248 101225.642 159857.844 Republican Challenger
## 126 11476.424 41332.276 316677.767 Republican Open Seat
## 166 29679.967 13874.823 2931.238 Republican Open Seat
## 190 13369.189 1103.026 2492.319 Democrat Open Seat
## 192 15919.723 55526.973 115639.614 Democrat Incumbent
## 193 24211.513 55821.899 129862.446 Republican Incumbent
## 217 14169.355 36362.085 2548.570 Republican Challenger
## 223 66826.042 28859.544 2570.633 Republican Challenger
## 284 16887.827 3255.071 2247.644 Republican Challenger
## 289 17693.447 1389.098 3003.747 Republican Open Seat
## 292 63207.676 4463.644 88957.568 Republican Incumbent
## 322 74466.796 32557.285 2996.886 Democrat Incumbent
## 324 42651.610 34467.823 381296.947 Democrat Challenger
## 356 30757.924 6475.380 109646.403 Democrat Incumbent
## 386 87427.935 23128.683 3066.232 Republican Incumbent
## 435 22454.791 13183.812 1594.322 Democrat Incumbent
## 437 44922.916 5859.510 2561.694 Democrat Challenger
## 447 34160.275 6032.225 3763.408 Democrat Incumbent
## 489 112070.253 3750.676 132325.330 Democrat Incumbent
## 494 7405.721 10342.650 140885.789 Democrat Open Seat
## republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 5 0 1 0 0 0 10.129328
## 8 0 0 1 0 0 8.735100
## 11 1 0 1 0 1 11.224095
## 12 0 0 0 0 0 10.359846
## 92 1 1 0 1 0 10.548423
## 113 1 1 0 1 0 8.382346
## 126 1 0 1 0 1 9.348137
## 166 1 0 1 0 1 10.298261
## 190 0 0 1 0 0 9.500783
## 192 0 0 0 0 0 9.675377
## 193 1 0 0 0 0 10.094625
## 217 1 1 0 1 0 9.558907
## 223 1 1 0 1 0 11.109863
## 284 1 1 0 1 0 9.734408
## 289 1 0 1 0 1 9.781006
## 292 1 0 0 0 0 11.054197
## 322 0 0 0 0 0 11.218122
## 324 0 1 0 0 0 10.660844
## 356 0 0 0 0 0 10.333935
## 386 1 0 0 0 0 11.378582
## 435 0 0 0 0 0 10.019304
## 437 0 1 0 0 0 10.712726
## 447 0 0 0 0 0 10.438848
## 489 0 0 0 0 0 11.626890
## 494 0 0 1 0 0 8.910143
## log_individual_contributions log_disbursements
## 5 7.689458 7.506141
## 8 7.015997 11.974593
## 11 9.383241 12.317657
## 12 9.759441 12.349008
## 92 10.058004 7.660805
## 113 11.525117 11.982046
## 126 10.629423 12.665643
## 166 9.537903 7.983521
## 190 7.006719 7.821370
## 192 10.924642 11.658243
## 193 10.929939 11.774239
## 217 10.501309 7.843680
## 223 10.270231 7.852297
## 284 8.088277 7.718083
## 289 7.237129 8.007949
## 292 8.403945 11.395926
## 322 10.390787 8.005663
## 324 10.447811 12.851336
## 356 8.775917 11.605025
## 386 10.048872 8.028531
## 435 9.486821 7.374831
## 437 8.675992 7.848814
## 447 8.705037 8.233346
## 489 8.229958 11.793026
## 494 9.244128 11.855712
##
## 5. OUTLIERS IN RESPONSE (STUDENTIZED RESIDUALS)
## Number of outliers (|studentized residual| > 3): 2
## Outlier observations (row numbers):
## 126 324
## 126 324
##
## Details of outlier observations:
## receipts individual_contributions disbursements party incumbency
## 126 11476.42 41332.28 316677.8 Republican Open Seat
## 324 42651.61 34467.82 381296.9 Democrat Challenger
## republican challenger open_seat rep_challenger rep_open_seat log_receipts
## 126 1 0 1 0 1 9.348137
## 324 0 1 0 0 0 10.660844
## log_individual_contributions log_disbursements
## 126 10.62942 12.66564
## 324 10.44781 12.85134
##
## 6. GLOBAL MODEL VALIDATION
##
## Call:
## lm(formula = log_disbursements ~ party + incumbency + party:incumbency,
## data = campaign_data)
##
## Coefficients:
## (Intercept) partyRepublican
## 9.74832 0.06444
## incumbencyIncumbent incumbencyOpen Seat
## 0.18826 0.05859
## partyRepublican:incumbencyIncumbent partyRepublican:incumbencyOpen Seat
## -0.28297 0.06301
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model)
##
## Value p-value Decision
## Global Stat 6.115e-01 0.9618 Assumptions acceptable.
## Skewness 1.004e-01 0.7514 Assumptions acceptable.
## Kurtosis 1.285e-01 0.7200 Assumptions acceptable.
## Link Function -2.822e-13 1.0000 Assumptions acceptable.
## Heteroscedasticity 3.826e-01 0.5362 Assumptions acceptable.
model3_plots <- create_diagnostic_plots(model3_diag, "Log Disbursements Model")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(model3_plots$combined)
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
robust_se3 <- calculate_robust_se(model3, "Log Disbursements Model")
##
##
## =======================================================
## ROBUST STANDARD ERRORS FOR: Log Disbursements Model
## =======================================================
##
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.748315 0.098840 98.6275 <2e-16 ***
## partyRepublican 0.064445 0.134950 0.4775 0.6332
## incumbencyIncumbent 0.188255 0.146442 1.2855 0.1992
## incumbencyOpen Seat 0.058587 0.140171 0.4180 0.6762
## partyRepublican:incumbencyIncumbent -0.282973 0.197258 -1.4345 0.1521
## partyRepublican:incumbencyOpen Seat 0.063015 0.193512 0.3256 0.7448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Comparison with original model coefficients:
## Original_Estimate Original_SE Original_p
## (Intercept) 9.74831521 0.09223127 0.0000000
## partyRepublican 0.06444475 0.13343360 0.6293293
## incumbencyIncumbent 0.18825548 0.13896869 0.1761448
## incumbencyOpen Seat 0.05858721 0.13225397 0.6579668
## partyRepublican:incumbencyIncumbent -0.28297271 0.19817122 0.1539466
## partyRepublican:incumbencyOpen Seat 0.06301478 0.19115572 0.7418030
## Robust_SE Robust_p
## (Intercept) 0.09883971 0.0000000
## partyRepublican 0.13494998 0.6331850
## incumbencyIncumbent 0.14644186 0.1992093
## incumbencyOpen Seat 0.14017102 0.6761511
## partyRepublican:incumbencyIncumbent 0.19725787 0.1520532
## partyRepublican:incumbencyOpen Seat 0.19351245 0.7448370
lambda3 <- try_boxcox(model3, "Log Disbursements Model")
##
##
## =======================================================
## BOX-COX TRANSFORMATION FOR: Log Disbursements Model
## =======================================================

## Optimal lambda: 0.8686869
## Interpretation: Power transformation with lambda = 0.8686869 suggested
# =====================================================================
# STEP 9: ADDRESSING DIAGNOSTIC ISSUES (IF NEEDED)
# =====================================================================
# If heteroscedasticity was detected, robust SEs have already been calculated above
# If influential observations were detected
if(length(model1_diag$influential) > 0) {
cat("\n\n=======================================================\n")
cat("RERUNNING MODEL 1 WITHOUT INFLUENTIAL OBSERVATIONS\n")
cat("=======================================================\n\n")
# Create new dataset without influential observations
clean_data1 <- campaign_data[-model1_diag$influential, ]
# Rerun model
clean_model1 <- lm(log_receipts ~ party + incumbency + party:incumbency, data = clean_data1)
# Compare models
cat("Original Model 1:\n")
print(summary(model1)$coefficients)
cat("\nModel 1 without influential observations:\n")
print(summary(clean_model1)$coefficients)
# Calculate percentage change in coefficients
orig_coef <- summary(model1)$coefficients[, 1]
new_coef <- summary(clean_model1)$coefficients[, 1]
pct_change <- 100 * (new_coef - orig_coef) / orig_coef
cat("\nPercentage change in coefficients:\n")
print(data.frame(
Original = orig_coef,
Without_Outliers = new_coef,
Percent_Change = pct_change
))
}
##
##
## =======================================================
## RERUNNING MODEL 1 WITHOUT INFLUENTIAL OBSERVATIONS
## =======================================================
##
## Original Model 1:
## Estimate Std. Error t value
## (Intercept) 10.07179937 0.1004404 100.2763677
## partyRepublican 0.09641466 0.1453100 0.6635102
## incumbencyIncumbent -0.07488510 0.1513377 -0.4948211
## incumbencyOpen Seat -0.05963517 0.1440254 -0.4140602
## partyRepublican:incumbencyIncumbent -0.03218202 0.2158096 -0.1491222
## partyRepublican:incumbencyOpen Seat -0.22231004 0.2081697 -1.0679268
## Pr(>|t|)
## (Intercept) 0.0000000
## partyRepublican 0.5073134
## incumbencyIncumbent 0.6209464
## incumbencyOpen Seat 0.6790098
## partyRepublican:incumbencyIncumbent 0.8815180
## partyRepublican:incumbencyOpen Seat 0.2860751
##
## Model 1 without influential observations:
## Estimate Std. Error t value
## (Intercept) 10.07015587 0.09076403 110.94874994
## partyRepublican 0.06836192 0.13187708 0.51837605
## incumbencyIncumbent -0.09587365 0.13667766 -0.70145804
## incumbencyOpen Seat -0.03650145 0.12984370 -0.28111835
## partyRepublican:incumbencyIncumbent 0.01699279 0.19491232 0.08718169
## partyRepublican:incumbencyOpen Seat -0.21516200 0.18912561 -1.13766719
## Pr(>|t|)
## (Intercept) 0.0000000
## partyRepublican 0.6044394
## incumbencyIncumbent 0.4833636
## incumbencyOpen Seat 0.7787431
## partyRepublican:incumbencyIncumbent 0.9305641
## partyRepublican:incumbencyOpen Seat 0.2558379
##
## Percentage change in coefficients:
## Original Without_Outliers Percent_Change
## (Intercept) 10.07179937 10.07015587 -0.01631786
## partyRepublican 0.09641466 0.06836192 -29.09593107
## incumbencyIncumbent -0.07488510 -0.09587365 28.02765815
## incumbencyOpen Seat -0.05963517 -0.03650145 -38.79208156
## partyRepublican:incumbencyIncumbent -0.03218202 0.01699279 -152.80211302
## partyRepublican:incumbencyOpen Seat -0.22231004 -0.21516200 -3.21534683
# Repeat for other models if needed
if(length(model2_diag$influential) > 0) {
cat("\n\n=======================================================\n")
cat("RERUNNING MODEL 2 WITHOUT INFLUENTIAL OBSERVATIONS\n")
cat("=======================================================\n\n")
# Create new dataset without influential observations
clean_data2 <- campaign_data[-model2_diag$influential, ]
# Rerun model
clean_model2 <- lm(log_individual_contributions ~ party + incumbency + party:incumbency, data = clean_data2)
# Compare models
cat("Original Model 2:\n")
print(summary(model2)$coefficients)
cat("\nModel 2 without influential observations:\n")
print(summary(clean_model2)$coefficients)
# Calculate percentage change in coefficients
orig_coef <- summary(model2)$coefficients[, 1]
new_coef <- summary(clean_model2)$coefficients[, 1]
pct_change <- 100 * (new_coef - orig_coef) / orig_coef
cat("\nPercentage change in coefficients:\n")
print(data.frame(
Original = orig_coef,
Without_Outliers = new_coef,
Percent_Change = pct_change
))
}
##
##
## =======================================================
## RERUNNING MODEL 2 WITHOUT INFLUENTIAL OBSERVATIONS
## =======================================================
##
## Original Model 2:
## Estimate Std. Error t value
## (Intercept) 9.40940578 0.1249737 75.2911044
## partyRepublican 0.36182897 0.1808030 2.0012336
## incumbencyIncumbent -0.03517908 0.1883030 -0.1868216
## incumbencyOpen Seat 0.02918620 0.1792046 0.1628653
## partyRepublican:incumbencyIncumbent -0.28553988 0.2685226 -1.0633735
## partyRepublican:incumbencyOpen Seat -0.27178026 0.2590166 -1.0492773
## Pr(>|t|)
## (Intercept) 7.065772e-273
## partyRepublican 4.591374e-02
## incumbencyIncumbent 8.518772e-01
## incumbencyOpen Seat 8.706911e-01
## partyRepublican:incumbencyIncumbent 2.881321e-01
## partyRepublican:incumbencyOpen Seat 2.945636e-01
##
## Model 2 without influential observations:
## Estimate Std. Error t value
## (Intercept) 9.41684378 0.1102126 85.4425379
## partyRepublican 0.31256386 0.1610689 1.9405596
## incumbencyIncumbent -0.07792654 0.1683525 -0.4628772
## incumbencyOpen Seat -0.09028647 0.1595320 -0.5659457
## partyRepublican:incumbencyIncumbent -0.21456510 0.2405795 -0.8918678
## partyRepublican:incumbencyOpen Seat -0.14780178 0.2318619 -0.6374561
## Pr(>|t|)
## (Intercept) 1.201162e-287
## partyRepublican 5.291190e-02
## incumbencyIncumbent 6.436676e-01
## incumbencyOpen Seat 5.717020e-01
## partyRepublican:incumbencyIncumbent 3.729221e-01
## partyRepublican:incumbencyOpen Seat 5.241395e-01
##
## Percentage change in coefficients:
## Original Without_Outliers Percent_Change
## (Intercept) 9.40940578 9.41684378 0.07904858
## partyRepublican 0.36182897 0.31256386 -13.61558020
## incumbencyIncumbent -0.03517908 -0.07792654 121.51384528
## incumbencyOpen Seat 0.02918620 -0.09028647 -409.34642144
## partyRepublican:incumbencyIncumbent -0.28553988 -0.21456510 -24.85634482
## partyRepublican:incumbencyOpen Seat -0.27178026 -0.14780178 -45.61717482
if(length(model3_diag$influential) > 0) {
cat("\n\n=======================================================\n")
cat("RERUNNING MODEL 3 WITHOUT INFLUENTIAL OBSERVATIONS\n")
cat("=======================================================\n\n")
# Create new dataset without influential observations
clean_data3 <- campaign_data[-model3_diag$influential, ]
# Rerun model
clean_model3 <- lm(log_disbursements ~ party + incumbency + party:incumbency, data = clean_data3)
# Compare models
cat("Original Model 3:\n")
print(summary(model3)$coefficients)
cat("\nModel 3 without influential observations:\n")
print(summary(clean_model3)$coefficients)
# Calculate percentage change in coefficients
orig_coef <- summary(model3)$coefficients[, 1]
new_coef <- summary(clean_model3)$coefficients[, 1]
pct_change <- 100 * (new_coef - orig_coef) / orig_coef
cat("\nPercentage change in coefficients:\n")
print(data.frame(
Original = orig_coef,
Without_Outliers = new_coef,
Percent_Change = pct_change
))
}
##
##
## =======================================================
## RERUNNING MODEL 3 WITHOUT INFLUENTIAL OBSERVATIONS
## =======================================================
##
## Original Model 3:
## Estimate Std. Error t value
## (Intercept) 9.74831521 0.09223127 105.6942528
## partyRepublican 0.06444475 0.13343360 0.4829724
## incumbencyIncumbent 0.18825548 0.13896869 1.3546611
## incumbencyOpen Seat 0.05858721 0.13225397 0.4429902
## partyRepublican:incumbencyIncumbent -0.28297271 0.19817122 -1.4279204
## partyRepublican:incumbencyOpen Seat 0.06301478 0.19115572 0.3296515
## Pr(>|t|)
## (Intercept) 0.0000000
## partyRepublican 0.6293293
## incumbencyIncumbent 0.1761448
## incumbencyOpen Seat 0.6579668
## partyRepublican:incumbencyIncumbent 0.1539466
## partyRepublican:incumbencyOpen Seat 0.7418030
##
## Model 3 without influential observations:
## Estimate Std. Error t value
## (Intercept) 9.75972898 0.08144172 119.83697389
## partyRepublican 0.12719010 0.11867757 1.07172823
## incumbencyIncumbent 0.15500371 0.12506566 1.23937866
## incumbencyOpen Seat 0.02123195 0.11683806 0.18172115
## partyRepublican:incumbencyIncumbent -0.35227683 0.17741503 -1.98560871
## partyRepublican:incumbencyOpen Seat 0.01034781 0.16982625 0.06093177
## Pr(>|t|)
## (Intercept) 0.00000000
## partyRepublican 0.28439329
## incumbencyIncumbent 0.21582513
## incumbencyOpen Seat 0.85588008
## partyRepublican:incumbencyIncumbent 0.04765867
## partyRepublican:incumbencyOpen Seat 0.95143950
##
## Percentage change in coefficients:
## Original Without_Outliers Percent_Change
## (Intercept) 9.74831521 9.75972898 0.1170846
## partyRepublican 0.06444475 0.12719010 97.3630137
## incumbencyIncumbent 0.18825548 0.15500371 -17.6631118
## incumbencyOpen Seat 0.05858721 0.02123195 -63.7600992
## partyRepublican:incumbencyIncumbent -0.28297271 -0.35227683 24.4914488
## partyRepublican:incumbencyOpen Seat 0.06301478 0.01034781 -83.5787499
# =====================================================================
# STEP 10: FINAL SUMMARY AND RECOMMENDATIONS
# =====================================================================
cat("\n\n=======================================================\n")
##
##
## =======================================================
cat("FINAL SUMMARY AND RECOMMENDATIONS\n")
## FINAL SUMMARY AND RECOMMENDATIONS
cat("=======================================================\n\n")
## =======================================================
cat("Model 1 (Log Receipts):\n")
## Model 1 (Log Receipts):
if(model1_diag$bp_test$p.value < 0.05) {
cat("- Heteroscedasticity detected: Use robust standard errors\n")
} else {
cat("- No heteroscedasticity detected\n")
}
## - No heteroscedasticity detected
if(model1_diag$sw_test$p.value < 0.05) {
cat("- Non-normal residuals detected: Consider transformation\n")
} else {
cat("- Residuals appear normally distributed\n")
}
## - Residuals appear normally distributed
if(length(model1_diag$influential) > 0) {
cat("- Influential observations detected: Consider analysis without these cases\n")
} else {
cat("- No influential observations detected\n")
}
## - Influential observations detected: Consider analysis without these cases
cat("\nModel 2 (Log Individual Contributions):\n")
##
## Model 2 (Log Individual Contributions):
if(model2_diag$bp_test$p.value < 0.05) {
cat("- Heteroscedasticity detected: Use robust standard errors\n")
} else {
cat("- No heteroscedasticity detected\n")
}
## - No heteroscedasticity detected
if(model2_diag$sw_test$p.value < 0.05) {
cat("- Non-normal residuals detected: Consider transformation\n")
} else {
cat("- Residuals appear normally distributed\n")
}
## - Residuals appear normally distributed
if(length(model2_diag$influential) > 0) {
cat("- Influential observations detected: Consider analysis without these cases\n")
} else {
cat("- No influential observations detected\n")
}
## - Influential observations detected: Consider analysis without these cases
cat("\nModel 3 (Log Disbursements):\n")
##
## Model 3 (Log Disbursements):
if(model3_diag$bp_test$p.value < 0.05) {
cat("- Heteroscedasticity detected: Use robust standard errors\n")
} else {
cat("- No heteroscedasticity detected\n")
}
## - No heteroscedasticity detected
if(model3_diag$sw_test$p.value < 0.05) {
cat("- Non-normal residuals detected: Consider transformation\n")
} else {
cat("- Residuals appear normally distributed\n")
}
## - Residuals appear normally distributed
if(length(model3_diag$influential) > 0) {
cat("- Influential observations detected: Consider analysis without these cases\n")
} else {
cat("- No influential observations detected\n")
}
## - Influential observations detected: Consider analysis without these cases
cat("\nOverall Conclusion:\n")
##
## Overall Conclusion:
cat("Based on the diagnostics, the following adjustments are recommended:\n")
## Based on the diagnostics, the following adjustments are recommended:
cat("1. Use robust standard errors for all models due to potential heteroscedasticity\n")
## 1. Use robust standard errors for all models due to potential heteroscedasticity
cat("2. Consider examining the most influential observations to understand their impact\n")
## 2. Consider examining the most influential observations to understand their impact
cat("3. The Box-Cox analysis suggests the log transformation is appropriate for these models\n")
## 3. The Box-Cox analysis suggests the log transformation is appropriate for these models
# =====================================================================
# STEP 11: SAVING RESULTS (OPTIONAL)
# =====================================================================
# Save diagnostic plots
# Uncomment if you want to save the plots
# pdf("diagnostic_plots.pdf")
# print(model1_plots$combined)
# print(model2_plots$combined)
# print(model3_plots$combined)
# dev.off()
# Save robust models results
# Uncomment if you want to save the results
# sink("robust_model_results.txt")
# cat("ROBUST STANDARD ERRORS FOR WOMEN CANDIDATES CAMPAIGN FINANCE MODELS\n\n")
# cat("Model 1 (Log Receipts):\n")
# print(robust_se1)
# cat("\nModel 2 (Log Individual Contributions):\n")
# print(robust_se2)
# cat("\nModel 3 (Log Disbursements):\n")
# print(robust_se3)
# sink()
cat("\nAnalysis complete! Check diagnostic plots and robust model results for final interpretation.\n")
##
## Analysis complete! Check diagnostic plots and robust model results for final interpretation.