# Fixing the error in the regression models for the Gender, Party and Policy
# Prioritization study
#
# ---------------------------------------------------------------------------

# Load required libraries
# tidyverse is attached last so that MASS::select() and car::recode() do not
# mask their dplyr counterparts.
library(lmtest)
library(car)
library(MASS)
library(stargazer)
library(tidyverse)

# Build four nested OLS models of `dependent_var`, adding predictor sets step
# by step: gender and party, their interaction, race/ethnicity indicators, and
# finally institutional and district controls.
#
# Any predictor missing from `data` is replaced by a randomly generated
# placeholder column (a message is emitted for each) so that the models can
# still be fitted. Placeholder columns carry no real information; replace them
# with actual data before interpreting results.
#
# Args:
#   data: data frame holding the dependent variable and, ideally, every
#     predictor used below.
#   dependent_var: name of the outcome column (default "bills").
#
# Returns:
#   Named list of `lm` fits: `basic`, `interaction`, `demographic`, `full`.
#
# Raises:
#   An error if `dependent_var` is not a column of `data`.
build_nested_models <- function(data, dependent_var = "bills") {
  if (!dependent_var %in% names(data)) {
    stop(
      "Dependent variable column not found in data: ", dependent_var,
      call. = FALSE
    )
  }

  # Check if 'district_partisan' exists, if not, create it as a placeholder
  if (!"district_partisan" %in% names(data)) {
    message("Creating placeholder for missing ‘district_partisan’ variable")
    # Random values only; replace with actual data if available
    data$district_partisan <- rnorm(nrow(data), mean = 0, sd = 1)
  }

  # Check for other potentially missing variables
  binary_vars <- c(
    "woman", "democrat", "black", "latina", "asian",
    "committee_chair", "majority_party", "urban_district"
  )
  required_vars <- c(
    "woman", "democrat", "black", "latina", "asian", "committee_chair",
    "seniority", "majority_party", "electoral_margin", "urban_district"
  )

  for (var in setdiff(required_vars, names(data))) {
    message(paste("Creating placeholder for missing variable:", var))
    if (var %in% binary_vars) {
      # Binary variables
      data[[var]] <- sample(c(0, 1), size = nrow(data), replace = TRUE)
    } else {
      # Continuous variables
      data[[var]] <- rnorm(nrow(data), mean = 0, sd = 1)
    }
  }

  # Create interaction term if not present
  if (!"woman_democrat" %in% names(data)) {
    message("Creating woman × democrat interaction term")
    data$woman_democrat <- data$woman * data$democrat
  }

  message(paste("Using dependent variable column:", dependent_var))

  # Each specification extends the previous one. The full model includes
  # district_partisan, which is guaranteed to exist by the check above and is
  # expected by the downstream regression table.
  basic_terms <- c("woman", "democrat")
  interaction_terms <- c(basic_terms, "woman_democrat")
  demographic_terms <- c(interaction_terms, "black", "latina", "asian")
  full_terms <- c(
    demographic_terms, "committee_chair", "seniority", "majority_party",
    "electoral_margin", "urban_district", "district_partisan"
  )

  term_sets <- list(
    basic = basic_terms,
    interaction = interaction_terms,
    demographic = demographic_terms,
    full = full_terms
  )

  # lapply() keeps the list names, so the result is the named list of models
  lapply(term_sets, function(terms) {
    lm(reformulate(terms, response = dependent_var), data = data)
  })
}

# Seed the random number generator before creating the simulated datasets

set.seed(123) # For reproducibility: fixes the RNG state used by the sample()/rnorm()/rpois()/runif() draws in this script

# Helper function to create simulated datasets.
#
# Simulates `n_obs` legislators for one time period. Predictors are drawn at
# random; the outcome `bills` is a linear combination of the predictors plus
# standard-normal noise, floored at zero. The coefficients on woman, democrat,
# their interaction, and black vary by period; all others are fixed.
#
# Args:
#   n_obs: number of observations (rows) to simulate.
#   time_period: one of "1993-2000", "2001-2010", "2011-2022".
#
# Returns:
#   Data frame with columns bills, woman, democrat, woman_democrat, black,
#   latina, asian, committee_chair, seniority, majority_party,
#   electoral_margin, urban_district, district_partisan.
#
# Raises:
#   An error if `time_period` is not one of the supported periods.
create_simulated_data <- function(n_obs, time_period) {
  # Base coefficients that change over time
  period_coefs <- list(
    "1993-2000" = c(woman = 2.34, democrat = 0.73, interaction = 0.87, black = 0.42),
    "2001-2010" = c(woman = 2.01, democrat = 1.24, interaction = 1.58, black = 0.87),
    "2011-2022" = c(woman = 1.64, democrat = 1.75, interaction = 2.31, black = 1.24)
  )

  # Without this guard an unknown period makes the coefficients NULL and the
  # failure only surfaces later as a row-count mismatch in data.frame().
  if (!is.character(time_period) || length(time_period) != 1L ||
    !time_period %in% names(period_coefs)) {
    stop(
      "time_period must be one of: ",
      paste(names(period_coefs), collapse = ", "),
      call. = FALSE
    )
  }

  coefs <- period_coefs[[time_period]]
  woman_coef <- coefs[["woman"]]
  democrat_coef <- coefs[["democrat"]]
  interaction_coef <- coefs[["interaction"]]
  black_coef <- coefs[["black"]]

  # Create predictors. The order of the random draws is kept fixed so that a
  # given seed always yields the same dataset.
  woman <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.8, 0.2))
  democrat <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.5, 0.5))
  woman_democrat <- woman * democrat
  black <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.87, 0.13))
  latina <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.93, 0.07))
  asian <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.97, 0.03))
  committee_chair <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.85, 0.15))
  seniority <- rpois(n_obs, lambda = 5)
  majority_party <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.45, 0.55))
  electoral_margin <- runif(n_obs, 0, 30)
  urban_district <- sample(c(0, 1), n_obs, replace = TRUE, prob = c(0.4, 0.6))
  district_partisan <- rnorm(n_obs, mean = 0, sd = 1)

  # Create the dependent variable with appropriate coefficients and noise
  bills <- woman * woman_coef +
    democrat * democrat_coef +
    woman_democrat * interaction_coef +
    black * black_coef +
    latina * 0.3 +
    asian * 0.2 +
    committee_chair * 0.6 +
    seniority * 0.1 +
    majority_party * 0.5 +
    electoral_margin * 0.01 +
    urban_district * 0.25 +
    district_partisan * 0.15 +
    rnorm(n_obs, 0, 1) # Add random noise

  # Ensure bills is non-negative
  bills <- pmax(bills, 0)

  data.frame(
    bills = bills,
    woman = woman,
    democrat = democrat,
    woman_democrat = woman_democrat,
    black = black,
    latina = latina,
    asian = asian,
    committee_chair = committee_chair,
    seniority = seniority,
    majority_party = majority_party,
    electoral_margin = electoral_margin,
    urban_district = urban_district,
    district_partisan = district_partisan
  )
}

# Create simulated datasets for each time period (one row per observation;
# the sample sizes differ by period)
data_1993_2000 <- create_simulated_data(1108, "1993-2000")
data_2001_2010 <- create_simulated_data(1065, "2001-2010")
data_2011_2022 <- create_simulated_data(1085, "2011-2022")

# Now build the models for each time period
models_1993_2000 <- build_nested_models(data_1993_2000)
models_2001_2010 <- build_nested_models(data_2001_2010)
models_2011_2022 <- build_nested_models(data_2011_2022)

# Generate regression tables with stargazer
if (requireNamespace("stargazer", quietly = TRUE)) {
  # Display labels keyed by model term. The labels passed to stargazer are
  # derived from the terms actually present in the full model, so a label can
  # never shift onto the wrong coefficient (or onto the constant) when the
  # model specification and a hand-written label vector disagree.
  covariate_label_map <- c(
    woman = "Woman",
    democrat = "Democrat",
    woman_democrat = "Woman × Democrat",
    black = "Black",
    latina = "Latina",
    asian = "Asian",
    committee_chair = "Committee Chair",
    seniority = "Seniority",
    majority_party = "Majority Party",
    electoral_margin = "Electoral Margin",
    urban_district = "Urban District",
    district_partisan = "District Partisan"
  )
  model_terms <- setdiff(names(coef(models_1993_2000$full)), "(Intercept)")
  # Terms without a display label fall back to their raw name
  covariate_labels <- unname(ifelse(
    model_terms %in% names(covariate_label_map),
    covariate_label_map[model_terms],
    model_terms
  ))

  stargazer::stargazer(
    models_1993_2000$full, models_2001_2010$full, models_2011_2022$full,
    title = "Predictors of Women’s Issue Bill Sponsorship (OLS Regression)",
    column.labels = c("1993-2000", "2001-2010", "2011-2022"),
    model.numbers = FALSE,
    covariate.labels = covariate_labels,
    dep.var.caption = "Dependent Variable: Number of Women’s Issue Bills",
    dep.var.labels.include = FALSE,
    omit.stat = c("ser", "f"),
    type = "text"
  )
} else {
  message("stargazer package is not installed. Install it with install.packages(‘stargazer’)")
}

# Extract key coefficients from a fitted model.
#
# Pulls the estimate, standard error, t value and p value for the intercept
# and the key predictors (woman, democrat, woman_democrat, black, latina)
# from `summary(model)`.
#
# Args:
#   model: a fitted model whose `coef(summary(model))` is a matrix with
#     columns "Estimate", "Std. Error", "t value" and "Pr(>|t|)" (e.g. `lm`).
#
# Returns:
#   Data frame with one row per key variable, in a fixed order, and columns
#   Variable, Coefficient, StdError, tValue, pValue. Variables that are not in
#   the model are kept as rows of NA so every period has the same structure.
extract_coefficients <- function(model) {
  coef_table <- coef(summary(model))
  key_vars <- c("woman", "democrat", "woman_democrat", "black", "latina")
  wanted <- c("(Intercept)", key_vars)

  # match() gives NA for variables absent from the model; indexing a matrix
  # row by NA yields NA, which is exactly the placeholder wanted here.
  rows <- match(wanted, rownames(coef_table))
  pick <- function(column) unname(coef_table[rows, column])

  data.frame(
    Variable = wanted,
    Coefficient = pick("Estimate"),
    StdError = pick("Std. Error"),
    tValue = pick("t value"),
    pValue = pick("Pr(>|t|)"),
    stringsAsFactors = FALSE
  )
}

# Extract coefficients from each period's full model
coefs_1993_2000 <- extract_coefficients(models_1993_2000$full)
coefs_2001_2010 <- extract_coefficients(models_2001_2010$full)
coefs_2011_2022 <- extract_coefficients(models_2011_2022$full)

# Combine into one data frame - be careful to maintain consistent structure.
# Each table is tagged with its period first so rows stay identifiable after
# stacking.
coefs_1993_2000$Period <- "1993-2000"
coefs_2001_2010$Period <- "2001-2010"
coefs_2011_2022$Period <- "2011-2022"

all_coefs <- rbind(coefs_1993_2000, coefs_2001_2010, coefs_2011_2022)

# Print combined coefficients
print(all_coefs)

# Create coefficient plot: one line per key predictor across the three
# periods, with the intercept excluded.
coefficient_plot_data <- all_coefs[all_coefs$Variable != "(Intercept)", ]

coefficient_plot <- ggplot(
  coefficient_plot_data,
  aes(x = Period, y = Coefficient, color = Variable, group = Variable)
) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  labs(
    title = "Changes in Key Predictors of Women’s Issue Bill Sponsorship",
    subtitle = "Coefficient values across three time periods",
    x = "Time Period",
    y = "Coefficient Value"
  ) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_brewer(palette = "Set1") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  annotate(
    "text",
    x = 3, y = 2.4,
    # Label text repaired: the word before the multiplication sign was lost
    label = "Rising importance of Woman × Democrat interaction",
    color = "purple", fontface = "bold", size = 4
  )

# Print explicitly so the plot is also drawn when the script is source()d
print(coefficient_plot)

# Add asterisks to represent significance levels.
#
# Args:
#   p_value: numeric p value(s); may contain NA.
#
# Returns:
#   Character vector the same length as `p_value`: "***" for p < 0.001,
#   "**" for p < 0.01, "*" for p < 0.05, and "" otherwise (including NA).
add_significance_stars <- function(p_value) {
  stars <- rep("", length(p_value))
  known <- !is.na(p_value)
  # Assign from the weakest to the strongest threshold so that stricter
  # levels overwrite looser ones.
  stars[known & p_value < 0.05] <- "*"
  stars[known & p_value < 0.01] <- "**"
  stars[known & p_value < 0.001] <- "***"
  stars
}

# Attach significance stars to every coefficient. vapply() guarantees a
# character vector regardless of input size.
all_coefs$stars <- vapply(
  all_coefs$pValue, add_significance_stars, character(1)
)
# Fixed two-decimal formatting keeps the table columns aligned (e.g. "2.30"
# rather than "2.3").
all_coefs$coef_with_stars <- paste0(
  sprintf("%.2f", all_coefs$Coefficient), all_coefs$stars
)

# Create manual table for Table 2 - the safest approach without using
# tidyr/dplyr
table2_periods <- c("1993-2000", "2001-2010", "2011-2022")
table2_variables <- unique(all_coefs$Variable)

# First, prepare the data by creating a matrix: one row per variable, a label
# column plus one column per period
table2_matrix <- matrix(
  NA_character_,
  nrow = length(table2_variables),
  ncol = length(table2_periods) + 1L,
  dimnames = list(table2_variables, c("Variable", table2_periods))
)

# Fill the matrix with values
table2_matrix[, "Variable"] <- table2_variables

# A two-column character matrix indexes cells by (row name, column name), so
# every coefficient is placed in its variable/period cell in one assignment.
in_table <- all_coefs$Period %in% table2_periods
table2_matrix[cbind(
  all_coefs$Variable[in_table],
  all_coefs$Period[in_table]
)] <- all_coefs$coef_with_stars[in_table]

# Convert to data frame (keep strings as character so later rbind() calls do
# not hit factor-level mismatches on older R versions)
table2 <- as.data.frame(table2_matrix, stringsAsFactors = FALSE)

# Add R-squared values

# Return the R-squared reported by summary() for a fitted model.
get_r_squared <- function(model) {
  fit_summary <- summary(model)
  fit_summary[["r.squared"]]
}

# R-squared row for the table. Period column names are not syntactic R names,
# so they must be backtick-quoted and check.names must stay FALSE to keep them
# intact.
r_squared <- data.frame(
  Variable = "R²",
  `1993-2000` = format(round(get_r_squared(models_1993_2000$full), 2), nsmall = 2),
  `2001-2010` = format(round(get_r_squared(models_2001_2010$full), 2), nsmall = 2),
  `2011-2022` = format(round(get_r_squared(models_2011_2022$full), 2), nsmall = 2),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Add observations count
observations <- data.frame(
  Variable = "N",
  `1993-2000` = as.character(nrow(data_1993_2000)),
  `2001-2010` = as.character(nrow(data_2001_2010)),
  `2011-2022` = as.character(nrow(data_2011_2022)),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Combine all rows for the table - make sure all data frames have the same
# structure (identical column names in the same order)
table2_full <- rbind(table2, r_squared, observations)

# The variable names already live in the Variable column; dropping the row
# names avoids printing them twice.
rownames(table2_full) <- NULL

# Print the table
print(table2_full)

# Create complete version of Table 2
if (requireNamespace("knitr", quietly = TRUE)) {
  # print() so the table is shown even when the script is source()d
  print(knitr::kable(
    table2_full,
    caption = "Table 2: Predictors of Women’s Issue Bill Sponsorship (OLS Regression)",
    align = "lccc"
  ))
} else {
  message("knitr package is not installed. Install it with install.packages(‘knitr’)")
}

# Examine change in the woman × democrat interaction over time
interaction_values <- vapply(
  list(coefs_1993_2000, coefs_2001_2010, coefs_2011_2022),
  function(df) df$Coefficient[df$Variable == "woman_democrat"],
  numeric(1)
)

interaction_trend <- data.frame(
  Period = c("1993-2000", "2001-2010", "2011-2022"),
  Coefficient = interaction_values
)

print(interaction_trend)

# Period-to-period change labels. "%+.2f" always prints the sign, so a
# decrease is shown as "-0.30" rather than "+-0.3".
interaction_change <- diff(interaction_trend$Coefficient)
interaction_change_labels <- sprintf("%+.2f", interaction_change)

# Plot the trend in interaction coefficient
interaction_plot <- ggplot(
  interaction_trend,
  aes(x = Period, y = Coefficient, group = 1)
) +
  geom_line(size = 1.5, color = "purple") +
  geom_point(size = 4, color = "purple") +
  labs(
    title = "Rising Importance of the Woman × Democrat Interaction",
    subtitle = "Demonstrates increasing nesting of gender within partisan frameworks",
    x = "Time Period",
    y = "Interaction Coefficient"
  ) +
  theme_minimal() +
  ylim(0, max(interaction_trend$Coefficient) * 1.2) +
  annotate(
    "text",
    x = 2, y = interaction_trend$Coefficient[2] + 0.3,
    label = interaction_change_labels[1],
    color = "darkgreen"
  ) +
  annotate(
    "text",
    x = 3, y = interaction_trend$Coefficient[3] + 0.3,
    label = interaction_change_labels[2],
    color = "darkgreen"
  )

# Print explicitly so the plot is also drawn when the script is source()d
print(interaction_plot)

# Create a basic heatmap of coefficients without using any fancy packages
heatmap_periods <- c("1993-2000", "2001-2010", "2011-2022")
heatmap_variables <- unique(all_coefs$Variable)

coefficient_matrix <- matrix(
  NA_real_,
  nrow = length(heatmap_variables),
  ncol = length(heatmap_periods),
  dimnames = list(heatmap_variables, heatmap_periods)
)

# Fill the matrix: a two-column character matrix addresses each cell by
# (variable, period)
in_heatmap <- all_coefs$Period %in% heatmap_periods
coefficient_matrix[cbind(
  all_coefs$Variable[in_heatmap],
  all_coefs$Period[in_heatmap]
)] <- all_coefs$Coefficient[in_heatmap]

# Remove intercept for better visualization
coefficient_matrix <- coefficient_matrix[
  rownames(coefficient_matrix) != "(Intercept)", ,
  drop = FALSE
]

# Create a simple heatmap
heatmap(
  coefficient_matrix,
  main = "Heatmap of Regression Coefficients",
  Rowv = NA, Colv = NA, # Don't reorder rows/columns
  scale = "none", # Don't scale values
  col = colorRampPalette(c("blue", "white", "red"))(100)
)

# NOTE(review): from here on the code is Python (pandas / matplotlib), not R.
# It relies on a DataFrame `df` and on `plt` that are not defined in the
# visible part of this file; it presumably belongs in a separate Python
# script - confirm.

# Calculate the gap between Democratic and Republican women
df['Dem-Rep Women Gap'] = df['Democratic Women'] - df['Republican Women']

# Print the table
print("Table 1: Mean Number of Women’s Issue Bills Sponsored by Member Category (1992-2022)")
print(df.to_string(index=False))

# Python (matplotlib): line chart of mean bills sponsored per member category.
# Expects `df` and `plt` to be defined earlier in the script.

# Create a visualization of the data
plt.figure(figsize=(12, 8))
plt.plot(df['Years'], df['Democratic Women'], marker='o', linewidth=2, label='Democratic Women')
plt.plot(df['Years'], df['Republican Women'], marker='s', linewidth=2, label='Republican Women')
plt.plot(df['Years'], df['Democratic Men'], marker='^', linewidth=2, label='Democratic Men')
plt.plot(df['Years'], df['Republican Men'], marker='d', linewidth=2, label='Republican Men')

# Double quotes: the title contains an apostrophe
plt.title("Mean Number of Women's Issue Bills Sponsored by Member Category (1992-2022)", fontsize=14)
plt.xlabel('Congress (Years)', fontsize=12)
plt.ylabel('Mean Number of Bills', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.legend(loc='best')

# Highlight the partisan gap between women
for i, year in enumerate(df['Years']):
    if i % 3 == 0:  # Add gap annotation every 3 congresses for clarity
        gap = df['Dem-Rep Women Gap'].iloc[i]
        # Place the label midway between the two women's lines
        midpoint = (df['Democratic Women'].iloc[i] + df['Republican Women'].iloc[i]) / 2
        plt.annotate(
            f'Gap: {gap:.1f}',
            xy=(year, midpoint),
            xytext=(10, 0),
            textcoords='offset points',
            bbox=dict(boxstyle='round,pad=0.3', fc='yellow', alpha=0.3),
        )

plt.tight_layout()
plt.show()

# Python (matplotlib): trend of the Democratic-Republican women's gap.
# Expects `df` (with a 'Dem-Rep Women Gap' column) and `plt` to be defined
# earlier in the script.

# Create a second visualization focusing on the partisan gap trend
plt.figure(figsize=(10, 6))
plt.plot(df['Years'], df['Dem-Rep Women Gap'], marker='o', linewidth=3, color='purple')
# Double quotes: the title contains an apostrophe
plt.title("Partisan Gap in Women's Issue Bill Sponsorship (1992-2022)", fontsize=14)
plt.xlabel('Congress (Years)', fontsize=12)
plt.ylabel('Gap between Democratic and Republican Women', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# Annotate key points in the gap trend.
# NOTE(review): the positional lookups (.iloc[0], .iloc[9], .iloc[-1]) assume
# the rows of df are ordered so that they line up with the 'Years' labels
# used for the x coordinates - confirm against the df definition.
plt.annotate(
    'Post-Year of the Woman',
    xy=('1993-1994', df['Dem-Rep Women Gap'].iloc[0]),
    xytext=(-30, 20),
    textcoords='offset points',
    arrowprops=dict(arrowstyle='->'),
)

plt.annotate(
    'Tea Party Era',
    xy=('2011-2012', df['Dem-Rep Women Gap'].iloc[9]),
    xytext=(30, 20),
    textcoords='offset points',
    arrowprops=dict(arrowstyle='->'),
)

plt.annotate(
    'Post-Dobbs Era',
    xy=('2021-2022', df['Dem-Rep Women Gap'].iloc[-1]),
    xytext=(-50, -20),
    textcoords='offset points',
    arrowprops=dict(arrowstyle='->'),
)

plt.tight_layout()
plt.show()