Descriptive Statistics
## Total speeches
# Row count of Data (the recorded run gave 670,856)
dim(Data)[1] ## 670,856
## [1] 670856
## Men vs women speakers
# Vector of distinct speaker names; its length is the number of unique speakers
speakers <- unique(Data[["speakername"]])
length(speakers)
## [1] 1093
# Number of distinct speaker names among rows with gender "F"
count_gen_f <- Data %>%
  filter(gender == "F") %>%
  summarise(n = n_distinct(speakername)) %>%
  pull(n)
# Same count for rows with gender "M"
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  summarise(n = n_distinct(speakername)) %>%
  pull(n)
print(paste("Women Speakers", count_gen_f))
## [1] "Women Speakers 333"
print(paste("Men Speakers", count_gen_m))
## [1] "Men Speakers 760"
## Men vs women speeches given
# Tag every row with its row index; the column is kept on Data.
Data <- Data %>%
  mutate(observation_number = row_number())
# Count rows (speeches) per gender. observation_number is unique by
# construction, so a distinct() step adds nothing; na.rm = TRUE drops rows
# with missing gender, exactly as filter() did.
count_gen_f2 <- sum(Data$gender == "F", na.rm = TRUE)
count_gen_m2 <- sum(Data$gender == "M", na.rm = TRUE)
# These are speech counts, so label them as speeches rather than speakers.
print(paste("Women Speeches:", count_gen_f2))
## [1] "Women Speeches: 168265"
print(paste("Men Speeches:", count_gen_m2))
## [1] "Men Speeches: 502591"
## Environmental Speech count
# Rows with EnvDummy equal to 1 and to 0, respectively
count_ones <- with(Data, sum(EnvDummy == 1))
count_zeros <- with(Data, sum(EnvDummy == 0))
print(count_ones)
## [1] 24425
print(count_zeros)
## [1] 646431
## Years of experience
# Mean of YearsExp over all rows of Data (no NA removal is applied)
mean(Data[["YearsExp"]])
## [1] 11.102
## Env Debates count
# Rows with DebateTopic equal to 1 and to 0, respectively
count_ones2 <- sum(Data[["DebateTopic"]] == 1)
count_zeros2 <- sum(Data[["DebateTopic"]] == 0)
print(count_ones2)
## [1] 9815
print(count_zeros2)
## [1] 661041
## Env Committee membership
# Distinct speaker names among rows with EnvCommittee == 0 and == 1.
# which() drops rows where the comparison is NA, as filter() does.
# In the recorded output 924 + 169 = 1093, the number of unique speakers.
count_comittee_0 <- length(
  unique(Data$speakername[which(Data$EnvCommittee == 0)])
)
count_committee_1 <- length(
  unique(Data$speakername[which(Data$EnvCommittee == 1)])
)
print(paste("Number of non-Env Committee members:", count_comittee_0))
## [1] "Number of non-Env Committee members: 924"
print(paste("Number Env Committee members:", count_committee_1))
## [1] "Number Env Committee members: 169"
## Gender of Env Committee Members
# Distinct speakers on the committee (EnvCommittee == 1), split by gender.
# Both conditions go in one filter() call; rows must satisfy both.
count_comitteegen_M <- Data %>%
  filter(EnvCommittee == 1, gender == "M") %>%
  summarise(n = n_distinct(speakername)) %>%
  pull(n)
count_comitteegen_F <- Data %>%
  filter(EnvCommittee == 1, gender == "F") %>%
  summarise(n = n_distinct(speakername)) %>%
  pull(n)
print(paste("Number of Men Env Committee members:", count_comitteegen_M))
## [1] "Number of Men Env Committee members: 120"
print(paste("Number of Women Env Committee members:", count_comitteegen_F))
## [1] "Number of Women Env Committee members: 49"
## Ruling Party membership
# Distinct speaker names among rows with RulingParty == 0 and == 1.
# NOTE(review): the recorded counts sum to 613 + 516 = 1129, more than the
# 1093 unique speakers, so some speakers appear under both values and are
# counted in both groups.
count_conserv_0 <- length(
  unique(Data$speakername[which(Data$RulingParty == 0)])
)
count_conserv_1 <- length(
  unique(Data$speakername[which(Data$RulingParty == 1)])
)
print(paste("Number of Non-Conservative:", count_conserv_0))
## [1] "Number of Non-Conservative: 613"
print(paste("Number of Conservatives:", count_conserv_1))
## [1] "Number of Conservatives: 516"
# Line plot with points: GenderGap by Year, with a zero reference line and a
# dashed linear trend.
## Gender Gap
ggplot(GenderGap, aes(x = Year, y = GenderGap)) +
  geom_line() +
  geom_point() +
  # A single x scale carries both limits and breaks. A separate xlim() call is
  # itself an x scale and was silently replaced by scale_x_continuous().
  scale_x_continuous(
    limits = c(2010, 2021),
    breaks = seq(min(GenderGap$Year), max(GenderGap$Year), by = 1)
  ) +
  ylim(-1, 3) +
  # linewidth replaces the deprecated size for line geoms (ggplot2 >= 3.4.0)
  geom_hline(
    yintercept = 0, color = "black", linetype = "solid", linewidth = 0.9
  ) +
  geom_smooth(
    method = "lm", se = FALSE, color = "grey", linetype = "dashed",
    linewidth = 0.75
  ) +
  labs(
    title = "Gender Gap in Speech Environmentalism Over Time",
    x = "Year", y = "Gender Gap (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## Men Vs Women Proportion env speeches
# Two series from GenderGap (WomEnvSpeech, MenEnvSpeech) drawn against Year.
# The color aesthetic is mapped to a constant string per series so that
# scale_color_manual() can assign colors and legend labels by name.
ggplot(GenderGap, aes(x = Year)) +
  geom_line(aes(y = WomEnvSpeech, color = "Women's Env Speech")) +
  geom_point(aes(y = WomEnvSpeech, color = "Women's Env Speech")) +
  geom_line(aes(y = MenEnvSpeech, color = "Men's Env Speech")) +
  geom_point(aes(y = MenEnvSpeech, color = "Men's Env Speech")) +
  # A single x scale carries both limits and breaks. A separate xlim() call is
  # itself an x scale and was silently replaced by scale_x_continuous().
  scale_x_continuous(
    limits = c(2010, 2021),
    breaks = seq(2010, 2021, by = 1)
  ) +
  ylim(0, 9) +
  # linewidth replaces the deprecated size for line geoms (ggplot2 >= 3.4.0)
  geom_hline(
    yintercept = 0, color = "black", linetype = "solid", linewidth = 0.5
  ) +
  scale_color_manual(
    values = c("Men's Env Speech" = "blue", "Women's Env Speech" = "Red"),
    labels = c("Men's Env Speech" = "Men MPs", "Women's Env Speech" = "Women MPs")
  ) +
  labs(
    title = "Proportion of Women and Men's Environmental Speeches: 2010-2021",
    x = "Year", y = "% Env. Speeches of Total Speeches",
    color = "Legend"
  ) +
  theme_minimal()

## By Gender
# Sum EnvDummy by year and gender, excluding the year 2021 and rows where
# gender is NA. `.groups = "drop"` returns an ungrouped result directly, so no
# trailing ungroup() is needed and summarise() emits no grouping message.
yearly_env_sum <- Data %>%
  filter(year != 2021, !is.na(gender)) %>%
  group_by(year, gender) %>%
  summarise(total_envdummy = sum(EnvDummy), .groups = "drop")
# Total EnvDummy for each year (all rows, 2021 excluded). Unlike the by-gender
# sums, rows with NA gender are not filtered out here, so "Total" can exceed
# the sum of the gender series.
total_env_sum <- Data %>%
  filter(year != 2021) %>%
  group_by(year) %>%
  summarise(total_envdummy = sum(EnvDummy)) %>%
  mutate(gender = "Total") # 'Total' label so it plots as a third series
# Combine gender-specific data with total data
combined_data <- bind_rows(yearly_env_sum, total_env_sum)
# Ensure 'year' is treated as an integer
combined_data$year <- as.integer(combined_data$year)
# Plot the data with separate lines for each gender and total
ggplot(combined_data, aes(x = year, y = total_envdummy, color = gender)) +
  geom_line() +
  geom_point() +
  # One break per year: combined_data repeats each year once per series, so
  # de-duplicate before handing the values to the scale.
  scale_x_continuous(breaks = sort(unique(combined_data$year))) +
  scale_color_manual(
    values = c("F" = "red", "M" = "blue", "Total" = "black"), # Set colors
    labels = c("F" = "Female", "M" = "Male", "Total" = "Total") # Set labels
  ) +
  labs(
    title = "Total Environmental Speeches per Year: 2010-2020",
    x = "Year", y = "Number of Environmental Speeches"
  ) +
  theme_minimal()

Regressions
Testing whether a negative binomial regression (NBR) or a Poisson model is appropriate
## Testing if NBR is correct:
# Fit a Poisson regression model as the baseline for the overdispersion check.
# gender becomes a factor with "M" as the reference level.
AggregatedData$gender <- as.factor(AggregatedData$gender)
AggregatedData$gender <- relevel(AggregatedData$gender, ref = "M")
# NOTE(review): this formula includes parly_role, which the glm.nb() model
# with controls later in this file does not -- confirm this is intended.
poisson_model <- glm(
  EnvDummy ~ gender + YearsExp + gvt_role + opp_role + parly_role +
    EnvCommittee + RulingParty + DebateTopic,
  data = AggregatedData,
  family = "poisson"
)
# Pearson chi-square statistic (sum of squared Pearson residuals). The
# variable keeps its original name, but this is not the residual deviance.
residual_deviance <- sum(resid(poisson_model, type = "pearson")^2)
# Dispersion estimate: Pearson chi-square divided by the residual df
df <- df.residual(poisson_model)
dispersion <- residual_deviance / df
# Upper-tail chi-square p-value for the Pearson statistic, so that
# "significantly greater than 1" is actually tested rather than only
# comparing the point estimate with 1.
dispersion_p_value <- pchisq(residual_deviance, df, lower.tail = FALSE)
if (dispersion > 1 && dispersion_p_value < 0.05) {
  print("Data is overdispersed.")
} else {
  print("Data is not overdispersed.")
}
## [1] "Data is overdispersed."
Regressions:
Regressions: no controls. Printed with default and clustered
standard errors.
## Adding Clustered Standard Errors
# Cluster-robust covariance matrix for LM1 (type "CR0"), clustered on
# AggregatedData$speakername
cluster_vcov1 <- vcovCR(
  LM1,
  type = "CR0",
  cluster = AggregatedData$speakername
)
## Print
# The same model fills both columns; only the covariance matrix differs
# (NULL = model default, then the clustered matrix).
modelsummary(
  setNames(
    list(LM1, LM1),
    c("Default Standard Errors", "Clustered Standard Errors")
  ),
  vcov = list(NULL, cluster_vcov1),
  statistic = "std.error",
  stars = TRUE,
  exponentiate = TRUE,
  conf_level = 0.95
)
| |
Default Standard Errors |
Clustered Standard Errors |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 |
| (Intercept) |
2.920*** |
2.920*** |
|
(0.063) |
(0.165) |
| genderF |
1.210*** |
1.210 |
|
(0.049) |
(0.143) |
| Num.Obs. |
7902 |
7902 |
| AIC |
34168.9 |
34168.9 |
| BIC |
34189.8 |
34189.8 |
| Log.Lik. |
-17081.431 |
-17081.431 |
| F |
22.145 |
|
| RMSE |
7.56 |
7.56 |
| Std.Errors |
|
Custom |
## Chi squared goodness of fit test: >0.05= good fit
# Upper-tail chi-square probability of LM1's residual deviance on its
# residual degrees of freedom
p_value <- pchisq(
  LM1[["deviance"]],
  LM1[["df.residual"]],
  lower.tail = FALSE
)
p_value
## [1] 0.1366918
Regressions: with controls. Printed with default and clustered
standard errors
## With Controls
# Keep "M" as the reference level of gender, so the genderF coefficient is
# relative to men.
AggregatedData$gender <- relevel(AggregatedData$gender, ref = "M")
# Negative binomial model of EnvDummy with controls; the iteration cap is
# raised to 100 through glm.control().
# NOTE(review): the model table for this fit reports RMSE 5277328.31 (the
# no-controls model reports 7.56) -- worth checking for extreme fitted values.
LM2 <- glm.nb(
  EnvDummy ~ gender + YearsExp + gvt_role + opp_role +
    EnvCommittee + RulingParty + DebateTopic,
  data = AggregatedData,
  control = glm.control(maxit = 100)
)
summary(LM2)
##
## Call:
## glm.nb(formula = EnvDummy ~ gender + YearsExp + gvt_role + opp_role +
## EnvCommittee + RulingParty + DebateTopic, data = AggregatedData,
## control = glm.control(maxit = 100), init.theta = 0.5534260782,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.521701 0.046682 11.176 < 2e-16 ***
## genderF 0.150494 0.038972 3.862 0.000113 ***
## YearsExp -0.002473 0.001933 -1.279 0.200918
## gvt_role 0.415893 0.045963 9.048 < 2e-16 ***
## opp_role 0.154089 0.048246 3.194 0.001404 **
## EnvCommittee 0.654666 0.043386 15.089 < 2e-16 ***
## RulingParty 0.132086 0.044370 2.977 0.002912 **
## DebateTopic 0.086891 0.002025 42.905 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.5534) family taken to be 1)
##
## Null deviance: 9522.9 on 7901 degrees of freedom
## Residual deviance: 8076.9 on 7894 degrees of freedom
## AIC: 32911
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.5534
## Std. Err.: 0.0121
##
## 2 x log-likelihood: -32892.5530
## Adding Clustered Standard Errors
# Cluster-robust covariance matrix for LM2 (type "CR0"), clustered on
# AggregatedData$speakername
cluster_vcov2 <- vcovCR(
  LM2,
  type = "CR0",
  cluster = AggregatedData$speakername
)
## Print
# The same model fills both columns; only the covariance matrix differs
# (NULL = model default, then the clustered matrix).
modelsummary(
  setNames(
    list(LM2, LM2),
    c("Default Standard Errors", "Clustered Standard Errors")
  ),
  vcov = list(NULL, cluster_vcov2),
  statistic = "std.error",
  stars = TRUE,
  exponentiate = TRUE,
  conf_level = 0.95
)
| |
Default Standard Errors |
Clustered Standard Errors |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 |
| (Intercept) |
1.685*** |
1.685*** |
|
(0.079) |
(0.163) |
| genderF |
1.162*** |
1.162 |
|
(0.045) |
(0.114) |
| YearsExp |
0.998 |
0.998 |
|
(0.002) |
(0.004) |
| gvt_role |
1.516*** |
1.516*** |
|
(0.070) |
(0.163) |
| opp_role |
1.167** |
1.167+ |
|
(0.056) |
(0.109) |
| EnvCommittee |
1.924*** |
1.924*** |
|
(0.083) |
(0.185) |
| RulingParty |
1.141** |
1.141 |
|
(0.051) |
(0.121) |
| DebateTopic |
1.091*** |
1.091*** |
|
(0.002) |
(0.003) |
| Num.Obs. |
7902 |
7902 |
| AIC |
32910.6 |
32910.6 |
| BIC |
32973.3 |
32973.3 |
| Log.Lik. |
-16446.276 |
-16446.276 |
| F |
333.554 |
|
| RMSE |
5277328.31 |
5277328.31 |
| Std.Errors |
|
Custom |
## Chi squared goodness of fit test: >0.05= good fit
# Upper-tail chi-square probability of LM2's residual deviance on its
# residual degrees of freedom
p_value <- pchisq(
  LM2[["deviance"]],
  LM2[["df.residual"]],
  lower.tail = FALSE
)
p_value # about 0.074 = bigger than 0.05 = good.
## [1] 0.07353772
## No values over 10, so no multicollinearity
# Variance inflation factors for the predictors in LM2
vif(LM2)
## gender YearsExp gvt_role opp_role EnvCommittee RulingParty
## 1.095836 1.095033 1.202180 1.583321 1.006176 1.745700
## DebateTopic
## 1.008056
Comparing speeches made during and outside environmental debates
Outside environmental debates: exponentiated results with clustered
standard errors and goodness-of-fit tests
## Adding Clustered Standard Errors
# Cluster-robust covariance matrix for DebateModel1 (type "CR0"), clustered
# on EnvDebateModelData$speakername
cluster_vcov3 <- vcovCR(
  DebateModel1,
  type = "CR0",
  cluster = EnvDebateModelData$speakername
)
# The same model fills both columns; only the covariance matrix differs
# (NULL = model default, then the clustered matrix).
modelsummary(
  setNames(
    list(DebateModel1, DebateModel1),
    c("Default Standard Errors", "Clustered Standard Errors")
  ),
  vcov = list(NULL, cluster_vcov3),
  statistic = "std.error",
  stars = TRUE,
  exponentiate = TRUE,
  conf_level = 0.95
)
| |
Default Standard Errors |
Clustered Standard Errors |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 |
| (Intercept) |
1.568*** |
1.568*** |
|
(0.083) |
(0.171) |
| genderF |
1.243*** |
1.243+ |
|
(0.055) |
(0.140) |
| YearsExp |
0.999 |
0.999 |
|
(0.002) |
(0.005) |
| gvt_role |
1.636*** |
1.636*** |
|
(0.085) |
(0.191) |
| opp_role |
1.144* |
1.144 |
|
(0.062) |
(0.128) |
| EnvCommittee |
2.051*** |
2.051*** |
|
(0.101) |
(0.234) |
| RulingParty |
1.162** |
1.162 |
|
(0.058) |
(0.143) |
| Num.Obs. |
7248 |
7248 |
| AIC |
28350.0 |
28350.0 |
| BIC |
28405.1 |
28405.1 |
| Log.Lik. |
-14166.999 |
-14166.999 |
| F |
59.983 |
|
| RMSE |
5.33 |
5.33 |
| Std.Errors |
|
Custom |
## Chi squared goodness of fit test: >0.05= good fit
# Upper-tail chi-square probability of DebateModel1's residual deviance on
# its residual degrees of freedom
p_value <- pchisq(
  DebateModel1[["deviance"]],
  DebateModel1[["df.residual"]],
  lower.tail = FALSE
)
p_value
## [1] 0.780214
During environmental debates: exponentiated results with clustered
standard errors and goodness-of-fit tests
# Negative binomial model with DebateTopic as the outcome, fitted on
# EnvDebateModelData with glm.nb() defaults
DebateModel2 <- glm.nb(
  DebateTopic ~ gender + YearsExp + gvt_role + opp_role +
    EnvCommittee + RulingParty,
  data = EnvDebateModelData
)
summary(DebateModel2)
##
## Call:
## glm.nb(formula = DebateTopic ~ gender + YearsExp + gvt_role +
## opp_role + EnvCommittee + RulingParty, data = EnvDebateModelData,
## init.theta = 0.1226613002, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.555138 0.102757 -5.402 6.57e-08 ***
## genderF 0.048738 0.086661 0.562 0.57385
## YearsExp -0.013769 0.004398 -3.131 0.00174 **
## gvt_role 0.565258 0.103327 5.471 4.49e-08 ***
## opp_role -0.036392 0.105680 -0.344 0.73057
## EnvCommittee 0.876661 0.095730 9.158 < 2e-16 ***
## RulingParty -0.469679 0.098755 -4.756 1.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.1227) family taken to be 1)
##
## Null deviance: 3430.1 on 7247 degrees of freedom
## Residual deviance: 3279.7 on 7241 degrees of freedom
## AIC: 11775
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.12266
## Std. Err.: 0.00486
##
## 2 x log-likelihood: -11758.55000
## Adding Clustered Standard Errors
# Cluster-robust covariance matrix for DebateModel2, clustered on
# EnvDebateModelData$speakername.
# NOTE(review): this call uses type = "CR2", while the other vcovCR() calls
# in this file use "CR0". Confirm the difference is intended; if it is
# changed, the table below must be regenerated.
cluster_vcov4 <- vcovCR(
  DebateModel2,
  type = "CR2",
  cluster = EnvDebateModelData$speakername
)
# The same model fills both columns; only the covariance matrix differs
# (NULL = model default, then the clustered matrix).
modelsummary(
  setNames(
    list(DebateModel2, DebateModel2),
    c("Default Standard Errors", "Clustered Standard Errors")
  ),
  vcov = list(NULL, cluster_vcov4),
  statistic = "std.error",
  stars = TRUE,
  exponentiate = TRUE,
  conf_level = 0.95
)
| |
Default Standard Errors |
Clustered Standard Errors |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 |
| (Intercept) |
0.574*** |
0.574* |
|
(0.059) |
(0.143) |
| genderF |
1.050 |
1.050 |
|
(0.091) |
(0.194) |
| YearsExp |
0.986** |
0.986 |
|
(0.004) |
(0.008) |
| gvt_role |
1.760*** |
1.760+ |
|
(0.182) |
(0.560) |
| opp_role |
0.964 |
0.964 |
|
(0.102) |
(0.195) |
| EnvCommittee |
2.403*** |
2.403*** |
|
(0.230) |
(0.365) |
| RulingParty |
0.625*** |
0.625 |
|
(0.062) |
(0.189) |
| Num.Obs. |
7248 |
7248 |
| AIC |
11774.5 |
11774.5 |
| BIC |
11829.7 |
11829.7 |
| Log.Lik. |
-5879.275 |
-5879.275 |
| F |
22.691 |
|
| RMSE |
3.12 |
3.12 |
| Std.Errors |
|
Custom |
## Chi squared goodness of fit test: >0.05= good fit
# Upper-tail chi-square probability of DebateModel2's residual deviance on
# its residual degrees of freedom
p_value <- pchisq(
  DebateModel2[["deviance"]],
  DebateModel2[["df.residual"]],
  lower.tail = FALSE
)
p_value
## [1] 1
Party Specific Regressions
Party-subsetted Regressions with clustered standard errors
# One column per party-specific model, each paired positionally with its own
# clustered covariance matrix; estimates are shown exponentiated.
modelsummary(
  setNames(
    list(Cons1, Labour1, LibDem1, SNP1, LabCoOp1, Independent1),
    c("Conservative", "Labour", "Lib Dem", "SNP", "Lab (Co-op)", "Independent")
  ),
  vcov = list(
    cluster_vcov_cons,
    cluster_vcov_lab,
    cluster_vcov_libdem,
    cluster_vcov_SNP,
    cluster_vcov_Labcoop,
    cluster_vcov_ind
  ),
  statistic = "std.error",
  stars = TRUE,
  exponentiate = TRUE,
  conf_level = 0.95
)
| |
Conservative |
Labour |
Lib Dem |
SNP |
Lab (Co-op) |
Independent |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 |
| (Intercept) |
1.810*** |
1.294* |
3.040*** |
2.300*** |
2.649** |
0.965 |
|
(0.143) |
(0.165) |
(0.631) |
(0.407) |
(0.988) |
(0.433) |
| genderF |
1.392* |
1.078 |
1.210 |
0.771 |
0.767 |
0.598 |
|
(0.225) |
(0.116) |
(0.356) |
(0.163) |
(0.189) |
(0.266) |
| YearsExp |
1.008 |
0.994 |
0.989 |
0.991 |
0.993 |
0.987 |
|
(0.005) |
(0.007) |
(0.011) |
(0.015) |
(0.017) |
(0.019) |
| gvt_role |
1.431** |
|
1.354 |
|
|
1.311 |
|
(0.167) |
|
(0.347) |
|
|
(0.716) |
| EnvCommittee |
1.656*** |
2.107*** |
1.671* |
1.396 |
1.289 |
1.651 |
|
(0.225) |
(0.267) |
(0.405) |
(0.352) |
(0.421) |
(1.156) |
| DebateTopic |
1.059*** |
1.206*** |
1.026*** |
1.207*** |
1.164*** |
1.463** |
|
(0.003) |
(0.008) |
(0.002) |
(0.020) |
(0.012) |
(0.170) |
| opp_role |
|
1.322** |
0.857 |
1.189 |
1.037 |
1.356 |
|
|
(0.139) |
(0.211) |
(0.245) |
(0.324) |
(0.587) |
| Num.Obs. |
3867 |
2624 |
407 |
361 |
363 |
62 |
| AIC |
16435.4 |
9908.8 |
1968.0 |
1585.7 |
1651.3 |
179.3 |
| BIC |
16479.2 |
9949.9 |
2000.1 |
1613.0 |
1678.6 |
196.3 |
| Log.Lik. |
-8210.710 |
-4947.414 |
-976.018 |
-785.870 |
-818.655 |
-81.651 |
| RMSE |
3606.71 |
1026996.98 |
47.50 |
10.63 |
200800.40 |
1.78 |
| Std.Errors |
Custom |
Custom |
Custom |
Custom |
Custom |
Custom |
Party x gender interaction model
# Plot the predictions in `preds`, separating the groups by point shape
# instead of color.
plot(preds) +
  aes(shape = group, color = NULL) + # Use shapes instead of color
  scale_shape_manual(values = c(16, 17)) + # Optional: custom shapes for gender
  # Replaces the colour scale already on the plot so everything is drawn in
  # black with no colour legend.
  scale_color_manual(values = c("black", "black"), guide = "none") +
  labs(
    title = "Predicted Counts of Environmental Speeches",
    x = "Party",
    # The y-axis label now agrees with the title: counts, not probabilities.
    y = "Predicted Count of Environmental Speeches",
    shape = "Gender" # This controls the legend title
  ) +
  theme_minimal()
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
