library(haven)
library(ggplot2)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(MASS)
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v lubridate 1.9.2 v tibble 3.2.1
## v purrr 1.0.1 v tidyr 1.3.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(descr)
library(DHARMa)
## This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
library(jtools)
library(ggeffects)
##
## Attaching package: 'ggeffects'
##
## The following object is masked from 'package:jtools':
##
## johnson_neyman
library(nnet)
library(brant)
library(patchwork)
##
## Attaching package: 'patchwork'
##
## The following object is masked from 'package:MASS':
##
## area
set.seed(500)
Hypotheses and variables:
- Partisan strength - People with higher partisan strength are more likely to vote.
- Education - People with a higher education level are more likely to vote.
- Political interest - People with more interest in politics are more likely to vote.
- Age group - People in younger age groups are less likely to vote than those in older groups.
- Ideological identification (control) - Different ideological groups (parties) have different turnout rates, so controlling for identification keeps them consistent.
- Income (control) - People with a higher income have more time or finances to be involved and vote, so controlling for income keeps that even across respondents.

Data cleaning process:
- Checking for missing data - complete; there were many NA values inside the variables.
- Checking for bad data in the scales - complete; all variables were in the right scale format, with the lowest and highest values as expected.
- Checking for intuitive names - complete; the names were already intuitive, so they were left alone.
- Checking for scale flips - complete; the scales were already oriented so that the highest value is the larger or more positive category, which looks good.

I applied the survey weights and then separated the data out into my DV, IVs, and control variables.
anes <- read_dta("anes_2020_vvoted_hw3.dta")
anes_comp <- anes[complete.cases(anes[, c("V200010b")]), ]
anes_weighted <- svydesign(ids = ~V200010c, # 'ids' = PSU / cluster variable
weights = ~V200010b, # 'weights' = full-sample survey weight
strata = ~V200010d, # 'strata' = stratum variable, as specified in the codebook
nest = TRUE, # only TRUE with clustered data like we have here
data = anes_comp)
# Looking at the data overall, we can see there is some missing data (NA values) in most columns;
# otherwise the formatting looks fine.
# The regression will omit the NA values, so those are okay.
freq(anes$voted_20)
## RECODE of val1_turnout20 (Voting status, 2020 general election (vendor 1))
## Frequency Percent Valid Percent
## 0 1605 19.384 20.01
## 1 6414 77.464 79.99
## NA's 261 3.152
## Total 8280 100.000 100.00
# The variable names are already pretty intuitive, so I am going to leave them alone.
# Keep only the variables of interest.
anes_data <- anes %>%
dplyr::select(voted_20, educ, partisan_strength, polint, income, age_group, ideo3)
# The lowest and highest values are in the correct format.
# The numbers appear to be on the right scale as well, with no need to flip any values.
skim(anes_data)
Name | anes_data |
Number of rows | 8280 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
voted_20 | 261 | 0.97 | 0.80 | 0.40 | 0 | 1 | 1 | 1 | 1 | ▂▁▁▁▇ |
educ | 131 | 0.98 | 2.43 | 1.03 | 1 | 2 | 2 | 3 | 4 | ▅▇▁▆▅ |
partisan_strength | 35 | 1.00 | 1.99 | 1.07 | 0 | 1 | 2 | 3 | 3 | ▂▅▁▃▇ |
polint | 900 | 0.89 | 2.91 | 0.85 | 1 | 2 | 3 | 3 | 4 | ▁▃▁▇▅ |
income | 616 | 0.93 | 3.38 | 1.90 | 1 | 2 | 3 | 5 | 7 | ▇▃▂▃▃ |
age_group | 348 | 0.96 | 3.40 | 1.38 | 1 | 2 | 4 | 5 | 5 | ▃▅▅▇▇ |
ideo3 | 120 | 0.99 | 2.06 | 0.88 | 1 | 1 | 2 | 3 | 3 | ▇▁▅▁▇ |
# here we see that most people in the data voted
anes_data %>%
count(voted_20)
## # A tibble: 3 x 2
## voted_20 n
## <dbl+lbl> <int>
## 1 0 [Didn't Vote] 1605
## 2 1 [Voted] 6414
## 3 NA 261
I am going to run three models here: one logit, one probit, and one survey-weighted logit. I will then compare the results and pick the best model to move forward with.
When comparing the three models, the logit model scores best on AIC, followed by the probit and then the weighted model. This makes sense, as the weighted model carries more uncertainty. The coefficients stay quite similar between the logit and probit models, with the logit coefficients slightly larger (see the coefficient-ratio sketch after the regression table below), and they keep consistent significance levels, which shows how similar the two models are. However, for age group and ideological identification the signs change when compared to the weighted model. For ideology, the liberal coefficient is positive in the logit and probit models but goes negative in the weighted model, and the same trend continues for the conservative category, where the weighted model reports a negative coefficient while the logit and probit models report positive ones. Another key point is that, with moderates as the reference category, liberal identification is significant in the logit and probit models but not in the weighted model. A similar trend happens in the age groups, where the younger groups (30-39 and 40-49) flip from positive to negative or lose significance in the weighted model. Taking this into consideration, I would prefer the logit model over the probit model, but given the flipping signs under weighting I am going to use the weighted model.
For the models below I set the reference category of ideo3 to 2 (moderate), instead of liberal, to make comparisons easier.
logit <- glm(voted_20 ~ as.factor(partisan_strength) + as.factor(educ) + polint + income +
relevel(as.factor(ideo3), ref = "2") + as.factor(age_group),
data = anes, family = binomial(link = "logit"))
probit <- glm(voted_20 ~ as.factor(partisan_strength) + as.factor(educ) + polint + income +
relevel(as.factor(ideo3), ref = "2") + as.factor(age_group),
data = anes, family = binomial(link = "probit"))
logit_weight <- svyglm(voted_20 ~ as.factor(partisan_strength) + as.factor(educ) + polint + income +
relevel(as.factor(ideo3), ref = "2") + as.factor(age_group),
design = anes_weighted, family = binomial(link = "logit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
stargazer(logit, probit, logit_weight, type = "text",
dep.var.labels = "Voting",
covariate.labels = c("Leaning Partisan", "Weak Partisan", "Strong Partisan", "Some College", "Bachelors", "Advanced education", "Political Interest", "Income", "Liberal Identification", "Conservative Identification", "Age(30-39)", "Age(40-49)", "Age(50-64)", "Age(65+)"), digits = 2)
##
## ===============================================================
## Dependent variable:
## -----------------------------------
## Voting
## logistic probit survey-weighted
## logistic
## (1) (2) (3)
## ---------------------------------------------------------------
## Leaning Partisan 0.63*** 0.38*** 0.59***
## (0.11) (0.07) (0.17)
##
## Weak Partisan 0.48*** 0.29*** 0.44***
## (0.11) (0.07) (0.14)
##
## Strong Partisan 0.89*** 0.52*** 0.93***
## (0.11) (0.06) (0.13)
##
## Some College 0.53*** 0.31*** 0.40***
## (0.09) (0.05) (0.12)
##
## Bachelors 0.91*** 0.51*** 0.59***
## (0.10) (0.06) (0.16)
##
## Advanced education 0.81*** 0.46*** 0.71***
## (0.12) (0.07) (0.14)
##
## Political Interest 0.22*** 0.13*** 0.25***
## (0.04) (0.02) (0.05)
##
## Income 0.16*** 0.09*** 0.19***
## (0.02) (0.01) (0.04)
##
## Liberal Identification 0.16* 0.09* -0.004
## (0.09) (0.05) (0.12)
##
## Conservative Identification 0.04 0.03 -0.04
## (0.09) (0.05) (0.10)
##
## Age(30-39) 0.16 0.10 -0.02
## (0.11) (0.06) (0.16)
##
## Age(40-49) 0.46*** 0.28*** 0.15
## (0.12) (0.07) (0.16)
##
## Age(50-64) 0.66*** 0.39*** 0.56***
## (0.11) (0.06) (0.16)
##
## Age(65+) 1.19*** 0.67*** 0.85***
## (0.11) (0.06) (0.17)
##
## Constant -1.48*** -0.82*** -1.46***
## (0.16) (0.09) (0.23)
##
## ---------------------------------------------------------------
## Observations 6,395 6,395 6,395
## Log Likelihood -2,794.10 -2,798.12 -3,089.86
## Akaike Inf. Crit. 5,618.21 5,626.24 6,209.71
## ===============================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
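A quick way to see how close the logit and probit estimates really are is to look at their ratio; logit coefficients are typically about 1.6-1.8 times the probit coefficients because of the different error variances of the two link functions. A minimal sketch using the logit and probit objects fit above:
# Ratio of logit to probit coefficients; values around 1.6-1.8 indicate the
# two models are telling essentially the same story on different scales.
round(coef(logit) / coef(probit), 2)
# Direct AIC comparison for the two unweighted models. Note that the AIC
# reported for the survey-weighted model is based on a design-weighted
# (pseudo-)likelihood, so comparisons with it should be treated cautiously.
AIC(logit, probit)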
anes_data <- anes_data %>%
filter(complete.cases(.))
# cleaned out the missing data
skim(anes_data)
Name | anes_data |
Number of rows | 6395 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
voted_20 | 0 | 1 | 0.81 | 0.39 | 0 | 1 | 1 | 1 | 1 | ▂▁▁▁▇ |
educ | 0 | 1 | 2.48 | 1.02 | 1 | 2 | 2 | 3 | 4 | ▅▇▁▆▅ |
partisan_strength | 0 | 1 | 2.02 | 1.05 | 0 | 1 | 2 | 3 | 3 | ▂▃▁▃▇ |
polint | 0 | 1 | 2.92 | 0.85 | 1 | 2 | 3 | 4 | 4 | ▁▃▁▇▅ |
income | 0 | 1 | 3.41 | 1.89 | 1 | 2 | 3 | 5 | 7 | ▇▃▃▃▃ |
age_group | 0 | 1 | 3.42 | 1.37 | 1 | 2 | 4 | 5 | 5 | ▃▅▅▇▇ |
ideo3 | 0 | 1 | 2.03 | 0.88 | 1 | 1 | 2 | 3 | 3 | ▇▁▅▁▇ |
# Predicted probabilities, residuals, and education bins from the weighted model
anes_data$predicted_prob<-predict(logit_weight, type="response")
anes_data$se<-predict(logit_weight, se.fit=TRUE)
anes_data$predicted_prob<-as.numeric(anes_data$predicted_prob)
anes_data$se<-as.numeric(anes_data$se)
anes_data$residuals <- anes_data$voted_20 - anes_data$predicted_prob
anes_data$bins <- cut(anes_data$educ, breaks = c(0:4))
# Summarize the residuals for each bin
binned_data <- anes_data %>%
group_by(bins) %>%
summarize(
mean_residual = mean(residuals),
ci_lower = mean_residual - 1.96 * sd(residuals) / sqrt(n()),
ci_upper = mean_residual + 1.96 * sd(residuals) / sqrt(n())
)
ggplot(binned_data, aes(x = bins, y = mean_residual)) +
geom_point(color = "black", size = 3) + # Dot graph with points
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, position = position_dodge(width = 0.5)) + # CIs
labs(x = "Bins of x", y = "Avg Binned Residual", title = "Education Binned Residuals with 95% CI") +
theme_minimal() + geom_hline(yintercept=0, linetype="dashed",color="red")
# Partisan strength
# Create bins for partisan_strength
anes_data$bins2 <- cut(anes_data$partisan_strength, breaks = c(-1:3))
# Summarize the residuals for each bin
binned_data <- anes_data %>%
group_by(bins2) %>%
summarize(
mean_residual = mean(residuals),
ci_lower = mean_residual - 1.96 * sd(residuals) / sqrt(n()),
ci_upper = mean_residual + 1.96 * sd(residuals) / sqrt(n())
)
ggplot(binned_data, aes(x = bins2, y = mean_residual)) +
geom_point(color = "black", size = 3) + # Dot graph with points
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, position = position_dodge(width = 0.5)) + # CIs
labs(x = "Bins of x", y = "Avg Binned Residual", title = "Partisan Strength Binned Residuals with 95% CI") +
theme_minimal() + geom_hline(yintercept=0, linetype="dashed",color="red")
# Political interest
# Create bin polint
anes_data$bins3 <- cut(anes_data$polint, breaks = c(0:4))
# Summarize the residuals for each bin
binned_data <- anes_data %>%
group_by(bins3) %>%
summarize(
mean_residual = mean(residuals),
ci_lower = mean_residual - 1.96 * sd(residuals) / sqrt(n()),
ci_upper = mean_residual + 1.96 * sd(residuals) / sqrt(n())
)
ggplot(binned_data, aes(x = bins3, y = mean_residual)) +
geom_point(color = "black", size = 3) + # Dot graph with points
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, position = position_dodge(width = 0.5)) + # CIs
labs(x = "Bins of x", y = "Avg Binned Residual", title = "Political interest Binned Residuals with 95% CI") +
theme_minimal() + geom_hline(yintercept=0, linetype="dashed",color="red")
Partisan strength is consistently significant across all models, and every category of it has a positive effect on voting. Being a strong partisan increases the log odds of voting by 0.93 (roughly 23 percentage points by the divide-by-4 rule), while the other categories have about half that impact. The reason would be that the more strongly you feel about a particular party, the more you want to be involved. Political interest is more straightforward: people who are interested in politics are more likely to want to go out and vote, and each step up increases the log odds of voting by 0.25 (roughly 6 percentage points by the divide-by-4 rule). Income also matters, as people with more money can afford to be more involved and have more time to attend a rally or actually go vote compared to people who are strapped for cash; each step up in income increases the log odds of voting by 0.19 (roughly 4.75 percentage points). Age group is impactful as well, since younger groups are not as interested in politics as older ones: the 30-39 group's log odds of voting are 0.02 lower than the youngest group's (roughly 0.5 percentage points), while the 65+ group's are 0.85 higher (roughly 21.25 percentage points), a massive increase as age climbs. Surprisingly, ideological identification did not have an impact; I thought one group might be more likely to go out and vote, but the variable was not significant.
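The rough percentage-point figures above come from the divide-by-4 rule (a logistic coefficient divided by 4 approximates the largest possible change in predicted probability). A minimal sketch of that arithmetic, along with odds ratios as an alternative reading, using the weighted model fit above:
# Divide-by-4 rule: approximate upper bound on the change in predicted
# probability for a one-unit (or one-category) change in each predictor.
round(coef(logit_weight) / 4, 3)
# Odds ratios for the same coefficients; e.g. exp(0.93) is roughly 2.5,
# so strong partisans have about 2.5 times the odds of voting relative
# to the reference category (partisan_strength = 0).
round(exp(coef(logit_weight)), 2)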
The actual share of people who voted is 80.91%, which is the accuracy I would get if I simply predicted 1 for everyone. The model improves the accuracy to 81.58%, an improvement of about 0.67 percentage points. This is a small improvement, but it is an improvement regardless, and it was interesting to see; a confusion matrix (sketched after the accuracy calculation below) shows where the gain comes from. It also means there are other, more impactful variables out there affecting how well my model predicts.
### Classification Accuracy
# Create a new vector with 0/1 predictions from the model
predicted_class <- ifelse(anes_data$predicted_prob >= 0.5, 1, 0)
# Compare the predicted binary outcomes to the actual y
actual_class <- anes_data$voted_20
# Calculate the classification accuracy
accuracy <- mean(predicted_class == actual_class)
print(accuracy)
## [1] 0.8157936
# Calculate the classification accuracy improvement
accuracy_improve <- accuracy-mean(anes_data$voted_20)
print(accuracy_improve)
## [1] 0.006724003
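A confusion matrix makes the comparison with the all-ones baseline a bit more concrete; a minimal sketch using the predicted_class and actual_class vectors created above:
# Cross-tabulate predictions against actual turnout; the gain over always
# predicting "voted" comes from the non-voters the model classifies correctly.
table(Predicted = predicted_class, Actual = actual_class)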
summ(logit_weight)
## MODEL INFO:
## Observations: 6395
## Dependent Variable: voted_20
## Type: Analysis of complex survey design
## Family: binomial
## Link function: logit
##
## MODEL FIT:
## Pseudo-R² (Cragg-Uhler) = 0.19
## Pseudo-R² (McFadden) = 0.12
## AIC = 6209.71
##
## ------------------------------------------------------------------
## Est. S.E. t val. p
## ----------------------------------- ------- ------ -------- ------
## (Intercept) -1.46 0.23 -6.32 0.00
## as.factor(partisan_strength)1 0.59 0.17 3.52 0.00
## as.factor(partisan_strength)2 0.44 0.14 3.18 0.00
## as.factor(partisan_strength)3 0.93 0.13 7.22 0.00
## as.factor(educ)2 0.40 0.12 3.27 0.00
## as.factor(educ)3 0.59 0.16 3.74 0.00
## as.factor(educ)4 0.71 0.14 5.05 0.00
## polint 0.25 0.05 5.30 0.00
## income 0.19 0.04 5.34 0.00
## relevel(as.factor(ideo3), -0.00 0.12 -0.03 0.97
## ref = "2")1
## relevel(as.factor(ideo3), -0.04 0.10 -0.40 0.69
## ref = "2")3
## as.factor(age_group)2 -0.02 0.16 -0.13 0.90
## as.factor(age_group)3 0.15 0.16 0.93 0.36
## as.factor(age_group)4 0.56 0.16 3.45 0.00
## as.factor(age_group)5 0.85 0.17 4.93 0.00
## ------------------------------------------------------------------
##
## Estimated dispersion parameter = 0.98
plot_summs(logit_weight)
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
## Loading required namespace: broom.mixed
Write a hypothesis for each of the three IVs you select
# Pulling out what's needed for this part of the study
anes_gm <- data.frame(anes$V201416, anes$libcon, anes$V201435, anes$V201601, anes$educ, anes$age_group)
# renaming
gm_names <- c("gay_marriage", "libcon", "relig_ident", "sex_orient", "educ", "age_group")
colnames(anes_gm) <- gm_names
# Recode the scale so that higher numbers indicate more support for gay marriage.
anes_gm <- anes_gm %>%
mutate(gay_marriage = case_when(
gay_marriage == 1 ~ 3,
gay_marriage == 2 ~ 2,
gay_marriage == 3 ~ 1,
gay_marriage >3 ~ NA_real_,
gay_marriage <1 ~ NA_real_
))
# Convert to character labels for graphing.
anes_gm <- anes_gm %>%
mutate(libcon = case_when(
libcon == 1 ~ "Extreme L",
libcon == 2 ~ "Somewhat L",
libcon == 3 ~ "Slightly L",
libcon == 4 ~ "Moderate",
libcon == 5 ~ "Slightly C",
libcon == 6 ~ "Somewhat C",
libcon == 7 ~ "Extreme C"
))
anes_gm <- anes_gm %>%
mutate(relig_ident = case_when(
relig_ident == 1 ~ "Monotheist",
relig_ident == 2 ~ "Monotheist",
relig_ident == 3 ~ "Monotheist",
relig_ident == 4 ~ "Monotheist",
relig_ident == 5 ~ "Monotheist",
relig_ident == 6 ~ "Monotheist",
relig_ident == 7 ~ "Polytheist",
relig_ident == 8 ~ "Polytheist",
relig_ident == 9 ~ "Non-Religous",
relig_ident == 10 ~ "Non-Religous",
relig_ident == 12 ~ "Non-Religous",
relig_ident == 11 ~ "Other",
))
# remove NA values
anes_gm <- anes_gm %>%
filter(complete.cases(.))
# Here we see that a majority of people support gay marriage (68.66%).
freq(anes_gm$gay_marriage)
## anes_gm$gay_marriage
## Frequency Percent
## 1 981 12.91
## 2 1400 18.43
## 3 5217 68.66
## Total 7598 100.00
# checking for any skewed or missing data
skim(anes_gm)
Name | anes_gm |
Number of rows | 7598 |
Number of columns | 6 |
_______________________ | |
Column type frequency: | |
character | 2 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
libcon | 0 | 1 | 8 | 10 | 0 | 7 | 0 |
relig_ident | 0 | 1 | 5 | 12 | 0 | 4 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
gay_marriage | 0 | 1 | 2.56 | 0.71 | 1 | 2 | 3 | 3 | 3 | ▂▁▂▁▇ |
sex_orient | 0 | 1 | 1.12 | 0.48 | 1 | 1 | 1 | 1 | 4 | ▇▁▁▁▁ |
educ | 0 | 1 | 2.43 | 1.02 | 1 | 2 | 2 | 3 | 4 | ▅▇▁▆▅ |
age_group | 0 | 1 | 3.39 | 1.38 | 1 | 2 | 4 | 5 | 5 | ▃▅▅▇▇ |
# changing vars to factor before model
anes_gm$gay_marriage <- factor(anes_gm$gay_marriage, ordered = TRUE)
anes_gm$educ_f <- factor(anes_gm$educ, ordered = TRUE)
anes_gm$sex_orient_f <- factor(anes_gm$sex_orient, ordered = TRUE)
anes_gm$age_f <- factor(anes_gm$age_group, ordered = TRUE)
# relevel() only works on unordered factors, so libcon and relig_ident stay unordered
# with "Moderate" and "Non-Religous" as the reference categories.
anes_gm$libcon_f <- relevel(factor(anes_gm$libcon), ref = "Moderate")
anes_gm$relig_ident_f <- relevel(factor(anes_gm$relig_ident), ref = "Non-Religous")
# Ordered logistic (proportional odds) models: one with numeric educ/age, one with factor versions
gm_l <- polr(gay_marriage ~ libcon_f + relig_ident_f + sex_orient + educ + age_group, data = anes_gm, na.action = na.exclude, method = "logistic")
gm_l_f <- polr(gay_marriage ~ libcon_f + relig_ident_f + sex_orient + educ_f + age_f, data = anes_gm, na.action = na.exclude, method = "logistic")
# Ordered probit models: same two specifications
gm_p <- polr(gay_marriage ~ libcon_f + relig_ident_f + sex_orient + educ + age_group, data = anes_gm, na.action = na.exclude, method = "probit")
gm_p_f <- polr(gay_marriage ~ libcon_f + relig_ident_f + sex_orient + educ_f + age_f, data = anes_gm, na.action = na.exclude, method = "probit")
stargazer(gm_l, gm_p, type = "text", digits = 3)
##
## ====================================================
## Dependent variable:
## ----------------------------
## gay_marriage
## ordered ordered
## logistic probit
## (1) (2)
## ----------------------------------------------------
## libcon_fExtreme C -2.067*** -1.226***
## (0.111) (0.066)
##
## libcon_fExtreme L 1.033*** 0.559***
## (0.216) (0.108)
##
## libcon_fSlightly C -0.595*** -0.335***
## (0.080) (0.046)
##
## libcon_fSlightly L 0.340*** 0.198***
## (0.094) (0.052)
##
## libcon_fSomewhat C -1.264*** -0.734***
## (0.077) (0.046)
##
## libcon_fSomewhat L 1.494*** 0.788***
## (0.142) (0.069)
##
## relig_ident_fMonotheist -0.809*** -0.442***
## (0.078) (0.043)
##
## relig_ident_fOther -1.075*** -0.617***
## (0.089) (0.049)
##
## relig_ident_fPolytheist -0.538* -0.278*
## (0.276) (0.152)
##
## sex_orient 0.121 0.055
## (0.075) (0.039)
##
## educ 0.195*** 0.130***
## (0.027) (0.016)
##
## age_group -0.171*** -0.094***
## (0.021) (0.012)
##
## ----------------------------------------------------
## Observations 7,598 7,598
## ====================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(gm_l_f, gm_p_f, type = "text", digits = 3)
##
## ====================================================
## Dependent variable:
## ----------------------------
## gay_marriage
## ordered ordered
## logistic probit
## (1) (2)
## ----------------------------------------------------
## libcon_fExtreme C -2.074*** -1.230***
## (0.111) (0.066)
##
## libcon_fExtreme L 1.043*** 0.564***
## (0.216) (0.108)
##
## libcon_fSlightly C -0.595*** -0.335***
## (0.080) (0.046)
##
## libcon_fSlightly L 0.341*** 0.198***
## (0.094) (0.052)
##
## libcon_fSomewhat C -1.279*** -0.742***
## (0.078) (0.046)
##
## libcon_fSomewhat L 1.500*** 0.792***
## (0.142) (0.070)
##
## relig_ident_fMonotheist -0.811*** -0.444***
## (0.078) (0.043)
##
## relig_ident_fOther -1.073*** -0.616***
## (0.089) (0.049)
##
## relig_ident_fPolytheist -0.517* -0.267*
## (0.277) (0.153)
##
## sex_orient 0.123 0.056
## (0.075) (0.039)
##
## educ_f.L 0.421*** 0.280***
## (0.061) (0.035)
##
## educ_f.Q -0.154*** -0.090***
## (0.056) (0.032)
##
## educ_f.C -0.005 -0.011
## (0.051) (0.030)
##
## age_f.L -0.547*** -0.298***
## (0.071) (0.040)
##
## age_f.Q 0.044 0.020
## (0.070) (0.039)
##
## age_f.C 0.052 0.041
## (0.063) (0.036)
##
## age_f4 -0.086 -0.057
## (0.066) (0.038)
##
## ----------------------------------------------------
## Observations 7,598 7,598
## ====================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Brant test of the parallel regression (proportional odds) assumption
brant(gm_l)
## ------------------------------------------------------------
## Test for X2 df probability
## ------------------------------------------------------------
## Omnibus 116.07 12 0
## libcon_fExtreme C 4.09 1 0.04
## libcon_fExtreme L 4.51 1 0.03
## libcon_fSlightly C 0 1 0.98
## libcon_fSlightly L 0.52 1 0.47
## libcon_fSomewhat C 6.21 1 0.01
## libcon_fSomewhat L 2.45 1 0.12
## relig_ident_fMonotheist 7.58 1 0.01
## relig_ident_fOther 0.57 1 0.45
## relig_ident_fPolytheist 1.96 1 0.16
## sex_orient 0.38 1 0.54
## educ 19.46 1 0
## age_group 9.54 1 0
## ------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(gm_l): 6 combinations in table(dv,ivs) do not occur. Because
## of that, the test results might be invalid.
brant(gm_p)
## ------------------------------------------------------------
## Test for X2 df probability
## ------------------------------------------------------------
## Omnibus 116.07 12 0
## libcon_fExtreme C 4.09 1 0.04
## libcon_fExtreme L 4.51 1 0.03
## libcon_fSlightly C 0 1 0.98
## libcon_fSlightly L 0.52 1 0.47
## libcon_fSomewhat C 6.21 1 0.01
## libcon_fSomewhat L 2.45 1 0.12
## relig_ident_fMonotheist 7.58 1 0.01
## relig_ident_fOther 0.57 1 0.45
## relig_ident_fPolytheist 1.96 1 0.16
## sex_orient 0.38 1 0.54
## educ 19.46 1 0
## age_group 9.54 1 0
## ------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(gm_p): 6 combinations in table(dv,ivs) do not occur. Because
## of that, the test results might be invalid.
brant(gm_l_f)
## ------------------------------------------------------------
## Test for X2 df probability
## ------------------------------------------------------------
## Omnibus 122.15 17 0
## libcon_fExtreme C 3.9 1 0.05
## libcon_fExtreme L 4.71 1 0.03
## libcon_fSlightly C 0 1 0.95
## libcon_fSlightly L 0.58 1 0.45
## libcon_fSomewhat C 5.76 1 0.02
## libcon_fSomewhat L 2.57 1 0.11
## relig_ident_fMonotheist 7.42 1 0.01
## relig_ident_fOther 0.5 1 0.48
## relig_ident_fPolytheist 1.96 1 0.16
## sex_orient 0.39 1 0.53
## educ_f.L 15.53 1 0
## educ_f.Q 0.09 1 0.76
## educ_f.C 0.43 1 0.51
## age_f.L 7.85 1 0.01
## age_f.Q 0.25 1 0.62
## age_f.C 3.57 1 0.06
## age_f^4 4.6 1 0.03
## ------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(gm_l_f): 707 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.
glm_fit <- function(model) {
  # Extract the log-likelihood (reported in the Likelihood_Ratio column)
  lr <- logLik(model)
  # Calculate AIC
  aic <- AIC(model)
  # Calculate BIC by hand, counting only the slope coefficients
  # (polr's threshold parameters are not included in p here)
  n <- nobs(model)
  p <- length(coef(model))
  bic <- -2 * logLik(model) + p * log(n)
  # Extract the deviance
  deviance <- summary(model)$deviance
  # Return the metrics as a data frame
  metrics <- data.frame(Likelihood_Ratio = lr, AIC = aic, BIC = bic, Deviance = deviance)
  return(metrics)
}
glm_fit(gm_l)
##
## Re-fitting to get Hessian
## Likelihood_Ratio AIC BIC Deviance
## 1 -5373.028 10774.06 10853.28 10746.06
# Slight improvement in AIC, BIC, and deviance for the factor version
glm_fit(gm_l_f)
##
## Re-fitting to get Hessian
## Likelihood_Ratio AIC BIC Deviance
## 1 -5367.878 10773.76 10887.66 10735.76
Looking at extreme conservatives versus extreme liberals, we see that extreme liberals are far more supportive of gay marriage than extreme conservatives.
logit<-ggpredict(gm_l_f, terms="libcon_f") #ggpredict(model_name, terms="variable to graph")
print(logit) #Overview of the predicted probability results
## # Predicted probabilities of gay_marriage
##
## # Response Level = 1
##
## libcon_f | Predicted | 95% CI
## -------------------------------------
## Moderate | 0.04 | [0.03, 0.04]
## Extreme C | 0.24 | [0.19, 0.29]
## Extreme L | 0.01 | [0.01, 0.02]
## Slightly C | 0.07 | [0.05, 0.08]
## Slightly L | 0.03 | [0.02, 0.03]
## Somewhat C | 0.12 | [0.10, 0.15]
## Somewhat L | 0.01 | [0.01, 0.01]
##
## # Response Level = 2
##
## libcon_f | Predicted | 95% CI
## -------------------------------------
## Moderate | 0.10 | [0.08, 0.11]
## Extreme C | 0.31 | [0.26, 0.38]
## Extreme L | 0.04 | [0.02, 0.06]
## Slightly C | 0.15 | [0.13, 0.19]
## Slightly L | 0.07 | [0.06, 0.09]
## Somewhat C | 0.23 | [0.20, 0.28]
## Somewhat L | 0.02 | [0.02, 0.03]
##
## # Response Level = 3
##
## libcon_f | Predicted | 95% CI
## -------------------------------------
## Moderate | 0.87 | [0.85, 0.88]
## Extreme C | 0.45 | [0.38, 0.52]
## Extreme L | 0.95 | [0.92, 0.97]
## Slightly C | 0.78 | [0.74, 0.82]
## Slightly L | 0.90 | [0.88, 0.92]
## Somewhat C | 0.64 | [0.59, 0.69]
## Somewhat L | 0.97 | [0.95, 0.98]
##
## Adjusted for:
## * relig_ident_f = Non-Religous
## * sex_orient = 1.12
## * educ_f = 1
## * age_f = 1
logit$x <-as.factor(logit$x)
# Define custom levels and labels
custom_levels <- c(1, 2, 3)
custom_labels <- c("Anti-gay relationship", "No gay marriage", "Pro gay marriage")
# Reassign variable with custom labels
logit$response.level <- factor(logit$response.level, levels = custom_levels, labels = custom_labels)
strong <- logit %>%
filter(x %in% c("Extreme L", "Extreme C")) # Filter for the ideology categories we need
ggplot(strong, aes(x = x, y = predicted, fill = response.level)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
theme_minimal(base_size = 13) +
labs(x = "Support for Gay Marriage", y = "Predicted Probability",
title = "Predicted Probability with Confidence Intervals") +
scale_fill_manual(values = c("Anti-gay relationship" = "darkgrey",
"No gay marriage" = "lightgrey",
"Pro gay marriage" = "antiquewhite")) +
labs(fill = "Ideological Identification") +
theme(legend.position = c(0.05, 0.95), legend.justification = c(0, 1))
Comparing support for gay marriage by sexual orientation, even though it was not significant: the groups follow an almost identical pattern, with homosexual respondents slightly higher.
logit2 <-ggpredict(gm_l_f, terms="sex_orient") #ggpredict(model_name, terms="variable to graph")
print(logit2) #Overview of the predicted probability results
## # Predicted probabilities of gay_marriage
##
## # Response Level = 1
##
## sex_orient | Predicted | 95% CI
## -------------------------------------
## 1 | 0.04 | [0.03, 0.04]
## 2 | 0.03 | [0.03, 0.05]
## 3 | 0.03 | [0.02, 0.05]
## 4 | 0.03 | [0.02, 0.05]
##
## # Response Level = 2
##
## sex_orient | Predicted | 95% CI
## -------------------------------------
## 1 | 0.10 | [0.09, 0.11]
## 2 | 0.09 | [0.07, 0.12]
## 3 | 0.08 | [0.05, 0.12]
## 4 | 0.07 | [0.04, 0.12]
##
## # Response Level = 3
##
## sex_orient | Predicted | 95% CI
## -------------------------------------
## 1 | 0.86 | [0.85, 0.88]
## 2 | 0.88 | [0.84, 0.91]
## 3 | 0.89 | [0.84, 0.93]
## 4 | 0.90 | [0.84, 0.94]
##
## Adjusted for:
## * libcon_f = Moderate
## * relig_ident_f = Non-Religous
## * educ_f = 1
## * age_f = 1
logit2$x <-as.factor(logit2$x)
# Define custom levels and labels
custom_levels <- c(1, 2, 3)
custom_labels <- c("Anti-gay relationship", "No gay marriage", "Pro gay marriage")
# Reassign variable with custom labels
logit2$response.level <- factor(logit2$response.level, levels = custom_levels, labels = custom_labels)
orient <- logit2 %>%
filter(x %in% c("1", "2")) # Filter for the first two sexual orientation categories
ggplot(orient, aes(x = x, y = predicted, fill = response.level)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
theme_minimal(base_size = 13) +
labs(x = "Support for Gay Marriage", y = "Predicted Probability",
title = "Predicted Probability with Confidence Intervals") +
scale_fill_manual(values = c("Anti-gay relationship" = "darkgrey",
"No gay marriage" = "lightgrey",
"Pro gay marriage" = "antiquewhite")) +
labs(fill = "Sexual Orientation") +
theme(legend.position = c(0.05, 0.95), legend.justification = c(0, 1))
Comparing support for gay marriage across the different religious identifications, we can visually see that the non-religious group was the most supportive of gay marriage, which is consistent with my hypothesis.
logit3 <-ggpredict(gm_l_f, terms="relig_ident_f") #ggpredict(model_name, terms="variable to graph")
print(logit3) #Overview of the predicted probability results
## # Predicted probabilities of gay_marriage
##
## # Response Level = 1
##
## relig_ident_f | Predicted | 95% CI
## ----------------------------------------
## Non-Religous | 0.04 | [0.03, 0.04]
## Monotheist | 0.08 | [0.07, 0.10]
## Other | 0.10 | [0.08, 0.13]
## Polytheist | 0.06 | [0.04, 0.10]
##
## # Response Level = 2
##
## relig_ident_f | Predicted | 95% CI
## ----------------------------------------
## Non-Religous | 0.10 | [0.08, 0.11]
## Monotheist | 0.18 | [0.15, 0.21]
## Other | 0.21 | [0.17, 0.25]
## Polytheist | 0.14 | [0.09, 0.23]
##
## # Response Level = 3
##
## relig_ident_f | Predicted | 95% CI
## ----------------------------------------
## Non-Religous | 0.87 | [0.85, 0.88]
## Monotheist | 0.74 | [0.69, 0.78]
## Other | 0.69 | [0.63, 0.74]
## Polytheist | 0.79 | [0.69, 0.87]
##
## Adjusted for:
## * libcon_f = Moderate
## * sex_orient = 1.12
## * educ_f = 1
## * age_f = 1
logit3$x <-as.factor(logit3$x)
# Define custom levels and labels
custom_levels <- c(1, 2, 3)
custom_labels <- c("Anti-gay relationship", "No gay marriage", "Support gay marriage")
# Reassign variable with custom labels
logit3$response.level <- factor(logit3$response.level, levels = custom_levels, labels = custom_labels)
custom_order <- c('Monotheist', 'Polytheist', 'Non-Religous', 'Other')
# Ordered model
plot1 <- ggplot(logit3, aes(x = x, y = predicted, color = response.level, group = response.level)) +
geom_point() + theme_minimal(base_size = 10) +
labs(x = "Support for Gay Marriage", y = "Predicted Probability",
title = "Predicted Probability with Confidence Intervals") +
labs(color = "Gay Marriage Opinion") +
geom_errorbar(aes(ymin=conf.low, ymax=conf.high),
linewidth=.3, # Thinner lines
width=.2) + scale_x_discrete(limits = custom_order)
# Multinomial model
anes_gm$gay_marriage_f<-as.character(anes_gm$gay_marriage)
model <- multinom(gay_marriage_f ~ libcon_f + relig_ident_f + sex_orient + educ_f + age_f, data=anes_gm)
## # weights: 57 (36 variable)
## initial value 8347.256169
## iter 10 value 6023.083802
## iter 20 value 5499.163569
## iter 30 value 5358.219144
## iter 40 value 5310.159889
## final value 5308.582322
## converged
stargazer(model, type="text")
##
## ====================================================
## Dependent variable:
## ----------------------------
## 2 3
## (1) (2)
## ----------------------------------------------------
## libcon_fExtreme C -0.871*** -2.745***
## (0.156) (0.164)
##
## libcon_fExtreme L -0.654 0.687**
## (0.410) (0.303)
##
## libcon_fSlightly C -0.204 -0.708***
## (0.137) (0.118)
##
## libcon_fSlightly L -0.056 0.313**
## (0.173) (0.146)
##
## libcon_fSomewhat C -0.277** -1.535***
## (0.128) (0.115)
##
## libcon_fSomewhat L -0.226 1.372***
## (0.279) (0.227)
##
## relig_ident_fMonotheist 0.112 -0.765***
## (0.138) (0.117)
##
## relig_ident_fOther -0.436*** -1.261***
## (0.152) (0.127)
##
## relig_ident_fPolytheist 0.759 0.042
## (0.580) (0.541)
##
## sex_orient -0.060 0.084
## (0.129) (0.104)
##
## educ_f.L 0.597*** 0.732***
## (0.101) (0.093)
##
## educ_f.Q -0.053 -0.165**
## (0.092) (0.084)
##
## educ_f.C -0.052 -0.029
## (0.084) (0.077)
##
## age_f.L 0.075 -0.571***
## (0.118) (0.101)
##
## age_f.Q 0.016 0.012
## (0.115) (0.099)
##
## age_f.C 0.225** 0.174*
## (0.106) (0.094)
##
## age_f4 -0.266** -0.215**
## (0.108) (0.095)
##
## Constant 0.768*** 2.920***
## (0.211) (0.175)
##
## ----------------------------------------------------
## Akaike Inf. Crit. 10,689.170 10,689.170
## ====================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
model_m<-ggpredict(model, terms="relig_ident_f")
plot2 <- ggplot(model_m, aes(x = factor(x, level =c('Monotheist', 'Polytheist', 'Non-Religous', 'Other')), y = predicted, color = response.level, group = response.level)) +
geom_point() + theme_minimal(base_size = 15) +
labs(x = "Multi model", y = "Predicted Probability",
title = "Predicted Probability with Confidence Intervals") +
labs(color = "Gay Marriage Opinion")
combo <- plot1 + plot2
combo
Is it violated overall? For individual variables? Should you move to a multinomial modeling approach?
When running the model without education and age as factors, the Brant test shows the assumption is violated overall (the omnibus test rejects), and several individual variables violate it as well, including the extreme ideology categories, monotheist identification, education, and age group, while others such as sexual orientation do not. When education and age are converted to factors and the test is re-run, more of the individual terms pass, although the omnibus test is still violated and the linear education and age terms still fail. The probit and logit models behaved exactly the same on this test, so the choice between them did not matter much here; what mattered more was whether to use factors or to change the modeling approach altogether.
Looking at the side-by-side plots, the ordered and multinomial models give very similar predicted probabilities, so I would use the ordered model rather than the multinomial one. If the results had differed significantly, I would have moved to the multinomial model.
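One simple way to put the ordered-versus-multinomial question on a common footing is an information-criterion comparison, since the two specifications are not nested in the usual sense; a minimal sketch using the factor-coded ordered logit (gm_l_f) and the multinomial model fit above:
# AIC comparison between the proportional-odds model and the multinomial
# logit; a small difference favors keeping the simpler ordered model.
AIC(gm_l_f, model)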