1 SetUp

https://www.ucl.ac.uk/~uctqiax/PUBLG100/2015/week9/seminar9.html

1.1 Packages

# install.packages("https://cran.r-project.org/src/contrib/Archive/Zelig/Zelig_4.2-1.tar.gz", 
#                  repos=NULL, 
#                  type="source")
# install.packages("sandwich")
# packageVersion("Zelig")
# # install.packages("Zelig")
# install.packages("foriegn")
# install.packages("lmtest")
# install.packages("library(texreg)")
# install.packages("huxtable", dependencies = TRUE)

library(Zelig)    # load BEFORE dplyr!!!
## Loading required package: boot
## Warning: package 'boot' was built under R version 4.2.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.2.3
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.3
## ZELIG (Versions 4.2-1, built: 2013-09-12)
## 
## +----------------------------------------------------------------+
## |  Please refer to http://gking.harvard.edu/zelig for full       |
## |  documentation or help.zelig() for help with commands and      |
## |  models support by Zelig.                                      |
## |                                                                |
## |  Zelig project citations:                                      |
## |    Kosuke Imai, Gary King, and Olivia Lau.  (2009).            |
## |    ``Zelig: Everyone's Statistical Software,''                 |
## |    http://gking.harvard.edu/zelig                              |
## |   and                                                          |
## |    Kosuke Imai, Gary King, and Olivia Lau. (2008).             |
## |    ``Toward A Common Framework for Statistical Analysis        |
## |    and Development,'' Journal of Computational and             |
## |    Graphical Statistics, Vol. 17, No. 4 (December)             |
## |    pp. 892-913.                                                |
## |                                                                |
## |   To cite individual Zelig models, please use the citation     |
## |   format printed with each model run and in the documentation. |
## +----------------------------------------------------------------+
## 
## Attaching package: 'Zelig'
## The following object is masked from 'package:utils':
## 
##     cite
library(foreign)  # to load .dta files
## Warning: package 'foreign' was built under R version 4.2.3
library(lmtest)   # for likelihood ratio test
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(texreg)   # regression tables
## Warning: package 'texreg' was built under R version 4.2.3
## Version:  1.39.3
## Date:     2023-11-09
## Author:   Philip Leifeld (University of Essex)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
  library(dplyr)  # data manipulation/ summary statistics
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Zelig':
## 
##     combine, summarize
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(huxtable) # https://evalsp20.classes.andrewheiss.com/reference/regtables/
## Warning: package 'huxtable' was built under R version 4.2.3
## 
## Attaching package: 'huxtable'
## The following object is masked from 'package:dplyr':
## 
##     add_rownames
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:huxtable':
## 
##     theme_grey
## The following object is masked from 'package:Zelig':
## 
##     alpha

1.2 Data

1.3 Dependent Variable

# clear environment
rm(list = ls())

# load British post election study
bes <- read.dta("http://uclspp.github.io/PUBLG100/data/bes.dta")
head(bes)
cs_idTurnoutVote2001IncomeAgeGenderPartyIDInfluenceAttentionTelephoneLeftrightSelfCivicDutyIndexpolinfoindexedu15edu16edu17edu18edu19plusin_schoolin_uniCivicDutyScores
1014760118172071000000-0.633
21153210381615501000001.48 
31    
4011350011052610100000-2.15 
51175610191916700100001.03 
61147601481816401000000.366
# frequency table of voter turnout
table(bes$Turnout)
## 
##    0    1 
## 1079 3712

1.3.1 Preliminary EDA

# rename Gender to male b/c 1 = male & remove missings
bes <- bes %>%
  rename(male = Gender) %>%
  na.omit()

# mean and standard deviation for Turnout by gender
bes %>%
  group_by(male) %>%
  summarise(avg_turnout = mean(Turnout), sd_turnout = sd(Turnout))
maleavg_turnoutsd_turnout
00.7470.435
10.7330.442
# mean for multiple columns using "summarise_each"
bes %>% 
  group_by(male) %>%
  summarise_each(funs(mean), Turnout, Vote2001, Age, LeftrightSelf, 
                 CivicDutyIndex, polinfoindex)
## Warning: `summarise_each()` was deprecated in dplyr 0.7.0.
## ℹ Please use `across()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
maleTurnoutVote2001AgeLeftrightSelfCivicDutyIndexpolinfoindex
00.7470.84 51  5.4 17.45.1
10.7330.83750.85.4117.85.8

1.4 Logit Model

# logit model
model1 <- glm(formula = Turnout ~ Income + polinfoindex + male + edu15 + edu17 + edu18 + 
                edu19plus + in_school + in_uni, 
              family = binomial(link = "logit"),
              data = bes)

# regression output
summary(model1)
## 
## Call:
## glm(formula = Turnout ~ Income + polinfoindex + male + edu15 + 
##     edu17 + edu18 + edu19plus + in_school + in_uni, family = binomial(link = "logit"), 
##     data = bes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2687  -0.9372   0.5915   0.7879   1.7798  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.13925    0.14935  -7.628 2.38e-14 ***
## Income        0.03437    0.01862   1.846  0.06495 .  
## polinfoindex  0.37947    0.02230  17.019  < 2e-16 ***
## male         -0.35236    0.07741  -4.552 5.32e-06 ***
## edu15         0.37946    0.09619   3.945 7.99e-05 ***
## edu17         0.46013    0.14902   3.088  0.00202 ** 
## edu18         0.10982    0.14319   0.767  0.44312    
## edu19plus     0.24044    0.12266   1.960  0.04998 *  
## in_school     0.14754    0.39279   0.376  0.70719    
## in_uni       -0.71977    0.25498  -2.823  0.00476 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4763.0  on 4160  degrees of freedom
## Residual deviance: 4381.2  on 4151  degrees of freedom
## AIC: 4401.2
## 
## Number of Fisher Scoring iterations: 4

1.5 Coefficient Interpretation

exp(.03437)
## [1] 1.034967
  • The logistic regression coefficient \(\beta\) associated with a predictor X is the expected change in log odds of having the outcome per unit change in X.

  • So increasing the predictor by 1 unit (or going from 1 level to the next) multiplies the odds of having the outcome by \(e^\beta\) .

1.6 Additive (Change in Log-Odds) Interpretation:

  • Additive Interpretation:

    • For an independent variable X, if β1​ is the coefficient, the change in log-odds is β1​.

    • A positive β1​ implies an increase in the log-odds of the event happening for a one-unit increase in X.

    • A negative β1​ implies a decrease in the log-odds of the event happening for a one-unit increase in X.

  • Example:

    • Suppose β1​=0.1. This means that for a one-unit increase in X, the log-odds of the event happening increase by 0.1.

1.7 Multiplicative (Odds Ratio) Interpretation Interpretation:

In logistic regression, the coefficients (β) represent the change in the log-odds of the dependent variable for a one-unit change in the corresponding independent variable.

  • For an independent variable X, if β1​ is the coefficient, the odds ratio is calculated as e^β1​.

  • An odds ratio greater than 1 implies an increase in the odds of the event happening for a one-unit increase in X.

  • An odds ratio less than 1 implies a decrease in the odds of the event happening for a one-unit increase in X.

  • If the odds ratio is 1, there is no effect on the odds for a one-unit change in X.

  1. Example:

    • Suppose β1​=0.1. The odds ratio would be e^0.1≈1.105. This means that for a one-unit increase in X, the odds of the event happening increase by approximately 10.5%.

The choice between the additive and multiplicative interpretations often depends on the context of the analysis and the ease of interpretation for the audience.

  • Odds ratios are commonly reported in logistic regression analyses as they provide a more intuitive interpretation, especially in the context of binary outcomes.

Remember, these interpretations hold when considering a one-unit change in the independent variable. If the independent variable is categorical, the interpretation may differ based on the coding scheme used (e.g., dummy coding).

2 Confusion Matrix

# Generate predicted probabilities
predicted_probabilities <- predict(object = model1, 
                                   type = "response"
                                   )

# Convert predicted probabilities to binary outcomes (0 or 1) using a threshold (e.g., 0.5)
threshold <- 0.5
binary_predictions <- ifelse(test = predicted_probabilities >= threshold,
                             yes = 1,
                             no = 0
                             )

# Attach binary predictions to the original dataset
bes$binary_predictions <- binary_predictions

summary(bes$binary_predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.9356  1.0000  1.0000
table(bes$Turnout, bes$binary_predictions)
##    
##        0    1
##   0  160  919
##   1  108 2974

You can not calculate the sensitivity, specificity, et all

# Create a confusion matrix
conf_matrix <- table(bes$Turnout, bes$binary_predictions)

# Create a confusion matrix with row and column labels
conf_matrix <- table(bes$Turnout, bes$binary_predictions, dnn = c("Actual", "Predicted"))

# Print the labeled confusion matrix
conf_matrix
##       Predicted
## Actual    0    1
##      0  160  919
##      1  108 2974
# Extract values from the confusion matrix
true_positives <- conf_matrix[2, 2]
false_positives <- conf_matrix[1, 2]
true_negatives <- conf_matrix[1, 1]
false_negatives <- conf_matrix[2, 1]

# Calculate sensitivity and specificity
sensitivity <- true_positives / (true_positives + false_negatives)
specificity <- true_negatives / (true_negatives + false_positives)

# Print the results
cat("Sensitivity (True Positive Rate):", sensitivity, "\n")
## Sensitivity (True Positive Rate): 0.9649578
cat("Specificity (True Negative Rate):", specificity, "\n")
## Specificity (True Negative Rate): 0.1482854

3 Sigmoid curve

# Function to calculate the logistic (S-shaped) curve
logistic_curve <- function(x) {
  return(1 / (1 + exp(-x)))
}

# Generate a sequence of values for the x-axis
x_values <- seq(min(predict(model1, type = "link")), 
                max(predict(model1, type = "link")), 
                length = 100)

# Calculate the predicted probabilities using the logistic curve function
predicted_probabilities_curve <- logistic_curve(x_values)


# Plot the logistic curve and predicted probabilities
ggplot() +
  geom_line(data = data.frame(x = x_values, 
                              y = predicted_probabilities_curve),
            aes(x = x,
                y = y), color = "blue", size = 1) +
  geom_point(data = bes, mapping = aes(x = predict(model1, type = "link"), 
                               y = predicted_probabilities, 
                               color = factor(binary_predictions)
                             ),
             alpha = 0.7, size = 2) +
  scale_color_manual(values = c("0" = "blue", "1" = "red")) +
  labs(title = "S-Curve and Predicted Probabilities",
       x = "Predicted Log-Odds",
       y = "Predicted Probabilities") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

4 Binary Predictions and Predicted Probabilities

# Plot predicted probabilities
ggplot(bes, aes(x = predicted_probabilities, fill = factor(binary_predictions))) +
  geom_histogram(binwidth = 0.05, position = "identity", alpha = 0.7) +
  labs(title = "Predicted Probabilities and Binary Predictions",
       x = "Predicted Probabilities",
       y = "Frequency") +
  scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
  theme_minimal()

5 Residual Analysis

Do not do residual analysis on the regression when y is discrete binary variable.

plot(model1)