1 SetUp

https://www.ucl.ac.uk/~uctqiax/PUBLG100/2015/week9/seminar9.html

1.1 Packages

# install.packages("https://cran.r-project.org/src/contrib/Archive/Zelig/Zelig_4.2-1.tar.gz", 
#                  repos=NULL, 
#                  type="source")
# install.packages("sandwich")
# packageVersion("Zelig")
# # install.packages("Zelig")
# install.packages("foriegn")
# install.packages("lmtest")
# install.packages("library(texreg)")
# install.packages("huxtable", dependencies = TRUE)

library(Zelig)    # load BEFORE dplyr!!!

## Loading required package: boot

## Warning: package 'boot' was built under R version 4.2.3

## Loading required package: MASS

## Warning: package 'MASS' was built under R version 4.2.3

## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 4.2.3

## ZELIG (Versions 4.2-1, built: 2013-09-12)

## 
## +----------------------------------------------------------------+
## |  Please refer to http://gking.harvard.edu/zelig for full       |
## |  documentation or help.zelig() for help with commands and      |
## |  models support by Zelig.                                      |
## |                                                                |
## |  Zelig project citations:                                      |
## |    Kosuke Imai, Gary King, and Olivia Lau.  (2009).            |
## |    ``Zelig: Everyone's Statistical Software,''                 |
## |    http://gking.harvard.edu/zelig                              |
## |   and                                                          |
## |    Kosuke Imai, Gary King, and Olivia Lau. (2008).             |
## |    ``Toward A Common Framework for Statistical Analysis        |
## |    and Development,'' Journal of Computational and             |
## |    Graphical Statistics, Vol. 17, No. 4 (December)             |
## |    pp. 892-913.                                                |
## |                                                                |
## |   To cite individual Zelig models, please use the citation     |
## |   format printed with each model run and in the documentation. |
## +----------------------------------------------------------------+

## 
## Attaching package: 'Zelig'

## The following object is masked from 'package:utils':
## 
##     cite

library(foreign)  # to load .dta files

## Warning: package 'foreign' was built under R version 4.2.3

library(lmtest)   # for likelihood ratio test

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(texreg)   # regression tables

## Warning: package 'texreg' was built under R version 4.2.3

## Version:  1.39.3
## Date:     2023-11-09
## Author:   Philip Leifeld (University of Essex)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").

  library(dplyr)  # data manipulation/ summary statistics

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:Zelig':
## 
##     combine, summarize

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(huxtable) # https://evalsp20.classes.andrewheiss.com/reference/regtables/

## Warning: package 'huxtable' was built under R version 4.2.3

## 
## Attaching package: 'huxtable'

## The following object is masked from 'package:dplyr':
## 
##     add_rownames

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.3

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:huxtable':
## 
##     theme_grey

## The following object is masked from 'package:Zelig':
## 
##     alpha

1.2 Data

1.3 Dependent Variable

# clear environment
rm(list = ls())

# load British post election study
bes <- read.dta("http://uclspp.github.io/PUBLG100/data/bes.dta")
head(bes)

cs_id	Turnout	Vote2001	Income	Age	Gender	PartyID	Influence	Attention	Telephone	LeftrightSelf	CivicDutyIndex	polinfoindex	edu15	edu16	edu17	edu18	edu19plus	in_school	in_uni	CivicDutyScores
1	0	1	4	76	0	1	1	8	1	7	20	7	1	0	0	0	0	0	0	-0.633
2	1	1	5	32	1	0	3	8	1	6	15	5	0	1	0	0	0	0	0	1.48
3	1
4	0	1	1	35	0	0	1	1	0	5	26	1	0	1	0	0	0	0	0	-2.15
5	1	1	7	56	1	0	1	9	1	9	16	7	0	0	1	0	0	0	0	1.03
6	1	1	4	76	0	1	4	8	1	8	16	4	0	1	0	0	0	0	0	0.366

# frequency table of voter turnout
table(bes$Turnout)

## 
##    0    1 
## 1079 3712

1.3.1 Preliminary EDA

# rename Gender to male b/c 1 = male & remove missings
bes <- bes %>%
  rename(male = Gender) %>%
  na.omit()

# mean and standard deviation for Turnout by gender
bes %>%
  group_by(male) %>%
  summarise(avg_turnout = mean(Turnout), sd_turnout = sd(Turnout))

male	avg_turnout	sd_turnout
0	0.747	0.435
1	0.733	0.442

# mean for multiple columns using "summarise_each"
bes %>% 
  group_by(male) %>%
  summarise_each(funs(mean), Turnout, Vote2001, Age, LeftrightSelf, 
                 CivicDutyIndex, polinfoindex)

## Warning: `summarise_each()` was deprecated in dplyr 0.7.0.
## ℹ Please use `across()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

male	Turnout	Vote2001	Age	LeftrightSelf	CivicDutyIndex	polinfoindex
0	0.747	0.84	51	5.4	17.4	5.1
1	0.733	0.837	50.8	5.41	17.8	5.8

1.4 Logit Model

# logit model
model1 <- glm(formula = Turnout ~ Income + polinfoindex + male + edu15 + edu17 + edu18 + 
                edu19plus + in_school + in_uni, 
              family = binomial(link = "logit"),
              data = bes)

# regression output
summary(model1)

## 
## Call:
## glm(formula = Turnout ~ Income + polinfoindex + male + edu15 + 
##     edu17 + edu18 + edu19plus + in_school + in_uni, family = binomial(link = "logit"), 
##     data = bes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2687  -0.9372   0.5915   0.7879   1.7798  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.13925    0.14935  -7.628 2.38e-14 ***
## Income        0.03437    0.01862   1.846  0.06495 .  
## polinfoindex  0.37947    0.02230  17.019  < 2e-16 ***
## male         -0.35236    0.07741  -4.552 5.32e-06 ***
## edu15         0.37946    0.09619   3.945 7.99e-05 ***
## edu17         0.46013    0.14902   3.088  0.00202 ** 
## edu18         0.10982    0.14319   0.767  0.44312    
## edu19plus     0.24044    0.12266   1.960  0.04998 *  
## in_school     0.14754    0.39279   0.376  0.70719    
## in_uni       -0.71977    0.25498  -2.823  0.00476 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4763.0  on 4160  degrees of freedom
## Residual deviance: 4381.2  on 4151  degrees of freedom
## AIC: 4401.2
## 
## Number of Fisher Scoring iterations: 4

1.5 Coefficient Interpretation

exp(.03437)

## [1] 1.034967

The logistic regression coefficient \(\beta\) associated with a predictor X is the expected change in log odds of having the outcome per unit change in X.
So increasing the predictor by 1 unit (or going from 1 level to the next) multiplies the odds of having the outcome by \(e^\beta\) .

1.6 Additive (Change in Log-Odds) Interpretation:

Additive Interpretation:
- For an independent variable X, if β1 is the coefficient, the change in log-odds is β1.
- A positive β1 implies an increase in the log-odds of the event happening for a one-unit increase in X.
- A negative β1 implies a decrease in the log-odds of the event happening for a one-unit increase in X.
Example:
- Suppose β1=0.1. This means that for a one-unit increase in X, the log-odds of the event happening increase by 0.1.

1.7 Multiplicative (Odds Ratio) Interpretation Interpretation:

In logistic regression, the coefficients (β) represent the change in the log-odds of the dependent variable for a one-unit change in the corresponding independent variable.

For an independent variable X, if β1 is the coefficient, the odds ratio is calculated as e^β1.
An odds ratio greater than 1 implies an increase in the odds of the event happening for a one-unit increase in X.
An odds ratio less than 1 implies a decrease in the odds of the event happening for a one-unit increase in X.
If the odds ratio is 1, there is no effect on the odds for a one-unit change in X.

Example:
- Suppose β1=0.1. The odds ratio would be e^0.1≈1.105. This means that for a one-unit increase in X, the odds of the event happening increase by approximately 10.5%.

The choice between the additive and multiplicative interpretations often depends on the context of the analysis and the ease of interpretation for the audience.

Odds ratios are commonly reported in logistic regression analyses as they provide a more intuitive interpretation, especially in the context of binary outcomes.

Remember, these interpretations hold when considering a one-unit change in the independent variable. If the independent variable is categorical, the interpretation may differ based on the coding scheme used (e.g., dummy coding).

2 Confusion Matrix

# Generate predicted probabilities
predicted_probabilities <- predict(object = model1, 
                                   type = "response"
                                   )

# Convert predicted probabilities to binary outcomes (0 or 1) using a threshold (e.g., 0.5)
threshold <- 0.5
binary_predictions <- ifelse(test = predicted_probabilities >= threshold,
                             yes = 1,
                             no = 0
                             )

# Attach binary predictions to the original dataset
bes$binary_predictions <- binary_predictions

summary(bes$binary_predictions)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.9356  1.0000  1.0000

table(bes$Turnout, bes$binary_predictions)

##    
##        0    1
##   0  160  919
##   1  108 2974

You can not calculate the sensitivity, specificity, et all

# Create a confusion matrix
conf_matrix <- table(bes$Turnout, bes$binary_predictions)

# Create a confusion matrix with row and column labels
conf_matrix <- table(bes$Turnout, bes$binary_predictions, dnn = c("Actual", "Predicted"))

# Print the labeled confusion matrix
conf_matrix

##       Predicted
## Actual    0    1
##      0  160  919
##      1  108 2974

# Extract values from the confusion matrix
true_positives <- conf_matrix[2, 2]
false_positives <- conf_matrix[1, 2]
true_negatives <- conf_matrix[1, 1]
false_negatives <- conf_matrix[2, 1]

# Calculate sensitivity and specificity
sensitivity <- true_positives / (true_positives + false_negatives)
specificity <- true_negatives / (true_negatives + false_positives)

# Print the results
cat("Sensitivity (True Positive Rate):", sensitivity, "\n")

## Sensitivity (True Positive Rate): 0.9649578

cat("Specificity (True Negative Rate):", specificity, "\n")

## Specificity (True Negative Rate): 0.1482854

3 Sigmoid curve

# Function to calculate the logistic (S-shaped) curve
logistic_curve <- function(x) {
  return(1 / (1 + exp(-x)))
}

# Generate a sequence of values for the x-axis
x_values <- seq(min(predict(model1, type = "link")), 
                max(predict(model1, type = "link")), 
                length = 100)

# Calculate the predicted probabilities using the logistic curve function
predicted_probabilities_curve <- logistic_curve(x_values)


# Plot the logistic curve and predicted probabilities
ggplot() +
  geom_line(data = data.frame(x = x_values, 
                              y = predicted_probabilities_curve),
            aes(x = x,
                y = y), color = "blue", size = 1) +
  geom_point(data = bes, mapping = aes(x = predict(model1, type = "link"), 
                               y = predicted_probabilities, 
                               color = factor(binary_predictions)
                             ),
             alpha = 0.7, size = 2) +
  scale_color_manual(values = c("0" = "blue", "1" = "red")) +
  labs(title = "S-Curve and Predicted Probabilities",
       x = "Predicted Log-Odds",
       y = "Predicted Probabilities") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

4 Binary Predictions and Predicted Probabilities

# Plot predicted probabilities
ggplot(bes, aes(x = predicted_probabilities, fill = factor(binary_predictions))) +
  geom_histogram(binwidth = 0.05, position = "identity", alpha = 0.7) +
  labs(title = "Predicted Probabilities and Binary Predictions",
       x = "Predicted Probabilities",
       y = "Frequency") +
  scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
  theme_minimal()

5 Residual Analysis

Do not do residual analysis on the regression when y is discrete binary variable.

plot(model1)

Logit

Arvind Sharma

2024-04-23