https://www.ucl.ac.uk/~uctqiax/PUBLG100/2015/week9/seminar9.html
# install.packages("https://cran.r-project.org/src/contrib/Archive/Zelig/Zelig_4.2-1.tar.gz",
# repos=NULL,
# type="source")
# install.packages("sandwich")
# packageVersion("Zelig")
# install.packages("Zelig")
# install.packages("foreign")
# install.packages("lmtest")
# install.packages("texreg")
# install.packages("huxtable", dependencies = TRUE)
library(Zelig) # load BEFORE dplyr!!!
## Loading required package: boot
## Warning: package 'boot' was built under R version 4.2.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.2.3
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.3
## ZELIG (Versions 4.2-1, built: 2013-09-12)
##
## +----------------------------------------------------------------+
## | Please refer to http://gking.harvard.edu/zelig for full |
## | documentation or help.zelig() for help with commands and |
## | models supported by Zelig. |
## | |
## | Zelig project citations: |
## | Kosuke Imai, Gary King, and Olivia Lau. (2009). |
## | ``Zelig: Everyone's Statistical Software,'' |
## | http://gking.harvard.edu/zelig |
## | and |
## | Kosuke Imai, Gary King, and Olivia Lau. (2008). |
## | ``Toward A Common Framework for Statistical Analysis |
## | and Development,'' Journal of Computational and |
## | Graphical Statistics, Vol. 17, No. 4 (December) |
## | pp. 892-913. |
## | |
## | To cite individual Zelig models, please use the citation |
## | format printed with each model run and in the documentation. |
## +----------------------------------------------------------------+
##
## Attaching package: 'Zelig'
## The following object is masked from 'package:utils':
##
## cite
library(foreign) # to load .dta files
## Warning: package 'foreign' was built under R version 4.2.3
library(lmtest) # for likelihood ratio test
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(texreg) # regression tables
## Warning: package 'texreg' was built under R version 4.2.3
## Version: 1.39.3
## Date: 2023-11-09
## Author: Philip Leifeld (University of Essex)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
library(dplyr) # data manipulation/ summary statistics
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Zelig':
##
## combine, summarize
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(huxtable) # https://evalsp20.classes.andrewheiss.com/reference/regtables/
## Warning: package 'huxtable' was built under R version 4.2.3
##
## Attaching package: 'huxtable'
## The following object is masked from 'package:dplyr':
##
## add_rownames
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:huxtable':
##
## theme_grey
## The following object is masked from 'package:Zelig':
##
## alpha
# clear environment
rm(list = ls())
# load British post election study
bes <- read.dta("http://uclspp.github.io/PUBLG100/data/bes.dta")
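Note that foreign::read.dta() only reads Stata files up to version 12, which is fine for this dataset. For newer .dta files, haven::read_dta() would be a drop-in alternative (shown commented out; it is not used in this seminar):
# alternative reader for newer Stata formats (not needed here)
# bes <- haven::read_dta("http://uclspp.github.io/PUBLG100/data/bes.dta")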
head(bes)
cs_id | Turnout | Vote2001 | Income | Age | Gender | PartyID | Influence | Attention | Telephone | LeftrightSelf | CivicDutyIndex | polinfoindex | edu15 | edu16 | edu17 | edu18 | edu19plus | in_school | in_uni | CivicDutyScores |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0 | 1 | 4 | 76 | 0 | 1 | 1 | 8 | 1 | 7 | 20 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | -0.633 |
2 | 1 | 1 | 5 | 32 | 1 | 0 | 3 | 8 | 1 | 6 | 15 | 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1.48 |
3 | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
4 | 0 | 1 | 1 | 35 | 0 | 0 | 1 | 1 | 0 | 5 | 26 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | -2.15 |
5 | 1 | 1 | 7 | 56 | 1 | 0 | 1 | 9 | 1 | 9 | 16 | 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1.03 |
6 | 1 | 1 | 4 | 76 | 0 | 1 | 4 | 8 | 1 | 8 | 16 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.366 |
# frequency table of voter turnout
table(bes$Turnout)
##
## 0 1
## 1079 3712
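To see turnout as shares rather than raw counts, wrap the same table in prop.table():
# turnout as proportions of the full sample (1079 and 3712 of 4791 respondents)
prop.table(table(bes$Turnout))
# roughly 0.23 non-voters vs. 0.77 voters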
# rename Gender to male b/c 1 = male & remove missings
bes <- bes %>%
rename(male = Gender) %>%
na.omit()
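It is worth checking how many rows listwise deletion keeps. The turnout table above implies 4,791 respondents before cleaning, and the degrees of freedom in the model summary further down imply 4,161 complete cases afterwards:
# complete cases remaining after na.omit()
nrow(bes)  # 4161, down from 4791 (= 1079 + 3712 in the turnout table)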
# mean and standard deviation for Turnout by gender
bes %>%
group_by(male) %>%
summarise(avg_turnout = mean(Turnout), sd_turnout = sd(Turnout))
male | avg_turnout | sd_turnout |
---|---|---|
0 | 0.747 | 0.435 |
1 | 0.733 | 0.442 |
# mean for multiple columns using "summarise_each"
bes %>%
group_by(male) %>%
summarise_each(funs(mean), Turnout, Vote2001, Age, LeftrightSelf,
CivicDutyIndex, polinfoindex)
## Warning: `summarise_each()` was deprecated in dplyr 0.7.0.
## ℹ Please use `across()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
##
## # Simple named list: list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
##
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
male | Turnout | Vote2001 | Age | LeftrightSelf | CivicDutyIndex | polinfoindex |
---|---|---|---|---|---|---|
0 | 0.747 | 0.84 | 51 | 5.4 | 17.4 | 5.1 |
1 | 0.733 | 0.837 | 50.8 | 5.41 | 17.8 | 5.8 |
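As the warnings say, summarise_each() and funs() are deprecated. A modern equivalent (assuming dplyr >= 1.0) uses across() and reproduces the table above:
# modern replacement for the deprecated summarise_each()/funs() idiom
bes %>%
  group_by(male) %>%
  summarise(across(c(Turnout, Vote2001, Age, LeftrightSelf,
                     CivicDutyIndex, polinfoindex), mean))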
# logit model
model1 <- glm(formula = Turnout ~ Income + polinfoindex + male + edu15 + edu17 + edu18 +
edu19plus + in_school + in_uni,
family = binomial(link = "logit"),
data = bes)
# regression output
summary(model1)
##
## Call:
## glm(formula = Turnout ~ Income + polinfoindex + male + edu15 +
## edu17 + edu18 + edu19plus + in_school + in_uni, family = binomial(link = "logit"),
## data = bes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2687 -0.9372 0.5915 0.7879 1.7798
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.13925 0.14935 -7.628 2.38e-14 ***
## Income 0.03437 0.01862 1.846 0.06495 .
## polinfoindex 0.37947 0.02230 17.019 < 2e-16 ***
## male -0.35236 0.07741 -4.552 5.32e-06 ***
## edu15 0.37946 0.09619 3.945 7.99e-05 ***
## edu17 0.46013 0.14902 3.088 0.00202 **
## edu18 0.10982 0.14319 0.767 0.44312
## edu19plus 0.24044 0.12266 1.960 0.04998 *
## in_school 0.14754 0.39279 0.376 0.70719
## in_uni -0.71977 0.25498 -2.823 0.00476 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4763.0 on 4160 degrees of freedom
## Residual deviance: 4381.2 on 4151 degrees of freedom
## AIC: 4401.2
##
## Number of Fisher Scoring iterations: 4
# odds ratio for a one-unit increase in Income: exp(beta)
exp(.03437)
## [1] 1.034967
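The same works for the whole coefficient vector at once, which is more convenient than exponentiating terms one by one:
# odds ratios for every coefficient in model1
exp(coef(model1))
# exp(confint(model1))  # profile-likelihood confidence intervals on the odds-ratio scale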
The logistic regression coefficient \(\beta\) associated with a predictor \(X\) is the expected change in the log odds of the outcome per unit change in \(X\).
So increasing the predictor by 1 unit (or going from one level to the next) multiplies the odds of the outcome by \(e^\beta\).
Additive interpretation:
In logistic regression, the coefficients represent the change in the log-odds of the dependent variable for a one-unit change in the corresponding independent variable.
For an independent variable \(X\) with coefficient \(\beta_1\), a one-unit increase in \(X\) changes the log-odds by \(\beta_1\).
A positive \(\beta_1\) implies an increase in the log-odds of the event happening for a one-unit increase in \(X\).
A negative \(\beta_1\) implies a decrease in the log-odds of the event happening for a one-unit increase in \(X\).
Example: the Income coefficient in model1 is 0.03437, so each additional income category raises the log-odds of turnout by 0.03437, holding the other predictors constant.
Multiplicative interpretation:
For an independent variable \(X\) with coefficient \(\beta_1\), the odds ratio is \(e^{\beta_1}\).
An odds ratio greater than 1 implies an increase in the odds of the event happening for a one-unit increase in \(X\).
An odds ratio less than 1 implies a decrease in the odds of the event happening for a one-unit increase in \(X\).
If the odds ratio is 1, a one-unit change in \(X\) has no effect on the odds.
Example: \(e^{0.03437} \approx 1.035\) (the calculation above), so each additional income category multiplies the odds of turnout by about 1.035, i.e. raises the odds by roughly 3.5%.
The choice between the additive and multiplicative interpretation often depends on the context of the analysis and which scale is easier for the audience to read.
Remember, these interpretations hold for a one-unit change in the independent variable. If the independent variable is categorical, the interpretation depends on the coding scheme: with dummy coding, for example, each coefficient compares a level against the reference category.
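Before turning to predicted probabilities, note that lmtest was loaded above "for likelihood ratio test" but has not been used yet. A minimal sketch, comparing model1 against an intercept-only null model:
# likelihood ratio test: does model1 improve on an intercept-only model?
model_null <- glm(Turnout ~ 1, family = binomial(link = "logit"), data = bes)
lrtest(model_null, model1)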
# Generate predicted probabilities
predicted_probabilities <- predict(object = model1,
type = "response"
)
# Convert predicted probabilities to binary outcomes (0 or 1) using a threshold (e.g., 0.5)
threshold <- 0.5
binary_predictions <- ifelse(test = predicted_probabilities >= threshold,
yes = 1,
no = 0
)
# Attach binary predictions to the original dataset
bes$binary_predictions <- binary_predictions
summary(bes$binary_predictions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.9356 1.0000 1.0000
table(bes$Turnout, bes$binary_predictions)
##
## 0 1
## 0 160 919
## 1 108 2974
From this unlabeled table alone you cannot reliably calculate the sensitivity, specificity, and so on, because it is not obvious which dimension holds the actual outcomes and which holds the predictions. Build a labeled confusion matrix first.
# Create a confusion matrix with row and column labels
conf_matrix <- table(bes$Turnout, bes$binary_predictions, dnn = c("Actual", "Predicted"))
# Print the labeled confusion matrix
conf_matrix
## Predicted
## Actual 0 1
## 0 160 919
## 1 108 2974
# Extract values from the confusion matrix
true_positives <- conf_matrix[2, 2]
false_positives <- conf_matrix[1, 2]
true_negatives <- conf_matrix[1, 1]
false_negatives <- conf_matrix[2, 1]
# Calculate sensitivity and specificity
sensitivity <- true_positives / (true_positives + false_negatives)
specificity <- true_negatives / (true_negatives + false_positives)
# Print the results
cat("Sensitivity (True Positive Rate):", sensitivity, "\n")
## Sensitivity (True Positive Rate): 0.9649578
cat("Specificity (True Negative Rate):", specificity, "\n")
## Specificity (True Negative Rate): 0.1482854
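Overall accuracy, the share of all observations classified correctly, follows from the same four cells:
# overall accuracy: correctly classified cases as a share of all cases
accuracy <- (true_positives + true_negatives) / sum(conf_matrix)
cat("Accuracy:", accuracy, "\n")
# about 0.753 given the matrix above: (160 + 2974) / 4161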
# Function to calculate the logistic (S-shaped) curve
logistic_curve <- function(x) {
return(1 / (1 + exp(-x)))
}
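As a quick sanity check, the logistic curve passes through 0.5 exactly when the log-odds are zero:
# logistic_curve(0) should return 0.5
logistic_curve(0)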
# Generate a sequence of values for the x-axis
x_values <- seq(min(predict(model1, type = "link")),
max(predict(model1, type = "link")),
length = 100)
# Calculate the predicted probabilities using the logistic curve function
predicted_probabilities_curve <- logistic_curve(x_values)
# Plot the logistic curve and predicted probabilities
ggplot() +
geom_line(data = data.frame(x = x_values,
y = predicted_probabilities_curve),
aes(x = x,
y = y), color = "blue", size = 1) +
geom_point(data = bes, mapping = aes(x = predict(model1, type = "link"),
y = predicted_probabilities,
color = factor(binary_predictions)
),
alpha = 0.7, size = 2) +
scale_color_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "S-Curve and Predicted Probabilities",
x = "Predicted Log-Odds",
y = "Predicted Probabilities") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plot predicted probabilities
ggplot(bes, aes(x = predicted_probabilities, fill = factor(binary_predictions))) +
geom_histogram(binwidth = 0.05, position = "identity", alpha = 0.7) +
labs(title = "Predicted Probabilities and Binary Predictions",
x = "Predicted Probabilities",
y = "Frequency") +
scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
theme_minimal()
Do not rely on standard residual analysis when the outcome is a discrete binary variable: the default diagnostic plots produced below assume continuous, roughly normal residuals, which a 0/1 outcome cannot deliver.
plot(model1)