Load data
dta_vote<-read.csv("C:/Users/ASUS/Desktop/data/nes_2008.csv", header =T)
head(dta_vote)
## Age Married No Yes
## 1 18 0 164 11
## 2 21 0 126 153
## 3 25 0 83 179
## 4 30 0 85 134
## 5 36 0 51 128
## 6 40 0 38 148
str(dta_vote)
## 'data.frame': 26 obs. of 4 variables:
## $ Age : int 18 21 25 30 36 40 45 50 55 60 ...
## $ Married: int 0 0 0 0 0 0 0 0 0 0 ...
## $ No : int 164 126 83 85 51 38 47 63 53 41 ...
## $ Yes : int 11 153 179 134 128 148 138 220 228 198 ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dta_vote$Married<-factor(dta_vote$Married)
dta_vote$Age= as.numeric(dta_vote$Age)
dta_vote <- dta_vote %>% mutate(Total = Yes + No)
dta_vote <- dta_vote %>% mutate(Prop = Yes/Total)
head(dta_vote)
## Age Married No Yes Total Prop
## 1 18 0 164 11 175 0.06285714
## 2 21 0 126 153 279 0.54838710
## 3 25 0 83 179 262 0.68320611
## 4 30 0 85 134 219 0.61187215
## 5 36 0 51 128 179 0.71508380
## 6 40 0 38 148 186 0.79569892
aggregate(cbind(Yes, No) ~ Married, data = dta_vote, sum)
## Married Yes No
## 1 0 1861 814
## 2 1 2678 476
aggregate(cbind(Yes, No) ~ Age, data = dta_vote, sum)
## Age Yes No
## 1 18 11 172
## 2 21 173 155
## 3 25 276 146
## 4 30 315 139
## 5 36 298 103
## 6 40 393 84
## 7 45 378 86
## 8 50 529 110
## 9 55 562 105
## 10 60 510 74
## 11 65 469 49
## 12 70 293 38
## 13 75 332 29
Based on the plot, it seemed that order people tended to vote in 2008. Furthermore, Married
people had higher vote rate compared with Single people.
library(ggplot2)
ggplot(dta_vote, aes(Age, Yes/Total, color = Married)) +
geom_jitter() +
stat_smooth(method = "glm", formula = y ~ x,
method.args = list(family = binomial),
aes(weight = Total), se = FALSE) +
labs(x = "Age", y = "Vote Rate") +
theme(legend.position = c(.1, .9))
#logistic regression models
# slopes and intercepts dependent on Married?
summary(ma <- glm(cbind(Yes, No) ~ Age*Married, data = dta_vote,
family = "binomial"))
##
## Call:
## glm(formula = cbind(Yes, No) ~ Age * Married, family = "binomial",
## data = dta_vote)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.7977 -1.2562 0.0811 1.1050 4.7559
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.918787 0.116487 -7.887 3.08e-15 ***
## Age 0.042995 0.002813 15.285 < 2e-16 ***
## Married1 0.358598 0.205253 1.747 0.0806 .
## Age:Married1 0.005082 0.004599 1.105 0.2691
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 882.94 on 25 degrees of freedom
## Residual deviance: 229.51 on 22 degrees of freedom
## AIC: 368.35
##
## Number of Fisher Scoring iterations: 4
# different intercepts for Married?
summary(mb <- glm(cbind(Yes, No) ~ Age+ Married, data = dta_vote,
family = "binomial"))
##
## Call:
## glm(formula = cbind(Yes, No) ~ Age + Married, family = "binomial",
## data = dta_vote)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.566 -1.154 -0.051 1.142 4.959
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.992469 0.095874 -10.352 <2e-16 ***
## Age 0.044922 0.002229 20.154 <2e-16 ***
## Married1 0.572484 0.069292 8.262 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 882.94 on 25 degrees of freedom
## Residual deviance: 230.73 on 23 degrees of freedom
## AIC: 367.57
##
## Number of Fisher Scoring iterations: 4
# no MArried effect
summary(mc <- glm(cbind(Yes, No) ~ Age, data = dta_vote,
family = "binomial"))
##
## Call:
## glm(formula = cbind(Yes, No) ~ Age, family = "binomial", data = dta_vote)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -12.5330 -2.1475 0.3402 1.5936 4.3390
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.911675 0.094900 -9.607 <2e-16 ***
## Age 0.049421 0.002183 22.637 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 882.94 on 25 degrees of freedom
## Residual deviance: 299.48 on 24 degrees of freedom
## AIC: 434.32
##
## Number of Fisher Scoring iterations: 4
# test for effects
anova(ma, mb, mc, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: cbind(Yes, No) ~ Age * Married
## Model 2: cbind(Yes, No) ~ Age + Married
## Model 3: cbind(Yes, No) ~ Age
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 22 229.51
## 2 23 230.74 -1 -1.224 0.2685
## 3 24 299.48 -1 -68.746 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This plot looks very similar to Visdualization01 ?
# fortify data with proportions and predicted probabilities
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
dta_mb <- predict(mb, newdata= dta_vote[, c(1,2,6)],
type = c("response"))
# plot
ggplot(dta_vote, aes(Age, Prop, color = Married)) +
geom_jitter() +
geom_line(aes(Age, dta_mb, color = Married)) +
geom_hline(yintercept = .5, col = "gray") +
labs(x = "Age", y = "Vote Rate") +
theme(legend.position = c(.9, .1))
# model diagnostics
# residual plot
library(stats)
plot(residuals(mb, type = "pearson") ~ fitted(mb), ylim=c(-3.3, 3.3),
pch = 20, xlab = "Fitted values", ylab = "Pearson residuals")
abline(h=0)
grid()
# The end