library(tidyverse)
library(Zelig)
library(texreg)
library(mvtnorm)
library(radiant.data)
library(sjmisc)
library(lattice)
library(dplyr)
Student DATA, EXTRA_CURRICULAR_ACTIVITIES
In this homework, I will conduct a regression analysis on student data of a secondary school. My focus will be examining the effects of age, sex (IV’s), reason they chose this secondary school to determine if these factors have an effect on extra curricular activities (ext_act)(DV).
library(radiant.data)
library(readr)
student <- read_csv("/Users/cruz/Desktop/students.csv", col_names = TRUE)
Parsed with column specification:
cols(
.default = col_character(),
age = col_integer(),
Medu = col_integer(),
Fedu = col_integer(),
traveltime = col_integer(),
studytime = col_integer(),
failures = col_integer(),
famrel = col_integer(),
freetime = col_integer(),
goout = col_integer(),
Dalc = col_integer(),
Walc = col_integer(),
health = col_integer(),
absences = col_integer(),
G1 = col_integer(),
G2 = col_integer(),
G3 = col_integer()
)
See spec(...) for full column specifications.
head(student)
student <- student%>%
mutate(ext_act = as.integer(activities))
NAs introduced by coercion
student3 <- student%>%
mutate(ext_act= ifelse(activities =="yes",1,0))
head(student3)
student3 %>%
select(higher, activities, ext_act, everything())
head(student3)
student3<-na.omit(student3)
head(student3)
The result display that age has a negative effect on extra-curricular activities with a (-.169). When looking at the effect of sex we see that males are (.450) more likely than females to be in extra-curricular activities which was also statistically significant. In addition, those who attended this particular secondary school for reputation reasons were (.684) more likely than any other reason to be involved in extra-curricular activities which was statistically significant.
nm0 <- glm(ext_act ~ age + sex + reason + goout, family = binomial, data = student3)
summary(nm0)
Call:
glm(formula = ext_act ~ age + sex + reason + goout, family = binomial,
data = student3)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6406 -1.1205 0.7976 1.1211 1.5599
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.102014 1.374549 1.529 0.1262
age -0.167256 0.082025 -2.039 0.0414 *
sexM 0.450886 0.208667 2.161 0.0307 *
reasonhome 0.006484 0.257694 0.025 0.9799
reasonother 0.046453 0.379134 0.123 0.9025
reasonreputation 0.684332 0.266427 2.569 0.0102 *
goout 0.105262 0.093905 1.121 0.2623
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 547.46 on 394 degrees of freedom
Residual deviance: 530.12 on 388 degrees of freedom
AIC: 544.12
Number of Fisher Scoring iterations: 4
nm1 <- glm(ext_act ~ age + sex + reason + goout, family = binomial, data = student3)
summary(nm1)
Call:
glm(formula = ext_act ~ age + sex + reason + goout, family = binomial,
data = student3)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6406 -1.1205 0.7976 1.1211 1.5599
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.102014 1.374549 1.529 0.1262
age -0.167256 0.082025 -2.039 0.0414 *
sexM 0.450886 0.208667 2.161 0.0307 *
reasonhome 0.006484 0.257694 0.025 0.9799
reasonother 0.046453 0.379134 0.123 0.9025
reasonreputation 0.684332 0.266427 2.569 0.0102 *
goout 0.105262 0.093905 1.121 0.2623
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 547.46 on 394 degrees of freedom
Residual deviance: 530.12 on 388 degrees of freedom
AIC: 544.12
Number of Fisher Scoring iterations: 4
nm2 <- glm(ext_act ~ age + sex + reason + goout + I(goout^2), family = binomial, data = student3)
summary(nm2)
Call:
glm(formula = ext_act ~ age + sex + reason + goout + I(goout^2),
family = binomial, data = student3)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6684 -1.1258 0.7881 1.1189 1.6112
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.697027 1.508295 1.125 0.2605
age -0.169341 0.082168 -2.061 0.0393 *
sexM 0.455065 0.208890 2.178 0.0294 *
reasonhome -0.003032 0.258187 -0.012 0.9906
reasonother 0.051306 0.379868 0.135 0.8926
reasonreputation 0.676118 0.266806 2.534 0.0113 *
goout 0.421977 0.496146 0.851 0.3950
I(goout^2) -0.049781 0.076495 -0.651 0.5152
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 547.46 on 394 degrees of freedom
Residual deviance: 529.70 on 387 degrees of freedom
AIC: 545.7
Number of Fisher Scoring iterations: 4
We see that model 1 or 2 best fit this data. Model 3 has a higher deviance than the other 2.
library(texreg)
screenreg(list(nm0, nm1,nm2))
=================================================
Model 1 Model 2 Model 3
-------------------------------------------------
(Intercept) 2.10 2.10 1.70
(1.37) (1.37) (1.51)
age -0.17 * -0.17 * -0.17 *
(0.08) (0.08) (0.08)
sexM 0.45 * 0.45 * 0.46 *
(0.21) (0.21) (0.21)
reasonhome 0.01 0.01 -0.00
(0.26) (0.26) (0.26)
reasonother 0.05 0.05 0.05
(0.38) (0.38) (0.38)
reasonreputation 0.68 * 0.68 * 0.68 *
(0.27) (0.27) (0.27)
goout 0.11 0.11 0.42
(0.09) (0.09) (0.50)
I(goout^2) -0.05
(0.08)
-------------------------------------------------
AIC 544.12 544.12 545.70
BIC 571.97 571.97 577.53
Log Likelihood -265.06 -265.06 -264.85
Deviance 530.12 530.12 529.70
Num. obs. 395 395 395
=================================================
*** p < 0.001, ** p < 0.01, * p < 0.05
library(texreg)
htmlreg(list(nm0,nm1))
Model 1 | Model 2 | ||
---|---|---|---|
(Intercept) | 2.10 | 2.10 | |
(1.37) | (1.37) | ||
age | -0.17* | -0.17* | |
(0.08) | (0.08) | ||
sexM | 0.45* | 0.45* | |
(0.21) | (0.21) | ||
reasonhome | 0.01 | 0.01 | |
(0.26) | (0.26) | ||
reasonother | 0.05 | 0.05 | |
(0.38) | (0.38) | ||
reasonreputation | 0.68* | 0.68* | |
(0.27) | (0.27) | ||
goout | 0.11 | 0.11 | |
(0.09) | (0.09) | ||
AIC | 544.12 | 544.12 | |
BIC | 571.97 | 571.97 | |
Log Likelihood | -265.06 | -265.06 | |
Deviance | 530.12 | 530.12 | |
Num. obs. | 395 | 395 | |
p < 0.001, p < 0.01, p < 0.05 |
Identifying if the IV’s in this analysis are in fact factors
(l <- sapply(student3, function(x) is.factor(x)))
school sex age address famsize Pstatus Medu
FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Fedu Mjob Fjob reason guardian traveltime studytime
FALSE FALSE FALSE FALSE FALSE FALSE FALSE
failures schoolsup famsup paid activities nursery higher
FALSE FALSE FALSE FALSE FALSE FALSE FALSE
internet romantic famrel freetime goout Dalc Walc
FALSE FALSE FALSE FALSE FALSE FALSE FALSE
health absences G1 G2 G3 ext_act
FALSE FALSE FALSE FALSE FALSE FALSE
student3 <- student3%>%
mutate(age = as.numeric(age),
sex = as.factor(sex),
reason = as.factor(reason))
z4hw.students <- zelig(ext_act ~ age + sex* reason + goout, model = "logit", data = student3)
How to cite this model in Zelig:
R Core Team. 2007.
logit: Logistic Regression for Dichotomous Dependent Variables
in Christine Choirat, Christopher Gandrud, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
"Zelig: Everyone's Statistical Software," http://zeligproject.org/
z4hw.set <- setx(z4hw.students, age = min(student3$age):max(student3$age))
z4hw.sim <- sim(z4hw.students, z4hw.set)
ci.plot(z4hw.sim)
summary(z4hw.students)
Model:
Call:
z5$zelig(formula = ext_act ~ age + sex * reason + goout, data = student3)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7303 -1.1346 0.7316 1.1570 1.5417
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.22830 1.38766 1.606 0.1083
age -0.16974 0.08234 -2.061 0.0393
sexM 0.26936 0.33701 0.799 0.4241
reasonhome -0.04336 0.36882 -0.118 0.9064
reasonother -0.16626 0.54463 -0.305 0.7602
reasonreputation 0.49036 0.34681 1.414 0.1574
goout 0.10757 0.09442 1.139 0.2546
sexM:reasonhome 0.10079 0.51437 0.196 0.8447
sexM:reasonother 0.42006 0.76488 0.549 0.5829
sexM:reasonreputation 0.47903 0.55103 0.869 0.3847
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 547.46 on 394 degrees of freedom
Residual deviance: 529.19 on 385 degrees of freedom
AIC: 549.19
Number of Fisher Scoring iterations: 4
Next step: Use 'setx' method
(l <- sapply(student3, function(x) is.factor(x)))
school sex age address famsize Pstatus Medu
FALSE TRUE FALSE FALSE FALSE FALSE FALSE
Fedu Mjob Fjob reason guardian traveltime studytime
FALSE FALSE FALSE TRUE FALSE FALSE FALSE
failures schoolsup famsup paid activities nursery higher
FALSE FALSE FALSE FALSE FALSE FALSE FALSE
internet romantic famrel freetime goout Dalc Walc
FALSE FALSE FALSE FALSE FALSE FALSE FALSE
health absences G1 G2 G3 ext_act
FALSE FALSE FALSE FALSE FALSE FALSE
goout.range = min(student3$goout):max(student3$goout)
x <- setx(z4hw.students, goout = goout.range)
s <- sim(z4hw.students, x = x)
ci.plot(s)
The simulation displays that the estimated values (EV) for males is (0.496) units as shown in the mean. In contrast, the estimated values for females is (0.432) units. The first diffrence (fd) between them is (-0.0632).
x <- setx(z4hw.students, sex = "M")
x1 <- setx(z4hw.students, sex = "F")
s <- sim(z4hw.students, x = x, x1 = x1)
summary(s)
sim x :
-----
ev
mean sd 50% 2.5% 97.5%
[1,] 0.5003575 0.05802548 0.4996959 0.3840955 0.6108303
pv
0 1
[1,] 0.489 0.511
sim x1 :
-----
ev
mean sd 50% 2.5% 97.5%
[1,] 0.4345366 0.05907625 0.4338276 0.3201294 0.5558806
pv
0 1
[1,] 0.595 0.405
fd
mean sd 50% 2.5% 97.5%
[1,] -0.06582088 0.08208778 -0.06520027 -0.2227502 0.1017569
fd <- s$get_qi(xvalue="x1", qi="fd")
summary(fd)
V1
Min. :-0.31780
1st Qu.:-0.11988
Median :-0.06520
Mean :-0.06582
3rd Qu.:-0.01362
Max. : 0.19259
graphics.off()
par("mar")
[1] 5.1 4.1 2.1 2.1
par(mar=c(1,1,1,1))
plot(s)
c1x <- setx(z4hw.students, sex = "M",ext_act )
c1x1 <- setx(z4hw.students, sex = "F", ext_act)
c1s <- sim(z4hw.students, x = c1x, x1 = c1x1)
graphics.off()
par("mar")
[1] 5.1 4.1 2.1 2.1
par(mar=c(1,1,1,1))
plot(c1s)
c2x <- setx(z4hw.students, age, sex*reason)
c2s <- sim(z4hw.students, x = c2x)
graphics.off()
par("mar")
[1] 5.1 4.1 2.1 2.1
par(mar=c(1,1,1,1))
plot(c2s)
d1 <- c1s$get_qi(xvalue="x1", qi="fd")
dfd <- as.data.frame(cbind(d1))
head(dfd)
library(tidyr)
tidd <- dfd %>%
gather(sex, simv, 1:1)
head(tidd)
library(dplyr)
tidd %>%
group_by(sex) %>%
summarise(mode = mode(simv), sd = sd(simv))
This histogram below is the distribution of the FDs between males & females which ranges from (-0.27)-(0.18), with a mean of approximately (-0.1).
library(ggplot2)
ggplot(tidd, aes(simv)) + geom_histogram() + facet_grid(~sex)