All the variables in this dataset were compiled from a sample of the National Survey of College Graduates through the IPUMS-HigherEd site.
For this assignment, I focus on gender as my binary variable. I examine the relationship between gender, level of education (bachelor’s degree and higher), and the number of children living in the household.
I predict that a higher level of education will be associated with fewer children, particularly for women.
The data used for this analysis consists of a sample from the National Survey of College Graduates gathered through IPUMS-Higher Ed.
library(haven)
library(car)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
# Read the Stata extract into a data frame.
cpsipums2 <- read_dta(file = "highered_00005.dta")
# List the variables available in the extract.
print(names(cpsipums2))
## [1] "personid" "year" "weight" "sample" "surid" "gender"
## [7] "raceth" "chtot" "dgrdg" "salary"
In this sample, there are 38,626 females and 45,830 males.
# gender
# Build 0/1 indicator columns from the numeric codes (1 = female, 2 = male)
# before `gender` itself is overwritten with labels below. A bare number is
# not a valid recode specification, so the mapping is spelled out in full.
# `car::` is explicit because dplyr (attached later in this script) also
# exports a `recode()`.
cpsipums2$female <- car::recode(cpsipums2$gender, recodes = "1=1; 2=0")
cpsipums2$male <- car::recode(cpsipums2$gender, recodes = "2=1; 1=0")
# Replace the numeric code with a labelled factor. `as.factor` is the argument
# name in current car releases (older releases called it `as.factor.result`,
# which `as.factor` still partially matches).
cpsipums2$gender <- car::recode(
  cpsipums2$gender,
  recodes = "1='female'; 2='male'",
  as.factor = TRUE
)
# Unweighted counts per gender.
table(cpsipums2$gender)
##
## female male
## 38626 45830
# race/ethnicity
# There are no entries in this data set under "other"
# 0/1 indicator columns built from the numeric codes before `raceth` is
# overwritten with labels. A bare number is not a valid recode specification,
# so each mapping is written out; missing values stay missing.
cpsipums2$asian <- car::recode(cpsipums2$raceth, recodes = "1=1; c(2,3,4)=0")
cpsipums2$white <- car::recode(cpsipums2$raceth, recodes = "2=1; c(1,3,4)=0")
cpsipums2$minorities <- car::recode(
  cpsipums2$raceth,
  recodes = "3=1; c(1,2,4)=0"
)
cpsipums2$other <- car::recode(cpsipums2$raceth, recodes = "4=1; c(1,2,3)=0")
# Replace the numeric code with a labelled factor. `as.factor` is the argument
# name in current car releases (older releases called it `as.factor.result`).
cpsipums2$raceth <- car::recode(
  cpsipums2$raceth,
  recodes = "1='asian'; 2='white'; 3='minorities'",
  as.factor = TRUE
)
# Unweighted counts per group.
table(cpsipums2$raceth)
##
## asian minorities white
## 13868 18742 51846
# education level
# The numeric prefix in each label keeps the factor levels in ascending code
# order when they are sorted alphabetically. `as.factor` is the argument name
# in current car releases (older releases called it `as.factor.result`).
cpsipums2$dgrdg <- car::recode(
  cpsipums2$dgrdg,
  recodes = "1='0bachelors'; 2='1masters'; 3='2doctorate'; 4='3professional'",
  as.factor = TRUE
)
# Unweighted cross-tabulation of degree by gender.
table(cpsipums2$dgrdg, cpsipums2$gender)
##
## female male
## 0bachelors 18930 25269
## 1masters 16756 16585
## 2doctorate 1103 1611
## 3professional 1837 2365
# income grouping
# Set the sentinel values 9999998 and 9999999 to NA (presumably the extract's
# missing / not-applicable codes -- confirm against the codebook).
# `%in%` tests every row against both values; `== 9999998:9999999` would
# recycle the two values down the column and compare each row with only one
# of them, leaving roughly half of the sentinels in the data.
cpsipums2$salary <- ifelse(
  cpsipums2$salary %in% c(9999998, 9999999),
  NA,
  cpsipums2$salary
)
summary(cpsipums2$salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 44000 71000 906935 108000 9999998 6519
# number of children living in the household
# Code 98 is set to NA; the remaining codes become labelled factor levels.
# NOTE(review): the labels given to codes 02 and 03 overlap ("one to three"
# vs "two or more") -- confirm them against the codebook.
# `as.factor` is the argument name in current car releases (older releases
# called it `as.factor.result`).
cpsipums2$chtot <- car::recode(
  cpsipums2$chtot,
  recodes = "00='no children'; 01='one child'; 02='one to three children'; 03='two or more children'; 04='more than 3 children'; 98=NA",
  as.factor = TRUE
)
# Unweighted counts per category.
table(cpsipums2$chtot)
##
## one child two or more children
## 14472 20517
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Complete-case subset restricted to the analysis variables.
analysis1 <- cpsipums2 %>%
  select(chtot, gender, dgrdg, raceth, weight, sample) %>%
  filter(complete.cases(.))

# DESIGN
# Strata that contain a single sampling unit use the "adjust" option.
options(survey.lonely.psu = "adjust")
# No clustering (ids = ~1), stratified by `sample`, weighted by `weight`.
# The design is built on the full `cpsipums2` data, not on `analysis1`.
des <- svydesign(
  ids = ~1,
  strata = ~sample,
  weights = ~weight,
  data = cpsipums2
)
The percentage of males with two or more children is higher than the percentage observed for females. The opposite pattern holds for one child: the percentage of males with one child is lower than the percentage of females with one child.
library(ggplot2)
# Survey-weighted share of each gender within every category of `chtot`.
chi2 <- svyby(
  formula = ~gender,
  by = ~chtot,
  design = des,
  FUN = svymean,
  na.rm = TRUE
)
# Design-based test of association between gender and number of children.
svychisq(~gender+chtot, design = des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~gender + chtot, design = des)
## F = 6.8878, ndf = 1, ddf = 84455, p-value = 0.00868
# Share of males within each category of `chtot`, with approximate 95%
# intervals (estimate +/- 1.96 * standard error). The error bars need the
# `se.*` columns that svyby() returns; using the estimate itself for both
# ymin and ymax draws bars of zero height.
ggplot(chi2, aes(x = chtot, y = gendermale)) +
  geom_point() +
  geom_errorbar(
    aes(
      ymin = gendermale - 1.96 * se.gendermale,
      ymax = gendermale + 1.96 * se.gendermale
    ),
    width = .25
  ) +
  xlab("children") +
  ylab("males") +
  ggtitle(label = "% of Children for Males")

# Same plot for the share of females.
ggplot(chi2, aes(x = chtot, y = genderfemale)) +
  geom_point() +
  geom_errorbar(
    aes(
      ymin = genderfemale - 1.96 * se.genderfemale,
      ymax = genderfemale + 1.96 * se.genderfemale
    ),
    width = .25
  ) +
  xlab("children") +
  ylab("females") +
  ggtitle(label = "% of Children for Females")
A further comparison by gender confirms the observed pattern, particularly between men and women with a doctorate or a master's degree. The gap between males and females with a bachelor's or professional degree is much smaller.
# Survey-weighted share of each gender within every combination of number of
# children and degree level.
dash2 <- svyby(
  formula = ~gender,
  by = ~chtot+dgrdg,
  design = des,
  FUN = svymean,
  na.rm = TRUE
)
print(dash2)
## chtot dgrdg
## one child.0bachelors one child 0bachelors
## two or more children.0bachelors two or more children 0bachelors
## one child.1masters one child 1masters
## two or more children.1masters two or more children 1masters
## one child.2doctorate one child 2doctorate
## two or more children.2doctorate two or more children 2doctorate
## one child.3professional one child 3professional
## two or more children.3professional two or more children 3professional
## genderfemale gendermale se.genderfemale
## one child.0bachelors 0.4987430 0.5012570 0.011565550
## two or more children.0bachelors 0.4787425 0.5212575 0.009393604
## one child.1masters 0.5516248 0.4483752 0.012870997
## two or more children.1masters 0.4960710 0.5039290 0.011039022
## one child.2doctorate 0.3720197 0.6279803 0.035006009
## two or more children.2doctorate 0.3181701 0.6818299 0.034155108
## one child.3professional 0.3979919 0.6020081 0.029284331
## two or more children.3professional 0.4263822 0.5736178 0.022386803
## se.gendermale
## one child.0bachelors 0.011565550
## two or more children.0bachelors 0.009393604
## one child.1masters 0.012870997
## two or more children.1masters 0.011039022
## one child.2doctorate 0.035006009
## two or more children.2doctorate 0.034155108
## one child.3professional 0.029284331
## two or more children.3professional 0.022386803
# Plot-friendly copies of the two grouping columns. They are taken from the
# table itself so they always line up with its rows; a hand-typed vector
# shorter than the table would only fit through silent recycling.
dash2$chtot_rec <- as.character(dash2$chtot)
edu_labels <- as.character(dash2$dgrdg)
# Ordered factor with the education levels in reverse alphabetical order,
# which puts "3professional" first and "0bachelors" last on the x axis.
dash2$dgrdg_rec <- factor(
  edu_labels,
  levels = rev(sort(unique(edu_labels))),
  ordered = TRUE
)

# FEMALES
# Columns are referenced by bare name inside aes(); axis labels are set with
# xlab()/ylab() because ggplot() itself ignores `xlab`/`ylab` arguments.
A <- ggplot(dash2, aes(x = dgrdg_rec, y = genderfemale)) +
  geom_point(aes(colour = chtot_rec)) +
  geom_line(aes(colour = chtot_rec, group = chtot_rec)) +
  ylab("female") +
  xlab("Education Level")
A + ggtitle("Percentage of Children in the Household for Females by Level of Education")

# MALES
B <- ggplot(dash2, aes(x = dgrdg_rec, y = gendermale)) +
  geom_point(aes(colour = chtot_rec)) +
  geom_line(aes(colour = chtot_rec, group = chtot_rec)) +
  ylab("male") +
  xlab("Education Level")
B + ggtitle("Percentage of Children in the Household for Males by Level of Education")
##LOGIT/PROBIT ANALYSIS A logit/probit analysis confirms the observed patterns, with males being more likely to report having two or more children. For the master's degree category, however, the pattern changes: females are slightly more likely than males to have two or more children.
# Logit model
# quasibinomial gives the same point estimates and standard errors as binomial
# for a survey-weighted fit, without the "non-integer #successes" warning that
# the weights otherwise trigger.
# NOTE(review): `gender` is the response here, so the coefficients describe
# how the predictors relate to gender. If the question is how gender and
# education relate to the number of children, `chtot` should probably be the
# response with `gender` as a predictor -- confirm the intended specification.
fit.logit <- svyglm(
  gender ~ chtot + dgrdg,
  design = des,
  family = quasibinomial(link = "logit")
)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
# Probit model
# Same specification as the logit fit, with a probit link. quasibinomial gives
# the same point estimates and standard errors as binomial for a
# survey-weighted fit, without the "non-integer #successes" warning.
fit.probit <- svyglm(
  gender ~ chtot + dgrdg,
  design = des,
  family = quasibinomial(link = "probit")
)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
# Side-by-side HTML table of the two fitted models; the labels replace the raw
# coefficient names in the order the coefficients appear.
stargazer(
  fit.logit,
  fit.probit,
  type = "html",
  style = "demography",
  covariate.labels = c(
    "two or more children",
    "1masters",
    "2doctorate",
    "3professional"
  )
)
| gender | ||
| survey-weighted | survey-weighted | |
| logistic | probit | |
| Model 1 | Model 2 | |
| two or more children | 0.110* | 0.069* |
| (0.043) | (0.027) | |
| 1masters | -0.124** | -0.078** |
| (0.045) | (0.028) | |
| 2doctorate | 0.612*** | 0.380*** |
| (0.114) | (0.070) | |
| 3professional | 0.280*** | 0.175*** |
| (0.079) | (0.049) | |
| Constant | -0.014 | -0.009 |
| (0.039) | (0.024) | |
| N | 34,989 | 34,989 |
| Log Likelihood | -24,740.590 | -24,740.600 |
| AIC | 49,491.170 | 49,491.210 |
| p < .05; p < .01; p < .001 | ||
The marginal effects from the logit and probit models are very similar. We see that men are 15 percentage points more likely than women to report having two or more children at the doctorate level, and 7 percentage points more likely if they have a professional degree. At the master's level, men are 3 percentage points less likely to report having two or more children.
# Logit marginal effects: each coefficient scaled by the mean logistic density
# evaluated at the fitted linear predictor.
log.marg <- coef(fit.logit) * mean(dlogis(predict(fit.logit)), na.rm = TRUE)
# Probit marginal effects: same idea with the normal density.
prob.marg <- coef(fit.probit) * mean(dnorm(predict(fit.probit)), na.rm = TRUE)

# One tick and one label per plotted effect (the intercept is dropped).
# Positions are derived from the labels so the two cannot drift apart.
effect_labels <- c(
  "two or more children", "1masters", "2doctorate", "3professional"
)
effect_pos <- seq_along(effect_labels)

plot(
  log.marg[-1],
  ylab = "Marginal Effects",
  axes = TRUE,
  xaxt = "n",
  main = "Marginal Effects from Logit and Probit models",
  ylim = c(-.25, .2)
)
axis(side = 1, at = effect_pos, labels = FALSE)
# Rotated labels are drawn below the axis; xpd = TRUE lets them extend into
# the margin.
text(
  x = effect_pos, y = -.3, srt = 45, pos = 1, xpd = TRUE,
  labels = effect_labels
)
points(prob.marg[-1], col = 2)
abline(h = 0, col = 2)
legend(
  "bottomright",
  legend = c("Logit Model", "Probit Model"),
  col = c("black", "red"),
  pch = 1
)
# Both sets of marginal effects side by side.
data.frame(m.logit = log.marg, m.probit = prob.marg)
## m.logit m.probit
## (Intercept) -0.003436141 -0.003419522
## chtottwo or more children 0.027326658 0.027323915
## dgrdg1masters -0.030840231 -0.030880300
## dgrdg2doctorate 0.151966554 0.150939085
## dgrdg3professional 0.069559658 0.069426084
In calculating predicted probabilities, we find no difference between men and women, which could be due to the small differences in the percentages and calculations that the previous analysis demonstrated.
# Build one row for every combination of gender, degree and number of
# children, to be used as "types" of people for predicted probabilities.
# Note: `gender` is the response in fit.logit / fit.probit, not a predictor,
# so it cannot change the predictions made from this grid.
dat <- expand.grid(
  gender = levels(factor(cpsipums2$gender)),
  dgrdg = levels(factor(cpsipums2$dgrdg)),
  chtot = levels(factor(cpsipums2$chtot))
)
summary(dat)
## gender dgrdg chtot
## female:8 0bachelors :4 one child :8
## male :8 1masters :4 two or more children:8
## 2doctorate :4
## 3professional:4
# Predicted probabilities on the response scale for every row of the grid,
# from the logit and the probit model respectively.
fit <- predict(fit.logit, newdata = dat, type = "response")
fit2 <- predict(fit.probit, newdata = dat, type = "response")
# Store both sets, rounded to three decimals, next to the grid columns.
dat[["fitted.prob.lrm"]] <- round(fit, 3)
dat[["fitted.prob.pro"]] <- round(fit2, 3)
# Show the first 20 rows.
head(dat, n = 20)
## gender dgrdg chtot fitted.prob.lrm
## 1 female 0bachelors one child 0.497
## 2 male 0bachelors one child 0.497
## 3 female 1masters one child 0.466
## 4 male 1masters one child 0.466
## 5 female 2doctorate one child 0.645
## 6 male 2doctorate one child 0.645
## 7 female 3professional one child 0.566
## 8 male 3professional one child 0.566
## 9 female 0bachelors two or more children 0.524
## 10 male 0bachelors two or more children 0.524
## 11 female 1masters two or more children 0.493
## 12 male 1masters two or more children 0.493
## 13 female 2doctorate two or more children 0.670
## 14 male 2doctorate two or more children 0.670
## 15 female 3professional two or more children 0.593
## 16 male 3professional two or more children 0.593
## fitted.prob.pro
## 1 0.497
## 2 0.497
## 3 0.466
## 4 0.466
## 5 0.645
## 6 0.645
## 7 0.566
## 8 0.566
## 9 0.524
## 10 0.524
## 11 0.493
## 12 0.493
## 13 0.670
## 14 0.670
## 15 0.593
## 16 0.593
When comparing the predicted probability of a female having two or more children with that of a male, we find that there is no difference.
# Grid row for a female doctorate holder with two or more children.
subset(
  dat,
  gender == "female" & chtot == "two or more children" & dgrdg == "2doctorate"
)
## gender dgrdg chtot fitted.prob.lrm fitted.prob.pro
## 13 female 2doctorate two or more children 0.67 0.67
# Grid row for a male doctorate holder with two or more children.
subset(
  dat,
  gender == "male" & chtot == "two or more children" & dgrdg == "2doctorate"
)
## gender dgrdg chtot fitted.prob.lrm fitted.prob.pro
## 14 male 2doctorate two or more children 0.67 0.67