All the variables in this dataset were compiled from a sample of the National Survey of College Graduates through the IPUMS-HigherEd site.

For this assignment, I will focus on gender as my binary variable. I examine the relationship between gender, level of education at the bachelor’s level and higher and the number of children living in the household.

I predict that a higher level of education will be associated with fewer children, particularly for women.

The data used for this analysis consists of a sample from the National Survey of College Graduates gathered through IPUMS-Higher Ed.

library(haven)
library(car)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
# Read the IPUMS-HigherEd extract (Stata .dta) and list the columns it holds.
cpsipums2 <- haven::read_dta(file = "highered_00005.dta")
names(cpsipums2)
##  [1] "personid" "year"     "weight"   "sample"   "surid"    "gender"  
##  [7] "raceth"   "chtot"    "dgrdg"    "salary"

In this sample, there are 38,626 females and 45,830 males.

Recoding of Variables

#gender
# Build 0/1 indicator columns from the numeric gender code (1 = female,
# 2 = male, matching the labels assigned below). These must be computed
# before `gender` is overwritten with a factor. A bare number is not a valid
# `recodes` specification for car::recode(), so the indicators are derived
# with a direct comparison instead; missing codes stay NA.
cpsipums2$female <- as.integer(as.numeric(cpsipums2$gender) == 1)
cpsipums2$male <- as.integer(as.numeric(cpsipums2$gender) == 2)
# Replace the numeric code with a labelled factor. car:: is spelled out
# because dplyr, attached later in this file, masks recode().
# NOTE(review): newer car releases appear to name this argument `as.factor`
# rather than `as.factor.result` -- confirm against the installed version.
cpsipums2$gender <- car::recode(
  cpsipums2$gender,
  recodes = "1='female'; 2='male'",
  as.factor.result = TRUE
)
table(cpsipums2$gender)
## 
## female   male 
##  38626  45830
#race/ethnicity 
#There are no entries in this data set under "other"
# 0/1 indicator columns for each race/ethnicity code (1 = asian, 2 = white,
# 3 = minorities, 4 = other), computed from the numeric code before `raceth`
# is replaced by a factor. A bare number is not a valid `recodes`
# specification for car::recode(), so a direct comparison is used; missing
# codes stay NA.
cpsipums2$asian <- as.integer(as.numeric(cpsipums2$raceth) == 1)
cpsipums2$white <- as.integer(as.numeric(cpsipums2$raceth) == 2)
cpsipums2$minorities <- as.integer(as.numeric(cpsipums2$raceth) == 3)
cpsipums2$other <- as.integer(as.numeric(cpsipums2$raceth) == 4)

# Replace the numeric code with a labelled factor. car:: is spelled out
# because dplyr, attached later in this file, masks recode().
cpsipums2$raceth <- car::recode(
  cpsipums2$raceth,
  recodes = "1='asian'; 2='white'; 3='minorities'",
  as.factor.result = TRUE
)

table(cpsipums2$raceth)
## 
##      asian minorities      white 
##      13868      18742      51846
#education level
# Turn the degree code into a labelled factor; the digit prefix on each label
# keeps the levels in ascending degree order when they are sorted.
cpsipums2$dgrdg <- car::recode(
  cpsipums2$dgrdg,
  recodes = "1='0bachelors'; 2='1masters'; 3='2doctorate'; 4='3professional'",
  as.factor.result = TRUE
)
# Cross-tabulate degree (rows) by gender (columns).
table(cpsipums2$dgrdg, cpsipums2$gender)
##                
##                 female  male
##   0bachelors     18930 25269
##   1masters       16756 16585
##   2doctorate      1103  1611
##   3professional   1837  2365
#income grouping
# Set the two sentinel codes 9999998 and 9999999 to NA. `%in%` tests every
# row against both codes; comparing with `== 9999998:9999999` recycles the
# two values down the column and only catches a code when it happens to line
# up with the matching position, leaving sentinels in the data.
# NOTE(review): the summary echoed below this chunk (Max 9999998, inflated
# mean) predates this fix; re-knit to refresh it.
cpsipums2$salary <- ifelse(
  as.numeric(cpsipums2$salary) %in% c(9999998, 9999999),
  NA,
  as.numeric(cpsipums2$salary)
)

summary(cpsipums2$salary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   44000   71000  906935  108000 9999998    6519
#number of children living in the household
# Convert the children-in-household code to a labelled factor; code 98 is
# mapped to NA.
# NOTE(review): the label for code 02 ('one to three children') overlaps with
# code 01 ('one child') -- confirm the wording against the IPUMS codebook.
cpsipums2$chtot <- car::recode(
  cpsipums2$chtot,
  recodes = "00='no children'; 01='one child'; 02='one to three children'; 03='two or more children'; 04='more than 3 children'; 98=NA",
  as.factor.result = TRUE
)

table(cpsipums2$chtot)
## 
##            one child two or more children 
##                14472                20517

Descriptive Analysis

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the analysis columns, then drop rows with any missing value.
# NOTE(review): `analysis1` is not referenced by the design below, which is
# built on the full `cpsipums2` data.
analysis_cols <- select(cpsipums2, chtot, gender, dgrdg, raceth, weight, sample)
analysis1 <- analysis_cols %>%
  filter(complete.cases(.))
#DESIGN
# Strata with a single PSU use the "adjust" variance option.
options(survey.lonely.psu = "adjust")
# No cluster ids (ids = ~1); stratified by `sample`, weighted by `weight`.
des <- svydesign(
  ids = ~1,
  strata = ~sample,
  weights = ~weight,
  data = cpsipums2
)

Calculation of Gender and Percentage of Children Living in the Household

Among respondents with two or more children, the percentage of males is higher than the percentage of females. The opposite pattern is observed among respondents with one child, where the percentage of males is lower than the percentage of females.

library(ggplot2)
# Weighted gender shares (with standard errors) within each children group.
chi2 <- svyby(~gender, ~chtot, des, svymean, na.rm = TRUE)
# Design-based test of association between gender and number of children.
svychisq(~gender + chtot, design = des)
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~gender + chtot, design = des)
## F = 6.8878, ndf = 1, ddf = 84455, p-value = 0.00868
# Share of males within each children group, with approximate 95% intervals
# (estimate +/- 1.96 * standard error from svyby). Setting ymin and ymax to
# the estimate itself draws zero-length bars, so the se.* columns are used.
# ggplot() replaces the deprecated qplot().
ggplot(chi2, aes(x = chtot, y = gendermale)) +
  geom_point() +
  geom_errorbar(
    aes(
      ymin = gendermale - 1.96 * se.gendermale,
      ymax = gendermale + 1.96 * se.gendermale
    ),
    width = .25
  ) +
  labs(x = "children", y = "males") +
  ggtitle(label = "% of Children for Males")

# Same plot for the share of females.
ggplot(chi2, aes(x = chtot, y = genderfemale)) +
  geom_point() +
  geom_errorbar(
    aes(
      ymin = genderfemale - 1.96 * se.genderfemale,
      ymax = genderfemale + 1.96 * se.genderfemale
    ),
    width = .25
  ) +
  labs(x = "children", y = "females") +
  ggtitle(label = "% of Children for Females")

A further comparison of genders confirms the observed pattern, particularly between men and women with a doctorate or a master's degree. The gap between males and females with a bachelor's or professional degree is much smaller.

# Weighted gender shares within every children-by-degree cell.
dash2 <- svyby(~gender, ~chtot + dgrdg, des, svymean, na.rm = TRUE)
print(dash2)
##                                                   chtot         dgrdg
## one child.0bachelors                          one child    0bachelors
## two or more children.0bachelors    two or more children    0bachelors
## one child.1masters                            one child      1masters
## two or more children.1masters      two or more children      1masters
## one child.2doctorate                          one child    2doctorate
## two or more children.2doctorate    two or more children    2doctorate
## one child.3professional                       one child 3professional
## two or more children.3professional two or more children 3professional
##                                    genderfemale gendermale se.genderfemale
## one child.0bachelors                  0.4987430  0.5012570     0.011565550
## two or more children.0bachelors       0.4787425  0.5212575     0.009393604
## one child.1masters                    0.5516248  0.4483752     0.012870997
## two or more children.1masters         0.4960710  0.5039290     0.011039022
## one child.2doctorate                  0.3720197  0.6279803     0.035006009
## two or more children.2doctorate       0.3181701  0.6818299     0.034155108
## one child.3professional               0.3979919  0.6020081     0.029284331
## two or more children.3professional    0.4263822  0.5736178     0.022386803
##                                    se.gendermale
## one child.0bachelors                 0.011565550
## two or more children.0bachelors      0.009393604
## one child.1masters                   0.012870997
## two or more children.1masters        0.011039022
## one child.2doctorate                 0.035006009
## two or more children.2doctorate      0.034155108
## one child.3professional              0.029284331
## two or more children.3professional   0.022386803
# Label each row from its own grouping columns. A hard-coded label vector
# shorter than the table is silently recycled and can drift out of step with
# the rows if the set of groups changes.
dash2$chtot_rec <- as.character(dash2$chtot)
dash2$dgrdg_rec <- factor(as.character(dash2$dgrdg), ordered = TRUE)

#fix the order of the education factor levels
# Levels are reversed, so the x axis runs from professional down to bachelors.
dash2$dgrdg_rec <- factor(dash2$dgrdg_rec, levels = rev(levels(dash2$dgrdg_rec)))
#FEMALES
# Columns are referenced by bare name inside aes(); axis labels are set with
# xlab()/ylab() because ggplot() itself ignores xlab=/ylab= arguments.
A <- ggplot(dash2, aes(x = dgrdg_rec, y = genderfemale)) +
  geom_point(aes(colour = chtot_rec)) +
  geom_line(aes(colour = chtot_rec, group = chtot_rec)) +
  ylab("female") +
  xlab("Education Level")
A + ggtitle("Percentage of Children in the Household for Females by Level of Education")

#MALES
B <- ggplot(dash2, aes(x = dgrdg_rec, y = gendermale)) +
  geom_point(aes(colour = chtot_rec)) +
  geom_line(aes(colour = chtot_rec, group = chtot_rec)) +
  ylab("male") +
  xlab("Education Level")
B + ggtitle("Percentage of Children in the Household for Males by Level of Education")

LOGIT/PROBIT ANALYSIS

A logit/probit analysis confirms the observed patterns, with males being more likely to report having two or more children. For the master's degree variable, however, the pattern changes: females are slightly more likely than males to have two or more children.

#Logit model
# The response is the gender factor (levels female, male); with a factor
# response the first level is treated as failure, so coefficients describe
# the chance of a respondent being male given children group and degree.
# quasibinomial() is the family the survey package recommends for weighted
# binary outcomes: it gives the same point estimates and standard errors as
# binomial() without the "non-integer #successes" warning.
fit.logit <- svyglm(
  gender ~ chtot + dgrdg,
  design = des,
  family = quasibinomial(link = "logit")
)
#probit model
fit.probit <- svyglm(
  gender ~ chtot + dgrdg,
  design = des,
  family = quasibinomial(link = "probit")
)
# NOTE(review): the table pasted below was produced with family = binomial;
# likelihood-based rows (Log Likelihood, AIC) may not be reported for quasi
# families -- re-knit to refresh it.
stargazer(
  fit.logit, fit.probit,
  type = "html",
  style = "demography",
  covariate.labels = c("two or more children", "1masters", "2doctorate", "3professional")
)
gender
survey-weighted survey-weighted
logistic probit
Model 1 Model 2
two or more children 0.110* 0.069*
(0.043) (0.027)
1masters -0.124** -0.078**
(0.045) (0.028)
2doctorate 0.612*** 0.380***
(0.114) (0.070)
3professional 0.280*** 0.175***
(0.079) (0.049)
Constant -0.014 -0.009
(0.039) (0.024)
N 34,989 34,989
Log Likelihood -24,740.590 -24,740.600
AIC 49,491.170 49,491.210
*p < .05; **p < .01; ***p < .001

MARGINAL EFFECTS

The marginal effects from the logit and probit models are very similar. We see that men are 15% more likely than women to report having two or more children at the doctorate level, and 6% more likely if they have a professional degree. At the master's level, men are 3% less likely to report having two or more children.

#Logit marginal effects
# Average marginal effect approximation: each coefficient is scaled by the
# mean density of the link distribution at the fitted linear predictors.
log.marg <- coef(fit.logit) * mean(dlogis(predict(fit.logit)), na.rm = TRUE)

#Probit marginal effects
prob.marg <- coef(fit.probit) * mean(dnorm(predict(fit.probit)), na.rm = TRUE)

# Plot the slopes only (intercept dropped). Tick and label positions follow
# the number of plotted coefficients; a fixed 1:13 range would recycle the
# four labels onto positions that have no coefficient.
plot(
  log.marg[-1],
  ylab = "Marginal Effects",
  axes = TRUE,
  xaxt = "n",
  main = "Marginal Effects from Logit and Probit models",
  ylim = c(-.25, .2)
)
axis(side = 1, at = seq_along(log.marg[-1]), labels = FALSE)
text(
  x = seq_along(log.marg[-1]), y = -.3, srt = 45, pos = 1, xpd = TRUE,
  labels = c("two or more children", "1masters", "2doctorate", "3professional")
)
points(prob.marg[-1], col = 2)
abline(h = 0, col = 2)
legend(
  "bottomright",
  legend = c("Logit Model", "Probit Model"),
  col = c("black", "red"),
  pch = 1
)

# Side-by-side table of both sets of marginal effects.
data.frame(m.logit = log.marg, m.probit = prob.marg)
##                                m.logit     m.probit
## (Intercept)               -0.003436141 -0.003419522
## chtottwo or more children  0.027326658  0.027323915
## dgrdg1masters             -0.030840231 -0.030880300
## dgrdg2doctorate            0.151966554  0.150939085
## dgrdg3professional         0.069559658  0.069426084

Fitted Values

In calculating probabilities, we find no difference between men and women, which could be due to the small differences in the percentages and calculations that the previous analysis demonstrated.

#get a series of predicted probabilites for different "types" of people for each model
#expand.grid will generate all possible combinations of values you specify
# NOTE(review): gender is the response of fit.logit/fit.probit
# (gender ~ chtot + dgrdg), so it is not a predictor; rows of this grid that
# differ only in gender will receive identical predictions.
dat <- expand.grid(
  gender = levels(factor(cpsipums2$gender)),
  dgrdg = levels(factor(cpsipums2$dgrdg)),
  chtot = levels(factor(cpsipums2$chtot))
)
summary(dat)
##     gender            dgrdg                    chtot  
##  female:8   0bachelors   :4   one child           :8  
##  male  :8   1masters     :4   two or more children:8  
##             2doctorate   :4                           
##             3professional:4
# Predicted probabilities on the response scale for every grid row, one set
# per model, stored rounded to three decimals.
fit <- predict(fit.logit, newdata = dat, type = "response")
fit2 <- predict(fit.probit, newdata = dat, type = "response")
dat$fitted.prob.lrm <- round(fit, digits = 3)
dat$fitted.prob.pro <- round(fit2, digits = 3)

# Show the grid (it has fewer than 20 rows, so all of it is printed).
head(dat, n = 20)
##    gender         dgrdg                chtot fitted.prob.lrm
## 1  female    0bachelors            one child           0.497
## 2    male    0bachelors            one child           0.497
## 3  female      1masters            one child           0.466
## 4    male      1masters            one child           0.466
## 5  female    2doctorate            one child           0.645
## 6    male    2doctorate            one child           0.645
## 7  female 3professional            one child           0.566
## 8    male 3professional            one child           0.566
## 9  female    0bachelors two or more children           0.524
## 10   male    0bachelors two or more children           0.524
## 11 female      1masters two or more children           0.493
## 12   male      1masters two or more children           0.493
## 13 female    2doctorate two or more children           0.670
## 14   male    2doctorate two or more children           0.670
## 15 female 3professional two or more children           0.593
## 16   male 3professional two or more children           0.593
##    fitted.prob.pro
## 1            0.497
## 2            0.497
## 3            0.466
## 4            0.466
## 5            0.645
## 6            0.645
## 7            0.566
## 8            0.566
## 9            0.524
## 10           0.524
## 11           0.493
## 12           0.493
## 13           0.670
## 14           0.670
## 15           0.593
## 16           0.593

When comparing the predicted probability for females with two or more children to that for males, we find that there is no difference.

# Grid row for females with a doctorate and two or more children.
subset(dat, gender == "female" & chtot == "two or more children" & dgrdg == "2doctorate")
##    gender      dgrdg                chtot fitted.prob.lrm fitted.prob.pro
## 13 female 2doctorate two or more children            0.67            0.67
# Matching grid row for males.
subset(dat, gender == "male" & chtot == "two or more children" & dgrdg == "2doctorate")
##    gender      dgrdg                chtot fitted.prob.lrm fitted.prob.pro
## 14   male 2doctorate two or more children            0.67            0.67