All the variables in this dataset were compiled from a sample of the National Survey of College Graduates through the IPUMS-HigherEd site.

For this assignment, I will focus on salary as my outcome variable. I examine the relationship between gender, level of education at the bachelor’s level and higher to determine the impact on salary. Other variables included are the number of children living in the household and race that are also included in the sample but excluded for this particular analysis and assignment.

I predict that salary will be higher for males than for females at all levels of education.

The data used for this analysis consists of a sample from the National Survey of College Graduates gathered through IPUMS-Higher Ed.

library(haven)
library(car)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
cpsipums<-read_dta("highered_00005.dta")
names(cpsipums) #print the column names
##  [1] "personid" "year"     "weight"   "sample"   "surid"    "gender"  
##  [7] "raceth"   "chtot"    "dgrdg"    "salary"

In this sample, there are 38,626 females and 45,830 males.

Recoding of Variables

#gender
cpsipums$female<-recode(cpsipums$gender, recodes=1)
cpsipums$male<-recode(cpsipums$gender, recodes=2)
cpsipums$gender<-recode(cpsipums$gender, recodes="1='female'; 2='male'", as.factor.result=T)
table(cpsipums$gender)
## 
## female   male 
##  38626  45830
#race/ethnicity 
#There are no entries in this data set under "other"
cpsipums$asian<-recode(cpsipums$raceth, recodes=1)
cpsipums$white<-recode(cpsipums$raceth, recodes=2)
cpsipums$minorities<-recode(cpsipums$raceth, recodes=3)
cpsipums$other<-recode(cpsipums$raceth, recodes=4)

cpsipums$raceth<-recode(cpsipums$raceth, recodes="1='asian'; 2='white'; 3='minorities'", as.factor.result=T)

table(cpsipums$raceth)
## 
##      asian minorities      white 
##      13868      18742      51846
#education level
cpsipums$dgrdg<-recode(cpsipums$dgrdg, recodes="1='0bachelors'; 2='1masters'; 3='2doctorate'; 4='3professional'", as.factor.result=T)
table(cpsipums$dgrdg, cpsipums$gender)
##                
##                 female  male
##   0bachelors     18930 25269
##   1masters       16756 16585
##   2doctorate      1103  1611
##   3professional   1837  2365
#income grouping
cpsipums$salary<-ifelse(cpsipums$salary==9999998:9999999, NA, cpsipums$salary)

summary (cpsipums$salary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   44000   71000  906935  108000 9999998    6519
#number of children living in the household
cpsipums$chtot<-recode(cpsipums$chtot, recodes="00='no children'; 01='one child'; 02='one to three children'; 03='two or more children'; 04='more than 3 children'; 98=NA", as.factor.result=T)

table (cpsipums$chtot)
## 
##            one child two or more children 
##                14472                20517

Descriptive Statistics

The percentage of the sample with a bachelors, doctorate and professional degrees is higher for males than it is for females. However, the percentage of females is higher as the masters level. The chi square test for independence is significant which tells us that these variables are independent of each other.

#column percentages of level of education by gender
prop.table(table(cpsipums$dgrdg, cpsipums$gender), margin=2)
##                
##                     female       male
##   0bachelors    0.49008440 0.55136374
##   1masters      0.43380107 0.36188086
##   2doctorate    0.02855589 0.03515165
##   3professional 0.04755864 0.05160375
#basic chi square test of independence of level of education and salary
chisq.test(table(cpsipums$dgrdg, cpsipums$gender))
## 
##  Pearson's Chi-squared test
## 
## data:  table(cpsipums$dgrdg, cpsipums$gender)
## X-squared = 460.3, df = 3, p-value < 2.2e-16

Survey Design & Comparison with the Original Data

options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~sample, weights=~weight, data = cpsipums[is.na(cpsipums$weight)==F,] )

Column percentages of level of education by gender adjusted for weight

A comparison of the percentages of males and females by level of education follows the same pattern when comparing with weights than without weights. However, it is important to notice that the difference between males and females is much smaller when considering survey design at 1% versus without the weights which showed a difference of 6%.

ex1<-wtd.table(cpsipums$dgrdg, cpsipums$gender, weights = cpsipums$weight)
prop.table(wtd.table(cpsipums$dgrdg, cpsipums$gender, weights = cpsipums$weight), margin=2)
##                   female       male
## 0bachelors    0.58641280 0.59767939
## 1masters      0.32837989 0.28164509
## 2doctorate    0.01536567 0.02152942
## 3professional 0.06984164 0.09914609

Standard Errors of Percentages of Gender and level of education

n<-table(is.na(cpsipums$salary)==F)
n
## 
## FALSE  TRUE 
##  6519 77937
p<-prop.table(wtd.table(cpsipums$dgrdg, cpsipums$gender, weights = cpsipums$weight), margin=2)
se<-sqrt((p*(1-p))/n[2])

data.frame(proportion=p, se=se)
##   proportion.Var1 proportion.Var2 proportion.Freq       se.Var1 se.Var2
## 1      0bachelors          female      0.58641280    0bachelors  female
## 2        1masters          female      0.32837989      1masters  female
## 3      2doctorate          female      0.01536567    2doctorate  female
## 4   3professional          female      0.06984164 3professional  female
## 5      0bachelors            male      0.59767939    0bachelors    male
## 6        1masters            male      0.28164509      1masters    male
## 7      2doctorate            male      0.02152942    2doctorate    male
## 8   3professional            male      0.09914609 3professional    male
##        se.Freq
## 1 0.0017640603
## 2 0.0016822025
## 3 0.0004405968
## 4 0.0009129854
## 5 0.0017565011
## 6 0.0016111975
## 7 0.0005198981
## 8 0.0010705160

Proper survey design analysis

When including appropriate considerations of weight and standard errors a few changes in the proportions of female vs. males are observed. The gap is slighty larger at 5% than when excluding standard errors. The gap is also larger at the doctorate and professional level however, very small changes are observed at the masters level where females exceed males.

ex2<-svytable(~dgrdg+gender, design = des)
prop.table(svytable(~dgrdg+gender, design = des), margin = 2)
##                gender
## dgrdg               female       male
##   0bachelors    0.58641280 0.59767939
##   1masters      0.32837989 0.28164509
##   2doctorate    0.01536567 0.02152942
##   3professional 0.06984164 0.09914609
sv.table<-svyby(formula = ~gender, by = ~dgrdg, design = des, FUN = svymean, na.rm=T)
sv.table
##                       dgrdg genderfemale gendermale se.genderfemale
## 0bachelors       0bachelors    0.4768768  0.5231232     0.004787068
## 1masters           1masters    0.5199882  0.4800118     0.005636264
## 2doctorate       2doctorate    0.3987172  0.6012828     0.017227002
## 3professional 3professional    0.3955857  0.6044143     0.012031105
##               se.gendermale
## 0bachelors      0.004787068
## 1masters        0.005636264
## 2doctorate      0.017227002
## 3professional   0.012031105

Regression Analysis

A regression analysis shows that there is a negative relationship betweeen males and females particularly at the doctoral level. However, the P values shows a stronger effect of the education on the salaries of males than for females as well as stronger effect for a masters degree.

After incorporating the case weights we lose the effect on the masters degree and a stronger effect on the professional degree emerges. The differens in gender continue to favor men.

Finally, the survey design is included showing similar patterns to the previous case of reg2.

reg1<-lm(salary~dgrdg+gender, data=cpsipums)
summary(reg1)
## 
## Call:
## lm(formula = salary ~ dgrdg + gender, data = cpsipums)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1166105  -911390  -821121  -608405  9394417 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1166105      17908  65.116  < 2e-16 ***
## dgrdg1masters       -210715      20836 -10.113  < 2e-16 ***
## dgrdg2doctorate     -278540      56095  -4.965 6.87e-07 ***
## dgrdg3professional  -225845      45911  -4.919 8.71e-07 ***
## gendermale          -281985      19873 -14.189  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2752000 on 77932 degrees of freedom
##   (6519 observations deleted due to missingness)
## Multiple R-squared:  0.003902,   Adjusted R-squared:  0.003851 
## F-statistic: 76.32 on 4 and 77932 DF,  p-value: < 2.2e-16

Regression Analysis With Case Weights

reg2<-lm(salary~dgrdg+gender, data=cpsipums, weights = cpsipums$weight)
summary(reg2)
## 
## Call:
## lm(formula = salary ~ dgrdg + gender, data = cpsipums, weights = cpsipums$weight)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -155445764  -17717071  -10225678   -6082536  768865324 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1280503      18186  70.412  < 2e-16 ***
## dgrdg1masters          9917      24167   0.410    0.682    
## dgrdg2doctorate      -41121      79872  -0.515    0.607    
## dgrdg3professional  -307549      39048  -7.876 3.42e-15 ***
## gendermale          -304932      21687 -14.060  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54510000 on 77932 degrees of freedom
##   (6519 observations deleted due to missingness)
## Multiple R-squared:  0.003558,   Adjusted R-squared:  0.003507 
## F-statistic: 69.57 on 4 and 77932 DF,  p-value: < 2.2e-16

Regression Analysis With Survey Design

reg3<-svyglm(salary~dgrdg+gender,des, family=gaussian)
summary(reg3)
## 
## Call:
## svyglm(formula = salary ~ dgrdg + gender, des, family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~sample, weights = ~weight, data = cpsipums[is.na(cpsipums$weight) == 
##     F, ])
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1280503      39215  32.653  < 2e-16 ***
## dgrdg1masters          9917      48986   0.202    0.840    
## dgrdg2doctorate      -41121     122430  -0.336    0.737    
## dgrdg3professional  -307549      73498  -4.184 2.86e-05 ***
## gendermale          -304932      44083  -6.917 4.64e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 8.926064e+12)
## 
## Number of Fisher Scoring iterations: 2

Conclusion

We can conclude after completing the analysis that the only statistical significance is found between gender and the professional level of education which often includes law, medicine, dentistry and other health professions degrees.