#Recode vars
library(readr)
ds1 <- read_csv("C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/HW/HW5/cps_00010.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   WTFINL = col_double(),
##   AGE = col_double(),
##   SEX = col_double(),
##   RACE = col_double(),
##   MARST = col_double(),
##   POPSTAT = col_double(),
##   VETSTAT = col_double(),
##   CITIZEN = col_double(),
##   HISPAN = col_double(),
##   LABFORCE = col_double(),
##   IND = col_double(),
##   CLASSWKR = col_double(),
##   EDCYC = col_double(),
##   COVIDUNAW = col_double()
## )
ds1<-ds1[sample(nrow(ds1), 50000), ]

#The names in the data are very ugly, so I make them less ugly
nams<-names(ds1)
head(nams, n=10)
##  [1] "WTFINL"   "AGE"      "SEX"      "RACE"     "MARST"    "POPSTAT" 
##  [7] "VETSTAT"  "CITIZEN"  "HISPAN"   "LABFORCE"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(ds1)<-newnames
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
ds2<-ds1%>%
  mutate(
        yrsofcoll=Recode(edcyc,
                               recodes="1:2=3;3=2;4:5=1; else=NA", as.factor = T),
        yrsofcoll=relevel(yrsofcoll, ref = "1"),
        yocnum=car::Recode(edcyc,
                                recodes="1:2=3;3=2;4:5=1; else=NA", as.factor = F),
        sex = Recode(sex,
                       recodes="2 ='F'; 1 ='M'; else = NA",
                       as.factor=T),
         race=Recode(race, recodes="100='white'; 200='black'; 300='AmIn';650:652='AS_Pac'; 700:830='other_mix'; else=NA", as.factor = T),
         race=relevel(race, ref = "white"),
         evermarried = Recode(marst,
                       recodes="6 ='0'; 9 = NA; else ='1' " ,
                       as.factor=T),
         vetstat = Recode(vetstat,
                       recodes="2 ='1'; 1 ='0'; else = NA",
                       as.factor=T),
         popstat = as.factor(popstat),
         citizen = Recode(citizen,
                       recodes="1:4 ='1'; 5 ='0'; else = NA",
                       as.factor=T),
         hispan = Recode(hispan,
                       recodes="000='0';100:612='1';901:902 =NA",
                       as.factor=T),
         labforce = Recode(labforce,
                       recodes="2 ='1'; 1 ='0'; else = NA",
                       as.factor=T),
         covidunaw = Recode(covidunaw,
                       recodes="02 ='1'; 01 ='0'; else = NA",
                       as.factor=T)
         )%>%
  filter( complete.cases(.))


sub<-ds2%>%
  select(yrsofcoll,yocnum, covidunaw, sex,race,evermarried, popstat, vetstat,citizen,hispan,labforce, wtfinl) %>%
    filter( complete.cases(.))
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, weights=~wtfinl, data =sub )

Ordinal Regression example

To fit an ordinal logit to survey data in R, we use the svyolr function in the survey library.

1. Define an ordinal or multinomial outcome variable of your choosing and define how you will recode the original variable.

Ordinal categorical variable transformed from integer ‘years of college completed’

2. State a research question about what factors you believe will affect your outcome variable.

Which characteristics are useful in explaining the duration of college someone is likely to complete?

3. Fit both the ordinal or the multinomial logistic regression models to your outcome.

1. Are the assumptions of the proportional odds model met?

No, since some odds ratios for changes in y are not similar.

2. How did you evaluate the proportional odds assumption?

Comparing the odds ratios for each change in y by fitting a logit to each outcome.

3. Which model (Proportional odds or Multinomial) fits the data better? how did you decide upon this?

Multinomial. It has a lower AIC.

#Here I fit three nested models for the health outcome
fit.solr1<-svyolr(yrsofcoll~race+sex+evermarried+vetstat+hispan+labforce+citizen,des)
summary(fit.solr1)
## Call:
## svyolr(yrsofcoll ~ race + sex + evermarried + vetstat + hispan + 
##     labforce + citizen, des)
## 
## Coefficients:
##                     Value Std. Error     t value
## raceAmIn       0.01155076 0.17940658  0.06438317
## raceAS_Pac    -0.01483509 0.09720210 -0.15262105
## raceblack      0.10755898 0.05732981  1.87614405
## raceother_mix  0.08971748 0.13216890  0.67880930
## sexM           0.07334382 0.03969322  1.84776683
## evermarried1  -0.04411500 0.04173039 -1.05714346
## vetstat1       0.03270661 0.07721914  0.42355576
## hispan1        0.14466149 0.05666576  2.55289079
## labforce1     -0.12066370 0.04618654 -2.61252917
## citizen1       0.28962629 0.10369180  2.79314570
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -1.1637  0.1168    -9.9669
## 2|3  1.0311  0.1161     8.8835
#Calculate the AIC ourself
fit.solr1$deviance+2*length(fit.solr1$coefficients)
## [1] 27469.38
ex1<-svyglm(I(yocnum>1)~race+sex+evermarried+vetstat+hispan+labforce+citizen,des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
ex2<-svyglm(I(yocnum>2)~race+sex+evermarried+vetstat+hispan+labforce+citizen,des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(ex1)
## 
## Call:
## svyglm(formula = I(yocnum > 1) ~ race + sex + evermarried + vetstat + 
##     hispan + labforce + citizen, design = des, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~1, weights = ~wtfinl, data = sub)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.843173   0.145053   5.813 6.28e-09 ***
## raceAmIn      -0.067017   0.234068  -0.286  0.77464    
## raceAS_Pac    -0.106626   0.115542  -0.923  0.35611    
## raceblack      0.275115   0.084330   3.262  0.00111 ** 
## raceother_mix  0.089490   0.170000   0.526  0.59861    
## sexM           0.026639   0.052756   0.505  0.61361    
## evermarried1   0.338407   0.053055   6.378 1.85e-10 ***
## vetstat1       0.003781   0.109040   0.035  0.97234    
## hispan1        0.112888   0.074662   1.512  0.13056    
## labforce1      0.071925   0.057807   1.244  0.21344    
## citizen1       0.258169   0.131410   1.965  0.04948 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000857)
## 
## Number of Fisher Scoring iterations: 4
summary(ex2)
## 
## Call:
## svyglm(formula = I(yocnum > 2) ~ race + sex + evermarried + vetstat + 
##     hispan + labforce + citizen, design = des, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~1, weights = ~wtfinl, data = sub)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.84222    0.13619  -6.184 6.43e-10 ***
## raceAmIn       0.05451    0.19101   0.285  0.77534    
## raceAS_Pac     0.05341    0.10552   0.506  0.61274    
## raceblack      0.01099    0.06707   0.164  0.86991    
## raceother_mix  0.08678    0.14431   0.601  0.54763    
## sexM           0.10238    0.04520   2.265  0.02351 *  
## evermarried1  -0.28648    0.04553  -6.292 3.23e-10 ***
## vetstat1       0.05617    0.09193   0.611  0.54119    
## hispan1        0.16216    0.06223   2.606  0.00917 ** 
## labforce1     -0.23411    0.04938  -4.741 2.15e-06 ***
## citizen1       0.31235    0.12532   2.492  0.01270 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.999894)
## 
## Number of Fisher Scoring iterations: 4

This is NOT a statistical test, just a a rough guide. Here, I plot the coefficients of each model. If the proportional odds is correct, and the assumption is ok, then all these should be “approximately” the same values.

coefs<-data.frame(parms =c( coefficients(ex1)[-1],coefficients(ex2)[-1]),
                  beta = rep(names(coefficients(ex1))[-1], 2 ), 
                  mod = c(rep("I(yocnum>1)",10 ),
                          rep("I(yocnum>2)",10 )))

coefs%>%
  ggplot()+
  geom_point(aes(x=beta, y=parms, color=mod))+
  theme(axis.text.x = element_text(angle = 45))+
  labs(title = "Graphical Summary of Proportional Odds Assumption",
       x= "Beta",
       y= "Estimate")

Based on this, the effects appear to be pretty consistent across transitions.

#Just print the odds ratios, 
round(exp(rbind(coef(ex1)[-1],
                coef(ex2)[-1])),3)
##      raceAmIn raceAS_Pac raceblack raceother_mix  sexM evermarried1 vetstat1
## [1,]    0.935      0.899     1.317         1.094 1.027        1.403    1.004
## [2,]    1.056      1.055     1.011         1.091 1.108        0.751    1.058
##      hispan1 labforce1 citizen1
## [1,]   1.120     1.075    1.295
## [2,]   1.176     0.791    1.367

Which again shows us the effects are pretty consistent across transitions, just like the plot does. If anything the plot is easier to digest.

Non proportional assumptions

We can also fit a cumulative logit model where the coefficients are not assumed to be the same across transitions. This is an alternative model where we don’t assume proportionality of the odds of the various transitions. We can’t use survey design to do this however.

#Proportional odds
fit.vgam<-vglm(as.ordered(yrsofcoll)~race+sex+evermarried+vetstat+hispan+labforce+citizen,
               ds2, weights =wtfinl/mean(wtfinl, na.rm=T),
               family=cumulative(parallel = T, reverse = T))  #<-parallel = T == proportional odds
summary(fit.vgam)
## 
## Call:
## vglm(formula = as.ordered(yrsofcoll) ~ race + sex + evermarried + 
##     vetstat + hispan + labforce + citizen, family = cumulative(parallel = T, 
##     reverse = T), data = ds2, weights = wtfinl/mean(wtfinl, na.rm = T))
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1  1.16372    0.09449  12.316  < 2e-16 ***
## (Intercept):2 -1.03106    0.09434 -10.929  < 2e-16 ***
## raceAmIn       0.01155    0.13957   0.083 0.934043    
## raceAS_Pac    -0.01484    0.07780  -0.191 0.848766    
## raceblack      0.10756    0.04681   2.298 0.021564 *  
## raceother_mix  0.08971    0.09833   0.912 0.361556    
## sexM           0.07334    0.03373   2.175 0.029655 *  
## evermarried1  -0.04412    0.03423  -1.289 0.197528    
## vetstat1       0.03271    0.07198   0.454 0.649556    
## hispan1        0.14466    0.04446   3.254 0.001140 ** 
## labforce1     -0.12066    0.03780  -3.193 0.001410 ** 
## citizen1       0.28963    0.08389   3.452 0.000556 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
## 
## Residual deviance: 27449.38 on 26588 degrees of freedom
## 
## Log-likelihood: -13724.69 on 26588 degrees of freedom
## 
## Number of Fisher scoring iterations: 4 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##      raceAmIn    raceAS_Pac     raceblack raceother_mix          sexM 
##     1.0116179     0.9852739     1.1135564     1.0938605     1.0761001 
##  evermarried1      vetstat1       hispan1     labforce1      citizen1 
##     0.9568430     1.0332462     1.1556488     0.8863312     1.3359291
#Non-proportional odds
fit.vgam2<-vglm(as.ordered(yrsofcoll)~race+sex+evermarried+vetstat+hispan+labforce+citizen,ds2,
                weights =wtfinl/mean(wtfinl, na.rm=T),
                family=cumulative(parallel = F, reverse = T))  #<-parallel = F == Nonproportional odds
summary(fit.vgam2)
## 
## Call:
## vglm(formula = as.ordered(yrsofcoll) ~ race + sex + evermarried + 
##     vetstat + hispan + labforce + citizen, family = cumulative(parallel = F, 
##     reverse = T), data = ds2, weights = wtfinl/mean(wtfinl, na.rm = T))
## 
## Coefficients: 
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1    0.834627   0.118502   7.043 1.88e-12 ***
## (Intercept):2   -0.832900   0.110561  -7.533 4.94e-14 ***
## raceAmIn:1      -0.078101   0.181061  -0.431  0.66621    
## raceAmIn:2       0.077653   0.157447   0.493  0.62187    
## raceAS_Pac:1    -0.122780   0.097959  -1.253  0.21007    
## raceAS_Pac:2     0.065583   0.088694   0.739  0.45965    
## raceblack:1      0.268246   0.065002   4.127 3.68e-05 ***
## raceblack:2      0.006481   0.053562   0.121  0.90369    
## raceother_mix:1  0.091517   0.131099   0.698  0.48513    
## raceother_mix:2  0.090416   0.109989   0.822  0.41105    
## sexM:1           0.032151   0.044846   0.717  0.47343    
## sexM:2           0.100247   0.038632   2.595  0.00946 ** 
## evermarried1:1   0.337805   0.045202   7.473 7.83e-14 ***
## evermarried1:2  -0.287797   0.039020  -7.376 1.63e-13 ***
## vetstat1:1       0.010253   0.099236   0.103  0.91771    
## vetstat1:2       0.046153   0.082521   0.559  0.57596    
## hispan1:1        0.124183   0.059681   2.081  0.03745 *  
## hispan1:2        0.156433   0.050122   3.121  0.00180 ** 
## labforce1:1      0.075991   0.049510   1.535  0.12481    
## labforce1:2     -0.237154   0.042492  -5.581 2.39e-08 ***
## citizen1:1       0.261295   0.105685   2.472  0.01342 *  
## citizen1:2       0.308319   0.100285   3.074  0.00211 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
## 
## Residual deviance: 27213.44 on 26578 degrees of freedom
## 
## Log-likelihood: -13606.72 on 26578 degrees of freedom
## 
## Number of Fisher scoring iterations: 4 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##      raceAmIn:1      raceAmIn:2    raceAS_Pac:1    raceAS_Pac:2     raceblack:1 
##       0.9248708       1.0807481       0.8844580       1.0677810       1.3076686 
##     raceblack:2 raceother_mix:1 raceother_mix:2          sexM:1          sexM:2 
##       1.0065021       1.0958352       1.0946300       1.0326733       1.1054439 
##  evermarried1:1  evermarried1:2      vetstat1:1      vetstat1:2       hispan1:1 
##       1.4018675       0.7499141       1.0103059       1.0472350       1.1322233 
##       hispan1:2     labforce1:1     labforce1:2      citizen1:1      citizen1:2 
##       1.1693329       1.0789531       0.7888699       1.2986108       1.3611351
AIC(fit.vgam)
## [1] 27473.38
AIC(fit.vgam2)
## [1] 27257.44

Multinomial

Now I fit the model, There is no survey-corrected multinomial model in R, so we use vglm and use family = multinomial.

mfit<-vglm(yrsofcoll~race+sex+evermarried+vetstat+hispan+labforce+citizen,
           family=multinomial(refLevel = 1),
           data=ds2,
           weights=wtfinl/mean(wtfinl, na.rm=T))

summary(mfit)
## 
## Call:
## vglm(formula = yrsofcoll ~ race + sex + evermarried + vetstat + 
##     hispan + labforce + citizen, family = multinomial(refLevel = 1), 
##     data = ds2, weights = wtfinl/mean(wtfinl, na.rm = T))
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1    0.27677    0.12704   2.179 0.029366 *  
## (Intercept):2   -0.02780    0.14009  -0.198 0.842674    
## raceAmIn:1      -0.10251    0.19571  -0.524 0.600423    
## raceAmIn:2      -0.01592    0.20691  -0.077 0.938667    
## raceAS_Pac:1    -0.15091    0.10659  -1.416 0.156811    
## raceAS_Pac:2    -0.04562    0.11344  -0.402 0.687558    
## raceblack:1      0.30629    0.06895   4.442 8.91e-06 ***
## raceblack:2      0.22998    0.07382   3.115 0.001838 ** 
## raceother_mix:1  0.06066    0.14103   0.430 0.667092    
## raceother_mix:2  0.12930    0.14734   0.878 0.380185    
## sexM:1          -0.01848    0.04791  -0.386 0.699631    
## sexM:2           0.09040    0.05134   1.761 0.078290 .  
## evermarried1:1   0.51488    0.04839  10.641  < 2e-16 ***
## evermarried1:2   0.07472    0.05160   1.448 0.147634    
## vetstat1:1      -0.01751    0.10487  -0.167 0.867417    
## vetstat1:2       0.04359    0.11262   0.387 0.698723    
## hispan1:1        0.05164    0.06389   0.808 0.418947    
## hispan1:2        0.19855    0.06730   2.950 0.003175 ** 
## labforce1:1      0.19524    0.05349   3.650 0.000262 ***
## labforce1:2     -0.09938    0.05610  -1.771 0.076500 .  
## citizen1:1       0.15510    0.11290   1.374 0.169535    
## citizen1:2       0.41941    0.12630   3.321 0.000898 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: log(mu[,2]/mu[,1]), log(mu[,3]/mu[,1])
## 
## Residual deviance: 27214.74 on 26578 degrees of freedom
## 
## Log-likelihood: -13607.37 on 26578 degrees of freedom
## 
## Number of Fisher scoring iterations: 4 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Reference group is level  1  of the response

Calculate the odds ratios for each covariate’s effect on each model

round(exp(coef(mfit)), 3)
##   (Intercept):1   (Intercept):2      raceAmIn:1      raceAmIn:2    raceAS_Pac:1 
##           1.319           0.973           0.903           0.984           0.860 
##    raceAS_Pac:2     raceblack:1     raceblack:2 raceother_mix:1 raceother_mix:2 
##           0.955           1.358           1.259           1.063           1.138 
##          sexM:1          sexM:2  evermarried1:1  evermarried1:2      vetstat1:1 
##           0.982           1.095           1.673           1.078           0.983 
##      vetstat1:2       hispan1:1       hispan1:2     labforce1:1     labforce1:2 
##           1.045           1.053           1.220           1.216           0.905 
##      citizen1:1      citizen1:2 
##           1.168           1.521

We can also obtain confidence intervals for these odds ratios

round(exp(confint(mfit)), 3)
##                 2.5 % 97.5 %
## (Intercept):1   1.028  1.692
## (Intercept):2   0.739  1.280
## raceAmIn:1      0.615  1.325
## raceAmIn:2      0.656  1.476
## raceAS_Pac:1    0.698  1.060
## raceAS_Pac:2    0.765  1.193
## raceblack:1     1.187  1.555
## raceblack:2     1.089  1.455
## raceother_mix:1 0.806  1.401
## raceother_mix:2 0.853  1.519
## sexM:1          0.894  1.078
## sexM:2          0.990  1.211
## evermarried1:1  1.522  1.840
## evermarried1:2  0.974  1.192
## vetstat1:1      0.800  1.207
## vetstat1:2      0.838  1.303
## hispan1:1       0.929  1.193
## hispan1:2       1.069  1.392
## labforce1:1     1.095  1.350
## labforce1:2     0.811  1.011
## citizen1:1      0.936  1.457
## citizen1:2      1.188  1.948
fit.solr1$deviance+2*length(fit.solr1$coefficients)#proportional odds (survey)
## [1] 27469.38
AIC(fit.vgam) #proportional odds
## [1] 27473.38
AIC(fit.vgam2) #cumulative logit, non proportional
## [1] 27257.44
AIC(mfit) #multinomial
## [1] 27258.74

Looks like the multinomial is the best fitting model, but it is very close to the non proportional odds model using the cumulative logit.

4. For the best fitting model from part c,

1. Describe the results of your model, and

2. present output from the model in terms of odds ratios and confidence intervals for all model parameters in a table.

Native Americans are less likely to have more college years completed than whites. Blacks are more likely to have more college years completed than whites. People who have ever been married are more likely to have more college years completed than people who have never married. US citizens are more likely to have more college years completed than non-US citizens.