This example will cover the use of R functions for fitting Ordinal and Multinomial logit models to complex survey data.
For this example I am using 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link
#load brfss
library(car)
library(survey)
## Loading required package: grid
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(nnet)
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_11)
head(nams, n=10)
## [1] "_STATE" "precall" "fmonth" "intvid" "seqno" "_PSU"
## [7] "ctelenum" "cellfon" "pvtresid" "genhlth"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement = "",x = nams)
names(brfss_11)<-tolower(newnames)
#Our outcome this time is the full spectrum of reported health
brfss_11$health<-as.factor(ifelse(brfss_11$genhlth%in%c(7,9),NA, brfss_11$genhlth))
brfss_11$health<-relevel(brfss_11$health, ref="1")
brfss_11$health2<-ifelse(brfss_11$genhlth%in%c(7,9),NA, brfss_11$genhlth)
#race/ethnicity
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0")
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0")
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0")
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0")
brfss_11$race_group<-recode(brfss_11$racegr2, recodes="1='NH white'; 2='NH black'; 3:4='NH other';5='hispanic'; else=NA", as.factor.result = T)
brfss_11$race_group<-relevel(brfss_11$race_group, ref = 'NH white')
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#income grouping
brfss_11$inc<-ifelse(brfss_11$incomg==9, NA, brfss_11$incomg)
#education level
brfss_11$educ<-recode(brfss_11$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
#brfss_11$educ<-relevel(brfss_11$educ, ref='0Prim')
#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')
#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')
#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
First, we will subset our data to have complete cases for our variables in our model and make our survey design object
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
brfss_11<-brfss_11[complete.cases(brfss_11$health,brfss_11$black,brfss_11$educ,brfss_11$agec) ,]
des<-svydesign(ids=~psu, strata=~ststr, weights=~cntywt, data = brfss_11[is.na(brfss_11$cntywt)==F,], nest=T )
To fit an ordinal logit to survey data in R, we use the svyolr fucntion in the survey library.
#Here I fit three nested models for the health outcome
fit.solr1<-svyolr(health~black+hispanic+other,des)
fit.solr2<-svyolr(health~black+hispanic+other+educ,des)
fit.solr3<-svyolr(health~black+hispanic+other+educ+agec,des)
#Show summary of each model and calculate the AIC for each model, even though the AIC is probably not right
summary(fit.solr1);fit.solr1$deviance+2*length(fit.solr1$coefficients)
## Call:
## svyolr(health ~ black + hispanic + other, des)
##
## Coefficients:
## Value Std. Error t value
## black 0.5702469 0.02572036 22.17103
## hispanic 0.7846439 0.02690469 29.16384
## other 0.1228688 0.03274422 3.75238
##
## Intercepts:
## Value Std. Error t value
## 1|2 -1.1699 0.0106 -110.4291
## 2|3 0.3294 0.0091 36.1502
## 3|4 1.8947 0.0131 144.3386
## 4|5 3.4507 0.0234 147.1528
## [1] 679738
summary(fit.solr2);fit.solr2$deviance+2*length(fit.solr2$coefficients)
## Call:
## svyolr(health ~ black + hispanic + other + educ, des)
##
## Coefficients:
## Value Std. Error t value
## black 0.4010183 0.02643104 15.172247
## hispanic 0.2945284 0.02811464 10.475980
## other 0.1915769 0.03358538 5.704175
## educ1somehs -0.5697689 0.06236792 -9.135609
## educ2hsgrad -1.1224331 0.05581058 -20.111476
## educ3somecol -1.4822546 0.05559810 -26.660166
## educ4colgrad -2.0072167 0.05569360 -36.040348
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.7462 0.0560 -49.0412
## 2|3 -1.1811 0.0548 -21.5468
## 3|4 0.4766 0.0542 8.7979
## 4|5 2.0842 0.0560 37.2026
## [1] 662336.4
summary(fit.solr3);fit.solr3$deviance+2*length(fit.solr3$coefficients)
## Call:
## svyolr(health ~ black + hispanic + other + educ + agec, des)
##
## Coefficients:
## Value Std. Error t value
## black 0.4888241 0.02653683 18.420591
## hispanic 0.4946935 0.02886040 17.140914
## other 0.3158002 0.03337465 9.462275
## educ1somehs -0.3880685 0.06185141 -6.274207
## educ2hsgrad -0.9585043 0.05537993 -17.307793
## educ3somecol -1.2865151 0.05537298 -23.233625
## educ4colgrad -1.8636030 0.05552767 -33.561702
## agec(24,39] 0.2968264 0.03256324 9.115383
## agec(39,59] 0.6399445 0.03061534 20.902738
## agec(59,79] 1.0194199 0.03106605 32.814598
## agec(79,99] 1.2943649 0.04064550 31.845220
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.0040 0.0623 -32.1667
## 2|3 -0.4064 0.0615 -6.6080
## 3|4 1.2919 0.0609 21.2009
## 4|5 2.9265 0.0619 47.2479
## [1] 653886.4
#"Examine" proportional odds assumption by fitting logits for each change
ex1<-svyglm(I(health2>1)~black+hispanic+other+educ+agec,des, family="binomial")
## Warning: non-integer #successes in a binomial glm!
ex2<-svyglm(I(health2>2)~black+hispanic+other+educ+agec,des, family="binomial")
## Warning: non-integer #successes in a binomial glm!
ex3<-svyglm(I(health2>3)~black+hispanic+other+educ+agec,des, family="binomial")
## Warning: non-integer #successes in a binomial glm!
ex4<-svyglm(I(health2>4)~black+hispanic+other+educ+agec,des, family="binomial")
## Warning: non-integer #successes in a binomial glm!
#Just a a rough guide, I plot the coefficients of each model. If the proportional odds
#assumption is ok, they all should be "approximately" the same values, they are NOT!!
plot(coef(ex1)[-1], ylim=c(-2, 3), type="l", ylab="Beta", main=c("Comparison of betas for", " proportional odds assumption"))
lines(coef(ex2)[-1], col=2) #not bad
lines(coef(ex3)[-1], col=3) # again, not bad
lines(coef(ex4)[-1], col=4) #not as good, the effect of age are very different for the change between
#fair or better -> poor health
#Just print the odds ratios,
round(exp(round(rbind(coef(ex1)[-1], coef(ex2)[-1], coef(ex3)[-1], coef(ex4)[-1]),2)),3)
## black hispanic other educ1somehs educ2hsgrad educ3somecol
## [1,] 1.405 1.310 1.162 0.756 0.566 0.432
## [2,] 1.733 1.878 1.537 0.712 0.403 0.284
## [3,] 1.733 1.768 1.419 0.705 0.395 0.278
## [4,] 1.284 1.083 1.323 0.763 0.375 0.259
## educ4colgrad agec(24,39] agec(39,59] agec(59,79] agec(79,99]
## [1,] 0.262 1.271 1.632 2.340 3.456
## [2,] 0.154 1.336 1.804 2.612 3.525
## [3,] 0.129 1.600 2.945 4.306 5.366
## [4,] 0.110 2.293 6.297 8.846 9.300
Unfortunately, in R there is no multinomial survey design model, so we have to trick it, by creating some replicate weights. This is what Thomas Lumley (the author of the survey package) says to do in this case (and in cases where the type of model can’t be fit otherwise) WARNING, THIS LOOKS SCREWY
#create a bootstrap replicate weight set
des2<-as.svrepdesign(des, type="bootstrap" , replicates=10)
#Now I fit the model, this is screwy looking
mfit<-withReplicates(des2, quote(coef(multinom(health~black+hispanic+other+educ+agec,weights=.weights,trace=F ))))
#get the betas
mfitcoef<-data.frame(matrix(attr(attr(mfit, "var"), "means")[-1:-4], nrow=4, ncol=11, byrow=F))
names(mfitcoef)<-names(coef(ex1)[-1])
#odds ratios
round(exp(mfitcoef), 3)
## black hispanic other educ1somehs educ2hsgrad educ3somecol educ4colgrad
## 1 1.048 0.897 0.905 1.049 1.206 1.107 0.879
## 2 1.637 1.603 1.430 0.883 0.613 0.433 0.229
## 3 2.302 2.306 1.514 0.628 0.327 0.195 0.070
## 4 1.838 1.537 1.529 0.589 0.220 0.119 0.036
## agec(24,39] agec(39,59] agec(59,79] agec(79,99]
## 1 1.100 1.250 1.507 1.893
## 2 1.281 1.612 2.435 3.733
## 3 1.643 3.118 5.817 9.968
## 4 2.529 9.283 18.331 27.412
#get the covariance matrix for the betas
vcov<-matrix(attr(mfit, "var"), nrow=48, ncol=48)
#get the z tests by using the coefficients and the standard errors (diag(vcov))
z<-as.vector(mfitcoef/sqrt(diag(vcov)[-1:-4]))
round(z, 2)
## black hispanic other educ1somehs educ2hsgrad educ3somecol educ4colgrad
## 1 1.02 -2.38 -1.47 0.25 0.90 0.48 -0.60
## 2 14.41 7.29 6.05 -0.60 -2.24 -3.90 -6.72
## 3 17.43 15.64 4.15 -2.28 -5.88 -8.20 -13.21
## 4 10.98 5.13 3.43 -3.01 -7.43 -10.12 -13.74
## agec(24,39] agec(39,59] agec(59,79] agec(79,99]
## 1 1.74 6.02 9.96 11.41
## 2 3.90 9.50 22.48 15.29
## 3 5.97 12.32 20.01 28.30
## 4 5.96 15.70 25.33 22.17
#p values
pvals<-round((1-pnorm(abs(as.matrix(z)),0,1))*2,4)
round(pvals, 3)
## black hispanic other educ1somehs educ2hsgrad educ3somecol
## [1,] 0.309 0.017 0.141 0.801 0.366 0.633
## [2,] 0.000 0.000 0.000 0.548 0.025 0.000
## [3,] 0.000 0.000 0.000 0.022 0.000 0.000
## [4,] 0.000 0.000 0.001 0.003 0.000 0.000
## educ4colgrad agec(24,39] agec(39,59] agec(59,79] agec(79,99]
## [1,] 0.55 0.082 0 0 0
## [2,] 0.00 0.000 0 0 0
## [3,] 0.00 0.000 0 0 0
## [4,] 0.00 0.000 0 0 0