#load libraries
library(car)
## Loading required package: carData
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following object is masked from 'package:car':
## 
##     logit
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:VGAM':
## 
##     calibrate
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#load data
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
View(brfss_17)
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
#we see some names are lower case, some are upper case, and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

##filter for TX
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

##REMOVE NON RESPONSES
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)
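#A quick sanity check (a minimal sketch; the exact counts depend on the BRFSS release):
#confirm only Texas MMSA respondents remain and see how many rows are left
dim(brfss_17)
table(grepl("TX", brfss_17$mmsaname))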
#Poor or fair self rated health
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#sex
brfss_17$male<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#BMI: in the BRFSS data the bmi5 variable has 2 implied decimal places, so we must divide by 100 to get the actual BMI values

brfss_17$bmi<-brfss_17$bmi5/100
brfss_17$obese<-ifelse(brfss_17$bmi>=30, 1, 0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

#exercise
brfss_17$exercise<-Recode(brfss_17$exerany2, recodes ="7:9=NA; 1=1;2=0")
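#A minimal sketch to check the recodes above: unweighted frequency tables for a few of
#the recoded variables, including any NAs created by the recoding
table(brfss_17$educ, useNA = "ifany")
table(brfss_17$smoke, useNA = "ifany")
table(brfss_17$race_eth, useNA = "ifany")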

Define an ordinal or multinomial outcome variable of your choosing and define how you will recode the original variable.

##Self-rated general health will be my ordinal outcome variable, recoded from the original five-category genhlth item into three ordered levels (excellent/very good, good, fair/poor). Education is the key predictor I will use to determine whether it affects health.

State a research question about what factors you believe will affect your outcome variable.

##Does education affect self-rated health status?

Fit both the ordinal or the multinomial logistic regression models to your outcome.

#General health recoded to a 3-level ordinal outcome: 1 = excellent/very good, 2 = good, 3 = fair/poor (in the original genhlth, 1 is the best health and 5 is the worst)
brfss_17$generalhealth<-Recode(brfss_17$genhlth,
                               recodes="1:2=1;3=2;4:5=3; else=NA", as.factor = T)
brfss_17$generalhealth<-relevel(brfss_17$generalhealth, ref = "1")
brfss_17$healthnum<-car::Recode(brfss_17$genhlth,
                                recodes="1:2=1;3=2;4:5=3; else=NA", as.factor = F)
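#A minimal sketch to check the new ordinal outcome: unweighted counts of the three
#health levels (1 = excellent/very good, 2 = good, 3 = fair/poor)
table(brfss_17$generalhealth, useNA = "ifany")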

sub<-brfss_17%>%
  select(badhealth,healthnum,generalhealth, mmsaname, bmi,
         agec,race_eth, marst, educ,white, black, hispanic,
         other, smoke, ins,exercise, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
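#A minimal sketch using the design object: the survey-weighted distribution of the
#outcome, as a check that the weights behave sensibly
prop.table(svytable(~generalhealth, design = des))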

fit.solr1<-svyolr(generalhealth~race_eth+educ+agec+exercise+smoke,des)
summary(fit.solr1)
## Call:
## svyolr(generalhealth ~ race_eth + educ + agec + exercise + smoke, 
##     des)
## 
## Coefficients:
##                            Value Std. Error    t value
## race_ethnh black      0.04640273  0.1800048  0.2577861
## race_ethnh multirace -0.80229981  0.3902790 -2.0557083
## race_ethnh other      0.08964038  0.2768408  0.3237976
## race_ethnhwhite      -0.48683372  0.1309626 -3.7173500
## educ0Prim             0.50285291  0.2709692  1.8557567
## educ1somehs           0.43085117  0.2206965  1.9522337
## educ3somecol         -0.29911236  0.1340670 -2.2310657
## educ4colgrad         -0.89278800  0.1373191 -6.5015594
## agec(24,39]           0.52389096  0.1978651  2.6477176
## agec(39,59]           0.91349556  0.1931074  4.7305040
## agec(59,79]           1.09949905  0.2069718  5.3123127
## agec(79,99]           1.62701056  0.3759627  4.3275849
## exercise             -0.73901054  0.1118734 -6.6057761
## smokeCurrent          0.70258987  0.1550056  4.5326735
## smokeFormer           0.23579404  0.1275717  1.8483255
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -0.3025  0.2046    -1.4782
## 2|3  1.4802  0.2072     7.1451
#Calculate an approximate AIC for the survey-weighted ordinal model (deviance + 2 * number of slope parameters)
fit.solr1$deviance+2*length(fit.solr1$coefficients)
## [1] 13476.4
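#A hedged variant (assuming the svyolr object stores its cutpoints in $zeta, as polr
#objects do): counting the two cutpoints as parameters gives a slightly larger penalty
fit.solr1$deviance + 2*(length(fit.solr1$coefficients) + length(fit.solr1$zeta))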
##The results above show that as people attain higher education, their odds of reporting worse health decrease. People who exercise also have lower odds of reporting worse health. Non-Hispanic whites have lower odds of reporting worse health than the other race/ethnic groups. Similarly, current smokers are more likely to report worse health.

Are the assumptions of the proportional odds model met?

##No, they are not fully met.
##Based on age, there is non-proportionality.
##According to the table below, the proportional odds assumption looks reasonable for most covariates, because the two binary models give very similar estimates. However, the age coefficients differ noticeably in magnitude between the two cutpoints, so the assumption does not hold for age.

How did you evaluate the proportional odds assumption?

ex1<-svyglm(I(healthnum>1)~race_eth+educ+agec+exercise+smoke,des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
ex2<-svyglm(I(healthnum>2)~race_eth+educ+agec+exercise+smoke,des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Which model (Proportional odds or Multinomial) fits the data better? how did you decide upon this?

coefs<-data.frame(parms =c( coefficients(ex1)[-1],coefficients(ex2)[-1]),
                  beta = rep(names(coefficients(ex1))[-1], 2 ), 
                  mod = c(rep("I(healthnum>1)",15 ),
                          rep("I(healthnum>2)",15 )))

coefs%>%
  ggplot()+
  geom_point(aes(x=beta, y=parms, color=mod))+
  theme(axis.text.x = element_text(angle = 45))+
  labs(title = "Graphical Summary of Proportional Odds Assumption",
       x= "Beta",
       y= "Estimate")

#Just print the odds ratios, 
round(exp(rbind(coef(ex1)[-1],
                coef(ex2)[-1])),3)
##      race_ethnh black race_ethnh multirace race_ethnh other race_ethnhwhite
## [1,]            1.018                0.357            1.048           0.564
## [2,]            1.095                0.846            0.960           0.695
##      educ0Prim educ1somehs educ3somecol educ4colgrad agec(24,39] agec(39,59]
## [1,]     1.756       1.623        0.738        0.403       1.841       2.718
## [2,]     1.738       1.388        0.699        0.425       1.294       1.917
##      agec(59,79] agec(79,99] exercise smokeCurrent smokeFormer
## [1,]       2.908       6.191    0.469        1.956       1.156
## [2,]       2.860       4.585    0.489        2.264       1.537
##The odds ratios show that the estimates for non-Hispanic black, the education categories, age (59,79], and exercise are close to one another across the two binary models, indicating consistency across those transitions; the younger age categories show the largest differences.
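#A complementary check (a minimal sketch): the difference in the log-odds coefficients
#between the two binary models, which should be near zero where proportional odds holds
round(coef(ex1)[-1] - coef(ex2)[-1], 3)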
#Proportional odds




fit.vgam<-vglm(as.ordered(generalhealth)~race_eth+educ+agec+smoke+exercise,
               brfss_17, weights =mmsawt/mean(mmsawt, na.rm=T),
               family=cumulative(parallel = T, reverse = T))  #<-parallel = T == proportional odds
summary(fit.vgam)
## 
## Call:
## vglm(formula = as.ordered(generalhealth) ~ race_eth + educ + 
##     agec + smoke + exercise, family = cumulative(parallel = T, 
##     reverse = T), data = brfss_17, weights = mmsawt/mean(mmsawt, 
##     na.rm = T))
## 
## Coefficients: 
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1         0.28833    0.08535   3.378 0.000730 ***
## (Intercept):2        -1.57898    0.08745 -18.055  < 2e-16 ***
## race_ethnh black      0.08113    0.07708   1.053 0.292540    
## race_ethnh multirace -0.71924    0.26729  -2.691 0.007127 ** 
## race_ethnh other      0.08657    0.10007   0.865 0.386995    
## race_ethnhwhite      -0.42462    0.05977  -7.104 1.21e-12 ***
## educ0Prim             0.56939    0.09747   5.841 5.17e-09 ***
## educ1somehs           0.53251    0.09201   5.787 7.15e-09 ***
## educ3somecol         -0.36446    0.06116  -5.959 2.53e-09 ***
## educ4colgrad         -0.93077    0.06826 -13.637  < 2e-16 ***
## agec(24,39]           0.51395    0.07970   6.449 1.13e-10 ***
## agec(39,59]           0.95389    0.07835  12.174  < 2e-16 ***
## agec(59,79]           1.14243    0.08660  13.192  < 2e-16 ***
## agec(79,99]           1.67302    0.15622  10.710  < 2e-16 ***
## smokeCurrent          0.68636    0.06662  10.303  < 2e-16 ***
## smokeFormer           0.21019    0.06046   3.477 0.000508 ***
## exercise             -0.70672    0.05061 -13.963  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
## 
## Residual deviance: 13967.08 on 15177 degrees of freedom
## 
## Log-likelihood: -6983.539 on 15177 degrees of freedom
## 
## Number of Fisher scoring iterations: 5 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##     race_ethnh black race_ethnh multirace     race_ethnh other 
##            1.0845123            0.4871230            1.0904268 
##      race_ethnhwhite            educ0Prim          educ1somehs 
##            0.6540170            1.7671964            1.7032058 
##         educ3somecol         educ4colgrad          agec(24,39] 
##            0.6945719            0.3942491            1.6718851 
##          agec(39,59]          agec(59,79]          agec(79,99] 
##            2.5957983            3.1343627            5.3282107 
##         smokeCurrent          smokeFormer             exercise 
##            1.9864758            1.2339126            0.4932613
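#A minimal sketch of interpretation on the probability scale (assuming VGAM's default
#predict method for cumulative-link fits): fitted probabilities of each health category
#for the observations used in the proportional odds model
phat<-predict(fit.vgam, type = "response")
head(round(phat, 3))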
#Non-proportional odds
##A more flexible specification: each covariate is allowed a different effect at each cutpoint, at the cost of more parameters to interpret


fit.vgam2<-vglm(as.ordered(generalhealth)~race_eth+educ+agec+smoke+exercise,brfss_17,
                weights =mmsawt/mean(mmsawt, na.rm=T),
                family=cumulative(parallel = F, reverse = T))  #<-parallel = F == Nonproportional odds
summary(fit.vgam2)
## 
## Call:
## vglm(formula = as.ordered(generalhealth) ~ race_eth + educ + 
##     agec + smoke + exercise, family = cumulative(parallel = F, 
##     reverse = T), data = brfss_17, weights = mmsawt/mean(mmsawt, 
##     na.rm = T))
## 
## Coefficients: 
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1           0.28145    0.09239   3.046 0.002315 ** 
## (Intercept):2          -1.55511    0.12336 -12.607  < 2e-16 ***
## race_ethnh black:1      0.07971    0.08651   0.921 0.356870    
## race_ethnh black:2      0.03516    0.10453   0.336 0.736572    
## race_ethnh multirace:1 -0.85116    0.28064  -3.033 0.002422 ** 
## race_ethnh multirace:2 -0.30557    0.38028  -0.804 0.421658    
## race_ethnh other:1      0.14544    0.10859   1.339 0.180456    
## race_ethnh other:2     -0.16318    0.15374  -1.061 0.288488    
## race_ethnhwhite:1      -0.47506    0.06564  -7.237 4.59e-13 ***
## race_ethnhwhite:2      -0.31575    0.08234  -3.835 0.000126 ***
## educ0Prim:1             0.73043    0.12784   5.714 1.11e-08 ***
## educ0Prim:2             0.49169    0.11659   4.217 2.47e-05 ***
## educ1somehs:1           0.71739    0.11857   6.050 1.45e-09 ***
## educ1somehs:2           0.43409    0.11046   3.930 8.50e-05 ***
## educ3somecol:1         -0.34652    0.06755  -5.130 2.89e-07 ***
## educ3somecol:2         -0.37539    0.08232  -4.560 5.12e-06 ***
## educ4colgrad:1         -0.93324    0.07339 -12.715  < 2e-16 ***
## educ4colgrad:2         -0.85900    0.09906  -8.672  < 2e-16 ***
## agec(24,39]:1           0.59260    0.08405   7.051 1.78e-12 ***
## agec(24,39]:2           0.29883    0.12395   2.411 0.015910 *  
## agec(39,59]:1           1.04347    0.08372  12.464  < 2e-16 ***
## agec(39,59]:2           0.77285    0.11845   6.525 6.82e-11 ***
## agec(59,79]:1           1.09426    0.09280  11.792  < 2e-16 ***
## agec(59,79]:2           1.19266    0.12673   9.411  < 2e-16 ***
## agec(79,99]:1           1.60274    0.18367   8.726  < 2e-16 ***
## agec(79,99]:2           1.64954    0.19464   8.475  < 2e-16 ***
## smokeCurrent:1          0.58975    0.07751   7.609 2.76e-14 ***
## smokeCurrent:2          0.84254    0.08325  10.121  < 2e-16 ***
## smokeFormer:1           0.13409    0.06663   2.013 0.044167 *  
## smokeFormer:2           0.39396    0.08059   4.889 1.02e-06 ***
## exercise:1             -0.71461    0.05815 -12.289  < 2e-16 ***
## exercise:2             -0.69301    0.06471 -10.710  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
## 
## Residual deviance: 13879.75 on 15162 degrees of freedom
## 
## Log-likelihood: -6939.875 on 15162 degrees of freedom
## 
## Number of Fisher scoring iterations: 6 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##     race_ethnh black:1     race_ethnh black:2 race_ethnh multirace:1 
##              1.0829723              1.0357897              0.4269211 
## race_ethnh multirace:2     race_ethnh other:1     race_ethnh other:2 
##              0.7367012              1.1565525              0.8494361 
##      race_ethnhwhite:1      race_ethnhwhite:2            educ0Prim:1 
##              0.6218493              0.7292453              2.0759682 
##            educ0Prim:2          educ1somehs:1          educ1somehs:2 
##              1.6350702              2.0490800              1.5435508 
##         educ3somecol:1         educ3somecol:2         educ4colgrad:1 
##              0.7071461              0.6870210              0.3932779 
##         educ4colgrad:2          agec(24,39]:1          agec(24,39]:2 
##              0.4235845              1.8086840              1.3482844 
##          agec(39,59]:1          agec(39,59]:2          agec(59,79]:1 
##              2.8390569              2.1659198              2.9869761 
##          agec(59,79]:2          agec(79,99]:1          agec(79,99]:2 
##              3.2958222              4.9666264              5.2045675 
##         smokeCurrent:1         smokeCurrent:2          smokeFormer:1 
##              1.8035458              2.3222670              1.1434958 
##          smokeFormer:2             exercise:1             exercise:2 
##              1.4828428              0.4893834              0.5000666
AIC(fit.vgam)
## [1] 14001.08
AIC(fit.vgam2)
## [1] 13943.75
##In the output above there are statistically significant associations between general health and race/ethnicity, education, age, smoking, and exercise
##Based on AIC, the non-proportional odds model fits the data better
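#In addition to AIC, a likelihood-ratio test of the proportional odds constraint is a
#useful check (a minimal sketch, using VGAM's lrtest method): the parallel (proportional
#odds) model is nested within the non-parallel model
lrtest(fit.vgam2, fit.vgam)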
##Multinomial
mfit<-vglm(generalhealth~race_eth+educ+agec+smoke+exercise,
           family=multinomial(refLevel = 1),
           data=brfss_17,
           weights=mmsawt/mean(mmsawt, na.rm=T))

summary(mfit)
## 
## Call:
## vglm(formula = generalhealth ~ race_eth + educ + agec + smoke + 
##     exercise, family = multinomial(refLevel = 1), data = brfss_17, 
##     weights = mmsawt/mean(mmsawt, na.rm = T))
## 
## Coefficients: 
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1          -0.063827   0.100849  -0.633  0.52680    
## (Intercept):2          -0.842169   0.134109  -6.280 3.39e-10 ***
## race_ethnh black:1      0.030258   0.093781   0.323  0.74696    
## race_ethnh black:2      0.119845   0.116921   1.025  0.30536    
## race_ethnh multirace:1 -1.057454   0.335747  -3.150  0.00164 ** 
## race_ethnh multirace:2 -0.633566   0.394690  -1.605  0.10844    
## race_ethnh other:1      0.169882   0.115191   1.475  0.14027    
## race_ethnh other:2     -0.146054   0.168504  -0.867  0.38607    
## race_ethnhwhite:1      -0.487471   0.071540  -6.814 9.49e-12 ***
## race_ethnhwhite:2      -0.552445   0.091379  -6.046 1.49e-09 ***
## educ0Prim:1             0.596413   0.138273   4.313 1.61e-05 ***
## educ0Prim:2             0.958678   0.151047   6.347 2.20e-10 ***
## educ1somehs:1           0.602748   0.126483   4.765 1.88e-06 ***
## educ1somehs:2           0.786627   0.140746   5.589 2.28e-08 ***
## educ3somecol:1         -0.290581   0.074023  -3.926 8.65e-05 ***
## educ3somecol:2         -0.548368   0.091681  -5.981 2.21e-09 ***
## educ4colgrad:1         -0.824698   0.080723 -10.216  < 2e-16 ***
## educ4colgrad:2         -1.218710   0.106659 -11.426  < 2e-16 ***
## agec(24,39]:1           0.569281   0.091233   6.240 4.38e-10 ***
## agec(24,39]:2           0.584592   0.131506   4.445 8.77e-06 ***
## agec(39,59]:1           0.928382   0.090968  10.206  < 2e-16 ***
## agec(39,59]:2           1.222683   0.127295   9.605  < 2e-16 ***
## agec(59,79]:1           0.834770   0.102505   8.144 3.83e-16 ***
## agec(59,79]:2           1.570851   0.137033  11.463  < 2e-16 ***
## agec(79,99]:1           1.449447   0.209507   6.918 4.57e-12 ***
## agec(79,99]:2           2.423554   0.233397  10.384  < 2e-16 ***
## smokeCurrent:1          0.417321   0.085803   4.864 1.15e-06 ***
## smokeCurrent:2          1.051402   0.097803  10.750  < 2e-16 ***
## smokeFormer:1           0.009595   0.074320   0.129  0.89727    
## smokeFormer:2           0.396881   0.089816   4.419 9.92e-06 ***
## exercise:1             -0.596648   0.063753  -9.359  < 2e-16 ***
## exercise:2             -1.000708   0.074805 -13.377  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: log(mu[,2]/mu[,1]), log(mu[,3]/mu[,1])
## 
## Residual deviance: 13873.96 on 15162 degrees of freedom
## 
## Log-likelihood: -6936.981 on 15162 degrees of freedom
## 
## Number of Fisher scoring iterations: 5 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Reference group is level  1  of the response
##The second intercept (contrasting the worst health category with the best) is statistically significant, while the first intercept is not

For the best fitting model from part c, Describe the results of your model, and present output from the model in terms of odds ratios and confidence intervals for all model parameters in a table.

##Odds ratios for covariates
round(exp(coef(mfit)), 3)
##          (Intercept):1          (Intercept):2     race_ethnh black:1 
##                  0.938                  0.431                  1.031 
##     race_ethnh black:2 race_ethnh multirace:1 race_ethnh multirace:2 
##                  1.127                  0.347                  0.531 
##     race_ethnh other:1     race_ethnh other:2      race_ethnhwhite:1 
##                  1.185                  0.864                  0.614 
##      race_ethnhwhite:2            educ0Prim:1            educ0Prim:2 
##                  0.576                  1.816                  2.608 
##          educ1somehs:1          educ1somehs:2         educ3somecol:1 
##                  1.827                  2.196                  0.748 
##         educ3somecol:2         educ4colgrad:1         educ4colgrad:2 
##                  0.578                  0.438                  0.296 
##          agec(24,39]:1          agec(24,39]:2          agec(39,59]:1 
##                  1.767                  1.794                  2.530 
##          agec(39,59]:2          agec(59,79]:1          agec(59,79]:2 
##                  3.396                  2.304                  4.811 
##          agec(79,99]:1          agec(79,99]:2         smokeCurrent:1 
##                  4.261                 11.286                  1.518 
##         smokeCurrent:2          smokeFormer:1          smokeFormer:2 
##                  2.862                  1.010                  1.487 
##             exercise:1             exercise:2 
##                  0.551                  0.368
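#A minimal sketch of the requested table (assuming VGAM's confint method, which returns
#Wald intervals for a vglm fit): odds ratios with 95% confidence intervals
ci<-confint(mfit)
or_tab<-data.frame(OR = exp(coef(mfit)),
                   lower = exp(ci[,1]),
                   upper = exp(ci[,2]))
round(or_tab, 3)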
##look at the relative model fits

AIC(fit.vgam) #proportional odds
## [1] 14001.08
AIC(fit.vgam2) #cumulative logit, non proportional
## [1] 13943.75
AIC(mfit) #multinomial
## [1] 13937.96
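#A minimal sketch collecting the three AIC values in one table for easier comparison
data.frame(model = c("proportional odds", "non-proportional odds", "multinomial"),
           AIC = c(AIC(fit.vgam), AIC(fit.vgam2), AIC(mfit)))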
##Based on AIC, the multinomial model fits best, although the non-proportional cumulative logit model is very close