Homework 5

library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(tableone)
library(ggplot2)
library(VGAM)

## Warning: package 'VGAM' was built under R version 4.0.4

## Loading required package: stats4

## Loading required package: splines

## 
## Attaching package: 'VGAM'

## The following object is masked from 'package:survey':
## 
##     calibrate

## The following object is masked from 'package:car':
## 
##     logit

load("~/ipumstor2/.RData")
summary(data)

##      STRATA       SAMPWEIGHT          EDUC            BMI        
##  Min.   :6001   Min.   :     0   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.:6081   1st Qu.:     0   1st Qu.:111.0   1st Qu.: 0.000  
##  Median :6158   Median :     0   Median :201.0   Median : 0.000  
##  Mean   :6156   Mean   :  3380   Mean   :232.7   Mean   : 9.144  
##  3rd Qu.:6238   3rd Qu.:  5466   3rd Qu.:302.0   3rd Qu.:22.330  
##  Max.   :6300   Max.   :104188   Max.   :999.0   Max.   :99.990  
##    HINOTCOVE        ASTHMAEV        DIABETICEV    
##  Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000   Median :0.0000  
##  Mean   :1.242   Mean   :0.4852   Mean   :0.4681  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :9.000   Max.   :9.0000   Max.   :9.0000

data$HWT <- cut(data$BMI,
                     breaks=c(-Inf, 18, 25, 30, Inf),
                     labels=c("1", "2", "3", "4"))


data$INS<-Recode(data$HINOTCOVE,
              recodes="2=1; 1=0; else=NA")

data$AST<-Recode(data$ASTHMAEV,
              recodes="2=1; 1=0; else=NA")

data$DIA<-Recode(data$DIABETICEV,
              recodes="2=1; 1=0; else=NA")

data <- na.omit(data)

options(survey.lonely.psu = "adjust")

library(dplyr)

sub<-data%>%
  select(HWT,INS,AST, BMI, HINOTCOVE,
         ASTHMAEV, DIA, DIABETICEV, STRATA, EDUC, SAMPWEIGHT, STRATA) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~STRATA, data =sub )

## Warning in svydesign.default(ids = ~1, strata = ~STRATA, data = sub): No weights
## or probabilities supplied, assuming equal probability

fit.solr1<-svyolr(HWT~INS+AST+DIA, des)
summary(fit.solr1)

## Call:
## svyolr(HWT ~ INS + AST + DIA, des)
## 
## Coefficients:
##          Value Std. Error    t value
## INS 0.54714415 0.02327516 23.5076411
## AST 0.01869013 0.03035813  0.6156548
## DIA 1.79678229 0.03453603 52.0263052
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -0.6469   0.0126   -51.2806
## 2|3   0.4197   0.0124    33.9053
## 3|4   1.5820   0.0150   105.7493

fit.solr1$deviance+2*length(fit.solr1$coefficients)

## [1] 101640.8

ex1<-svyglm(I(BMI>18)~INS+AST+DIA,des, family="binomial")

ex2<-svyglm(I(BMI>25)~INS+AST+DIA,des, family="binomial")

coefs<-data.frame(parms =c( coefficients(ex1)[-1],coefficients(ex2)[-1]),
                  beta = rep(names(coefficients(ex1))[-1], 2 ), 
                  mod = c(rep("I(BMI>18)",12 ),
                          rep("I(BMI>25)",12 )))
coefs%>%
  ggplot()+
  geom_point(aes(x=beta, y=parms, color=mod))+
  theme(axis.text.x = element_text(angle = 45))+
  labs(title = "Graphical Summary of Proportional Odds Assumption",
       x= "Beta",
       y= "Estimate")

round(exp(rbind(coef(ex1)[-1],
                coef(ex2)[-1])),3)

##        INS   AST    DIA
## [1,] 2.362 0.847 38.804
## [2,] 1.628 1.015  6.605

fit.vgam<-vglm(as.ordered(HWT)~INS+AST+DIA,
               data, weights =SAMPWEIGHT/mean(SAMPWEIGHT, na.rm=T),
               family=cumulative(parallel = T, reverse = T))  #<-parallel = T == proportional odds
summary(fit.vgam)

## 
## Call:
## vglm(formula = as.ordered(HWT) ~ INS + AST + DIA, family = cumulative(parallel = T, 
##     reverse = T), data = data, weights = SAMPWEIGHT/mean(SAMPWEIGHT, 
##     na.rm = T))
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1  0.90265    0.01295   69.69   <2e-16 ***
## (Intercept):2 -0.28771    0.01203  -23.91   <2e-16 ***
## (Intercept):3 -1.48661    0.01449 -102.62   <2e-16 ***
## INS            0.49811    0.02571   19.38   <2e-16 ***
## AST            0.06479    0.02769    2.34   0.0193 *  
## DIA            1.74301    0.03959   44.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3]), 
## logitlink(P[Y>=4])
## 
## Residual deviance: 102496.4 on 113727 degrees of freedom
## 
## Log-likelihood: -51248.19 on 113727 degrees of freedom
## 
## Number of Fisher scoring iterations: 4 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##      INS      AST      DIA 
## 1.645613 1.066939 5.714530

fit.vgam2<-vglm(as.ordered(HWT)~INS+AST+DIA,data,
                weights =SAMPWEIGHT/mean(SAMPWEIGHT, na.rm=T),
                family=cumulative(parallel = F, reverse = T))  #<-parallel = F == Nonproportional odds
summary(fit.vgam2)

## 
## Call:
## vglm(formula = as.ordered(HWT) ~ INS + AST + DIA, family = cumulative(parallel = F, 
##     reverse = T), data = data, weights = SAMPWEIGHT/mean(SAMPWEIGHT, 
##     na.rm = T))
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1  0.86100    0.01352  63.668  < 2e-16 ***
## (Intercept):2 -0.26124    0.01239 -21.090  < 2e-16 ***
## (Intercept):3 -1.45943    0.01554 -93.924  < 2e-16 ***
## INS:1          0.94212    0.04008  23.505  < 2e-16 ***
## INS:2          0.40262    0.02883  13.968  < 2e-16 ***
## INS:3          0.32618    0.03312   9.850  < 2e-16 ***
## AST:1         -0.08062    0.03502  -2.302   0.0213 *  
## AST:2          0.05762    0.03129   1.841   0.0656 .  
## AST:3          0.24285    0.03586   6.772 1.27e-11 ***
## DIA:1          3.40912    0.17057  19.987  < 2e-16 ***
## DIA:2          1.81799    0.05302  34.289  < 2e-16 ***
## DIA:3          1.63327    0.04142  39.433  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3]), 
## logitlink(P[Y>=4])
## 
## Residual deviance: 101961.4 on 113721 degrees of freedom
## 
## Log-likelihood: -50980.72 on 113721 degrees of freedom
## 
## Number of Fisher scoring iterations: 7 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## 'DIA:1'
## 
## 
## Exponentiated coefficients:
##      INS:1      INS:2      INS:3      AST:1      AST:2      AST:3      DIA:1 
##  2.5654088  1.4957399  1.3856671  0.9225426  1.0593093  1.2748816 30.2386568 
##      DIA:2      DIA:3 
##  6.1594446  5.1205996

AIC(fit.vgam)

## [1] 102508.4

AIC(fit.vgam2)

## [1] 101985.4

mfit<-vglm(HWT~INS+AST+DIA,
           family=multinomial(refLevel = 1),
           data=data,
           weights=SAMPWEIGHT/mean(SAMPWEIGHT, na.rm=T))

summary(mfit)

## 
## Call:
## vglm(formula = HWT ~ INS + AST + DIA, family = multinomial(refLevel = 1), 
##     data = data, weights = SAMPWEIGHT/mean(SAMPWEIGHT, na.rm = T))
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1 -0.10363    0.01646  -6.298 3.02e-10 ***
## (Intercept):2 -0.18400    0.01679 -10.958  < 2e-16 ***
## (Intercept):3 -0.45401    0.01807 -25.128  < 2e-16 ***
## INS:1          0.93112    0.04509  20.651  < 2e-16 ***
## INS:2          0.90782    0.04576  19.838  < 2e-16 ***
## INS:3          1.00555    0.04695  21.420  < 2e-16 ***
## AST:1         -0.17885    0.04337  -4.124 3.73e-05 ***
## AST:2         -0.18941    0.04413  -4.292 1.77e-05 ***
## AST:3          0.11655    0.04379   2.661  0.00778 ** 
## DIA:1          2.54486    0.17746  14.340  < 2e-16 ***
## DIA:2          3.19474    0.17451  18.307  < 2e-16 ***
## DIA:3          4.12919    0.17257  23.928  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: log(mu[,2]/mu[,1]), log(mu[,3]/mu[,1]), 
## log(mu[,4]/mu[,1])
## 
## Residual deviance: 101961.3 on 113721 degrees of freedom
## 
## Log-likelihood: -50980.65 on 113721 degrees of freedom
## 
## Number of Fisher scoring iterations: 6 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## 'DIA:3'
## 
## 
## Reference group is level  1  of the response

round(exp(coef(mfit)), 3)

## (Intercept):1 (Intercept):2 (Intercept):3         INS:1         INS:2 
##         0.902         0.832         0.635         2.537         2.479 
##         INS:3         AST:1         AST:2         AST:3         DIA:1 
##         2.733         0.836         0.827         1.124        12.741 
##         DIA:2         DIA:3 
##        24.404        62.127

round(exp(confint(mfit)), 3)

##                2.5 % 97.5 %
## (Intercept):1  0.873  0.931
## (Intercept):2  0.805  0.860
## (Intercept):3  0.613  0.658
## INS:1          2.323  2.772
## INS:2          2.266  2.712
## INS:3          2.493  2.997
## AST:1          0.768  0.910
## AST:2          0.759  0.902
## AST:3          1.031  1.224
## DIA:1          8.998 18.042
## DIA:2         17.335 34.356
## DIA:3         44.299 87.131

AIC(fit.vgam)

## [1] 102508.4

AIC(fit.vgam2)

## [1] 101985.4

AIC(mfit)

## [1] 101985.3

#looking at the three models, The mulitnomial model seems to be more efficient, but each model is close

Homework 5

steven chavez

3/14/2021