• Preprocessing
    • Load Libraries
    • Functions
    • Read CSV, Pre-processed in Python
    • Train / Test
    • Apply Survey Weights
  • Describe
    • Unweighted
    • Descriptive Charts, Weighted Data
  • Quasi-Binomial Analysis
    • Model 1: Demographics (D)
    • Model 2: D + Socioeconomics (S)
    • Model 3: D + S + Geography (G)
    • Model 4: D + S + G + Behavioral / Risk (B)
    • Model 5: D + S + G + B + Access (A)
    • Model 6: Significant Variables Only
    • Model 7: All Variables
    • Optimal Cut Points for Quasibinomial and Confusion Matrix on Unseen Data
  • Plots
    • Group Data for Plotting
    • Plot
    • Table of Predictors for Complete Model
  • Analysis on Full Data Set
    • Forest Plots of Training Set Estimates and Complete Set Estimates
    • Coefficient Comparison, Training versus Full

Preprocessing

Load Libraries

Functions

##Print
myprint=function(x, lab="") {print(x)%>%kbl(caption=lab)%>%kable_classic(html_font="Cambria")}


## VIF and Effect Sizes


mydef=function(x){ 
  pseudo=round(1-x$deviance/x$null.deviance,3)
  vif=round(max(vif(x)),3)
  VIF=vif
  R2=pseudo
  return(myprint(noquote(rbind(VIF, R2))))
  }

Read CSV, Pre-processed in Python

y2019=read.csv('D:/MI/MI.CSV', encoding="UTF-8")

Train / Test

set.seed(1234)
mys=sample(seq(1, nrow(y2019)),.2*nrow(y2019),replace=TRUE)
mytrain=y2019[-mys,]
mytest=y2019[mys,]
print(c(nrow(mytrain),nrow(mytest)))
## [1] 285856  69835

Apply Survey Weights

mydata=y2019
options(survey.lonely.psu = "adjust")
svy2019 = svydesign(id=~1,strata = ~Stratum,weights = ~Weights,data = mydata)

train2019=svydesign(id=~1,strata = ~Stratum,weights = ~Weights,data = mytrain)
test2019=svydesign(id=~1,strata = ~Stratum,weights = ~Weights,data = mytest)

Describe

Unweighted

options(scipen=9999)
library(psych)
myprint(as.data.frame(apply(y2019,2,function(x)round(mean(x),3))))
##                            apply(y2019, 2, function(x) round(mean(x), 3))
## MI                                                                  0.069
## Male                                                                0.441
## Veteran                                                             0.141
## Rent_Home                                                           0.184
## Poor_Health                                                         0.060
## Smoker                                                              0.428
## Chew_Snuff                                                          0.029
## Percent_Drink                                                       0.154
## High_Cholesterol                                                    0.398
## High_BP                                                             0.463
## High_BMI                                                            0.643
## Poor_Health_Percent                                                 0.159
## Poor_Mental_Health_Percent                                          0.114
## Depression                                                          0.183
## Stroke                                                              0.053
## Asthma                                                              0.134
## Skin_Cancer                                                         0.118
## Cancer                                                              0.120
## COPD                                                                0.097
## Kidney                                                              0.045
## Arthritis                                                           0.388
## No_Health_Plan                                                      0.068
## No_Doctor                                                           0.130
## Cost                                                                0.093
## No_Checkup                                                          0.169
## Metropolitan                                                        0.683
## Weights                                                           508.551
## Stratum                                                        297642.443
## PSU                                                        2019004649.714
## Age_65                                                              0.443
## Age_55_to_64                                                        0.241
## Age_45_to_54                                                        0.175
## Black                                                               0.074
## Hispanic                                                            0.073
## Other_Race                                                          0.064
## Previous_Marriage                                                   0.320
## Never_Married                                                       0.095
## Income_LT25K                                                        0.201
## Income_LT75K                                                        0.335
## Income_DKR                                                          0.175
## Pre_High_School                                                     0.073
## High_School                                                         0.261
## Post_High_School                                                    0.273
## Retired_Unable                                                      0.449
## Out_of_Work                                                         0.032
## Other_Not_Working                                                   0.058
## Fair_Health                                                         0.153
## Good_Health                                                         0.324
## Poor_Exercise                                                       0.279
## Percent_Drink_2                                                     0.101
## Diabetic                                                            0.161
## Prediabetic                                                         0.025
## Division_D1                                                         0.117
## Division_D2                                                         0.049
## Division_D3                                                         0.105
## Division_D4                                                         0.171
## Division_D5                                                         0.182
## Division_D6                                                         0.063
## Division_D7                                                         0.069
## Division_D8                                                         0.131
## Division_D9                                                         0.095
apply(y2019, 2, function(x) round(mean(x), 3))
MI 0.069
Male 0.441
Veteran 0.141
Rent_Home 0.184
Poor_Health 0.060
Smoker 0.428
Chew_Snuff 0.029
Percent_Drink 0.154
High_Cholesterol 0.398
High_BP 0.463
High_BMI 0.643
Poor_Health_Percent 0.159
Poor_Mental_Health_Percent 0.114
Depression 0.183
Stroke 0.053
Asthma 0.134
Skin_Cancer 0.118
Cancer 0.120
COPD 0.097
Kidney 0.045
Arthritis 0.388
No_Health_Plan 0.068
No_Doctor 0.130
Cost 0.093
No_Checkup 0.169
Metropolitan 0.683
Weights 508.551
Stratum 297642.443
PSU 2019004649.714
Age_65 0.443
Age_55_to_64 0.241
Age_45_to_54 0.175
Black 0.074
Hispanic 0.073
Other_Race 0.064
Previous_Marriage 0.320
Never_Married 0.095
Income_LT25K 0.201
Income_LT75K 0.335
Income_DKR 0.175
Pre_High_School 0.073
High_School 0.261
Post_High_School 0.273
Retired_Unable 0.449
Out_of_Work 0.032
Other_Not_Working 0.058
Fair_Health 0.153
Good_Health 0.324
Poor_Exercise 0.279
Percent_Drink_2 0.101
Diabetic 0.161
Prediabetic 0.025
Division_D1 0.117
Division_D2 0.049
Division_D3 0.105
Division_D4 0.171
Division_D5 0.182
Division_D6 0.063
Division_D7 0.069
Division_D8 0.131
Division_D9 0.095

Descriptive Charts, Weighted Data

myd=matrix(rep(0,61*4), 61,4)

for (i in 1:length(svy2019$variables)){
myd[i,1]=colnames(svy2019$variables[i])
myd[i,2]=round(svymean(~svy2019$variables[,i], svy2019),3)
myd[i,3]=round(svyvar(~svy2019$variables[,i], svy2019)^.5,3)
myd[i,4]=round(svytotal(~svy2019$variables[,i], svy2019)/1000000,3)
}
colnames(myd)=c('Variable', 'Mean', 'SD','Total in Millions')
myd=noquote(myd)
myd%>%kbl()%>%kable_classic(html_font='Cambria')
Variable Mean SD Total in Millions
MI 0.057 0.232 10.173
Male 0.477 0.499 84.748
Veteran 0.12 0.325 21.314
Rent_Home 0.189 0.392 33.6
Poor_Health 0.06 0.237 10.588
Smoker 0.42 0.493 74.505
Chew_Snuff 0.03 0.17 5.265
Percent_Drink 0.147 0.266 26.161
High_Cholesterol 0.37 0.483 65.677
High_BP 0.418 0.493 74.138
High_BMI 0.646 0.478 114.753
Poor_Health_Percent 0.152 0.304 27.059
Poor_Mental_Health_Percent 0.121 0.266 21.417
Depression 0.177 0.382 31.455
Stroke 0.046 0.209 8.141
Asthma 0.133 0.34 23.635
Skin_Cancer 0.089 0.284 15.747
Cancer 0.095 0.293 16.834
COPD 0.084 0.277 14.896
Kidney 0.04 0.196 7.138
Arthritis 0.329 0.47 58.432
No_Health_Plan 0.102 0.303 18.099
No_Doctor 0.165 0.371 29.254
Cost 0.116 0.32 20.577
No_Checkup 0.195 0.396 34.538
Metropolitan 0.84 0.366 149.215
Weights 2236.452 3387.957 397134.892
Stratum 280247.348 168454.592 49764539.418
PSU 2019005190.677 3748.729 358522084248.168
Age_65 0.305 0.46 54.097
Age_55_to_64 0.235 0.424 41.692
Age_45_to_54 0.229 0.42 40.64
Black 0.114 0.318 20.312
Hispanic 0.149 0.356 26.413
Other_Race 0.075 0.263 13.318
Previous_Marriage 0.26 0.439 46.174
Never_Married 0.107 0.309 19.047
Income_LT25K 0.203 0.402 36.034
Income_LT75K 0.314 0.464 55.691
Income_DKR 0.165 0.371 29.342
Pre_High_School 0.136 0.343 24.22
High_School 0.26 0.438 46.113
Post_High_School 0.295 0.456 52.352
Retired_Unable 0.35 0.477 62.149
Out_of_Work 0.042 0.2 7.382
Other_Not_Working 0.071 0.257 12.652
Fair_Health 0.159 0.366 28.238
Good_Health 0.327 0.469 58.047
Poor_Exercise 0.274 0.446 48.722
Percent_Drink_2 0.093 0.241 16.428
Diabetic 0.15 0.358 26.721
Prediabetic 0.026 0.159 4.623
Division_D1 0.049 0.215 8.619
Division_D2 0.102 0.303 18.186
Division_D3 0.146 0.353 25.992
Division_D4 0.065 0.247 11.622
Division_D5 0.211 0.408 37.379
Division_D6 0.06 0.237 10.568
Division_D7 0.118 0.323 20.95
Division_D8 0.075 0.263 13.244
Division_D9 0.163 0.37 28.99

Quasi-Binomial Analysis

Model 1: Demographics (D)

myformula1=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Black+Hispanic+Other_Race+Previous_Marriage+Never_Married+
            Veteran"))

m1 = survey::svyglm(myformula1,design=train2019,
                    family=quasibinomial,maxit = 100)

pm1=plot_model(m1, main="1. Demographics", show.values=TRUE, show.p=TRUE, value.offset=.4)
mydef(m1)
##      [,1]
## VIF 7.158
## R2  0.083
VIF 7.158
R2 0.083
pm1

Model 2: D + Socioeconomics (S)

myformula2=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+Out_of_Work+Other_Not_Working+
            Pre_High_School+High_School+Post_High_School"))

m2 = survey::svyglm(myformula2,design=train2019,
                    family=quasibinomial,maxit = 100)

mydef(m2)
##      [,1]
## VIF 8.332
## R2  0.117
VIF 8.332
R2 0.117
pm2=plot_model(m2, main="2. Socioeconomics", show.values=TRUE, show.p=TRUE, value.offset=.4)
pm2

Model 3: D + S + Geography (G)

myformula3=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+Out_of_Work+Other_Not_Working+
            Pre_High_School+High_School+Post_High_School+
            Division_D1+Division_D2+Division_D3+Division_D4+Division_D5+
            Division_D6+Division_D7+Division_D8+Division_D9+Metropolitan"))

m3 = survey::svyglm(myformula3,design=train2019,
                    family=quasibinomial,maxit = 100)
mydef(m3)
##      [,1]
## VIF 8.417
## R2  0.118
VIF 8.417
R2 0.118
pm3=plot_model(m3, main="3. Geography", show.values=TRUE, show.p=TRUE, value.offset=.4)
pm3

Model 4: D + S + G + Behavioral / Risk (B)

myformula4=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+
            Pre_High_School+High_School+Post_High_School+
            Division_D3+Division_D4+
            Division_D6+Division_D7+Metropolitan+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Chew_Snuff+Poor_Exercise+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+High_BMI+
            Diabetic+Prediabetic+Poor_Health_Percent+
            Poor_Mental_Health_Percent+Depression+
            Stroke+Asthma+Skin_Cancer+Cancer+COPD+Kidney+Arthritis"))

m4 = survey::svyglm(myformula4,design=train2019,
                    family=quasibinomial,maxit = 100)
mydef(m4)
##       [,1]
## VIF 18.044
## R2   0.209
VIF 18.044
R2 0.209
pm4=plot_model(m4, main="4. Behavior", show.values=TRUE, show.p=TRUE, value.offset=.4)
pm4

Model 5: D + S + G + B + Access (A)

myformula5=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+
            Pre_High_School+High_School+Post_High_School+
            Division_D4+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+
            Diabetic+Poor_Health_Percent+
            Stroke+COPD+Kidney+Arthritis+
            No_Health_Plan+No_Doctor+Cost+No_Checkup"))

m5 = survey::svyglm(myformula5,design=train2019,
                    family=quasibinomial,maxit = 100)
mydef(m5)
##       [,1]
## VIF 17.605
## R2   0.210
VIF 17.605
R2 0.210
pm5=plot_model(m5, main="5. Access",show.values=TRUE, show.p=TRUE, value.offset=.4)
pm5

Model 6: Significant Variables Only

myformula6=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+
            Pre_High_School+High_School+Post_High_School+
            Division_D4+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+
            Diabetic+Poor_Health_Percent+
            Stroke+COPD+Kidney+Arthritis+
            No_Doctor+Cost+No_Checkup"))

m6 = survey::svyglm(myformula6,design=train2019,
                    family=quasibinomial,maxit = 500)
mydef(m6)
##       [,1]
## VIF 17.602
## R2   0.210
VIF 17.602
R2 0.210
pm6=plot_model(m6, main="6. Total, Only Significant",show.values=TRUE, show.p=TRUE,)
pm6

Model 7: All Variables

myformula7=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Black+Hispanic+Other_Race+Previous_Marriage+Never_Married+
            Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+Out_of_Work+Other_Not_Working+
            Pre_High_School+High_School+Post_High_School+
            Division_D1+Division_D2+Division_D3+Division_D4+Division_D5+
            Division_D6+Division_D7+Division_D8+Division_D9+Metropolitan+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Chew_Snuff+Poor_Exercise+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+High_BMI+
            Diabetic+Prediabetic+Poor_Health_Percent+
            Poor_Mental_Health_Percent+Depression+
            Stroke+Asthma+Skin_Cancer+Cancer+COPD+Kidney+Arthritis+
            No_Health_Plan+No_Doctor+Cost+No_Checkup"))

m7 = survey::svyglm(myformula7,design=train2019,
                    family=quasibinomial,maxit = 500)
mydef(m7)
##       [,1]
## VIF 18.156
## R2   0.212
VIF 18.156
R2 0.212
pm7=plot_model(m7, main="7. Total, All Variables", show.values=TRUE, show.p=TRUE,)
pm7

Optimal Cut Points for Quasibinomial and Confusion Matrix on Unseen Data

cutpoints=predict(m6, mytest, type='response')
for (i in 1:length(cutpoints)) {
    if (cutpoints[i]>.17){cutpoints[i]=1}else{cutpoints[i]=0}} #.17 m6


print(noquote(c('F1 Score, No MI: ',F1_Score(cutpoints, mytest$MI, positive='0'))))
## [1] F1 Score, No MI:  0.939864247385317
print(noquote(c('Precision, No MI: ',Precision(cutpoints, mytest$MI, positive='0'))))
## [1] Precision, No MI:  0.925941886234917
print(noquote(c('Recall, No MI: ',Recall(cutpoints, mytest$MI, positive='0'))))
## [1] Recall, No MI:    0.954211669918003
print(noquote(c('F1 Score, MI: ',F1_Score(cutpoints, mytest$MI, positive='1'))))
## [1] F1 Score, MI:     0.338744309885768
print(noquote(c('Precision, MI: ',Precision(cutpoints, mytest$MI, positive='1'))))
## [1] Precision, MI:    0.405844824037868
print(noquote(c('Recall, MI: ',Recall(cutpoints, mytest$MI, positive='1'))))
## [1] Recall, MI:       0.290683962264151
print(noquote(c('Accuracy: ',Accuracy(cutpoints, mytest$MI))))
## [1] Accuracy:         0.889754421135534
print(noquote(c('Pseudo-R2: ', 1-m6$devianc/m6$null.deviance)))
## [1] Pseudo-R2:        0.210038168165197

Plots

Group Data for Plotting

t1=pm1$data[,c(1,2,5,6)]
t1$Group=rep("1. Demography", nrow(t1))
t2=pm2$data[,c(1,2,5,6)]
t2$Group=rep("2. SES", nrow(t2))
t3=pm3$data[,c(1,2,5,6)]
t3$Group=rep("3. Geog.", nrow(t3))
t4=pm4$data[,c(1,2,5,6)]
t4$Group=rep("4. Behavior", nrow(t4))
t5=pm5$data[,c(1,2,5,6)]
t5$Group=rep("5. Access", nrow(t5))
t6=pm6$data[,c(1,2,5,6)]
t6$Group=rep("6. Significant Only", nrow(t6))
t7=pm7$data[,c(1,2,5,6)]
t7$Group=rep("7. All Variables", nrow(t7))

ttot=rbind(t1,t2,t3,t4,t5,t6,t7)
ttot$Group=as.factor(ttot$Group)

Plot

ggplot(data=ttot,
    aes(x = term,y = estimate, ymin = .5, ymax = 2.0 ))+
    geom_point(aes(col=Group))+
    geom_hline(aes(fill=Group),yintercept=1, linetype=2)+
    xlab('')+ ylab("Odds Ratio (95% Confidence Interval)")+
    geom_errorbar(aes(ymin=conf.low,
                      ymax=conf.high,col=Group),width=0.5,cex=1)+ 
  facet_grid(~Group)+
        theme(plot.title=element_text(size=16,face="bold"),
        axis.text.y=element_text(size=8),
        axis.text.x=element_text(size=8,face="bold", angle=90),
        axis.title=element_text(size=8,face="bold"),
        strip.text.y = element_text(hjust=0,vjust = 1,angle=180,face="bold"))+
        guides(colour=FALSE)+
        coord_flip()
## Warning: geom_hline(): Ignoring `mapping` because `yintercept` was provided.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Table of Predictors for Complete Model

a1=model_parameters(m6, df_method='wald')
a1$CI_high=round(exp(a1$CI_high),3)
a1$CI_low=round(exp(a1$CI_low),3)
a1$Coefficient=round(exp(a1$Coefficient),3)
a1$p=round(a1$p,3)
a1$SE=NULL
a1$t=NULL
a1$df_error=NULL
a1$CI=NULL
myprint(a1)
## Parameter           | Log-Odds |       95% CI |      p
## ------------------------------------------------------
## (Intercept)         | 3.00e-03 | [0.00, 0.00] | < .001
## Age_45_to_54        |     1.74 | [1.45, 2.10] | < .001
## Age_55_to_64        |     2.25 | [1.89, 2.68] | < .001
## Age_65              |     3.30 | [2.76, 3.95] | < .001
## Male                |     2.24 | [2.08, 2.41] | < .001
## Previous_Marriage   |     1.13 | [1.05, 1.22] | 0.001 
## Never_Married       |     0.79 | [0.70, 0.90] | < .001
## Veteran             |     1.12 | [1.03, 1.23] | 0.008 
## Income_LT25K        |     1.30 | [1.15, 1.46] | < .001
## Income_LT75K        |     1.20 | [1.08, 1.33] | < .001
## Income_DKR          |     1.14 | [1.01, 1.27] | 0.028 
## Rent_Home           |     1.10 | [1.01, 1.19] | 0.037 
## Retired_Unable      |     1.29 | [1.19, 1.40] | < .001
## Pre_High_School     |     1.19 | [1.06, 1.34] | 0.004 
## High_School         |     1.10 | [1.01, 1.20] | 0.030 
## Post_High_School    |     1.11 | [1.02, 1.21] | 0.020 
## Division_D4         |     1.11 | [1.04, 1.20] | 0.004 
## Poor_Health         |     2.77 | [2.40, 3.18] | < .001
## Fair_Health         |     2.15 | [1.93, 2.40] | < .001
## Good_Health         |     1.53 | [1.40, 1.67] | < .001
## Smoker              |     1.50 | [1.41, 1.61] | < .001
## Percent_Drink       |     0.37 | [0.22, 0.61] | < .001
## Percent_Drink_2     |     2.19 | [1.28, 3.75] | 0.004 
## High_Cholesterol    |     1.65 | [1.54, 1.76] | < .001
## High_BP             |     1.73 | [1.60, 1.87] | < .001
## Diabetic            |     1.31 | [1.22, 1.42] | < .001
## Poor_Health_Percent |     1.12 | [1.01, 1.24] | 0.032 
## Stroke              |     2.92 | [2.66, 3.20] | < .001
## COPD                |     1.59 | [1.47, 1.73] | < .001
## Kidney              |     1.38 | [1.24, 1.53] | < .001
## Arthritis           |     1.16 | [1.09, 1.25] | < .001
## No_Doctor           |     0.84 | [0.74, 0.95] | 0.005 
## Cost                |     1.23 | [1.11, 1.36] | < .001
## No_Checkup          |     0.78 | [0.69, 0.87] | < .001
Parameter Coefficient CI_low CI_high p
(Intercept) 0.003 0.002 0.003 0.000
Age_45_to_54 1.744 1.451 2.095 0.000
Age_55_to_64 2.249 1.886 2.682 0.000
Age_65 3.302 2.760 3.950 0.000
Male 2.241 2.083 2.410 0.000
Previous_Marriage 1.134 1.052 1.223 0.001
Never_Married 0.790 0.697 0.896 0.000
Veteran 1.125 1.032 1.226 0.008
Income_LT25K 1.296 1.149 1.461 0.000
Income_LT75K 1.199 1.084 1.327 0.000
Income_DKR 1.137 1.014 1.274 0.028
Rent_Home 1.096 1.006 1.194 0.037
Retired_Unable 1.288 1.186 1.398 0.000
Pre_High_School 1.188 1.056 1.336 0.004
High_School 1.100 1.009 1.198 0.030
Post_High_School 1.108 1.016 1.208 0.020
Division_D4 1.114 1.036 1.197 0.004
Poor_Health 2.765 2.405 3.179 0.000
Fair_Health 2.153 1.934 2.397 0.000
Good_Health 1.526 1.395 1.670 0.000
Smoker 1.504 1.408 1.607 0.000
Percent_Drink 0.368 0.221 0.613 0.000
Percent_Drink_2 2.188 1.277 3.746 0.004
High_Cholesterol 1.648 1.539 1.764 0.000
High_BP 1.730 1.603 1.867 0.000
Diabetic 1.315 1.220 1.416 0.000
Poor_Health_Percent 1.117 1.009 1.235 0.032
Stroke 2.915 2.657 3.197 0.000
COPD 1.591 1.466 1.726 0.000
Kidney 1.377 1.238 1.532 0.000
Arthritis 1.164 1.088 1.245 0.000
No_Doctor 0.837 0.738 0.949 0.005
Cost 1.225 1.107 1.356 0.000
No_Checkup 0.777 0.690 0.874 0.000

Analysis on Full Data Set

m8=survey::svyglm(myformula6,design=svy2019,
                    family=quasibinomial,maxit = 500) #significant var. only

a2=model_parameters(m8, df_method='wald')

a2$CI_high=round(exp(a2$CI_high),3)
a2$CI_low=round(exp(a2$CI_low),3)
a2$Coefficient=round(exp(a2$Coefficient),3)
a2$p=round(a2$p,3)
a2$SE=NULL
a2$t=NULL
a2$df_error=NULL
a2$CI=NULL

myprint(a2)
## Parameter           | Log-Odds |       95% CI |      p
## ------------------------------------------------------
## (Intercept)         | 3.00e-03 | [0.00, 0.00] | < .001
## Age_45_to_54        |     1.60 | [1.35, 1.89] | < .001
## Age_55_to_64        |     2.10 | [1.79, 2.47] | < .001
## Age_65              |     3.07 | [2.61, 3.61] | < .001
## Male                |     2.17 | [2.03, 2.32] | < .001
## Previous_Marriage   |     1.12 | [1.04, 1.20] | 0.002 
## Never_Married       |     0.80 | [0.71, 0.90] | < .001
## Veteran             |     1.16 | [1.07, 1.25] | < .001
## Income_LT25K        |     1.30 | [1.16, 1.45] | < .001
## Income_LT75K        |     1.22 | [1.11, 1.33] | < .001
## Income_DKR          |     1.16 | [1.04, 1.28] | 0.008 
## Rent_Home           |     1.08 | [1.00, 1.17] | 0.063 
## Retired_Unable      |     1.30 | [1.20, 1.40] | < .001
## Pre_High_School     |     1.23 | [1.11, 1.37] | < .001
## High_School         |     1.12 | [1.03, 1.21] | 0.006 
## Post_High_School    |     1.09 | [1.01, 1.18] | 0.031 
## Division_D4         |     1.13 | [1.05, 1.20] | < .001
## Poor_Health         |     2.88 | [2.54, 3.27] | < .001
## Fair_Health         |     2.19 | [1.99, 2.41] | < .001
## Good_Health         |     1.55 | [1.43, 1.68] | < .001
## Smoker              |     1.48 | [1.40, 1.58] | < .001
## Percent_Drink       |     0.38 | [0.24, 0.61] | < .001
## Percent_Drink_2     |     2.14 | [1.30, 3.52] | 0.003 
## High_Cholesterol    |     1.65 | [1.55, 1.76] | < .001
## High_BP             |     1.68 | [1.57, 1.80] | < .001
## Diabetic            |     1.33 | [1.24, 1.43] | < .001
## Poor_Health_Percent |     1.12 | [1.02, 1.24] | 0.017 
## Stroke              |     2.95 | [2.71, 3.21] | < .001
## COPD                |     1.56 | [1.45, 1.68] | < .001
## Kidney              |     1.36 | [1.24, 1.50] | < .001
## Arthritis           |     1.17 | [1.10, 1.24] | < .001
## No_Doctor           |     0.85 | [0.76, 0.96] | 0.008 
## Cost                |     1.23 | [1.12, 1.35] | < .001
## No_Checkup          |     0.81 | [0.72, 0.90] | < .001
Parameter Coefficient CI_low CI_high p
(Intercept) 0.003 0.002 0.003 0.000
Age_45_to_54 1.597 1.349 1.891 0.000
Age_55_to_64 2.102 1.791 2.467 0.000
Age_65 3.066 2.606 3.607 0.000
Male 2.171 2.032 2.320 0.000
Previous_Marriage 1.117 1.043 1.197 0.002
Never_Married 0.796 0.707 0.897 0.000
Veteran 1.159 1.073 1.253 0.000
Income_LT25K 1.297 1.163 1.446 0.000
Income_LT75K 1.218 1.112 1.335 0.000
Income_DKR 1.155 1.038 1.285 0.008
Rent_Home 1.078 0.996 1.167 0.063
Retired_Unable 1.297 1.202 1.400 0.000
Pre_High_School 1.232 1.106 1.372 0.000
High_School 1.118 1.032 1.210 0.006
Post_High_School 1.091 1.008 1.181 0.031
Division_D4 1.127 1.055 1.204 0.000
Poor_Health 2.882 2.542 3.267 0.000
Fair_Health 2.189 1.986 2.412 0.000
Good_Health 1.548 1.426 1.681 0.000
Smoker 1.483 1.396 1.576 0.000
Percent_Drink 0.382 0.238 0.612 0.000
Percent_Drink_2 2.138 1.299 3.517 0.003
High_Cholesterol 1.653 1.553 1.759 0.000
High_BP 1.683 1.569 1.805 0.000
Diabetic 1.331 1.244 1.425 0.000
Poor_Health_Percent 1.124 1.021 1.237 0.017
Stroke 2.950 2.712 3.209 0.000
COPD 1.558 1.446 1.677 0.000
Kidney 1.362 1.235 1.502 0.000
Arthritis 1.168 1.098 1.242 0.000
No_Doctor 0.853 0.759 0.959 0.008
Cost 1.234 1.125 1.353 0.000
No_Checkup 0.807 0.724 0.900 0.000

Forest Plots of Training Set Estimates and Complete Set Estimates

pm8=plot_model(m6, main="1. Training Set, Significant Variables")
pm9=plot_model(m8, main="2. Full Set, Significant Variables")

t8=pm8$data[,c(1,2,5,6)]
t8$Group=rep("1. Training Set, Significant Variables", nrow(t8))
t9=pm9$data[,c(1,2,5,6)]
t9$Group=rep("2. Full Set, Significant Variables", nrow(t9))


ttot2=rbind(t8,t9)
ttot2$Group=as.factor(ttot2$Group)

ggplot(data=ttot2,
    aes(x = term,y = estimate, ymin = .5, ymax = 2.0 ))+
    geom_point(aes(col=Group))+
    geom_hline(aes(fill=Group),yintercept =1, linetype=2)+
    xlab('')+ ylab("Odds Ratio (95% Confidence Interval)")+
    geom_errorbar(aes(ymin=conf.low,
                      ymax=conf.high,col=Group),width=0.5,cex=1)+ 
  facet_grid(~Group)+
        theme(plot.title=element_text(size=16,face="bold"),
        axis.text.y=element_text(size=10),
        axis.text.x=element_text(face="bold"),
        axis.title=element_text(size=12,face="bold"),
        strip.text.y = element_text(hjust=0,vjust = 1,angle=180,face="bold"))+
        guides(colour=FALSE)+
        coord_flip()
## Warning: geom_hline(): Ignoring `mapping` because `yintercept` was provided.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Coefficient Comparison, Training versus Full

a1$Interval=noquote(paste0(format(a1$Coefficient,nsmall=3),' (',format(a1$CI_low,nsmall=3),',', format(a1$CI_high,nsmall=3), ')'))
a2$Interval=noquote(paste0(format(a2$Coefficient,nsmall=3),' (',format(a2$CI_low,nsmall=3),',', format(a2$CI_high,nsmall=3), ')'))
newdata=cbind(a1$Parameter,a1$Interval,a2$Interval)
colnames(newdata)=c("Variable","Training Data Odds Ratio (95% CI)", "Full Data Odds Ratio (95% CI)")
myprint(newdata)
##       Variable              Training Data Odds Ratio (95% CI)
##  [1,] "(Intercept)"         "0.003 (0.002,0.003)"            
##  [2,] "Age_45_to_54"        "1.744 (1.451,2.095)"            
##  [3,] "Age_55_to_64"        "2.249 (1.886,2.682)"            
##  [4,] "Age_65"              "3.302 (2.760,3.950)"            
##  [5,] "Male"                "2.241 (2.083,2.410)"            
##  [6,] "Previous_Marriage"   "1.134 (1.052,1.223)"            
##  [7,] "Never_Married"       "0.790 (0.697,0.896)"            
##  [8,] "Veteran"             "1.125 (1.032,1.226)"            
##  [9,] "Income_LT25K"        "1.296 (1.149,1.461)"            
## [10,] "Income_LT75K"        "1.199 (1.084,1.327)"            
## [11,] "Income_DKR"          "1.137 (1.014,1.274)"            
## [12,] "Rent_Home"           "1.096 (1.006,1.194)"            
## [13,] "Retired_Unable"      "1.288 (1.186,1.398)"            
## [14,] "Pre_High_School"     "1.188 (1.056,1.336)"            
## [15,] "High_School"         "1.100 (1.009,1.198)"            
## [16,] "Post_High_School"    "1.108 (1.016,1.208)"            
## [17,] "Division_D4"         "1.114 (1.036,1.197)"            
## [18,] "Poor_Health"         "2.765 (2.405,3.179)"            
## [19,] "Fair_Health"         "2.153 (1.934,2.397)"            
## [20,] "Good_Health"         "1.526 (1.395,1.670)"            
## [21,] "Smoker"              "1.504 (1.408,1.607)"            
## [22,] "Percent_Drink"       "0.368 (0.221,0.613)"            
## [23,] "Percent_Drink_2"     "2.188 (1.277,3.746)"            
## [24,] "High_Cholesterol"    "1.648 (1.539,1.764)"            
## [25,] "High_BP"             "1.730 (1.603,1.867)"            
## [26,] "Diabetic"            "1.315 (1.220,1.416)"            
## [27,] "Poor_Health_Percent" "1.117 (1.009,1.235)"            
## [28,] "Stroke"              "2.915 (2.657,3.197)"            
## [29,] "COPD"                "1.591 (1.466,1.726)"            
## [30,] "Kidney"              "1.377 (1.238,1.532)"            
## [31,] "Arthritis"           "1.164 (1.088,1.245)"            
## [32,] "No_Doctor"           "0.837 (0.738,0.949)"            
## [33,] "Cost"                "1.225 (1.107,1.356)"            
## [34,] "No_Checkup"          "0.777 (0.690,0.874)"            
##       Full Data Odds Ratio (95% CI)
##  [1,] "0.003 (0.002,0.003)"        
##  [2,] "1.597 (1.349,1.891)"        
##  [3,] "2.102 (1.791,2.467)"        
##  [4,] "3.066 (2.606,3.607)"        
##  [5,] "2.171 (2.032,2.320)"        
##  [6,] "1.117 (1.043,1.197)"        
##  [7,] "0.796 (0.707,0.897)"        
##  [8,] "1.159 (1.073,1.253)"        
##  [9,] "1.297 (1.163,1.446)"        
## [10,] "1.218 (1.112,1.335)"        
## [11,] "1.155 (1.038,1.285)"        
## [12,] "1.078 (0.996,1.167)"        
## [13,] "1.297 (1.202,1.400)"        
## [14,] "1.232 (1.106,1.372)"        
## [15,] "1.118 (1.032,1.210)"        
## [16,] "1.091 (1.008,1.181)"        
## [17,] "1.127 (1.055,1.204)"        
## [18,] "2.882 (2.542,3.267)"        
## [19,] "2.189 (1.986,2.412)"        
## [20,] "1.548 (1.426,1.681)"        
## [21,] "1.483 (1.396,1.576)"        
## [22,] "0.382 (0.238,0.612)"        
## [23,] "2.138 (1.299,3.517)"        
## [24,] "1.653 (1.553,1.759)"        
## [25,] "1.683 (1.569,1.805)"        
## [26,] "1.331 (1.244,1.425)"        
## [27,] "1.124 (1.021,1.237)"        
## [28,] "2.950 (2.712,3.209)"        
## [29,] "1.558 (1.446,1.677)"        
## [30,] "1.362 (1.235,1.502)"        
## [31,] "1.168 (1.098,1.242)"        
## [32,] "0.853 (0.759,0.959)"        
## [33,] "1.234 (1.125,1.353)"        
## [34,] "0.807 (0.724,0.900)"
Variable Training Data Odds Ratio (95% CI) Full Data Odds Ratio (95% CI)
(Intercept) 0.003 (0.002,0.003) 0.003 (0.002,0.003)
Age_45_to_54 1.744 (1.451,2.095) 1.597 (1.349,1.891)
Age_55_to_64 2.249 (1.886,2.682) 2.102 (1.791,2.467)
Age_65 3.302 (2.760,3.950) 3.066 (2.606,3.607)
Male 2.241 (2.083,2.410) 2.171 (2.032,2.320)
Previous_Marriage 1.134 (1.052,1.223) 1.117 (1.043,1.197)
Never_Married 0.790 (0.697,0.896) 0.796 (0.707,0.897)
Veteran 1.125 (1.032,1.226) 1.159 (1.073,1.253)
Income_LT25K 1.296 (1.149,1.461) 1.297 (1.163,1.446)
Income_LT75K 1.199 (1.084,1.327) 1.218 (1.112,1.335)
Income_DKR 1.137 (1.014,1.274) 1.155 (1.038,1.285)
Rent_Home 1.096 (1.006,1.194) 1.078 (0.996,1.167)
Retired_Unable 1.288 (1.186,1.398) 1.297 (1.202,1.400)
Pre_High_School 1.188 (1.056,1.336) 1.232 (1.106,1.372)
High_School 1.100 (1.009,1.198) 1.118 (1.032,1.210)
Post_High_School 1.108 (1.016,1.208) 1.091 (1.008,1.181)
Division_D4 1.114 (1.036,1.197) 1.127 (1.055,1.204)
Poor_Health 2.765 (2.405,3.179) 2.882 (2.542,3.267)
Fair_Health 2.153 (1.934,2.397) 2.189 (1.986,2.412)
Good_Health 1.526 (1.395,1.670) 1.548 (1.426,1.681)
Smoker 1.504 (1.408,1.607) 1.483 (1.396,1.576)
Percent_Drink 0.368 (0.221,0.613) 0.382 (0.238,0.612)
Percent_Drink_2 2.188 (1.277,3.746) 2.138 (1.299,3.517)
High_Cholesterol 1.648 (1.539,1.764) 1.653 (1.553,1.759)
High_BP 1.730 (1.603,1.867) 1.683 (1.569,1.805)
Diabetic 1.315 (1.220,1.416) 1.331 (1.244,1.425)
Poor_Health_Percent 1.117 (1.009,1.235) 1.124 (1.021,1.237)
Stroke 2.915 (2.657,3.197) 2.950 (2.712,3.209)
COPD 1.591 (1.466,1.726) 1.558 (1.446,1.677)
Kidney 1.377 (1.238,1.532) 1.362 (1.235,1.502)
Arthritis 1.164 (1.088,1.245) 1.168 (1.098,1.242)
No_Doctor 0.837 (0.738,0.949) 0.853 (0.759,0.959)
Cost 1.225 (1.107,1.356) 1.234 (1.125,1.353)
No_Checkup 0.777 (0.690,0.874) 0.807 (0.724,0.900)