Preprocessing

Load Libraries

Functions

##Print
myprint=function(x, lab="") {print(x)%>%kbl(caption=lab)%>%kable_classic(html_font="Cambria")}


## VIF and Effect Sizes


mydef=function(x){ 
  pseudo=round(1-x$deviance/x$null.deviance,3)
  vif=round(max(vif(x)),3)
  VIF=vif
  R2=pseudo
  return(myprint(noquote(rbind(VIF, R2))))
  }

Read CSV, Pre-processed in Python

y2019=read.csv('D:/MI/MI.CSV', encoding="UTF-8")

Train / Test

set.seed(1234)
mys=sample(seq(1, nrow(y2019)),.2*nrow(y2019),replace=TRUE)
mytrain=y2019[-mys,]
mytest=y2019[mys,]
print(c(nrow(mytrain),nrow(mytest)))

## [1] 285856  69835

Apply Survey Weights

mydata=y2019
options(survey.lonely.psu = "adjust")
svy2019 = svydesign(id=~1,strata = ~Stratum,weights = ~Weights,data = mydata)

train2019=svydesign(id=~1,strata = ~Stratum,weights = ~Weights,data = mytrain)
test2019=svydesign(id=~1,strata = ~Stratum,weights = ~Weights,data = mytest)

Describe

Unweighted

options(scipen=9999)
library(psych)
myprint(as.data.frame(apply(y2019,2,function(x)round(mean(x),3))))

##                            apply(y2019, 2, function(x) round(mean(x), 3))
## MI                                                                  0.069
## Male                                                                0.441
## Veteran                                                             0.141
## Rent_Home                                                           0.184
## Poor_Health                                                         0.060
## Smoker                                                              0.428
## Chew_Snuff                                                          0.029
## Percent_Drink                                                       0.154
## High_Cholesterol                                                    0.398
## High_BP                                                             0.463
## High_BMI                                                            0.643
## Poor_Health_Percent                                                 0.159
## Poor_Mental_Health_Percent                                          0.114
## Depression                                                          0.183
## Stroke                                                              0.053
## Asthma                                                              0.134
## Skin_Cancer                                                         0.118
## Cancer                                                              0.120
## COPD                                                                0.097
## Kidney                                                              0.045
## Arthritis                                                           0.388
## No_Health_Plan                                                      0.068
## No_Doctor                                                           0.130
## Cost                                                                0.093
## No_Checkup                                                          0.169
## Metropolitan                                                        0.683
## Weights                                                           508.551
## Stratum                                                        297642.443
## PSU                                                        2019004649.714
## Age_65                                                              0.443
## Age_55_to_64                                                        0.241
## Age_45_to_54                                                        0.175
## Black                                                               0.074
## Hispanic                                                            0.073
## Other_Race                                                          0.064
## Previous_Marriage                                                   0.320
## Never_Married                                                       0.095
## Income_LT25K                                                        0.201
## Income_LT75K                                                        0.335
## Income_DKR                                                          0.175
## Pre_High_School                                                     0.073
## High_School                                                         0.261
## Post_High_School                                                    0.273
## Retired_Unable                                                      0.449
## Out_of_Work                                                         0.032
## Other_Not_Working                                                   0.058
## Fair_Health                                                         0.153
## Good_Health                                                         0.324
## Poor_Exercise                                                       0.279
## Percent_Drink_2                                                     0.101
## Diabetic                                                            0.161
## Prediabetic                                                         0.025
## Division_D1                                                         0.117
## Division_D2                                                         0.049
## Division_D3                                                         0.105
## Division_D4                                                         0.171
## Division_D5                                                         0.182
## Division_D6                                                         0.063
## Division_D7                                                         0.069
## Division_D8                                                         0.131
## Division_D9                                                         0.095


	apply(y2019, 2, function(x) round(mean(x), 3))
MI	0.069
Male	0.441
Veteran	0.141
Rent_Home	0.184
Poor_Health	0.060
Smoker	0.428
Chew_Snuff	0.029
Percent_Drink	0.154
High_Cholesterol	0.398
High_BP	0.463
High_BMI	0.643
Poor_Health_Percent	0.159
Poor_Mental_Health_Percent	0.114
Depression	0.183
Stroke	0.053
Asthma	0.134
Skin_Cancer	0.118
Cancer	0.120
COPD	0.097
Kidney	0.045
Arthritis	0.388
No_Health_Plan	0.068
No_Doctor	0.130
Cost	0.093
No_Checkup	0.169
Metropolitan	0.683
Weights	508.551
Stratum	297642.443
PSU	2019004649.714
Age_65	0.443
Age_55_to_64	0.241
Age_45_to_54	0.175
Black	0.074
Hispanic	0.073
Other_Race	0.064
Previous_Marriage	0.320
Never_Married	0.095
Income_LT25K	0.201
Income_LT75K	0.335
Income_DKR	0.175
Pre_High_School	0.073
High_School	0.261
Post_High_School	0.273
Retired_Unable	0.449
Out_of_Work	0.032
Other_Not_Working	0.058
Fair_Health	0.153
Good_Health	0.324
Poor_Exercise	0.279
Percent_Drink_2	0.101
Diabetic	0.161
Prediabetic	0.025
Division_D1	0.117
Division_D2	0.049
Division_D3	0.105
Division_D4	0.171
Division_D5	0.182
Division_D6	0.063
Division_D7	0.069
Division_D8	0.131
Division_D9	0.095

Descriptive Charts, Weighted Data

myd=matrix(rep(0,61*4), 61,4)

for (i in 1:length(svy2019$variables)){
myd[i,1]=colnames(svy2019$variables[i])
myd[i,2]=round(svymean(~svy2019$variables[,i], svy2019),3)
myd[i,3]=round(svyvar(~svy2019$variables[,i], svy2019)^.5,3)
myd[i,4]=round(svytotal(~svy2019$variables[,i], svy2019)/1000000,3)
}
colnames(myd)=c('Variable', 'Mean', 'SD','Total in Millions')
myd=noquote(myd)
myd%>%kbl()%>%kable_classic(html_font='Cambria')

Variable	Mean	SD	Total in Millions
MI	0.057	0.232	10.173
Male	0.477	0.499	84.748
Veteran	0.12	0.325	21.314
Rent_Home	0.189	0.392	33.6
Poor_Health	0.06	0.237	10.588
Smoker	0.42	0.493	74.505
Chew_Snuff	0.03	0.17	5.265
Percent_Drink	0.147	0.266	26.161
High_Cholesterol	0.37	0.483	65.677
High_BP	0.418	0.493	74.138
High_BMI	0.646	0.478	114.753
Poor_Health_Percent	0.152	0.304	27.059
Poor_Mental_Health_Percent	0.121	0.266	21.417
Depression	0.177	0.382	31.455
Stroke	0.046	0.209	8.141
Asthma	0.133	0.34	23.635
Skin_Cancer	0.089	0.284	15.747
Cancer	0.095	0.293	16.834
COPD	0.084	0.277	14.896
Kidney	0.04	0.196	7.138
Arthritis	0.329	0.47	58.432
No_Health_Plan	0.102	0.303	18.099
No_Doctor	0.165	0.371	29.254
Cost	0.116	0.32	20.577
No_Checkup	0.195	0.396	34.538
Metropolitan	0.84	0.366	149.215
Weights	2236.452	3387.957	397134.892
Stratum	280247.348	168454.592	49764539.418
PSU	2019005190.677	3748.729	358522084248.168
Age_65	0.305	0.46	54.097
Age_55_to_64	0.235	0.424	41.692
Age_45_to_54	0.229	0.42	40.64
Black	0.114	0.318	20.312
Hispanic	0.149	0.356	26.413
Other_Race	0.075	0.263	13.318
Previous_Marriage	0.26	0.439	46.174
Never_Married	0.107	0.309	19.047
Income_LT25K	0.203	0.402	36.034
Income_LT75K	0.314	0.464	55.691
Income_DKR	0.165	0.371	29.342
Pre_High_School	0.136	0.343	24.22
High_School	0.26	0.438	46.113
Post_High_School	0.295	0.456	52.352
Retired_Unable	0.35	0.477	62.149
Out_of_Work	0.042	0.2	7.382
Other_Not_Working	0.071	0.257	12.652
Fair_Health	0.159	0.366	28.238
Good_Health	0.327	0.469	58.047
Poor_Exercise	0.274	0.446	48.722
Percent_Drink_2	0.093	0.241	16.428
Diabetic	0.15	0.358	26.721
Prediabetic	0.026	0.159	4.623
Division_D1	0.049	0.215	8.619
Division_D2	0.102	0.303	18.186
Division_D3	0.146	0.353	25.992
Division_D4	0.065	0.247	11.622
Division_D5	0.211	0.408	37.379
Division_D6	0.06	0.237	10.568
Division_D7	0.118	0.323	20.95
Division_D8	0.075	0.263	13.244
Division_D9	0.163	0.37	28.99

Quasi-Binomial Analysis

Model 1: Demographics (D)

myformula1=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Black+Hispanic+Other_Race+Previous_Marriage+Never_Married+
            Veteran"))

m1 = survey::svyglm(myformula1,design=train2019,
                    family=quasibinomial,maxit = 100)

pm1=plot_model(m1, main="1. Demographics", show.values=TRUE, show.p=TRUE, value.offset=.4)
mydef(m1)

##      [,1]
## VIF 7.158
## R2  0.083


VIF	7.158
R2	0.083

pm1

Model 2: D + Socioeconomics (S)

myformula2=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+Out_of_Work+Other_Not_Working+
            Pre_High_School+High_School+Post_High_School"))

m2 = survey::svyglm(myformula2,design=train2019,
                    family=quasibinomial,maxit = 100)

mydef(m2)

##      [,1]
## VIF 8.332
## R2  0.117


VIF	8.332
R2	0.117

pm2=plot_model(m2, main="2. Socioeconomics", show.values=TRUE, show.p=TRUE, value.offset=.4)
pm2

Model 3: D + S + Geography (G)

myformula3=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+Out_of_Work+Other_Not_Working+
            Pre_High_School+High_School+Post_High_School+
            Division_D1+Division_D2+Division_D3+Division_D4+Division_D5+
            Division_D6+Division_D7+Division_D8+Division_D9+Metropolitan"))

m3 = survey::svyglm(myformula3,design=train2019,
                    family=quasibinomial,maxit = 100)
mydef(m3)

##      [,1]
## VIF 8.417
## R2  0.118


VIF	8.417
R2	0.118

pm3=plot_model(m3, main="3. Geography", show.values=TRUE, show.p=TRUE, value.offset=.4)
pm3

Model 4: D + S + G + Behavioral / Risk (B)

myformula4=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+
            Pre_High_School+High_School+Post_High_School+
            Division_D3+Division_D4+
            Division_D6+Division_D7+Metropolitan+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Chew_Snuff+Poor_Exercise+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+High_BMI+
            Diabetic+Prediabetic+Poor_Health_Percent+
            Poor_Mental_Health_Percent+Depression+
            Stroke+Asthma+Skin_Cancer+Cancer+COPD+Kidney+Arthritis"))

m4 = survey::svyglm(myformula4,design=train2019,
                    family=quasibinomial,maxit = 100)
mydef(m4)

##       [,1]
## VIF 18.044
## R2   0.209


VIF	18.044
R2	0.209

pm4=plot_model(m4, main="4. Behavior", show.values=TRUE, show.p=TRUE, value.offset=.4)
pm4

Model 5: D + S + G + B + Access (A)

myformula5=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+
            Pre_High_School+High_School+Post_High_School+
            Division_D4+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+
            Diabetic+Poor_Health_Percent+
            Stroke+COPD+Kidney+Arthritis+
            No_Health_Plan+No_Doctor+Cost+No_Checkup"))

m5 = survey::svyglm(myformula5,design=train2019,
                    family=quasibinomial,maxit = 100)
mydef(m5)

##       [,1]
## VIF 17.605
## R2   0.210


VIF	17.605
R2	0.210

pm5=plot_model(m5, main="5. Access",show.values=TRUE, show.p=TRUE, value.offset=.4)
pm5

Model 6: Significant Variables Only

myformula6=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Previous_Marriage+Never_Married+Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+
            Pre_High_School+High_School+Post_High_School+
            Division_D4+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+
            Diabetic+Poor_Health_Percent+
            Stroke+COPD+Kidney+Arthritis+
            No_Doctor+Cost+No_Checkup"))

m6 = survey::svyglm(myformula6,design=train2019,
                    family=quasibinomial,maxit = 500)
mydef(m6)

##       [,1]
## VIF 17.602
## R2   0.210


VIF	17.602
R2	0.210

pm6=plot_model(m6, main="6. Total, Only Significant",show.values=TRUE, show.p=TRUE,)
pm6

Model 7: All Variables

myformula7=as.formula(paste("MI","~Age_45_to_54+Age_55_to_64+Age_65+Male+
            Black+Hispanic+Other_Race+Previous_Marriage+Never_Married+
            Veteran+
            Income_LT25K+Income_LT75K+Income_DKR+Rent_Home+
            Retired_Unable+Out_of_Work+Other_Not_Working+
            Pre_High_School+High_School+Post_High_School+
            Division_D1+Division_D2+Division_D3+Division_D4+Division_D5+
            Division_D6+Division_D7+Division_D8+Division_D9+Metropolitan+
            Poor_Health+Fair_Health+Good_Health+
            Smoker+Chew_Snuff+Poor_Exercise+Percent_Drink+Percent_Drink_2+
            High_Cholesterol+High_BP+High_BMI+
            Diabetic+Prediabetic+Poor_Health_Percent+
            Poor_Mental_Health_Percent+Depression+
            Stroke+Asthma+Skin_Cancer+Cancer+COPD+Kidney+Arthritis+
            No_Health_Plan+No_Doctor+Cost+No_Checkup"))

m7 = survey::svyglm(myformula7,design=train2019,
                    family=quasibinomial,maxit = 500)
mydef(m7)

##       [,1]
## VIF 18.156
## R2   0.212


VIF	18.156
R2	0.212

pm7=plot_model(m7, main="7. Total, All Variables", show.values=TRUE, show.p=TRUE,)
pm7

Optimal Cut Points for Quasibinomial and Confusion Matrix on Unseen Data

cutpoints=predict(m6, mytest, type='response')
for (i in 1:length(cutpoints)) {
    if (cutpoints[i]>.17){cutpoints[i]=1}else{cutpoints[i]=0}} #.17 m6


print(noquote(c('F1 Score, No MI: ',F1_Score(cutpoints, mytest$MI, positive='0'))))

## [1] F1 Score, No MI:  0.939864247385317

print(noquote(c('Precision, No MI: ',Precision(cutpoints, mytest$MI, positive='0'))))

## [1] Precision, No MI:  0.925941886234917

print(noquote(c('Recall, No MI: ',Recall(cutpoints, mytest$MI, positive='0'))))

## [1] Recall, No MI:    0.954211669918003

print(noquote(c('F1 Score, MI: ',F1_Score(cutpoints, mytest$MI, positive='1'))))

## [1] F1 Score, MI:     0.338744309885768

print(noquote(c('Precision, MI: ',Precision(cutpoints, mytest$MI, positive='1'))))

## [1] Precision, MI:    0.405844824037868

print(noquote(c('Recall, MI: ',Recall(cutpoints, mytest$MI, positive='1'))))

## [1] Recall, MI:       0.290683962264151

print(noquote(c('Accuracy: ',Accuracy(cutpoints, mytest$MI))))

## [1] Accuracy:         0.889754421135534

print(noquote(c('Pseudo-R2: ', 1-m6$devianc/m6$null.deviance)))

## [1] Pseudo-R2:        0.210038168165197

Plots

Group Data for Plotting

t1=pm1$data[,c(1,2,5,6)]
t1$Group=rep("1. Demography", nrow(t1))
t2=pm2$data[,c(1,2,5,6)]
t2$Group=rep("2. SES", nrow(t2))
t3=pm3$data[,c(1,2,5,6)]
t3$Group=rep("3. Geog.", nrow(t3))
t4=pm4$data[,c(1,2,5,6)]
t4$Group=rep("4. Behavior", nrow(t4))
t5=pm5$data[,c(1,2,5,6)]
t5$Group=rep("5. Access", nrow(t5))
t6=pm6$data[,c(1,2,5,6)]
t6$Group=rep("6. Significant Only", nrow(t6))
t7=pm7$data[,c(1,2,5,6)]
t7$Group=rep("7. All Variables", nrow(t7))

ttot=rbind(t1,t2,t3,t4,t5,t6,t7)
ttot$Group=as.factor(ttot$Group)

Plot

ggplot(data=ttot,
    aes(x = term,y = estimate, ymin = .5, ymax = 2.0 ))+
    geom_point(aes(col=Group))+
    geom_hline(aes(fill=Group),yintercept=1, linetype=2)+
    xlab('')+ ylab("Odds Ratio (95% Confidence Interval)")+
    geom_errorbar(aes(ymin=conf.low,
                      ymax=conf.high,col=Group),width=0.5,cex=1)+ 
  facet_grid(~Group)+
        theme(plot.title=element_text(size=16,face="bold"),
        axis.text.y=element_text(size=8),
        axis.text.x=element_text(size=8,face="bold", angle=90),
        axis.title=element_text(size=8,face="bold"),
        strip.text.y = element_text(hjust=0,vjust = 1,angle=180,face="bold"))+
        guides(colour=FALSE)+
        coord_flip()

## Warning: geom_hline(): Ignoring `mapping` because `yintercept` was provided.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Table of Predictors for Complete Model

a1=model_parameters(m6, df_method='wald')
a1$CI_high=round(exp(a1$CI_high),3)
a1$CI_low=round(exp(a1$CI_low),3)
a1$Coefficient=round(exp(a1$Coefficient),3)
a1$p=round(a1$p,3)
a1$SE=NULL
a1$t=NULL
a1$df_error=NULL
a1$CI=NULL
myprint(a1)

## Parameter           | Log-Odds |       95% CI |      p
## ------------------------------------------------------
## (Intercept)         | 3.00e-03 | [0.00, 0.00] | < .001
## Age_45_to_54        |     1.74 | [1.45, 2.10] | < .001
## Age_55_to_64        |     2.25 | [1.89, 2.68] | < .001
## Age_65              |     3.30 | [2.76, 3.95] | < .001
## Male                |     2.24 | [2.08, 2.41] | < .001
## Previous_Marriage   |     1.13 | [1.05, 1.22] | 0.001 
## Never_Married       |     0.79 | [0.70, 0.90] | < .001
## Veteran             |     1.12 | [1.03, 1.23] | 0.008 
## Income_LT25K        |     1.30 | [1.15, 1.46] | < .001
## Income_LT75K        |     1.20 | [1.08, 1.33] | < .001
## Income_DKR          |     1.14 | [1.01, 1.27] | 0.028 
## Rent_Home           |     1.10 | [1.01, 1.19] | 0.037 
## Retired_Unable      |     1.29 | [1.19, 1.40] | < .001
## Pre_High_School     |     1.19 | [1.06, 1.34] | 0.004 
## High_School         |     1.10 | [1.01, 1.20] | 0.030 
## Post_High_School    |     1.11 | [1.02, 1.21] | 0.020 
## Division_D4         |     1.11 | [1.04, 1.20] | 0.004 
## Poor_Health         |     2.77 | [2.40, 3.18] | < .001
## Fair_Health         |     2.15 | [1.93, 2.40] | < .001
## Good_Health         |     1.53 | [1.40, 1.67] | < .001
## Smoker              |     1.50 | [1.41, 1.61] | < .001
## Percent_Drink       |     0.37 | [0.22, 0.61] | < .001
## Percent_Drink_2     |     2.19 | [1.28, 3.75] | 0.004 
## High_Cholesterol    |     1.65 | [1.54, 1.76] | < .001
## High_BP             |     1.73 | [1.60, 1.87] | < .001
## Diabetic            |     1.31 | [1.22, 1.42] | < .001
## Poor_Health_Percent |     1.12 | [1.01, 1.24] | 0.032 
## Stroke              |     2.92 | [2.66, 3.20] | < .001
## COPD                |     1.59 | [1.47, 1.73] | < .001
## Kidney              |     1.38 | [1.24, 1.53] | < .001
## Arthritis           |     1.16 | [1.09, 1.25] | < .001
## No_Doctor           |     0.84 | [0.74, 0.95] | 0.005 
## Cost                |     1.23 | [1.11, 1.36] | < .001
## No_Checkup          |     0.78 | [0.69, 0.87] | < .001


Parameter	Coefficient	CI_low	CI_high	p
(Intercept)	0.003	0.002	0.003	0.000
Age_45_to_54	1.744	1.451	2.095	0.000
Age_55_to_64	2.249	1.886	2.682	0.000
Age_65	3.302	2.760	3.950	0.000
Male	2.241	2.083	2.410	0.000
Previous_Marriage	1.134	1.052	1.223	0.001
Never_Married	0.790	0.697	0.896	0.000
Veteran	1.125	1.032	1.226	0.008
Income_LT25K	1.296	1.149	1.461	0.000
Income_LT75K	1.199	1.084	1.327	0.000
Income_DKR	1.137	1.014	1.274	0.028
Rent_Home	1.096	1.006	1.194	0.037
Retired_Unable	1.288	1.186	1.398	0.000
Pre_High_School	1.188	1.056	1.336	0.004
High_School	1.100	1.009	1.198	0.030
Post_High_School	1.108	1.016	1.208	0.020
Division_D4	1.114	1.036	1.197	0.004
Poor_Health	2.765	2.405	3.179	0.000
Fair_Health	2.153	1.934	2.397	0.000
Good_Health	1.526	1.395	1.670	0.000
Smoker	1.504	1.408	1.607	0.000
Percent_Drink	0.368	0.221	0.613	0.000
Percent_Drink_2	2.188	1.277	3.746	0.004
High_Cholesterol	1.648	1.539	1.764	0.000
High_BP	1.730	1.603	1.867	0.000
Diabetic	1.315	1.220	1.416	0.000
Poor_Health_Percent	1.117	1.009	1.235	0.032
Stroke	2.915	2.657	3.197	0.000
COPD	1.591	1.466	1.726	0.000
Kidney	1.377	1.238	1.532	0.000
Arthritis	1.164	1.088	1.245	0.000
No_Doctor	0.837	0.738	0.949	0.005
Cost	1.225	1.107	1.356	0.000
No_Checkup	0.777	0.690	0.874	0.000

Analysis on Full Data Set

m8=survey::svyglm(myformula6,design=svy2019,
                    family=quasibinomial,maxit = 500) #significant var. only

a2=model_parameters(m8, df_method='wald')

a2$CI_high=round(exp(a2$CI_high),3)
a2$CI_low=round(exp(a2$CI_low),3)
a2$Coefficient=round(exp(a2$Coefficient),3)
a2$p=round(a2$p,3)
a2$SE=NULL
a2$t=NULL
a2$df_error=NULL
a2$CI=NULL

myprint(a2)

## Parameter           | Log-Odds |       95% CI |      p
## ------------------------------------------------------
## (Intercept)         | 3.00e-03 | [0.00, 0.00] | < .001
## Age_45_to_54        |     1.60 | [1.35, 1.89] | < .001
## Age_55_to_64        |     2.10 | [1.79, 2.47] | < .001
## Age_65              |     3.07 | [2.61, 3.61] | < .001
## Male                |     2.17 | [2.03, 2.32] | < .001
## Previous_Marriage   |     1.12 | [1.04, 1.20] | 0.002 
## Never_Married       |     0.80 | [0.71, 0.90] | < .001
## Veteran             |     1.16 | [1.07, 1.25] | < .001
## Income_LT25K        |     1.30 | [1.16, 1.45] | < .001
## Income_LT75K        |     1.22 | [1.11, 1.33] | < .001
## Income_DKR          |     1.16 | [1.04, 1.28] | 0.008 
## Rent_Home           |     1.08 | [1.00, 1.17] | 0.063 
## Retired_Unable      |     1.30 | [1.20, 1.40] | < .001
## Pre_High_School     |     1.23 | [1.11, 1.37] | < .001
## High_School         |     1.12 | [1.03, 1.21] | 0.006 
## Post_High_School    |     1.09 | [1.01, 1.18] | 0.031 
## Division_D4         |     1.13 | [1.05, 1.20] | < .001
## Poor_Health         |     2.88 | [2.54, 3.27] | < .001
## Fair_Health         |     2.19 | [1.99, 2.41] | < .001
## Good_Health         |     1.55 | [1.43, 1.68] | < .001
## Smoker              |     1.48 | [1.40, 1.58] | < .001
## Percent_Drink       |     0.38 | [0.24, 0.61] | < .001
## Percent_Drink_2     |     2.14 | [1.30, 3.52] | 0.003 
## High_Cholesterol    |     1.65 | [1.55, 1.76] | < .001
## High_BP             |     1.68 | [1.57, 1.80] | < .001
## Diabetic            |     1.33 | [1.24, 1.43] | < .001
## Poor_Health_Percent |     1.12 | [1.02, 1.24] | 0.017 
## Stroke              |     2.95 | [2.71, 3.21] | < .001
## COPD                |     1.56 | [1.45, 1.68] | < .001
## Kidney              |     1.36 | [1.24, 1.50] | < .001
## Arthritis           |     1.17 | [1.10, 1.24] | < .001
## No_Doctor           |     0.85 | [0.76, 0.96] | 0.008 
## Cost                |     1.23 | [1.12, 1.35] | < .001
## No_Checkup          |     0.81 | [0.72, 0.90] | < .001


Parameter	Coefficient	CI_low	CI_high	p
(Intercept)	0.003	0.002	0.003	0.000
Age_45_to_54	1.597	1.349	1.891	0.000
Age_55_to_64	2.102	1.791	2.467	0.000
Age_65	3.066	2.606	3.607	0.000
Male	2.171	2.032	2.320	0.000
Previous_Marriage	1.117	1.043	1.197	0.002
Never_Married	0.796	0.707	0.897	0.000
Veteran	1.159	1.073	1.253	0.000
Income_LT25K	1.297	1.163	1.446	0.000
Income_LT75K	1.218	1.112	1.335	0.000
Income_DKR	1.155	1.038	1.285	0.008
Rent_Home	1.078	0.996	1.167	0.063
Retired_Unable	1.297	1.202	1.400	0.000
Pre_High_School	1.232	1.106	1.372	0.000
High_School	1.118	1.032	1.210	0.006
Post_High_School	1.091	1.008	1.181	0.031
Division_D4	1.127	1.055	1.204	0.000
Poor_Health	2.882	2.542	3.267	0.000
Fair_Health	2.189	1.986	2.412	0.000
Good_Health	1.548	1.426	1.681	0.000
Smoker	1.483	1.396	1.576	0.000
Percent_Drink	0.382	0.238	0.612	0.000
Percent_Drink_2	2.138	1.299	3.517	0.003
High_Cholesterol	1.653	1.553	1.759	0.000
High_BP	1.683	1.569	1.805	0.000
Diabetic	1.331	1.244	1.425	0.000
Poor_Health_Percent	1.124	1.021	1.237	0.017
Stroke	2.950	2.712	3.209	0.000
COPD	1.558	1.446	1.677	0.000
Kidney	1.362	1.235	1.502	0.000
Arthritis	1.168	1.098	1.242	0.000
No_Doctor	0.853	0.759	0.959	0.008
Cost	1.234	1.125	1.353	0.000
No_Checkup	0.807	0.724	0.900	0.000

Forest Plots of Training Set Estimates and Complete Set Estimates

pm8=plot_model(m6, main="1. Training Set, Significant Variables")
pm9=plot_model(m8, main="2. Full Set, Significant Variables")

t8=pm8$data[,c(1,2,5,6)]
t8$Group=rep("1. Training Set, Significant Variables", nrow(t8))
t9=pm9$data[,c(1,2,5,6)]
t9$Group=rep("2. Full Set, Significant Variables", nrow(t9))


ttot2=rbind(t8,t9)
ttot2$Group=as.factor(ttot2$Group)

ggplot(data=ttot2,
    aes(x = term,y = estimate, ymin = .5, ymax = 2.0 ))+
    geom_point(aes(col=Group))+
    geom_hline(aes(fill=Group),yintercept =1, linetype=2)+
    xlab('')+ ylab("Odds Ratio (95% Confidence Interval)")+
    geom_errorbar(aes(ymin=conf.low,
                      ymax=conf.high,col=Group),width=0.5,cex=1)+ 
  facet_grid(~Group)+
        theme(plot.title=element_text(size=16,face="bold"),
        axis.text.y=element_text(size=10),
        axis.text.x=element_text(face="bold"),
        axis.title=element_text(size=12,face="bold"),
        strip.text.y = element_text(hjust=0,vjust = 1,angle=180,face="bold"))+
        guides(colour=FALSE)+
        coord_flip()

## Warning: geom_hline(): Ignoring `mapping` because `yintercept` was provided.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Coefficient Comparison, Training versus Full

a1$Interval=noquote(paste0(format(a1$Coefficient,nsmall=3),' (',format(a1$CI_low,nsmall=3),',', format(a1$CI_high,nsmall=3), ')'))
a2$Interval=noquote(paste0(format(a2$Coefficient,nsmall=3),' (',format(a2$CI_low,nsmall=3),',', format(a2$CI_high,nsmall=3), ')'))
newdata=cbind(a1$Parameter,a1$Interval,a2$Interval)
colnames(newdata)=c("Variable","Training Data Odds Ratio (95% CI)", "Full Data Odds Ratio (95% CI)")
myprint(newdata)

##       Variable              Training Data Odds Ratio (95% CI)
##  [1,] "(Intercept)"         "0.003 (0.002,0.003)"            
##  [2,] "Age_45_to_54"        "1.744 (1.451,2.095)"            
##  [3,] "Age_55_to_64"        "2.249 (1.886,2.682)"            
##  [4,] "Age_65"              "3.302 (2.760,3.950)"            
##  [5,] "Male"                "2.241 (2.083,2.410)"            
##  [6,] "Previous_Marriage"   "1.134 (1.052,1.223)"            
##  [7,] "Never_Married"       "0.790 (0.697,0.896)"            
##  [8,] "Veteran"             "1.125 (1.032,1.226)"            
##  [9,] "Income_LT25K"        "1.296 (1.149,1.461)"            
## [10,] "Income_LT75K"        "1.199 (1.084,1.327)"            
## [11,] "Income_DKR"          "1.137 (1.014,1.274)"            
## [12,] "Rent_Home"           "1.096 (1.006,1.194)"            
## [13,] "Retired_Unable"      "1.288 (1.186,1.398)"            
## [14,] "Pre_High_School"     "1.188 (1.056,1.336)"            
## [15,] "High_School"         "1.100 (1.009,1.198)"            
## [16,] "Post_High_School"    "1.108 (1.016,1.208)"            
## [17,] "Division_D4"         "1.114 (1.036,1.197)"            
## [18,] "Poor_Health"         "2.765 (2.405,3.179)"            
## [19,] "Fair_Health"         "2.153 (1.934,2.397)"            
## [20,] "Good_Health"         "1.526 (1.395,1.670)"            
## [21,] "Smoker"              "1.504 (1.408,1.607)"            
## [22,] "Percent_Drink"       "0.368 (0.221,0.613)"            
## [23,] "Percent_Drink_2"     "2.188 (1.277,3.746)"            
## [24,] "High_Cholesterol"    "1.648 (1.539,1.764)"            
## [25,] "High_BP"             "1.730 (1.603,1.867)"            
## [26,] "Diabetic"            "1.315 (1.220,1.416)"            
## [27,] "Poor_Health_Percent" "1.117 (1.009,1.235)"            
## [28,] "Stroke"              "2.915 (2.657,3.197)"            
## [29,] "COPD"                "1.591 (1.466,1.726)"            
## [30,] "Kidney"              "1.377 (1.238,1.532)"            
## [31,] "Arthritis"           "1.164 (1.088,1.245)"            
## [32,] "No_Doctor"           "0.837 (0.738,0.949)"            
## [33,] "Cost"                "1.225 (1.107,1.356)"            
## [34,] "No_Checkup"          "0.777 (0.690,0.874)"            
##       Full Data Odds Ratio (95% CI)
##  [1,] "0.003 (0.002,0.003)"        
##  [2,] "1.597 (1.349,1.891)"        
##  [3,] "2.102 (1.791,2.467)"        
##  [4,] "3.066 (2.606,3.607)"        
##  [5,] "2.171 (2.032,2.320)"        
##  [6,] "1.117 (1.043,1.197)"        
##  [7,] "0.796 (0.707,0.897)"        
##  [8,] "1.159 (1.073,1.253)"        
##  [9,] "1.297 (1.163,1.446)"        
## [10,] "1.218 (1.112,1.335)"        
## [11,] "1.155 (1.038,1.285)"        
## [12,] "1.078 (0.996,1.167)"        
## [13,] "1.297 (1.202,1.400)"        
## [14,] "1.232 (1.106,1.372)"        
## [15,] "1.118 (1.032,1.210)"        
## [16,] "1.091 (1.008,1.181)"        
## [17,] "1.127 (1.055,1.204)"        
## [18,] "2.882 (2.542,3.267)"        
## [19,] "2.189 (1.986,2.412)"        
## [20,] "1.548 (1.426,1.681)"        
## [21,] "1.483 (1.396,1.576)"        
## [22,] "0.382 (0.238,0.612)"        
## [23,] "2.138 (1.299,3.517)"        
## [24,] "1.653 (1.553,1.759)"        
## [25,] "1.683 (1.569,1.805)"        
## [26,] "1.331 (1.244,1.425)"        
## [27,] "1.124 (1.021,1.237)"        
## [28,] "2.950 (2.712,3.209)"        
## [29,] "1.558 (1.446,1.677)"        
## [30,] "1.362 (1.235,1.502)"        
## [31,] "1.168 (1.098,1.242)"        
## [32,] "0.853 (0.759,0.959)"        
## [33,] "1.234 (1.125,1.353)"        
## [34,] "0.807 (0.724,0.900)"


Variable	Training Data Odds Ratio (95% CI)	Full Data Odds Ratio (95% CI)
(Intercept)	0.003 (0.002,0.003)	0.003 (0.002,0.003)
Age_45_to_54	1.744 (1.451,2.095)	1.597 (1.349,1.891)
Age_55_to_64	2.249 (1.886,2.682)	2.102 (1.791,2.467)
Age_65	3.302 (2.760,3.950)	3.066 (2.606,3.607)
Male	2.241 (2.083,2.410)	2.171 (2.032,2.320)
Previous_Marriage	1.134 (1.052,1.223)	1.117 (1.043,1.197)
Never_Married	0.790 (0.697,0.896)	0.796 (0.707,0.897)
Veteran	1.125 (1.032,1.226)	1.159 (1.073,1.253)
Income_LT25K	1.296 (1.149,1.461)	1.297 (1.163,1.446)
Income_LT75K	1.199 (1.084,1.327)	1.218 (1.112,1.335)
Income_DKR	1.137 (1.014,1.274)	1.155 (1.038,1.285)
Rent_Home	1.096 (1.006,1.194)	1.078 (0.996,1.167)
Retired_Unable	1.288 (1.186,1.398)	1.297 (1.202,1.400)
Pre_High_School	1.188 (1.056,1.336)	1.232 (1.106,1.372)
High_School	1.100 (1.009,1.198)	1.118 (1.032,1.210)
Post_High_School	1.108 (1.016,1.208)	1.091 (1.008,1.181)
Division_D4	1.114 (1.036,1.197)	1.127 (1.055,1.204)
Poor_Health	2.765 (2.405,3.179)	2.882 (2.542,3.267)
Fair_Health	2.153 (1.934,2.397)	2.189 (1.986,2.412)
Good_Health	1.526 (1.395,1.670)	1.548 (1.426,1.681)
Smoker	1.504 (1.408,1.607)	1.483 (1.396,1.576)
Percent_Drink	0.368 (0.221,0.613)	0.382 (0.238,0.612)
Percent_Drink_2	2.188 (1.277,3.746)	2.138 (1.299,3.517)
High_Cholesterol	1.648 (1.539,1.764)	1.653 (1.553,1.759)
High_BP	1.730 (1.603,1.867)	1.683 (1.569,1.805)
Diabetic	1.315 (1.220,1.416)	1.331 (1.244,1.425)
Poor_Health_Percent	1.117 (1.009,1.235)	1.124 (1.021,1.237)
Stroke	2.915 (2.657,3.197)	2.950 (2.712,3.209)
COPD	1.591 (1.466,1.726)	1.558 (1.446,1.677)
Kidney	1.377 (1.238,1.532)	1.362 (1.235,1.502)
Arthritis	1.164 (1.088,1.245)	1.168 (1.098,1.242)
No_Doctor	0.837 (0.738,0.949)	0.853 (0.759,0.959)
Cost	1.225 (1.107,1.356)	1.234 (1.125,1.353)
No_Checkup	0.777 (0.690,0.874)	0.807 (0.724,0.900)

MI

Team LAD

2021

Preprocessing

Load Libraries

Functions

Read CSV, Pre-processed in Python

Train / Test

Apply Survey Weights

Describe

Unweighted

Descriptive Charts, Weighted Data

Quasi-Binomial Analysis

Model 1: Demographics (D)

Model 2: D + Socioeconomics (S)

Model 3: D + S + Geography (G)

Model 4: D + S + G + Behavioral / Risk (B)

Model 5: D + S + G + B + Access (A)

Model 6: Significant Variables Only

Model 7: All Variables

Optimal Cut Points for Quasibinomial and Confusion Matrix on Unseen Data

Plots

Group Data for Plotting

Plot

Table of Predictors for Complete Model

Analysis on Full Data Set

Forest Plots of Training Set Estimates and Complete Set Estimates

Coefficient Comparison, Training versus Full