library(here); library(haven); library(janitor)
library(naniar); library(broom); library(rms)
library(leaps); library(skimr); library(simputation)
library(readxl); library(knitr); library(kableExtra)
library(caret); library(modelr); library(tidyverse)

skim_with(numeric = list(hist = NULL),
          integer = list(hist = NULL))

1 Data Management

1.1 Load Raw Data

omas17_raw <- read_sas(here("data", "omas_2017_puf.sas7bdat"))
dim(omas17_raw)
[1] 39711   370

1.2 Inclusions and Exclusions

We’re going to restrict the raw data set to people who meet the following criteria:

  • Responded YES (1) to “Are you covered by health insurance or some other type of health care plan?” (A1 is 1)
  • Responded YES (1) to at least one of the following four questions:
    • Are you covered by a health insurance plan through a current or former employer or union? (B4A is 1)
    • Are you covered by Medicare, the Federal government-funded health insurance plan for people 65 years and older or with certain disabilities? (B4B is 1)
    • Are you covered by Medicaid, the State of Ohio government health care program? (B4C is 1)
    • Are you covered by health insurance purchased directly, that is, a private plan not related to a current or past employment? (B4E is 1)
  • Response regarding height in inches was greater than 47 and less than 84 (D30BINC)
  • Response regarding weight in pounds (D30A_UNIT is 1) was at least 60 and less than 450 (D30A_VALUE)
omas17_cleaning <- omas17_raw %>% clean_names() %>%
    # inclusion and exclusion criteria
    filter(a1 == 1, 
           b4a == 1 | b4b == 1 | b4c == 1 | b4e == 1,
           d30binc > 47 & d30binc < 84,
           d30a_unit == 1,
           d30a_value > 59 & d30a_value < 450)
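
As a quick check on how many respondents meet these criteria, we can compare the row counts before and after filtering (a sketch; the counts aren't shown here):

nrow(omas17_raw); nrow(omas17_cleaning)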

1.3 Initial Name Changes

Next, we’ll rename the variables we plan to study, giving them shorter, more meaningful names. (We’ll calculate BMI and select our analysis variables in later steps.)

omas17_cleaning <- omas17_cleaning %>%
    # name changes and minor transformations
    mutate(region = s9_region, 
           county_type = s9_type_imp,
           sroh = d30_imp, 
           age = age_a_imp,
           sex = s15,
           race_eth = race5_a_imp,
           usual_care = usual_a,
           ins_employer = b4a,
           ins_medicare = b4b,
           ins_medicaid = b4c,
           ins_private = b4e,
           mental31 = d30i,
           education = educ_imp,
           poverty = fpl_cat_18,
           ht_in = d30binc,
           wt_lb = d30a_value,
           smoke_100 = d45,
           alcohol30 = d46,
           er_visits = e62,
           job_today = g71)

1.4 Changing “Don’t Know” and “Refused” to NA with naniar

Now, we tell R that the 97, 98 and 99 values are in fact missing values, except for those we see in the wt_lb variable (where 97-99 are plausible weights in pounds). We’ll use the replace_with_na_at function from the naniar package.

omas17_cleaning <- omas17_cleaning %>%
    replace_with_na_at(.vars = c("alcohol30", "avoid_care",
                                 "ins_employer", "ins_medicare",
                                 "ins_medicaid", "ins_private",
                                 "er_visits", "food_12mo",
                                 "job_today", "marital",
                                 "mental31", "sex", 
                                 "smoke_100", "usual_care"),
                       condition = ~.x %in% c(97, 98, 99))
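
To double-check that none of those codes remain, we could count them directly in a few of the affected variables (a sketch; the variable list is just an illustrative subset, and the output isn't shown):

# count any remaining 97/98/99 codes in each selected variable
omas17_cleaning %>%
    select(alcohol30, er_visits, mental31, sex, smoke_100, usual_care) %>%
    map_int(~ sum(. %in% c(97, 98, 99)))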

1.5 Converting Categorical Variables

Now, we convert the categorical variables to meaningful factors or to 1/0 numeric variables.

omas17_cleaning <- omas17_cleaning %>%
    mutate(region = fct_recode(factor(region),
                               "North_Central" = "1", 
                               "North_East" = "2", 
                               "North_East_Central" = "3",
                               "North_West" = "4",
                               "South_Central" = "5",
                               "South_East" = "6",
                               "South_West" = "7"),
           county_type = fct_recode(factor(county_type),
                             "Rural_Appalachian" = "1", 
                             "Metro" = "2",
                             "Rural_Non-Appalachian" = "3", 
                             "Suburban" = "4"),
           age = fct_recode(factor(age), 
                            "19-24" = "1",
                            "25-34" = "2",
                            "35-44" = "3",
                            "45-54" = "4",
                            "55-64" = "5",
                            "65+" = "6"),
           sex = fct_recode(factor(sex), 
                            "M" = "1", 
                            "F" = "2"),
           race_eth = fct_recode(factor(race_eth),
                                 "White" = "1",
                                 "Black/African-American" = "2",
                                 "Hispanic" = "3",
                                 "Asian" = "4",
                                 "Other" = "5"),
           education = fct_recode(factor(education),
                             "1_Up To HS but no diploma" = "1",
                             "2_HS Graduate or equivalent" = "2",
                             "3_Some College" = "3",
                             "4_Associate Degree" = "4",
                             "5_Four Year College Graduate" = "5",
                             "6_Advanced Degree" = "6"),
           marital = fct_recode(factor(marital),
                                "Married" = "1",
                                "Divorced/Separated" = "2",
                                "Widowed" = "3",
                                "Never Married" = "4",
                                "Unmarried Couple" = "5"),
           poverty = fct_recode(factor(poverty),
                                "0.75 FPL or less" = "1",
                                "0.75 - 1.00 FPL" = "2",
                                "1.00 - 1.38 FPL" = "3",
                                "1.38 - 2.06 FPL" = "4",
                                "2.06 - 2.50 FPL" = "5",
                                "2.50 - 4.00 FPL" = "6",
                                "4.00 FPL or more" = "7"),
           food_12mo = fct_recode(factor(food_12mo), 
                                  "1_Easier Now" = "1",
                                  "3_Harder Now" = "2",
                                  "2_Stayed Same" = "3",
                                  NULL = "4"),
           sroh = fct_recode(factor(sroh), 
                             "1_Excellent" = "1",
                             "2_VeryGood" = "2",
                             "3_Good" = "3",
                             "4_Fair" = "4",
                             "5_Poor" = "5"),
           smoke_stat = fct_recode(factor(smoke_stat),
                                   "Never" = "1",
                                   "Former" = "2",
                                   "Current" = "3",
                                   NULL = "4"),
           # convert 1 = Yes, 2 = No to 1 = Yes, 0 = No
           job_today = 2 - job_today,
           smoke_100 = 2 - smoke_100,
           usual_care = 2 - usual_care,
           avoid_care = 2 - avoid_care,
           ins_employer = 2 - ins_employer,
           ins_medicare = 2 - ins_medicare,
           ins_medicaid = 2 - ins_medicaid,
           ins_private = 2 - ins_private)
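
As a sanity check on the recoding, we can cross-tabulate a recoded factor against its original numeric code, which is still present in the data at this stage (a sketch, shown here for region; output not displayed):

omas17_cleaning %>% tabyl(region, s9_region)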

1.6 Calculating BMI and Assigning a Primary Insurance Type

Next, we’ll calculate BMI from Weight and Height (but changing to NA any values that fall outside of the 16-80 range), then we’ll assign everyone to a primary insurance type as follows:

  • if they have Medicaid insurance, that will be their primary insurance type (ins_type)
  • if not, and they have Medicare, that will be primary
  • if neither, and they have Employer, that will be primary
  • if none of those, and they have private, that will be primary
omas17_cleaning <- omas17_cleaning %>%
    mutate(bmi = 703 * wt_lb/(ht_in^2)) %>%
    # restrict BMI to 16.0 through 80.0
    mutate(bmi = ifelse(bmi < 16 | bmi > 80, NA, bmi)) %>%
    mutate(ins_type = case_when(
        ins_medicaid == 1 ~ "Medicaid",
        ins_medicare == 1 ~ "Medicare",
        ins_employer == 1 ~ "Employer",
        ins_private == 1 ~ "Private",
        TRUE ~ NA_character_),
        ins_type = fct_relevel(factor(ins_type), 
                               "Medicaid", "Medicare", 
                               "Employer", "Private"))

1.7 Selection of Variables

Finally, we select the variables we want to study.

omas17_cleaned <- omas17_cleaning %>%
    select(caseid, region, county_type, avoid_care, 
           usual_care, ins_employer, ins_medicare,
           ins_medicaid, ins_private, ins_type,
           age, sex, race_eth, education, marital, 
           poverty, job_today, food_12mo, 
           sroh, mental31, ht_in, wt_lb, bmi, 
           smoke_stat, smoke_100, alcohol30, er_visits)

1.8 Plot of missingness from naniar

dim(omas17_cleaned)
[1] 32144    27
gg_miss_var(omas17_cleaned)

miss_var_summary(omas17_cleaned)
# A tibble: 27 x 3
   variable     n_miss pct_miss
   <chr>         <int>    <dbl>
 1 food_12mo      1137    3.54 
 2 marital        1085    3.38 
 3 avoid_care      766    2.38 
 4 usual_care      640    1.99 
 5 job_today       536    1.67 
 6 ins_medicaid    474    1.47 
 7 mental31        418    1.30 
 8 ins_private     417    1.30 
 9 er_visits       346    1.08 
10 region          289    0.899
# ... with 17 more rows

1.9 Codebook

omas_2017_codes <- read_xlsx(
    here("data", "omas_2017_codebook.xlsx"))

omas_2017_miss_summary <- miss_var_summary(omas17_cleaned)

omas_table <- left_join(omas_2017_codes, omas_2017_miss_summary)
Joining, by = "variable"
omas_table %>%
    kable() %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
variable type description source n_miss pct_miss
caseid character Identification Code - arbitrary CASEID 0 0.0000000
region 7-level factor North_Central, North_East, North_East_Central, North_West, South_Central, South_East, South_West S9_REGION 289 0.8990791
county_type 4-level factor Metro, Suburban, Rural Appalachian, Rural Non-Appalachian Region 0 0.0000000
avoid_care 1 (Yes), 0 (No) Did you delay or avoid getting the health care you needed? AVOID_CARE 766 2.3830264
usual_care 1 (Yes), 0 (No) Do you have a usual source of care? USUAL_A 640 1.9910403
ins_employer 1 (Yes), 0 (No) Insurance through an employer or union? B4A 137 0.4262071
ins_medicare 1 (Yes), 0 (No) Insurance through Medicare? B4B 210 0.6533101
ins_medicaid 1 (Yes), 0 (No) Insurance through Medicaid? B4C 474 1.4746142
ins_private 1 (Yes), 0 (No) Insurance through a private plan? B4E 417 1.2972872
ins_type 4-level factor Medicaid, Medicare, Employer, Private Calculated 0 0.0000000
age 6-level factor Age category (in years): 19-24, 25-34, 35-44, 45-54, 55-64, 65+ AGE_A_IMP 0 0.0000000
sex 2-level factor F = female, M = male S15 41 0.1275510
race_eth 5-level factor White, Black/African-American, Hispanic, Asian, Other RACE5_A_IMP 0 0.0000000
education 6-level factor Up to HS but no diploma, HS Graduate or equivalent, Some College, Associate Degree, Four Year College Graduate, Advanced Degree EDUC_IMP 0 0.0000000
marital 5-level factor Married, Divorced/Separated, Widowed, Never Married, Unmarried Couple MARITAL 1085 3.3754355
poverty 7-level factor Relation to Federal Poverty Level of Household Income: 0.75 or less, 0.75-1, 1-1.38, 1.38-2.06, 2.06 - 2.5, 2.5 - 4, 4+ FPL_CAT_18 0 0.0000000
job_today 1 (Yes), 0 (No) Did you have a full-time or part-time job last week? G71 536 1.6674963
food_12mo 3-level factor Is it easier now, harder now or the same as one year ago to get food you need? FOOD_12MO 1137 3.5372076
sroh 5-level factor Self-Reported Overall Health: Excellent, Very Good, Good, Fair, Poor D30_IMP 0 0.0000000
mental31 quantitative # of days in the past month where your mental health prevented you from doing normal activities D30I 418 1.3003982
ht_in quantitative height in inches D30BINC 0 0.0000000
wt_lb quantitative weight in pounds D30A_VALUE 0 0.0000000
bmi quantitative body-mass index (Calculated from Height and Weight, restricted to 16-80) Calculated 93 0.2893230
smoke_stat 3-level factor Never Smoked, Former Smoker, Current Smoker SMOKE_STAT 150 0.4666501
smoke_100 1 (Yes), 0 (No) Have you smoked 100 cigarettes in your life? D45 130 0.4044301
alcohol30 quantitative # of days in the past 30 where you consumed alcohol D46 228 0.7093081
er_visits quantitative # of times in the past 12 months where you were a patient in a hospital emergency room E62 346 1.0764062

1.10 Final Checks from skim

skim(omas17_cleaned)
Skim summary statistics
 n obs: 32144 
 n variables: 27 

-- Variable type:character --------------------------------------------------------------------
 variable missing complete     n min max empty n_unique
   caseid       0    32144 32144   9   9     0    32144

-- Variable type:factor -----------------------------------------------------------------------
    variable missing complete     n n_unique top_counts                                   ordered
         age       0    32144 32144        6 65+: 10342, 55-: 7025, 45-: 5358, 35-: 3891    FALSE
 county_type       0    32144 32144        4 Met: 15666, Rur: 6220, Rur: 5223, Sub: 5035    FALSE
   education       0    32144 32144        6 2_H: 9810, 5_F: 6394, 3_S: 5041, 6_A: 4552     FALSE
   food_12mo    1137    31007 32144        3 2_S: 22532, 3_H: 5681, 1_E: 2794, NA: 1137     FALSE
    ins_type       0    32144 32144        4 Emp: 13560, Med: 9781, Med: 7167, Pri: 1636    FALSE
     marital    1085    31059 32144        5 Mar: 15274, Nev: 5364, Div: 5115, Wid: 3320    FALSE
     poverty       0    32144 32144        7 4.0: 10441, 2                                  FALSE
    race_eth       0    32144 32144        5 Whi: 24889, Bla: 4188, Oth: 1912, His: 778     FALSE
      region     289    31855 32144        7 Nor: 8455, Sou: 7358, Sou: 6570, Sou: 3165     FALSE
         sex      41    32103 32144        2 F: 17607, M: 14496, NA: 41                     FALSE
  smoke_stat     150    31994 32144        3 Nev: 16891, For: 8538, Cur: 6565, NA: 150      FALSE
        sroh       0    32144 32144        5 2_V: 10059, 3_G: 9313, 1_E: 5659, 4_F: 5031    FALSE

-- Variable type:numeric ----------------------------------------------------------------------
     variable missing complete     n   mean    sd    p0    p25    p50    p75   p100
    alcohol30     228    31916 32144   4.1   7.32  0      0      1      4     30
   avoid_care     766    31378 32144   0.22  0.41  0      0      0      0      1
          bmi      93    32051 32144  29.11  6.84 16.01  24.39  27.89  32.35  78.64
    er_visits     346    31798 32144   0.62  1.74  0      0      0      1     21
        ht_in       0    32144 32144  67.02  4.32 48     64     67     70     83
 ins_employer     137    32007 32144   0.55  0.5   0      0      1      1      1
 ins_medicaid     474    31670 32144   0.23  0.42  0      0      0      0      1
 ins_medicare     210    31934 32144   0.41  0.49  0      0      0      1      1
  ins_private     417    31727 32144   0.21  0.4   0      0      0      0      1
    job_today     536    31608 32144   0.53  0.5   0      0      1      1      1
     mental31     418    31726 32144   1.85  6.21  0      0      0      0     31
    smoke_100     130    32014 32144   0.47  0.5   0      0      0      1      1
   usual_care     640    31504 32144   0.94  0.23  0      1      1      1      1
        wt_lb       0    32144 32144 186.15 48.3  60    150    180    212    446

1.11 Saving the Results

saveRDS(omas17_cleaned, here("data", "omas2017_clean.Rds"))
write_csv(omas17_cleaned, here("data", "omas2017_clean.csv"))
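
To reload the cleaned data in a later session, we could use, for example:

omas17_cleaned <- readRDS(here("data", "omas2017_clean.Rds"))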

2 Model Building for a Quantitative Outcome

2.1 Working with a subsample: the Northeast Ohio Region

Let’s take a subset of the data, restricting our attention to the Northeast Ohio region.

omas17ne <- omas17_cleaned %>% 
    filter(region == "North_East") 

nrow(omas17ne)
[1] 8455

2.2 Our outcome: Height to Weight Ratio (HWR)

The height/weight ratio (HWR) is usually expressed in cm / kg, rather than inches and pounds. But we can convert.

\[ ht_{cm} = 2.54 * ht_{in}, wt_{kg} = 0.453592 * wt_{lb} \]

and so

\[ HWR = \frac{ht_{cm}}{wt_{kg}} \approx 5.6 \frac{ht_{in}}{wt_{lb}} \]
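
For example, a (hypothetical) adult who is 67 inches tall and weighs 186 pounds would have

\[ HWR \approx 5.6 \times \frac{67}{186} \approx 2.02 \]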

omas17ne <- omas17ne %>%
    mutate(HWR = 5.6 * ht_in / wt_lb)

2.2.1 Summarizing the HWR Outcome

Here’s a quick numerical summary. We have no missing values, and for now, we’ll pretend that all of these values we do see are plausible.

mosaic::favstats(~ HWR, data = omas17ne)
       min   Q1   median       Q3      max     mean        sd    n missing
 0.8195122 1.82 2.121212 2.453244 5.255385 2.152794 0.4810418 8455       0

2.2.2 Plotting the HWR Outcome

ggplot(omas17ne, aes(x = HWR)) +
    geom_histogram(bins = 30, col = "white", fill = "navy") +
    theme_bw() + 
    labs(x = "Height/Weight Ratio (cm/kg)",
         title = "Height/Weight Ratio for the NE Ohio subjects",
         subtitle = paste0("Ohio Medicaid Assessment Survey 2017, n = ", length(omas17ne$HWR), " adults"))

2.3 Predictors under consideration

  • alcohol30 (quantitative - a count)
  • mental31 (quantitative - a count)
  • sex (2 levels)
  • job_today (2 levels)
  • usual_care (2 levels)
  • smoke_100 (2 levels)
  • smoke_stat (3 levels)
  • ins_type (4 levels)
  • sroh (5 levels)
  • age (6 levels)
  • education (6 levels)
  • poverty (7 levels)
str(omas17ne)
Classes 'tbl_df', 'tbl' and 'data.frame':   8455 obs. of  28 variables:
 $ caseid      : chr  "010002958" "010005008" "010006545" "010009985" ...
  ..- attr(*, "label")= chr "CASEID--Case ID"
 $ region      : Factor w/ 7 levels "North_Central",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ county_type : Factor w/ 4 levels "Rural_Appalachian",..: 3 1 3 4 2 3 4 3 4 3 ...
 $ avoid_care  : num  1 0 0 0 0 0 0 0 0 0 ...
 $ usual_care  : num  1 0 1 1 1 1 1 1 1 0 ...
 $ ins_employer: num  1 0 0 0 0 0 0 1 0 0 ...
 $ ins_medicare: num  0 0 0 0 0 1 1 0 0 1 ...
 $ ins_medicaid: num  0 1 1 1 1 0 0 0 0 1 ...
 $ ins_private : num  1 0 0 0 0 0 1 0 1 0 ...
 $ ins_type    : Factor w/ 4 levels "Medicaid","Medicare",..: 3 1 1 1 1 2 2 3 4 1 ...
 $ age         : Factor w/ 6 levels "19-24","25-34",..: 1 5 1 5 1 6 6 4 2 4 ...
 $ sex         : Factor w/ 2 levels "M","F": 1 2 2 2 1 2 2 1 1 1 ...
 $ race_eth    : Factor w/ 5 levels "White","Black/African-American",..: 1 1 1 1 2 1 1 1 1 2 ...
 $ education   : Factor w/ 6 levels "1_Up To HS but no diploma",..: 5 2 2 1 2 2 6 5 5 2 ...
 $ marital     : Factor w/ 5 levels "Married","Divorced/Separated",..: 5 5 4 2 2 1 1 1 5 4 ...
 $ poverty     : Factor w/ 7 levels "0.75 FPL or less",..: 7 1 1 2 1 4 4 7 7 5 ...
 $ job_today   : num  1 0 1 0 1 0 0 1 1 0 ...
 $ food_12mo   : Factor w/ 3 levels "1_Easier Now",..: 1 3 3 3 3 3 3 2 3 2 ...
 $ sroh        : Factor w/ 5 levels "1_Excellent",..: 1 3 4 5 4 1 1 2 1 4 ...
 $ mental31    : num  0 0 0 0 4 0 0 0 0 NA ...
 $ ht_in       : num  75 61 52 61 65 66 60 72 71 72 ...
  ..- attr(*, "label")= chr "D30BINC--CALCULATE NUMBER OF INCHES"
  ..- attr(*, "format.sas")= chr "NMISS_F"
 $ wt_lb       : num  200 149 170 165 125 183 129 215 175 164 ...
  ..- attr(*, "label")= chr "D30A_VALUE--Weight without shoes"
  ..- attr(*, "format.sas")= chr "D30A_VALUE_F"
 $ bmi         : num  25 28.2 44.2 31.2 20.8 ...
 $ smoke_stat  : Factor w/ 3 levels "Never","Former",..: 1 2 3 3 3 2 1 1 1 3 ...
 $ smoke_100   : num  0 1 1 1 1 1 0 0 0 1 ...
 $ alcohol30   : num  19 2 0 0 15 0 25 7 10 0 ...
 $ er_visits   : num  0 0 1 NA 4 0 0 0 0 0 ...
 $ HWR         : num  2.1 2.29 1.71 2.07 2.91 ...
  ..- attr(*, "label")= chr "D30BINC--CALCULATE NUMBER OF INCHES"
  ..- attr(*, "format.sas")= chr "NMISS_F"

2.4 Running regsubsets with a multi-categorical predictor

2.4.1 Approach A

In Approach A, we treat ins_type as a factor, and use the formula interface, rather than cbind.

hwr_out_A <- regsubsets(
    HWR ~ alcohol30 + mental31 + sex + ins_type,
    data = omas17ne, nvmax = NULL, nbest = 1)

hwr_summ_A <- summary(hwr_out_A)

hwr_summ_A$which # another option is outmat here
  (Intercept) alcohol30 mental31 sexF ins_typeMedicare ins_typeEmployer
1        TRUE     FALSE    FALSE TRUE            FALSE            FALSE
2        TRUE      TRUE    FALSE TRUE            FALSE            FALSE
3        TRUE      TRUE     TRUE TRUE            FALSE            FALSE
4        TRUE      TRUE     TRUE TRUE            FALSE            FALSE
5        TRUE      TRUE     TRUE TRUE             TRUE            FALSE
6        TRUE      TRUE     TRUE TRUE             TRUE             TRUE
  ins_typePrivate
1           FALSE
2           FALSE
3           FALSE
4            TRUE
5            TRUE
6            TRUE
hwr_summ_A$adjr2
[1] 0.04804233 0.05644289 0.05787055 0.05876998 0.05948750 0.05976641

The result is problematic.

  • How do we interpret models that have some but not all of the ins_type components?
  • This is hiding the fact that missing data exist here, and that the machine is doing something about that to get these models. In fact, it’s dropping every observation with a missing value on any of these variables.
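
We can see how many observations Approach A actually used by counting complete cases on the variables in the model (a sketch; we'll see these counts again in Approach C below):

# rows regsubsets could actually use in Approach A
omas17ne %>%
    select(HWR, alcohol30, mental31, sex, ins_type) %>%
    complete.cases() %>%
    sum()

nrow(omas17ne)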

2.4.2 Approach B

If we use the cbind approach, one “advantage” is that we get an error if we have missing data.

hwr_preds_B <- 
    with(omas17ne, 
         cbind(alcohol30, mental31, sex, ins_type))

hwr_out_B <- regsubsets(x = hwr_preds_B,
                        y = omas17ne$HWR,
                        nvmax = NULL, nbest = 1)

If you try to run this you will get an error:

Error in leaps.setup(x, y, wt = weights, nbest = nbest, nvmax = nvmax, : NA/NaN/Inf in foreign function call (arg 4)

Aha! So we would need to either impute some of those values, or else do a complete case analysis.

2.4.3 Approach C

Let’s try the cbind approach after we restrict ourselves to complete cases.

omas17ne_CC <- omas17ne %>%
    filter(complete.cases(caseid, HWR, 
                          alcohol30, mental31, 
                          sex, ins_type))

nrow(omas17ne_CC); nrow(omas17ne)
[1] 8273
[1] 8455

So we lose 182 rows due to missing data.

hwr_preds_CC <- 
    with(omas17ne_CC, cbind(alcohol30, mental31, sex, ins_type))

nrow(omas17ne_CC)
[1] 8273
hwr_out_CC <- regsubsets(x = hwr_preds_CC,
                           y = omas17ne_CC$HWR,
                           nvmax = NULL, nbest = 1)

hwr_summ_CC <- summary(hwr_out_CC)

hwr_summ_CC$outmat # another option is which here
         alcohol30 mental31 sex ins_type
1  ( 1 ) " "       " "      "*" " "     
2  ( 1 ) "*"       " "      "*" " "     
3  ( 1 ) "*"       "*"      "*" " "     
4  ( 1 ) "*"       "*"      "*" "*"     
hwr_summ_CC$adjr2
[1] 0.04804233 0.05644289 0.05787055 0.05854058

But, as it turns out, this is just the same as treating ins_type as if it were numeric! Which is kind of problematic, too!

head(hwr_preds_CC)
     alcohol30 mental31 sex ins_type
[1,]        19        0   1        3
[2,]         2        0   2        1
[3,]         0        0   2        1
[4,]         0        0   2        1
[5,]        15        4   1        1
[6,]         0        0   2        2

2.4.4 Approach D

omas17ne %>%
    mutate(ins_num = as.numeric(ins_type)) %>%
    tabyl(ins_type, ins_num)
 ins_type    1    2    3   4
 Medicaid 2029    0    0   0
 Medicare    0 2519    0   0
 Employer    0    0 3453   0
  Private    0    0    0 454

Between them, these two interfaces hide two things:

  • the formula approach silently removes observations with missing data before the regressions are run
  • the cbind approach silently treats the multi-categorical factors as if they were numeric

In Approach D, we make that numeric treatment explicit within the formula interface:
hwr_out_D <- regsubsets(
    HWR ~ alcohol30 + mental31 + sex + as.numeric(ins_type),
    data = omas17ne, nvmax = NULL, nbest = 1)

hwr_summ_D <- summary(hwr_out_D)

hwr_summ_D$outmat 
         alcohol30 mental31 sexF as.numeric(ins_type)
1  ( 1 ) " "       " "      "*"  " "                 
2  ( 1 ) "*"       " "      "*"  " "                 
3  ( 1 ) "*"       "*"      "*"  " "                 
4  ( 1 ) "*"       "*"      "*"  "*"                 
hwr_summ_D$adjr2
[1] 0.04804233 0.05644289 0.05787055 0.05854058
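
To see exactly what each interface hands to leaps, we can inspect the two design matrices: the formula interface expands the ins_type factor into separate indicator columns, while the cbind / as.numeric approach passes a single 1-4 code (a sketch; output not shown):

# formula interface: ins_type becomes indicator columns (ins_typeMedicare, etc.)
head(model.matrix(HWR ~ alcohol30 + mental31 + sex + ins_type,
                  data = omas17ne))

# cbind approach: the factor is silently converted to its integer codes
head(with(omas17ne, cbind(alcohol30, mental31, sex, ins_type)))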

2.4.5 Approach E

In Approach E, we first impute the missing values, then use the formula approach (we still have the problem of forcing the multi-categorical variable to be treated as numeric).

How much missingness is there in these variables?

miss_var_summary(omas17ne %>% 
                     select(alcohol30, mental31, sex,
                            ins_type, HWR))
# A tibble: 5 x 3
  variable  n_miss pct_miss
  <chr>      <int>    <dbl>
1 mental31     103    1.22 
2 alcohol30     75    0.887
3 sex            9    0.106
4 ins_type       0    0    
5 HWR            0    0    
set.seed(20190305)
omas17ne_imp1 <- omas17ne %>%
    impute_cart(sex ~ ins_type + HWR) %>%
    impute_rlm(alcohol30 ~ sex + ins_type + HWR) %>%
    impute_rlm(mental31 ~ 
                   alcohol30 + sex + ins_type + HWR)
miss_var_summary(omas17ne_imp1 %>% 
                     select(alcohol30, mental31, sex,
                            ins_type, HWR))
# A tibble: 5 x 3
  variable  n_miss pct_miss
  <chr>      <int>    <dbl>
1 alcohol30      0        0
2 mental31       0        0
3 sex            0        0
4 ins_type       0        0
5 HWR            0        0
hwr_out_E <- regsubsets(
    HWR ~ alcohol30 + mental31 + sex + as.numeric(ins_type),
    data = omas17ne_imp1, nvmax = NULL, nbest = 1)

hwr_summ_E <- summary(hwr_out_E)

hwr_summ_E$outmat 
         alcohol30 mental31 sexF as.numeric(ins_type)
1  ( 1 ) " "       " "      "*"  " "                 
2  ( 1 ) "*"       " "      "*"  " "                 
3  ( 1 ) "*"       "*"      "*"  " "                 
4  ( 1 ) "*"       "*"      "*"  "*"                 
hwr_summ_E$adjr2
[1] 0.04737830 0.05492679 0.05667810 0.05722438

2.5 A More Complete Look at Potential Models

Our potential predictors are:

  • alcohol30 (quantitative - a count)
  • mental31 (quantitative - a count)
  • sex (2 levels)
  • job_today (2 levels)
  • usual_care (2 levels)
  • smoke_100 (2 levels)
  • smoke_stat (3 levels)
  • ins_type (4 levels)
  • sroh (5 levels)
  • age (6 levels)
  • education (6 levels)
  • poverty (7 levels)
omas17ne %>% 
    select(alcohol30, mental31, sex, job_today,
           usual_care, smoke_100, smoke_stat, ins_type, 
           sroh, age, education, poverty, HWR) %>%
    miss_var_summary()
# A tibble: 13 x 3
   variable   n_miss pct_miss
   <chr>       <int>    <dbl>
 1 usual_care    187    2.21 
 2 job_today     157    1.86 
 3 mental31      103    1.22 
 4 alcohol30      75    0.887
 5 smoke_stat     47    0.556
 6 smoke_100      43    0.509
 7 sex             9    0.106
 8 ins_type        0    0    
 9 sroh            0    0    
10 age             0    0    
11 education       0    0    
12 poverty         0    0    
13 HWR             0    0    
set.seed(20190305)
omas17ne_imp2 <- omas17ne %>%
    impute_cart(sex + smoke_stat ~ 
                    ins_type + sroh + age + education +
                    poverty) %>%
    impute_pmm(smoke_100 ~ 
                    ins_type + sroh + age + education +
                    poverty + sex) %>%
    impute_pmm(alcohol30 + mental31 ~ 
                   ins_type + sroh + age + education +
                   poverty + sex + smoke_stat) %>%
    impute_pmm(job_today + usual_care ~ 
                   alcohol30 + sex + smoke_stat + age +
                    ins_type + mental31)
omas17ne_imp2 %>% 
    select(alcohol30, mental31, sex, job_today,
           usual_care, smoke_100, smoke_stat, ins_type, 
           sroh, age, education, poverty, HWR) %>%
    miss_var_summary()
# A tibble: 13 x 3
   variable   n_miss pct_miss
   <chr>       <int>    <dbl>
 1 alcohol30       0        0
 2 mental31        0        0
 3 sex             0        0
 4 job_today       0        0
 5 usual_care      0        0
 6 smoke_100       0        0
 7 smoke_stat      0        0
 8 ins_type        0        0
 9 sroh            0        0
10 age             0        0
11 education       0        0
12 poverty         0        0
13 HWR             0        0
omas17ne_imp2 %>% 
    select(alcohol30, mental31, sex, job_today,
           usual_care, smoke_100, smoke_stat, ins_type, 
           sroh, age, education, poverty, HWR) %>%
    skim()
Skim summary statistics
 n obs: 8455 
 n variables: 13 

-- Variable type:factor -----------------------------------------------------------------------
   variable missing complete    n n_unique top_counts                                  ordered
        age       0     8455 8455        6 65+: 2757, 55-: 1843, 45-: 1430, 35-: 956     FALSE
  education       0     8455 8455        6 2_H: 2294, 5_F: 1821, 3_S: 1448, 6_A: 1322    FALSE
   ins_type       0     8455 8455        4 Emp: 3453, Med: 2519, Med: 2029, Pri: 454     FALSE
    poverty       0     8455 8455        7 4.0: 2831, 2                                  FALSE
        sex       0     8455 8455        2 F: 4622, M: 3833, NA: 0                       FALSE
 smoke_stat       0     8455 8455        3 Nev: 4461, For: 2322, Cur: 1672, NA: 0        FALSE
       sroh       0     8455 8455        5 2_V: 2684, 3_G: 2361, 1_E: 1564, 4_F: 1322    FALSE

-- Variable type:numeric ----------------------------------------------------------------------
   variable missing complete    n mean   sd   p0  p25  p50  p75  p100
  alcohol30       0     8455 8455 4.51 7.66 0    0    1    5    30   
        HWR       0     8455 8455 2.15 0.48 0.82 1.82 2.12 2.45  5.26
  job_today       0     8455 8455 0.53 0.5  0    0    1    1     1   
   mental31       0     8455 8455 1.86 6.19 0    0    0    0    31   
  smoke_100       0     8455 8455 0.47 0.5  0    0    0    1     1   
 usual_care       0     8455 8455 0.94 0.24 0    1    1    1     1   
omas17ne_imp2 <- omas17ne_imp2 %>%
    mutate(age_num = as.numeric(age),
           educ_num = as.numeric(education),
           ins_num = as.numeric(ins_type),
           pov_num = as.numeric(poverty),
           smoke_num = as.numeric(smoke_stat),
           sroh_num = as.numeric(sroh))
hwr_out_F <- regsubsets(
    HWR ~ alcohol30 + mental31 + sex + job_today + 
        usual_care + smoke_100 + smoke_num + pov_num +
        ins_num + sroh_num + age_num + educ_num,
    data = omas17ne_imp2, nvmax = 12, nbest = 1)

hwr_summ_F <- summary(hwr_out_F)

hwr_summ_F$outmat 
          alcohol30 mental31 sexF job_today usual_care smoke_100 smoke_num
1  ( 1 )  " "       " "      "*"  " "       " "        " "       " "      
2  ( 1 )  " "       " "      "*"  " "       " "        " "       " "      
3  ( 1 )  " "       " "      "*"  " "       " "        " "       "*"      
4  ( 1 )  " "       " "      "*"  " "       " "        "*"       "*"      
5  ( 1 )  " "       " "      "*"  "*"       " "        "*"       "*"      
6  ( 1 )  "*"       " "      "*"  "*"       " "        "*"       "*"      
7  ( 1 )  "*"       " "      "*"  "*"       " "        "*"       "*"      
8  ( 1 )  "*"       " "      "*"  "*"       " "        "*"       "*"      
9  ( 1 )  "*"       " "      "*"  "*"       "*"        "*"       "*"      
10  ( 1 ) "*"       " "      "*"  "*"       "*"        "*"       "*"      
11  ( 1 ) "*"       "*"      "*"  "*"       "*"        "*"       "*"      
12  ( 1 ) "*"       "*"      "*"  "*"       "*"        "*"       "*"      
          pov_num ins_num sroh_num age_num educ_num
1  ( 1 )  " "     " "     " "      " "     " "     
2  ( 1 )  " "     " "     "*"      " "     " "     
3  ( 1 )  " "     " "     "*"      " "     " "     
4  ( 1 )  " "     " "     "*"      " "     " "     
5  ( 1 )  " "     " "     "*"      " "     " "     
6  ( 1 )  " "     " "     "*"      " "     " "     
7  ( 1 )  " "     " "     "*"      "*"     " "     
8  ( 1 )  "*"     " "     "*"      "*"     " "     
9  ( 1 )  "*"     " "     "*"      "*"     " "     
10  ( 1 ) "*"     "*"     "*"      "*"     " "     
11  ( 1 ) "*"     "*"     "*"      "*"     " "     
12  ( 1 ) "*"     "*"     "*"      "*"     "*"     

2.5.1 Suggested Models from regsubsets

The data include nrow(omas17ne_imp2) = 8455 observations, and we run models with 2 to 13 coefficients (counting the intercept term), because we set nvmax = 12.

hwr_summ_F$aic.c <- 8455*log(hwr_summ_F$rss / 8455) + 2*(2:13) + 
    (2 * (2:13) * ((2:13)+1) / (8455 - (2:13) - 1))

Now, we build a tibble containing the winners:

hwr_F_win <- data_frame(
    k = 2:13,
    r2 = hwr_summ_F$rsq,
    adjr2 = hwr_summ_F$adjr2,
    cp = hwr_summ_F$cp,
    aic.c = hwr_summ_F$aic.c,
    bic = hwr_summ_F$bic)
Warning: `data_frame()` is deprecated, use `tibble()`.
This warning is displayed once per session.
hwr_F_win <- bind_cols(hwr_F_win, tbl_df(hwr_summ_F$which))
hwr_F_win
# A tibble: 12 x 19
       k     r2  adjr2     cp   aic.c   bic `(Intercept)` alcohol30
   <int>  <dbl>  <dbl>  <dbl>   <dbl> <dbl> <lgl>         <lgl>    
 1     2 0.0474 0.0473 592.   -12782. -392. TRUE          FALSE    
 2     3 0.0870 0.0868 218.   -13139. -742. TRUE          FALSE    
 3     4 0.0933 0.0930 160.   -13196. -792. TRUE          FALSE    
 4     5 0.101  0.101   88.5  -13266. -855. TRUE          FALSE    
 5     6 0.106  0.105   44.9  -13310. -892. TRUE          FALSE    
 6     7 0.110  0.109   10.6  -13344. -919. TRUE          TRUE     
 7     8 0.110  0.109    9.39 -13345. -913. TRUE          TRUE     
 8     9 0.110  0.110    7.35 -13347. -908. TRUE          TRUE     
 9    10 0.111  0.110    7.42 -13347. -901. TRUE          TRUE     
10    11 0.111  0.110    9.16 -13345. -892. TRUE          TRUE     
11    12 0.111  0.110   11.0  -13343. -883. TRUE          TRUE     
12    13 0.111  0.109   13    -13342. -874. TRUE          TRUE     
# ... with 11 more variables: mental31 <lgl>, sexF <lgl>, job_today <lgl>,
#   usual_care <lgl>, smoke_100 <lgl>, smoke_num <lgl>, pov_num <lgl>,
#   ins_num <lgl>, sroh_num <lgl>, age_num <lgl>, educ_num <lgl>

The models considered by regsubsets are:

k Model
2 sex_F
3 + sroh_num (sroh)
4 + smoke_num (smoke_stat)
5 + smoke_100
6 + job_today
7 + alcohol30
8 + age_num (age)
9 + pov_num (poverty)
10 + usual_care
11 + ins_num
12 + mental31
13 + educ_num

2.5.1.1 The Big Four Plots
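
The code that builds these four plots isn't shown in this document. Here is one way that objects like hwrp1 through hwrp4 might be constructed from hwr_F_win (a sketch; the specific aesthetic choices are assumptions):

hwrp1 <- ggplot(hwr_F_win, aes(x = k, y = adjr2)) +
    geom_line() + geom_point() +
    labs(x = "k = # of coefficients", y = "Adjusted R-squared")

hwrp2 <- ggplot(hwr_F_win, aes(x = k, y = cp)) +
    geom_line() + geom_point() +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") + # reference line Cp = k
    labs(x = "k = # of coefficients", y = "Mallows' Cp")

hwrp3 <- ggplot(hwr_F_win, aes(x = k, y = aic.c)) +
    geom_line() + geom_point() +
    labs(x = "k = # of coefficients", y = "Bias-Corrected AIC")

hwrp4 <- ggplot(hwr_F_win, aes(x = k, y = bic)) +
    geom_line() + geom_point() +
    labs(x = "k = # of coefficients", y = "BIC")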

hwrp1

hwrp2

hwrp3

hwrp4

Approach Model Suggested
Adjusted R2 k = 10 inputs
Mallows’ Cp k = 8 inputs
Bias-Corrected AIC k = 9 inputs
BIC k = 7 inputs

The relevant models, then, suggested by regsubsets each include:

  • sex_F, sroh_num, smoke_num, smoke_100, job_today and alcohol30
Approach Model
BIC model above with 7 inputs, including intercept
Cp + age_num (age)
AIC_c + pov_num (poverty)
R2 (adj) + usual_care

2.5.2 A Stepwise Model with Numeric Predictors

hwr_out_STEPNUM <- step(lm(
    HWR ~ alcohol30 + mental31 + sex + job_today + 
        usual_care + smoke_100 + smoke_num + pov_num +
        ins_num + sroh_num + age_num + educ_num,
    data = omas17ne_imp2))
Start:  AIC=-13341.55
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_100 + 
    smoke_num + pov_num + ins_num + sroh_num + age_num + educ_num

             Df Sum of Sq    RSS    AIC
- educ_num    1     0.003 1739.7 -13344
- mental31    1     0.030 1739.8 -13343
- ins_num     1     0.051 1739.8 -13343
- usual_care  1     0.393 1740.1 -13342
<none>                    1739.7 -13342
- pov_num     1     0.499 1740.2 -13341
- age_num     1     0.863 1740.6 -13339
- alcohol30   1     6.806 1746.5 -13310
- job_today   1    11.198 1750.9 -13289
- smoke_100   1    15.757 1755.5 -13267
- smoke_num   1    23.986 1763.7 -13228
- sroh_num    1    71.440 1811.2 -13003
- sex         1    95.911 1835.7 -12890

Step:  AIC=-13343.54
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_100 + 
    smoke_num + pov_num + ins_num + sroh_num + age_num

             Df Sum of Sq    RSS    AIC
- mental31    1     0.030 1739.8 -13345
- ins_num     1     0.050 1739.8 -13345
- usual_care  1     0.398 1740.1 -13344
<none>                    1739.7 -13344
- pov_num     1     0.509 1740.2 -13343
- age_num     1     0.867 1740.6 -13341
- alcohol30   1     6.819 1746.6 -13312
- job_today   1    11.286 1751.0 -13291
- smoke_100   1    15.755 1755.5 -13269
- smoke_num   1    24.161 1763.9 -13229
- sroh_num    1    72.158 1811.9 -13002
- sex         1    95.920 1835.7 -12892

Step:  AIC=-13345.39
HWR ~ alcohol30 + sex + job_today + usual_care + smoke_100 + 
    smoke_num + pov_num + ins_num + sroh_num + age_num

             Df Sum of Sq    RSS    AIC
- ins_num     1     0.052 1739.8 -13347
- usual_care  1     0.395 1740.2 -13346
<none>                    1739.8 -13345
- pov_num     1     0.527 1740.3 -13345
- age_num     1     0.840 1740.6 -13343
- alcohol30   1     6.796 1746.6 -13314
- job_today   1    11.279 1751.0 -13293
- smoke_100   1    15.725 1755.5 -13271
- smoke_num   1    24.158 1763.9 -13231
- sroh_num    1    76.644 1816.4 -12983
- sex         1    95.891 1835.7 -12894

Step:  AIC=-13347.13
HWR ~ alcohol30 + sex + job_today + usual_care + smoke_100 + 
    smoke_num + pov_num + sroh_num + age_num

             Df Sum of Sq    RSS    AIC
- usual_care  1     0.398 1740.2 -13347
<none>                    1739.8 -13347
- pov_num     1     0.867 1740.7 -13345
- age_num     1     0.900 1740.7 -13345
- alcohol30   1     6.820 1746.6 -13316
- job_today   1    11.673 1751.5 -13293
- smoke_100   1    15.692 1755.5 -13273
- smoke_num   1    24.111 1763.9 -13233
- sroh_num    1    77.107 1816.9 -12982
- sex         1    95.887 1835.7 -12896

Step:  AIC=-13347.2
HWR ~ alcohol30 + sex + job_today + smoke_100 + smoke_num + pov_num + 
    sroh_num + age_num

            Df Sum of Sq    RSS    AIC
<none>                   1740.2 -13347
- pov_num    1     0.833 1741.0 -13345
- age_num    1     1.003 1741.2 -13344
- alcohol30  1     6.818 1747.0 -13316
- job_today  1    11.648 1751.9 -13293
- smoke_100  1    15.787 1756.0 -13273
- smoke_num  1    24.311 1764.5 -13232
- sroh_num   1    78.245 1818.5 -12977
- sex        1    95.490 1835.7 -12898

So the model that the stepwise approach (with numeric values forced for our multi-categorical variables) selects includes these 8 variables:

  • the same 6 as we saw before: sex_F, sroh_num, smoke_num, smoke_100, job_today and alcohol30
  • plus pov_num and age_num
  • so that’s the same model that bias-corrected AIC identified. Is that surprising?

2.5.3 A Stepwise Model with Factors for Multi-Categorical Predictors

hwr_out_STEPFAC <- step(lm(
    HWR ~ alcohol30 + mental31 + sex + job_today + 
        usual_care + smoke_100 + smoke_stat + poverty +
        ins_type + sroh + age + education,
    data = omas17ne_imp2))
Start:  AIC=-13578.71
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_100 + 
    smoke_stat + poverty + ins_type + sroh + age + education

             Df Sum of Sq    RSS    AIC
- smoke_100   1     0.092 1684.1 -13580
- usual_care  1     0.289 1684.3 -13579
- mental31    1     0.363 1684.4 -13579
<none>                    1684.0 -13579
- ins_type    3     1.655 1685.7 -13576
- poverty     6     2.974 1687.0 -13576
- job_today   1     2.818 1686.8 -13567
- education   5     6.771 1690.8 -13555
- alcohol30   1     6.121 1690.2 -13550
- age         5    19.845 1703.9 -13490
- smoke_stat  2    28.817 1712.8 -13439
- sroh        4    85.995 1770.0 -13166
- sex         1   100.318 1784.3 -13092

Step:  AIC=-13580.25
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_stat + 
    poverty + ins_type + sroh + age + education

             Df Sum of Sq    RSS    AIC
- usual_care  1     0.285 1684.4 -13581
- mental31    1     0.362 1684.5 -13580
<none>                    1684.1 -13580
- ins_type    3     1.647 1685.8 -13578
- poverty     6     2.952 1687.1 -13577
- job_today   1     2.834 1687.0 -13568
- education   5     6.767 1690.9 -13556
- alcohol30   1     6.131 1690.3 -13552
- age         5    19.782 1703.9 -13492
- smoke_stat  2    30.825 1715.0 -13431
- sroh        4    85.919 1770.0 -13168
- sex         1   100.236 1784.4 -13093

Step:  AIC=-13580.81
HWR ~ alcohol30 + mental31 + sex + job_today + smoke_stat + poverty + 
    ins_type + sroh + age + education

             Df Sum of Sq    RSS    AIC
- mental31    1     0.353 1684.8 -13581
<none>                    1684.4 -13581
- ins_type    3     1.674 1686.1 -13578
- poverty     6     2.974 1687.4 -13578
- job_today   1     2.808 1687.2 -13569
- education   5     6.718 1691.1 -13557
- alcohol30   1     6.139 1690.5 -13552
- age         5    19.928 1704.3 -13491
- smoke_stat  2    31.000 1715.4 -13431
- sroh        4    87.040 1771.5 -13163
- sex         1    99.967 1784.4 -13095

Step:  AIC=-13581.04
HWR ~ alcohol30 + sex + job_today + smoke_stat + poverty + ins_type + 
    sroh + age + education

             Df Sum of Sq    RSS    AIC
<none>                    1684.8 -13581
- ins_type    3     1.666 1686.4 -13579
- poverty     6     2.996 1687.8 -13578
- job_today   1     2.644 1687.4 -13570
- education   5     6.738 1691.5 -13557
- alcohol30   1     6.049 1690.8 -13553
- age         5    20.055 1704.8 -13491
- smoke_stat  2    30.689 1715.5 -13432
- sroh        4    89.220 1774.0 -13153
- sex         1    99.792 1784.5 -13096

So the model that the stepwise approach (with factors) selects includes these 9 variables:

  • only 5 of the main 6 that we have seen: sex, sroh, smoke_stat, job_today and alcohol30,
  • but now leaving out smoke_100
  • but adding in poverty and age and education and ins_type
  • so this model has some things we haven’t seen before. Is that surprising? What’s different here?

2.5.4 All Five Models We’re Considering

Model Inputs (besides the Intercept)
A sex, sroh, smoke_stat, smoke_100, job_today, alcohol30
B model A plus age
C model B plus poverty
D model C plus usual_care
E sex, sroh, smoke_stat, job_today, alcohol30, poverty, age, education, ins_type

Note that we developed model E using the factor versions of the multi-categorical variables, but models A-D were developed using numeric versions of the multi-categorical information.

Nonetheless, we’ll use the factor versions and impute all missing predictors to assess these models for our cross-validation work.

2.5.5 Using cross-validation to make a decision between candidate models

  • crossv_kfold splits the data into k exclusive partitions, and uses each partition for a test-training split.
  • crossv_mc generates n random test-training partitions, holding out a fixed proportion of the data (20% by default) as the test set.

I usually use crossv_kfold.
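
For comparison, a crossv_mc call might look like this (a sketch, with a hypothetical seed; we won't use this object below):

set.seed(43200)
cv_mc_demo <- omas17ne_imp2 %>%
    crossv_mc(n = 100, test = 0.2) # 100 random splits, 20% held out as the test set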

set.seed(43201)

cv_modA <- omas17ne_imp2 %>%
    crossv_kfold(k = 10) %>%
    mutate(model = 
               map(train,
                   ~ lm(HWR ~ sex + sroh + smoke_stat +
                            smoke_100 + job_today + 
                            alcohol30, 
                        data = .)))

cv_modA_pred <- cv_modA %>%
    unnest(map2(model, test, ~ augment(.x, newdata = .y)))

cv_modA_results <- cv_modA_pred %>%
    summarize(Model = "Model A",
              RMSE = sqrt(mean((HWR - .fitted)^2)),
              MAE = mean(abs(HWR - .fitted)),
              MaxError = max(abs(HWR - .fitted)))
set.seed(43202)

cv_modB <- omas17ne_imp2 %>%
    crossv_kfold(k = 10) %>%
    mutate(model = 
               map(train,
                   ~ lm(HWR ~ sex + sroh + smoke_stat +
                            smoke_100 + job_today + 
                            alcohol30 + age, 
                        data = .)))

cv_modB_pred <- cv_modB %>%
    unnest(map2(model, test, ~ augment(.x, newdata = .y)))

cv_modB_results <- cv_modB_pred %>%
    summarize(Model = "Model B",
              RMSE = sqrt(mean((HWR - .fitted)^2)),
              MAE = mean(abs(HWR - .fitted)),
              MaxError = max(abs(HWR - .fitted)))
set.seed(43203)

cv_modC <- omas17ne_imp2 %>%
    crossv_kfold(k = 10) %>%
    mutate(model = 
               map(train,
                   ~ lm(HWR ~ sex + sroh + smoke_stat +
                            smoke_100 + job_today + 
                            alcohol30 + age + poverty, 
                        data = .)))

cv_modC_pred <- cv_modC %>%
    unnest(map2(model, test, ~ augment(.x, newdata = .y)))

cv_modC_results <- cv_modC_pred %>%
    summarize(Model = "Model C",
              RMSE = sqrt(mean((HWR - .fitted)^2)),
              MAE = mean(abs(HWR - .fitted)),
              MaxError = max(abs(HWR - .fitted)))
set.seed(43204)

cv_modD <- omas17ne_imp2 %>%
    crossv_kfold(k = 10) %>%
    mutate(model = 
               map(train,
                   ~ lm(HWR ~ sex + sroh + smoke_stat +
                            smoke_100 + job_today + 
                            alcohol30 + age + poverty +
                            usual_care, 
                        data = .)))

cv_modD_pred <- cv_modD %>%
    unnest(map2(model, test, ~ augment(.x, newdata = .y)))

cv_modD_results <- cv_modD_pred %>%
    summarize(Model = "Model D",
              RMSE = sqrt(mean((HWR - .fitted)^2)),
              MAE = mean(abs(HWR - .fitted)),
              MaxError = max(abs(HWR - .fitted)))
set.seed(43205)

cv_modE <- omas17ne_imp2 %>%
    crossv_kfold(k = 10) %>%
    mutate(model = 
               map(train,
                   ~ lm(HWR ~ sex + sroh + smoke_stat +
                            job_today + alcohol30 + 
                            poverty + age + education +
                            ins_type,
                        data = .)))

cv_modE_pred <- cv_modE %>%
    unnest(map2(model, test, ~ augment(.x, newdata = .y)))

cv_modE_results <- cv_modE_pred %>%
    summarize(Model = "Model E",
              RMSE = sqrt(mean((HWR - .fitted)^2)),
              MAE = mean(abs(HWR - .fitted)),
              MaxError = max(abs(HWR - .fitted)))
bind_rows(cv_modA_results, cv_modB_results,
          cv_modC_results, cv_modD_results,
          cv_modE_results)
# A tibble: 5 x 4
  Model    RMSE   MAE MaxError
  <chr>   <dbl> <dbl>    <dbl>
1 Model A 0.452 0.352     3.37
2 Model B 0.449 0.349     3.34
3 Model C 0.449 0.349     3.34
4 Model D 0.449 0.349     3.34
5 Model E 0.448 0.348     3.27
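
One way to visualize this comparison (a sketch):

bind_rows(cv_modA_results, cv_modB_results,
          cv_modC_results, cv_modD_results,
          cv_modE_results) %>%
    ggplot(aes(x = Model, y = RMSE)) +
    geom_point(size = 3) +
    coord_flip() +
    labs(title = "Cross-validated RMSE for the five candidate models")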

2.6 Running a Spearman \(\rho^2\) Plot

2.6.1 Without Imputation: What is the plot telling us? How about the testing?

spear1 <- spearman2(HWR ~ sex + sroh + smoke_stat +
                        job_today + alcohol30 + 
                        poverty + age + education +
                        ins_type, 
               data = omas17ne, p = 1)
spear1

Spearman rho^2    Response variable:HWR

            rho2      F df1  df2      P Adjusted rho2    n
sex        0.047 413.85   1 8444 0.0000         0.047 8446
sroh       0.054 121.62   4 8450 0.0000         0.054 8455
smoke_stat 0.011  48.20   2 8405 0.0000         0.011 8408
job_today  0.000   3.86   1 8296 0.0496         0.000 8298
alcohol30  0.003  28.14   1 8378 0.0000         0.003 8380
poverty    0.003   3.67   6 8448 0.0012         0.002 8455
age        0.015  25.61   5 8449 0.0000         0.014 8455
education  0.005   8.41   5 8449 0.0000         0.004 8455
ins_type   0.003   7.11   3 8451 0.0001         0.002 8455
plot(spear1)

2.6.2 What if we impute first?

spear2 <- spearman2(HWR ~ sex + sroh + smoke_stat +
                        job_today + alcohol30 + 
                        poverty + age + education +
                        ins_type, 
               data = omas17ne_imp2, p = 1)
spear2

Spearman rho^2    Response variable:HWR

            rho2      F df1  df2      P Adjusted rho2    n
sex        0.047 412.64   1 8453 0.0000         0.046 8455
sroh       0.054 121.62   4 8450 0.0000         0.054 8455
smoke_stat 0.011  48.29   2 8452 0.0000         0.011 8455
job_today  0.000   3.52   1 8453 0.0606         0.000 8455
alcohol30  0.003  28.18   1 8453 0.0000         0.003 8455
poverty    0.003   3.67   6 8448 0.0012         0.002 8455
age        0.015  25.61   5 8449 0.0000         0.014 8455
education  0.005   8.41   5 8449 0.0000         0.004 8455
ins_type   0.003   7.11   3 8451 0.0001         0.002 8455
plot(spear2)

2.6.3 What happens if we set p = 2 instead?

spear3 <- spearman2(HWR ~ sex + sroh + smoke_stat +
                        job_today + alcohol30 + 
                        poverty + age + education +
                        ins_type, 
               data = omas17ne_imp2, p = 2)
spear3

Spearman rho^2    Response variable:HWR

            rho2      F df1  df2      P Adjusted rho2    n
sex        0.047 412.64   1 8453 0.0000         0.046 8455
sroh       0.054 121.62   4 8450 0.0000         0.054 8455
smoke_stat 0.011  48.29   2 8452 0.0000         0.011 8455
job_today  0.000   3.52   1 8453 0.0606         0.000 8455
alcohol30  0.004  17.34   2 8452 0.0000         0.004 8455
poverty    0.003   3.67   6 8448 0.0012         0.002 8455
age        0.015  25.61   5 8449 0.0000         0.014 8455
education  0.005   8.41   5 8449 0.0000         0.004 8455
ins_type   0.003   7.11   3 8451 0.0001         0.002 8455
plot(spear3)

2.7 Building an OLS model with non-linear terms

d <- datadist(omas17ne)
options(datadist = "d")

modelE <- ols(HWR ~ sex + sroh + sex * sroh + 
                  smoke_stat + job_today + alcohol30 + 
                  poverty + age + education + ins_type, 
              na.action = na.delete,
              data = omas17ne, x = TRUE, y = TRUE)

modelE
Frequencies of Missing Values Due to Each Variable
       HWR        sex       sroh smoke_stat  job_today  alcohol30 
         0          9          0         47        157         75 
   poverty        age  education   ins_type 
         0          0          0          0 

Linear Regression Model
 
 ols(formula = HWR ~ sex + sroh + sex * sroh + smoke_stat + job_today + 
     alcohol30 + poverty + age + education + ins_type, data = omas17ne, 
     na.action = na.delete, x = TRUE, y = TRUE)
 
 
                 Model Likelihood     Discrimination    
                    Ratio Test           Indexes        
 Obs    8219    LR chi2    1335.15    R2       0.150    
 sigma0.4432    d.f.            32    R2 adj   0.147    
 d.f.   8186    Pr(> chi2)  0.0000    g        0.209    
 
 Residuals
 
      Min       1Q   Median       3Q      Max 
 -1.37540 -0.29761 -0.01181  0.26546  3.19772 
 
 
                                        Coef    S.E.   t     Pr(>|t|)
 Intercept                               2.3516 0.0355 66.20 <0.0001 
 sex=F                                   0.3502 0.0229 15.26 <0.0001 
 sroh=2_VeryGood                        -0.1258 0.0210 -6.00 <0.0001 
 sroh=3_Good                            -0.1971 0.0221 -8.90 <0.0001 
 sroh=4_Fair                            -0.2080 0.0269 -7.74 <0.0001 
 sroh=5_Poor                            -0.1103 0.0368 -3.00 0.0027  
 smoke_stat=Former                      -0.0370 0.0119 -3.10 0.0019  
 smoke_stat=Current                      0.1451 0.0141 10.33 <0.0001 
 job_today                              -0.0474 0.0130 -3.64 0.0003  
 alcohol30                               0.0037 0.0007  5.63 <0.0001 
 poverty=0.75 - 1.00 FPL                -0.0467 0.0233 -2.01 0.0450  
 poverty=1.00 - 1.38 FPL                 0.0195 0.0227  0.86 0.3899  
 poverty=1.38 - 2.06 FPL                 0.0025 0.0222  0.11 0.9105  
 poverty=2.06 - 2.50 FPL                 0.0212 0.0249  0.85 0.3940  
 poverty=2.50 - 4.00 FPL                -0.0164 0.0222 -0.74 0.4609  
 poverty=4.00 FPL or more                0.0199 0.0217  0.92 0.3597  
 age=25-34                              -0.1242 0.0248 -5.01 <0.0001 
 age=35-44                              -0.2202 0.0249 -8.85 <0.0001 
 age=45-54                              -0.1990 0.0235 -8.47 <0.0001 
 age=55-64                              -0.1766 0.0230 -7.68 <0.0001 
 age=65+                                -0.1198 0.0262 -4.58 <0.0001 
 education=2_HS Graduate or equivalent  -0.0741 0.0226 -3.28 0.0010  
 education=3_Some College               -0.1081 0.0239 -4.53 <0.0001 
 education=4_Associate Degree           -0.1078 0.0251 -4.29 <0.0001 
 education=5_Four Year College Graduate -0.0681 0.0244 -2.79 0.0052  
 education=6_Advanced Degree            -0.0358 0.0257 -1.39 0.1641  
 ins_type=Medicare                       0.0318 0.0188  1.69 0.0910  
 ins_type=Employer                       0.0208 0.0177  1.17 0.2407  
 ins_type=Private                        0.0574 0.0256  2.24 0.0252  
 sex=F * sroh=2_VeryGood                -0.0896 0.0287 -3.13 0.0018  
 sex=F * sroh=3_Good                    -0.1471 0.0295 -4.99 <0.0001 
 sex=F * sroh=4_Fair                    -0.2493 0.0341 -7.30 <0.0001 
 sex=F * sroh=5_Poor                    -0.2669 0.0466 -5.73 <0.0001 
 

2.7.1 Plot the effect sizes

plot(summary(modelE))

2.7.2 Nomogram

plot(nomogram(modelE))

2.7.3 ANOVA assessment for this model

anova(modelE)
                Analysis of Variance          Response: HWR 

 Factor                                    d.f. Partial SS  MS         F      P     
 sex  (Factor+Higher Order Factors)           5  112.562309 22.5124618 114.60 <.0001
  All Interactions                            4   14.067508  3.5168770  17.90 <.0001
 sroh  (Factor+Higher Order Factors)          8  103.169688 12.8962110  65.65 <.0001
  All Interactions                            4   14.067508  3.5168770  17.90 <.0001
 smoke_stat                                   2   30.047958 15.0239791  76.48 <.0001
 job_today                                    1    2.602839  2.6028386  13.25 0.0003
 alcohol30                                    1    6.230003  6.2300028  31.71 <.0001
 poverty                                      6    2.999894  0.4999823   2.55 0.0183
 age                                          5   20.572591  4.1145183  20.94 <.0001
 education                                    5    7.177578  1.4355156   7.31 <.0001
 ins_type                                     3    1.236235  0.4120782   2.10 0.0983
 sex * sroh  (Factor+Higher Order Factors)    4   14.067508  3.5168770  17.90 <.0001
 REGRESSION                                  32  283.651495  8.8641092  45.12 <.0001
 ERROR                                     8186 1608.134143  0.1964493

2.7.4 What exactly does validate do here?

The details are described in a 2014-10-04 post by Jonathan Bartlett at The Stats Geek.

By default, the machine does bootstrap validation: it creates B = 40 resampled versions of the data (each the same size as the original data set), treats each one as a training sample (these results are averaged to give the training column below), and then evaluates each bootstrap-fitted model on the original data set (producing the test column). The optimism is the training result minus the test result, and index.corrected is the original index minus the optimism: for example, the corrected R-square below is 0.1499 - 0.0078 = 0.1421.

validate(modelE)
          index.orig training   test optimism index.corrected  n
R-square      0.1499   0.1540 0.1462   0.0078          0.1421 40
MSE           0.1957   0.1948 0.1965  -0.0018          0.1974 40
g             0.2091   0.2118 0.2067   0.0050          0.2041 40
Intercept     0.0000   0.0000 0.0551  -0.0551          0.0551 40
Slope         1.0000   1.0000 0.9747   0.0253          0.9747 40

2.7.5 Does this model have any influential points?

which.influence(modelE)
$sroh
[1] 1542 5212

$alcohol30
[1] 5295

$education
[1] 5212
omas17ne %>% slice(5212)
# A tibble: 1 x 28
  caseid region county_type avoid_care usual_care ins_employer ins_medicare
  <chr>  <fct>  <fct>            <dbl>      <dbl>        <dbl>        <dbl>
1 04011~ North~ Metro                0          1            1            1
# ... with 21 more variables: ins_medicaid <dbl>, ins_private <dbl>,
#   ins_type <fct>, age <fct>, sex <fct>, race_eth <fct>, education <fct>,
#   marital <fct>, poverty <fct>, job_today <dbl>, food_12mo <fct>,
#   sroh <fct>, mental31 <dbl>, ht_in <dbl>, wt_lb <dbl>, bmi <dbl>,
#   smoke_stat <fct>, smoke_100 <dbl>, alcohol30 <dbl>, er_visits <dbl>,
#   HWR <dbl>

2.8 Fitting an Imputation Model

set.seed(4322019)

imp_model <- aregImpute( 
    ~ HWR + sex + sroh + smoke_stat + job_today + 
        alcohol30 + poverty + age + education + 
        ins_type, data = omas17ne, 
    n.impute = 10, pr = FALSE, x = TRUE)

imp_model

Multiple Imputation using Bootstrap and PMM

aregImpute(formula = ~HWR + sex + sroh + smoke_stat + job_today + 
    alcohol30 + poverty + age + education + ins_type, data = omas17ne, 
    n.impute = 10, x = TRUE, pr = FALSE)

n: 8455     p: 10   Imputations: 10     nk: 3 

Number of NAs:
       HWR        sex       sroh smoke_stat  job_today  alcohol30 
         0          9          0         47        157         75 
   poverty        age  education   ins_type 
         0          0          0          0 

           type d.f.
HWR           s    2
sex           c    1
sroh          c    4
smoke_stat    c    2
job_today     l    1
alcohol30     s    1
poverty       c    6
age           c    5
education     c    5
ins_type      c    3

Transformation of Target Variables Forced to be Linear

R-squares for Predicting Non-Missing Values for Each Variable
Using Last Imputations of Predictors
       sex smoke_stat  job_today  alcohol30 
     0.122      0.171      0.446      0.081 
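
If we wanted to inspect (or model with) a single completed data set, Hmisc's impute.transcan can pull one imputation out of the aregImpute object. A sketch, using hypothetical object names:

# a sketch: build a completed data set from the first of the 10 imputations
imputed_1 <- impute.transcan(imp_model, imputation = 1, data = omas17ne,
                             list.out = TRUE, pr = FALSE, check = FALSE)
omas17ne_completed1 <- omas17ne
omas17ne_completed1[names(imputed_1)] <- imputed_1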

2.9 Fitting Model E after multiple imputation

modelE_imp <- fit.mult.impute(
    HWR ~ sex + sroh + sex * sroh + smoke_stat + 
        job_today + alcohol30 + poverty + age + 
        education + ins_type,
    fitter = ols, xtrans = imp_model, data = omas17ne,
    x = TRUE, y = TRUE)

Variance Inflation Factors Due to Imputation:

                             Intercept 
                                  1.00 
                                 sex=F 
                                  1.00 
                       sroh=2_VeryGood 
                                  1.00 
                           sroh=3_Good 
                                  1.00 
                           sroh=4_Fair 
                                  1.00 
                           sroh=5_Poor 
                                  1.01 
                     smoke_stat=Former 
                                  1.01 
                    smoke_stat=Current 
                                  1.01 
                             job_today 
                                  1.08 
                             alcohol30 
                                  1.00 
               poverty=0.75 - 1.00 FPL 
                                  1.00 
               poverty=1.00 - 1.38 FPL 
                                  1.00 
               poverty=1.38 - 2.06 FPL 
                                  1.00 
               poverty=2.06 - 2.50 FPL 
                                  1.00 
               poverty=2.50 - 4.00 FPL 
                                  1.00 
              poverty=4.00 FPL or more 
                                  1.00 
                             age=25-34 
                                  1.00 
                             age=35-44 
                                  1.00 
                             age=45-54 
                                  1.00 
                             age=55-64 
                                  1.00 
                               age=65+ 
                                  1.00 
 education=2_HS Graduate or equivalent 
                                  1.00 
              education=3_Some College 
                                  1.00 
          education=4_Associate Degree 
                                  1.00 
education=5_Four Year College Graduate 
                                  1.00 
           education=6_Advanced Degree 
                                  1.00 
                     ins_type=Medicare 
                                  1.00 
                     ins_type=Employer 
                                  1.01 
                      ins_type=Private 
                                  1.00 
               sex=F * sroh=2_VeryGood 
                                  1.00 
                   sex=F * sroh=3_Good 
                                  1.00 
                   sex=F * sroh=4_Fair 
                                  1.00 
                   sex=F * sroh=5_Poor 
                                  1.01 

Rate of Missing Information:

                             Intercept 
                                  0.00 
                                 sex=F 
                                  0.00 
                       sroh=2_VeryGood 
                                  0.00 
                           sroh=3_Good 
                                  0.00 
                           sroh=4_Fair 
                                  0.00 
                           sroh=5_Poor 
                                  0.01 
                     smoke_stat=Former 
                                  0.01 
                    smoke_stat=Current 
                                  0.01 
                             job_today 
                                  0.08 
                             alcohol30 
                                  0.00 
               poverty=0.75 - 1.00 FPL 
                                  0.00 
               poverty=1.00 - 1.38 FPL 
                                  0.00 
               poverty=1.38 - 2.06 FPL 
                                  0.00 
               poverty=2.06 - 2.50 FPL 
                                  0.00 
               poverty=2.50 - 4.00 FPL 
                                  0.00 
              poverty=4.00 FPL or more 
                                  0.00 
                             age=25-34 
                                  0.00 
                             age=35-44 
                                  0.00 
                             age=45-54 
                                  0.00 
                             age=55-64 
                                  0.00 
                               age=65+ 
                                  0.00 
 education=2_HS Graduate or equivalent 
                                  0.00 
              education=3_Some College 
                                  0.00 
          education=4_Associate Degree 
                                  0.00 
education=5_Four Year College Graduate 
                                  0.00 
           education=6_Advanced Degree 
                                  0.00 
                     ins_type=Medicare 
                                  0.00 
                     ins_type=Employer 
                                  0.01 
                      ins_type=Private 
                                  0.00 
               sex=F * sroh=2_VeryGood 
                                  0.00 
                   sex=F * sroh=3_Good 
                                  0.00 
                   sex=F * sroh=4_Fair 
                                  0.00 
                   sex=F * sroh=5_Poor 
                                  0.01 

d.f. for t-distribution for Tests of Single Coefficients:

                             Intercept 
                          8.635654e+05 
                                 sex=F 
                          2.717645e+08 
                       sroh=2_VeryGood 
                          3.236438e+08 
                           sroh=3_Good 
                          1.926555e+07 
                           sroh=4_Fair 
                          5.359716e+06 
                           sroh=5_Poor 
                          2.597791e+05 
                     smoke_stat=Former 
                          2.751832e+05 
                    smoke_stat=Current 
                          1.654171e+05 
                             job_today 
                          1.561020e+03 
                             alcohol30 
                          1.177942e+06 
               poverty=0.75 - 1.00 FPL 
                          1.524378e+07 
               poverty=1.00 - 1.38 FPL 
                          1.966670e+07 
               poverty=1.38 - 2.06 FPL 
                          5.750596e+06 
               poverty=2.06 - 2.50 FPL 
                          7.951045e+06 
               poverty=2.50 - 4.00 FPL 
                          8.612249e+06 
              poverty=4.00 FPL or more 
                          5.027044e+06 
                             age=25-34 
                          5.199872e+07 
                             age=35-44 
                          5.959921e+07 
                             age=45-54 
                          2.239895e+07 
                             age=55-64 
                          5.002619e+06 
                               age=65+ 
                          1.704738e+06 
 education=2_HS Graduate or equivalent 
                          8.094864e+07 
              education=3_Some College 
                          3.524345e+07 
          education=4_Associate Degree 
                          3.766527e+07 
education=5_Four Year College Graduate 
                          2.698610e+07 
           education=6_Advanced Degree 
                          3.707462e+07 
                     ins_type=Medicare 
                          8.024449e+07 
                     ins_type=Employer 
                          2.343152e+05 
                      ins_type=Private 
                          1.034481e+07 
               sex=F * sroh=2_VeryGood 
                          1.231234e+09 
                   sex=F * sroh=3_Good 
                          1.090981e+07 
                   sex=F * sroh=4_Fair 
                          1.063890e+08 
                   sex=F * sroh=5_Poor 
                          5.588244e+04 

The following fit components were averaged over the 10 model fits:

  fitted.values stats linear.predictors 
modelE_imp
Linear Regression Model
 
 fit.mult.impute(formula = HWR ~ sex + sroh + sex * sroh + smoke_stat + 
     job_today + alcohol30 + poverty + age + education + ins_type, 
     fitter = ols, xtrans = imp_model, data = omas17ne, x = TRUE, 
     y = TRUE)
 
                 Model Likelihood     Discrimination    
                    Ratio Test           Indexes        
 Obs    8455    LR chi2    1348.60    R2       0.147    
 sigma0.4450    d.f.            32    R2 adj   0.144    
 d.f.   8422    Pr(> chi2)  0.0000    g        0.208    
 
 Residuals
 
      Min       1Q   Median       3Q      Max 
 -1.45799 -0.29739 -0.01195  0.26471  3.19204 
 
 
                                        Coef    S.E.   t     Pr(>|t|)
 Intercept                               2.3490 0.0351 67.01 <0.0001 
 sex=F                                   0.3547 0.0228 15.58 <0.0001 
 sroh=2_VeryGood                        -0.1183 0.0208 -5.68 <0.0001 
 sroh=3_Good                            -0.1888 0.0219 -8.62 <0.0001 
 sroh=4_Fair                            -0.1986 0.0265 -7.48 <0.0001 
 sroh=5_Poor                            -0.1211 0.0358 -3.38 0.0007  
 smoke_stat=Former                      -0.0358 0.0118 -3.02 0.0025  
 smoke_stat=Current                      0.1437 0.0139 10.34 <0.0001 
 job_today                              -0.0485 0.0134 -3.62 0.0003  
 alcohol30                               0.0036 0.0007  5.44 <0.0001 
 poverty=0.75 - 1.00 FPL                -0.0419 0.0229 -1.83 0.0669  
 poverty=1.00 - 1.38 FPL                 0.0214 0.0223  0.96 0.3368  
 poverty=1.38 - 2.06 FPL                -0.0035 0.0218 -0.16 0.8731  
 poverty=2.06 - 2.50 FPL                 0.0208 0.0245  0.85 0.3967  
 poverty=2.50 - 4.00 FPL                -0.0160 0.0218 -0.73 0.4651  
 poverty=4.00 FPL or more                0.0220 0.0213  1.03 0.3024  
 age=25-34                              -0.1252 0.0245 -5.11 <0.0001 
 age=35-44                              -0.2219 0.0245 -9.04 <0.0001 
 age=45-54                              -0.2018 0.0232 -8.70 <0.0001 
 age=55-64                              -0.1809 0.0227 -7.98 <0.0001 
 age=65+                                -0.1235 0.0258 -4.80 <0.0001 
 education=2_HS Graduate or equivalent  -0.0718 0.0224 -3.21 0.0013  
 education=3_Some College               -0.1074 0.0237 -4.54 <0.0001 
 education=4_Associate Degree           -0.1022 0.0249 -4.10 <0.0001 
 education=5_Four Year College Graduate -0.0666 0.0241 -2.77 0.0057  
 education=6_Advanced Degree            -0.0382 0.0254 -1.50 0.1329  
 ins_type=Medicare                       0.0332 0.0184  1.80 0.0717  
 ins_type=Employer                       0.0197 0.0175  1.12 0.2609  
 ins_type=Private                        0.0627 0.0253  2.48 0.0132  
 sex=F * sroh=2_VeryGood                -0.0936 0.0285 -3.29 0.0010  
 sex=F * sroh=3_Good                    -0.1536 0.0292 -5.25 <0.0001 
 sex=F * sroh=4_Fair                    -0.2605 0.0338 -7.71 <0.0001 
 sex=F * sroh=5_Poor                    -0.2597 0.0458 -5.67 <0.0001 
 
plot(summary(modelE_imp))

validate(modelE_imp)
          index.orig training   test optimism index.corrected  n
R-square      0.1476   0.1514 0.1442   0.0072          0.1404 40
MSE           0.1972   0.1958 0.1980  -0.0022          0.1995 40
g             0.2078   0.2101 0.2056   0.0045          0.2033 40
Intercept     0.0000   0.0000 0.0443  -0.0443          0.0443 40
Slope         1.0000   1.0000 0.9792   0.0208          0.9792 40
plot(nomogram(modelE_imp))
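
Since a datadist for omas17ne is already in effect (the summary and nomogram plots above require one), we can also look at the fitted sex * sroh interaction directly with Predict, holding the other predictors at their adjust-to values. A sketch:

# a sketch: predicted HWR by self-reported overall health and sex, after imputation
ggplot(Predict(modelE_imp, sroh, sex))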

3 Model Building for a Binary (categorical) Outcome

3.1 Again, working with a subsample: the Northeast Ohio Region

Let’s again restrict the data to the Northeast Ohio region, this time also dropping the subjects with missing avoid_care values.

omas17ne2 <- omas17_cleaned %>% 
    filter(region == "North_East") %>%
    filter(!is.na(avoid_care))

nrow(omas17ne2)
[1] 8238

3.2 Our outcome: Pr(Avoid or Delay Care in the past 12m)

omas17ne2 %>% tabyl(avoid_care)
 avoid_care    n   percent
          0 6484 0.7870842
          1 1754 0.2129158

3.3 A shorter predictor list

We’ll consider 6 predictors:

  • alcohol30 (quantitative)
  • bmi (quantitative)
  • mental31 (count - quantitative)
  • smoke_100 (yes/no)
  • sroh (5 categories)
  • usual_care (yes/no)
omas17ne3 <- omas17ne2 %>%
    select(caseid, avoid_care, alcohol30, bmi,  
           mental31, smoke_100, sroh, usual_care)

omas17ne3
# A tibble: 8,238 x 8
   caseid  avoid_care alcohol30   bmi mental31 smoke_100 sroh    usual_care
   <chr>        <dbl>     <dbl> <dbl>    <dbl>     <dbl> <fct>        <dbl>
 1 010002~          1        19  25.0        0         0 1_Exce~          1
 2 010005~          0         2  28.2        0         1 3_Good           0
 3 010006~          0         0  44.2        0         1 4_Fair           1
 4 010009~          0         0  31.2        0         1 5_Poor           1
 5 010011~          0        15  20.8        4         1 4_Fair           1
 6 010011~          0         0  29.5        0         1 1_Exce~          1
 7 010013~          0        25  25.2        0         0 1_Exce~          1
 8 010013~          0         7  29.2        0         0 2_Very~          1
 9 010013~          0        10  24.4        0         0 1_Exce~          1
10 010013~          0         0  22.2       NA         1 4_Fair           0
# ... with 8,228 more rows

3.4 Simple Imputation?

miss_var_summary(omas17ne3)
# A tibble: 8 x 3
  variable   n_miss pct_miss
  <chr>       <int>    <dbl>
1 mental31       95    1.15 
2 usual_care     83    1.01 
3 alcohol30      39    0.473
4 bmi            21    0.255
5 smoke_100      20    0.243
6 caseid          0    0    
7 avoid_care      0    0    
8 sroh            0    0    
set.seed(20190307)
omas17ne3_imp1 <- omas17ne3 %>%
    impute_pmm(alcohol30 + bmi + smoke_100 ~ sroh) %>%
    impute_pmm(usual_care ~ bmi + smoke_100 + sroh) %>%
    impute_pmm(mental31 ~ bmi + alcohol30 + sroh)

miss_var_summary(omas17ne3_imp1)
# A tibble: 8 x 3
  variable   n_miss pct_miss
  <chr>       <int>    <dbl>
1 caseid          0        0
2 avoid_care      0        0
3 alcohol30       0        0
4 bmi             0        0
5 mental31        0        0
6 smoke_100       0        0
7 sroh            0        0
8 usual_care      0        0

3.5 Kitchen Sink Logistic Regression Model via glm

We’ll use the simply-imputed data set (omas17ne3_imp1) here.

m_1 <- glm(avoid_care ~ alcohol30 + bmi + mental31 +
                   smoke_100 + sroh + usual_care, 
           data = omas17ne3_imp1, 
           family = binomial())

m_1

Call:  glm(formula = avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 + 
    sroh + usual_care, family = binomial(), data = omas17ne3_imp1)

Coefficients:
   (Intercept)       alcohol30             bmi        mental31  
     -1.753457        0.005833        0.002701        0.043714  
     smoke_100  sroh2_VeryGood      sroh3_Good      sroh4_Fair  
      0.129122        0.390045        0.632515        0.811374  
    sroh5_Poor      usual_care  
      0.965684       -0.363599  

Degrees of Freedom: 8237 Total (i.e. Null);  8228 Residual
Null Deviance:      8531 
Residual Deviance: 8194     AIC: 8214
glance(m_1)
# A tibble: 1 x 7
  null.deviance df.null logLik   AIC   BIC deviance df.residual
          <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int>
1         8531.    8237 -4097. 8214. 8285.    8194.        8228
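
Since broom is loaded, we can also express these coefficients as odds ratios with confidence intervals; a sketch:

# a sketch: odds ratios (exponentiated coefficients) and 95% CIs for m_1
tidy(m_1, exponentiate = TRUE, conf.int = TRUE) %>%
    select(term, estimate, conf.low, conf.high, p.value)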

3.6 Kitchen Sink Logistic Regression Model via lrm

d <- datadist(omas17ne3)
options(datadist = "d")

m_1a <- lrm(avoid_care ~ alcohol30 + bmi + mental31 +
                   smoke_100 + sroh + usual_care, 
            data = omas17ne3_imp1, 
            x = TRUE, y = TRUE)

m_1a
Logistic Regression Model
 
 lrm(formula = avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 + 
     sroh + usual_care, data = omas17ne3_imp1, x = TRUE, y = TRUE)
 
                       Model Likelihood     Discrimination    Rank Discrim.    
                          Ratio Test           Indexes           Indexes       
 Obs          8238    LR chi2     336.81    R2       0.062    C       0.636    
  0           6484    d.f.             9    g        0.484    Dxy     0.272    
  1           1754    Pr(> chi2) <0.0001    gr       1.623    gamma   0.272    
 max |deriv| 3e-12                          gp       0.083    tau-a   0.091    
                                            Brier    0.160                     
 
                 Coef    S.E.   Wald Z Pr(>|Z|)
 Intercept       -1.7535 0.1654 -10.60 <0.0001 
 alcohol30        0.0058 0.0036   1.63 0.1021  
 bmi              0.0027 0.0041   0.66 0.5107  
 mental31         0.0437 0.0040  11.03 <0.0001 
 smoke_100        0.1291 0.0567   2.28 0.0227  
 sroh=2_VeryGood  0.3900 0.0925   4.22 <0.0001 
 sroh=3_Good      0.6325 0.0938   6.74 <0.0001 
 sroh=4_Fair      0.8114 0.1046   7.76 <0.0001 
 sroh=5_Poor      0.9657 0.1312   7.36 <0.0001 
 usual_care      -0.3636 0.1100  -3.31 0.0009  
 

3.7 Stepwise Selection via step, applied to the glm fit

step(m_1)
Start:  AIC=8214.37
avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 + sroh + 
    usual_care

             Df Deviance    AIC
- bmi         1   8194.8 8212.8
<none>            8194.4 8214.4
- alcohol30   1   8197.0 8215.0
- smoke_100   1   8199.6 8217.6
- usual_care  1   8204.8 8222.8
- sroh        4   8280.5 8292.5
- mental31    1   8313.6 8331.6

Step:  AIC=8212.8
avoid_care ~ alcohol30 + mental31 + smoke_100 + sroh + usual_care

             Df Deviance    AIC
<none>            8194.8 8212.8
- alcohol30   1   8197.3 8213.3
- smoke_100   1   8199.9 8215.9
- usual_care  1   8205.1 8221.1
- sroh        4   8288.7 8298.7
- mental31    1   8314.3 8330.3

Call:  glm(formula = avoid_care ~ alcohol30 + mental31 + smoke_100 + 
    sroh + usual_care, family = binomial(), data = omas17ne3_imp1)

Coefficients:
   (Intercept)       alcohol30        mental31       smoke_100  
     -1.683776        0.005651        0.043766        0.127628  
sroh2_VeryGood      sroh3_Good      sroh4_Fair      sroh5_Poor  
      0.396263        0.643251        0.825539        0.977939  
    usual_care  
     -0.361652  

Degrees of Freedom: 8237 Total (i.e. Null);  8229 Residual
Null Deviance:      8531 
Residual Deviance: 8195     AIC: 8213

The suggested model includes everything except bmi.

3.8 Stepwise Selection via validate, applied to the lrm fit

By default, this uses AIC, but we could instead use p values.

validate(m_1a, bw = TRUE, B = 10)

        Backwards Step-down - Original Model

 Deleted   Chi-Sq d.f. P      Residual d.f. P      AIC  
 bmi       0.43   1    0.5107 0.43     1    0.5107 -1.57
 alcohol30 2.52   1    0.1122 2.96     2    0.2282 -1.04

Approximate Estimates after Deleting Factors

                    Coef     S.E.  Wald Z         P
Intercept       -1.65322 0.125492 -13.174 0.000e+00
mental31         0.04385 0.003963  11.067 0.000e+00
smoke_100        0.13768 0.056265   2.447 1.441e-02
sroh=2_VeryGood  0.39620 0.091975   4.308 1.649e-05
sroh=3_Good      0.63350 0.092196   6.871 6.368e-12
sroh=4_Fair      0.80887 0.101773   7.948 1.887e-15
sroh=5_Poor      0.95830 0.129290   7.412 1.243e-13
usual_care      -0.36416 0.109960  -3.312 9.272e-04

Factors in Final Model

[1] mental31   smoke_100  sroh       usual_care
          index.orig training    test optimism index.corrected  n
Dxy           0.2670   0.2766  0.2653   0.0113          0.2557 10
R2            0.0616   0.0642  0.0602   0.0040          0.0575 10
Intercept     0.0000   0.0000 -0.0264   0.0264         -0.0264 10
Slope         1.0000   1.0000  0.9735   0.0265          0.9735 10
Emax          0.0000   0.0000  0.0105   0.0105          0.0105 10
D             0.0404   0.0421  0.0395   0.0027          0.0378 10
U            -0.0002  -0.0002  0.0000  -0.0002          0.0000 10
Q             0.0407   0.0424  0.0395   0.0029          0.0378 10
B             0.1603   0.1594  0.1605  -0.0011          0.1614 10
g             0.4793   0.4998  0.4834   0.0163          0.4630 10
gp            0.0825   0.0847  0.0824   0.0024          0.0801 10

Factors Retained in Backwards Elimination

 alcohol30 bmi mental31 smoke_100 sroh usual_care
 *             *                  *    *         
 *         *   *        *         *    *         
 *             *        *         *    *         
 *             *                  *    *         
 *         *   *        *         *    *         
               *        *         *    *         
 *             *        *         *    *         
 *             *                  *    *         
 *             *        *         *    *         
               *                  *    *         

Frequencies of Numbers of Factors Retained

3 4 5 6 
1 4 3 2 

So the bootstrap step-down is recommending a model with everything except bmi and alcohol30.
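
As noted above, we could instead base the step-down on p values rather than AIC; a sketch (output not shown) of what that call might look like:

# a sketch: backwards elimination using p-values (keep terms with p < 0.05)
validate(m_1a, bw = TRUE, rule = "p", sl = 0.05, B = 10)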

3.9 Comparison of Models via Cross-Validation

Let’s consider four models:

Model  Variables
KS     alcohol30 + bmi + mental31 + smoke_100 + sroh + usual_care
SW1    alcohol30 + mental31 + smoke_100 + sroh + usual_care
SW2    mental31 + smoke_100 + sroh + usual_care
SM     smoke_100 + sroh
mod_fit_ks <- glm(avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 + sroh + usual_care, data = omas17ne3_imp1, family = binomial())

mod_fit_sw1 <- glm(avoid_care ~ alcohol30 + mental31 + smoke_100 + sroh + usual_care, data = omas17ne3_imp1, family = binomial())

mod_fit_sw2 <- glm(avoid_care ~ mental31 + smoke_100 + sroh + usual_care, data = omas17ne3_imp1, family = binomial())

mod_fit_sm <- glm(avoid_care ~ smoke_100 + sroh, data = omas17ne3_imp1, family = binomial())

3.9.1 Using 10-fold Cross-Validation and a Confusion Matrix

3.9.1.1 Kitchen Sink Model

Train <- createDataPartition(omas17ne3_imp1$avoid_care, p=0.7, list=FALSE)
training <- omas17ne3_imp1[ Train, ]
testing <- omas17ne3_imp1[ -Train, ]

ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     savePredictions = TRUE)

fit_ks <- train(as.factor(avoid_care) ~ 
                    alcohol30 + bmi + mental31 + 
                    smoke_100 + sroh + usual_care, 
                data = omas17ne3_imp1, method = "glm", 
                family = "binomial",
                trControl = ctrl, tuneLength = 5)

pred_ks <- predict(fit_ks, newdata = testing)
confusionMatrix(data = pred_ks, factor(testing$avoid_care))
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1912  483
         1   38   38
                                          
               Accuracy : 0.7892          
                 95% CI : (0.7725, 0.8051)
    No Information Rate : 0.7892          
    P-Value [Acc > NIR] : 0.5117          
                                          
                  Kappa : 0.0778          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.98051         
            Specificity : 0.07294         
         Pos Pred Value : 0.79833         
         Neg Pred Value : 0.50000         
             Prevalence : 0.78915         
         Detection Rate : 0.77378         
   Detection Prevalence : 0.96924         
      Balanced Accuracy : 0.52672         
                                          
       'Positive' Class : 0               
                                          

3.9.1.2 Stepwise 1 Model

ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     savePredictions = TRUE)

fit_sw1 <- train(as.factor(avoid_care) ~ 
                    alcohol30 + mental31 + 
                    smoke_100 + sroh + usual_care, 
                data = omas17ne3_imp1, method = "glm", 
                family = "binomial",
                trControl = ctrl, tuneLength = 5)

pred_sw1 <- predict(fit_sw1, newdata = testing)
confusionMatrix(data = pred_sw1, factor(testing$avoid_care))
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1912  483
         1   38   38
                                          
               Accuracy : 0.7892          
                 95% CI : (0.7725, 0.8051)
    No Information Rate : 0.7892          
    P-Value [Acc > NIR] : 0.5117          
                                          
                  Kappa : 0.0778          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.98051         
            Specificity : 0.07294         
         Pos Pred Value : 0.79833         
         Neg Pred Value : 0.50000         
             Prevalence : 0.78915         
         Detection Rate : 0.77378         
   Detection Prevalence : 0.96924         
      Balanced Accuracy : 0.52672         
                                          
       'Positive' Class : 0               
                                          

3.9.1.3 Stepwise 2 Model

ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     savePredictions = TRUE)

fit_sw2 <- train(as.factor(avoid_care) ~ 
                    mental31 + smoke_100 + 
                     sroh + usual_care, 
                data = omas17ne3_imp1, method = "glm", 
                family = "binomial",
                trControl = ctrl, tuneLength = 5)

pred_sw2 <- predict(fit_sw2, newdata = testing)
confusionMatrix(data = pred_sw2, factor(testing$avoid_care))
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1913  483
         1   37   38
                                          
               Accuracy : 0.7896          
                 95% CI : (0.7729, 0.8055)
    No Information Rate : 0.7892          
    P-Value [Acc > NIR] : 0.4921          
                                          
                  Kappa : 0.0786          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.98103         
            Specificity : 0.07294         
         Pos Pred Value : 0.79841         
         Neg Pred Value : 0.50667         
             Prevalence : 0.78915         
         Detection Rate : 0.77418         
   Detection Prevalence : 0.96965         
      Balanced Accuracy : 0.52698         
                                          
       'Positive' Class : 0               
                                          

3.9.1.4 Small Model

ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     savePredictions = TRUE)

fit_sm <- train(as.factor(avoid_care) ~ 
                    smoke_100 + sroh,
                data = omas17ne3_imp1, method = "glm", 
                family = "binomial",
                trControl = ctrl, tuneLength = 5)

pred_sm <- predict(fit_sm, newdata = testing)
confusionMatrix(data = pred_sm, factor(testing$avoid_care))
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1950  521
         1    0    0
                                          
               Accuracy : 0.7892          
                 95% CI : (0.7725, 0.8051)
    No Information Rate : 0.7892          
    P-Value [Acc > NIR] : 0.5117          
                                          
                  Kappa : 0               
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 1.0000          
            Specificity : 0.0000          
         Pos Pred Value : 0.7892          
         Neg Pred Value :    NaN          
             Prevalence : 0.7892          
         Detection Rate : 0.7892          
   Detection Prevalence : 1.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : 0               
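
Another way to compare the four fits is through their cross-validated resampling results, which caret’s resamples can collect. For a strict comparison we would want each train call to use the same resampling indices (for example, a shared index in trainControl); with that caveat, a sketch:

# a sketch: collect cross-validated Accuracy and Kappa for the four fitted models
cv_results <- resamples(list(KS = fit_ks, SW1 = fit_sw1,
                             SW2 = fit_sw2, SM = fit_sm))
summary(cv_results)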
                                          

3.10 Spearman \(\rho^2\) plot

plot(spearman2(avoid_care ~ alcohol30 + bmi + mental31 +
                   smoke_100 + sroh + usual_care, 
               data = omas17ne3))
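
The plot ranks the candidate predictors by adjusted Spearman \(\rho^2\), which suggests where to spend extra degrees of freedom (Model 4 below spends them on mental31). Printing the spearman2 object gives the same ranking numerically; a sketch:

# a sketch: the adjusted Spearman rho-square values behind the plot above
spearman2(avoid_care ~ alcohol30 + bmi + mental31 +
              smoke_100 + sroh + usual_care,
          data = omas17ne3)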

3.11 Model 4

m_4 <- lrm(avoid_care ~ alcohol30 + bmi + rcs(mental31, 5) +
                   smoke_100 + sroh + sroh %ia% mental31 + usual_care, 
            data = omas17ne3_imp1, 
            x = TRUE, y = TRUE)

m_4
Logistic Regression Model
 
 lrm(formula = avoid_care ~ alcohol30 + bmi + rcs(mental31, 5) + 
     smoke_100 + sroh + sroh %ia% mental31 + usual_care, data = omas17ne3_imp1, 
     x = TRUE, y = TRUE)
 
                       Model Likelihood     Discrimination    Rank Discrim.    
                          Ratio Test           Indexes           Indexes       
 Obs          8238    LR chi2     423.97    R2       0.078    C       0.648    
  0           6484    d.f.            16    g        0.558    Dxy     0.295    
  1           1754    Pr(> chi2) <0.0001    gr       1.747    gamma   0.296    
 max |deriv| 4e-07                          gp       0.097    tau-a   0.099    
                                            Brier    0.158                     
 
                            Coef     S.E.    Wald Z Pr(>|Z|)
 Intercept                   -1.8711  0.1672 -11.19 <0.0001 
 alcohol30                    0.0064  0.0036   1.77 0.0763  
 bmi                          0.0035  0.0041   0.84 0.3985  
 mental31                     0.4218  0.0610   6.91 <0.0001 
 mental31'                  -27.7278  8.5309  -3.25 0.0012  
 mental31''                  37.2586 12.0383   3.10 0.0020  
 mental31'''                 -9.6384  3.7448  -2.57 0.0101  
 smoke_100                    0.1240  0.0571   2.17 0.0298  
 sroh=2_VeryGood              0.3853  0.0956   4.03 <0.0001 
 sroh=3_Good                  0.6101  0.0977   6.24 <0.0001 
 sroh=4_Fair                  0.7900  0.1112   7.11 <0.0001 
 sroh=5_Poor                  0.9942  0.1503   6.61 <0.0001 
 sroh=2_VeryGood * mental31  -0.0200  0.0228  -0.88 0.3789  
 sroh=3_Good * mental31      -0.0369  0.0203  -1.82 0.0686  
 sroh=4_Fair * mental31      -0.0471  0.0198  -2.37 0.0176  
 sroh=5_Poor * mental31      -0.0493  0.0203  -2.42 0.0155  
 usual_care                  -0.3346  0.1114  -3.01 0.0027  
 

3.11.1 Plot Effects for Model 4

plot(summary(m_4))

3.11.2 validate for model 4

validate(m_4)
          index.orig training    test optimism index.corrected  n
Dxy           0.2954   0.3003  0.2908   0.0096          0.2858 40
R2            0.0778   0.0821  0.0752   0.0069          0.0709 40
Intercept     0.0000   0.0000 -0.0596   0.0596         -0.0596 40
Slope         1.0000   1.0000  0.9562   0.0438          0.9562 40
Emax          0.0000   0.0000  0.0208   0.0208          0.0208 40
D             0.0513   0.0544  0.0496   0.0048          0.0466 40
U            -0.0002  -0.0002  0.0001  -0.0004          0.0001 40
Q             0.0516   0.0546  0.0494   0.0052          0.0464 40
B             0.1582   0.1581  0.1586  -0.0005          0.1587 40
g             0.5579   0.5716  0.5463   0.0253          0.5326 40
gp            0.0966   0.0990  0.0945   0.0045          0.0921 40

3.11.3 What exactly does calibrate do here?

By default, calibrate uses bootstrap resampling (B = 40 replications) to estimate an overfitting-corrected calibration curve, comparing the model’s predicted probabilities of avoiding or delaying care to the observed proportions.

plot(calibrate(m_4))


n=8238   Mean absolute error=0.003   Mean squared error=3e-05
0.9 Quantile of absolute error=0.006

3.12 Fitting multiple imputations

set.seed(4322019)

imp_model2 <- aregImpute( 
    ~ avoid_care + alcohol30 + bmi + mental31 + 
        smoke_100 + sroh + usual_care, 
    data = omas17ne3, 
    n.impute = 10, pr = FALSE, x = TRUE)

imp_model2

Multiple Imputation using Bootstrap and PMM

aregImpute(formula = ~avoid_care + alcohol30 + bmi + mental31 + 
    smoke_100 + sroh + usual_care, data = omas17ne3, n.impute = 10, 
    x = TRUE, pr = FALSE)

n: 8238     p: 7    Imputations: 10     nk: 3 

Number of NAs:
avoid_care  alcohol30        bmi   mental31  smoke_100       sroh 
         0         39         21         95         20          0 
usual_care 
        83 

           type d.f.
avoid_care    l    1
alcohol30     s    2
bmi           s    2
mental31      s    2
smoke_100     l    1
sroh          c    4
usual_care    l    1

Transformation of Target Variables Forced to be Linear

R-squares for Predicting Non-Missing Values for Each Variable
Using Last Imputations of Predictors
 alcohol30        bmi   mental31  smoke_100 usual_care 
     0.043      0.082      0.137      0.053      0.011 
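
If we want to inspect the imputed values themselves, the aregImpute object keeps them in its imputed component, one matrix per variable with a column for each of the 10 imputations. A sketch:

# a sketch: the 10 imputed values for the first few subjects missing bmi
head(imp_model2$imputed$bmi)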

3.13 Fitting Model 4 after multiple imputation

model4_imp <- fit.mult.impute(
    avoid_care ~ alcohol30 + bmi + rcs(mental31, 5) +
        smoke_100 + sroh + sroh %ia% mental31 + 
        usual_care, 
    fitter = lrm, xtrans = imp_model2, data = omas17ne3,
    x = TRUE, y = TRUE)

Variance Inflation Factors Due to Imputation:

                 Intercept                  alcohol30 
                      1.01                       1.00 
                       bmi                   mental31 
                      1.01                       1.01 
                 mental31'                 mental31'' 
                      1.01                       1.01 
               mental31'''                  smoke_100 
                      1.01                       1.00 
           sroh=2_VeryGood                sroh=3_Good 
                      1.00                       1.00 
               sroh=4_Fair                sroh=5_Poor 
                      1.00                       1.01 
sroh=2_VeryGood * mental31     sroh=3_Good * mental31 
                      1.01                       1.00 
    sroh=4_Fair * mental31     sroh=5_Poor * mental31 
                      1.00                       1.01 
                usual_care 
                      1.01 

Rate of Missing Information:

                 Intercept                  alcohol30 
                      0.01                       0.00 
                       bmi                   mental31 
                      0.01                       0.01 
                 mental31'                 mental31'' 
                      0.01                       0.01 
               mental31'''                  smoke_100 
                      0.01                       0.00 
           sroh=2_VeryGood                sroh=3_Good 
                      0.00                       0.00 
               sroh=4_Fair                sroh=5_Poor 
                      0.00                       0.01 
sroh=2_VeryGood * mental31     sroh=3_Good * mental31 
                      0.01                       0.00 
    sroh=4_Fair * mental31     sroh=5_Poor * mental31 
                      0.00                       0.01 
                usual_care 
                      0.01 

d.f. for t-distribution for Tests of Single Coefficients:

                 Intercept                  alcohol30 
                 267324.77                  392881.57 
                       bmi                   mental31 
                 261108.15                   82481.27 
                 mental31'                 mental31'' 
                 168976.74                  165854.59 
               mental31'''                  smoke_100 
                 141799.38                 2757665.91 
           sroh=2_VeryGood                sroh=3_Good 
              116626888.80                 8057346.40 
               sroh=4_Fair                sroh=5_Poor 
                 737600.24                  115514.63 
sroh=2_VeryGood * mental31     sroh=3_Good * mental31 
                 310629.71                  782491.92 
    sroh=4_Fair * mental31     sroh=5_Poor * mental31 
                1315928.71                   87764.24 
                usual_care 
                  51974.88 

The following fit components were averaged over the 10 model fits:

  stats linear.predictors 
model4_imp
Logistic Regression Model
 
 fit.mult.impute(formula = avoid_care ~ alcohol30 + bmi + rcs(mental31, 
     5) + smoke_100 + sroh + sroh %ia% mental31 + usual_care, 
     fitter = lrm, xtrans = imp_model2, data = omas17ne3, x = TRUE, 
     y = TRUE)
 
                       Model Likelihood     Discrimination    Rank Discrim.    
                          Ratio Test           Indexes           Indexes       
 Obs          8238    LR chi2     435.14    R2       0.080    C       0.649    
  0           6484    d.f.            16    g        0.563    Dxy     0.298    
  1           1754    Pr(> chi2) <0.0001    gr       1.756    gamma   0.298    
 max |deriv| 5e-07                          gp       0.098    tau-a   0.100    
                                            Brier    0.158                     
 
                            Coef     S.E.    Wald Z Pr(>|Z|)
 Intercept                   -1.8670  0.1673 -11.16 <0.0001 
 alcohol30                    0.0066  0.0036   1.83 0.0670  
 bmi                          0.0033  0.0041   0.79 0.4279  
 mental31                     0.4430  0.0621   7.13 <0.0001 
 mental31'                  -27.9376  8.3836  -3.33 0.0009  
 mental31''                  37.2457 11.7633   3.17 0.0015  
 mental31'''                 -9.3110  3.5687  -2.61 0.0091  
 smoke_100                    0.1283  0.0572   2.24 0.0248  
 sroh=2_VeryGood              0.3852  0.0957   4.02 <0.0001 
 sroh=3_Good                  0.6147  0.0978   6.28 <0.0001 
 sroh=4_Fair                  0.8003  0.1112   7.19 <0.0001 
 sroh=5_Poor                  1.0189  0.1494   6.82 <0.0001 
 sroh=2_VeryGood * mental31  -0.0202  0.0227  -0.89 0.3745  
 sroh=3_Good * mental31      -0.0357  0.0202  -1.77 0.0771  
 sroh=4_Fair * mental31      -0.0460  0.0198  -2.33 0.0200  
 sroh=5_Poor * mental31      -0.0483  0.0203  -2.38 0.0172  
 usual_care                  -0.3414  0.1116  -3.06 0.0022  
 
plot(summary(model4_imp))

plot(calibrate(model4_imp))


n=8238   Mean absolute error=0.005   Mean squared error=5e-05
0.9 Quantile of absolute error=0.009
validate(model4_imp)
          index.orig training    test optimism index.corrected  n
Dxy           0.2972   0.3020  0.2935   0.0086          0.2886 40
R2            0.0798   0.0835  0.0772   0.0063          0.0735 40
Intercept     0.0000   0.0000 -0.0401   0.0401         -0.0401 40
Slope         1.0000   1.0000  0.9627   0.0373          0.9627 40
Emax          0.0000   0.0000  0.0155   0.0155          0.0155 40
D             0.0527   0.0551  0.0510   0.0042          0.0485 40
U            -0.0002  -0.0002  0.0000  -0.0003          0.0000 40
Q             0.0529   0.0554  0.0509   0.0045          0.0485 40
B             0.1579   0.1568  0.1583  -0.0015          0.1594 40
g             0.5631   0.5701  0.5471   0.0230          0.5401 40
gp            0.0976   0.0987  0.0952   0.0035          0.0941 40
plot(nomogram(model4_imp))
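
Since the datadist d for omas17ne3 is already in place, we can also plot predicted probabilities of avoiding or delaying care from the multiply-imputed fit, with the other predictors held at their adjust-to values. A sketch:

# a sketch: Pr(avoid/delay care) as a function of mental31, by sroh category
ggplot(Predict(model4_imp, mental31, sroh, fun = plogis))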