library(here); library(haven); library(janitor)
library(naniar); library(broom); library(rms)
library(leaps); library(skimr); library(simputation)
library(readxl); library(knitr); library(kableExtra)
library(caret); library(modelr); library(tidyverse)
skim_with(numeric = list(hist = NULL),
integer = list(hist = NULL))
omas17_raw <- read_sas(here("data", "omas_2017_puf.sas7bdat"))
dim(omas17_raw)
[1] 39711 370
We’re going to restrict the raw data set to people who meet the following criteria:
A1
is 1)B4A
is 1)B4B
is 1)B4C
is 1)B4E
is 1)D30BINC
)D30A_UNIT
is 1) is between 60 and 450 (D30A_VALUE
)omas17_cleaning <- omas17_raw %>% clean_names() %>%
# inclusion and exclusion criteria
filter(a1 == 1,
b4a == 1 | b4b == 1 | b4c == 1 | b4e == 1,
d30binc > 47 & d30binc < 84,
d30a_unit == 1,
d30a_value > 59 & d30a_value < 450)
Next, we’ll make some name changes, and calculate BMI from Weight and Height (but changing to NA any values that fall outside of the 16-80 range), then select the variables we want to study.
omas17_cleaning <- omas17_cleaning %>%
# name changes and minor transformations
mutate(region = s9_region,
county_type = s9_type_imp,
sroh = d30_imp,
age = age_a_imp,
sex = s15,
race_eth = race5_a_imp,
usual_care = usual_a,
ins_employer = b4a,
ins_medicare = b4b,
ins_medicaid = b4c,
ins_private = b4e,
mental31 = d30i,
education = educ_imp,
poverty = fpl_cat_18,
ht_in = d30binc,
wt_lb = d30a_value,
smoke_100 = d45,
alcohol30 = d46,
er_visits = e62,
job_today = g71)
NA
with naniar
Now, we tell R that the 97
, 98
and 99
values are all in fact missing values, except for those we seein the wt_lb
variable. We’ll use the replace_with_na_at
function from the naniar
package.
omas17_cleaning <- omas17_cleaning %>%
replace_with_na_at(.vars = c("alcohol30", "avoid_care",
"ins_employer", "ins_medicare",
"ins_medicaid", "ins_private",
"er_visits", "food_12mo",
"job_today", "marital",
"mental31", "sex",
"smoke_100", "usual_care"),
condition = ~.x %in% c(97, 98, 99))
Now, we convert the categorical variables to meaningful factors or to 1/0 numeric variables.
omas17_cleaning <- omas17_cleaning %>%
mutate(region = fct_recode(factor(region),
"North_Central" = "1",
"North_East" = "2",
"North_East_Central" = "3",
"North_West" = "4",
"South_Central" = "5",
"South_East" = "6",
"South_West" = "7"),
county_type = fct_recode(factor(county_type),
"Rural_Appalachian" = "1",
"Metro" = "2",
"Rural_Non-Appalachian" = "3",
"Suburban" = "4"),
age = fct_recode(factor(age),
"19-24" = "1",
"25-34" = "2",
"35-44" = "3",
"45-54" = "4",
"55-64" = "5",
"65+" = "6"),
sex = fct_recode(factor(sex),
"M" = "1",
"F" = "2"),
race_eth = fct_recode(factor(race_eth),
"White" = "1",
"Black/African-American" = "2",
"Hispanic" = "3",
"Asian" = "4",
"Other" = "5"),
education = fct_recode(factor(education),
"1_Up To HS but no diploma" = "1",
"2_HS Graduate or equivalent" = "2",
"3_Some College" = "3",
"4_Associate Degree" = "4",
"5_Four Year College Graduate" = "5",
"6_Advanced Degree" = "6"),
marital = fct_recode(factor(marital),
"Married" = "1",
"Divorced/Separated" = "2",
"Widowed" = "3",
"Never Married" = "4",
"Unmarried Couple" = "5"),
poverty = fct_recode(factor(poverty),
"0.75 FPL or less" = "1",
"0.75 - 1.00 FPL" = "2",
"1.00 - 1.38 FPL" = "3",
"1.38 - 2.06 FPL" = "4",
"2.06 - 2.50 FPL" = "5",
"2.50 - 4.00 FPL" = "6",
"4.00 FPL or more" = "7"),
food_12mo = fct_recode(factor(food_12mo),
"1_Easier Now" = "1",
"3_Harder Now" = "2",
"2_Stayed Same" = "3",
NULL = "4"),
sroh = fct_recode(factor(sroh),
"1_Excellent" = "1",
"2_VeryGood" = "2",
"3_Good" = "3",
"4_Fair" = "4",
"5_Poor" = "5"),
smoke_stat = fct_recode(factor(smoke_stat),
"Never" = "1",
"Former" = "2",
"Current" = "3",
NULL = "4"),
# convert 1 = Yes, 2 = No to 1 = Yes, 0 = No
job_today = 2 - job_today,
smoke_100 = 2 - smoke_100,
usual_care = 2 - usual_care,
avoid_care = 2 - avoid_care,
ins_employer = 2 - ins_employer,
ins_medicare = 2 - ins_medicare,
ins_medicaid = 2 - ins_medicaid,
ins_private = 2 - ins_private)
Next, we’ll calculate BMI from Weight and Height (but changing to NA any values that fall outside of the 16-80 range), then we’ll assign everyone to a primary insurance type as follows:
ins_type
)omas17_cleaning <- omas17_cleaning %>%
mutate(bmi = 703 * wt_lb/(ht_in^2)) %>%
# restrict BMI to 16.0 through 80.0
mutate(bmi = ifelse(bmi < 16 | bmi > 80, NA, bmi)) %>%
mutate(ins_type = case_when(
ins_medicaid == 1 ~ "Medicaid",
ins_medicare == 1 ~ "Medicare",
ins_employer == 1 ~ "Employer",
ins_private == 1 ~ "Private",
TRUE ~ NA_character_),
ins_type = fct_relevel(factor(ins_type),
"Medicaid", "Medicare",
"Employer", "Private"))
Finally, we select the variables we want to study.
omas17_cleaned <- omas17_cleaning %>%
select(caseid, region, county_type, avoid_care,
usual_care, ins_employer, ins_medicare,
ins_medicaid, ins_private, ins_type,
age, sex, race_eth, education, marital,
poverty, job_today, food_12mo,
sroh, mental31, ht_in, wt_lb, bmi,
smoke_stat, smoke_100, alcohol30, er_visits)
naniar
dim(omas17_cleaned)
[1] 32144 27
gg_miss_var(omas17_cleaned)
miss_var_summary(omas17_cleaned)
# A tibble: 27 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 food_12mo 1137 3.54
2 marital 1085 3.38
3 avoid_care 766 2.38
4 usual_care 640 1.99
5 job_today 536 1.67
6 ins_medicaid 474 1.47
7 mental31 418 1.30
8 ins_private 417 1.30
9 er_visits 346 1.08
10 region 289 0.899
# ... with 17 more rows
omas_2017_codes <- read_xlsx(
here("data", "omas_2017_codebook.xlsx"))
omas_2017_miss_summary <- miss_var_summary(omas17_cleaned)
omas_table <- left_join(omas_2017_codes, omas_2017_miss_summary)
Joining, by = "variable"
omas_table %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))
variable | type | description | source | n_miss | pct_miss |
---|---|---|---|---|---|
caseid | character | Identification Code - arbitrary | CASEID | 0 | 0.0000000 |
region | 7-level factor | North_Central, North_East, North_East_Central, North_West, South_Central, South_East, South_West | S9_REGION | 289 | 0.8990791 |
county_type | 4-level factor | Metro, Suburban, Rural Appalachian, Rural Non-Appalachian | Region | 0 | 0.0000000 |
avoid_care | 1 (Yes), 0 (No) | Did you delayed or avoid getting the health care you needed? | AVOID_CARE | 766 | 2.3830264 |
usual_care | 1 (Yes), 0 (No) | Do you have a usual source of care? | USUAL_A | 640 | 1.9910403 |
ins_employer | 1 (Yes), 0 (No) | Insurance through an employer or union? | B4A | 137 | 0.4262071 |
ins_medicare | 1 (Yes), 0 (No) | Insurance through Medicare? | B4B | 210 | 0.6533101 |
ins_medicaid | 1 (Yes), 0 (No) | Insurance through Medicaid? | B4C | 474 | 1.4746142 |
ins_private | 1 (Yes), 0 (No) | Insurance through a private plan? | B4E | 417 | 1.2972872 |
ins_type | 4-level factor | Medicaid, Medicare, Employer, Private | Calculated | 0 | 0.0000000 |
age | 6-level factor | Age category (in years): 19-24, 25-34, 35-44, 45-54, 55-64, 65+ | AGE_A_IMP | 0 | 0.0000000 |
sex | 2-level factor | F = female, M = male | S15 | 41 | 0.1275510 |
race_eth | 5-level factor | White, Black/African-American, Hispanic, Asian, Other | RACE5_A_IMP | 0 | 0.0000000 |
education | 6-level factor | Up to HS but no diploma, HS Graduate or equivalent, Some College, Associate Degree, Four Year College Graduate, Advanced Degree | EDUC_IMP | 0 | 0.0000000 |
marital | 5-level factor | Married, Divorced/Separated, Widowed, Never Married, Unmarried Couple | MARITAL | 1085 | 3.3754355 |
poverty | 7-level factor | Relation to Federal Poverty Level of Household Income: 0.75 or less, 0.75-1, 1-1.38, 1.38-2.06, 2.06 - 2.5, 2.5 - 4, 4+ | FPL_CAT_18 | 0 | 0.0000000 |
job_today | 1 (Yes), 0 (No) | Did you have a full-time or part-time job last week? | G71 | 536 | 1.6674963 |
food_12mo | 3-level factor | Is it easier now, harder now or the same as one year ago to get food you need? | FOOD_12MO | 1137 | 3.5372076 |
sroh | 5-level factor | Self-Reported Overall Health: Excellent, Very Good, Good, Fair, Poor | D30_IMP | 0 | 0.0000000 |
mental31 | quantitative | # of days in the past month where your mental health prevented you from doing normal activities | D30I | 418 | 1.3003982 |
ht_in | quantitative | height in inches | D30BINC | 0 | 0.0000000 |
wt_lb | quantitative | weight in pounds | D30A_VALUE | 0 | 0.0000000 |
bmi | quantitative | body-mass index (Calculated from Height and Weight, restricted to 16-80) | Calculated | 93 | 0.2893230 |
smoke_stat | 3-level factor | Never Smoked, Former Smoker, Current Smoker | SMOKE_STAT | 150 | 0.4666501 |
smoke_100 | 1 (Yes), 0 (No) | Have you smoked 100 cigarettes in your life? | D45 | 130 | 0.4044301 |
alcohol30 | quantitative | # of days in the past 30 where you consumed alcohol | D46 | 228 | 0.7093081 |
er_visits | quantitative | # of times in the past 12 months where you were a patient in a hospital emergency room | E62 | 346 | 1.0764062 |
skim
skim(omas17_cleaned)
Skim summary statistics
n obs: 32144
n variables: 27
-- Variable type:character --------------------------------------------------------------------
variable missing complete n min max empty n_unique
caseid 0 32144 32144 9 9 0 32144
-- Variable type:factor -----------------------------------------------------------------------
variable missing complete n n_unique
age 0 32144 32144 6
county_type 0 32144 32144 4
education 0 32144 32144 6
food_12mo 1137 31007 32144 3
ins_type 0 32144 32144 4
marital 1085 31059 32144 5
poverty 0 32144 32144 7
race_eth 0 32144 32144 5
region 289 31855 32144 7
sex 41 32103 32144 2
smoke_stat 150 31994 32144 3
sroh 0 32144 32144 5
top_counts ordered
65+: 10342, 55-: 7025, 45-: 5358, 35-: 3891 FALSE
Met: 15666, Rur: 6220, Rur: 5223, Sub: 5035 FALSE
2_H: 9810, 5_F: 6394, 3_S: 5041, 6_A: 4552 FALSE
2_S: 22532, 3_H: 5681, 1_E: 2794, NA: 1137 FALSE
Emp: 13560, Med: 9781, Med: 7167, Pri: 1636 FALSE
Mar: 15274, Nev: 5364, Div: 5115, Wid: 3320 FALSE
4.0: 10441, 2 FALSE
Whi: 24889, Bla: 4188, Oth: 1912, His: 778 FALSE
Nor: 8455, Sou: 7358, Sou: 6570, Sou: 3165 FALSE
F: 17607, M: 14496, NA: 41 FALSE
Nev: 16891, For: 8538, Cur: 6565, NA: 150 FALSE
2_V: 10059, 3_G: 9313, 1_E: 5659, 4_F: 5031 FALSE
-- Variable type:numeric ----------------------------------------------------------------------
variable missing complete n mean sd p0 p25 p50
alcohol30 228 31916 32144 4.1 7.32 0 0 1
avoid_care 766 31378 32144 0.22 0.41 0 0 0
bmi 93 32051 32144 29.11 6.84 16.01 24.39 27.89
er_visits 346 31798 32144 0.62 1.74 0 0 0
ht_in 0 32144 32144 67.02 4.32 48 64 67
ins_employer 137 32007 32144 0.55 0.5 0 0 1
ins_medicaid 474 31670 32144 0.23 0.42 0 0 0
ins_medicare 210 31934 32144 0.41 0.49 0 0 0
ins_private 417 31727 32144 0.21 0.4 0 0 0
job_today 536 31608 32144 0.53 0.5 0 0 1
mental31 418 31726 32144 1.85 6.21 0 0 0
smoke_100 130 32014 32144 0.47 0.5 0 0 0
usual_care 640 31504 32144 0.94 0.23 0 1 1
wt_lb 0 32144 32144 186.15 48.3 60 150 180
p75 p100
4 30
0 1
32.35 78.64
1 21
70 83
1 1
0 1
1 1
0 1
1 1
0 31
1 1
1 1
212 446
saveRDS(omas17_cleaned, here("data", "omas2017_clean.Rds"))
write_csv(omas17_cleaned, here("data", "omas2017_clean.csv"))
Let’s take a sample of the data, to describe the Northeast Ohio region.
omas17ne <- omas17_cleaned %>%
filter(region == "North_East")
nrow(omas17ne)
[1] 8455
HWR
)The height/weight ratio (hwr
) is usually expressed in cm / kg, rather than feet and inches. But we can convert.
\[ ht_{cm} = 2.54 * ht_{in}, wt_{kg} = 0.453592 * wt_{lb} \]
and so
\[ HWR = \frac{ht_{cm}}{wt_{kg}} \approx 5.6 \frac{ht_{in}}{wt_{lb}} \]
omas17ne <- omas17ne %>%
mutate(HWR = 5.6 * ht_in / wt_lb)
HWR
OutcomeHere’s a quick numerical summary. We have no missing values, and for now, we’ll pretend that all of these values we do see are plausible.
mosaic::favstats(~ HWR, data = omas17ne)
min Q1 median Q3 max mean sd n missing
0.8195122 1.82 2.121212 2.453244 5.255385 2.152794 0.4810418 8455 0
HWR
Outcomeggplot(omas17ne, aes(x = HWR)) +
geom_histogram(bins = 30, col = "white", fill = "navy") +
theme_bw() +
labs(x = "Height/Weight Ratio (cm/kg)",
title = "Height/Weight Ratio for the NE Ohio subjects",
subtitle = paste0("Ohio Medicaid Assessment Survey 2017, n = ", length(omas17ne$HWR), " adults"))
str(omas17ne)
Classes 'tbl_df', 'tbl' and 'data.frame': 8455 obs. of 28 variables:
$ caseid : chr "010002958" "010005008" "010006545" "010009985" ...
..- attr(*, "label")= chr "CASEID--Case ID"
$ region : Factor w/ 7 levels "North_Central",..: 2 2 2 2 2 2 2 2 2 2 ...
$ county_type : Factor w/ 4 levels "Rural_Appalachian",..: 3 1 3 4 2 3 4 3 4 3 ...
$ avoid_care : num 1 0 0 0 0 0 0 0 0 0 ...
$ usual_care : num 1 0 1 1 1 1 1 1 1 0 ...
$ ins_employer: num 1 0 0 0 0 0 0 1 0 0 ...
$ ins_medicare: num 0 0 0 0 0 1 1 0 0 1 ...
$ ins_medicaid: num 0 1 1 1 1 0 0 0 0 1 ...
$ ins_private : num 1 0 0 0 0 0 1 0 1 0 ...
$ ins_type : Factor w/ 4 levels "Medicaid","Medicare",..: 3 1 1 1 1 2 2 3 4 1 ...
$ age : Factor w/ 6 levels "19-24","25-34",..: 1 5 1 5 1 6 6 4 2 4 ...
$ sex : Factor w/ 2 levels "M","F": 1 2 2 2 1 2 2 1 1 1 ...
$ race_eth : Factor w/ 5 levels "White","Black/African-American",..: 1 1 1 1 2 1 1 1 1 2 ...
$ education : Factor w/ 6 levels "1_Up To HS but no diploma",..: 5 2 2 1 2 2 6 5 5 2 ...
$ marital : Factor w/ 5 levels "Married","Divorced/Separated",..: 5 5 4 2 2 1 1 1 5 4 ...
$ poverty : Factor w/ 7 levels "0.75 FPL or less",..: 7 1 1 2 1 4 4 7 7 5 ...
$ job_today : num 1 0 1 0 1 0 0 1 1 0 ...
$ food_12mo : Factor w/ 3 levels "1_Easier Now",..: 1 3 3 3 3 3 3 2 3 2 ...
$ sroh : Factor w/ 5 levels "1_Excellent",..: 1 3 4 5 4 1 1 2 1 4 ...
$ mental31 : num 0 0 0 0 4 0 0 0 0 NA ...
$ ht_in : num 75 61 52 61 65 66 60 72 71 72 ...
..- attr(*, "label")= chr "D30BINC--CALCULATE NUMBER OF INCHES"
..- attr(*, "format.sas")= chr "NMISS_F"
$ wt_lb : num 200 149 170 165 125 183 129 215 175 164 ...
..- attr(*, "label")= chr "D30A_VALUE--Weight without shoes"
..- attr(*, "format.sas")= chr "D30A_VALUE_F"
$ bmi : num 25 28.2 44.2 31.2 20.8 ...
$ smoke_stat : Factor w/ 3 levels "Never","Former",..: 1 2 3 3 3 2 1 1 1 3 ...
$ smoke_100 : num 0 1 1 1 1 1 0 0 0 1 ...
$ alcohol30 : num 19 2 0 0 15 0 25 7 10 0 ...
$ er_visits : num 0 0 1 NA 4 0 0 0 0 0 ...
$ HWR : num 2.1 2.29 1.71 2.07 2.91 ...
..- attr(*, "label")= chr "D30BINC--CALCULATE NUMBER OF INCHES"
..- attr(*, "format.sas")= chr "NMISS_F"
regsubsets
with a multi-categorical predictorIn Approach A, we treat ins_type
as a factor, and use the formula interface, rather than cbind
.
hwr_out_A <- regsubsets(
HWR ~ alcohol30 + mental31 + sex + ins_type,
data = omas17ne, nvmax = NULL, nbest = 1)
hwr_summ_A <- summary(hwr_out_A)
hwr_summ_A$which # another option is outmat here
(Intercept) alcohol30 mental31 sexF ins_typeMedicare ins_typeEmployer
1 TRUE FALSE FALSE TRUE FALSE FALSE
2 TRUE TRUE FALSE TRUE FALSE FALSE
3 TRUE TRUE TRUE TRUE FALSE FALSE
4 TRUE TRUE TRUE TRUE FALSE FALSE
5 TRUE TRUE TRUE TRUE TRUE FALSE
6 TRUE TRUE TRUE TRUE TRUE TRUE
ins_typePrivate
1 FALSE
2 FALSE
3 FALSE
4 TRUE
5 TRUE
6 TRUE
hwr_summ_A$adjr2
[1] 0.04804233 0.05644289 0.05787055 0.05876998 0.05948750 0.05976641
The result is problematic.
ins_type
components?If we use the cbind
approach, one “advantage” is that we get an error if we have missing data.
hwr_preds_B <-
with(omas17ne,
cbind(alcohol30, mental31, sex, ins_type))
hwr_out_B <- regsubsets(x = hwr_preds_B,
y = omas17ne$HWR,
nvmax = NULL, nbest = 1)
If you try to run this you will get an error:
Error in leaps.setup(x, y, wt = weights, nbest = nbest, nvmax = nvmax, : NA/NaN/Inf in foreign function call (arg 4)
Aha! So we would need to either impute some of those values, or else do a complete case analysis.
Let’s try the cbind
approach after we restrict ourselves to complete cases.
omas17ne_CC <- omas17ne %>%
filter(complete.cases(caseid, HWR,
alcohol30, mental31,
sex, ins_type))
nrow(omas17ne_CC); nrow(omas17ne)
[1] 8273
[1] 8455
So we lose 182 rows due to missing data.
hwr_preds_CC <-
with(omas17ne_CC, cbind(alcohol30, mental31, sex, ins_type))
nrow(omas17ne_CC)
[1] 8273
hwr_out_CC <- regsubsets(x = hwr_preds_CC,
y = omas17ne_CC$HWR,
nvmax = NULL, nbest = 1)
hwr_summ_CC <- summary(hwr_out_CC)
hwr_summ_CC$outmat # another option is which here
alcohol30 mental31 sex ins_type
1 ( 1 ) " " " " "*" " "
2 ( 1 ) "*" " " "*" " "
3 ( 1 ) "*" "*" "*" " "
4 ( 1 ) "*" "*" "*" "*"
hwr_summ_CC$adjr2
[1] 0.04804233 0.05644289 0.05787055 0.05854058
But, as it turns out, this is just the same as treating ins_type
as if it were numeric! Which is kind of problematic, too!
head(hwr_preds_CC)
alcohol30 mental31 sex ins_type
[1,] 19 0 1 3
[2,] 2 0 2 1
[3,] 0 0 2 1
[4,] 0 0 2 1
[5,] 15 4 1 1
[6,] 0 0 2 2
omas17ne %>%
mutate(ins_num = as.numeric(ins_type)) %>%
tabyl(ins_type, ins_num)
ins_type 1 2 3 4
Medicaid 2029 0 0 0
Medicare 0 2519 0 0
Employer 0 0 3453 0
Private 0 0 0 454
The formula approach hides two things:
hwr_out_D <- regsubsets(
HWR ~ alcohol30 + mental31 + sex + as.numeric(ins_type),
data = omas17ne, nvmax = NULL, nbest = 1)
hwr_summ_D <- summary(hwr_out_D)
hwr_summ_D$outmat
alcohol30 mental31 sexF as.numeric(ins_type)
1 ( 1 ) " " " " "*" " "
2 ( 1 ) "*" " " "*" " "
3 ( 1 ) "*" "*" "*" " "
4 ( 1 ) "*" "*" "*" "*"
hwr_summ_D$adjr2
[1] 0.04804233 0.05644289 0.05787055 0.05854058
First impute, then use formula approach (still have the problem with multiple-category variables)
How much missingness is there in these variables?
miss_var_summary(omas17ne %>%
select(alcohol30, mental31, sex,
ins_type, HWR))
# A tibble: 5 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 mental31 103 1.22
2 alcohol30 75 0.887
3 sex 9 0.106
4 ins_type 0 0
5 HWR 0 0
set.seed(20190305)
omas17ne_imp1 <- omas17ne %>%
impute_cart(sex ~ ins_type + HWR) %>%
impute_rlm(alcohol30 ~ sex + ins_type + HWR) %>%
impute_rlm(mental31 ~
alcohol30 + sex + ins_type + HWR)
miss_var_summary(omas17ne_imp1 %>%
select(alcohol30, mental31, sex,
ins_type, HWR))
# A tibble: 5 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 alcohol30 0 0
2 mental31 0 0
3 sex 0 0
4 ins_type 0 0
5 HWR 0 0
hwr_out_E <- regsubsets(
HWR ~ alcohol30 + mental31 + sex + as.numeric(ins_type),
data = omas17ne_imp1, nvmax = NULL, nbest = 1)
hwr_summ_E <- summary(hwr_out_E)
hwr_summ_E$outmat
alcohol30 mental31 sexF as.numeric(ins_type)
1 ( 1 ) " " " " "*" " "
2 ( 1 ) "*" " " "*" " "
3 ( 1 ) "*" "*" "*" " "
4 ( 1 ) "*" "*" "*" "*"
hwr_summ_E$adjr2
[1] 0.04737830 0.05492679 0.05667810 0.05722438
Our potential predictors are:
alcohol30
(quantitative - a count)mental31
(quantitative - a count)sex
(2 levels)job_today
(2 levels)usual_care
(2 levels)smoke_100
(2 levels)smoke_stat
(3 levels)ins_type
(4 levels)sroh
(5 levels)age
(6 levels)education
(6 levels)poverty
(7 levels)omas17ne %>%
select(alcohol30, mental31, sex, job_today,
usual_care, smoke_100, smoke_stat, ins_type,
sroh, age, education, poverty, HWR) %>%
miss_var_summary()
# A tibble: 13 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 usual_care 187 2.21
2 job_today 157 1.86
3 mental31 103 1.22
4 alcohol30 75 0.887
5 smoke_stat 47 0.556
6 smoke_100 43 0.509
7 sex 9 0.106
8 ins_type 0 0
9 sroh 0 0
10 age 0 0
11 education 0 0
12 poverty 0 0
13 HWR 0 0
set.seed(20190305)
omas17ne_imp2 <- omas17ne %>%
impute_cart(sex + smoke_stat ~
ins_type + sroh + age + education +
poverty) %>%
impute_pmm(smoke_100 ~
ins_type + sroh + age + education +
poverty + sex) %>%
impute_pmm(alcohol30 + mental31 ~
ins_type + sroh + age + education +
poverty + sex + smoke_stat) %>%
impute_pmm(job_today + usual_care ~
alcohol30 + sex + smoke_stat + age +
ins_type + mental31)
omas17ne_imp2 %>%
select(alcohol30, mental31, sex, job_today,
usual_care, smoke_100, smoke_stat, ins_type,
sroh, age, education, poverty, HWR) %>%
miss_var_summary()
# A tibble: 13 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 alcohol30 0 0
2 mental31 0 0
3 sex 0 0
4 job_today 0 0
5 usual_care 0 0
6 smoke_100 0 0
7 smoke_stat 0 0
8 ins_type 0 0
9 sroh 0 0
10 age 0 0
11 education 0 0
12 poverty 0 0
13 HWR 0 0
omas17ne_imp2 %>%
select(alcohol30, mental31, sex, job_today,
usual_care, smoke_100, smoke_stat, ins_type,
sroh, age, education, poverty, HWR) %>%
skim()
Skim summary statistics
n obs: 8455
n variables: 13
-- Variable type:factor -----------------------------------------------------------------------
variable missing complete n n_unique
age 0 8455 8455 6
education 0 8455 8455 6
ins_type 0 8455 8455 4
poverty 0 8455 8455 7
sex 0 8455 8455 2
smoke_stat 0 8455 8455 3
sroh 0 8455 8455 5
top_counts ordered
65+: 2757, 55-: 1843, 45-: 1430, 35-: 956 FALSE
2_H: 2294, 5_F: 1821, 3_S: 1448, 6_A: 1322 FALSE
Emp: 3453, Med: 2519, Med: 2029, Pri: 454 FALSE
4.0: 2831, 2 FALSE
F: 4622, M: 3833, NA: 0 FALSE
Nev: 4461, For: 2322, Cur: 1672, NA: 0 FALSE
2_V: 2684, 3_G: 2361, 1_E: 1564, 4_F: 1322 FALSE
-- Variable type:numeric ----------------------------------------------------------------------
variable missing complete n mean sd p0 p25 p50 p75 p100
alcohol30 0 8455 8455 4.51 7.66 0 0 1 5 30
HWR 0 8455 8455 2.15 0.48 0.82 1.82 2.12 2.45 5.26
job_today 0 8455 8455 0.53 0.5 0 0 1 1 1
mental31 0 8455 8455 1.86 6.19 0 0 0 0 31
smoke_100 0 8455 8455 0.47 0.5 0 0 0 1 1
usual_care 0 8455 8455 0.94 0.24 0 1 1 1 1
omas17ne_imp2 <- omas17ne_imp2 %>%
mutate(age_num = as.numeric(age),
educ_num = as.numeric(education),
ins_num = as.numeric(ins_type),
pov_num = as.numeric(poverty),
smoke_num = as.numeric(smoke_stat),
sroh_num = as.numeric(sroh))
hwr_out_F <- regsubsets(
HWR ~ alcohol30 + mental31 + sex + job_today +
usual_care + smoke_100 + smoke_num + pov_num +
ins_num + sroh_num + age_num + educ_num,
data = omas17ne_imp2, nvmax = 12, nbest = 1)
hwr_summ_F <- summary(hwr_out_F)
hwr_summ_F$outmat
alcohol30 mental31 sexF job_today usual_care smoke_100 smoke_num
1 ( 1 ) " " " " "*" " " " " " " " "
2 ( 1 ) " " " " "*" " " " " " " " "
3 ( 1 ) " " " " "*" " " " " " " "*"
4 ( 1 ) " " " " "*" " " " " "*" "*"
5 ( 1 ) " " " " "*" "*" " " "*" "*"
6 ( 1 ) "*" " " "*" "*" " " "*" "*"
7 ( 1 ) "*" " " "*" "*" " " "*" "*"
8 ( 1 ) "*" " " "*" "*" " " "*" "*"
9 ( 1 ) "*" " " "*" "*" "*" "*" "*"
10 ( 1 ) "*" " " "*" "*" "*" "*" "*"
11 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
12 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
pov_num ins_num sroh_num age_num educ_num
1 ( 1 ) " " " " " " " " " "
2 ( 1 ) " " " " "*" " " " "
3 ( 1 ) " " " " "*" " " " "
4 ( 1 ) " " " " "*" " " " "
5 ( 1 ) " " " " "*" " " " "
6 ( 1 ) " " " " "*" " " " "
7 ( 1 ) " " " " "*" "*" " "
8 ( 1 ) "*" " " "*" "*" " "
9 ( 1 ) "*" " " "*" "*" " "
10 ( 1 ) "*" "*" "*" "*" " "
11 ( 1 ) "*" "*" "*" "*" " "
12 ( 1 ) "*" "*" "*" "*" "*"
regsubsets
Data includes nrow(omas17ne_imp2)
= 8455 observations, and we run models of size 2:13, when you include the intercept term here because we set nvarmax = 12
…
hwr_summ_F$aic.c <- 8455*log(hwr_summ_F$rss / 8455) + 2*(2:13) +
(2 * (2:13) * ((2:13)+1) / (8455 - (2:13) - 1))
Now, we build a tibble containing the winners:
hwr_F_win <- data_frame(
k = 2:13,
r2 = hwr_summ_F$rsq,
adjr2 = hwr_summ_F$adjr2,
cp = hwr_summ_F$cp,
aic.c = hwr_summ_F$aic.c,
bic = hwr_summ_F$bic)
Warning: `data_frame()` is deprecated, use `tibble()`.
This warning is displayed once per session.
hwr_F_win <- bind_cols(hwr_F_win, tbl_df(hwr_summ_F$which))
hwr_F_win
# A tibble: 12 x 19
k r2 adjr2 cp aic.c bic `(Intercept)` alcohol30
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl>
1 2 0.0474 0.0473 592. -12782. -392. TRUE FALSE
2 3 0.0870 0.0868 218. -13139. -742. TRUE FALSE
3 4 0.0933 0.0930 160. -13196. -792. TRUE FALSE
4 5 0.101 0.101 88.5 -13266. -855. TRUE FALSE
5 6 0.106 0.105 44.9 -13310. -892. TRUE FALSE
6 7 0.110 0.109 10.6 -13344. -919. TRUE TRUE
7 8 0.110 0.109 9.39 -13345. -913. TRUE TRUE
8 9 0.110 0.110 7.35 -13347. -908. TRUE TRUE
9 10 0.111 0.110 7.42 -13347. -901. TRUE TRUE
10 11 0.111 0.110 9.16 -13345. -892. TRUE TRUE
11 12 0.111 0.110 11.0 -13343. -883. TRUE TRUE
12 13 0.111 0.109 13 -13342. -874. TRUE TRUE
# ... with 11 more variables: mental31 <lgl>, sexF <lgl>, job_today <lgl>,
# usual_care <lgl>, smoke_100 <lgl>, smoke_num <lgl>, pov_num <lgl>,
# ins_num <lgl>, sroh_num <lgl>, age_num <lgl>, educ_num <lgl>
The models considered by regsubsets
are:
k | Model |
---|---|
2 | sex_F |
3 | + sroh_num (sroh) |
4 | + smoke_num (smoke_stat) |
5 | + smoke_100 |
6 | + job_today |
7 | + alcohol30 |
8 | + age_num (age) |
9 | + pov_num (poverty) |
10 | + usual_care |
11 | + ins_num |
12 | + mental31 |
13 | + educ_num |
hwrp1
hwrp2
hwrp3
hwrp4
Approach | Model Suggested |
---|---|
Adjusted R2 | k = 10 inputs |
Mallows’ Cp | k = 8 inputs |
Bias-Corrected AIC | k = 9 inputs |
BIC | k = 7 inputs |
The relevant models, then, suggested by regsubsets
each include:
sex_F
, sroh_num
, smoke_num
, smoke_100
, job_today
and alcohol30
k | Model |
---|---|
BIC | model above with 7 inputs, including intercept |
Cp | + age_num (age) |
AIC_c | + pov_num (poverty) |
R2 (adj) | + usual_care |
hwr_out_STEPNUM <- step(lm(
HWR ~ alcohol30 + mental31 + sex + job_today +
usual_care + smoke_100 + smoke_num + pov_num +
ins_num + sroh_num + age_num + educ_num,
data = omas17ne_imp2))
Start: AIC=-13341.55
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_100 +
smoke_num + pov_num + ins_num + sroh_num + age_num + educ_num
Df Sum of Sq RSS AIC
- educ_num 1 0.003 1739.7 -13344
- mental31 1 0.030 1739.8 -13343
- ins_num 1 0.051 1739.8 -13343
- usual_care 1 0.393 1740.1 -13342
<none> 1739.7 -13342
- pov_num 1 0.499 1740.2 -13341
- age_num 1 0.863 1740.6 -13339
- alcohol30 1 6.806 1746.5 -13310
- job_today 1 11.198 1750.9 -13289
- smoke_100 1 15.757 1755.5 -13267
- smoke_num 1 23.986 1763.7 -13228
- sroh_num 1 71.440 1811.2 -13003
- sex 1 95.911 1835.7 -12890
Step: AIC=-13343.54
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_100 +
smoke_num + pov_num + ins_num + sroh_num + age_num
Df Sum of Sq RSS AIC
- mental31 1 0.030 1739.8 -13345
- ins_num 1 0.050 1739.8 -13345
- usual_care 1 0.398 1740.1 -13344
<none> 1739.7 -13344
- pov_num 1 0.509 1740.2 -13343
- age_num 1 0.867 1740.6 -13341
- alcohol30 1 6.819 1746.6 -13312
- job_today 1 11.286 1751.0 -13291
- smoke_100 1 15.755 1755.5 -13269
- smoke_num 1 24.161 1763.9 -13229
- sroh_num 1 72.158 1811.9 -13002
- sex 1 95.920 1835.7 -12892
Step: AIC=-13345.39
HWR ~ alcohol30 + sex + job_today + usual_care + smoke_100 +
smoke_num + pov_num + ins_num + sroh_num + age_num
Df Sum of Sq RSS AIC
- ins_num 1 0.052 1739.8 -13347
- usual_care 1 0.395 1740.2 -13346
<none> 1739.8 -13345
- pov_num 1 0.527 1740.3 -13345
- age_num 1 0.840 1740.6 -13343
- alcohol30 1 6.796 1746.6 -13314
- job_today 1 11.279 1751.0 -13293
- smoke_100 1 15.725 1755.5 -13271
- smoke_num 1 24.158 1763.9 -13231
- sroh_num 1 76.644 1816.4 -12983
- sex 1 95.891 1835.7 -12894
Step: AIC=-13347.13
HWR ~ alcohol30 + sex + job_today + usual_care + smoke_100 +
smoke_num + pov_num + sroh_num + age_num
Df Sum of Sq RSS AIC
- usual_care 1 0.398 1740.2 -13347
<none> 1739.8 -13347
- pov_num 1 0.867 1740.7 -13345
- age_num 1 0.900 1740.7 -13345
- alcohol30 1 6.820 1746.6 -13316
- job_today 1 11.673 1751.5 -13293
- smoke_100 1 15.692 1755.5 -13273
- smoke_num 1 24.111 1763.9 -13233
- sroh_num 1 77.107 1816.9 -12982
- sex 1 95.887 1835.7 -12896
Step: AIC=-13347.2
HWR ~ alcohol30 + sex + job_today + smoke_100 + smoke_num + pov_num +
sroh_num + age_num
Df Sum of Sq RSS AIC
<none> 1740.2 -13347
- pov_num 1 0.833 1741.0 -13345
- age_num 1 1.003 1741.2 -13344
- alcohol30 1 6.818 1747.0 -13316
- job_today 1 11.648 1751.9 -13293
- smoke_100 1 15.787 1756.0 -13273
- smoke_num 1 24.311 1764.5 -13232
- sroh_num 1 78.245 1818.5 -12977
- sex 1 95.490 1835.7 -12898
So the model that the stepwise approach (with numeric values forced for our multi-categorical variables) selects includes these 8 variables:
sex_F
, sroh_num
, smoke_num
, smoke_100
, job_today
and alcohol30
pov_num
and age_num
hwr_out_STEPFAC <- step(lm(
HWR ~ alcohol30 + mental31 + sex + job_today +
usual_care + smoke_100 + smoke_stat + poverty +
ins_type + sroh + age + education,
data = omas17ne_imp2))
Start: AIC=-13578.71
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_100 +
smoke_stat + poverty + ins_type + sroh + age + education
Df Sum of Sq RSS AIC
- smoke_100 1 0.092 1684.1 -13580
- usual_care 1 0.289 1684.3 -13579
- mental31 1 0.363 1684.4 -13579
<none> 1684.0 -13579
- ins_type 3 1.655 1685.7 -13576
- poverty 6 2.974 1687.0 -13576
- job_today 1 2.818 1686.8 -13567
- education 5 6.771 1690.8 -13555
- alcohol30 1 6.121 1690.2 -13550
- age 5 19.845 1703.9 -13490
- smoke_stat 2 28.817 1712.8 -13439
- sroh 4 85.995 1770.0 -13166
- sex 1 100.318 1784.3 -13092
Step: AIC=-13580.25
HWR ~ alcohol30 + mental31 + sex + job_today + usual_care + smoke_stat +
poverty + ins_type + sroh + age + education
Df Sum of Sq RSS AIC
- usual_care 1 0.285 1684.4 -13581
- mental31 1 0.362 1684.5 -13580
<none> 1684.1 -13580
- ins_type 3 1.647 1685.8 -13578
- poverty 6 2.952 1687.1 -13577
- job_today 1 2.834 1687.0 -13568
- education 5 6.767 1690.9 -13556
- alcohol30 1 6.131 1690.3 -13552
- age 5 19.782 1703.9 -13492
- smoke_stat 2 30.825 1715.0 -13431
- sroh 4 85.919 1770.0 -13168
- sex 1 100.236 1784.4 -13093
Step: AIC=-13580.81
HWR ~ alcohol30 + mental31 + sex + job_today + smoke_stat + poverty +
ins_type + sroh + age + education
Df Sum of Sq RSS AIC
- mental31 1 0.353 1684.8 -13581
<none> 1684.4 -13581
- ins_type 3 1.674 1686.1 -13578
- poverty 6 2.974 1687.4 -13578
- job_today 1 2.808 1687.2 -13569
- education 5 6.718 1691.1 -13557
- alcohol30 1 6.139 1690.5 -13552
- age 5 19.928 1704.3 -13491
- smoke_stat 2 31.000 1715.4 -13431
- sroh 4 87.040 1771.5 -13163
- sex 1 99.967 1784.4 -13095
Step: AIC=-13581.04
HWR ~ alcohol30 + sex + job_today + smoke_stat + poverty + ins_type +
sroh + age + education
Df Sum of Sq RSS AIC
<none> 1684.8 -13581
- ins_type 3 1.666 1686.4 -13579
- poverty 6 2.996 1687.8 -13578
- job_today 1 2.644 1687.4 -13570
- education 5 6.738 1691.5 -13557
- alcohol30 1 6.049 1690.8 -13553
- age 5 20.055 1704.8 -13491
- smoke_stat 2 30.689 1715.5 -13432
- sroh 4 89.220 1774.0 -13153
- sex 1 99.792 1784.5 -13096
So the model that the stepwise approach (with factors) selects includes these 9 variables:
sex
, sroh
, smoke_stat
, job_today
and alcohol30
,smoke_100
poverty
and age
and education
and ins_type
Model | Inputs (besides the Intercept) |
---|---|
A | sex , sroh , smoke_stat , smoke_100 , job_today , alcohol30 |
B | model A plus age |
C | model B plus poverty |
D | model C plus usual_care |
E | sex , sroh , smoke_stat , job_today , alcohol30 , poverty , age , education , ins_type |
Note that we developed model E using the factor versions of the multi-categorical variables, but models A-D were developed using numeric versions of the multi-categorical information.
Nonetheless, we’ll use the factor versions and impute all missing predictors to assess these models for our cross-validation work.
crossv_kfold
splits the data into k exclusive partitions, and uses each partition for a test-training split.crossv_mc
generates n random partitions, holding out p% of the data for training.I usually use crossv_kfold
.
set.seed(43201)
cv_modA <- omas17ne_imp2 %>%
crossv_kfold(k = 10) %>%
mutate(model =
map(train,
~ lm(HWR ~ sex + sroh + smoke_stat +
smoke_100 + job_today +
alcohol30,
data = .)))
cv_modA_pred <- cv_modA %>%
unnest(map2(model, test, ~ augment(.x, newdata = .y)))
cv_modA_results <- cv_modA_pred %>%
summarize(Model = "Model A",
RMSE = sqrt(mean((HWR - .fitted)^2)),
MAE = mean(abs(HWR - .fitted)),
MaxError = max(abs(HWR - .fitted)))
set.seed(43202)
cv_modB <- omas17ne_imp2 %>%
crossv_kfold(k = 10) %>%
mutate(model =
map(train,
~ lm(HWR ~ sex + sroh + smoke_stat +
smoke_100 + job_today +
alcohol30 + age,
data = .)))
cv_modB_pred <- cv_modB %>%
unnest(map2(model, test, ~ augment(.x, newdata = .y)))
cv_modB_results <- cv_modB_pred %>%
summarize(Model = "Model B",
RMSE = sqrt(mean((HWR - .fitted)^2)),
MAE = mean(abs(HWR - .fitted)),
MaxError = max(abs(HWR - .fitted)))
set.seed(43203)
cv_modC <- omas17ne_imp2 %>%
crossv_kfold(k = 10) %>%
mutate(model =
map(train,
~ lm(HWR ~ sex + sroh + smoke_stat +
smoke_100 + job_today +
alcohol30 + age + poverty,
data = .)))
cv_modC_pred <- cv_modC %>%
unnest(map2(model, test, ~ augment(.x, newdata = .y)))
cv_modC_results <- cv_modC_pred %>%
summarize(Model = "Model C",
RMSE = sqrt(mean((HWR - .fitted)^2)),
MAE = mean(abs(HWR - .fitted)),
MaxError = max(abs(HWR - .fitted)))
set.seed(43204)
cv_modD <- omas17ne_imp2 %>%
crossv_kfold(k = 10) %>%
mutate(model =
map(train,
~ lm(HWR ~ sex + sroh + smoke_stat +
smoke_100 + job_today +
alcohol30 + age + poverty +
usual_care,
data = .)))
cv_modD_pred <- cv_modD %>%
unnest(map2(model, test, ~ augment(.x, newdata = .y)))
cv_modD_results <- cv_modD_pred %>%
summarize(Model = "Model D",
RMSE = sqrt(mean((HWR - .fitted)^2)),
MAE = mean(abs(HWR - .fitted)),
MaxError = max(abs(HWR - .fitted)))
set.seed(43205)
cv_modE <- omas17ne_imp2 %>%
crossv_kfold(k = 10) %>%
mutate(model =
map(train,
~ lm(HWR ~ sex + sroh + smoke_stat +
job_today + alcohol30 +
poverty + age + education +
ins_type,
data = .)))
cv_modE_pred <- cv_modE %>%
unnest(map2(model, test, ~ augment(.x, newdata = .y)))
cv_modE_results <- cv_modE_pred %>%
summarize(Model = "Model E",
RMSE = sqrt(mean((HWR - .fitted)^2)),
MAE = mean(abs(HWR - .fitted)),
MaxError = max(abs(HWR - .fitted)))
bind_rows(cv_modA_results, cv_modB_results,
cv_modC_results, cv_modD_results,
cv_modE_results)
# A tibble: 5 x 4
Model RMSE MAE MaxError
<chr> <dbl> <dbl> <dbl>
1 Model A 0.452 0.352 3.37
2 Model B 0.449 0.349 3.34
3 Model C 0.449 0.349 3.34
4 Model D 0.449 0.349 3.34
5 Model E 0.448 0.348 3.27
spear1 <- spearman2(HWR ~ sex + sroh + smoke_stat +
job_today + alcohol30 +
poverty + age + education +
ins_type,
data = omas17ne, p = 1)
spear1
Spearman rho^2 Response variable:HWR
rho2 F df1 df2 P Adjusted rho2 n
sex 0.047 413.85 1 8444 0.0000 0.047 8446
sroh 0.054 121.62 4 8450 0.0000 0.054 8455
smoke_stat 0.011 48.20 2 8405 0.0000 0.011 8408
job_today 0.000 3.86 1 8296 0.0496 0.000 8298
alcohol30 0.003 28.14 1 8378 0.0000 0.003 8380
poverty 0.003 3.67 6 8448 0.0012 0.002 8455
age 0.015 25.61 5 8449 0.0000 0.014 8455
education 0.005 8.41 5 8449 0.0000 0.004 8455
ins_type 0.003 7.11 3 8451 0.0001 0.002 8455
plot(spear1)
spear2 <- spearman2(HWR ~ sex + sroh + smoke_stat +
job_today + alcohol30 +
poverty + age + education +
ins_type,
data = omas17ne_imp2, p = 1)
spear2
Spearman rho^2 Response variable:HWR
rho2 F df1 df2 P Adjusted rho2 n
sex 0.047 412.64 1 8453 0.0000 0.046 8455
sroh 0.054 121.62 4 8450 0.0000 0.054 8455
smoke_stat 0.011 48.29 2 8452 0.0000 0.011 8455
job_today 0.000 3.52 1 8453 0.0606 0.000 8455
alcohol30 0.003 28.18 1 8453 0.0000 0.003 8455
poverty 0.003 3.67 6 8448 0.0012 0.002 8455
age 0.015 25.61 5 8449 0.0000 0.014 8455
education 0.005 8.41 5 8449 0.0000 0.004 8455
ins_type 0.003 7.11 3 8451 0.0001 0.002 8455
plot(spear2)
spear3 <- spearman2(HWR ~ sex + sroh + smoke_stat +
job_today + alcohol30 +
poverty + age + education +
ins_type,
data = omas17ne_imp2, p = 2)
spear3
Spearman rho^2 Response variable:HWR
rho2 F df1 df2 P Adjusted rho2 n
sex 0.047 412.64 1 8453 0.0000 0.046 8455
sroh 0.054 121.62 4 8450 0.0000 0.054 8455
smoke_stat 0.011 48.29 2 8452 0.0000 0.011 8455
job_today 0.000 3.52 1 8453 0.0606 0.000 8455
alcohol30 0.004 17.34 2 8452 0.0000 0.004 8455
poverty 0.003 3.67 6 8448 0.0012 0.002 8455
age 0.015 25.61 5 8449 0.0000 0.014 8455
education 0.005 8.41 5 8449 0.0000 0.004 8455
ins_type 0.003 7.11 3 8451 0.0001 0.002 8455
plot(spear3)
d <- datadist(omas17ne)
options(datadist = "d")
modelE <- ols(HWR ~ sex + sroh + sex * sroh +
smoke_stat + job_today + alcohol30 +
poverty + age + education + ins_type,
na.action = na.delete,
data = omas17ne, x = TRUE, y = TRUE)
modelE
Frequencies of Missing Values Due to Each Variable
HWR sex sroh smoke_stat job_today alcohol30
0 9 0 47 157 75
poverty age education ins_type
0 0 0 0
Linear Regression Model
ols(formula = HWR ~ sex + sroh + sex * sroh + smoke_stat + job_today +
alcohol30 + poverty + age + education + ins_type, data = omas17ne,
na.action = na.delete, x = TRUE, y = TRUE)
Model Likelihood Discrimination
Ratio Test Indexes
Obs 8219 LR chi2 1335.15 R2 0.150
sigma0.4432 d.f. 32 R2 adj 0.147
d.f. 8186 Pr(> chi2) 0.0000 g 0.209
Residuals
Min 1Q Median 3Q Max
-1.37540 -0.29761 -0.01181 0.26546 3.19772
Coef S.E. t Pr(>|t|)
Intercept 2.3516 0.0355 66.20 <0.0001
sex=F 0.3502 0.0229 15.26 <0.0001
sroh=2_VeryGood -0.1258 0.0210 -6.00 <0.0001
sroh=3_Good -0.1971 0.0221 -8.90 <0.0001
sroh=4_Fair -0.2080 0.0269 -7.74 <0.0001
sroh=5_Poor -0.1103 0.0368 -3.00 0.0027
smoke_stat=Former -0.0370 0.0119 -3.10 0.0019
smoke_stat=Current 0.1451 0.0141 10.33 <0.0001
job_today -0.0474 0.0130 -3.64 0.0003
alcohol30 0.0037 0.0007 5.63 <0.0001
poverty=0.75 - 1.00 FPL -0.0467 0.0233 -2.01 0.0450
poverty=1.00 - 1.38 FPL 0.0195 0.0227 0.86 0.3899
poverty=1.38 - 2.06 FPL 0.0025 0.0222 0.11 0.9105
poverty=2.06 - 2.50 FPL 0.0212 0.0249 0.85 0.3940
poverty=2.50 - 4.00 FPL -0.0164 0.0222 -0.74 0.4609
poverty=4.00 FPL or more 0.0199 0.0217 0.92 0.3597
age=25-34 -0.1242 0.0248 -5.01 <0.0001
age=35-44 -0.2202 0.0249 -8.85 <0.0001
age=45-54 -0.1990 0.0235 -8.47 <0.0001
age=55-64 -0.1766 0.0230 -7.68 <0.0001
age=65+ -0.1198 0.0262 -4.58 <0.0001
education=2_HS Graduate or equivalent -0.0741 0.0226 -3.28 0.0010
education=3_Some College -0.1081 0.0239 -4.53 <0.0001
education=4_Associate Degree -0.1078 0.0251 -4.29 <0.0001
education=5_Four Year College Graduate -0.0681 0.0244 -2.79 0.0052
education=6_Advanced Degree -0.0358 0.0257 -1.39 0.1641
ins_type=Medicare 0.0318 0.0188 1.69 0.0910
ins_type=Employer 0.0208 0.0177 1.17 0.2407
ins_type=Private 0.0574 0.0256 2.24 0.0252
sex=F * sroh=2_VeryGood -0.0896 0.0287 -3.13 0.0018
sex=F * sroh=3_Good -0.1471 0.0295 -4.99 <0.0001
sex=F * sroh=4_Fair -0.2493 0.0341 -7.30 <0.0001
sex=F * sroh=5_Poor -0.2669 0.0466 -5.73 <0.0001
plot(summary(modelE))
plot(nomogram(modelE))
anova(modelE)
Analysis of Variance Response: HWR
Factor d.f. Partial SS MS
sex (Factor+Higher Order Factors) 5 112.562309 22.5124618
All Interactions 4 14.067508 3.5168770
sroh (Factor+Higher Order Factors) 8 103.169688 12.8962110
All Interactions 4 14.067508 3.5168770
smoke_stat 2 30.047958 15.0239791
job_today 1 2.602839 2.6028386
alcohol30 1 6.230003 6.2300028
poverty 6 2.999894 0.4999823
age 5 20.572591 4.1145183
education 5 7.177578 1.4355156
ins_type 3 1.236235 0.4120782
sex * sroh (Factor+Higher Order Factors) 4 14.067508 3.5168770
REGRESSION 32 283.651495 8.8641092
ERROR 8186 1608.134143 0.1964493
F P
114.60 <.0001
17.90 <.0001
65.65 <.0001
17.90 <.0001
76.48 <.0001
13.25 0.0003
31.71 <.0001
2.55 0.0183
20.94 <.0001
7.31 <.0001
2.10 0.0983
17.90 <.0001
45.12 <.0001
validate
do here?The details are found at this 2014-10-04 posting by Jonathan Bartlett at The Stats Geek.
By default, the machine is doing bootstrap validation, where it creates 40 resampled versions of the same size as the original data set, and treats those as the training samples (which are then averaged to get the training
result below) and then comparing the result in the original data set (which produces the test
value.)
validate(modelE)
index.orig training test optimism index.corrected n
R-square 0.1499 0.1540 0.1462 0.0078 0.1421 40
MSE 0.1957 0.1948 0.1965 -0.0018 0.1974 40
g 0.2091 0.2118 0.2067 0.0050 0.2041 40
Intercept 0.0000 0.0000 0.0551 -0.0551 0.0551 40
Slope 1.0000 1.0000 0.9747 0.0253 0.9747 40
which.influence(modelE)
$sroh
[1] 1542 5212
$alcohol30
[1] 5295
$education
[1] 5212
omas17ne %>% slice(5212)
# A tibble: 1 x 28
caseid region county_type avoid_care usual_care ins_employer ins_medicare
<chr> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 04011~ North~ Metro 0 1 1 1
# ... with 21 more variables: ins_medicaid <dbl>, ins_private <dbl>,
# ins_type <fct>, age <fct>, sex <fct>, race_eth <fct>, education <fct>,
# marital <fct>, poverty <fct>, job_today <dbl>, food_12mo <fct>,
# sroh <fct>, mental31 <dbl>, ht_in <dbl>, wt_lb <dbl>, bmi <dbl>,
# smoke_stat <fct>, smoke_100 <dbl>, alcohol30 <dbl>, er_visits <dbl>,
# HWR <dbl>
set.seed(4322019)
imp_model <- aregImpute(
~ HWR + sex + sroh + smoke_stat + job_today +
alcohol30 + poverty + age + education +
ins_type, data = omas17ne,
n.impute = 10, pr = FALSE, x = TRUE)
imp_model
Multiple Imputation using Bootstrap and PMM
aregImpute(formula = ~HWR + sex + sroh + smoke_stat + job_today +
alcohol30 + poverty + age + education + ins_type, data = omas17ne,
n.impute = 10, x = TRUE, pr = FALSE)
n: 8455 p: 10 Imputations: 10 nk: 3
Number of NAs:
HWR sex sroh smoke_stat job_today alcohol30
0 9 0 47 157 75
poverty age education ins_type
0 0 0 0
type d.f.
HWR s 2
sex c 1
sroh c 4
smoke_stat c 2
job_today l 1
alcohol30 s 1
poverty c 6
age c 5
education c 5
ins_type c 3
Transformation of Target Variables Forced to be Linear
R-squares for Predicting Non-Missing Values for Each Variable
Using Last Imputations of Predictors
sex smoke_stat job_today alcohol30
0.122 0.171 0.446 0.081
modelE_imp <- fit.mult.impute(
HWR ~ sex + sroh + sex * sroh + smoke_stat +
job_today + alcohol30 + poverty + age +
education + ins_type,
fitter = ols, xtrans = imp_model, data = omas17ne,
x = TRUE, y = TRUE)
Variance Inflation Factors Due to Imputation:
Intercept
1.00
sex=F
1.00
sroh=2_VeryGood
1.00
sroh=3_Good
1.00
sroh=4_Fair
1.00
sroh=5_Poor
1.01
smoke_stat=Former
1.01
smoke_stat=Current
1.01
job_today
1.08
alcohol30
1.00
poverty=0.75 - 1.00 FPL
1.00
poverty=1.00 - 1.38 FPL
1.00
poverty=1.38 - 2.06 FPL
1.00
poverty=2.06 - 2.50 FPL
1.00
poverty=2.50 - 4.00 FPL
1.00
poverty=4.00 FPL or more
1.00
age=25-34
1.00
age=35-44
1.00
age=45-54
1.00
age=55-64
1.00
age=65+
1.00
education=2_HS Graduate or equivalent
1.00
education=3_Some College
1.00
education=4_Associate Degree
1.00
education=5_Four Year College Graduate
1.00
education=6_Advanced Degree
1.00
ins_type=Medicare
1.00
ins_type=Employer
1.01
ins_type=Private
1.00
sex=F * sroh=2_VeryGood
1.00
sex=F * sroh=3_Good
1.00
sex=F * sroh=4_Fair
1.00
sex=F * sroh=5_Poor
1.01
Rate of Missing Information:
Intercept
0.00
sex=F
0.00
sroh=2_VeryGood
0.00
sroh=3_Good
0.00
sroh=4_Fair
0.00
sroh=5_Poor
0.01
smoke_stat=Former
0.01
smoke_stat=Current
0.01
job_today
0.08
alcohol30
0.00
poverty=0.75 - 1.00 FPL
0.00
poverty=1.00 - 1.38 FPL
0.00
poverty=1.38 - 2.06 FPL
0.00
poverty=2.06 - 2.50 FPL
0.00
poverty=2.50 - 4.00 FPL
0.00
poverty=4.00 FPL or more
0.00
age=25-34
0.00
age=35-44
0.00
age=45-54
0.00
age=55-64
0.00
age=65+
0.00
education=2_HS Graduate or equivalent
0.00
education=3_Some College
0.00
education=4_Associate Degree
0.00
education=5_Four Year College Graduate
0.00
education=6_Advanced Degree
0.00
ins_type=Medicare
0.00
ins_type=Employer
0.01
ins_type=Private
0.00
sex=F * sroh=2_VeryGood
0.00
sex=F * sroh=3_Good
0.00
sex=F * sroh=4_Fair
0.00
sex=F * sroh=5_Poor
0.01
d.f. for t-distribution for Tests of Single Coefficients:
Intercept
8.635654e+05
sex=F
2.717645e+08
sroh=2_VeryGood
3.236438e+08
sroh=3_Good
1.926555e+07
sroh=4_Fair
5.359716e+06
sroh=5_Poor
2.597791e+05
smoke_stat=Former
2.751832e+05
smoke_stat=Current
1.654171e+05
job_today
1.561020e+03
alcohol30
1.177942e+06
poverty=0.75 - 1.00 FPL
1.524378e+07
poverty=1.00 - 1.38 FPL
1.966670e+07
poverty=1.38 - 2.06 FPL
5.750596e+06
poverty=2.06 - 2.50 FPL
7.951045e+06
poverty=2.50 - 4.00 FPL
8.612249e+06
poverty=4.00 FPL or more
5.027044e+06
age=25-34
5.199872e+07
age=35-44
5.959921e+07
age=45-54
2.239895e+07
age=55-64
5.002619e+06
age=65+
1.704738e+06
education=2_HS Graduate or equivalent
8.094864e+07
education=3_Some College
3.524345e+07
education=4_Associate Degree
3.766527e+07
education=5_Four Year College Graduate
2.698610e+07
education=6_Advanced Degree
3.707462e+07
ins_type=Medicare
8.024449e+07
ins_type=Employer
2.343152e+05
ins_type=Private
1.034481e+07
sex=F * sroh=2_VeryGood
1.231234e+09
sex=F * sroh=3_Good
1.090981e+07
sex=F * sroh=4_Fair
1.063890e+08
sex=F * sroh=5_Poor
5.588244e+04
The following fit components were averaged over the 10 model fits:
fitted.values stats linear.predictors
modelE_imp
Linear Regression Model
fit.mult.impute(formula = HWR ~ sex + sroh + sex * sroh + smoke_stat +
job_today + alcohol30 + poverty + age + education + ins_type,
fitter = ols, xtrans = imp_model, data = omas17ne, x = TRUE,
y = TRUE)
Model Likelihood Discrimination
Ratio Test Indexes
Obs 8455 LR chi2 1348.60 R2 0.147
sigma0.4450 d.f. 32 R2 adj 0.144
d.f. 8422 Pr(> chi2) 0.0000 g 0.208
Residuals
Min 1Q Median 3Q Max
-1.45799 -0.29739 -0.01195 0.26471 3.19204
Coef S.E. t Pr(>|t|)
Intercept 2.3490 0.0351 67.01 <0.0001
sex=F 0.3547 0.0228 15.58 <0.0001
sroh=2_VeryGood -0.1183 0.0208 -5.68 <0.0001
sroh=3_Good -0.1888 0.0219 -8.62 <0.0001
sroh=4_Fair -0.1986 0.0265 -7.48 <0.0001
sroh=5_Poor -0.1211 0.0358 -3.38 0.0007
smoke_stat=Former -0.0358 0.0118 -3.02 0.0025
smoke_stat=Current 0.1437 0.0139 10.34 <0.0001
job_today -0.0485 0.0134 -3.62 0.0003
alcohol30 0.0036 0.0007 5.44 <0.0001
poverty=0.75 - 1.00 FPL -0.0419 0.0229 -1.83 0.0669
poverty=1.00 - 1.38 FPL 0.0214 0.0223 0.96 0.3368
poverty=1.38 - 2.06 FPL -0.0035 0.0218 -0.16 0.8731
poverty=2.06 - 2.50 FPL 0.0208 0.0245 0.85 0.3967
poverty=2.50 - 4.00 FPL -0.0160 0.0218 -0.73 0.4651
poverty=4.00 FPL or more 0.0220 0.0213 1.03 0.3024
age=25-34 -0.1252 0.0245 -5.11 <0.0001
age=35-44 -0.2219 0.0245 -9.04 <0.0001
age=45-54 -0.2018 0.0232 -8.70 <0.0001
age=55-64 -0.1809 0.0227 -7.98 <0.0001
age=65+ -0.1235 0.0258 -4.80 <0.0001
education=2_HS Graduate or equivalent -0.0718 0.0224 -3.21 0.0013
education=3_Some College -0.1074 0.0237 -4.54 <0.0001
education=4_Associate Degree -0.1022 0.0249 -4.10 <0.0001
education=5_Four Year College Graduate -0.0666 0.0241 -2.77 0.0057
education=6_Advanced Degree -0.0382 0.0254 -1.50 0.1329
ins_type=Medicare 0.0332 0.0184 1.80 0.0717
ins_type=Employer 0.0197 0.0175 1.12 0.2609
ins_type=Private 0.0627 0.0253 2.48 0.0132
sex=F * sroh=2_VeryGood -0.0936 0.0285 -3.29 0.0010
sex=F * sroh=3_Good -0.1536 0.0292 -5.25 <0.0001
sex=F * sroh=4_Fair -0.2605 0.0338 -7.71 <0.0001
sex=F * sroh=5_Poor -0.2597 0.0458 -5.67 <0.0001
plot(summary(modelE_imp))
validate(modelE_imp)
index.orig training test optimism index.corrected n
R-square 0.1476 0.1514 0.1442 0.0072 0.1404 40
MSE 0.1972 0.1958 0.1980 -0.0022 0.1995 40
g 0.2078 0.2101 0.2056 0.0045 0.2033 40
Intercept 0.0000 0.0000 0.0443 -0.0443 0.0443 40
Slope 1.0000 1.0000 0.9792 0.0208 0.9792 40
plot(nomogram(modelE_imp))
Let’s take a sample of the data, to describe the Northeast Ohio region.
omas17ne2 <- omas17_cleaned %>%
filter(region == "North_East") %>%
filter(!is.na(avoid_care))
nrow(omas17ne2)
[1] 8238
omas17ne2 %>% tabyl(avoid_care)
avoid_care n percent
0 6484 0.7870842
1 1754 0.2129158
We’ll consider 6 predictors:
alcohol30
(quantitative)bmi
(quantitative)mental31
(count - quantitative)smoke_100
(yes/no)sroh
(5 categories)usual_care
(yes/no)omas17ne3 <- omas17ne2 %>%
select(caseid, avoid_care, alcohol30, bmi,
mental31, smoke_100, sroh, usual_care)
omas17ne3
# A tibble: 8,238 x 8
caseid avoid_care alcohol30 bmi mental31 smoke_100 sroh usual_care
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
1 010002~ 1 19 25.0 0 0 1_Exce~ 1
2 010005~ 0 2 28.2 0 1 3_Good 0
3 010006~ 0 0 44.2 0 1 4_Fair 1
4 010009~ 0 0 31.2 0 1 5_Poor 1
5 010011~ 0 15 20.8 4 1 4_Fair 1
6 010011~ 0 0 29.5 0 1 1_Exce~ 1
7 010013~ 0 25 25.2 0 0 1_Exce~ 1
8 010013~ 0 7 29.2 0 0 2_Very~ 1
9 010013~ 0 10 24.4 0 0 1_Exce~ 1
10 010013~ 0 0 22.2 NA 1 4_Fair 0
# ... with 8,228 more rows
miss_var_summary(omas17ne3)
# A tibble: 8 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 mental31 95 1.15
2 usual_care 83 1.01
3 alcohol30 39 0.473
4 bmi 21 0.255
5 smoke_100 20 0.243
6 caseid 0 0
7 avoid_care 0 0
8 sroh 0 0
set.seed(20190307)
omas17ne3_imp1 <- omas17ne3 %>%
impute_pmm(alcohol30 + bmi + smoke_100 ~ sroh) %>%
impute_pmm(usual_care ~ bmi + smoke_100 + sroh) %>%
impute_pmm(mental31 ~ bmi + alcohol30 + sroh)
miss_var_summary(omas17ne3_imp1)
# A tibble: 8 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 caseid 0 0
2 avoid_care 0 0
3 alcohol30 0 0
4 bmi 0 0
5 mental31 0 0
6 smoke_100 0 0
7 sroh 0 0
8 usual_care 0 0
glm
We’ll use the simply imputed data here.
m_1 <- glm(avoid_care ~ alcohol30 + bmi + mental31 +
smoke_100 + sroh + usual_care,
data = omas17ne3_imp1,
family = binomial())
m_1
Call: glm(formula = avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 +
sroh + usual_care, family = binomial(), data = omas17ne3_imp1)
Coefficients:
(Intercept) alcohol30 bmi mental31
-1.753457 0.005833 0.002701 0.043714
smoke_100 sroh2_VeryGood sroh3_Good sroh4_Fair
0.129122 0.390045 0.632515 0.811374
sroh5_Poor usual_care
0.965684 -0.363599
Degrees of Freedom: 8237 Total (i.e. Null); 8228 Residual
Null Deviance: 8531
Residual Deviance: 8194 AIC: 8214
glance(m_1)
# A tibble: 1 x 7
null.deviance df.null logLik AIC BIC deviance df.residual
<dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
1 8531. 8237 -4097. 8214. 8285. 8194. 8228
lrm
d <- datadist(omas17ne3)
options(datadist = "d")
m_1a <- lrm(avoid_care ~ alcohol30 + bmi + mental31 +
smoke_100 + sroh + usual_care,
data = omas17ne3_imp1,
x = TRUE, y = TRUE)
m_1a
Logistic Regression Model
lrm(formula = avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 +
sroh + usual_care, data = omas17ne3_imp1, x = TRUE, y = TRUE)
Model Likelihood Discrimination Rank Discrim.
Ratio Test Indexes Indexes
Obs 8238 LR chi2 336.81 R2 0.062 C 0.636
0 6484 d.f. 9 g 0.484 Dxy 0.272
1 1754 Pr(> chi2) <0.0001 gr 1.623 gamma 0.272
max |deriv| 3e-12 gp 0.083 tau-a 0.091
Brier 0.160
Coef S.E. Wald Z Pr(>|Z|)
Intercept -1.7535 0.1654 -10.60 <0.0001
alcohol30 0.0058 0.0036 1.63 0.1021
bmi 0.0027 0.0041 0.66 0.5107
mental31 0.0437 0.0040 11.03 <0.0001
smoke_100 0.1291 0.0567 2.28 0.0227
sroh=2_VeryGood 0.3900 0.0925 4.22 <0.0001
sroh=3_Good 0.6325 0.0938 6.74 <0.0001
sroh=4_Fair 0.8114 0.1046 7.76 <0.0001
sroh=5_Poor 0.9657 0.1312 7.36 <0.0001
usual_care -0.3636 0.1100 -3.31 0.0009
step
, applied to the glm
fitstep(m_1)
Start: AIC=8214.37
avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 + sroh +
usual_care
Df Deviance AIC
- bmi 1 8194.8 8212.8
<none> 8194.4 8214.4
- alcohol30 1 8197.0 8215.0
- smoke_100 1 8199.6 8217.6
- usual_care 1 8204.8 8222.8
- sroh 4 8280.5 8292.5
- mental31 1 8313.6 8331.6
Step: AIC=8212.8
avoid_care ~ alcohol30 + mental31 + smoke_100 + sroh + usual_care
Df Deviance AIC
<none> 8194.8 8212.8
- alcohol30 1 8197.3 8213.3
- smoke_100 1 8199.9 8215.9
- usual_care 1 8205.1 8221.1
- sroh 4 8288.7 8298.7
- mental31 1 8314.3 8330.3
Call: glm(formula = avoid_care ~ alcohol30 + mental31 + smoke_100 +
sroh + usual_care, family = binomial(), data = omas17ne3_imp1)
Coefficients:
(Intercept) alcohol30 mental31 smoke_100
-1.683776 0.005651 0.043766 0.127628
sroh2_VeryGood sroh3_Good sroh4_Fair sroh5_Poor
0.396263 0.643251 0.825539 0.977939
usual_care
-0.361652
Degrees of Freedom: 8237 Total (i.e. Null); 8229 Residual
Null Deviance: 8531
Residual Deviance: 8195 AIC: 8213
Suggested model includes everything but bmi
.
validate
, applied to the lrm
fitBy default, this uses AIC, but we could instead use p values.
validate(m_1a, bw = TRUE, B = 10)
Backwards Step-down - Original Model
Deleted Chi-Sq d.f. P Residual d.f. P AIC
bmi 0.43 1 0.5107 0.43 1 0.5107 -1.57
alcohol30 2.52 1 0.1122 2.96 2 0.2282 -1.04
Approximate Estimates after Deleting Factors
Coef S.E. Wald Z P
Intercept -1.65322 0.125492 -13.174 0.000e+00
mental31 0.04385 0.003963 11.067 0.000e+00
smoke_100 0.13768 0.056265 2.447 1.441e-02
sroh=2_VeryGood 0.39620 0.091975 4.308 1.649e-05
sroh=3_Good 0.63350 0.092196 6.871 6.368e-12
sroh=4_Fair 0.80887 0.101773 7.948 1.887e-15
sroh=5_Poor 0.95830 0.129290 7.412 1.243e-13
usual_care -0.36416 0.109960 -3.312 9.272e-04
Factors in Final Model
[1] mental31 smoke_100 sroh usual_care
index.orig training test optimism index.corrected n
Dxy 0.2670 0.2766 0.2653 0.0113 0.2557 10
R2 0.0616 0.0642 0.0602 0.0040 0.0575 10
Intercept 0.0000 0.0000 -0.0264 0.0264 -0.0264 10
Slope 1.0000 1.0000 0.9735 0.0265 0.9735 10
Emax 0.0000 0.0000 0.0105 0.0105 0.0105 10
D 0.0404 0.0421 0.0395 0.0027 0.0378 10
U -0.0002 -0.0002 0.0000 -0.0002 0.0000 10
Q 0.0407 0.0424 0.0395 0.0029 0.0378 10
B 0.1603 0.1594 0.1605 -0.0011 0.1614 10
g 0.4793 0.4998 0.4834 0.0163 0.4630 10
gp 0.0825 0.0847 0.0824 0.0024 0.0801 10
Factors Retained in Backwards Elimination
alcohol30 bmi mental31 smoke_100 sroh usual_care
* * * *
* * * * * *
* * * * *
* * * *
* * * * * *
* * * *
* * * * *
* * * *
* * * * *
* * *
Frequencies of Numbers of Factors Retained
3 4 5 6
1 4 3 2
So that’s recommending a model with everything but bmi
and alcohol
.
Let’s consider three models:
Model | Variables |
---|---|
KS | alcohol30 + bmi + mental31 + smoke_100 + sroh + usual_care |
SW1 | alcohol30 + mental31 + smoke_100 + sroh + usual_care |
SW2 | mental31 + smoke_100 + sroh + usual_care |
SM | smoke_100 + sroh |
mod_fit_ks <- glm(avoid_care ~ alcohol30 + bmi + mental31 + smoke_100 + sroh + usual_care, data = omas17ne3_imp1, family = binomial())
mod_fit_sw1 <- glm(avoid_care ~ alcohol30 + mental31 + smoke_100 + sroh + usual_care, data = omas17ne3_imp1, family = binomial())
mod_fit_sw2 <- glm(avoid_care ~ mental31 + smoke_100 + sroh + usual_care, data = omas17ne3_imp1, family = binomial())
mod_fit_sm <- glm(avoid_care ~ smoke_100 + sroh, data = omas17ne3_imp1, family = binomial())
Train <- createDataPartition(omas17ne3_imp1$avoid_care, p=0.7, list=FALSE)
training <- omas17ne3_imp1[ Train, ]
testing <- omas17ne3_imp1[ -Train, ]
ctrl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE)
fit_ks <- train(as.factor(avoid_care) ~
alcohol30 + bmi + mental31 +
smoke_100 + sroh + usual_care,
data = omas17ne3_imp1, method = "glm",
family = "binomial",
trControl = ctrl, tuneLength = 5)
pred_ks <- predict(fit_ks, newdata = testing)
confusionMatrix(data = pred_ks, factor(testing$avoid_care))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1912 483
1 38 38
Accuracy : 0.7892
95% CI : (0.7725, 0.8051)
No Information Rate : 0.7892
P-Value [Acc > NIR] : 0.5117
Kappa : 0.0778
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.98051
Specificity : 0.07294
Pos Pred Value : 0.79833
Neg Pred Value : 0.50000
Prevalence : 0.78915
Detection Rate : 0.77378
Detection Prevalence : 0.96924
Balanced Accuracy : 0.52672
'Positive' Class : 0
ctrl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE)
fit_sw1 <- train(as.factor(avoid_care) ~
alcohol30 + mental31 +
smoke_100 + sroh + usual_care,
data = omas17ne3_imp1, method = "glm",
family = "binomial",
trControl = ctrl, tuneLength = 5)
pred_sw1 <- predict(fit_sw1, newdata = testing)
confusionMatrix(data = pred_sw1, factor(testing$avoid_care))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1912 483
1 38 38
Accuracy : 0.7892
95% CI : (0.7725, 0.8051)
No Information Rate : 0.7892
P-Value [Acc > NIR] : 0.5117
Kappa : 0.0778
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.98051
Specificity : 0.07294
Pos Pred Value : 0.79833
Neg Pred Value : 0.50000
Prevalence : 0.78915
Detection Rate : 0.77378
Detection Prevalence : 0.96924
Balanced Accuracy : 0.52672
'Positive' Class : 0
ctrl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE)
fit_sw2 <- train(as.factor(avoid_care) ~
mental31 + smoke_100 +
sroh + usual_care,
data = omas17ne3_imp1, method = "glm",
family = "binomial",
trControl = ctrl, tuneLength = 5)
pred_sw2 <- predict(fit_sw2, newdata = testing)
confusionMatrix(data = pred_sw2, factor(testing$avoid_care))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1913 483
1 37 38
Accuracy : 0.7896
95% CI : (0.7729, 0.8055)
No Information Rate : 0.7892
P-Value [Acc > NIR] : 0.4921
Kappa : 0.0786
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.98103
Specificity : 0.07294
Pos Pred Value : 0.79841
Neg Pred Value : 0.50667
Prevalence : 0.78915
Detection Rate : 0.77418
Detection Prevalence : 0.96965
Balanced Accuracy : 0.52698
'Positive' Class : 0
ctrl <- trainControl(method = "repeatedcv",
number = 10,
savePredictions = TRUE)
fit_sm <- train(as.factor(avoid_care) ~
smoke_100 + sroh,
data = omas17ne3_imp1, method = "glm",
family = "binomial",
trControl = ctrl, tuneLength = 5)
pred_sm <- predict(fit_sm, newdata = testing)
confusionMatrix(data = pred_sm, factor(testing$avoid_care))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1950 521
1 0 0
Accuracy : 0.7892
95% CI : (0.7725, 0.8051)
No Information Rate : 0.7892
P-Value [Acc > NIR] : 0.5117
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 1.0000
Specificity : 0.0000
Pos Pred Value : 0.7892
Neg Pred Value : NaN
Prevalence : 0.7892
Detection Rate : 0.7892
Detection Prevalence : 1.0000
Balanced Accuracy : 0.5000
'Positive' Class : 0
plot(spearman2(avoid_care ~ alcohol30 + bmi + mental31 +
smoke_100 + sroh + usual_care,
data = omas17ne3))
m_4 <- lrm(avoid_care ~ alcohol30 + bmi + rcs(mental31, 5) +
smoke_100 + sroh + sroh %ia% mental31 + usual_care,
data = omas17ne3_imp1,
x = TRUE, y = TRUE)
m_4
Logistic Regression Model
lrm(formula = avoid_care ~ alcohol30 + bmi + rcs(mental31, 5) +
smoke_100 + sroh + sroh %ia% mental31 + usual_care, data = omas17ne3_imp1,
x = TRUE, y = TRUE)
Model Likelihood Discrimination Rank Discrim.
Ratio Test Indexes Indexes
Obs 8238 LR chi2 423.97 R2 0.078 C 0.648
0 6484 d.f. 16 g 0.558 Dxy 0.295
1 1754 Pr(> chi2) <0.0001 gr 1.747 gamma 0.296
max |deriv| 4e-07 gp 0.097 tau-a 0.099
Brier 0.158
Coef S.E. Wald Z Pr(>|Z|)
Intercept -1.8711 0.1672 -11.19 <0.0001
alcohol30 0.0064 0.0036 1.77 0.0763
bmi 0.0035 0.0041 0.84 0.3985
mental31 0.4218 0.0610 6.91 <0.0001
mental31' -27.7278 8.5309 -3.25 0.0012
mental31'' 37.2586 12.0383 3.10 0.0020
mental31''' -9.6384 3.7448 -2.57 0.0101
smoke_100 0.1240 0.0571 2.17 0.0298
sroh=2_VeryGood 0.3853 0.0956 4.03 <0.0001
sroh=3_Good 0.6101 0.0977 6.24 <0.0001
sroh=4_Fair 0.7900 0.1112 7.11 <0.0001
sroh=5_Poor 0.9942 0.1503 6.61 <0.0001
sroh=2_VeryGood * mental31 -0.0200 0.0228 -0.88 0.3789
sroh=3_Good * mental31 -0.0369 0.0203 -1.82 0.0686
sroh=4_Fair * mental31 -0.0471 0.0198 -2.37 0.0176
sroh=5_Poor * mental31 -0.0493 0.0203 -2.42 0.0155
usual_care -0.3346 0.1114 -3.01 0.0027
plot(summary(m_4))
validate
for model 4validate(m_4)
index.orig training test optimism index.corrected n
Dxy 0.2954 0.3003 0.2908 0.0096 0.2858 40
R2 0.0778 0.0821 0.0752 0.0069 0.0709 40
Intercept 0.0000 0.0000 -0.0596 0.0596 -0.0596 40
Slope 1.0000 1.0000 0.9562 0.0438 0.9562 40
Emax 0.0000 0.0000 0.0208 0.0208 0.0208 40
D 0.0513 0.0544 0.0496 0.0048 0.0466 40
U -0.0002 -0.0002 0.0001 -0.0004 0.0001 40
Q 0.0516 0.0546 0.0494 0.0052 0.0464 40
B 0.1582 0.1581 0.1586 -0.0005 0.1587 40
g 0.5579 0.5716 0.5463 0.0253 0.5326 40
gp 0.0966 0.0990 0.0945 0.0045 0.0921 40
calibrate
do here?plot(calibrate(m_4))
n=8238 Mean absolute error=0.003 Mean squared error=3e-05
0.9 Quantile of absolute error=0.006
set.seed(4322019)
imp_model2 <- aregImpute(
~ avoid_care + alcohol30 + bmi + mental31 +
smoke_100 + sroh + usual_care,
data = omas17ne3,
n.impute = 10, pr = FALSE, x = TRUE)
imp_model2
Multiple Imputation using Bootstrap and PMM
aregImpute(formula = ~avoid_care + alcohol30 + bmi + mental31 +
smoke_100 + sroh + usual_care, data = omas17ne3, n.impute = 10,
x = TRUE, pr = FALSE)
n: 8238 p: 7 Imputations: 10 nk: 3
Number of NAs:
avoid_care alcohol30 bmi mental31 smoke_100 sroh
0 39 21 95 20 0
usual_care
83
type d.f.
avoid_care l 1
alcohol30 s 2
bmi s 2
mental31 s 2
smoke_100 l 1
sroh c 4
usual_care l 1
Transformation of Target Variables Forced to be Linear
R-squares for Predicting Non-Missing Values for Each Variable
Using Last Imputations of Predictors
alcohol30 bmi mental31 smoke_100 usual_care
0.043 0.082 0.137 0.053 0.011
model4_imp <- fit.mult.impute(
avoid_care ~ alcohol30 + bmi + rcs(mental31, 5) +
smoke_100 + sroh + sroh %ia% mental31 +
usual_care,
fitter = lrm, xtrans = imp_model2, data = omas17ne3,
x = TRUE, y = TRUE)
Variance Inflation Factors Due to Imputation:
Intercept alcohol30
1.01 1.00
bmi mental31
1.01 1.01
mental31' mental31''
1.01 1.01
mental31''' smoke_100
1.01 1.00
sroh=2_VeryGood sroh=3_Good
1.00 1.00
sroh=4_Fair sroh=5_Poor
1.00 1.01
sroh=2_VeryGood * mental31 sroh=3_Good * mental31
1.01 1.00
sroh=4_Fair * mental31 sroh=5_Poor * mental31
1.00 1.01
usual_care
1.01
Rate of Missing Information:
Intercept alcohol30
0.01 0.00
bmi mental31
0.01 0.01
mental31' mental31''
0.01 0.01
mental31''' smoke_100
0.01 0.00
sroh=2_VeryGood sroh=3_Good
0.00 0.00
sroh=4_Fair sroh=5_Poor
0.00 0.01
sroh=2_VeryGood * mental31 sroh=3_Good * mental31
0.01 0.00
sroh=4_Fair * mental31 sroh=5_Poor * mental31
0.00 0.01
usual_care
0.01
d.f. for t-distribution for Tests of Single Coefficients:
Intercept alcohol30
267324.77 392881.57
bmi mental31
261108.15 82481.27
mental31' mental31''
168976.74 165854.59
mental31''' smoke_100
141799.38 2757665.91
sroh=2_VeryGood sroh=3_Good
116626888.80 8057346.40
sroh=4_Fair sroh=5_Poor
737600.24 115514.63
sroh=2_VeryGood * mental31 sroh=3_Good * mental31
310629.71 782491.92
sroh=4_Fair * mental31 sroh=5_Poor * mental31
1315928.71 87764.24
usual_care
51974.88
The following fit components were averaged over the 10 model fits:
stats linear.predictors
model4_imp
Logistic Regression Model
fit.mult.impute(formula = avoid_care ~ alcohol30 + bmi + rcs(mental31,
5) + smoke_100 + sroh + sroh %ia% mental31 + usual_care,
fitter = lrm, xtrans = imp_model2, data = omas17ne3, x = TRUE,
y = TRUE)
Model Likelihood Discrimination Rank Discrim.
Ratio Test Indexes Indexes
Obs 8238 LR chi2 435.14 R2 0.080 C 0.649
0 6484 d.f. 16 g 0.563 Dxy 0.298
1 1754 Pr(> chi2) <0.0001 gr 1.756 gamma 0.298
max |deriv| 5e-07 gp 0.098 tau-a 0.100
Brier 0.158
Coef S.E. Wald Z Pr(>|Z|)
Intercept -1.8670 0.1673 -11.16 <0.0001
alcohol30 0.0066 0.0036 1.83 0.0670
bmi 0.0033 0.0041 0.79 0.4279
mental31 0.4430 0.0621 7.13 <0.0001
mental31' -27.9376 8.3836 -3.33 0.0009
mental31'' 37.2457 11.7633 3.17 0.0015
mental31''' -9.3110 3.5687 -2.61 0.0091
smoke_100 0.1283 0.0572 2.24 0.0248
sroh=2_VeryGood 0.3852 0.0957 4.02 <0.0001
sroh=3_Good 0.6147 0.0978 6.28 <0.0001
sroh=4_Fair 0.8003 0.1112 7.19 <0.0001
sroh=5_Poor 1.0189 0.1494 6.82 <0.0001
sroh=2_VeryGood * mental31 -0.0202 0.0227 -0.89 0.3745
sroh=3_Good * mental31 -0.0357 0.0202 -1.77 0.0771
sroh=4_Fair * mental31 -0.0460 0.0198 -2.33 0.0200
sroh=5_Poor * mental31 -0.0483 0.0203 -2.38 0.0172
usual_care -0.3414 0.1116 -3.06 0.0022
plot(summary(model4_imp))
plot(calibrate(model4_imp))
n=8238 Mean absolute error=0.005 Mean squared error=5e-05
0.9 Quantile of absolute error=0.009
validate(model4_imp)
index.orig training test optimism index.corrected n
Dxy 0.2972 0.3020 0.2935 0.0086 0.2886 40
R2 0.0798 0.0835 0.0772 0.0063 0.0735 40
Intercept 0.0000 0.0000 -0.0401 0.0401 -0.0401 40
Slope 1.0000 1.0000 0.9627 0.0373 0.9627 40
Emax 0.0000 0.0000 0.0155 0.0155 0.0155 40
D 0.0527 0.0551 0.0510 0.0042 0.0485 40
U -0.0002 -0.0002 0.0000 -0.0003 0.0000 40
Q 0.0529 0.0554 0.0509 0.0045 0.0485 40
B 0.1579 0.1568 0.1583 -0.0015 0.1594 40
g 0.5631 0.5701 0.5471 0.0230 0.5401 40
gp 0.0976 0.0987 0.0952 0.0035 0.0941 40
plot(nomogram(model4_imp))