# Attach dependencies
rmarkdown::html_dependency_jquery()
rmarkdown::html_dependency_bootstrap("spacelab")
rmarkdown::html_dependency_jqueryui()
brfss_relcodes <- list()
brfss_relcodes[["BPHIGH4"]] <- list(txt = c("Label: Ever Told Blood Pressure High Section Name: Hypertension Awareness Core Section Number: 4 Question Number: 1 Column: 101 Type of Variable: Num SAS Variable Name: BPHIGH4 Question Prologue: Question: Have you EVER been told by a doctor, nurse or other health professional that you have high blood pressure? (If ´Yes´ and respondent is female, ask ´Was this only when you were pregnant?´.)"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 21, area = list(c(top = 209.83522,
left = 36.02906, bottom = 403.03716, right = 359.73829))) %>% as.data.frame() %>%
base::subset(subset = str_detect(.[["X1"]], "\\d")) %>% .[, c(1, 3)])
brfss_relcodes[["TOLDHI2"]] <- list(txt = c("Label: Ever Told Blood Cholesterol High Section Name: Cholesterol Awareness Core Section Number: 5 Question Number: 2 Column: 104 Type of Variable: Num SAS Variable Name: TOLDHI2 Question Prologue: Question: Have you EVER been told by a doctor, nurse or other health professional that your blood cholesterol is high?"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 22, area = list(c(top = 569.36995,
left = 41.14699, bottom = 698.59774, right = 353.34088))) %>% as.data.frame() %>%
base::subset(subset = str_detect(.[["X1"]], "\\d")) %>% .[, c(1, 3)])
brfss_relcodes[["CVDINFR4"]] <- list(txt = c("Label: Ever Diagnosed with Heart Attack Section Name: Chronic Health Conditions Core Section Number: 6 Question Number: 1 Column: 106 Type of Variable: Num SAS Variable Name: CVDINFR4 Question Prologue: Has a doctor, nurse, or other health professional ever told you that you had any of the following? For each, tell me 'Yes','No',or you’re 'Not sure': Question: (Ever told) you had a heart attack, also called a myocardial infarction?"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 23, area = list(c(top = 522.02907915994,
left = 41.146987910269, bottom = 634.62358642973, right = 335.42811245442))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["CVDCRHD4"]] <- list(txt = c("Label: Ever Diagnosed with Angina or Coronary Heart Disease Section Name: Chronic Health Conditions Core Section Number: 6 Question Number: 2 Column: 107 Type of Variable: Num SAS Variable Name: CVDCRHD4 Question Prologue: Question: (Ever told) you had angina or coronary heart disease?"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 24, area = list(c(top = 205.99676898223,
left = 44.985437360845, bottom = 312.19386106624, right = 326.47173040307))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["CVDSTRK3"]] <- list(txt = c("Label: Ever Diagnosed with a Stroke Section Name: Chronic Health Conditions Core Section Number: 6 Question Number: 3 Column: 108 Type of Variable: Num SAS Variable Name: CVDSTRK3 Question Prologue: Question: (Ever told) you had a stroke."),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 24, area = list(c(top = 473.40872374798,
left = 44.985437360845, bottom = 577.04684975767, right = 289.36671904751))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["DIABETE3"]] <- list(txt = c("Label: (Ever told) you have diabetes Section Name: Chronic Health Conditions Core Section Number: 6 Question Number: 12 Column: 117 Type of Variable: Num SAS Variable Name: DIABETE3 Question Prologue: Question: (Ever told) you have diabetes (If ´Yes´ and respondent is female, ask ´Was this only when you were pregnant?´. If Respondent says pre-diabetes or borderline diabetes, use response code 4.)"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 29, area = list(c(top = 220.07108239095,
left = 43.705954210653, bottom = 385.12439418417, right = 354.6203597073))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["SEX"]] <- list(txt = c("Label: Respondents Sex Section Name: Demographics Core Section Number: 8 Question Number: 1 Column: 125 Type of Variable: Num SAS Variable Name: SEX Question Prologue: Question: Indicate sex of respondent."),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 32, area = list(c(top = 207.03157894737,
left = 41.999973010708, bottom = 277.89473684211, right = 354.63158391908))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["SMOKDAY2"]] <- list(txt = c("Label: Frequency of Days Now Smoking Section Name: Tobacco Use Core Section Number: 9 Question Number: 2 Column: 199 Type of Variable: Num SAS Variable Name: SMOKDAY2 Question Prologue: Question: Do you now smoke cigarettes every day, some days, or not at all?"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 43, area = list(c(top = 472.12924071082,
left = 46.264920511037, bottom = 617.99030694669, right = 354.6203597073))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["ECIGNOW"]] <- list(txt = c("Label: Do you now use e-cigarettes, every day, some days, or not at all? Section Name: E-Cigarettes Core Section Number: 10 Question Number: 2 Column: 205 Type of Variable: Num SAS Variable Name: ECIGNOW Question Prologue: Question: Do you now use e-cigarettes or other electronic vaping products every day, some days, or not at all?"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 47, area = list(c(top = 205.99676898223,
left = 47.544403661229, bottom = 351.85783521809, right = 352.06139340691))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["EXERANY2"]] <- list(txt = c("Label: Exercise in Past 30 Days Section Name: Exercise (Physical Activity) Core Section Number: 13 Question Number: 1 Column: 233 Type of Variable: Num SAS Variable Name: EXERANY2 Question Prologue: Question: During the past month, other than your regular job, did you participate in any physical activities or exercises such as running, calisthenics, golf, gardening, or walking for exercise?"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 52, area = list(c(top = 537.38287560582,
left = 41.146987910269, bottom = 643.57996768982, right = 358.45880915787))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["PREDIAB1"]] <- list(txt = c("Label: Ever been told by a doctor or other health professional that you have pre-diabetes or borderline diabetes? Section Name: Pre-Diabetes Module Section Number: 1 Question Number: 2 Column: 291 Type of Variable: Num SAS Variable Name: PREDIAB1 Question Prologue: Question: Have you ever been told by a doctor or other health professional that you have pre-diabetes or borderline diabetes? (If 'Yes' and respondent is female, ask: 'Was this only when you were pregnant?')"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 66, area = list(c(top = 220.07108239095,
left = 46.264920511037, bottom = 385.12439418417, right = 354.6203597073))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.RFCHOL1"]] <- list(txt = c("Label: High Cholesterol Calculated Variable Section Name: Calculated Variables Module Section Number: 5 Question Number: 2 Column: 1952 Type of Variable: Num SAS Variable Name: _RFCHOL1 Question Prologue: Question: Adults who have had their cholesterol checked and have been told by a doctor, nurse, or other health professional that it was high"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 153, area = list(c(top = 509.23424878837,
left = 47.544403661229, bottom = 612.87237479806, right = 355.89984285749))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.MICHD"]] <- list(txt = c("Label: Ever had CHD or MI Section Name: Calculated Variables Module Section Number: 6 Question Number: 1 Column: 1953 Type of Variable: Num SAS Variable Name: _MICHD Question Prologue: Question: Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI)"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 154, area = list(c(top = 207.27625201939,
left = 46.264920511037, bottom = 319.87075928918, right = 349.50242710653))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.AGE80"]] <- list(txt = c("Label: Imputed Age value collapsed above 80 Section Name: Calculated Variables Module Section Number: 8 Question Number: 13 Column: 2031-2032 Type of Variable: Num SAS Variable Name: _AGE80 Question Prologue: Question: Imputed Age value collapsed above 80"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 164, area = list(c(top = 464.45234248788,
left = 41.146987910269, bottom = 724.187399030695, right = 344.38449450576))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.BMI5CAT"]] <- list(txt = c("Label: Computed body mass index categories Section Name: Calculated Variables Module Section Number: 8 Question Number: 19 Column: 2049 Type of Variable: Num SAS Variable Name: _BMI5CAT Question Prologue: Question: Four-categories of Body Mass Index (BMI)"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 167, area = list(c(top = 445.26009693053,
left = 44.985437360845, bottom = 617.99030694669, right = 355.89984285749))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.RFBMI5"]] <- list(txt = c("Label: Overweight or obese calculated variable Section Name: Calculated Variables Module Section Number: 8 Question Number: 20 Column: 2050 Type of Variable: Num SAS Variable Name: _RFBMI5 Question Prologue: Question: Adults who have a body mass index greater than 25.00 (Overweight or Obese)"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 168, area = list(c(top = 203.43780290792,
left = 43.705954210653, bottom = 318.59127625202, right = 354.6203597073))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.SMOKER3"]] <- list(txt = c("Label: Computed Smoking Status Section Name: Calculated Variables Module Section Number: 9 Question Number: 1 Column: 2054 Type of Variable: Num SAS Variable Name: _SMOKER3 Question Prologue: Question: Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 170, area = list(c(top = 202.15831987076,
left = 42.426471060461, bottom = 374.88852988691, right = 353.3408765571))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.RFSMOK3"]] <- list(txt = c("Label: Current Smoking Calculated Variable Section Name: Calculated Variables Module Section Number: 9 Question Number: 2 Column: 2055 Type of Variable: Num SAS Variable Name: _RFSMOK3 Question Prologue: Question: Adults who are current smokers"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 170, area = list(c(top = 529.70597738288,
left = 43.705954210653, bottom = 637.18255250404, right = 350.78191025672))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.CURECIG"]] <- list(txt = c("Label: Current E-cigarette User Calculated Variable Section Name: Calculated Variables Module Section Number: 10 Question Number: 2 Column: 2057 Type of Variable: Num SAS Variable Name: _CURECIG Question Prologue: Question: Adults who are current e-cigarette users"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 171, area = list(c(top = 482.36510500808,
left = 43.705954210653, bottom = 554.01615508885, right = 349.50242710653))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.TOTINDA"]] <- list(txt = c("Label: Leisure Time Physical Activity Calculated Variable Section Name: Calculated Variables Module Section Number: 13 Question Number: 1 Column: 2115 Type of Variable: Num SAS Variable Name: _TOTINDA Question Prologue: Question: Adults who reported doing physical activity or exercise during the past 30 days other than their regular job"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 181, area = list(c(top = 205.99676898223,
left = 44.985437360845, bottom = 309.63489499192, right = 353.3408765571))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["METVL11."]] <- list(txt = c("Label: Activity MET Value for First Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 2 Column: 2116-2118 Type of Variable: Num SAS Variable Name: METVL11_ Question Prologue: Question: Activity MET Value for First Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 181, area = list(c(top = 465.73182552504,
left = 42.426471060461, bottom = 534.8239095315, right = 354.6203597073))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["METVL21."]] <- list(txt = c("Label: Activity MET Value for Second Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 3 Column: 2119-2121 Type of Variable: Num SAS Variable Name: METVL21_ Question Prologue: Question: Activity MET Value for Second Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 182, area = list(c(top = 204.71728594507,
left = 39.867504760077, bottom = 273.80936995153, right = 355.89984285749))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["PADUR1."]] <- list(txt = c("Label: Minutes of First Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 8 Column: 2134-2136 Type of Variable: Num SAS Variable Name: PADUR1_ Question Prologue: Question: Minutes of First Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 184, area = list(c(top = 202.15831987076,
left = 41.146987910269, bottom = 262.29402261712, right = 339.26656190499))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["PADUR2."]] <- list(txt = c("Label: Minutes of Second Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 9 Column: 2137-2139 Type of Variable: Num SAS Variable Name: PADUR2_ Question Prologue: Question: Minutes of Second Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 184, area = list(c(top = 409.43457189015,
left = 41.146987910269, bottom = 469.57027463651, right = 326.47173040307))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["PAFREQ1."]] <- list(txt = c("Label: Physical Activity Frequency per Week for First Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 10 Column: 2140-2144 Type of Variable: Num SAS Variable Name: PAFREQ1_ Question Prologue: Question: Physical Activity Frequency per Week for First Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 184, area = list(c(top = 623.10823909532,
left = 38.588021609885, bottom = 711.392568659128, right = 352.06139340691))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["PAFREQ2."]] <- list(txt = c("Label: Physical Activity Frequency per Week for Second Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 11 Column: 2145-2149 Type of Variable: Num SAS Variable Name: PAFREQ2_ Question Prologue: Question: Physical Activity Frequency per Week for Second Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 185, area = list(c(top = 204.71728594507,
left = 39.867504760077, bottom = 291.72213247173, right = 357.17932600768))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.MINAC11"]] <- list(txt = c("Label: Minutes of Physical Activity per week for First Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 12 Column: 2150-2154 Type of Variable: Num SAS Variable Name: _MINAC11 Question Prologue: Question: Minutes of Physical Activity per week for First Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 185, area = list(c(top = 450.37802907916,
left = 41.146987910269, bottom = 516.91114701131, right = 320.07431465211))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
brfss_relcodes[["X.MINAC21"]] <- list(txt = c("Label: Minutes of Physical Activity per week for Second Activity Section Name: Calculated Variables Module Section Number: 13 Question Number: 13 Column: 2155-2159 Type of Variable: Num SAS Variable Name: _MINAC21 Question Prologue: Question: Minutes of Physical Activity per week for Second Activity"),
lbls = tabulizer::extract_tables("BRFSS\\BRFSS_Codebook.pdf", pages = 186, area = list(c(top = 207.27625201939,
left = 41.146987910269, bottom = 276.36833602585, right = 299.60258424904))) %>%
as.data.frame() %>% base::subset(subset = str_detect(.[["X1"]], "\\d")) %>%
.[, c(1, 3)])
# ACTIN11_ ACTIN21_ STRFREQ_ PAMIN11_ PAMIN21_ PA1MIN_ PAVIG11_ PAVIG21_ PA1VIGM_
# _PACAT1 _PAINDX1 _PA150R2 _PA300R2 _PA30021 _PASTRNG _PAREC1 _PASTAE1
a <- tabulizer::locate_areas("BRFSS\\BRFSS_Codebook.pdf", pages = 32)
brfss_relcodes %<>% lapply(function(l) {
lt <- l[["txt"]] %>% unlist %>% strsplit("(?<=.)(?=Question Number)|(?<=.)(?=Column)|(?<=.)(?=Section Name)|(?<=.)(?=Core Section Number)|(?<=.)(?=Type of Variable)|(?<=.)(?=SAS Variable Name)|(?<=.)(?=Question Prologue)|(?<=.)(?=Question Prologue)",
perl = T) %>% unlist
names(lt) <- sapply(lt, str_extract, "[A-Z\\sa-z0-9]+(?=\\:)") %>% as.vector
lt %<>% sapply(function(.) gsub("$[A-Z\\sa-z0-9]+.?\\:", "", .))
l[["txt"]] <- lt
return(l)
})
a %>% paste
gc()
rJava::.jcall("java/lang/System", method = "gc")
save(brfss_relcodes, file = "brfss_relcodes.Rdata")
load(file = "brfss_relcodes.Rdata")
# load(file = 'AllData.Rdata') BRFSS Data
# https://www.cdc.gov/brfss/annual_data/annual_2017.html
BRFSS <- SASxport::read.xport("C:\\Users\\Administrator\\Documents\\Northeastern\\Fall 2018\\HINF6400\\Incentivized Insurance\\BRFSS\\LLCP2017.XPT")
BRFSS %<>% lapply(unclass) %>% as.data.frame
The BRFSS is used to compute values for risk of CVD associated with various risk factors which was then used to create a decision tree useful for determining premium rates for potential Incentivized Insurance clients fitting into certain categories. Using real world data made this analysis quite complex.
It also doesn’t appear that the type of “decision tree” gone over briefly in class is documented online. All I could find after extensive searching is about using decision trees for classification purposes in R. Thus, I’ve done my best to develop a decision tree with utility in the real world rather than making up values.
The BRFSS Variables used in this analysis are described below.
# ----------------------- Fri Oct 19 17:21:19 2018 ------------------------# Risk
# Factors
RF <- brfss_relcodes %>% names %>% .[c(1, 2, 6, 8, 9, 10, 16, 17)]
RF.BRFSS <- BRFSS %>% select(starts_with("CVD"), -ends_with("ASPRN"), names(RF))
brfss_relcodes[RF]
## $BPHIGH4
## $BPHIGH4$txt
## Label
## "Label: Ever Told Blood Pressure High "
## Section Name
## "Section Name: Hypertension Awareness "
## Core Section Number
## "Core Section Number: 4 "
## Question Number
## "Question Number: 1 "
## Column
## "Column: 101 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: BPHIGH4 "
## Question Prologue
## "Question Prologue: Question: Have you EVER been told by a doctor, nurse or other health professional that you have high blood pressure? (If ´Yes´ and respondent is female, ask ´Was this only when you were pregnant?´.)"
##
## $BPHIGH4$lbls
## X1 X3
## 3 1 es
## 4 2 es, but female told only during pregnancyâ\200”Go to
## 6 3 oâ\200”Go to Section 05.01 CHOLCHK1
## 7 4 old borderline high or pre-hypertensiveâ\200”Go to
## 9 7 on Ì\201t know/Not Sureâ\200”Go to Section 05.01 CHOLCHK1
## 10 9 efusedâ\200”Go to Section 05.01 CHOLCHK1
##
##
## $TOLDHI2
## $TOLDHI2$txt
## Label
## "Label: Ever Told Blood Cholesterol High "
## Section Name
## "Section Name: Cholesterol Awareness Core "
## Section Number
## "Section Number: 5 "
## Question Number
## "Question Number: 2 "
## Column
## "Column: 104 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: TOLDHI2 "
## Question Prologue
## "Question Prologue: Question: Have you EVER been told by a doctor, nurse or other health professional that your blood cholesterol is high?"
##
## $TOLDHI2$lbls
## X1
## 2 1
## 3 2
## 4 7
## 5 9
## 7 otes: Section 05.01, CHOLCHK1, is coded 1, 9
## X3
## 2 es
## 3 oâ\200”Go to next section
## 4 onâ\200\231t know/Not Sureâ\200”Go to next section
## 5 efusedâ\200”Go to next section
## 7
##
##
## $DIABETE3
## $DIABETE3$txt
## Label
## "Label: (Ever told) you have diabetes "
## Section Name
## "Section Name: Chronic Health Conditions Core "
## Section Number
## "Section Number: 6 "
## Question Number
## "Question Number: 12 "
## Column
## "Column: 117 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: DIABETE3 "
## Question Prologue
## "Question Prologue: Question: (Ever told) you have diabetes (If ´Yes´ and respondent is female, ask ´Was this only when you were pregnant?´. If Respondent says pre-diabetes or borderline diabetes, use response code 4.)"
##
## $DIABETE3$lbls
## X1 X3
## 2 1 es
## 3 2 es, but female told only during pregnancyâ\200”Go to
## 5 3 oâ\200”Go to Section 07.01 LMTJOIN3
## 6 4 o, pre-diabetes or borderline diabetesâ\200”Go to
## 8 7 onâ\200\231t know/Not Sureâ\200”Go to Section 07.01 LMTJOIN3
## 9 9 efusedâ\200”Go to Section 07.01 LMTJOIN3
##
##
## $SMOKDAY2
## $SMOKDAY2$txt
## Label
## "Label: Frequency of Days Now Smoking "
## Section Name
## "Section Name: Tobacco Use Core "
## Section Number
## "Section Number: 9 "
## Question Number
## "Question Number: 2 "
## Column
## "Column: 199 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: SMOKDAY2 "
## Question Prologue
## "Question Prologue: Question: Do you now smoke cigarettes every day, some days, or not at all?"
##
## $SMOKDAY2$lbls
## X1
## 2 1
## 3 2
## 4 3
## 5 7
## 6 9
## 8 otes: Section 09.01, SMOKE100, is coded 2, 7,
## X3
## 2 very day
## 3 ome days
## 4 ot at allâ\200”Go to Section 09.04 LASTSMK2
## 5 on Ì\201t Know/Not Sureâ\200”Go to Section 09.05 USENOW3
## 6 efusedâ\200”Go to Section 09.05 USENOW3
## 8
##
##
## $ECIGNOW
## $ECIGNOW$txt
## Label
## "Label: Do you now use e-cigarettes, every day, some days, or not at all? "
## Section Name
## "Section Name: E-Cigarettes Core "
## Section Number
## "Section Number: 10 "
## Question Number
## "Question Number: 2 "
## Column
## "Column: 205 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: ECIGNOW "
## Question Prologue
## "Question Prologue: Question: Do you now use e-cigarettes or other electronic vaping products every day, some days, or not at all?"
##
## $ECIGNOW$lbls
## X1 X3
## 2 1 very day
## 3 2 ome days
## 4 3 ot at all
## 5 7 onâ\200\231t know / Not sure
## 6 9 efused
## 8 otes: Section 10.01, ECIGARET, is coded 2, 7
##
##
## $EXERANY2
## $EXERANY2$txt
## Label
## "Label: Exercise in Past 30 Days "
## Section Name
## "Section Name: Exercise (Physical Activity) Core "
## Section Number
## "Section Number: 13 "
## Question Number
## "Question Number: 1 "
## Column
## "Column: 233 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: EXERANY2 "
## Question Prologue
## "Question Prologue: Question: During the past month, other than your regular job, did you participate in any physical activities or exercises such as running, calisthenics, golf, gardening, or walking for exercise?"
##
## $EXERANY2$lbls
## X1 X3
## 2 1 es
## 3 2 oâ\200”Go to Section 13.08 STRENGTH
## 4 7 onâ\200\231t know/Not Sureâ\200”Go to Section 13.08 STRENGTH
## 5 9 efusedâ\200”Go to Section 13.08 STRENGTH
##
##
## $X.RFBMI5
## $X.RFBMI5$txt
## Label
## "Label: Overweight or obese calculated variable "
## Section Name
## "Section Name: Calculated Variables Module "
## Section Number
## "Section Number: 8 "
## Question Number
## "Question Number: 20 "
## Column
## "Column: 2050 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: _RFBMI5 "
## Question Prologue
## "Question Prologue: Question: Adults who have a body mass index greater than 25.00 (Overweight or Obese)"
##
## $X.RFBMI5$lbls
## X1 X3
## 3 1 o
## 6 2 es
## 8 9
##
##
## $X.SMOKER3
## $X.SMOKER3$txt
## Label
## "Label: Computed Smoking Status "
## Section Name
## "Section Name: Calculated Variables Module "
## Section Number
## "Section Number: 9 "
## Question Number
## "Question Number: 1 "
## Column
## "Column: 2054 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: _SMOKER3 "
## Question Prologue
## "Question Prologue: Question: Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker"
##
## $X.SMOKER3$lbls
## X1
## 3 1
## 5 2
## 7 3
## 9 4
## 11 9
## 12 otes: SMOKE100 = 1 and SMOKEDAY = 9 or SMOKE10
## X3
## 3 urrent smoker - now smokes every day
## 5 urrent smoker - now smokes some days
## 7 ormer smoker
## 9 ever smoked
## 11
## 12
RFv <- c(1, 1, 2, 2, 2, 1, 2, 2)
names(RFv) <- names(RF)
RF <- RFv
# ----------------------- Fri Oct 19 17:42:28 2018 ------------------------# CVD
# Variables
CVD.vars <- brfss_relcodes %>% names %>% .[str_detect(brfss_relcodes, "CVD")]
brfss_relcodes[CVD.vars]
## $CVDINFR4
## $CVDINFR4$txt
## Label
## "Label: Ever Diagnosed with Heart Attack "
## Section Name
## "Section Name: Chronic Health Conditions Core "
## Section Number
## "Section Number: 6 "
## Question Number
## "Question Number: 1 "
## Column
## "Column: 106 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: CVDINFR4 "
## Question Prologue
## "Question Prologue: Has a doctor, nurse, or other health professional ever told you that you had any of the following? For each, tell me 'Yes','No',or you’re 'Not sure': Question: (Ever told) you had a heart attack, also called a myocardial infarction?"
##
## $CVDINFR4$lbls
## X1 X3
## 3 1 es
## 4 2 o
## 5 7 onâ\200\231t know/Not sure
## 6 9 efused
##
##
## $CVDCRHD4
## $CVDCRHD4$txt
## Label
## "Label: Ever Diagnosed with Angina or Coronary Heart Disease "
## Section Name
## "Section Name: Chronic Health Conditions Core "
## Section Number
## "Section Number: 6 "
## Question Number
## "Question Number: 2 "
## Column
## "Column: 107 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: CVDCRHD4 "
## Question Prologue
## "Question Prologue: Question: (Ever told) you had angina or coronary heart disease?"
##
## $CVDCRHD4$lbls
## X1 X3
## 2 1 es
## 3 2 o
## 4 7 onâ\200\231t know/Not sure
## 5 9 efused
##
##
## $CVDSTRK3
## $CVDSTRK3$txt
## Label
## "Label: Ever Diagnosed with a Stroke "
## Section Name
## "Section Name: Chronic Health Conditions Core "
## Section Number
## "Section Number: 6 "
## Question Number
## "Question Number: 3 "
## Column
## "Column: 108 "
## Type of Variable
## "Type of Variable: Num "
## SAS Variable Name
## "SAS Variable Name: CVDSTRK3 "
## Question Prologue
## "Question Prologue: Question: (Ever told) you had a stroke."
##
## $CVDSTRK3$lbls
## X1 X3
## 2 1 es
## 3 2 o
## 4 7 onâ\200\231t know/Not sure
## 5 9 efused
CVDv <- c(1, 1, 1)
names(CVDv) <- CVD.vars
CVD.vars <- CVDv
# Risk Factors
RF.BRFSS[, names(RF)] %<>% purrr::map2(.x = ., .y = RF, .f = function(.x, .y) {
purrr::map_lgl(.x = .x, .y = .y, .f = function(.x, .y) {
.x == .y
})
})
# CVD Factors
RF.BRFSS[, names(CVD.vars)] %<>% purrr::map2(.x = ., .y = CVD.vars, .f = function(.x,
.y) {
purrr::map_lgl(.x = .x, .y = .y, .f = function(.x, .y) {
.x == .y
})
})
# Impute NA
RF.BRFSS_imp <- RF.BRFSS %>% Amelia::amelia(m = 1)
## -- Imputation 1 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 21 22 23 24 25 26 27 28
RF.BRFSS_imp[["imputations"]][["imp1"]]$ALLCVD <- Reduce("|", RF.BRFSS_imp[["imputations"]][["imp1"]][,
names(CVD.vars)])
# ----------------------- Fri Oct 19 18:18:15 2018 ------------------------#
# Correlations between CVD And Risk FActors
(RF.cor <- lapply(RF.BRFSS_imp[["imputations"]][["imp1"]][, names(RF)], CVD = RF.BRFSS_imp[["imputations"]][["imp1"]][["ALLCVD"]],
function(clm, CVD) {
(Reduce("&", list(CVD, clm)) %>% sum)/length(CVD)
}))
## $BPHIGH4
## [1] 0.08336995
##
## $TOLDHI2
## [1] 0.07242556
##
## $DIABETE3
## [1] 0.0004859733
##
## $SMOKDAY2
## [1] 0.005687441
##
## $ECIGNOW
## [1] 0.01488321
##
## $EXERANY2
## [1] 0.06879519
##
## $X.RFBMI5
## [1] 0.0797307
##
## $X.SMOKER3
## [1] 0.005687441
Risk stratified by Sex
RF.BRFSS_imp[["imputations"]][["imp1"]]$SEX <- BRFSS$SEX
RF.BRFSS_imp[["imputations"]][["imp1"]]$SEX %<>% factor(labels = c(`1` = "M", `2` = "F",
`9` = "9"))
Sex.cor <- with(RF.BRFSS_imp[["imputations"]][["imp1"]], tapply(ALLCVD, SEX, function(x) {
sum(x)/length(x)
}))
# Add Variable
RF.BRFSS_imp[["imputations"]][["imp1"]]$X.AGE80 <- BRFSS$X.AGE80
# Create Bins
RF.BRFSS_imp[["imputations"]][["imp1"]]$X.AGE80_bin <- RF.BRFSS_imp[["imputations"]][["imp1"]]$X.AGE80 %>%
cut(breaks = c(brfss_relcodes$X.AGE80$lbls[, 1] %>% str_extract("\\d{2}") %>%
as.numeric, 99), labels = brfss_relcodes$X.AGE80$lbls[, 1, drop = T], right = FALSE)
# Calculate risk rates
Age.cor <- tapply(RF.BRFSS_imp[["imputations"]][["imp1"]][["ALLCVD"]], RF.BRFSS_imp[["imputations"]][["imp1"]]$X.AGE80_bin,
function(x) {
sum(x)/length(x)
})
Risk stratified by BMI
RF.BRFSS_imp[["imputations"]][["imp1"]]$X.BMI5CAT <- BRFSS$X.BMI5CAT
# Impute and round
RF.BRFSS_imp[["imputations"]][["imp1"]]$X.BMI5CAT <- RF.BRFSS_imp[["imputations"]][["imp1"]] %>%
select(-"X.AGE80_bin", -"SEX") %>% Amelia::amelia(m = 1) %>% purrr::pluck("imputations",
"imp1") %>% .[["X.BMI5CAT"]] %>% round %>% sapply(function(x) {
ifelse(x == 0, 1, x)
})
## -- Imputation 1 --
##
## 1 2 3 4 5 6 7
# Extract Factor levels
bmicats <- c(`1` = "Underweight", `2` = "Normal Weight", `3` = "Overweight", `4` = "Obese")
# Create Factor
RF.BRFSS_imp[["imputations"]][["imp1"]]$X.BMI5CAT %<>% factor(labels = bmicats)
# Calculate Risk
BMI.cor <- tapply(RF.BRFSS_imp[["imputations"]][["imp1"]][["ALLCVD"]], RF.BRFSS_imp[["imputations"]][["imp1"]]$X.BMI5CAT,
function(x) {
sum(x)/length(x)
})
Dose response curves for the kcal/kg burned per week were reverse engineered from plots in the literature. These curves were fitted with a Loess smoothing function to produce a model that takes kcal/kg burned per week as nput and outputs a ratio of risk reduction used to determine premium discounts. This fits as a regression node to reduce the premium at the last level of the tree.
DRMen <- read.csv("DoseResponseMen.csv", header = FALSE, col.names = c("KCal.Wk",
"Risk.Red"))
DRWomen <- read.csv("DoseResponseWomen.csv", header = FALSE, col.names = c("KCal.Wk",
"Risk.Red"))
DRMen$Risk.Red %<>% {
1 - .
}
DRWomen$Risk.Red %<>% {
1 - .
}
DRPlots <- list(Men = DRMen %>% ggplot(data = ., mapping = aes(x = KCal.Wk, y = Risk.Red)) +
geom_line() + geom_smooth(method = "loess") + labs(title = "Risk Reduction v KCal/Wk",
subtitle = "Men", caption = "", x = "KCal.Wk", y = "Risk Reduction") + theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)), Women = DRWomen %>% ggplot(data = .,
mapping = aes(x = KCal.Wk, y = Risk.Red)) + geom_line() + geom_smooth(method = "loess") +
labs(title = "Risk Reduction v KCal/Wk", subtitle = "Women", caption = "", x = "KCal.Wk",
y = "Risk Reduction") + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)))
gridExtra::grid.arrange(grobs = DRPlots)
DRMen.lm <- loess(Risk.Red ~ KCal.Wk, data = DRMen)
DRWomen.lm <- loess(Risk.Red ~ KCal.Wk, data = DRWomen)
DRMen.lm %>% predict(newdata = c(200, 400, 600))
## [1] 0.03635028 0.07325860 0.10765406
Regmodels <- list(Men = DRMen.lm, Women = DRWomen.lm)
save(Regmodels, file = "Regression.Rdata")
A table was created with all combinations of each factor from which risk is calculated with associated premium increases.
CVDPop <- read.csv("CVDPop.csv")
dec_tree <- CVDPop %>% select(ST, County, Has.Cardiovascular.Disease, Cost.p.Cap,
CR.14_pp) %>% mutate(Base = Cost.p.Cap - CR.14_pp) %>% select(ST, County, Has.Cardiovascular.Disease,
Base) %>% unite("Location", ST, County, sep = ", ") %>% cbind(from = rep("Individual",
nrow(.)), .)
dec_tree <- dec_tree[dec_tree %>% duplicated %>% not, ]
dec_tree %<>% rename(CVDR = Has.Cardiovascular.Disease, to = Location)
# ----------------------- Mon Oct 22 12:53:20 2018 ------------------------# Add
# sex
dt.sex <- expand.grid(from = dec_tree$to[1:819], to = c("M", "F"))
dt.sex <- cbind(dt.sex, CVDR = dt.sex$to %>% sapply(S = Sex.cor, function(x, S) {
S[[x]]
}), Base = NA)
dt.sex$Base <- purrr::map2(.x = dt.sex$CVDR, .y = dt.sex$from, function(.x, .y, df,
vnref, vnmult) {
out <- ((1 + .x) * df[[vnmult]][str_detect(fixed(as.character(.y)), df[[vnref]]) %>%
which %>% .[1]])
return(round(out, 2))
}, df = dec_tree, vnref = "to", vnmult = "Base") %>% unlist
# ----------------------- Mon Oct 22 16:53:35 2018 ------------------------# Add
# Risk factors
Labels <- names(RF.cor)
names(Labels) <- brfss_relcodes[names(RF.cor)] %>% lapply(`[`, "txt") %>% lapply(purrr::pluck,
1, "Label")
dt.cor <- expand.grid(from = c("M", "F"), to = names(RF.cor) %>% factor(labels = Labels))
dt.cor$CVDR <- dt.cor$to %>% sapply(S = RF.cor, function(x, S) {
S[[x]]
})
# ----------------------- Mon Oct 22 20:28:16 2018 ------------------------# Add
# Age
dt.age <- expand.grid(from = names(RF.cor), to = names(Age.cor))
dt.age$CVDR <- dt.age$to %>% sapply(S = Age.cor, function(x, S) {
S[[x]]
})
# ----------------------- Mon Oct 22 20:28:30 2018 ------------------------# Add
# BMI
dt.bmi <- expand.grid(from = names(Age.cor), to = names(BMI.cor))
dt.bmi$CVDR <- dt.bmi$to %>% sapply(S = BMI.cor, function(x, S) {
S[[x]]
})
# Total Dec Tree
dec_tree.final <- left_join(dec_tree, dt.sex, by = c(to = "from"), suffix = c(".cty",
".sex")) %>% left_join(dt.cor, by = c(to.sex = "from"), suffix = c(".sex", ".RF")) %>%
rename(CVDR.RF = CVDR) %>% mutate(Base.RF = (1 + CVDR.RF) * Base.sex) %>% left_join(dt.age,
by = c(to.RF = "from"), suffix = c(".RF", ".age")) %>% rename(CVDR.age = CVDR,
to.age = to) %>% mutate(Base.age = (1 + CVDR.age) * Base.RF)
names(dec_tree.final) <- c("Begin", "Location", "CVDR.cty", "Base.cty", "Sex", "CVDR.sex",
"Base.sex", "RF", "CVDR.RF", "Base.RF", "Age", "CVDR.age", "Base.age")
dec_tree.final
Decision trees are constructed to calculate risk associated with specific variables. As is evident in the tree plots, the complexity of real world variables makes computation of risk very resource intensive.
# ----------------------- Tue Oct 23 08:52:29 2018 ------------------------#
# Decision tree #2 RF
(form <- paste("ALLCVD~", paste(names(RF.BRFSS_imp[["imputations"]][["imp1"]])[c(4:11,
13, 15, 16)], collapse = "+")))
## [1] "ALLCVD~ BPHIGH4+TOLDHI2+DIABETE3+SMOKDAY2+ECIGNOW+EXERANY2+X.RFBMI5+X.SMOKER3+SEX+X.AGE80_bin+X.BMI5CAT"
RF.rplot <- RF.BRFSS_imp[["imputations"]][["imp1"]][, c(4:13, 15, 16)] %>% rpart::rpart(form,
method = "class", data = ., control = list(minsplit = 3, cp = 0.00001, parms = list(loss = matrix(c(0,
2, 0, 0), byrow = TRUE, nrow = 2))))
RF.prp <- RF.rplot %>% rpart.plot::prp(space = 0, extra = 106, yesno = 2, varlen = -1,
faclen = 3, tweak = 3, snip = T, fallen.leaves = F, split.border.col = 1)
# ----------------------- Tue Oct 23 09:30:59 2018 ------------------------# 3
# var Dtree
names(RF.BRFSS_imp[["imputations"]][["imp1"]])[15] <- "X.AGE80.bin"
(form <- paste("ALLCVD~", paste(names(RF.BRFSS_imp[["imputations"]][["imp1"]])[c(6,
13, 15)], collapse = "+")))
## [1] "ALLCVD~ DIABETE3+SEX+X.AGE80.bin"
RF.rplot <- RF.BRFSS_imp[["imputations"]][["imp1"]][, c(4:13, 15, 16)] %>% rpart::rpart(form,
data = ., control = list(minsplit = 3, cp = 0.00000001, parms = list(loss = matrix(c(0,
2, 0, 0), byrow = TRUE, nrow = 2))))
RF.prp <- RF.rplot %>% rpart.plot::prp(space = 1, varlen = -1, faclen = 3, tweak = 2,
fallen.leaves = F, xflip = 1, extra = 100)
# ----------------------- Tue Oct 23 09:53:51 2018 ------------------------# 2
# Var Dtree
(form <- paste("ALLCVD~", paste(names(RF.BRFSS_imp[["imputations"]][["imp1"]])[c(6,
13)], collapse = "+"), sep = "") %>% as.formula)
## ALLCVD ~ DIABETE3 + SEX
## <environment: 0x0000000009dade28>
class.RF <- RF.BRFSS_imp[["imputations"]][["imp1"]]
class.RF %<>% lapply(as.factor)
RF.rplot <- class.RF %>% rpart::rpart(form, method = "class", data = ., control = list(cp = 0.00001,
minsplit = 1, minbucket = 1))
RF.prp <- RF.rplot %>% rpart.plot::prp(space = 1, varlen = -1, faclen = 3, tweak = 2,
fallen.leaves = F, xflip = 1, extra = 100)
# png('image.png', width = 800, height = 600) RF.prp <- RF.rplot %>%
# rpart.plot::prp(space = 1,varlen = -1,faclen = 3,tweak = 2,
# fallen.leaves=F,xflip = 1,extra=100) rpart.plot::rpart.rules(RF.rplot, cover =
# TRUE) %>% .[[1]] %>% as.numeric %>% sum dev.off() Calculate Probs 1 -
# RF.rplot[['frame']][['yval2']][1] # No Val S = M,9 1 -
# RF.rplot[['frame']][['yval2']][2] # No Val X >= .5 RHS
# RF.rplot[['frame']][['yval2']][3]+RF.rplot[['frame']][['yval2']][4] # Total
# freq in two leaves on RHS
# RF.rplot[['frame']][['n']][3]/(RF.rplot[['frame']][['n']][3]+RF.rplot[['frame']][['n']][4])
# RF.rplot[['frame']][['n']][4]/(RF.rplot[['frame']][['n']][3]+RF.rplot[['frame']][['n']][4])
# Full Dtree
form <- as.formula(paste("Base.age", "~", names(dec_tree.final)[c(5, 8, 11)] %>%
paste(collapse = "+")))
dec_tree.rpart <- rpart::rpart(form, data = dec_tree.final, control = list(minsplit = 3,
cp = 0.0001))
dec_tree.rpart %>% rpart.plot::prp(space = 0, extra = 100, fallen.leaves = F, varlen = -1,
faclen = 3, tweak = 1.5, cex = )
A simple version of the 2 variable tree with actual probabilities and made up utilities is below
Rolled back one node is here:
Rolled Back