library(haven)
asthma.adult <- read_sas("/home/alain/Documents/DATA621/FinalProject/acbs_2016_adult_public_llcp.sas7bdat")
View(asthma.adult)
#write.csv(asthma.adult, "asthma_adult.csv")
#cn <- colnames(asthma.adult)
#write.csv(cn, "asthma_column_name.csv")
ASTHNOW Have you ever been told by a doctor or other health professional that you have asthma?
TCH_SIGN Has a doctor or other health professional ever taught you… a. How to recognize early signs or symptoms of an asthma episode?
TCH_RESP Has a doctor or other health professional ever taught you… b. What to do during an asthma episode or attack?
TCH_MON A peak flow meter is a hand held device that measures how quickly you can blow air out of your lungs. Has a doctor or other health professional ever taught you… c. How to use a peak flow meter to adjust your daily medications?
MGT_PLAN An asthma action plan, or asthma management plan, is a form with instructions about when to change the amount or type of medicine, when to call the doctorfor advice, and when to go to the emergency room. Has a doctor or other health professional EVER given you an asthma action plan?
MOD_ENV (7.13) INTERVIEWER READ: Now, back to questions specifically about you. Has a health professional ever advised you to change things in your home, school, or work to improve your asthma
MGT_CLAS Have you ever taken a course or class on how to manage your asthma?
INHALERH (8.3) Did a doctor or other health professional show you how to use the inhaler?
INHALERW (8.4) Did a doctor or other health professional watch you use the inhaler?
Responses types (1) YES (2) NO (7) DON’T KNOW (9) REFUSED
Predictors
MISS_DAY = “NUMBER OF MISSED DAYS” MOD_ENV = “EVER ADVISED CHANGE THINGS IN YOUR HOME” AGEDX = “AGE AT ASTHMA DIAGNOSIS” AGEG_F6_M = “MODIFIED SIX AGE GROUPS USED IN ASTHMA ADULT POST-STRATIFICATION” AIRCLEANER = “AIR CLEANER USED” ASMDCOST = “COST BARRIER: PRIMARY CARE DOCTOR” ASRXCOST = “COST BARRIER: MEDICATION” ASSPCOST = “COST BARRIER: SPECIALIST” CATTMPTS_F = “DISPOSITION CODES FOR CALL ATTEMPTS 1 THROUGH 20 …” EMP_STAT = “CURRENT EMPLOYMENT STATUS” EPIS_12M = “ASTHMA EPISODE OR ATTACK” EPIS_TP = “NUMBER OF EPISODES / ATTACKS” ER_TIMES = “NUMBER OF EMERGENCY ROOM VISITS” ER_VISIT = “EMERGENCY ROOM VISIT” EVER_ASTH = “EVER HAVE ASTHMA INCONSISTENT WITH BRFSS” #FLAG1 = “FLAG FOR AGE, AGEDX, INCIDNT INCONSISTENCY” HOSPPLAN = “HOSPITAL FOLLOW-UP” HOSPTIME = “NUMBER OF HOSPITAL VISITS” HOSP_VST = “HOSPITAL VISIT” QSTLANG_F = “LANGUAGE IDENTIFIER” SCR_MED3 = “HAVE ALL THE MEDICATIONS” #SEQNO = “ANNUAL SEQUENCE NUMBER” UNEMP_R = “REASON NOT NOW EMPLOYED” URG_TIME = “NUMBER OF URGENT VISITS” WORKENV5 = “ASTHMA AGGRAVATED BY CURRENT JOB” WORKENV6 = “ASTHMA CAUSED BY CURRENT JOB” WORKENV7 = “ASTHMA AGGRAVATED BY PREVIOUS JOB” WORKENV8 = “ASTHMA CAUSED BY PREVIOUS JOB” WORKQUIT1 = “EVER CHANGE OR QUIT A JOB” WORKSEN3 = “DOCTOR DIAGNOSED WORK ASTHMA” WORKSEN4 = “SELF-IDENTIFIED WORK ASTHMA” WORKTALK = “DOCTOR DISCUSSED WORK ASTHMA” INS1 = “INSURANCE” INS2 = “INSURANCE OR COVERAGE GAP” LANDWT_F = “FINAL ADULT ASTHMA CALL-BACK WEIGHT” LASTSYMP = “LAST HAD ANY SYMPTOMS OF ASTHMA” LAST_MD = “LAST TALKED TO A DOCTOR” LAST_MED = “LAST TOOK ASTHMA MEDICATION” LLCPWT_F = “ADULT ASTHMA FINAL WEIGHT” RACEG_F = “RACE COLLAPSED TO 1 GROUP FOR ASTHMA CALL-BACK” REGION_F = “REGIONS COLLAPSED TO 1 REGION FOR ASTHMA CALL-BACK. COMPASTH =”TYPICAL ATTACK"
asthma.mgt.adult1 <- data.frame(TCH.SIGN = asthma.adult$TCH_SIGN,
TCH.RESP = asthma.adult$TCH_RESP,
TCH.MON = asthma.adult$TCH_MON,
MGT.PLAN = asthma.adult$MGT_PLAN,
MGT.CLAS = asthma.adult$MGT_CLAS,
INHALERW = asthma.adult$INHALERW,
MOD.ENV = asthma.adult$MOD_ENV,
SEX = asthma.adult$SEX,
AGEG.F7 = asthma.adult$AGEG_F7,
"RACE.GR3" = asthma.adult[, "_RACEGR3"],
"EDUCA" = asthma.adult[, "_EDUCAG"],
EDUCAL = asthma.adult$EDUCA,
INCOMEL = asthma.adult$INCOME2,
"INCOMG" = asthma.adult[, "_INCOMG"],
"BMISCAT "= asthma.adult[, "_BMI5CAT"],
"RFBMIS" = asthma.adult[, "_RFBMI5"],
SMOKE100 = asthma.adult$SMOKE100,
COPD = asthma.adult$COPD,
EMPHY = asthma.adult$EMPHY,
DEPRESS = asthma.adult$DEPRESS,
BRONCH = asthma.adult$BRONCH,
SYMP.30D = asthma.adult$SYMP_30D,
DUR.30D = asthma.adult$DUR_30D,
ASLEEP30 = asthma.adult$ASLEEP30,
SYMPFREE = asthma.adult$SYMPFREE,
INCINDT = asthma.adult$INCIDNT,
LAST.MD = asthma.adult$LAST_MD,
LAST.MED = asthma.adult$LAST_MED,
LAST.SYMP = asthma.adult$LASTSYMP,
EPIS.12M = asthma.adult$EPIS_12M,
EPIS.TP = asthma.adult$EPIS_TP,
DUR.ASTH = asthma.adult$DUR_ASTH,
COMPASTH = asthma.adult$COMPASTH,
INS1 = asthma.adult$INS1,
INS2 = asthma.adult$INS2,
ER.TIMES = asthma.adult$ER_TIMES,
ER.VISIT = asthma.adult$ER_VISIT,
URG.TIMES = asthma.adult$URG_TIME,
HOSP.VST = asthma.adult$HOSP_VST,
HOSPTIME = asthma.adult$HOSPTIME,
HOSPPLAN = asthma.adult$HOSPPLAN,
ASMDCOST = asthma.adult$ASMDCOST,
ASRXCOST = asthma.adult$ASRXCOST,
ASSPCOST = asthma.adult$ASSPCOST,
WORKTALK = asthma.adult$WORKTALK
)
asth.mgt.ad.min <- asthma.mgt.adult1 %>% filter(TCH.SIGN == 1 | TCH.SIGN == 2,
TCH.RESP == 1 | TCH.RESP == 2,
TCH.MON == 1 | TCH.MON == 2,
MGT.PLAN == 1 | MGT.PLAN == 2,
MGT.CLAS == 1 | MGT.CLAS == 2,
INHALERW == 1 | INHALERW == 2,
MOD.ENV == 1 | MOD.ENV == 2)
dim(asth.mgt.ad.min)
## [1] 11494 45
X_BMI5CAT, -INCOMEL, -X_EDUCAG, -EPIS.TP, -HOSPPLAN, -ASSPCOST, -ASMDCOST, -INS2
asth.mgt.ad.min2 <- asth.mgt.ad.min
# asth.mgt.ad.min2 <- asth.mgt.ad.min %>% filter(LAST.MD != 77 & LAST.MD != 99,
# LAST.MED != 77 & LAST.MED != 99,
# LAST.SYMP != 77 & LAST.SYMP != 99,LAST.SYMP != 88,
# INCINDT != 7 & INCINDT != 9,
# SYMP.30D != 77 & SYMP.30D != 99,
# DUR.30D != 9 & DUR.30D != 99,
# ASLEEP30 != 99, ASLEEP30 !=66, ASLEEP30 != 100, ASLEEP30 != 111,
# EPIS.12M != 7 & EPIS.12M != 9,
# COMPASTH != 7 & COMPASTH != 9,
# INS1 != 7 & INS1 != 9,
# ER.VISIT != 7 & ER.VISIT != 9,
# ER.TIMES != 777 & ER.TIMES != 999,
# URG.TIMES != 777 & URG.TIMES != 999,
# HOSP.VST != 9,
# HOSPTIME != 777,
# SYMP.30D != 19,
# ASLEEP30 != 27,
# DUR.ASTH != 209,
# DUR.ASTH != 311,
# DUR.ASTH != 424,
# SYMP.30D != 26,
# DUR.ASTH != 127,
# DUR.ASTH != 321,
# DUR.ASTH != 213,
# DUR.ASTH != 407,
# DUR.ASTH != 317,
# ER.TIMES != 32,
# URG.TIMES != 28,
# URG.TIMES != 60,
# URG.TIMES != 9,
# DUR.ASTH != 116,
# URG.TIMES != 14)
asth.mgt.ad.min %>% select(DUR.ASTH) %>% filter(DUR.ASTH==0)
## [1] DUR.ASTH
## <0 rows> (or 0-length row.names)
#dim(asth.mgt.ad.min2)
dim()
Structure
str(asth.mgt.ad.min2)
## 'data.frame': 11494 obs. of 45 variables:
## $ TCH.SIGN : num 1 2 1 2 2 1 1 2 1 2 ...
## ..- attr(*, "label")= chr "EVER TAUGHT RECOGNIZE EARLY SIGN OR SYMPTOMS"
## ..- attr(*, "format.sas")= chr "TCH_SIGN"
## $ TCH.RESP : num 1 1 1 2 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "EVER TAUGHT WHAT TO DO DURING ASTHMA EPISODE OR ATTACK"
## ..- attr(*, "format.sas")= chr "TCH_RESP"
## $ TCH.MON : num 2 2 2 2 2 1 1 2 2 2 ...
## ..- attr(*, "label")= chr "EVER TAUGHT HOW TO USE A PEAK FLOW"
## ..- attr(*, "format.sas")= chr "TCH_MON"
## $ MGT.PLAN : num 2 2 2 2 2 2 1 2 2 2 ...
## ..- attr(*, "label")= chr "EVER GIVEN AN ASTHMA ACTION PLAN"
## ..- attr(*, "format.sas")= chr "MGT_PLAN"
## $ MGT.CLAS : num 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "label")= chr "EVER TAKEN A COURSE TO MANAGE ASTHMA"
## ..- attr(*, "format.sas")= chr "MGT_CLAS"
## $ INHALERW : num 2 2 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "INHALER USE WATCHED"
## ..- attr(*, "format.sas")= chr "INHALERW"
## $ MOD.ENV : num 2 2 2 2 1 2 2 2 1 2 ...
## ..- attr(*, "label")= chr "EVER ADVISED CHANGE THINGS IN YOUR HOME"
## ..- attr(*, "format.sas")= chr "MOD_ENV"
## $ SEX : num 1 2 2 2 2 2 1 2 2 2 ...
## ..- attr(*, "label")= chr "RESPONDENTS SEX"
## ..- attr(*, "format.sas")= chr "SEX"
## $ AGEG.F7 : num 4 5 5 3 6 5 4 6 6 7 ...
## ..- attr(*, "label")= chr "AGE COLLAPSED TO 7 GROUPS FOR ASTHMA CALL-BACK"
## ..- attr(*, "format.sas")= chr "AGEG_F7Z"
## $ X_RACEGR3: num 3 1 1 5 1 5 1 1 1 1 ...
## ..- attr(*, "label")= chr "COMPUTED FIVE LEVEL RACE/ETHNICITY CATEGORY."
## ..- attr(*, "format.sas")= chr "_3RACEGR"
## $ X_EDUCAG : num 4 2 2 3 4 4 4 4 4 3 ...
## ..- attr(*, "label")= chr "COMPUTED LEVEL OF EDUCATION COMPLETED CATEGORIES"
## ..- attr(*, "format.sas")= chr "_EDUCAG"
## $ EDUCAL : num 6 4 4 5 6 6 6 6 6 5 ...
## ..- attr(*, "label")= chr "EDUCATION LEVEL"
## ..- attr(*, "format.sas")= chr "EDUCA"
## $ INCOMEL : num 8 2 2 8 8 7 8 7 5 99 ...
## ..- attr(*, "label")= chr "INCOME LEVEL"
## ..- attr(*, "format.sas")= chr "IN2COME"
## $ X_INCOMG : num 5 1 1 5 5 5 5 5 3 9 ...
## ..- attr(*, "label")= chr "COMPUTED INCOME CATEGORIES"
## ..- attr(*, "format.sas")= chr "_INCOMG"
## $ X_BMI5CAT: num 4 4 3 3 4 3 2 3 4 2 ...
## ..- attr(*, "label")= chr "COMPUTED BODY MASS INDEX CATEGORIES"
## ..- attr(*, "format.sas")= chr "_BMI5CAT"
## $ X_RFBMI5 : num 2 2 2 2 2 2 1 2 2 1 ...
## ..- attr(*, "label")= chr "OVERWEIGHT OR OBESE CALCULATED VARIABLE"
## ..- attr(*, "format.sas")= chr "_5RFBMI"
## $ SMOKE100 : num 2 1 1 2 1 2 1 1 2 2 ...
## ..- attr(*, "label")= chr "SMOKED AT LEAST 100 CIGARETTES"
## ..- attr(*, "format.sas")= chr "SMOK100_"
## $ COPD : num 2 1 2 2 2 2 2 2 2 1 ...
## ..- attr(*, "label")= chr "EVER TOLD HAVE CHRONIC OBSTRUCTIVE PULMONARY DISEASE"
## ..- attr(*, "format.sas")= chr "COPD"
## $ EMPHY : num 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "label")= chr "EVER TOLD HAVE EMPHYSEMA"
## ..- attr(*, "format.sas")= chr "EMPHY"
## $ DEPRESS : num 2 1 2 2 2 2 2 2 1 1 ...
## ..- attr(*, "label")= chr "EVER TOLD DEPRESSED"
## ..- attr(*, "format.sas")= chr "DEPRESS"
## $ BRONCH : num 2 1 2 2 1 2 2 2 1 2 ...
## ..- attr(*, "label")= chr "EVER TOLD HAVE CHRONIC BRONCHITIS"
## ..- attr(*, "format.sas")= chr "BRONCH"
## $ SYMP.30D : num 100 30 7 66 17 100 10 66 30 66 ...
## ..- attr(*, "label")= chr "SYMPTOM DAYS"
## ..- attr(*, "format.sas")= chr "SYMP_30D"
## $ DUR.30D : num 10 2 12 6 12 10 12 6 1 6 ...
## ..- attr(*, "label")= chr "CONSTANT SYMPTOMS"
## ..- attr(*, "format.sas")= chr "DUR_30D"
## $ ASLEEP30 : num 100 15 6 66 88 100 88 66 20 66 ...
## ..- attr(*, "label")= chr "NIGHT SYMPTOMS"
## ..- attr(*, "format.sas")= chr "ASLEEP30Z"
## $ SYMPFREE : num 14 88 6 14 11 14 7 14 88 14 ...
## ..- attr(*, "label")= chr "SYMPTOM-FREE DAYS"
## ..- attr(*, "format.sas")= chr "SYMPFREE"
## $ INCINDT : num 3 2 3 3 3 2 3 3 3 3 ...
## ..- attr(*, "label")= chr "TIME SINCE DIAGNOSIS"
## ..- attr(*, "format.sas")= chr "INCIDNT"
## $ LAST.MD : num 5 4 4 7 4 4 4 5 4 5 ...
## ..- attr(*, "label")= chr "LAST TALKED TO A DOCTOR"
## ..- attr(*, "format.sas")= chr "LAST_MD"
## $ LAST.MED : num 4 1 3 7 3 1 1 6 1 5 ...
## ..- attr(*, "label")= chr "LAST TOOK ASTHMA MEDICATION"
## ..- attr(*, "format.sas")= chr "LAST_MED"
## $ LAST.SYMP: num 4 1 3 7 3 4 3 5 1 5 ...
## ..- attr(*, "label")= chr "LAST HAD ANY SYMPTOMS OF ASTHMA"
## ..- attr(*, "format.sas")= chr "LASTSYMP"
## $ EPIS.12M : num 1 1 1 6 1 2 1 6 2 6 ...
## ..- attr(*, "label")= chr "ASTHMA EPISODE OR ATTACK"
## ..- attr(*, "format.sas")= chr "EPIS_12M"
## $ EPIS.TP : num 888 2 2 666 38 ...
## ..- attr(*, "label")= chr "NUMBER OF EPISODES / ATTACKS"
## ..- attr(*, "format.sas")= chr "EPIS_TP"
## $ DUR.ASTH : num 105 204 120 666 115 ...
## ..- attr(*, "label")= chr "DURATION OF ATTACK"
## ..- attr(*, "format.sas")= chr "DUR_ASTH"
## $ COMPASTH : num 1 3 1 6 3 11 3 6 11 6 ...
## ..- attr(*, "label")= chr "TYPICAL ATTACK"
## ..- attr(*, "format.sas")= chr "COMPASTH"
## $ INS1 : num 1 1 1 2 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "INSURANCE"
## ..- attr(*, "format.sas")= chr "INS1Z"
## $ INS2 : num 2 2 2 5 2 2 2 2 2 2 ...
## ..- attr(*, "label")= chr "INSURANCE OR COVERAGE GAP"
## ..- attr(*, "format.sas")= chr "INS2Z"
## $ ER.TIMES : num 666 677 677 555 677 677 677 555 677 666 ...
## ..- attr(*, "label")= chr "NUMBER OF EMERGENCY ROOM VISITS"
## ..- attr(*, "format.sas")= chr "ER_TIMES"
## $ ER.VISIT : num 6 2 2 5 2 2 2 5 2 6 ...
## ..- attr(*, "label")= chr "EMERGENCY ROOM VISIT"
## ..- attr(*, "format.sas")= chr "ER_VISIT"
## $ URG.TIMES: num 666 888 3 555 888 888 888 555 888 666 ...
## ..- attr(*, "label")= chr "NUMBER OF URGENT VISITS"
## ..- attr(*, "format.sas")= chr "URG_TIME"
## $ HOSP.VST : num 6 2 2 5 2 2 2 5 2 6 ...
## ..- attr(*, "label")= chr "HOSPITAL VISIT"
## ..- attr(*, "format.sas")= chr "HOSP_VST"
## $ HOSPTIME : num 666 455 455 555 455 455 455 555 455 666 ...
## ..- attr(*, "label")= chr "NUMBER OF HOSPITAL VISITS"
## ..- attr(*, "format.sas")= chr "HOSPTIME"
## $ HOSPPLAN : num 6 45 45 5 45 45 45 5 45 6 ...
## ..- attr(*, "label")= chr "HOSPITAL FOLLOW-UP"
## ..- attr(*, "format.sas")= chr "HOSPPLAN"
## $ ASMDCOST : num 2 2 2 5 2 2 2 5 2 2 ...
## ..- attr(*, "label")= chr "COST BARRIER: PRIMARY CARE DOCTOR"
## ..- attr(*, "format.sas")= chr "ASMDCOST"
## $ ASRXCOST : num 2 2 2 5 2 2 2 5 1 2 ...
## ..- attr(*, "label")= chr "COST BARRIER: MEDICATION"
## ..- attr(*, "format.sas")= chr "ASRXCOST"
## $ ASSPCOST : num 2 2 2 5 2 2 2 5 2 2 ...
## ..- attr(*, "label")= chr "COST BARRIER: SPECIALIST"
## ..- attr(*, "format.sas")= chr "ASSPCOST"
## $ WORKTALK : num 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "label")= chr "DOCTOR DISCUSSED WORK ASTHMA"
## ..- attr(*, "format.sas")= chr "WORKTALK"
attach(asth.mgt.ad.min2)
Summary
summary(asth.mgt.ad.min2)
## TCH.SIGN TCH.RESP TCH.MON MGT.PLAN
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :2.000 Median :2.000
## Mean :1.337 Mean :1.238 Mean :1.555 Mean :1.694
## 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :2.000 Max. :2.000 Max. :2.000
##
## MGT.CLAS INHALERW MOD.ENV SEX AGEG.F7
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:4.000
## Median :2.000 Median :1.00 Median :2.000 Median :2.000 Median :5.000
## Mean :1.906 Mean :1.24 Mean :1.662 Mean :1.678 Mean :4.594
## 3rd Qu.:2.000 3rd Qu.:1.00 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:6.000
## Max. :2.000 Max. :2.00 Max. :2.000 Max. :2.000 Max. :7.000
##
## X_RACEGR3 X_EDUCAG EDUCAL INCOMEL X_INCOMG
## Min. :1.000 Min. :1.000 Min. :1.00 Min. : 1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:4.00 1st Qu.: 4.00 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :5.00 Median : 6.00 Median :4.000
## Mean :1.653 Mean :3.007 Mean :4.98 Mean :13.61 Mean :4.057
## 3rd Qu.:1.000 3rd Qu.:4.000 3rd Qu.:6.00 3rd Qu.: 8.00 3rd Qu.:5.000
## Max. :9.000 Max. :9.000 Max. :9.00 Max. :99.00 Max. :9.000
##
## X_BMI5CAT X_RFBMI5 SMOKE100 COPD EMPHY
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.00 1st Qu.:2.000
## Median :3.000 Median :2.000 Median :2.000 Median :2.00 Median :2.000
## Mean :3.147 Mean :2.062 Mean :1.549 Mean :1.85 Mean :1.945
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.00 3rd Qu.:2.000
## Max. :4.000 Max. :9.000 Max. :9.000 Max. :9.00 Max. :9.000
## NA's :520 NA's :30 NA's :30
## DEPRESS BRONCH SYMP.30D DUR.30D
## Min. :1.000 Min. :1.000 Min. : 1.00 Min. : 1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 10.00 1st Qu.: 6.000
## Median :2.000 Median :2.000 Median : 30.00 Median :10.000
## Mean :1.633 Mean :1.769 Mean : 44.46 Mean : 9.201
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.: 66.00 3rd Qu.:12.000
## Max. :9.000 Max. :9.000 Max. :100.00 Max. :99.000
## NA's :30 NA's :30
## ASLEEP30 SYMPFREE INCINDT LAST.MD
## Min. : 1 Min. : 1.00 Min. :1.000 Min. : 4.000
## 1st Qu.: 66 1st Qu.:12.00 1st Qu.:3.000 1st Qu.: 4.000
## Median : 66 Median :14.00 Median :3.000 Median : 4.000
## Mean : 66 Mean :28.28 Mean :2.892 Mean : 5.729
## 3rd Qu.: 88 3rd Qu.:14.00 3rd Qu.:3.000 3rd Qu.: 5.000
## Max. :111 Max. :99.00 Max. :9.000 Max. :99.000
##
## LAST.MED LAST.SYMP EPIS.12M EPIS.TP
## Min. : 1.000 Min. : 1.000 Min. :1.000 Min. : 1.0
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.: 10.0
## Median : 3.000 Median : 3.000 Median :2.000 Median : 666.0
## Mean : 3.733 Mean : 4.742 Mean :2.703 Mean : 625.3
## 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.:6.000 3rd Qu.:1111.0
## Max. :99.000 Max. :99.000 Max. :9.000 Max. :1111.0
##
## DUR.ASTH COMPASTH INS1 INS2
## Min. : 101.0 Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 203.0 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.:2.000
## Median : 666.0 Median : 6.000 Median :1.000 Median :2.000
## Mean : 626.8 Mean : 6.126 Mean :1.065 Mean :2.127
## 3rd Qu.:1111.0 3rd Qu.:11.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :1111.0 Max. :11.000 Max. :9.000 Max. :9.000
##
## ER.TIMES ER.VISIT URG.TIMES HOSP.VST
## Min. : 1.0 Min. :1.000 Min. : 1.0 Min. :1.000
## 1st Qu.:555.0 1st Qu.:2.000 1st Qu.:555.0 1st Qu.:2.000
## Median :677.0 Median :2.000 Median :666.0 Median :2.000
## Mean :584.2 Mean :3.256 Mean :616.3 Mean :3.468
## 3rd Qu.:677.0 3rd Qu.:5.000 3rd Qu.:888.0 3rd Qu.:5.000
## Max. :999.0 Max. :9.000 Max. :999.0 Max. :7.000
##
## HOSPTIME HOSPPLAN ASMDCOST ASRXCOST
## Min. : 1.0 Min. : 1.00 Min. :1.000 Min. :1.000
## 1st Qu.:455.0 1st Qu.: 5.00 1st Qu.:2.000 1st Qu.:2.000
## Median :455.0 Median :45.00 Median :2.000 Median :2.000
## Mean :503.5 Mean :25.91 Mean :2.416 Mean :2.353
## 3rd Qu.:555.0 3rd Qu.:45.00 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :777.0 Max. :45.00 Max. :9.000 Max. :9.000
## NA's :4 NA's :4
## ASSPCOST WORKTALK
## Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.000
## Mean :2.433 Mean :1.928
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :9.000 Max. :9.000
## NA's :4 NA's :8
Histogram of the target variable
Histograms tell us how the data is distributed in the dataset (numeric fields).
cor_asthma.adult <- cor(asth.mgt.ad.min2[,-c(1:7)], use = "na.or.complete")
corrplot(cor_asthma.adult, order = 'hclust', type = 'lower')
# cor(asthma.mgt.adult1[,-1], use = "na.or.complete")
# write.csv(cor(asthma.mgt.adult1[,-1], use = "na.or.complete"), "predictors_cor.csv")
We are removing the predictor X_BMISCAT because it has to much missing values There are highly correlated predictors. We are going to remove some of them.
asth.mgt.ad.min21 <- select(asth.mgt.ad.min2, -X_BMI5CAT, -INCOMEL, -X_EDUCAG, -EPIS.TP, -HOSPPLAN, -ASSPCOST, -ASMDCOST, -INS2)
colnames(asth.mgt.ad.min21)
## [1] "TCH.SIGN" "TCH.RESP" "TCH.MON" "MGT.PLAN" "MGT.CLAS" "INHALERW"
## [7] "MOD.ENV" "SEX" "AGEG.F7" "X_RACEGR3" "EDUCAL" "X_INCOMG"
## [13] "X_RFBMI5" "SMOKE100" "COPD" "EMPHY" "DEPRESS" "BRONCH"
## [19] "SYMP.30D" "DUR.30D" "ASLEEP30" "SYMPFREE" "INCINDT" "LAST.MD"
## [25] "LAST.MED" "LAST.SYMP" "EPIS.12M" "DUR.ASTH" "COMPASTH" "INS1"
## [31] "ER.TIMES" "ER.VISIT" "URG.TIMES" "HOSP.VST" "HOSPTIME" "ASRXCOST"
## [37] "WORKTALK"
We first extract variables related to education, #### Selection of variables
responses <- data.frame(
TCH.SIGN = asth.mgt.ad.min21$TCH.SIGN,
TCH.RESP = asth.mgt.ad.min21$TCH.RESP,
TCH.MON = asth.mgt.ad.min21$TCH.MON,
MGT.PLAN = asth.mgt.ad.min21$MGT.PLAN,
MGT.CLAS = asth.mgt.ad.min21$MGT.CLAS,
INHALERW = asth.mgt.ad.min21$INHALERW,
MOD.ENV = asth.mgt.ad.min21$MOD.ENV
)
head(responses)
## TCH.SIGN TCH.RESP TCH.MON MGT.PLAN MGT.CLAS INHALERW MOD.ENV
## 1 1 1 2 2 2 2 2
## 2 2 1 2 2 2 2 2
## 3 1 1 2 2 2 1 2
## 4 2 2 2 2 2 1 2
## 5 2 1 2 2 2 1 1
## 6 1 1 1 2 2 1 2
responses.cat <- data.frame(apply(responses, 2, as.factor))
summary(responses.cat)
## TCH.SIGN TCH.RESP TCH.MON MGT.PLAN MGT.CLAS INHALERW MOD.ENV
## 1:7621 1:8760 1:5118 1:3522 1: 1078 1:8738 1:3889
## 2:3873 2:2734 2:6376 2:7972 2:10416 2:2756 2:7605
set.seed(2)
# Initialize total within sum of squares error: wss
wss <- 0
# Look over 1 to 15 possible clusters
for (i in 1:16) {
# Fit the model: km.out
km.out <- kmeans(responses.cat, centers = i, nstart = 20, iter.max = 50)
# Save the within cluster sum of squares
wss[i] <- km.out$tot.withinss
}
# Produce a scree plot
plot(1:16, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares"
)
# Select number of clusters
k <- 3
set.seed(2)
# Build model with k clusters: km.out
km.out <- kmeans(responses.cat, centers = k, nstart = 20, iter.max = 50)
# View the resulting model
km.out$centers
## TCH.SIGN TCH.RESP TCH.MON MGT.PLAN MGT.CLAS INHALERW MOD.ENV
## 1 1.977681 1.648612 1.802667 1.950463 1.979314 1.429777 1.822265
## 2 1.007879 1.058652 2.000000 1.759556 1.927050 1.204844 1.654800
## 3 1.057819 1.034145 1.000000 1.427271 1.828819 1.108127 1.532666
resp.asthma <- cbind(responses.cat, target = km.out$cluster)
head(resp.asthma)
## TCH.SIGN TCH.RESP TCH.MON MGT.PLAN MGT.CLAS INHALERW MOD.ENV target
## 1 1 1 2 2 2 2 2 2
## 2 2 1 2 2 2 2 2 1
## 3 1 1 2 2 2 1 2 2
## 4 2 2 2 2 2 1 2 1
## 5 2 1 2 2 2 1 1 1
## 6 1 1 1 2 2 1 2 3
write.csv(resp.asthma, "response_interpret.csv")
resp.asthma$target <- as.factor(resp.asthma$target)
summary(resp.asthma)
## TCH.SIGN TCH.RESP TCH.MON MGT.PLAN MGT.CLAS INHALERW MOD.ENV target
## 1:7621 1:8760 1:5118 1:3522 1: 1078 1:8738 1:3889 1:3674
## 2:3873 2:2734 2:6376 2:7972 2:10416 2:2756 2:7605 2:3427
## 3:4393
plot(resp.asthma$target)
egt <- group_by(resp.asthma, TCH.SIGN, target) %>% summarise(count=n()) %>%
group_by(TCH.SIGN) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'TCH.SIGN' (override with `.groups` argument)
tibble::as.tibble(egt)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## # A tibble: 6 x 5
## TCH.SIGN target count etotal proportion
## <fct> <fct> <int> <int> <dbl>
## 1 1 1 82 7621 0.0108
## 2 1 2 3400 7621 0.446
## 3 1 3 4139 7621 0.543
## 4 2 1 3592 3873 0.927
## 5 2 2 27 3873 0.00697
## 6 2 3 254 3873 0.0656
asth.res1 <- egt %>% group_by(TCH.SIGN) %>% mutate(group.max = max(count)) %>% group_by(target) %>% filter(count==group.max)
asth.res1
## # A tibble: 2 x 6
## # Groups: target [2]
## TCH.SIGN target count etotal proportion group.max
## <fct> <fct> <int> <int> <dbl> <int>
## 1 1 3 4139 7621 0.543 4139
## 2 2 1 3592 3873 0.927 3592
ggplot(egt, aes(x=target, y=proportion, group=TCH.SIGN, linetype=TCH.SIGN))+geom_line()
In the target response, 8 is the positive answer, 3 is the negative answer, 5 is don’t know and 6 is refused for the question: TCH_SIGN Has a doctor or other health professional ever taught you… a. How to recognize early signs or symptoms of an asthma episode?
egt <- group_by(resp.asthma, TCH.RESP, target) %>% summarise(count=n()) %>%
group_by(TCH.RESP) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'TCH.RESP' (override with `.groups` argument)
tibble::as.tibble(egt)
## # A tibble: 6 x 5
## TCH.RESP target count etotal proportion
## <fct> <fct> <int> <int> <dbl>
## 1 1 1 1291 8760 0.147
## 2 1 2 3226 8760 0.368
## 3 1 3 4243 8760 0.484
## 4 2 1 2383 2734 0.872
## 5 2 2 201 2734 0.0735
## 6 2 3 150 2734 0.0549
asth.res2 <- egt %>% group_by(TCH.RESP) %>% mutate(group.max = max(count)) %>% group_by(target) %>% filter(count==group.max)
asth.res2
## # A tibble: 2 x 6
## # Groups: target [2]
## TCH.RESP target count etotal proportion group.max
## <fct> <fct> <int> <int> <dbl> <int>
## 1 1 3 4243 8760 0.484 4243
## 2 2 1 2383 2734 0.872 2383
ggplot(egt, aes(x=target, y=proportion, group=TCH.RESP, linetype=TCH.RESP))+geom_line()
In the target response, 8 is the positive answer, 3 is the negative answer, 1 is don’t know and 1 is refused for the question: TCH_RESP Has a doctor or other health professional ever taught you… b. What to do during an asthma episode or attack?
egt <- group_by(resp.asthma, TCH.MON, target) %>% summarise(count=n()) %>%
group_by(TCH.MON) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'TCH.MON' (override with `.groups` argument)
tibble::as.tibble(egt)
## # A tibble: 4 x 5
## TCH.MON target count etotal proportion
## <fct> <fct> <int> <int> <dbl>
## 1 1 1 725 5118 0.142
## 2 1 3 4393 5118 0.858
## 3 2 1 2949 6376 0.463
## 4 2 2 3427 6376 0.537
asth.res3 <- egt %>% group_by(TCH.MON) %>% mutate(group.max = max(count)) %>% group_by(target) %>% filter(count==group.max)
asth.res3
## # A tibble: 2 x 6
## # Groups: target [2]
## TCH.MON target count etotal proportion group.max
## <fct> <fct> <int> <int> <dbl> <int>
## 1 1 3 4393 5118 0.858 4393
## 2 2 2 3427 6376 0.537 3427
ggplot(egt, aes(x=target, y=proportion, group=TCH.MON, linetype=TCH.MON))+geom_line()
In the target response, 8 is the positive answer, 7 are the negative answers, 2 is don’t know and 2 is refused for the question: TCH_MON A peak flow meter is a hand held device that measures how quickly you can blow air out of your lungs. Has a doctor or other health professional ever taught you… c. How to use a peak flow meter to adjust your daily medications?
egt <- group_by(resp.asthma, MGT.PLAN, target) %>% summarise(count=n()) %>%
group_by(MGT.PLAN) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'MGT.PLAN' (override with `.groups` argument)
tibble::as.tibble(egt)
## # A tibble: 6 x 5
## MGT.PLAN target count etotal proportion
## <fct> <fct> <int> <int> <dbl>
## 1 1 1 182 3522 0.0517
## 2 1 2 824 3522 0.234
## 3 1 3 2516 3522 0.714
## 4 2 1 3492 7972 0.438
## 5 2 2 2603 7972 0.327
## 6 2 3 1877 7972 0.235
asth.res4 <- egt %>% group_by(MGT.PLAN) %>% mutate(group.max = max(count)) %>% group_by(target) %>% filter(count==group.max)
asth.res4
## # A tibble: 2 x 6
## # Groups: target [2]
## MGT.PLAN target count etotal proportion group.max
## <fct> <fct> <int> <int> <dbl> <int>
## 1 1 3 2516 3522 0.714 2516
## 2 2 1 3492 7972 0.438 3492
ggplot(egt, aes(x=target, y=proportion, group=MGT.PLAN, linetype=MGT.PLAN))+geom_line()
In the target response, 8 is the positive answer, 3 is the negative answer, 9 is don’t know and 9 is refused for the question: MGT_PLAN An asthma action plan, or asthma management plan, is a form with instructions about when to change the amount or type of medicine, when to call the doctor for advice, and when to go to the emergency room. Has a doctor or other health professional EVER given you an asthma action plan?
egt <- group_by(resp.asthma, MGT.CLAS, target) %>% summarise(count=n()) %>%
group_by(MGT.CLAS) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'MGT.CLAS' (override with `.groups` argument)
tibble::as.tibble(egt)
## # A tibble: 6 x 5
## MGT.CLAS target count etotal proportion
## <fct> <fct> <int> <int> <dbl>
## 1 1 1 76 1078 0.0705
## 2 1 2 250 1078 0.232
## 3 1 3 752 1078 0.698
## 4 2 1 3598 10416 0.345
## 5 2 2 3177 10416 0.305
## 6 2 3 3641 10416 0.350
asth.res5 <- egt %>% group_by(MGT.CLAS) %>% mutate(group.max = max(count)) %>% group_by(target) %>% filter(count==group.max)
asth.res5
## # A tibble: 2 x 6
## # Groups: target [1]
## MGT.CLAS target count etotal proportion group.max
## <fct> <fct> <int> <int> <dbl> <int>
## 1 1 3 752 1078 0.698 752
## 2 2 3 3641 10416 0.350 3641
ggplot(egt, aes(x=target, y=proportion, group=MGT.CLAS, linetype=MGT.CLAS))+geom_line()
In the target response, 8 is the positive answer, 8 or(3,7) is the negative answer, 8 is don’t know and 6 is refused for the question: MGT_CLAS Have you ever taken a course or class on how to manage your asthma?
egt <- group_by(resp.asthma, INHALERW , target) %>% summarise(count=n()) %>%
group_by(INHALERW ) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'INHALERW' (override with `.groups` argument)
tibble::as.tibble(egt)
## # A tibble: 6 x 5
## INHALERW target count etotal proportion
## <fct> <fct> <int> <int> <dbl>
## 1 1 1 2095 8738 0.240
## 2 1 2 2725 8738 0.312
## 3 1 3 3918 8738 0.448
## 4 2 1 1579 2756 0.573
## 5 2 2 702 2756 0.255
## 6 2 3 475 2756 0.172
asth.res6 <- egt %>% group_by(INHALERW) %>% mutate(group.max = max(count)) %>% group_by(target) %>% filter(count==group.max)
asth.res6
## # A tibble: 2 x 6
## # Groups: target [2]
## INHALERW target count etotal proportion group.max
## <fct> <fct> <int> <int> <dbl> <int>
## 1 1 3 3918 8738 0.448 3918
## 2 2 1 1579 2756 0.573 1579
ggplot(egt, aes(x=target, y=proportion, group=INHALERW , linetype=INHALERW))+geom_line()
In the target response, 8 is the positive answer, 3 is the negative answer, 4 is don’t know and 1 is refused for the question: INHALERW (8.4) Did a doctor or other health professional watch you use the inhaler?
egt <- group_by(resp.asthma, MOD.ENV , target) %>% summarise(count=n()) %>%
group_by(MOD.ENV) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'MOD.ENV' (override with `.groups` argument)
tibble::as.tibble(egt)
## # A tibble: 6 x 5
## MOD.ENV target count etotal proportion
## <fct> <fct> <int> <int> <dbl>
## 1 1 1 653 3889 0.168
## 2 1 2 1183 3889 0.304
## 3 1 3 2053 3889 0.528
## 4 2 1 3021 7605 0.397
## 5 2 2 2244 7605 0.295
## 6 2 3 2340 7605 0.308
asth.res7 <- egt %>% group_by(MOD.ENV) %>% mutate(group.max = max(count)) %>% group_by(target) %>% filter(count==group.max)
asth.res7
## # A tibble: 2 x 6
## # Groups: target [2]
## MOD.ENV target count etotal proportion group.max
## <fct> <fct> <int> <int> <dbl> <int>
## 1 1 3 2053 3889 0.528 2053
## 2 2 1 3021 7605 0.397 3021
ggplot(egt, aes(x=target, y=proportion, group=MOD.ENV, linetype=MOD.ENV))+geom_line()
response.var <- data_frame(RESPONSE = c("1=YES", "2=NO"),
TCH.SIGN = asth.res1$target,
TCH.RES = asth.res2$target,
TCH.MON = asth.res3$target,
MGT.PLAN = asth.res4$target,
MGT.CLAS = asth.res5$target,
INHALERW = asth.res6$target,
MOD.ENV = asth.res7$target)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
response.var
## # A tibble: 2 x 8
## RESPONSE TCH.SIGN TCH.RES TCH.MON MGT.PLAN MGT.CLAS INHALERW MOD.ENV
## <chr> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1=YES 3 3 3 3 3 3 3
## 2 2=NO 1 1 2 1 3 1 1
resp.asthma2 <- resp.asthma
resp.asthma2$target <- if_else(resp.asthma2$target==3, 1, 0)
#?mutate()
asth.mgt.ad.min31 <- asth.mgt.ad.min21 %>%
select( -TCH.SIGN,-TCH.RESP, -TCH.MON, -MGT.PLAN, -MGT.CLAS, -INHALERW, -MOD.ENV) %>%
mutate(TARGET = resp.asthma2$target) %>% relocate(TARGET, .before = SEX)
str(asth.mgt.ad.min31)
## 'data.frame': 11494 obs. of 31 variables:
## $ TARGET : num 0 0 0 0 0 1 1 0 0 0 ...
## $ SEX : num 1 2 2 2 2 2 1 2 2 2 ...
## ..- attr(*, "label")= chr "RESPONDENTS SEX"
## ..- attr(*, "format.sas")= chr "SEX"
## $ AGEG.F7 : num 4 5 5 3 6 5 4 6 6 7 ...
## ..- attr(*, "label")= chr "AGE COLLAPSED TO 7 GROUPS FOR ASTHMA CALL-BACK"
## ..- attr(*, "format.sas")= chr "AGEG_F7Z"
## $ X_RACEGR3: num 3 1 1 5 1 5 1 1 1 1 ...
## ..- attr(*, "label")= chr "COMPUTED FIVE LEVEL RACE/ETHNICITY CATEGORY."
## ..- attr(*, "format.sas")= chr "_3RACEGR"
## $ EDUCAL : num 6 4 4 5 6 6 6 6 6 5 ...
## ..- attr(*, "label")= chr "EDUCATION LEVEL"
## ..- attr(*, "format.sas")= chr "EDUCA"
## $ X_INCOMG : num 5 1 1 5 5 5 5 5 3 9 ...
## ..- attr(*, "label")= chr "COMPUTED INCOME CATEGORIES"
## ..- attr(*, "format.sas")= chr "_INCOMG"
## $ X_RFBMI5 : num 2 2 2 2 2 2 1 2 2 1 ...
## ..- attr(*, "label")= chr "OVERWEIGHT OR OBESE CALCULATED VARIABLE"
## ..- attr(*, "format.sas")= chr "_5RFBMI"
## $ SMOKE100 : num 2 1 1 2 1 2 1 1 2 2 ...
## ..- attr(*, "label")= chr "SMOKED AT LEAST 100 CIGARETTES"
## ..- attr(*, "format.sas")= chr "SMOK100_"
## $ COPD : num 2 1 2 2 2 2 2 2 2 1 ...
## ..- attr(*, "label")= chr "EVER TOLD HAVE CHRONIC OBSTRUCTIVE PULMONARY DISEASE"
## ..- attr(*, "format.sas")= chr "COPD"
## $ EMPHY : num 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "label")= chr "EVER TOLD HAVE EMPHYSEMA"
## ..- attr(*, "format.sas")= chr "EMPHY"
## $ DEPRESS : num 2 1 2 2 2 2 2 2 1 1 ...
## ..- attr(*, "label")= chr "EVER TOLD DEPRESSED"
## ..- attr(*, "format.sas")= chr "DEPRESS"
## $ BRONCH : num 2 1 2 2 1 2 2 2 1 2 ...
## ..- attr(*, "label")= chr "EVER TOLD HAVE CHRONIC BRONCHITIS"
## ..- attr(*, "format.sas")= chr "BRONCH"
## $ SYMP.30D : num 100 30 7 66 17 100 10 66 30 66 ...
## ..- attr(*, "label")= chr "SYMPTOM DAYS"
## ..- attr(*, "format.sas")= chr "SYMP_30D"
## $ DUR.30D : num 10 2 12 6 12 10 12 6 1 6 ...
## ..- attr(*, "label")= chr "CONSTANT SYMPTOMS"
## ..- attr(*, "format.sas")= chr "DUR_30D"
## $ ASLEEP30 : num 100 15 6 66 88 100 88 66 20 66 ...
## ..- attr(*, "label")= chr "NIGHT SYMPTOMS"
## ..- attr(*, "format.sas")= chr "ASLEEP30Z"
## $ SYMPFREE : num 14 88 6 14 11 14 7 14 88 14 ...
## ..- attr(*, "label")= chr "SYMPTOM-FREE DAYS"
## ..- attr(*, "format.sas")= chr "SYMPFREE"
## $ INCINDT : num 3 2 3 3 3 2 3 3 3 3 ...
## ..- attr(*, "label")= chr "TIME SINCE DIAGNOSIS"
## ..- attr(*, "format.sas")= chr "INCIDNT"
## $ LAST.MD : num 5 4 4 7 4 4 4 5 4 5 ...
## ..- attr(*, "label")= chr "LAST TALKED TO A DOCTOR"
## ..- attr(*, "format.sas")= chr "LAST_MD"
## $ LAST.MED : num 4 1 3 7 3 1 1 6 1 5 ...
## ..- attr(*, "label")= chr "LAST TOOK ASTHMA MEDICATION"
## ..- attr(*, "format.sas")= chr "LAST_MED"
## $ LAST.SYMP: num 4 1 3 7 3 4 3 5 1 5 ...
## ..- attr(*, "label")= chr "LAST HAD ANY SYMPTOMS OF ASTHMA"
## ..- attr(*, "format.sas")= chr "LASTSYMP"
## $ EPIS.12M : num 1 1 1 6 1 2 1 6 2 6 ...
## ..- attr(*, "label")= chr "ASTHMA EPISODE OR ATTACK"
## ..- attr(*, "format.sas")= chr "EPIS_12M"
## $ DUR.ASTH : num 105 204 120 666 115 ...
## ..- attr(*, "label")= chr "DURATION OF ATTACK"
## ..- attr(*, "format.sas")= chr "DUR_ASTH"
## $ COMPASTH : num 1 3 1 6 3 11 3 6 11 6 ...
## ..- attr(*, "label")= chr "TYPICAL ATTACK"
## ..- attr(*, "format.sas")= chr "COMPASTH"
## $ INS1 : num 1 1 1 2 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "INSURANCE"
## ..- attr(*, "format.sas")= chr "INS1Z"
## $ ER.TIMES : num 666 677 677 555 677 677 677 555 677 666 ...
## ..- attr(*, "label")= chr "NUMBER OF EMERGENCY ROOM VISITS"
## ..- attr(*, "format.sas")= chr "ER_TIMES"
## $ ER.VISIT : num 6 2 2 5 2 2 2 5 2 6 ...
## ..- attr(*, "label")= chr "EMERGENCY ROOM VISIT"
## ..- attr(*, "format.sas")= chr "ER_VISIT"
## $ URG.TIMES: num 666 888 3 555 888 888 888 555 888 666 ...
## ..- attr(*, "label")= chr "NUMBER OF URGENT VISITS"
## ..- attr(*, "format.sas")= chr "URG_TIME"
## $ HOSP.VST : num 6 2 2 5 2 2 2 5 2 6 ...
## ..- attr(*, "label")= chr "HOSPITAL VISIT"
## ..- attr(*, "format.sas")= chr "HOSP_VST"
## $ HOSPTIME : num 666 455 455 555 455 455 455 555 455 666 ...
## ..- attr(*, "label")= chr "NUMBER OF HOSPITAL VISITS"
## ..- attr(*, "format.sas")= chr "HOSPTIME"
## $ ASRXCOST : num 2 2 2 5 2 2 2 5 1 2 ...
## ..- attr(*, "label")= chr "COST BARRIER: MEDICATION"
## ..- attr(*, "format.sas")= chr "ASRXCOST"
## $ WORKTALK : num 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "label")= chr "DOCTOR DISCUSSED WORK ASTHMA"
## ..- attr(*, "format.sas")= chr "WORKTALK"
Here were are going to drop missing data because they are only 12 over 13,922 rows. We also transform all predictors to categorical.
asth.mgt.ad.min33 <- drop_na(asth.mgt.ad.min31)
asth.mgt.ad.min35 <- data.frame(apply(asth.mgt.ad.min33, 2, as.factor))
summary(asth.mgt.ad.min35)
## TARGET SEX AGEG.F7 X_RACEGR3 EDUCAL X_INCOMG X_RFBMI5 SMOKE100
## 0:7085 1:3686 1: 622 1:8935 1: 14 1:1583 1:2917 1:5401
## 1:4379 2:7778 2: 996 2: 636 2: 255 2:1872 2:8027 2:6017
## 3:1217 3: 420 3: 573 3:1042 9: 520 7: 43
## 4:1869 4: 384 4:2702 4:1394 9: 3
## 5:2924 5: 964 5:3514 5:4412
## 6:2555 9: 125 6:4392 9:1161
## 7:1281 9: 14
## COPD EMPHY DEPRESS BRONCH SYMP.30D DUR.30D
## 1:2311 1: 973 1:4428 1:3198 66 :3094 12 :4181
## 2:9038 2:10425 2:7000 2:8159 30 :1989 6 :3094
## 7: 106 7: 61 7: 18 7: 100 100 :1425 10 :1425
## 9: 9 9: 5 9: 18 9: 7 88 : 613 1 :1149
## 2 : 468 2 : 804
## 3 : 454 11 : 613
## (Other):3421 (Other): 198
## ASLEEP30 SYMPFREE INCINDT LAST.MD LAST.MED
## 88 :3306 14 :5683 1: 259 4 :7077 1 :4479
## 66 :3094 88 :2378 2: 864 5 :1563 7 :2027
## 100 :1425 10 : 516 3:10307 6 : 673 3 :1388
## 111 : 613 7 : 413 7: 28 7 :2014 4 :1111
## 30 : 532 12 : 381 9: 6 77: 76 2 : 950
## 2 : 342 5 : 379 88: 55 5 : 931
## (Other):2152 (Other):1714 99: 6 (Other): 578
## LAST.SYMP EPIS.12M DUR.ASTH COMPASTH INS1 ER.TIMES
## 1 :3161 1:4724 1111 :3646 11 :3646 1:10833 677 :5896
## 3 :2205 2:3565 666 :3094 6 :3094 2: 611 666 :2546
## 7 :1669 6:3094 201 : 379 3 :2915 7: 14 555 :1772
## 4 :1425 7: 77 105 : 377 1 :1060 9: 6 1 : 633
## 2 :1414 9: 4 130 : 350 2 : 703 2 : 298
## 5 : 924 202 : 345 7 : 33 3 : 133
## (Other): 666 (Other):3273 (Other): 13 (Other): 186
## ER.VISIT URG.TIMES HOSP.VST HOSPTIME ASRXCOST WORKTALK
## 1:1229 888 :4845 1: 345 455 :5986 1:1393 1:2305
## 2:5896 666 :2546 2:5986 666 :2546 2:8279 2:8841
## 5:1772 555 :1772 4: 808 555 :1772 5:1772 6: 219
## 6:2546 1 : 990 5:1772 444 : 808 7: 13 7: 76
## 7: 20 2 : 575 6:2546 1 : 185 9: 7 8: 15
## 9: 1 3 : 263 7: 7 2 : 66 9: 8
## (Other): 473 (Other): 101
library(dplyr)
egt <- group_by(asth.mgt.ad.min35, EDUCAL, TARGET) %>% summarise(count=n()) %>%
group_by(EDUCAL) %>% mutate(etotal=sum(count), proportion=count/etotal)
## `summarise()` regrouping output by 'EDUCAL' (override with `.groups` argument)
ggplot(egt, aes(x=EDUCAL, y=proportion, group=TARGET, linetype=TARGET))+geom_line()
egt <- summarize(group_by(asth.mgt.ad.min35, DUR.ASTH, TARGET), count = n())
## `summarise()` regrouping output by 'DUR.ASTH' (override with `.groups` argument)
egt <- mutate(egt, etotal =sum(count), proportion= count/etotal)
ggplot(data=egt, aes(x=DUR.ASTH, y=proportion, group=TARGET, linetype=TARGET))+geom_line()
library(caret)
set.seed(25)
inTraining <- createDataPartition(asth.mgt.ad.min35$TARGET, p = .80, list = FALSE)
training1 <- asth.mgt.ad.min35[ inTraining,]
testing1 <- asth.mgt.ad.min35[-inTraining,]
glm.all <- glm(TARGET~., data=training1, family=binomial)
glm.all
##
## Call: glm(formula = TARGET ~ ., family = binomial, data = training1)
##
## Coefficients:
## (Intercept) SEX2 AGEG.F72 AGEG.F73 AGEG.F74
## -0.181851 0.264127 0.234222 0.108609 0.063376
## AGEG.F75 AGEG.F76 AGEG.F77 X_RACEGR32 X_RACEGR33
## -0.152065 -0.285352 -0.404544 0.429904 -0.102289
## X_RACEGR34 X_RACEGR35 X_RACEGR39 EDUCAL2 EDUCAL3
## 0.141641 0.215193 -0.109681 -0.347330 -0.153723
## EDUCAL4 EDUCAL5 EDUCAL6 EDUCAL9 X_INCOMG2
## -0.067706 0.026137 -0.020716 2.757223 0.018806
## X_INCOMG3 X_INCOMG4 X_INCOMG5 X_INCOMG9 X_RFBMI52
## -0.042773 0.008287 0.093898 -0.088480 -0.009203
## X_RFBMI59 SMOKE1002 SMOKE1007 SMOKE1009 COPD2
## -0.018798 0.103907 0.278189 16.178356 -0.083654
## COPD7 COPD9 EMPHY2 EMPHY7 EMPHY9
## -0.574820 0.136230 0.066485 -0.209436 -15.561838
## DEPRESS2 DEPRESS7 DEPRESS9 BRONCH2 BRONCH7
## -0.013758 0.276575 0.630537 -0.118858 -0.424399
## BRONCH9 SYMP.30D10 SYMP.30D100 SYMP.30D11 SYMP.30D12
## 15.272502 -0.187808 -0.175885 -2.250695 -0.320924
## SYMP.30D13 SYMP.30D14 SYMP.30D15 SYMP.30D16 SYMP.30D17
## 0.352782 -0.302786 -0.179834 -0.622459 2.018186
## SYMP.30D18 SYMP.30D19 SYMP.30D2 SYMP.30D20 SYMP.30D21
## -0.396191 15.612673 -0.003330 -0.219732 -1.125363
## SYMP.30D22 SYMP.30D23 SYMP.30D24 SYMP.30D25 SYMP.30D26
## -0.876827 -1.475145 -1.990922 -0.328536 -15.260601
## SYMP.30D27 SYMP.30D28 SYMP.30D29 SYMP.30D3 SYMP.30D30
## -1.738791 -0.508401 0.562701 -0.050136 -0.174683
## SYMP.30D4 SYMP.30D5 SYMP.30D6 SYMP.30D66 SYMP.30D7
## -0.258579 -0.167390 -0.356342 -0.646848 -0.388629
## SYMP.30D77 SYMP.30D8 SYMP.30D88 SYMP.30D9 SYMP.30D99
## -0.553411 -0.454873 -0.197766 -0.574277 -0.226603
## DUR.30D10 DUR.30D11 DUR.30D12 DUR.30D2 DUR.30D6
## NA NA NA -0.174010 NA
## DUR.30D7 DUR.30D77 DUR.30D9 DUR.30D99 ASLEEP3010
## -0.908898 NA -14.561426 NA 0.247724
## ASLEEP30100 ASLEEP3011 ASLEEP30111 ASLEEP3012 ASLEEP3013
## NA 0.265941 NA 0.230518 -0.728100
## ASLEEP3014 ASLEEP3015 ASLEEP3016 ASLEEP3017 ASLEEP3018
## -0.473326 0.255851 1.274436 -15.981704 0.254817
## ASLEEP3019 ASLEEP302 ASLEEP3020 ASLEEP3021 ASLEEP3022
## -14.649763 0.128439 -0.077222 0.526733 -14.262420
## ASLEEP3025 ASLEEP3026 ASLEEP3027 ASLEEP3028 ASLEEP3029
## 0.488759 14.788058 -0.366785 -1.128305 -0.127257
## ASLEEP303 ASLEEP3030 ASLEEP304 ASLEEP305 ASLEEP306
## -0.071442 0.033518 -0.142830 -0.128622 -0.472891
## ASLEEP3066 ASLEEP307 ASLEEP3077 ASLEEP308 ASLEEP3088
## NA 0.472901 -0.071217 1.167993 -0.166977
## ASLEEP309 ASLEEP3099 SYMPFREE10 SYMPFREE11 SYMPFREE12
## 0.293889 0.674442 0.212565 0.156256 0.024225
## SYMPFREE13 SYMPFREE14 SYMPFREE2 SYMPFREE3 SYMPFREE4
## 0.135587 0.162379 0.090015 -0.156981 -0.207404
## SYMPFREE5 SYMPFREE6 SYMPFREE7 SYMPFREE77 SYMPFREE8
## 0.158441 0.085357 0.144401 0.070074 -0.128409
## SYMPFREE88 SYMPFREE9 SYMPFREE99 INCINDT2 INCINDT3
## -0.193457 -0.002361 -0.019917 0.048511 0.852294
## INCINDT7 INCINDT9 LAST.MD5 LAST.MD6 LAST.MD7
## 0.413457 0.058609 0.154191 -0.280160 -0.509185
## LAST.MD77 LAST.MD88 LAST.MD99 LAST.MED2 LAST.MED3
## -1.010175 -0.422158 -0.086899 -0.352555 -0.449883
## LAST.MED4 LAST.MED5 LAST.MED6 LAST.MED7 LAST.MED77
## -0.541316 -0.535523 -0.196715 -0.380953 -0.216937
## LAST.MED99 LAST.SYMP2 LAST.SYMP3 LAST.SYMP4 LAST.SYMP5
## -14.934184 0.026462 0.004513 NA 0.530932
## LAST.SYMP6 LAST.SYMP7 LAST.SYMP77 LAST.SYMP88 LAST.SYMP99
## 0.533765 0.484223 -0.259288 NA 0.999034
## EPIS.12M2 EPIS.12M6 EPIS.12M7 EPIS.12M9 DUR.ASTH102
## 0.146460 NA -0.136185 16.662584 -0.028399
## DUR.ASTH103 DUR.ASTH104 DUR.ASTH105 DUR.ASTH106 DUR.ASTH107
## -0.201662 0.137783 0.138460 -15.224557 0.204079
## DUR.ASTH108 DUR.ASTH110 DUR.ASTH111 DUR.ASTH1111 DUR.ASTH112
## -0.432433 0.061958 -14.759587 NA -15.581279
## DUR.ASTH113 DUR.ASTH115 DUR.ASTH117 DUR.ASTH120 DUR.ASTH125
## 0.066307 0.339253 16.595013 0.184678 -0.509594
## DUR.ASTH127 DUR.ASTH130 DUR.ASTH135 DUR.ASTH140 DUR.ASTH145
## 29.483620 0.401918 -0.239410 0.626707 0.757596
## DUR.ASTH150 DUR.ASTH152 DUR.ASTH160 DUR.ASTH180 DUR.ASTH190
## 0.028602 -15.102748 0.864854 16.962027 0.440111
## DUR.ASTH201 DUR.ASTH202 DUR.ASTH203 DUR.ASTH204 DUR.ASTH205
## 0.302879 0.594290 0.951425 -0.069675 -0.100363
## DUR.ASTH206 DUR.ASTH207 DUR.ASTH208 DUR.ASTH209 DUR.ASTH210
## 0.854545 0.986888 0.040403 13.785976 0.766371
## DUR.ASTH212 DUR.ASTH213 DUR.ASTH215 DUR.ASTH216 DUR.ASTH218
## 0.438552 -16.062366 -15.113075 0.979394 -14.946911
## DUR.ASTH220 DUR.ASTH224 DUR.ASTH230 DUR.ASTH236 DUR.ASTH248
## 16.503949 0.734224 0.951416 1.109690 -0.033846
## DUR.ASTH259 DUR.ASTH301 DUR.ASTH302 DUR.ASTH303 DUR.ASTH304
## -15.639218 0.280429 0.411167 0.435009 0.574860
## DUR.ASTH305 DUR.ASTH306 DUR.ASTH307 DUR.ASTH308 DUR.ASTH309
## 0.758204 0.416532 1.408647 -0.020734 -0.659452
## DUR.ASTH310 DUR.ASTH311 DUR.ASTH312 DUR.ASTH314 DUR.ASTH315
## 0.206549 -12.878657 -0.585569 0.209144 -0.851930
## DUR.ASTH320 DUR.ASTH321 DUR.ASTH324 DUR.ASTH330 DUR.ASTH390
## 16.715346 1.987906 -14.461668 2.170410 15.717743
## DUR.ASTH401 DUR.ASTH402 DUR.ASTH403 DUR.ASTH404 DUR.ASTH405
## 0.048625 0.171832 0.418509 0.928861 -0.313108
## DUR.ASTH406 DUR.ASTH407 DUR.ASTH408 DUR.ASTH409 DUR.ASTH410
## 1.120048 0.981731 -0.204968 -13.401208 15.703163
## DUR.ASTH412 DUR.ASTH414 DUR.ASTH420 DUR.ASTH424 DUR.ASTH430
## 0.252328 15.967369 -14.593660 14.653323 -14.382398
## DUR.ASTH452 DUR.ASTH555 DUR.ASTH666 DUR.ASTH777 DUR.ASTH999
## -14.868939 -0.754637 NA 0.429349 -0.770031
## COMPASTH11 COMPASTH2 COMPASTH3 COMPASTH4 COMPASTH6
## NA -0.275394 -0.042198 -0.533271 NA
## COMPASTH7 COMPASTH9 INS12 INS17 INS19
## -0.933036 16.419823 -0.041935 -0.558905 0.227420
## ER.TIMES10 ER.TIMES12 ER.TIMES13 ER.TIMES14 ER.TIMES2
## -15.462531 -15.667974 16.351336 -15.427728 0.251148
## ER.TIMES20 ER.TIMES23 ER.TIMES25 ER.TIMES3 ER.TIMES30
## 14.882582 -15.611541 NA -0.051359 29.667504
## ER.TIMES32 ER.TIMES4 ER.TIMES5 ER.TIMES555 ER.TIMES6
## 14.476255 0.047693 1.091430 -0.349869 -0.617654
## ER.TIMES666 ER.TIMES677 ER.TIMES7 ER.TIMES777 ER.TIMES8
## -0.667965 0.016504 -0.212898 -0.230671 1.147756
## ER.TIMES9 ER.TIMES999 ER.VISIT2 ER.VISIT5 ER.VISIT6
## 14.885856 -15.334054 NA NA NA
## ER.VISIT7 ER.VISIT9 URG.TIMES10 URG.TIMES12 URG.TIMES13
## 0.004306 NA -0.485703 0.453965 -0.500979
## URG.TIMES14 URG.TIMES15 URG.TIMES18 URG.TIMES2 URG.TIMES20
## -0.587495 15.602243 -0.049394 -0.041123 14.093685
## URG.TIMES24 URG.TIMES28 URG.TIMES3 URG.TIMES30 URG.TIMES4
## -14.510432 -15.389699 0.140851 -14.624279 0.217759
## URG.TIMES45 URG.TIMES5 URG.TIMES52 URG.TIMES555 URG.TIMES6
## 15.574777 0.151851 15.501242 NA 0.661062
## URG.TIMES60 URG.TIMES666 URG.TIMES7 URG.TIMES77 URG.TIMES777
## 16.177205 NA 0.656048 -14.441160 -0.304537
## URG.TIMES8 URG.TIMES88 URG.TIMES888 URG.TIMES9 URG.TIMES999
## 0.296016 -0.211716 -0.037603 15.794009 -1.161489
## HOSP.VST2 HOSP.VST4 HOSP.VST5 HOSP.VST6 HOSP.VST7
## -0.383499 -0.208026 NA NA 13.365065
## HOSPTIME12 HOSPTIME13 HOSPTIME17 HOSPTIME2 HOSPTIME3
## NA 15.709833 NA -0.091844 0.542429
## HOSPTIME4 HOSPTIME444 HOSPTIME455 HOSPTIME5 HOSPTIME555
## 0.589363 NA NA -0.519547 NA
## HOSPTIME6 HOSPTIME666 HOSPTIME7 HOSPTIME777 HOSPTIME8
## 0.670323 NA -0.013440 2.385353 -0.152631
## HOSPTIME9 ASRXCOST2 ASRXCOST5 ASRXCOST7 ASRXCOST9
## 15.230481 0.081513 NA -0.295730 -15.418198
## WORKTALK2 WORKTALK6 WORKTALK7 WORKTALK8 WORKTALK9
## -0.570482 -0.613781 -0.294629 0.011138 -0.792576
##
## Degrees of Freedom: 9171 Total (i.e. Null); 8859 Residual
## Null Deviance: 12200
## Residual Deviance: 11130 AIC: 11760
# glm.pred <- predict(glm.all, newdata = testing1[,-1], type = "response")
# predicted <- as.factor(ifelse(glm.pred>.5,1,0))
# glm.cm <- confusionMatrix(data = predicted, testing1$TARGET, positive = '1')
# glm.cm
glm.mod11 <- step(glm.all, trace = 0)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.mod11
##
## Call: glm(formula = TARGET ~ SEX + AGEG.F7 + X_RACEGR3 + EDUCAL + SMOKE100 +
## BRONCH + DUR.30D + INCINDT + LAST.MD + LAST.MED + EPIS.12M +
## HOSP.VST + WORKTALK, family = binomial, data = training1)
##
## Coefficients:
## (Intercept) SEX2 AGEG.F72 AGEG.F73 AGEG.F74 AGEG.F75
## -0.02448 0.24748 0.25090 0.16985 0.13176 -0.10723
## AGEG.F76 AGEG.F77 X_RACEGR32 X_RACEGR33 X_RACEGR34 X_RACEGR35
## -0.23935 -0.39914 0.44299 -0.08015 0.15982 0.20431
## X_RACEGR39 EDUCAL2 EDUCAL3 EDUCAL4 EDUCAL5 EDUCAL6
## -0.10953 -0.31863 -0.10127 -0.03155 0.06364 0.05913
## EDUCAL9 SMOKE1002 SMOKE1007 SMOKE1009 BRONCH2 BRONCH7
## 2.71201 0.11278 0.32951 13.92550 -0.13120 -0.42682
## BRONCH9 DUR.30D10 DUR.30D11 DUR.30D12 DUR.30D2 DUR.30D6
## 0.81345 0.29620 0.24865 0.12710 -0.21461 0.08613
## DUR.30D7 DUR.30D77 DUR.30D9 DUR.30D99 INCINDT2 INCINDT3
## -0.86167 -0.33618 -12.68953 0.08905 0.03673 0.85489
## INCINDT7 INCINDT9 LAST.MD5 LAST.MD6 LAST.MD7 LAST.MD77
## 0.25794 0.09236 0.11175 -0.30344 -0.54278 -1.10727
## LAST.MD88 LAST.MD99 LAST.MED2 LAST.MED3 LAST.MED4 LAST.MED5
## -0.48450 -0.09719 -0.35535 -0.39862 -0.52269 -0.52902
## LAST.MED6 LAST.MED7 LAST.MED77 LAST.MED99 EPIS.12M2 EPIS.12M6
## -0.19137 -0.39857 -0.30796 -12.88413 -0.14101 NA
## EPIS.12M7 EPIS.12M9 HOSP.VST2 HOSP.VST4 HOSP.VST5 HOSP.VST6
## -0.35314 14.29553 -0.62950 -0.46539 -0.62826 -0.86303
## HOSP.VST7 WORKTALK2 WORKTALK6 WORKTALK7 WORKTALK8 WORKTALK9
## 13.51931 -0.56114 -0.62826 -0.43398 -0.21206 -0.86451
##
## Degrees of Freedom: 9171 Total (i.e. Null); 9107 Residual
## Null Deviance: 12200
## Residual Deviance: 11450 AIC: 11580
Call: glm(formula = TARGET ~ SEX + AGEG.F7 + X_RACEGR3 + EDUCAL + BRONCH + DUR.30D + INCINDT + LAST.MD + LAST.MED + LAST.SYMP + COMPASTH + HOSPTIME + ASRXCOST + WORKTALK, family = binomial, data = training1)
# glm.mod11 <- glm(formula = TARGET ~ SEX + AGEG.F7 + X_RACEGR3 + EDUCAL + BRONCH +
# DUR.30D + INCINDT + LAST.MD + LAST.MED + LAST.SYMP + COMPASTH +HOSPPLAN+
# + ASRXCOST + WORKTALK, family = binomial, data = training1)
glm11.pred <- predict(glm.mod11, newdata = testing1[,-1], type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predicted <- as.factor(ifelse(glm11.pred>.5,1,0))
glm11.cm <- confusionMatrix(data = predicted, testing1$TARGET, positive = '1')
glm11.cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1216 621
## 1 201 254
##
## Accuracy : 0.6414
## 95% CI : (0.6213, 0.661)
## No Information Rate : 0.6182
## P-Value [Acc > NIR] : 0.01177
##
## Kappa : 0.1634
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.2903
## Specificity : 0.8582
## Pos Pred Value : 0.5582
## Neg Pred Value : 0.6619
## Prevalence : 0.3818
## Detection Rate : 0.1108
## Detection Prevalence : 0.1985
## Balanced Accuracy : 0.5742
##
## 'Positive' Class : 1
##
glm.mod12 <- glm(formula = TARGET ~ SEX + AGEG.F7 + X_RACEGR3 + EDUCAL + X_INCOMG +
BRONCH + DUR.30D + INCINDT + LAST.MD + LAST.MED + LAST.SYMP +
COMPASTH + WORKTALK, family = binomial, data = training1)
glm.mod12
##
## Call: glm(formula = TARGET ~ SEX + AGEG.F7 + X_RACEGR3 + EDUCAL + X_INCOMG +
## BRONCH + DUR.30D + INCINDT + LAST.MD + LAST.MED + LAST.SYMP +
## COMPASTH + WORKTALK, family = binomial, data = training1)
##
## Coefficients:
## (Intercept) SEX2 AGEG.F72 AGEG.F73 AGEG.F74 AGEG.F75
## -0.296861 0.274233 0.213055 0.109213 0.069105 -0.162227
## AGEG.F76 AGEG.F77 X_RACEGR32 X_RACEGR33 X_RACEGR34 X_RACEGR35
## -0.286174 -0.433522 0.463234 -0.082731 0.169006 0.252479
## X_RACEGR39 EDUCAL2 EDUCAL3 EDUCAL4 EDUCAL5 EDUCAL6
## -0.076406 -0.424699 -0.209247 -0.129815 -0.047915 -0.061467
## EDUCAL9 X_INCOMG2 X_INCOMG3 X_INCOMG4 X_INCOMG5 X_INCOMG9
## 2.571087 -0.003592 -0.070456 -0.017138 0.077514 -0.117321
## BRONCH2 BRONCH7 BRONCH9 DUR.30D10 DUR.30D11 DUR.30D12
## -0.153546 -0.418623 0.763387 0.299654 0.170922 0.056587
## DUR.30D2 DUR.30D6 DUR.30D7 DUR.30D77 DUR.30D9 DUR.30D99
## -0.241000 -0.310584 -0.870667 -0.383731 -11.811407 0.083653
## INCINDT2 INCINDT3 INCINDT7 INCINDT9 LAST.MD5 LAST.MD6
## 0.013815 0.805441 0.349549 -0.089799 -0.153248 -0.576709
## LAST.MD7 LAST.MD77 LAST.MD88 LAST.MD99 LAST.MED2 LAST.MED3
## -0.781759 -1.136623 -0.795709 -0.031350 -0.365605 -0.458308
## LAST.MED4 LAST.MED5 LAST.MED6 LAST.MED7 LAST.MED77 LAST.MED99
## -0.565573 -0.558899 -0.202688 -0.373341 -0.298783 -11.905740
## LAST.SYMP2 LAST.SYMP3 LAST.SYMP4 LAST.SYMP5 LAST.SYMP6 LAST.SYMP7
## 0.041498 0.111992 NA 0.483963 0.499962 0.481258
## LAST.SYMP77 LAST.SYMP88 LAST.SYMP99 COMPASTH11 COMPASTH2 COMPASTH3
## -0.138243 NA 0.904544 -0.202580 -0.123520 -0.021035
## COMPASTH4 COMPASTH6 COMPASTH7 COMPASTH9 WORKTALK2 WORKTALK6
## -0.652164 NA -0.754911 12.360655 -0.564453 -0.590771
## WORKTALK7 WORKTALK8 WORKTALK9
## -0.413979 -0.084454 -0.828417
##
## Degrees of Freedom: 9171 Total (i.e. Null); 9100 Residual
## Null Deviance: 12200
## Residual Deviance: 11480 AIC: 11620
glm12.pred <- predict(glm.mod12, newdata = testing1[,-1], type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predicted <- as.factor(ifelse(glm12.pred>.5,1,0))
glm12.cm <- confusionMatrix(data = predicted, testing1$TARGET, positive = '1')
glm12.cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1216 609
## 1 201 266
##
## Accuracy : 0.6466
## 95% CI : (0.6266, 0.6662)
## No Information Rate : 0.6182
## P-Value [Acc > NIR] : 0.002673
##
## Kappa : 0.178
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3040
## Specificity : 0.8582
## Pos Pred Value : 0.5696
## Neg Pred Value : 0.6663
## Prevalence : 0.3818
## Detection Rate : 0.1161
## Detection Prevalence : 0.2038
## Balanced Accuracy : 0.5811
##
## 'Positive' Class : 1
##
Since our dataset has multiple variable, we can use penalized logistic regression to find an optimal performing model. Ridge Regression and Lasso Regression have two different approaches. Ridge Regression incorporates all variables in the model and gives the coefficients of variables with minor contribution close to zero Lasso Regression keeps only the most significant variables and gives zero to the coefficient of the rest of variables.
set.seed(25)
inTraining <- createDataPartition(asth.mgt.ad.min33$TARGET, p = .80, list = FALSE)
training2 <- asth.mgt.ad.min33[ inTraining,]
testing2 <- asth.mgt.ad.min33[-inTraining,]
x <- model.matrix(TARGET ~., data = training2)
y = training2$TARGET
xt <- model.matrix(TARGET ~., data = testing2)
yt <- as.factor(testing2$TARGET)
We fit and obsrve the coefficients of rigde regression against the log of lambda.
fit.ridge <- glmnet(x = x,y=y, alpha=0, family="binomial")
plot(fit.ridge, xvar= "lambda", label=TRUE)
The coefficients are significative for negative log lambda and start stabilize around -4
cv.ridge <- cv.glmnet(x = x, y = y, alpha=0)
plot(cv.ridge)
The plot shows that the log of the optimal value of lambda (i.e. the one that minimises the root mean square error) is approximately -3. The exact value can be viewed by examining the variable lambda_min in the code below. In general though, the objective of regularisation is to balance accuracy and simplicity. In the present context, this means a model with the smallest number of coefficients that also gives a good accuracy. To this end, the cv.glmnet function finds the value of lambda that gives the simplest model but also lies within one standard error of the optimal value of lambda.
cv.ridge$lambda.min
## [1] 0.03528077
ridge.model1 <- glmnet(x = x,y=y, lambda = cv.ridge$lambda.min, alpha=0, family="binomial")
ridge.pred1 <- predict(ridge.model1, newx = xt)
predicted <- rep(0, length(yt))
predicted[ridge.pred1>0.5] <- "1"
ridge.cm1 <- confusionMatrix(data = as.factor(predicted), yt, positive = '1')
ridge.cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1402 873
## 1 8 9
##
## Accuracy : 0.6156
## 95% CI : (0.5953, 0.6356)
## No Information Rate : 0.6152
## P-Value [Acc > NIR] : 0.4921
##
## Kappa : 0.0055
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010204
## Specificity : 0.994326
## Pos Pred Value : 0.529412
## Neg Pred Value : 0.616264
## Prevalence : 0.384817
## Detection Rate : 0.003927
## Detection Prevalence : 0.007417
## Balanced Accuracy : 0.502265
##
## 'Positive' Class : 1
##
We observe overfitting with this ridge model
ridge.model2 <- glmnet(x = x,y=y, lambda = cv.ridge$lambda.1se, alpha=0, family="binomial")
ridge.pred2 <- predict(ridge.model2, newx = xt)
predicted <- rep(0, length(yt))
predicted[ridge.pred2>0.5] <- "1"
ridge.cm2 <- confusionMatrix(data = as.factor(predicted), yt, positive = '1')
ridge.cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1410 881
## 1 0 1
##
## Accuracy : 0.6156
## 95% CI : (0.5953, 0.6356)
## No Information Rate : 0.6152
## P-Value [Acc > NIR] : 0.4921
##
## Kappa : 0.0014
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0011338
## Specificity : 1.0000000
## Pos Pred Value : 1.0000000
## Neg Pred Value : 0.6154518
## Prevalence : 0.3848168
## Detection Rate : 0.0004363
## Detection Prevalence : 0.0004363
## Balanced Accuracy : 0.5005669
##
## 'Positive' Class : 1
##
We observe overfitting with this second ridge model
coef(ridge.model1)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.324525e-02
## (Intercept) .
## SEX 2.520737e-01
## AGEG.F7 -7.108246e-02
## X_RACEGR3 2.720204e-02
## EDUCAL 6.846601e-02
## X_INCOMG -1.660746e-02
## X_RFBMI5 -9.439599e-03
## SMOKE100 6.536126e-02
## COPD -4.966950e-02
## EMPHY -3.385579e-02
## DEPRESS 3.800329e-02
## BRONCH -8.075578e-02
## SYMP.30D 5.043238e-04
## DUR.30D -2.048291e-03
## ASLEEP30 -1.438621e-03
## SYMPFREE -3.209097e-03
## INCINDT 3.136646e-01
## LAST.MD -6.780652e-03
## LAST.MED -1.023217e-02
## LAST.SYMP -4.855822e-03
## EPIS.12M -8.050949e-03
## DUR.ASTH -2.502109e-05
## COMPASTH -1.534186e-02
## INS1 -2.410712e-02
## ER.TIMES -5.698173e-05
## ER.VISIT -4.874670e-02
## URG.TIMES -1.115901e-04
## HOSP.VST -8.757719e-03
## HOSPTIME -8.930293e-04
## ASRXCOST -6.361566e-02
## WORKTALK -1.666078e-01
fit.lasso <- glmnet(x =x, y = y, alpha = 1, family = "binomial")
plot(fit.lasso, xvar = "dev", label = TRUE)
fit.lasso <- glmnet(x,y)
plot(fit.lasso, xvar = "lambda", label = TRUE)
plot(fit.lasso, xvar = "dev", label = TRUE)
cv.lasso <- cv.glmnet(x,y)
plot(cv.lasso)
The plot shows that the log of the optimal value of lambda (i.e. the one that minimises the root mean square error) is approximately -10. The exact value can be viewed by examining the variable lambda_min in the code below. In general though, the objective of regularisation is to balance accuracy and simplicity. In the present context, this means a model with the smallest number of coefficients that also gives a good accuracy. To this end, the cv.glmnet function finds the value of lambda that gives the simplest model but also lies within one standard error of the optimal value of lambda.
lasso.model1 <- glmnet(x =x, y = y, lambda = cv.lasso$lambda.min, alpha = 1, family = "binomial")
lasso.pred1 <- predict(lasso.model1, newx = xt, type = "response")
predicted <- as.factor(ifelse(lasso.pred1>.5,1,0))
lasso.cm1 <- confusionMatrix(data = predicted, yt, positive = '1')
lasso.cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1257 700
## 1 153 182
##
## Accuracy : 0.6278
## 95% CI : (0.6077, 0.6477)
## No Information Rate : 0.6152
## P-Value [Acc > NIR] : 0.1104
##
## Kappa : 0.1107
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.20635
## Specificity : 0.89149
## Pos Pred Value : 0.54328
## Neg Pred Value : 0.64231
## Prevalence : 0.38482
## Detection Rate : 0.07941
## Detection Prevalence : 0.14616
## Balanced Accuracy : 0.54892
##
## 'Positive' Class : 1
##
coef(lasso.model1)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -5.179373e-02
## (Intercept) .
## SEX 2.979078e-01
## AGEG.F7 -8.759172e-02
## X_RACEGR3 3.082027e-02
## EDUCAL 7.741617e-02
## X_INCOMG -1.969625e-02
## X_RFBMI5 -1.189649e-02
## SMOKE100 7.113936e-02
## COPD -5.932798e-02
## EMPHY -3.853782e-02
## DEPRESS 5.382313e-02
## BRONCH -9.581112e-02
## SYMP.30D 6.902831e-04
## DUR.30D -2.579247e-03
## ASLEEP30 -1.762597e-03
## SYMPFREE -3.767817e-03
## INCINDT 3.753925e-01
## LAST.MD -7.637464e-03
## LAST.MED -1.255915e-02
## LAST.SYMP -5.781372e-03
## EPIS.12M -3.302387e-02
## DUR.ASTH 1.679400e-04
## COMPASTH -3.207206e-02
## INS1 -2.106734e-02
## ER.TIMES 9.594068e-05
## ER.VISIT -1.288389e-01
## URG.TIMES -1.301261e-04
## HOSP.VST 1.001025e-01
## HOSPTIME -1.355156e-03
## ASRXCOST -5.168730e-02
## WORKTALK -2.002581e-01
lasso.model2 <- glmnet(x =x, y = y, lambda = cv.lasso$lambda.1se, alpha = 1, family = "binomial")
lasso.pred2 <- predict(lasso.model2, newx = xt, type = "response")
predicted <- as.factor(ifelse(lasso.pred2>.5,1,0))
lasso.cm2 <- confusionMatrix(data = predicted, yt, positive = '1')
lasso.cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1356 808
## 1 54 74
##
## Accuracy : 0.6239
## 95% CI : (0.6037, 0.6438)
## No Information Rate : 0.6152
## P-Value [Acc > NIR] : 0.2014
##
## Kappa : 0.0543
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.08390
## Specificity : 0.96170
## Pos Pred Value : 0.57812
## Neg Pred Value : 0.62662
## Prevalence : 0.38482
## Detection Rate : 0.03229
## Detection Prevalence : 0.05585
## Balanced Accuracy : 0.52280
##
## 'Positive' Class : 1
##
AICc <- function(fit){
tLL <- fit$nulldev - deviance(fit)
k <- fit$df
n <- fit$nobs
AICc <- -tLL+2*k+2*k*(k+1)/(n-k-1)
return (AICc)
}
it <- glmnet(x, y, family = “multinomial”)
tLL <- fit\(nulldev - deviance(fit) k <- fit\)df n <- fit$nobs AICc <- -tLL+2k+2k*(k+1)/(n-k-1) AICc
AICc(ridge.model1)
## [1] -415.5483
AICc(lasso.model1)
## [1] -429.3727
ctrl <- trainControl(method = "repeatedcv", repeats = 3)
plsFit1 <- train(
TARGET ~ .,
data = training1,
method = "pls",
preProc = c("center", "scale"),
tuneLength = 15,
## added:
trControl = ctrl
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D19, ASLEEP3019, ASLEEP3024,
## DUR.ASTH116, DUR.ASTH213, DUR.ASTH215, DUR.ASTH317, DUR.ASTH324, DUR.ASTH414,
## ER.TIMES14, ER.TIMES25, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH180,
## DUR.ASTH209, DUR.ASTH317, ER.TIMES23, URG.TIMES16, URG.TIMES25, URG.TIMES28,
## URG.TIMES33, URG.TIMES9, HOSPTIME10, HOSPTIME9
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH111, DUR.ASTH116,
## DUR.ASTH317, DUR.ASTH424, DUR.ASTH430, ER.TIMES32, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10, HOSPTIME17, HOSPTIME8
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, EPIS.12M9, DUR.ASTH116,
## DUR.ASTH317, COMPASTH9, ER.TIMES999, ER.VISIT9, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SMOKE1009, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH259, DUR.ASTH317, URG.TIMES13, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH152,
## DUR.ASTH311, DUR.ASTH317, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH117,
## DUR.ASTH317, DUR.ASTH409, ER.TIMES30, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## URG.TIMES60, HOSPTIME10, HOSPTIME13
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D26, DUR.30D9, ASLEEP3024,
## DUR.ASTH116, DUR.ASTH218, DUR.ASTH317, DUR.ASTH390, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3017, ASLEEP3024, DUR.ASTH112,
## DUR.ASTH116, DUR.ASTH317, DUR.ASTH320, ER.TIMES12, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10, HOSPTIME12
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, URG.TIMES52, URG.TIMES77, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH215,
## DUR.ASTH317, DUR.ASTH409, ER.TIMES999, ER.VISIT9, URG.TIMES13, URG.TIMES16,
## URG.TIMES25, URG.TIMES33, URG.TIMES52, HOSPTIME10, HOSPTIME13
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D19, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH218, DUR.ASTH259, DUR.ASTH317, DUR.ASTH410, DUR.ASTH414, ER.TIMES14,
## ER.TIMES23, ER.TIMES25, ER.TIMES32, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10, HOSPTIME17, HOSPTIME8
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH127,
## DUR.ASTH317, URG.TIMES16, URG.TIMES25, URG.TIMES28, URG.TIMES33, URG.TIMES77,
## URG.TIMES9, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3017, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH180, DUR.ASTH317, DUR.ASTH424, URG.TIMES16, URG.TIMES18, URG.TIMES25,
## URG.TIMES33, URG.TIMES45, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: DUR.30D9, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH152, DUR.ASTH209, DUR.ASTH317, DUR.ASTH390, ER.TIMES30, URG.TIMES16,
## URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH111, DUR.ASTH116,
## DUR.ASTH213, DUR.ASTH311, DUR.ASTH317, DUR.ASTH324, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D26, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH317, DUR.ASTH430, ER.TIMES12, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10, HOSPTIME12
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH117,
## DUR.ASTH317, URG.TIMES16, URG.TIMES25, URG.TIMES33, URG.TIMES60, HOSPTIME10,
## HOSPTIME9
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## COMPASTH9, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, EPIS.12M9, DUR.ASTH112,
## DUR.ASTH116, DUR.ASTH317, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH111, DUR.ASTH116,
## DUR.ASTH117, DUR.ASTH311, DUR.ASTH317, URG.TIMES13, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: DUR.30D9, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH317, DUR.ASTH409, ER.TIMES12, ER.TIMES14, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10, HOSPTIME12
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D26, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH317, DUR.ASTH430, URG.TIMES16, URG.TIMES25, URG.TIMES28, URG.TIMES33,
## HOSPTIME10, HOSPTIME13
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D19, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH213, DUR.ASTH317, ER.TIMES25, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## URG.TIMES52, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, EPIS.12M9, DUR.ASTH116,
## DUR.ASTH209, DUR.ASTH317, DUR.ASTH390, COMPASTH9, ER.TIMES999, ER.VISIT9,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10, HOSPTIME9
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH215,
## DUR.ASTH317, URG.TIMES16, URG.TIMES25, URG.TIMES33, URG.TIMES60, URG.TIMES77,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3017, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH218, DUR.ASTH317, DUR.ASTH414, DUR.ASTH424, ER.TIMES23, ER.TIMES30,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH152,
## DUR.ASTH180, DUR.ASTH259, DUR.ASTH317, ER.TIMES32, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10, HOSPTIME17, HOSPTIME8
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, ASLEEP3027, DUR.ASTH112,
## DUR.ASTH116, DUR.ASTH317, DUR.ASTH324, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## URG.TIMES9, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
pls1.pred <- predict(plsFit1, newdata = testing1[,-1], type = "prob")
pls.pred1 <- predict(plsFit1, newdata = testing1[,-1], type = "raw")
pls.cm1 <- confusionMatrix(data = pls.pred1, testing1$TARGET, positive = '1')
pls.cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1209 622
## 1 208 253
##
## Accuracy : 0.6379
## 95% CI : (0.6178, 0.6576)
## No Information Rate : 0.6182
## P-Value [Acc > NIR] : 0.02755
##
## Kappa : 0.1565
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.2891
## Specificity : 0.8532
## Pos Pred Value : 0.5488
## Neg Pred Value : 0.6603
## Prevalence : 0.3818
## Detection Rate : 0.1104
## Detection Prevalence : 0.2011
## Balanced Accuracy : 0.5712
##
## 'Positive' Class : 1
##
training3 <- training1
testing3 <- testing1
training3$TARGET <- ifelse(training3$TARGET=="1","T","F")
testing3$TARGET <- ifelse(testing3$TARGET=="1","T","F")
ctrl <- trainControl(
method = "repeatedcv",
repeats = 3,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
set.seed(123)
plsFit2 <- train(
TARGET ~ .,
data = training3,
method = "pls",
preProc = c("center", "scale"),
tuneLength = 15,
trControl = ctrl,
metric = "ROC"
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH111, DUR.ASTH112,
## DUR.ASTH116, DUR.ASTH209, DUR.ASTH317, URG.TIMES13, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, URG.TIMES60, HOSPTIME10, HOSPTIME13
## Warning in fitFunc(X, Y, ncomp, Y.add = Y.add, center = center, ...): No convergence in 100 iterations
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH117,
## DUR.ASTH218, DUR.ASTH317, DUR.ASTH320, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## DUR.ASTH424, ER.TIMES30, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH259,
## DUR.ASTH311, DUR.ASTH317, DUR.ASTH414, ER.TIMES12, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, URG.TIMES9, HOSPTIME10, HOSPTIME12, HOSPTIME8
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH213,
## DUR.ASTH317, DUR.ASTH390, ER.TIMES23, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## ER.TIMES32, URG.TIMES16, URG.TIMES25, URG.TIMES28, URG.TIMES33, URG.TIMES52,
## HOSPTIME10, HOSPTIME17
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## DUR.ASTH410, ER.TIMES999, ER.VISIT9, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10, HOSPTIME9
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: DUR.30D9, ASLEEP3017, ASLEEP3024,
## EPIS.12M9, DUR.ASTH116, DUR.ASTH317, ER.TIMES14, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, URG.TIMES77, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH180,
## DUR.ASTH215, DUR.ASTH317, COMPASTH9, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D19, SYMP.30D26, ASLEEP3024,
## DUR.ASTH116, DUR.ASTH152, DUR.ASTH317, DUR.ASTH324, DUR.ASTH409, DUR.ASTH430,
## ER.TIMES25, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH180,
## DUR.ASTH317, URG.TIMES16, URG.TIMES25, URG.TIMES33, URG.TIMES60, HOSPTIME10,
## HOSPTIME9
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, EPIS.12M9, DUR.ASTH116,
## DUR.ASTH209, DUR.ASTH317, DUR.ASTH390, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## DUR.ASTH320, ER.TIMES30, ER.TIMES999, ER.VISIT9, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10, HOSPTIME13
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: DUR.30D9, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH311, DUR.ASTH317, DUR.ASTH409, COMPASTH9, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, URG.TIMES45, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH112, DUR.ASTH116,
## DUR.ASTH317, DUR.ASTH414, DUR.ASTH430, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## URG.TIMES9, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH218,
## DUR.ASTH259, DUR.ASTH317, ER.TIMES12, ER.TIMES23, ER.TIMES32, URG.TIMES16,
## URG.TIMES25, URG.TIMES28, URG.TIMES33, URG.TIMES77, HOSPTIME10, HOSPTIME12,
## HOSPTIME17, HOSPTIME8
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3017, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH117, DUR.ASTH317, DUR.ASTH424, URG.TIMES13, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH152,
## DUR.ASTH213, DUR.ASTH317, ER.TIMES14, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## URG.TIMES52, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D19, SYMP.30D26, ASLEEP3024,
## DUR.ASTH111, DUR.ASTH116, DUR.ASTH215, DUR.ASTH317, DUR.ASTH324, ER.TIMES25,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH218,
## DUR.ASTH317, URG.TIMES13, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3017, ASLEEP3024, DUR.ASTH111,
## DUR.ASTH116, DUR.ASTH117, DUR.ASTH317, ER.TIMES32, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, HOSPTIME10, HOSPTIME17
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D19, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH317, ER.TIMES12, ER.TIMES25, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10, HOSPTIME12, HOSPTIME13
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: DUR.30D9, ASLEEP3024, DUR.ASTH116,
## DUR.ASTH213, DUR.ASTH311, DUR.ASTH317, DUR.ASTH414, URG.TIMES16, URG.TIMES25,
## URG.TIMES33, URG.TIMES77, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## DUR.ASTH390, ER.TIMES999, ER.VISIT9, URG.TIMES16, URG.TIMES25, URG.TIMES28,
## URG.TIMES33, URG.TIMES9, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH152,
## DUR.ASTH317, COMPASTH9, URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10,
## HOSPTIME8
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: SYMP.30D26, ASLEEP3024, DUR.ASTH112,
## DUR.ASTH116, DUR.ASTH209, DUR.ASTH215, DUR.ASTH259, DUR.ASTH317, DUR.ASTH324,
## ER.TIMES23, ER.TIMES30, URG.TIMES16, URG.TIMES25, URG.TIMES30, URG.TIMES33,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, EPIS.12M9, DUR.ASTH116,
## DUR.ASTH317, DUR.ASTH409, DUR.ASTH430, URG.TIMES16, URG.TIMES25, URG.TIMES33,
## HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## DUR.ASTH424, ER.TIMES14, URG.TIMES16, URG.TIMES25, URG.TIMES33, URG.TIMES52,
## HOSPTIME10, HOSPTIME9
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH180,
## DUR.ASTH317, URG.TIMES16, URG.TIMES25, URG.TIMES33, URG.TIMES60, HOSPTIME10
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: ASLEEP3024, DUR.ASTH116, DUR.ASTH317,
## URG.TIMES16, URG.TIMES25, URG.TIMES33, HOSPTIME10
plsFit2
## Partial Least Squares
##
## 9172 samples
## 30 predictor
## 2 classes: 'F', 'T'
##
## Pre-processing: centered (351), scaled (351)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 8255, 8255, 8255, 8254, 8255, 8255, ...
## Resampling results across tuning parameters:
##
## ncomp ROC Sens Spec
## 1 0.6039485 0.9715378 0.07135043
## 2 0.6291503 0.8576829 0.27654565
## 3 0.6359999 0.8655053 0.26580057
## 4 0.6368760 0.8631514 0.26760901
## 5 0.6350089 0.8545066 0.28473423
## 6 0.6358661 0.8506253 0.28045421
## 7 0.6370014 0.8500373 0.29119929
## 8 0.6378465 0.8499791 0.29167494
## 9 0.6384263 0.8490389 0.29167114
## 10 0.6384409 0.8481566 0.29347931
## 11 0.6383158 0.8501564 0.29224149
## 12 0.6380956 0.8501565 0.29395523
## 13 0.6380557 0.8496853 0.29309809
## 14 0.6379165 0.8492152 0.29319333
## 15 0.6378900 0.8485680 0.29243196
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 10.
pls2.pred <- predict(plsFit2, newdata = testing3[,-1], type = "prob")
pls.pred2 <- predict(plsFit2, newdata = testing3[,-1], type = "raw")
pls.cm2 <- confusionMatrix(data = pls.pred1, testing1$TARGET, positive = '1')
pls.cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1209 622
## 1 208 253
##
## Accuracy : 0.6379
## 95% CI : (0.6178, 0.6576)
## No Information Rate : 0.6182
## P-Value [Acc > NIR] : 0.02755
##
## Kappa : 0.1565
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.2891
## Specificity : 0.8532
## Pos Pred Value : 0.5488
## Neg Pred Value : 0.6603
## Prevalence : 0.3818
## Detection Rate : 0.1104
## Detection Prevalence : 0.2011
## Balanced Accuracy : 0.5712
##
## 'Positive' Class : 1
##
cm.metric <- function(cm){
test = c(cm$overall["Accuracy"],
cm$byClass["Precision"],
cm$byClass["Sensitivity"],
cm$byClass["Specificity"],
cm$byClass["F1"])
return(test)
}
metrics.mod <- data.frame(#glm.mod = cm.metric(glm.cm),
glm.mod11 = cm.metric(glm11.cm),
glm.mod12 = cm.metric(glm12.cm),
ridge.mod1 = cm.metric(ridge.cm1),
ridge.mod2 = cm.metric(ridge.cm2),
lasso.mod1 = cm.metric(lasso.cm1),
lasso.mod2 = cm.metric(lasso.cm2),
pls.mod1 = cm.metric(pls.cm1),
pls.mod2 = cm.metric(pls.cm2))
metrics.mod
## glm.mod11 glm.mod12 ridge.mod1 ridge.mod2 lasso.mod1 lasso.mod2
## Accuracy 0.6413613 0.6465969 0.61561955 0.615619546 0.6278360 0.62390925
## Precision 0.5582418 0.5695931 0.52941176 1.000000000 0.5432836 0.57812500
## Sensitivity 0.2902857 0.3040000 0.01020408 0.001133787 0.2063492 0.08390023
## Specificity 0.8581510 0.8581510 0.99432624 1.000000000 0.8914894 0.96170213
## F1 0.3819549 0.3964232 0.02002225 0.002265006 0.2990961 0.14653465
## pls.mod1 pls.mod2
## Accuracy 0.6378709 0.6378709
## Precision 0.5488069 0.5488069
## Sensitivity 0.2891429 0.2891429
## Specificity 0.8532110 0.8532110
## F1 0.3787425 0.3787425
With precision and specificity equal to 1, the ridge.mod2 model is overfitting. But glm.mod12 has the best accuracy, precision, sensivity, and specificity.
We can plot the ROC curve and extract the AUC value.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(dplyr)
prediction <- data.frame(TARGET = testing1$TARGET,
glm1 = glm11.pred,
gml2 = glm12.pred,
pls1 = pls1.pred,
pls2 = pls2.pred,
rp1 = ridge.pred1,
rp2 = ridge.pred2,
lp1 = lasso.pred1,
lp2 = lasso.pred2)
prediction[1:10,]
## TARGET glm1 gml2 pls1.0 pls1.1 pls2.F pls2.T
## 9 0 0.42038227 0.40075572 0.5699343 0.4300657 0.5719089 0.4280911
## 16 0 0.18190010 0.16424556 0.6636239 0.3363761 0.6651945 0.3348055
## 19 1 0.23123657 0.26210733 0.5994985 0.4005015 0.6029853 0.3970147
## 23 0 0.36339360 0.35312033 0.5655764 0.4344236 0.5672724 0.4327276
## 24 1 0.56115514 0.44466340 0.3878735 0.6121265 0.3856000 0.6144000
## 26 1 0.08046416 0.08320078 0.6972119 0.3027881 0.6919859 0.3080141
## 41 1 0.55224416 0.56647201 0.5283444 0.4716556 0.5257482 0.4742518
## 48 0 0.27400911 0.28907858 0.5923933 0.4076067 0.5945010 0.4054990
## 57 0 0.52444012 0.52378026 0.4900860 0.5099140 0.4914093 0.5085907
## 61 0 0.56905667 0.59206901 0.5119637 0.4880363 0.5071891 0.4928109
## s0 s0.1 s0.2 s0.3
## 9 -0.54196169 -0.6010595 0.3713285 0.3595024
## 16 -0.80330024 -0.6855449 0.3003069 0.3283310
## 19 -1.14384238 -0.8200241 0.2231959 0.2689421
## 23 -0.82107169 -0.6885967 0.2947905 0.3158181
## 24 -1.03773414 -0.7151098 0.2472863 0.2746741
## 26 -1.43218074 -0.8463289 0.1699976 0.2432997
## 41 -0.03224888 -0.2289511 0.4944371 0.4372921
## 48 -0.45346431 -0.5159101 0.4165649 0.4021712
## 57 -0.71385682 -0.5736487 0.3137068 0.3408616
## 61 -0.77733510 -0.6947722 0.2939840 0.3200485
## With ggplot2 ##
library(ggplot2)
# Create multiple curves to plot
roc1 <- roc(TARGET ~., data = prediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
ggroc(roc1)
The glm model has the best Area Under the Curve.
best.model <- glm(formula = TARGET ~ SEX + AGEG.F7 + X_RACEGR3 + EDUCAL + X_INCOMG +
BRONCH + DUR.30D + INCINDT + LAST.MD + LAST.MED + LAST.SYMP +
COMPASTH + WORKTALK, family = binomial, data = asth.mgt.ad.min35)
summary(best.model)
##
## Call:
## glm(formula = TARGET ~ SEX + AGEG.F7 + X_RACEGR3 + EDUCAL + X_INCOMG +
## BRONCH + DUR.30D + INCINDT + LAST.MD + LAST.MED + LAST.SYMP +
## COMPASTH + WORKTALK, family = binomial, data = asth.mgt.ad.min35)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7729 -0.9740 -0.7404 1.2084 2.3441
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.466562 0.613700 -0.760 0.447109
## SEX2 0.276288 0.044987 6.141 8.18e-10 ***
## AGEG.F72 0.194106 0.111573 1.740 0.081907 .
## AGEG.F73 0.087674 0.108869 0.805 0.420638
## AGEG.F74 0.043179 0.103487 0.417 0.676502
## AGEG.F75 -0.190401 0.100283 -1.899 0.057614 .
## AGEG.F76 -0.313196 0.102316 -3.061 0.002206 **
## AGEG.F77 -0.453235 0.112523 -4.028 5.63e-05 ***
## X_RACEGR32 0.446755 0.087281 5.119 3.08e-07 ***
## X_RACEGR33 -0.018268 0.109114 -0.167 0.867035
## X_RACEGR34 0.119190 0.111057 1.073 0.283168
## X_RACEGR35 0.260119 0.075426 3.449 0.000563 ***
## X_RACEGR39 -0.026701 0.196333 -0.136 0.891824
## EDUCAL2 -0.312208 0.593888 -0.526 0.599096
## EDUCAL3 -0.092466 0.585184 -0.158 0.874448
## EDUCAL4 -0.006621 0.580499 -0.011 0.990900
## EDUCAL5 0.082535 0.580654 0.142 0.886969
## EDUCAL6 0.060055 0.581018 0.103 0.917676
## EDUCAL9 1.723178 0.846335 2.036 0.041745 *
## X_INCOMG2 0.037522 0.073904 0.508 0.611651
## X_INCOMG3 -0.042309 0.087665 -0.483 0.629367
## X_INCOMG4 0.006740 0.081718 0.082 0.934265
## X_INCOMG5 0.100688 0.069871 1.441 0.149572
## X_INCOMG9 -0.150883 0.086062 -1.753 0.079571 .
## BRONCH2 -0.185872 0.047802 -3.888 0.000101 ***
## BRONCH7 -0.328224 0.227379 -1.444 0.148876
## BRONCH9 0.407287 0.830823 0.490 0.623978
## DUR.30D10 0.277913 0.095582 2.908 0.003642 **
## DUR.30D11 -0.002470 0.124786 -0.020 0.984209
## DUR.30D12 0.010185 0.080167 0.127 0.898906
## DUR.30D2 -0.285110 0.100115 -2.848 0.004402 **
## DUR.30D6 -0.416742 0.357009 -1.167 0.243084
## DUR.30D7 -0.493707 0.423701 -1.165 0.243928
## DUR.30D77 -0.368780 0.203664 -1.811 0.070183 .
## DUR.30D9 -11.944014 229.404588 -0.052 0.958477
## DUR.30D99 0.009318 0.699072 0.013 0.989365
## INCINDT2 0.159646 0.169713 0.941 0.346867
## INCINDT3 0.918928 0.150998 6.086 1.16e-09 ***
## INCINDT7 0.410334 0.487609 0.842 0.400056
## INCINDT9 0.509994 0.911950 0.559 0.576002
## LAST.MD5 -0.152709 0.063268 -2.414 0.015792 *
## LAST.MD6 -0.499577 0.097967 -5.099 3.41e-07 ***
## LAST.MD7 -0.673545 0.084721 -7.950 1.86e-15 ***
## LAST.MD77 -0.890718 0.299501 -2.974 0.002939 **
## LAST.MD88 -0.871599 0.358578 -2.431 0.015069 *
## LAST.MD99 0.008230 0.953016 0.009 0.993109
## LAST.MED2 -0.356421 0.080579 -4.423 9.72e-06 ***
## LAST.MED3 -0.502834 0.072404 -6.945 3.79e-12 ***
## LAST.MED4 -0.577696 0.083699 -6.902 5.12e-12 ***
## LAST.MED5 -0.619110 0.094496 -6.552 5.69e-11 ***
## LAST.MED6 -0.290622 0.123177 -2.359 0.018305 *
## LAST.MED7 -0.500214 0.102672 -4.872 1.11e-06 ***
## LAST.MED77 -0.395498 0.287950 -1.373 0.169598
## LAST.MED99 -11.932664 143.285200 -0.083 0.933630
## LAST.SYMP2 0.129438 0.076970 1.682 0.092631 .
## LAST.SYMP3 0.193300 0.074028 2.611 0.009024 **
## LAST.SYMP4 NA NA NA NA
## LAST.SYMP5 0.568687 0.352571 1.613 0.106751
## LAST.SYMP6 0.585151 0.362313 1.615 0.106301
## LAST.SYMP7 0.614473 0.353784 1.737 0.082412 .
## LAST.SYMP77 -0.014998 0.203814 -0.074 0.941340
## LAST.SYMP88 NA NA NA NA
## LAST.SYMP99 0.832603 0.866077 0.961 0.336376
## COMPASTH11 -0.252750 0.074691 -3.384 0.000715 ***
## COMPASTH2 -0.193343 0.101922 -1.897 0.057831 .
## COMPASTH3 -0.069566 0.075269 -0.924 0.355369
## COMPASTH4 -0.895772 0.811506 -1.104 0.269663
## COMPASTH6 NA NA NA NA
## COMPASTH7 -0.886717 0.424085 -2.091 0.036537 *
## COMPASTH9 12.361369 324.743746 0.038 0.969636
## WORKTALK2 -0.555216 0.049857 -11.136 < 2e-16 ***
## WORKTALK6 -0.642787 0.161161 -3.988 6.65e-05 ***
## WORKTALK7 -0.435909 0.250891 -1.737 0.082309 .
## WORKTALK8 0.381298 0.555692 0.686 0.492607
## WORKTALK9 -0.922260 0.890616 -1.036 0.300421
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15248 on 11463 degrees of freedom
## Residual deviance: 14370 on 11392 degrees of freedom
## AIC: 14514
##
## Number of Fisher Scoring iterations: 11
exp(coef(best.model))
## (Intercept) SEX2 AGEG.F72 AGEG.F73 AGEG.F74 AGEG.F75
## 6.271548e-01 1.318228e+00 1.214226e+00 1.091632e+00 1.044125e+00 8.266279e-01
## AGEG.F76 AGEG.F77 X_RACEGR32 X_RACEGR33 X_RACEGR34 X_RACEGR35
## 7.311063e-01 6.355688e-01 1.563231e+00 9.818974e-01 1.126584e+00 1.297085e+00
## X_RACEGR39 EDUCAL2 EDUCAL3 EDUCAL4 EDUCAL5 EDUCAL6
## 9.736526e-01 7.318293e-01 9.116803e-01 9.934010e-01 1.086036e+00 1.061895e+00
## EDUCAL9 X_INCOMG2 X_INCOMG3 X_INCOMG4 X_INCOMG5 X_INCOMG9
## 5.602306e+00 1.038235e+00 9.585739e-01 1.006763e+00 1.105931e+00 8.599482e-01
## BRONCH2 BRONCH7 BRONCH9 DUR.30D10 DUR.30D11 DUR.30D12
## 8.303795e-01 7.202013e-01 1.502735e+00 1.320371e+00 9.975333e-01 1.010237e+00
## DUR.30D2 DUR.30D6 DUR.30D7 DUR.30D77 DUR.30D9 DUR.30D99
## 7.519313e-01 6.591913e-01 6.103596e-01 6.915774e-01 6.498014e-06 1.009362e+00
## INCINDT2 INCINDT3 INCINDT7 INCINDT9 LAST.MD5 LAST.MD6
## 1.173096e+00 2.506601e+00 1.507320e+00 1.665280e+00 8.583797e-01 6.067873e-01
## LAST.MD7 LAST.MD77 LAST.MD88 LAST.MD99 LAST.MED2 LAST.MED3
## 5.098977e-01 4.103609e-01 4.182823e-01 1.008264e+00 7.001777e-01 6.048143e-01
## LAST.MED4 LAST.MED5 LAST.MED6 LAST.MED7 LAST.MED77 LAST.MED99
## 5.611900e-01 5.384235e-01 7.477985e-01 6.064008e-01 6.733445e-01 6.572189e-06
## LAST.SYMP2 LAST.SYMP3 LAST.SYMP4 LAST.SYMP5 LAST.SYMP6 LAST.SYMP7
## 1.138189e+00 1.213246e+00 NA 1.765947e+00 1.795263e+00 1.848682e+00
## LAST.SYMP77 LAST.SYMP88 LAST.SYMP99 COMPASTH11 COMPASTH2 COMPASTH3
## 9.851141e-01 NA 2.299297e+00 7.766618e-01 8.241990e-01 9.327990e-01
## COMPASTH4 COMPASTH6 COMPASTH7 COMPASTH9 WORKTALK2 WORKTALK6
## 4.082922e-01 NA 4.120061e-01 2.336007e+05 5.739481e-01 5.258250e-01
## WORKTALK7 WORKTALK8 WORKTALK9
## 6.466767e-01 1.464184e+00 3.976193e-01