\(~\)
\(~\)
\(~\)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
\(~\)
\(~\)
\(~\)
\(~\)
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin - Categorical
##### dmdeduc2 - Education level - Adults 20+ - Categorical
##### dmdmartl - Marital status - Categorical
##### indhhin2 - Annual household income - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical
str(diab_pop)
## 'data.frame': 5719 obs. of 10 variables:
## $ seqn : num 83732 83733 83734 83735 83736 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 1 1 1 2 2 2 1 2 1 1 ...
## $ ridageyr: num 62 53 78 56 42 72 22 32 56 46 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 4 1 4 1 4 3 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 3 3 5 4 2 4 4 3 5 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 3 1 6 3 4 5 1 3 6 ...
## $ indhhin2: Factor w/ 14 levels "$0-$4,999","$5,000-$9,999",..: 10 4 5 10 NA 13 NA 6 3 3 ...
## $ bmxbmi : num 27.8 30.8 28.8 42.4 20.3 28.6 28 28.2 33.6 27.6 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 1 2 2 2 2 2 1 2 ...
## $ lbxglu : num NA 101 84 NA 84 107 95 NA NA NA ...
\(~\)
\(~\)
\(~\)
install_if_not <- function( list.of.packages ) {
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
\(~\)
\(~\)
\(~\)
\(~\)
library('tidyverse')
## -- Attaching packages --------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
diab_pop.no_na_vals <- diab_pop %>% na.omit()
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(8675309) # this will ensure our results are the same every run
# to randomize you may use: set.seed(Sys.time())
# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(diab_pop.no_na_vals$diq010,
p = .6,
list = FALSE,
times = 1)
diab_pop.no_na_vals.train <- diab_pop.no_na_vals[trainIndex, ]
diab_pop.no_na_vals.test <- diab_pop.no_na_vals[-trainIndex, ]
\(~\)
\(~\)
\(~\)
\(~\)
logit_model <- glm(diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu,
family="binomial",
data = diab_pop.no_na_vals.train)
logit_model
##
## Call: glm(formula = diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 +
## dmdmartl + indhhin2 + bmxbmi + lbxglu, family = "binomial",
## data = diab_pop.no_na_vals.train)
##
## Coefficients:
## (Intercept) riagendrFemale
## 9.80789 -0.11464
## ridageyr ridreth1Other Hispanic
## -0.05503 -0.29723
## ridreth1Non-Hispanic White ridreth1Non-Hispanic Black
## -0.38270 -0.92228
## ridreth1Other dmdeduc2Grades 9-11th
## -0.71991 0.44575
## dmdeduc2High school graduate/GED dmdeduc2Some college or AA degrees
## 1.26584 0.90043
## dmdeduc2College grad or above dmdmartlWidowed
## 0.94730 0.47483
## dmdmartlDivorced dmdmartlSeparated
## 0.36843 -0.04411
## dmdmartlNever married dmdmartlLiving with partner
## -0.03577 0.17759
## indhhin2$5,000-$9,999 indhhin2$10,000-$14,999
## 0.34427 1.52998
## indhhin2$15,000-$19,999 indhhin2$20,000-$24,999
## 1.78220 1.08653
## indhhin2$25,000-$34,999 indhhin2$45,000-$54,999
## 1.04877 0.75253
## indhhin2$65,000-$74,999 indhhin220,000+
## 0.90369 1.03068
## indhhin2less than $20,000 indhhin2$75,000-$99,999
## 2.72902 1.41228
## indhhin2$100,000+ bmxbmi
## 0.77823 -0.05353
## lbxglu
## -0.03826
##
## Degrees of Freedom: 1125 Total (i.e. Null); 1097 Residual
## Null Deviance: 952.3
## Residual Deviance: 551.3 AIC: 609.3
summary(logit_model)
##
## Call:
## glm(formula = diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 +
## dmdmartl + indhhin2 + bmxbmi + lbxglu, family = "binomial",
## data = diab_pop.no_na_vals.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0553 0.1203 0.2454 0.4033 4.0912
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.807888 1.143258 8.579 < 2e-16 ***
## riagendrFemale -0.114638 0.238933 -0.480 0.63138
## ridageyr -0.055027 0.009326 -5.900 3.63e-09 ***
## ridreth1Other Hispanic -0.297226 0.428658 -0.693 0.48807
## ridreth1Non-Hispanic White -0.382705 0.393014 -0.974 0.33017
## ridreth1Non-Hispanic Black -0.922281 0.418804 -2.202 0.02765 *
## ridreth1Other -0.719906 0.487388 -1.477 0.13966
## dmdeduc2Grades 9-11th 0.445749 0.429922 1.037 0.29982
## dmdeduc2High school graduate/GED 1.265841 0.400733 3.159 0.00158 **
## dmdeduc2Some college or AA degrees 0.900433 0.389046 2.314 0.02064 *
## dmdeduc2College grad or above 0.947302 0.426745 2.220 0.02643 *
## dmdmartlWidowed 0.474834 0.399959 1.187 0.23515
## dmdmartlDivorced 0.368433 0.378205 0.974 0.32998
## dmdmartlSeparated -0.044107 0.558009 -0.079 0.93700
## dmdmartlNever married -0.035769 0.437124 -0.082 0.93478
## dmdmartlLiving with partner 0.177591 0.547438 0.324 0.74563
## indhhin2$5,000-$9,999 0.344274 0.627713 0.548 0.58338
## indhhin2$10,000-$14,999 1.529979 0.657454 2.327 0.01996 *
## indhhin2$15,000-$19,999 1.782201 0.672002 2.652 0.00800 **
## indhhin2$20,000-$24,999 1.086530 0.639608 1.699 0.08937 .
## indhhin2$25,000-$34,999 1.048766 0.590010 1.778 0.07548 .
## indhhin2$45,000-$54,999 0.752527 0.632012 1.191 0.23378
## indhhin2$65,000-$74,999 0.903686 0.656972 1.376 0.16897
## indhhin220,000+ 1.030682 0.902598 1.142 0.25349
## indhhin2less than $20,000 2.729024 1.140925 2.392 0.01676 *
## indhhin2$75,000-$99,999 1.412279 0.657736 2.147 0.03178 *
## indhhin2$100,000+ 0.778230 0.615139 1.265 0.20582
## bmxbmi -0.053526 0.017057 -3.138 0.00170 **
## lbxglu -0.038258 0.003688 -10.373 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 952.29 on 1125 degrees of freedom
## Residual deviance: 551.29 on 1097 degrees of freedom
## AIC: 609.29
##
## Number of Fisher Scoring iterations: 6
str(logit_model,1)
## List of 30
## $ coefficients : Named num [1:29] 9.808 -0.115 -0.055 -0.297 -0.383 ...
## ..- attr(*, "names")= chr [1:29] "(Intercept)" "riagendrFemale" "ridageyr" "ridreth1Other Hispanic" ...
## $ residuals : Named num [1:1126] 1.01 1.03 1.68 -1.01 1.13 ...
## ..- attr(*, "names")= chr [1:1126] "2" "11" "13" "14" ...
## $ fitted.values : Named num [1:1126] 0.98802 0.96838 0.59693 0.00902 0.88719 ...
## ..- attr(*, "names")= chr [1:1126] "2" "11" "13" "14" ...
## $ effects : Named num [1:1126] -12.493 0.103 6.2 -0.463 2.082 ...
## ..- attr(*, "names")= chr [1:1126] "(Intercept)" "riagendrFemale" "ridageyr" "ridreth1Other Hispanic" ...
## $ R : num [1:29, 1:29] -8.93 0 0 0 0 ...
## ..- attr(*, "dimnames")=List of 2
## $ rank : int 29
## $ qr :List of 5
## ..- attr(*, "class")= chr "qr"
## $ family :List of 12
## ..- attr(*, "class")= chr "family"
## $ linear.predictors: Named num [1:1126] 4.413 3.422 0.393 -4.699 2.062 ...
## ..- attr(*, "names")= chr [1:1126] "2" "11" "13" "14" ...
## $ deviance : num 551
## $ aic : num 609
## $ null.deviance : num 952
## $ iter : int 6
## $ weights : Named num [1:1126] 0.01184 0.03062 0.2406 0.00894 0.10009 ...
## ..- attr(*, "names")= chr [1:1126] "2" "11" "13" "14" ...
## $ prior.weights : Named num [1:1126] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:1126] "2" "11" "13" "14" ...
## $ df.residual : int 1097
## $ df.null : int 1125
## $ y : Named num [1:1126] 1 1 1 0 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:1126] "2" "11" "13" "14" ...
## $ converged : logi TRUE
## $ boundary : logi FALSE
## $ model :'data.frame': 1126 obs. of 9 variables:
## ..- attr(*, "terms")=Classes 'terms', 'formula' language diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu
## .. .. ..- attr(*, "variables")= language list(diq010, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, lbxglu)
## .. .. ..- attr(*, "factors")= int [1:9, 1:8] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. ..- attr(*, "term.labels")= chr [1:8] "riagendr" "ridageyr" "ridreth1" "dmdeduc2" ...
## .. .. ..- attr(*, "order")= int [1:8] 1 1 1 1 1 1 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(diq010, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, lbxglu)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:9] "factor" "factor" "numeric" "factor" ...
## .. .. .. ..- attr(*, "names")= chr [1:9] "diq010" "riagendr" "ridageyr" "ridreth1" ...
## $ call : language glm(formula = diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu, fa| __truncated__
## $ formula :Class 'formula' language diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## $ terms :Classes 'terms', 'formula' language diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu
## .. ..- attr(*, "variables")= language list(diq010, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, lbxglu)
## .. ..- attr(*, "factors")= int [1:9, 1:8] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. ..- attr(*, "term.labels")= chr [1:8] "riagendr" "ridageyr" "ridreth1" "dmdeduc2" ...
## .. ..- attr(*, "order")= int [1:8] 1 1 1 1 1 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(diq010, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, lbxglu)
## .. ..- attr(*, "dataClasses")= Named chr [1:9] "factor" "factor" "numeric" "factor" ...
## .. .. ..- attr(*, "names")= chr [1:9] "diq010" "riagendr" "ridageyr" "ridreth1" ...
## $ data :'data.frame': 1126 obs. of 10 variables:
## ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
## .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
## $ offset : NULL
## $ control :List of 3
## $ method : chr "glm.fit"
## $ contrasts :List of 5
## $ xlevels :List of 5
## - attr(*, "class")= chr [1:2] "glm" "lm"
logit_model$coefficients
## (Intercept) riagendrFemale
## 9.80788800 -0.11463826
## ridageyr ridreth1Other Hispanic
## -0.05502652 -0.29722567
## ridreth1Non-Hispanic White ridreth1Non-Hispanic Black
## -0.38270466 -0.92228099
## ridreth1Other dmdeduc2Grades 9-11th
## -0.71990619 0.44574943
## dmdeduc2High school graduate/GED dmdeduc2Some college or AA degrees
## 1.26584123 0.90043329
## dmdeduc2College grad or above dmdmartlWidowed
## 0.94730185 0.47483393
## dmdmartlDivorced dmdmartlSeparated
## 0.36843309 -0.04410658
## dmdmartlNever married dmdmartlLiving with partner
## -0.03576914 0.17759058
## indhhin2$5,000-$9,999 indhhin2$10,000-$14,999
## 0.34427417 1.52997906
## indhhin2$15,000-$19,999 indhhin2$20,000-$24,999
## 1.78220109 1.08652981
## indhhin2$25,000-$34,999 indhhin2$45,000-$54,999
## 1.04876567 0.75252721
## indhhin2$65,000-$74,999 indhhin220,000+
## 0.90368602 1.03068174
## indhhin2less than $20,000 indhhin2$75,000-$99,999
## 2.72902414 1.41227935
## indhhin2$100,000+ bmxbmi
## 0.77823046 -0.05352607
## lbxglu
## -0.03825800
logit_model$aic
## [1] 609.2867
coeff_logit_model <- broom::tidy(logit_model)
coeff_logit_model
## # A tibble: 29 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 9.81 1.14 8.58 9.58e-18
## 2 riagendrFemale -0.115 0.239 -0.480 6.31e- 1
## 3 ridageyr -0.0550 0.00933 -5.90 3.63e- 9
## 4 ridreth1Other Hispanic -0.297 0.429 -0.693 4.88e- 1
## 5 ridreth1Non-Hispanic White -0.383 0.393 -0.974 3.30e- 1
## 6 ridreth1Non-Hispanic Black -0.922 0.419 -2.20 2.77e- 2
## 7 ridreth1Other -0.720 0.487 -1.48 1.40e- 1
## 8 dmdeduc2Grades 9-11th 0.446 0.430 1.04 3.00e- 1
## 9 dmdeduc2High school graduate/GED 1.27 0.401 3.16 1.58e- 3
## 10 dmdeduc2Some college or AA degrees 0.900 0.389 2.31 2.06e- 2
## # ... with 19 more rows
coeff_logit_model <- coeff_logit_model %>%
mutate(odds_ratio = exp(estimate)) %>%
arrange( -abs(estimate) )
coeff_logit_model
## # A tibble: 29 x 6
## term estimate std.error statistic p.value odds_ratio
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 9.81 1.14 8.58 9.58e-18 18177.
## 2 indhhin2less than $20,000 2.73 1.14 2.39 1.68e- 2 15.3
## 3 indhhin2$15,000-$19,999 1.78 0.672 2.65 8.00e- 3 5.94
## 4 indhhin2$10,000-$14,999 1.53 0.657 2.33 2.00e- 2 4.62
## 5 indhhin2$75,000-$99,999 1.41 0.658 2.15 3.18e- 2 4.11
## 6 dmdeduc2High school graduat~ 1.27 0.401 3.16 1.58e- 3 3.55
## 7 indhhin2$20,000-$24,999 1.09 0.640 1.70 8.94e- 2 2.96
## 8 indhhin2$25,000-$34,999 1.05 0.590 1.78 7.55e- 2 2.85
## 9 indhhin220,000+ 1.03 0.903 1.14 2.53e- 1 2.80
## 10 dmdeduc2College grad or abo~ 0.947 0.427 2.22 2.64e- 2 2.58
## # ... with 19 more rows
\(~\)
\(~\)
\(~\)
\(~\)
Since training data was used to train the model, we use the hold-out data or testing data to evaluate the model performace:
probs <- predict(logit_model, diab_pop.no_na_vals.test, "response")
diab_pop.no_na_vals.test.SCORED <- cbind(diab_pop.no_na_vals.test, probs)
glimpse(diab_pop.no_na_vals.test.SCORED)
## Observations: 750
## Variables: 11
## $ seqn <dbl> 83734, 83737, 83757, 83761, 83789, 83820, 83822, 83823, 83...
## $ riagendr <fct> Male, Female, Female, Female, Male, Male, Female, Female, ...
## $ ridageyr <dbl> 78, 72, 57, 24, 66, 70, 20, 29, 69, 71, 37, 49, 41, 54, 80...
## $ ridreth1 <fct> Non-Hispanic White, MexicanAmerican, Other Hispanic, Other...
## $ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, Less than 9th gra...
## $ dmdmartl <fct> Married, Separated, Separated, Never married, Living with ...
## $ indhhin2 <fct> "$20,000-$24,999", "$75,000-$99,999", "$20,000-$24,999", "...
## $ bmxbmi <dbl> 28.8, 28.6, 35.4, 25.3, 34.0, 27.0, 22.2, 29.7, 28.2, 27.6...
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu <dbl> 84, 107, 398, 95, 113, 94, 80, 102, 105, 76, 79, 126, 110,...
## $ probs <dbl> 9.387889e-01, 8.722288e-01, 5.437387e-05, 9.727618e-01, 8....
\(~\)
\(~\)
\(~\)
\(~\)
yardstick
for Model Metrics\(~\)
install_if_not('yardstick')
## [1] "the package 'yardstick' is already installed"
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall
## The following object is masked from 'package:readr':
##
## spec
\(~\)
\(~\)
A Confusion Matrix is a table of counts of classses for predicted classes versus actual classes. Therefore, in order to create a confusion matrix we will need to specify a threshold value for the probability; let’s use .5 and the conf_mat
function:
diab_pop.no_na_vals.test.SCORED <- diab_pop.no_na_vals.test.SCORED %>%
mutate(Pred_DM2 = ifelse(probs <= 0.5, 1, 2)) %>%
mutate(Pred_DM2 = as.factor(Pred_DM2))
levels(diab_pop.no_na_vals.test.SCORED$Pred_DM2) <- c('Diabetes' , 'No Diabetes')
glimpse(diab_pop.no_na_vals.test.SCORED)
## Observations: 750
## Variables: 12
## $ seqn <dbl> 83734, 83737, 83757, 83761, 83789, 83820, 83822, 83823, 83...
## $ riagendr <fct> Male, Female, Female, Female, Male, Male, Female, Female, ...
## $ ridageyr <dbl> 78, 72, 57, 24, 66, 70, 20, 29, 69, 71, 37, 49, 41, 54, 80...
## $ ridreth1 <fct> Non-Hispanic White, MexicanAmerican, Other Hispanic, Other...
## $ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, Less than 9th gra...
## $ dmdmartl <fct> Married, Separated, Separated, Never married, Living with ...
## $ indhhin2 <fct> "$20,000-$24,999", "$75,000-$99,999", "$20,000-$24,999", "...
## $ bmxbmi <dbl> 28.8, 28.6, 35.4, 25.3, 34.0, 27.0, 22.2, 29.7, 28.2, 27.6...
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu <dbl> 84, 107, 398, 95, 113, 94, 80, 102, 105, 76, 79, 126, 110,...
## $ probs <dbl> 9.387889e-01, 8.722288e-01, 5.437387e-05, 9.727618e-01, 8....
## $ Pred_DM2 <fct> No Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabet...
conf_mat.logit <- diab_pop.no_na_vals.test.SCORED %>%
conf_mat(truth = diq010 , Pred_DM2 )
conf_mat.logit
## Truth
## Prediction Diabetes No Diabetes
## Diabetes 47 12
## No Diabetes 65 626
summary(conf_mat.logit)
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.897
## 2 kap binary 0.498
## 3 sens binary 0.420
## 4 spec binary 0.981
## 5 ppv binary 0.797
## 6 npv binary 0.906
## 7 mcc binary 0.531
## 8 j_index binary 0.401
## 9 bal_accuracy binary 0.700
## 10 detection_prevalence binary 0.0787
## 11 precision binary 0.797
## 12 recall binary 0.420
## 13 f_meas binary 0.550
\(~\)
\(~\)
The Receiver Operating Characteristic Curve or ROC Curve is created by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings. The function roc_auc
can be used to obtain the estimated area under the ROC curve and the function roc_curve
can be used to plot the curve:
roc_auc.logit <- diab_pop.no_na_vals.test.SCORED %>%
roc_auc(truth=diq010 , probs)
roc_auc.logit
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.854
diab_pop.no_na_vals.test.SCORED %>%
roc_curve(truth=diq010 , probs) %>%
autoplot() +
labs( title = "ROC Curve - Logistic Regression",
caption = paste0("Area Under ROC Curve : ", round(roc_auc.logit$.estimate,3) ) ) +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))
diab_pop.no_na_vals.test.SCORED %>%
roc_curve(truth=diq010 , probs) %>%
mutate(one_minus_spec = 1- specificity ) %>%
ggplot(aes(x=one_minus_spec, y = sensitivity)) +
geom_point() +
labs( title = "ROC Curve - Logistic Regression",
caption = paste0("Area Under ROC Curve : ", round(roc_auc.logit$.estimate,3) ) ) +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))
\(~\)
\(~\)
The Precision-Recall Curve is created by plotting the precision or PPV against the recall or TPR at various threshold settings. The function pr_auc
can be used to obtain the estimated area under the Precision-Recall Curve and the function pr_curve
can be used to plot the curve:
pr_auc.logit <- diab_pop.no_na_vals.test.SCORED %>%
pr_auc(truth=diq010 , probs)
pr_auc.logit
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 pr_auc binary 0.0841
diab_pop.no_na_vals.test.SCORED %>%
pr_curve(truth=diq010 , probs) %>%
autoplot() +
labs( title = "Precision-Recall Curve - Logistic Regression",
caption = paste0("Area Under Precision-Recall Curve : ", round(pr_auc.logit$.estimate,3) ) ) +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))
\(~\)
\(~\)
lift_curve.logit <- diab_pop.no_na_vals.test.SCORED %>%
lift_curve(truth=diq010, probs)
autoplot(lift_curve.logit) +
labs( title = "Lift Curve - Logistic Regression") +
theme(plot.title = element_text(size = 18))
\(~\)
\(~\)
gain_curve.logit <- diab_pop.no_na_vals.test.SCORED %>%
gain_curve(truth=diq010, probs)
plot_gain_curve.logit <- autoplot(gain_curve.logit) +
labs( title = "Gain Curve - Logistic Regression") +
theme(plot.title = element_text(size = 18))
plot_gain_curve.logit
\(~\)
\(~\)
\(~\)
\(~\)
Let’s remove the intercept and the variable lbxglu
:
features <- colnames(diab_pop.no_na_vals.train)[!colnames(diab_pop.no_na_vals.train) %in% c("seqn","diq010", "lbxglu")]
features_added <- paste0(features, collapse = " + ")
formula_2 <- paste0( "diq010 ~ ", features_added, " - 1" )
formula_2
## [1] "diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi - 1"
logit_no_int_model <- glm(formula_2,
family="binomial",
data = diab_pop.no_na_vals.train)
logit_no_int_model
##
## Call: glm(formula = formula_2, family = "binomial", data = diab_pop.no_na_vals.train)
##
## Coefficients:
## riagendrMale riagendrFemale
## 5.48435 5.55488
## ridageyr ridreth1Other Hispanic
## -0.05793 -0.10096
## ridreth1Non-Hispanic White ridreth1Non-Hispanic Black
## 0.23968 -0.50310
## ridreth1Other dmdeduc2Grades 9-11th
## -0.18278 0.52285
## dmdeduc2High school graduate/GED dmdeduc2Some college or AA degrees
## 0.84767 0.82258
## dmdeduc2College grad or above dmdmartlWidowed
## 0.70671 0.50172
## dmdmartlDivorced dmdmartlSeparated
## 0.07506 -0.10449
## dmdmartlNever married dmdmartlLiving with partner
## 0.35830 0.14843
## indhhin2$5,000-$9,999 indhhin2$10,000-$14,999
## 0.23534 1.07971
## indhhin2$15,000-$19,999 indhhin2$20,000-$24,999
## 0.81281 0.39890
## indhhin2$25,000-$34,999 indhhin2$45,000-$54,999
## 0.51096 0.18778
## indhhin2$65,000-$74,999 indhhin220,000+
## 0.99847 0.51638
## indhhin2less than $20,000 indhhin2$75,000-$99,999
## 1.40105 0.78987
## indhhin2$100,000+ bmxbmi
## 0.64519 -0.06104
##
## Degrees of Freedom: 1126 Total (i.e. Null); 1098 Residual
## Null Deviance: 1561
## Residual Deviance: 796.5 AIC: 852.5
\(~\)
\(~\)
Finally, we will compare the two models. First, score the test data with the model:
probs_no_int <- predict(logit_no_int_model, diab_pop.no_na_vals.test, "response")
Now we combine with our previous results and use a gather
to stack the results into the column value
:
diab_pop.no_na_vals.test.SCORED_Compare <- cbind(diab_pop.no_na_vals.test.SCORED, probs_no_int)
diab_pop.no_na_vals.test.SCORED_Compare <- diab_pop.no_na_vals.test.SCORED_Compare %>%
tidyr::gather(probs, probs_no_int, key="model_type", value = "value")
glimpse(diab_pop.no_na_vals.test.SCORED_Compare)
## Observations: 1,500
## Variables: 13
## $ seqn <dbl> 83734, 83737, 83757, 83761, 83789, 83820, 83822, 83823, ...
## $ riagendr <fct> Male, Female, Female, Female, Male, Male, Female, Female...
## $ ridageyr <dbl> 78, 72, 57, 24, 66, 70, 20, 29, 69, 71, 37, 49, 41, 54, ...
## $ ridreth1 <fct> Non-Hispanic White, MexicanAmerican, Other Hispanic, Oth...
## $ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, Less than 9th g...
## $ dmdmartl <fct> Married, Separated, Separated, Never married, Living wit...
## $ indhhin2 <fct> "$20,000-$24,999", "$75,000-$99,999", "$20,000-$24,999",...
## $ bmxbmi <dbl> 28.8, 28.6, 35.4, 25.3, 34.0, 27.0, 22.2, 29.7, 28.2, 27...
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabete...
## $ lbxglu <dbl> 84, 107, 398, 95, 113, 94, 80, 102, 105, 76, 79, 126, 11...
## $ Pred_DM2 <fct> No Diabetes, No Diabetes, Diabetes, No Diabetes, No Diab...
## $ model_type <chr> "probs", "probs", "probs", "probs", "probs", "probs", "p...
## $ value <dbl> 9.387889e-01, 8.722288e-01, 5.437387e-05, 9.727618e-01, ...
Now we can use roc_auc
and roc_curve
with a group_by
on the key we added in the last step:
roc_auc.logit <- diab_pop.no_na_vals.test.SCORED_Compare %>%
group_by(model_type) %>%
roc_auc(truth=diq010 , value)
roc_auc.logit
## # A tibble: 2 x 4
## model_type .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 probs roc_auc binary 0.854
## 2 probs_no_int roc_auc binary 0.753
diab_pop.no_na_vals.test.SCORED_Compare %>%
group_by(model_type) %>%
roc_curve(truth=diq010 , value) %>%
autoplot() +
labs( title = "ROC Curves - Logistic Regression Compare ") +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))
We can see that the AUC of the first model is larger, additionally, the ROC curve of the second model is below the first, therefore the first model is the better choice.
\(~\)
\(~\)
\(~\)
\(~\)
Strengths | Weakness |
---|---|
very common and standard approach | strong assumptions about data |
easy to understand and interpret | model form must be specified in advance |
flexibility, can be adopted for many situations | does not handle missing data |
provides estimates of both size and strength of the relationships among features and outcome | only numeric features, categorical and strings need encodings |
. | understanding of statistics to fully understand model |
\(~\) |
\(~\)
\(~\)
\(~\)
\(~\)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin - Categorical
##### dmdeduc2 - Education level - Adults 20+ - Categorical
##### dmdmartl - Marital status - Categorical
##### indhhin2 - Annual household income - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical
str(diab_pop)
install_if_not <- function( list.of.packages ) {
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
library('tidyverse')
diab_pop.no_na_vals <- diab_pop %>% na.omit()
library('caret')
set.seed(8675309) # this will ensure our results are the same every run
# to randomize you may use: set.seed(Sys.time())
# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(diab_pop.no_na_vals$diq010,
p = .6,
list = FALSE,
times = 1)
diab_pop.no_na_vals.train <- diab_pop.no_na_vals[trainIndex, ]
diab_pop.no_na_vals.test <- diab_pop.no_na_vals[-trainIndex, ]
logit_model <- glm(diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu,
family="binomial",
data = diab_pop.no_na_vals.train)
logit_model
summary(logit_model)
str(logit_model,1)
logit_model$coefficients
logit_model$aic
coeff_logit_model <- broom::tidy(logit_model)
coeff_logit_model
coeff_logit_model <- coeff_logit_model %>%
mutate(odds_ratio = exp(estimate)) %>%
arrange( -abs(estimate) )
coeff_logit_model
probs <- predict(logit_model, diab_pop.no_na_vals.test, "response")
diab_pop.no_na_vals.test.SCORED <- cbind(diab_pop.no_na_vals.test, probs)
glimpse(diab_pop.no_na_vals.test.SCORED)
install_if_not('yardstick')
library('yardstick')
diab_pop.no_na_vals.test.SCORED <- diab_pop.no_na_vals.test.SCORED %>%
mutate(Pred_DM2 = ifelse(probs <= 0.5, 1, 2)) %>%
mutate(Pred_DM2 = as.factor(Pred_DM2))
levels(diab_pop.no_na_vals.test.SCORED$Pred_DM2) <- c('Diabetes' , 'No Diabetes')
glimpse(diab_pop.no_na_vals.test.SCORED)
conf_mat.logit <- diab_pop.no_na_vals.test.SCORED %>%
conf_mat(truth = diq010 , Pred_DM2 )
conf_mat.logit
summary(conf_mat.logit)
roc_auc.logit <- diab_pop.no_na_vals.test.SCORED %>%
roc_auc(truth=diq010 , probs)
roc_auc.logit
diab_pop.no_na_vals.test.SCORED %>%
roc_curve(truth=diq010 , probs) %>%
autoplot() +
labs( title = "ROC Curve - Logistic Regression",
caption = paste0("Area Under ROC Curve : ", round(roc_auc.logit$.estimate,3) ) ) +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))
diab_pop.no_na_vals.test.SCORED %>%
roc_curve(truth=diq010 , probs) %>%
mutate(one_minus_spec = 1- specificity ) %>%
ggplot(aes(x=one_minus_spec, y = sensitivity)) +
geom_point() +
labs( title = "ROC Curve - Logistic Regression",
caption = paste0("Area Under ROC Curve : ", round(roc_auc.logit$.estimate,3) ) ) +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))
pr_auc.logit <- diab_pop.no_na_vals.test.SCORED %>%
pr_auc(truth=diq010 , probs)
pr_auc.logit
diab_pop.no_na_vals.test.SCORED %>%
pr_curve(truth=diq010 , probs) %>%
autoplot() +
labs( title = "Precision-Recall Curve - Logistic Regression",
caption = paste0("Area Under Precision-Recall Curve : ", round(pr_auc.logit$.estimate,3) ) ) +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))
lift_curve.logit <- diab_pop.no_na_vals.test.SCORED %>%
lift_curve(truth=diq010, probs)
autoplot(lift_curve.logit) +
labs( title = "Lift Curve - Logistic Regression") +
theme(plot.title = element_text(size = 18))
gain_curve.logit <- diab_pop.no_na_vals.test.SCORED %>%
gain_curve(truth=diq010, probs)
plot_gain_curve.logit <- autoplot(gain_curve.logit) +
labs( title = "Gain Curve - Logistic Regression") +
theme(plot.title = element_text(size = 18))
plot_gain_curve.logit
features <- colnames(diab_pop.no_na_vals.train)[!colnames(diab_pop.no_na_vals.train) %in% c("seqn","diq010", "lbxglu")]
features_added <- paste0(features, collapse = " + ")
formula_2 <- paste0( "diq010 ~ ", features_added, " - 1" )
formula_2
logit_no_int_model <- glm(formula_2,
family="binomial",
data = diab_pop.no_na_vals.train)
logit_no_int_model
probs_no_int <- predict(logit_no_int_model, diab_pop.no_na_vals.test, "response")
diab_pop.no_na_vals.test.SCORED_Compare <- cbind(diab_pop.no_na_vals.test.SCORED, probs_no_int)
diab_pop.no_na_vals.test.SCORED_Compare <- diab_pop.no_na_vals.test.SCORED_Compare %>%
tidyr::gather(probs, probs_no_int, key="model_type", value = "value")
glimpse(diab_pop.no_na_vals.test.SCORED_Compare)
roc_auc.logit <- diab_pop.no_na_vals.test.SCORED_Compare %>%
group_by(model_type) %>%
roc_auc(truth=diq010 , value)
roc_auc.logit
diab_pop.no_na_vals.test.SCORED_Compare %>%
group_by(model_type) %>%
roc_curve(truth=diq010 , value) %>%
autoplot() +
labs( title = "ROC Curves - Logistic Regression Compare ") +
theme( plot.title = element_text(size = 18) ,
plot.caption = element_text(size = 12))