\(~\)

1 Read in the Data

# Load the `diab_pop` data frame from a serialized RDS file.
# NOTE(review): this is an absolute path on one specific Windows machine;
# point it at your local copy of DATA/diab_pop.RDS before running.
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

\(~\)

1.1 Reminders

1.1.1 The Data

# Data dictionary for `diab_pop`, one line per column in the form
# "column - definition - data type".
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin  - Categorical
##### dmdeduc2 - Education level - Adults 20+  - Categorical
##### dmdmartl - Marital status  - Categorical
##### indhhin2 - Annual household income  - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical

# Print the structure of the data frame (column names, types and the first
# few values) to check it against the dictionary above.
str(diab_pop)

## 'data.frame':    5719 obs. of  10 variables:
##  $ seqn    : num  83732 83733 83734 83735 83736 ...
##  $ riagendr: Factor w/ 2 levels "Male","Female": 1 1 1 2 2 2 1 2 1 1 ...
##  $ ridageyr: num  62 53 78 56 42 72 22 32 56 46 ...
##  $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 4 1 4 1 4 3 ...
##  $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 3 3 5 4 2 4 4 3 5 ...
##  $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 3 1 6 3 4 5 1 3 6 ...
##  $ indhhin2: Factor w/ 14 levels "$0-$4,999","$5,000-$9,999",..: 10 4 5 10 NA 13 NA 6 3 3 ...
##  $ bmxbmi  : num  27.8 30.8 28.8 42.4 20.3 28.6 28 28.2 33.6 27.6 ...
##  $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 1 2 2 2 2 2 1 2 ...
##  $ lbxglu  : num  NA 101 84 NA 84 107 95 NA NA NA ...

\(~\)

1.1.2 Install if not Function

#' Install the requested packages that are not installed yet.
#'
#' Packages that cannot be found are passed to `install.packages()`; for the
#' ones that are already present a message is printed, one element per package.
#' Nothing is printed or installed for empty input.
#'
#' @param list.of.packages Character vector of package names.
#' @return Invisibly, the printed message vector when at least one requested
#'   package was already installed; otherwise `NULL` (invisibly).
install_if_not <- function( list.of.packages ) {
  # system.file() returns "" for a package that cannot be found, which avoids
  # scanning every installed package via installed.packages() for a lookup.
  is.installed <- vapply(
    list.of.packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1),
    USE.NAMES = FALSE
  )
  new.packages <- list.of.packages[!is.installed]
  old.packages <- list.of.packages[is.installed]

  if (length(new.packages) > 0) {
    install.packages(new.packages)
  }

  # Guard on length: paste0() treats a zero-length vector as "", so without
  # this check empty input would print "the package '' is already installed".
  if (length(old.packages) > 0) {
    print(paste0("the package '", old.packages, "' is already installed"))
  }
}

\(~\)

2 The `e1071` package

install_if_not('e1071')

## [1] "the package 'e1071' is already installed"

library('e1071')

3 Split Data

library('caret')

## Loading required package: lattice

## Loading required package: ggplot2

# Drop every row that has at least one missing value so that training and
# scoring only see complete cases.
diab_pop.no_na <- na.omit(diab_pop)

# Fix the RNG seed: createDataPartition() samples rows at random, so without
# a seed the train/test split (and every metric computed from it) changes on
# each run.
set.seed(20200310)

# 80/20 split partitioned on the outcome `diq010`; list = FALSE returns the
# selected row indices as a matrix instead of a list.
trainIndex <- createDataPartition(diab_pop.no_na$diq010,
                                  p = 0.8,
                                  list = FALSE)

# Rows selected by the partition form the training set, the rest the test set.
train <- diab_pop.no_na[trainIndex, ]
test <- diab_pop.no_na[-trainIndex, ]

4 Make Formula

# Predictors are all columns of `train` other than the outcome (`diq010`)
# and the respondent identifier (`seqn`).
features <- names(train)
features <- features[!(features %in% c("diq010", "seqn"))]

# Right-hand side of the formula: predictor names joined by " + ".
features_plus <- paste(features, collapse = " + ")

# Full model formula, kept as a character string.
my_formula <- paste("diq010 ~", features_plus)

my_formula

## [1] "diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu"

5 Train Model

# Fit a naive Bayes classifier on the training rows; the formula string built
# earlier is converted to a formula object before being passed in.
model_nb <- naiveBayes( as.formula(my_formula) , data=train)

# List the components stored in the fitted model object.
summary(model_nb)

##           Length Class  Mode     
## apriori   2      table  numeric  
## tables    8      -none- list     
## levels    2      -none- character
## isnumeric 8      -none- logical  
## call      4      -none- call

6 Score

# Predicted class label for each test row (predict()'s default output type).
class <- predict(model_nb, newdata = test)

# Per-class probabilities for the same rows, one column per outcome level.
probs <- predict(model_nb, newdata = test, type = "raw")

# Attach both prediction outputs to the test data as extra columns.
test.scored <- cbind(test, class, probs)

7 `yardstick`

library('dplyr')

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library('yardstick')

## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.

## 
## Attaching package: 'yardstick'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

# Confusion matrix of the predicted label (`class`) against the recorded
# diagnosis (`diq010`) on the scored test set.
conf_mat(test.scored, truth = diq010, estimate = class)

##              Truth
## Prediction    Diabetes No Diabetes
##   Diabetes          28          10
##   No Diabetes       28         309

# Table of classification metrics derived from the same confusion matrix.
summary(conf_mat(test.scored, truth = diq010, estimate = class))

## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary         0.899
##  2 kap                  binary         0.540
##  3 sens                 binary         0.5  
##  4 spec                 binary         0.969
##  5 ppv                  binary         0.737
##  6 npv                  binary         0.917
##  7 mcc                  binary         0.554
##  8 j_index              binary         0.469
##  9 bal_accuracy         binary         0.734
## 10 detection_prevalence binary         0.101
## 11 precision            binary         0.737
## 12 recall               binary         0.5  
## 13 f_meas               binary         0.596

\(~\)

8 Code Appendix

\(~\)

# Load the `diab_pop` data frame from a serialized RDS file.
# NOTE(review): this is an absolute path on one specific Windows machine;
# point it at your local copy of DATA/diab_pop.RDS before running.
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')


# Data dictionary for `diab_pop`, one line per column in the form
# "column - definition - data type".
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin  - Categorical
##### dmdeduc2 - Education level - Adults 20+  - Categorical
##### dmdmartl - Marital status  - Categorical
##### indhhin2 - Annual household income  - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical

# Print the structure of the data frame (column names, types and the first
# few values) to check it against the dictionary above.
str(diab_pop)


#' Install the requested packages that are not installed yet.
#'
#' Packages that cannot be found are passed to `install.packages()`; for the
#' ones that are already present a message is printed, one element per package.
#' Nothing is printed or installed for empty input.
#'
#' @param list.of.packages Character vector of package names.
#' @return Invisibly, the printed message vector when at least one requested
#'   package was already installed; otherwise `NULL` (invisibly).
install_if_not <- function( list.of.packages ) {
  # system.file() returns "" for a package that cannot be found, which avoids
  # scanning every installed package via installed.packages() for a lookup.
  is.installed <- vapply(
    list.of.packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1),
    USE.NAMES = FALSE
  )
  new.packages <- list.of.packages[!is.installed]
  old.packages <- list.of.packages[is.installed]

  if (length(new.packages) > 0) {
    install.packages(new.packages)
  }

  # Guard on length: paste0() treats a zero-length vector as "", so without
  # this check empty input would print "the package '' is already installed".
  if (length(old.packages) > 0) {
    print(paste0("the package '", old.packages, "' is already installed"))
  }
}


# Make sure the modelling package is available, then load the packages used
# for fitting (e1071) and for data partitioning (caret).
install_if_not('e1071')

library('e1071')
library('caret')

# Drop every row that has at least one missing value so that training and
# scoring only see complete cases.
diab_pop.no_na <- na.omit(diab_pop)

# Fix the RNG seed: createDataPartition() samples rows at random, so without
# a seed the train/test split (and every metric computed from it) changes on
# each run.
set.seed(20200310)

# 80/20 split partitioned on the outcome `diq010`; list = FALSE returns the
# selected row indices as a matrix instead of a list.
trainIndex <- createDataPartition(diab_pop.no_na$diq010,
                                  p = 0.8,
                                  list = FALSE)

# Rows selected by the partition form the training set, the rest the test set.
train <- diab_pop.no_na[trainIndex, ]
test <- diab_pop.no_na[-trainIndex, ]

# Predictors are all columns of `train` other than the outcome (`diq010`)
# and the respondent identifier (`seqn`).
features <- names(train)
features <- features[!(features %in% c("diq010", "seqn"))]

# Right-hand side of the formula: predictor names joined by " + ".
features_plus <- paste(features, collapse = " + ")

# Full model formula, kept as a character string.
my_formula <- paste("diq010 ~", features_plus)

my_formula

# Fit a naive Bayes classifier on the training rows; the formula string is
# converted to a formula object before being passed in.
model_nb <- naiveBayes( as.formula(my_formula) , data=train)

# List the components stored in the fitted model object.
summary(model_nb)

# Predicted class label for each test row (predict()'s default output type).
class <- predict(model_nb, newdata = test)

# Per-class probabilities for the same rows, one column per outcome level.
probs <- predict(model_nb, newdata = test, type = "raw")

# Attach both prediction outputs to the test data as extra columns.
test.scored <- cbind(test, class, probs)

# Packages for the evaluation step.
library('dplyr')
library('yardstick')

# Confusion matrix of the predicted label (`class`) against the recorded
# diagnosis (`diq010`) on the scored test set.
conf_mat(test.scored, truth = diq010, estimate = class)

# Table of classification metrics derived from the same confusion matrix.
summary(conf_mat(test.scored, truth = diq010, estimate = class))

Naïve Bayes

J Kyle Armstrong, PhD

10 March 2020

1 Read in the Data

1.1 Reminders

1.1.1 The Data

1.1.2 Install if not Function

2 The `e1071` package

3 Split Data

4 Make Formula

5 Train Model

6 Score

7 `yardstick`

8 Code Appendix

Naïve Bayes

J Kyle Armstrong, PhD

10 March 2020

1 Read in the Data

1.1 Reminders

1.1.1 The Data

1.1.2 Install if not Function

2 The e1071 package

3 Split Data

4 Make Formula

5 Train Model

6 Score

7 yardstick

8 Code Appendix

2 The `e1071` package

7 `yardstick`