\(~\)
\(~\)
\(~\)
# Read the serialized diabetes population data frame into `diab_pop`.
# NOTE(review): the path is absolute and machine-specific -- confirm it
# exists on the machine running this script.
diab_pop <- readRDS(
  file = "C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS"
)
\(~\)
\(~\)
\(~\)
\(~\)
#### Data dictionary: Variable - Definition - Data Type
#####   seqn     - Respondent sequence number  - Identifier
#####   riagendr - Gender                      - Categorical
#####   ridageyr - Age in years at screening   - Continuous / Numerical
#####   ridreth1 - Race/Hispanic origin        - Categorical
#####   dmdeduc2 - Education level, adults 20+ - Categorical
#####   dmdmartl - Marital status              - Categorical
#####   indhhin2 - Annual household income     - Categorical
#####   bmxbmi   - Body Mass Index (kg/m**2)   - Continuous / Numerical
#####   diq010   - Doctor diagnosed diabetes   - Categorical / Target
#####   lbxglu   - Fasting Glucose (mg/dL)     - Continuous / Numerical

# Show the compact structure (column names, types, first values) of the data.
str(object = diab_pop)
## 'data.frame': 5719 obs. of 10 variables:
## $ seqn : num 83732 83733 83734 83735 83736 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 1 1 1 2 2 2 1 2 1 1 ...
## $ ridageyr: num 62 53 78 56 42 72 22 32 56 46 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 4 1 4 1 4 3 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 3 3 5 4 2 4 4 3 5 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 3 1 6 3 4 5 1 3 6 ...
## $ indhhin2: Factor w/ 14 levels "$0-$4,999","$5,000-$9,999",..: 10 4 5 10 NA 13 NA 6 3 3 ...
## $ bmxbmi : num 27.8 30.8 28.8 42.4 20.3 28.6 28 28.2 33.6 27.6 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 1 2 2 2 2 2 1 2 ...
## $ lbxglu : num NA 101 84 NA 84 107 95 NA NA NA ...
\(~\)
\(~\)
\(~\)
#' Install the packages that are not yet available, report the ones that are.
#'
#' @param list.of.packages Character vector of package names to check.
#' @return Invisibly, the character vector of "already installed" messages
#'   (one per package that was found; `character(0)` when none were found or
#'   when the input is empty).
install_if_not <- function(list.of.packages) {
  # system.file() returns "" when a package cannot be found. Checking each
  # name individually avoids scanning every library with installed.packages().
  is_installed <- vapply(
    list.of.packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1),
    USE.NAMES = FALSE
  )
  already.installed <- list.of.packages[is_installed]
  new.packages <- list.of.packages[!is_installed]

  if (length(new.packages) > 0) {
    install.packages(new.packages)
  }

  # Only report packages that really are installed; with an empty input there
  # is nothing to report, so nothing is printed.
  installed.msgs <- character(0)
  if (length(already.installed) > 0) {
    installed.msgs <- paste0(
      "the package '", already.installed, "' is already installed"
    )
    print(installed.msgs)
  }
  invisible(installed.msgs)
}
\(~\)
\(~\)
\(~\)
\(~\)
#### e1071 package
# Make sure e1071 (provides naiveBayes) is installed before loading it.
install_if_not("e1071")
## [1] "the package 'e1071' is already installed"
library("e1071")
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2

# Keep only the rows that have no missing value in any column.
diab_pop.no_na <- na.omit(diab_pop)

# Fix the RNG seed so the random train/test split is the same on every run.
set.seed(123)

# Row indices for the training set: p = 0.8 requests 80% of the rows,
# partitioned on the target column diq010.
trainIndex <- createDataPartition(
  diab_pop.no_na$diq010,
  p = 0.8,
  list = FALSE
)
train <- diab_pop.no_na[trainIndex, ]
test <- diab_pop.no_na[-trainIndex, ]

# Every column except the target (diq010) and the row identifier (seqn) is
# used as a predictor.
features <- colnames(train)[!(colnames(train) %in% c("diq010", "seqn"))]
features_plus <- paste0(features, collapse = " + ")
my_formula <- paste0("diq010 ~ ", features_plus)
my_formula
## [1] "diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu"

# Fit the naive Bayes model on the training rows.
model_nb <- naiveBayes(as.formula(my_formula), data = train)
summary(model_nb)
## Length Class Mode
## apriori 2 table numeric
## tables 8 -none- list
## levels 2 -none- character
## isnumeric 8 -none- logical
## call 4 -none- call

# Score the held-out rows: predicted label plus the "raw" per-class output.
class <- predict(model_nb, newdata = test)
probs <- predict(model_nb, newdata = test, type = "raw")

# The column names `class` and those of `probs` are consumed downstream.
test.scored <- cbind(test, class, probs)
#### yardstick
# dplyr supplies the %>% pipe; yardstick supplies conf_mat().
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("yardstick")
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall

# Confusion matrix of the true label (diq010) against the predicted label.
conf_mat(test.scored, truth = diq010, estimate = class)
## Truth
## Prediction Diabetes No Diabetes
## Diabetes 28 10
## No Diabetes 28 309

# Table of the classification metrics derived from that confusion matrix.
summary(conf_mat(test.scored, truth = diq010, estimate = class))
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.899
## 2 kap binary 0.540
## 3 sens binary 0.5
## 4 spec binary 0.969
## 5 ppv binary 0.737
## 6 npv binary 0.917
## 7 mcc binary 0.554
## 8 j_index binary 0.469
## 9 bal_accuracy binary 0.734
## 10 detection_prevalence binary 0.101
## 11 precision binary 0.737
## 12 recall binary 0.5
## 13 f_meas binary 0.596
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
# Read the serialized diabetes population data into `diab_pop`.
# NOTE(review): the path is absolute and machine-specific -- confirm it
# exists on the machine running this script.
diab_pop <- readRDS(
  file = "C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS"
)

#### Data dictionary: Variable - Definition - Data Type
#####   seqn     - Respondent sequence number  - Identifier
#####   riagendr - Gender                      - Categorical
#####   ridageyr - Age in years at screening   - Continuous / Numerical
#####   ridreth1 - Race/Hispanic origin        - Categorical
#####   dmdeduc2 - Education level, adults 20+ - Categorical
#####   dmdmartl - Marital status              - Categorical
#####   indhhin2 - Annual household income     - Categorical
#####   bmxbmi   - Body Mass Index (kg/m**2)   - Continuous / Numerical
#####   diq010   - Doctor diagnosed diabetes   - Categorical / Target
#####   lbxglu   - Fasting Glucose (mg/dL)     - Continuous / Numerical

# Show the compact structure (column names, types, first values) of the data.
str(object = diab_pop)
#' Install the packages that are not yet available, report the ones that are.
#'
#' @param list.of.packages Character vector of package names to check.
#' @return Invisibly, the character vector of "already installed" messages
#'   (one per package that was found; `character(0)` when none were found or
#'   when the input is empty).
install_if_not <- function(list.of.packages) {
  # system.file() returns "" when a package cannot be found. Checking each
  # name individually avoids scanning every library with installed.packages().
  is_installed <- vapply(
    list.of.packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1),
    USE.NAMES = FALSE
  )
  already.installed <- list.of.packages[is_installed]
  new.packages <- list.of.packages[!is_installed]

  if (length(new.packages) > 0) {
    install.packages(new.packages)
  }

  # Only report packages that really are installed; with an empty input there
  # is nothing to report, so nothing is printed.
  installed.msgs <- character(0)
  if (length(already.installed) > 0) {
    installed.msgs <- paste0(
      "the package '", already.installed, "' is already installed"
    )
    print(installed.msgs)
  }
  invisible(installed.msgs)
}
# Make sure e1071 (provides naiveBayes) is installed before loading it.
install_if_not("e1071")
library("e1071")
library("caret")

# Keep only the rows that have no missing value in any column.
diab_pop.no_na <- na.omit(diab_pop)

# Fix the RNG seed so the random train/test split is the same on every run.
set.seed(123)

# Row indices for the training set: p = 0.8 requests 80% of the rows,
# partitioned on the target column diq010.
trainIndex <- createDataPartition(
  diab_pop.no_na$diq010,
  p = 0.8,
  list = FALSE
)
train <- diab_pop.no_na[trainIndex, ]
test <- diab_pop.no_na[-trainIndex, ]

# Every column except the target (diq010) and the row identifier (seqn) is
# used as a predictor.
features <- colnames(train)[!(colnames(train) %in% c("diq010", "seqn"))]
features_plus <- paste0(features, collapse = " + ")
my_formula <- paste0("diq010 ~ ", features_plus)
my_formula

# Fit the naive Bayes model on the training rows.
model_nb <- naiveBayes(as.formula(my_formula), data = train)
summary(model_nb)

# Score the held-out rows: predicted label plus the "raw" per-class output.
class <- predict(model_nb, newdata = test)
probs <- predict(model_nb, newdata = test, type = "raw")

# The column names `class` and those of `probs` are consumed downstream.
test.scored <- cbind(test, class, probs)
# dplyr supplies the %>% pipe; yardstick supplies conf_mat().
library("dplyr")
library("yardstick")

# Confusion matrix of the true label (diq010) against the predicted label.
conf_mat(test.scored, truth = diq010, estimate = class)

# Table of the classification metrics derived from that confusion matrix.
summary(conf_mat(test.scored, truth = diq010, estimate = class))