Epidemiological Analysis and Machine Learning Demo

Author

OJALA BRIAN OLOO

Data Set Used: Heart Disease Data set

About Data set:

The data set is the Stat log Heart Disease data set taken from the UCI repository. The dataset consists of 270 individuals’ data. There are 14 columns in the dataset (which have been extracted from a larger set of 75). No missing values. The classification task is to predict whether an individual is suffering from heart disease or not. (0: absence, 1: presence)

Please note : This to show some of my skills in epidemiological data analysis and machine learning skills in R

# CLEAR WORKSPACE 
rm(list = ls(all.names = TRUE))
#==================================-
# Working directory
setwd("C:/Epidemiology")
#1.1 Import data
library(readxl)
heart <- read_excel("heart.xlsx",sheet = "heart")

Data processing

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(expss)
Loading required package: maditr

To select columns from data: columns(mtcars, mpg, vs:carb)

Attaching package: 'maditr'
The following objects are masked from 'package:dplyr':

    between, coalesce, first, last

Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
 To return to the console output, use 'expss_output_default()'.

Attaching package: 'expss'
The following objects are masked from 'package:dplyr':

    compute, contains, na_if, recode, vars, where
heart<-heart|>
  mutate(
    sex= factor(sex,
                levels = c(0,1),
                      labels = c("Female","Male"),
                      exclude = NA),
    cp=factor(cp,
              levels = c(0,1,2,3),
              labels = c("typical angina","atypical angina",
                         "non- anginal pain","asymptomatic"),
              exclude = NA),
    fbs=factor(fbs,
                levels = c(0,1),
                labels = c("False","True"),
                exclude = NA),
    restecg=factor(restecg,
                    levels = c(0,1,2),
                    labels = c("Normal","having ST-T wave abnormality",
                               "showing probable"),
                    exclude = NA),

    exang=factor(exang,
                  levels = c(0,1),
                  labels=c("No","Yes"),
                  exclude = NA),
    
    slope=factor(slope,
                    levels = c(0,1,2),
                    labels=c("up sloping","flat",
                             "down sloping"),
                     exclude = NA),
    ca=factor(ca,
               levels = c(0,1,2,3)),
              thal=factor(thal,
                      levels = c(0,1,2,3),
                      labels = c("NULL","normal blood flow","fixed defect",
                                 "reversible defect"),
                      exclude = NA),
              target=factor(target,
                                  levels = c(0,1),
                                  labels = c("Absent","Present")))|>
      apply_labels(
        age=    "Patients Age in years" ,
        sex=    "Gender",
        cp= "Type of chest pain",
       trestbps=    "Patient's level of BP at resting mode in mm/HG" ,
        chol=   "Serum cholesterol in mg/dl",
        fbs=    "Blood sugar levels on fasting > 120 mg/dl",
      restecg=  "Result of electrocardiogram while at rest" ,
      thalach=  "Maximum heart rate achieved" ,
     exang= "Angina induced by exercise" ,
     oldpeak=   "Exercise induced ST-depression" ,
     slope="ST segment measured in terms of slope",
     ca=    "The number of major vessels",
      thal= "A blood disorder called thalassemia",
      target=   "Heart Disease")|>
  mutate(
    Age_cat=case_when(
      age<35~1,
      age>=35 & age<40~2,
      age>=40 & age<45~3,
      age>=45 & age<50~4,
      age>=50 & age<55~5,
      age>=55 & age<60~6,
      age>=60 & age<65~7,
      age>=65~8
    ))|>
  mutate(
    Age_cat=factor(Age_cat,
                          levels = c(1,2,3,4,5,6,7,8),
                          labels = c("29-34","35-39","40-44","45-49",
                                     "50-54","55-59","60-64","65 and Above"),
                          
    ))

Selecting 8 columns for the purpose of demonstrating my skills in this analysis

df<-heart|>
  select(Age_cat,sex,ca,target,fbs,age,trestbps,chol)
# Viewing the first 10 observations
head(df,10)
# A tibble: 10 × 8
   Age_cat      sex    ca    target  fbs   age        trestbps   chol      
   <fct>        <fct>  <fct> <fct>   <fct> <labelled> <labelled> <labelled>
 1 65 and Above Male   3     Present False 70         130        322       
 2 65 and Above Female 0     Absent  False 67         115        564       
 3 55-59        Male   0     Present False 57         124        261       
 4 60-64        Male   1     Absent  False 64         128        263       
 5 65 and Above Female 1     Absent  False 74         120        269       
 6 65 and Above Male   0     Absent  False 65         120        177       
 7 55-59        Male   1     Present True  56         130        256       
 8 55-59        Male   1     Present False 59         110        239       
 9 60-64        Male   2     Present False 60         140        293       
10 60-64        Female 3     Present False 63         150        407       

Study the distribution and pattern of heart disease in female patients

#Forming another data frame with only female patients
df_1<-df%>%
  select(Age_cat,sex,ca,target,fbs)%>%
  filter(sex=="Female")

Showing age group distribution among female patients using a bar graph

library(ggplot2)

Attaching package: 'ggplot2'
The following object is masked from 'package:expss':

    vars
data <- data.frame("Age" = c("29-34", "35-39","40-44","45-49","50-54","55-59","60-64","65+"),
                   "Freq" = c(1,3,10,9,15,12,21,16),
                   "Percent" = c("1.2%", "3.5%","11.5%","10.3%","17.2%","13.8%","24.1%","18.4%"))
#=====================================
ggplot(data, aes(x = Age, y = as.numeric(Freq))) + 
  geom_bar(stat = "identity", color = "black", fill = "dodgerblue1")+
  geom_text(label = with(data, paste(Freq, paste0('(', Percent, ')'))), vjust=-1) +
  ylim(0, 25)+
  labs(title = " Female Patients participated in the study",
       y= "Percentage distribution",
       x="Age group(years)")

Distribution of heart disease in female patients among age group using tables

library(table1)

Attaching package: 'table1'
The following objects are masked from 'package:base':

    units, units<-
table1(~Age_cat|target,data = df_1)
Absent
(N=67)
Present
(N=20)
Overall
(N=87)
Age_cat
29-34 1 (1.5%) 0 (0%) 1 (1.1%)
35-39 3 (4.5%) 0 (0%) 3 (3.4%)
40-44 9 (13.4%) 1 (5.0%) 10 (11.5%)
45-49 9 (13.4%) 0 (0%) 9 (10.3%)
50-54 14 (20.9%) 1 (5.0%) 15 (17.2%)
55-59 7 (10.4%) 5 (25.0%) 12 (13.8%)
60-64 10 (14.9%) 11 (55.0%) 21 (24.1%)
65 and Above 14 (20.9%) 2 (10.0%) 16 (18.4%)

N/B: The incidence rate of heart disease was high between 60-64 years among female patients

Study the distribution and pattern of heart disease in male patients

df_2<-df%>%
  select(Age_cat,sex,ca,target,fbs)%>%
  filter(sex=="Male")
# View the first observation
head(df_2,10)
# A tibble: 10 × 5
   Age_cat      sex   ca    target  fbs  
   <fct>        <fct> <fct> <fct>   <fct>
 1 65 and Above Male  3     Present False
 2 55-59        Male  0     Present False
 3 60-64        Male  1     Absent  False
 4 65 and Above Male  0     Absent  False
 5 55-59        Male  1     Present True 
 6 55-59        Male  1     Present False
 7 60-64        Male  2     Present False
 8 55-59        Male  0     Absent  False
 9 50-54        Male  0     Absent  False
10 40-44        Male  0     Absent  False

Presenting the age distribution in male patients using bar graph

dat <- data.frame("Age" = c("29-34", "35-39","40-44","45-49","50-54","55-59","60-64","65+"),
                   "Freq" = c(2,6,27,21,38,42,25,22),
                   "Percent" = c("1.1%", "3.3%","14.8%","11.5%","20.8%","23.0%","13.7%","12.0%"))
#=====================================
ggplot(dat, aes(x = Age, y = as.numeric(Freq))) + 
  geom_bar(stat = "identity", color = "black", fill = "green")+
  geom_text(label = with(dat, paste(Freq, paste0('(', Percent, ')'))), vjust=-1) +
  ylim(0, 50)+
  labs(title = "Male Patients participated in the study",
       y= "Percentage distribution",
       x="Age group(years)")

Distribution of heart disease in male patients

table1(~Age_cat|target,data = df_2)
Absent
(N=83)
Present
(N=100)
Overall
(N=183)
Age_cat
29-34 2 (2.4%) 0 (0%) 2 (1.1%)
35-39 2 (2.4%) 4 (4.0%) 6 (3.3%)
40-44 20 (24.1%) 7 (7.0%) 27 (14.8%)
45-49 10 (12.0%) 11 (11.0%) 21 (11.5%)
50-54 22 (26.5%) 16 (16.0%) 38 (20.8%)
55-59 15 (18.1%) 27 (27.0%) 42 (23.0%)
60-64 6 (7.2%) 19 (19.0%) 25 (13.7%)
65 and Above 6 (7.2%) 16 (16.0%) 22 (12.0%)

N/B: The incidence rate of heart disease was high between 55-59 years in male patients

Heart disease Analysis

# Load packages
library(gtsummary)

Attaching package: 'gtsummary'
The following objects are masked from 'package:expss':

    contains, vars, where
library(flextable)

Attaching package: 'flextable'
The following object is masked from 'package:gtsummary':

    continuous_summary
The following object is masked from 'package:expss':

    set_caption
library(officer)

Attaching package: 'officer'
The following object is masked from 'package:readxl':

    read_xlsx
library(broom)
library(gt)

Attaching package: 'gt'
The following objects are masked from 'package:expss':

    contains, gt, tab_caption, vars, where
#library(smd)
#========Drop Age_Cat from df data frame
df<-df|>
  select(-Age_cat)
#================================
Tab_1 <-df |>
  tbl_summary(
    by = target,  # Uncomment if you want group-wise summary
    statistic = list(
      all_continuous() ~ "{mean} ± {sd}",
      all_categorical() ~ "{n} ({p}%)"
    ),
    percent = "column",
    missing = "no"
  ) |>
  add_overall() |>
  add_p(pvalue_fun = ~style_pvalue(.x, digits = 2)) |>
  modify_footnote(all_stat_cols() ~ "Mean(SD)") |>
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Heart Disease **") |>
  modify_caption("Table 1:Bivariate Analysis") |>
  bold_labels() |>
  add_n() |>
  as_flex_table()
sect_properties <- prop_section(page_size = page_size(orient = "portrait"))#, width = 8.3, height = 11.7)
save_as_docx(Tab_1,path="Table1c.docx", pr_section = sect_properties)
Tab_1

Heart Disease

Characteristic

N

Overall
N = 2701

Absent
N = 1501

Present
N = 1201

p-value2

Gender

270

<0.001

Female

87 (32%)

67 (45%)

20 (17%)

Male

183 (68%)

83 (55%)

100 (83%)

The number of major vessels

270

<0.001

0

160 (59%)

120 (80%)

40 (33%)

1

58 (21%)

20 (13%)

38 (32%)

2

33 (12%)

7 (4.7%)

26 (22%)

3

19 (7.0%)

3 (2.0%)

16 (13%)

Blood sugar levels on fasting > 120 mg/dl

270

0.79

False

230 (85%)

127 (85%)

103 (86%)

True

40 (15%)

23 (15%)

17 (14%)

Patients Age in years

270

54 ± 9

53 ± 10

57 ± 8

<0.001

Patient's level of BP at resting mode in mm/HG

270

131 ± 18

129 ± 16

134 ± 19

0.032

Serum cholesterol in mg/dl

270

250 ± 52

244 ± 54

256 ± 48

0.008

1Mean(SD)

2Pearson's Chi-squared test; Wilcoxon rank sum test

df %>%
  select(sex,target,trestbps) %>%
  tbl_summary(by = target) %>%
  add_overall() %>%
  add_p(test = all_continuous() ~ "t.test",
        pvalue_fun = ~style_pvalue(., digits = 2))%>%
  as_gt() %>%
  gt::tab_header(
    title = gt::md("**Table 1. Characteristics of Heart Disease**"),
    subtitle = gt::md("_Highly Confidential_")
  ) %>%
  gt::tab_source_note("Data updated by Ojala on Feb 24, 2026")
Table 1. Characteristics of Heart Disease
Highly Confidential
Characteristic Overall
N = 2701
Absent
N = 1501
Present
N = 1201
p-value2
Gender


<0.001
    Female 87 (32%) 67 (45%) 20 (17%)
    Male 183 (68%) 83 (55%) 100 (83%)
Patient's level of BP at resting mode in mm/HG 130 (120, 140) 130 (120, 140) 130 (120, 145) 0.012
1 n (%); Median (Q1, Q3)
2 Pearson’s Chi-squared test; Welch Two Sample t-test
Data updated by Ojala on Feb 24, 2026

Measures of Associations

#Un Adjusted Odd Ratios
Model1<-df|>
  tbl_uvregression(method = glm,y=target,
                   method.args = list(family=binomial()),
                   exponentiate = TRUE,pvalue_fun = ~style_pvalue(.x,digits = 3))|>
  modify_column_merge(pattern = "{estimate} ({ci})",rows = ! is.na(estimate))|>
  modify_header(estimate~"**OR(95%C.I)**")|>
  bold_labels()
Model1
Characteristic N OR(95%C.I) 95% CI p-value
Gender 270


    Female

    Male
4.04 (2.30, 7.34) 2.30, 7.34 <0.001
The number of major vessels 270


    0

    1
5.70 (3.01, 11.1) 3.01, 11.1 <0.001
    2
11.1 (4.71, 29.7) 4.71, 29.7 <0.001
    3
16.0 (5.02, 71.4) 5.02, 71.4 <0.001
Blood sugar levels on fasting > 120 mg/dl 270


    False

    True
0.91 (0.46, 1.79) 0.46, 1.79 0.789
Patients Age in years 270 1.05 (1.02, 1.08) 1.02, 1.08 <0.001
Patient's level of BP at resting mode in mm/HG 270 1.02 (1.00, 1.03) 1.00, 1.03 0.012
Serum cholesterol in mg/dl 270 1.00 (1.00, 1.01) 1.00, 1.01 0.056
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Fitting logistic regression model to control confounder variables in model 2

#Adjusted Odds ratio
Model2<- glm(target~sex+
          fbs+ca,data =df ,family = binomial())|>
  tbl_regression(
    exponentiate = TRUE,pvalue_fun = ~style_pvalue(.x,digits = 3))|>
  modify_column_merge(pattern = "{estimate} ({ci})",rows = ! is.na(estimate))|>
  modify_header(estimate~"**OR(95%C.I)**")|>
  bold_labels()

Model2
Characteristic OR(95%C.I) 95% CI p-value
Gender


    Female
    Male 4.87 (2.50, 10.0) 2.50, 10.0 <0.001
Blood sugar levels on fasting > 120 mg/dl


    False
    True 0.55 (0.23, 1.23) 0.23, 1.23 0.151
The number of major vessels


    0
    1 5.45 (2.78, 11.1) 2.78, 11.1 <0.001
    2 16.3 (6.31, 48.3) 6.31, 48.3 <0.001
    3 18.6 (5.38, 89.0) 5.38, 89.0 <0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio
#=========================================================

Merging two tables

# Merging Tables----
Table_2<-tbl_merge(
  tbls = list(Model1,Model2),
  tab_spanner = c("**Unadjusted**","**Adjusted**")
)
The number rows in the tables to be merged do not match, which may result in
rows appearing out of order.
ℹ See `tbl_merge()` (`?gtsummary::tbl_merge()`) help file for details. Use
  `quiet=TRUE` to silence message.
Table_2
Characteristic
Unadjusted
Adjusted
N OR(95%C.I) 95% CI p-value OR(95%C.I) 95% CI p-value
Gender 270





    Female


    Male
4.04 (2.30, 7.34) 2.30, 7.34 <0.001 4.87 (2.50, 10.0) 2.50, 10.0 <0.001
The number of major vessels 270





    0


    1
5.70 (3.01, 11.1) 3.01, 11.1 <0.001 5.45 (2.78, 11.1) 2.78, 11.1 <0.001
    2
11.1 (4.71, 29.7) 4.71, 29.7 <0.001 16.3 (6.31, 48.3) 6.31, 48.3 <0.001
    3
16.0 (5.02, 71.4) 5.02, 71.4 <0.001 18.6 (5.38, 89.0) 5.38, 89.0 <0.001
Blood sugar levels on fasting > 120 mg/dl 270





    False


    True
0.91 (0.46, 1.79) 0.46, 1.79 0.789 0.55 (0.23, 1.23) 0.23, 1.23 0.151
Patients Age in years 270 1.05 (1.02, 1.08) 1.02, 1.08 <0.001


Patient's level of BP at resting mode in mm/HG 270 1.02 (1.00, 1.03) 1.00, 1.03 0.012


Serum cholesterol in mg/dl 270 1.00 (1.00, 1.01) 1.00, 1.01 0.056


Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Plotting a 95% Cl(OR) using a forest plot though it is used mostly in meta analysis

#Fitting 95%Cl( Odd ratios) using forest plot
library(finalfit)
library(kernlab)

Attaching package: 'kernlab'
The following object is masked from 'package:ggplot2':

    alpha
#===========================================
# Outcome 
dependent <- "target"

# Explanatory variables
explanatory <- c(
  "age",
  "sex",
  "trestbps",
  "fbs",
  "ca"
)
#3.2.3 Forest plot---- 
plot1=df %>% or_plot(dependent, explanatory, remove_ref = TRUE, , table_text_size = 3, title_text_size = 14,
                         dependent_label = "Predictors of heart disease", prefix = "Forest Plot: - ", suffix = "")
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_errorbarh()`).

plot1
TableGrob (2 x 2) "arrange": 3 grobs
  z     cells    name                grob
1 1 (2-2,1-1) arrange      gtable[layout]
2 2 (2-2,2-2) arrange      gtable[layout]
3 3 (1-1,1-2) arrange text[GRID.text.145]
#ggsave("forest2_mdr.png", plot = plot1, width = 10, height = 6, dpi = 300)

Diagnostic Analysis: Sensitivity, Specificity, NPV,PPV etc.

N/B: fbs: Blood sugar levels on fasting > 120 mg/dl represents as 1 in case of true and 0 as false (Nominal)

target: It is the target variable which we have to predict 1 means patient is suffering from heart disease and 0 means patient is normal.

#===Load packages
library(DescTools)

Attaching package: 'DescTools'
The following object is masked from 'package:maditr':

    %like%
library(epiDisplay)
Loading required package: foreign
Loading required package: survival

Attaching package: 'survival'
The following object is masked _by_ '.GlobalEnv':

    heart
Loading required package: MASS

Attaching package: 'MASS'
The following object is masked from 'package:gtsummary':

    select
The following object is masked from 'package:dplyr':

    select
Loading required package: nnet

Attaching package: 'epiDisplay'
The following objects are masked from 'package:kernlab':

    alpha, csi
The following object is masked from 'package:ggplot2':

    alpha
library(epiR)
Package epiR 2.0.81 is loaded
Type help(epi.about) for summary information
Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
library(summarytools)

Attaching package: 'summarytools'
The following objects are masked from 'package:table1':

    label, label<-
#===================================
Table1<-with(heart,
             ctable(x=fbs,
                    y=target,
                    prop = "n",
                    totals = FALSE))
Table1
Cross-Tabulation  
fbs * target  
Data Frame: heart  

------- -------- -------- ---------
          target   Absent   Present
    fbs                            
  False               127       103
   True                23        17
------- -------- -------- ---------
# Form a 2 by 2 table
table1<-as.table(rbind(c(127,103),c(23,17)))
table1
    A   B
A 127 103
B  23  17
dimnames(table1)<-list(
  fbs=c("FALSE","TRUE"),
  Heartdisease=c("Absent","Present")
)
table1
       Heartdisease
fbs     Absent Present
  FALSE    127     103
  TRUE      23      17
epi.tests(table1,conf.level = 0.95)
          Outcome +    Outcome -      Total
Test +          127          103        230
Test -           23           17         40
Total           150          120        270

Point estimates and 95% CIs:
--------------------------------------------------------------
Apparent prevalence *                  0.85 (0.80, 0.89)
True prevalence *                      0.56 (0.49, 0.62)
Sensitivity *                          0.85 (0.78, 0.90)
Specificity *                          0.14 (0.08, 0.22)
Positive predictive value *            0.55 (0.49, 0.62)
Negative predictive value *            0.42 (0.27, 0.59)
Positive likelihood ratio              0.99 (0.89, 1.09)
Negative likelihood ratio              1.08 (0.61, 1.93)
False T+ proportion for true D- *      0.86 (0.78, 0.92)
False T- proportion for true D+ *      0.15 (0.10, 0.22)
False T+ proportion for T+ *           0.45 (0.38, 0.51)
False T- proportion for T- *           0.57 (0.41, 0.73)
Correctly classified proportion *      0.53 (0.47, 0.59)
--------------------------------------------------------------
* Exact CIs
#Hypothesis test of Association Using Relative Risk and Odds Ratio
epi.2by2(table1,method = "cohort.count",conf.level = 0.95)
             Outcome +    Outcome -      Total                 Inc risk *
Exposed +          127          103        230     55.22 (48.54 to 61.76)
Exposed -           23           17         40     57.50 (40.89 to 72.96)
Total              150          120        270     55.56 (49.41 to 61.58)

Point estimates and 95% CIs:
-------------------------------------------------------------------
Inc risk ratio                                 0.96 (0.72, 1.28)
Inc odds ratio                                 0.91 (0.46, 1.80)
Attrib risk in the exposed *                   -2.28 (-18.90, 14.33)
Attrib fraction in the exposed (%)            -4.13 (-39.27, 22.14)
Attrib risk in the population *                -1.94 (-18.37, 14.48)
Attrib fraction in the population (%)         -3.50 (-32.39, 19.08)
-------------------------------------------------------------------
Uncorrected chi2 test that OR = 1: chi2(1) = 0.072 Pr>chi2 = 0.789
Fisher exact test that OR = 1: Pr>chi2 = 0.864
 Wald confidence limits
 CI: confidence interval
 * Outcomes per 100 population units 

PART TWO: MACHINE LEARNING

Data Set used: Birth data set

Supervised: Logistic regression

# Clear R environment
rm(list = ls())
# Set working directory
setwd("C:/Machine Learning")
# Import data set
library(readxl)
birth <- read_excel("birth.xls")
View(birth)
#Load packages
library(caret)
Loading required package: lattice

Attaching package: 'lattice'
The following object is masked from 'package:epiDisplay':

    dotplot

Attaching package: 'caret'
The following object is masked from 'package:survival':

    cluster
The following objects are masked from 'package:DescTools':

    MAE, RMSE
library(doSNOW)
Loading required package: foreach

Attaching package: 'foreach'
The following object is masked from 'package:DescTools':

    %:%
The following object is masked from 'package:expss':

    when
Loading required package: iterators
Loading required package: snow
library(xgboost)

Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':

    slice
# Data manipulation
birth<-birth%>%
  mutate(race=as.factor(race),
         smoke=as.factor(smoke),
         low=as.factor(low),
         ht=as.factor(ht),
         ui=as.factor(ui),
         ftv=as.factor(ftv))
#====drop id
Dropvar<-names(birth)%in%c("id")
birth<-birth[!Dropvar]
# Value label 
birth$low<-ordered(birth$low,levels=c(0,1),
                   labels=c("Normal","Low"))
#Splitting the data set(80/20)
library(caret)
library(doSNOW)
library(xgboost)
set.seed(543)
Indexes<-createDataPartition(birth$low,
                             times = 1,
                             p=0.7,
                             list = FALSE)
birth.train<-birth[Indexes,]
birth.test<-birth[-Indexes,]
#Examine the proportion of the birth label across the data sets
#================================================
library(summarytools)
#==============================
freq(birth$low,report.nas = FALSE)
Frequencies  
birth$low  
Type: Ordered Factor  

               Freq        %   % Cum.
------------ ------ -------- --------
      Normal    130    68.78    68.78
         Low     59    31.22   100.00
       Total    189   100.00   100.00
freq(birth.train$low,report.nas = FALSE)
Frequencies  
birth.train$low  
Type: Ordered Factor  

               Freq        %   % Cum.
------------ ------ -------- --------
      Normal     91    68.42    68.42
         Low     42    31.58   100.00
       Total    133   100.00   100.00
freq(birth.test$low,report.nas = FALSE)
Frequencies  
birth.test$low  
Type: Ordered Factor  

               Freq        %   % Cum.
------------ ------ -------- --------
      Normal     39    69.64    69.64
         Low     17    30.36   100.00
       Total     56   100.00   100.00

Setting up caret to perform 10-fold cross validation repeated 3 times and use a grid search for optimal model hyperparameter values.

train.control<-trainControl(method = "repeatedCV",
                            number = 10,
                            repeats = 3,
                            search = "grid")
Warning: `repeats` has no meaning for this resampling method.
##Then Leverage a grid search of hyper parameter for xgboost.
tune.grid<-expand.grid(eta=c(0.05,0.075,0.1),
                       nrounds=c(50,75,100),
                       max_depth=6:8,
                       min_child.weight=c(0.3,2.25,2.5),
                       colsample_bytree=c(0.3,0.4,0.5),
                       gamma=0,
                       subsample=1)

Using doSNOW package to enable caret to train in parallel

Cl<-makeCluster(10,type="SOCK")

Registering clusters so that caret will know train in parallel

registerDoSNOW(Cl)

Train the xgboost model using 10-fold cv and allowing hyparameter for grid search to train the optimal model.

caret.cv<-train(low~.,
                data = birth.train,
                method="xgbTree",
                tunegrid=tune.grid,
                trcontrol=train.control)
[13:51:06] WARNING: src/learner.cc:767: 
Parameters: { "trcontrol", "tunegrid" } are not used.
stopCluster(Cl)

Make predictions on the test set using a xgboost model prediction

pred<-predict(caret.cv,birth.test)
pred
 [1] Normal Normal Low    Low    Normal Normal Normal Normal Normal Normal
[11] Normal Normal Low    Normal Normal Normal Normal Normal Normal Normal
[21] Normal Low    Normal Normal Normal Normal Low    Normal Normal Normal
[31] Normal Normal Normal Normal Normal Normal Normal Normal Normal Low   
[41] Low    Low    Normal Normal Normal Normal Normal Low    Normal Normal
[51] Low    Normal Normal Low    Normal Normal
Levels: Normal < Low
# Examine caret's processing result
#caret.cv

Using caret’s confusionmatrix() to estimate the effectiveness of the model

confusionMatrix(pred,birth.test$low) 
Confusion Matrix and Statistics

          Reference
Prediction Normal Low
    Normal     34  11
    Low         5   6
                                         
               Accuracy : 0.7143         
                 95% CI : (0.5779, 0.827)
    No Information Rate : 0.6964         
    P-Value [Acc > NIR] : 0.4498         
                                         
                  Kappa : 0.2496         
                                         
 Mcnemar's Test P-Value : 0.2113         
                                         
            Sensitivity : 0.8718         
            Specificity : 0.3529         
         Pos Pred Value : 0.7556         
         Neg Pred Value : 0.5455         
             Prevalence : 0.6964         
         Detection Rate : 0.6071         
   Detection Prevalence : 0.8036         
      Balanced Accuracy : 0.6124         
                                         
       'Positive' Class : Normal         
                                         

Please note: Other supervised machine learning like SVM,Random forest, PCA, linear regression and others have not covered in this demo