Epidemiological Analysis and Machine Learning Demo

Author

OJALA BRIAN OLOO

Data Set Used: Heart Disease Data set

About Data set:

The data set is the Stat log Heart Disease data set taken from the UCI repository. The dataset consists of 270 individuals’ data. There are 14 columns in the dataset (which have been extracted from a larger set of 75). No missing values. The classification task is to predict whether an individual is suffering from heart disease or not. (0: absence, 1: presence)

Please note : This to show some of my skills in epidemiological data analysis and machine learning skills in R

# CLEAR WORKSPACE 
rm(list = ls(all.names = TRUE))
#==================================-
# Working directory
setwd("C:/Epidemiology")
#1.1 Import data
library(readxl)
heart <- read_excel("heart.xlsx",sheet = "heart")

Data processing

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(expss)

Loading required package: maditr


To select columns from data: columns(mtcars, mpg, vs:carb)


Attaching package: 'maditr'

The following objects are masked from 'package:dplyr':

    between, coalesce, first, last


Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
 To return to the console output, use 'expss_output_default()'.


Attaching package: 'expss'

The following objects are masked from 'package:dplyr':

    compute, contains, na_if, recode, vars, where

heart<-heart|>
  mutate(
    sex= factor(sex,
                levels = c(0,1),
                      labels = c("Female","Male"),
                      exclude = NA),
    cp=factor(cp,
              levels = c(0,1,2,3),
              labels = c("typical angina","atypical angina",
                         "non- anginal pain","asymptomatic"),
              exclude = NA),
    fbs=factor(fbs,
                levels = c(0,1),
                labels = c("False","True"),
                exclude = NA),
    restecg=factor(restecg,
                    levels = c(0,1,2),
                    labels = c("Normal","having ST-T wave abnormality",
                               "showing probable"),
                    exclude = NA),

    exang=factor(exang,
                  levels = c(0,1),
                  labels=c("No","Yes"),
                  exclude = NA),
    
    slope=factor(slope,
                    levels = c(0,1,2),
                    labels=c("up sloping","flat",
                             "down sloping"),
                     exclude = NA),
    ca=factor(ca,
               levels = c(0,1,2,3)),
              thal=factor(thal,
                      levels = c(0,1,2,3),
                      labels = c("NULL","normal blood flow","fixed defect",
                                 "reversible defect"),
                      exclude = NA),
              target=factor(target,
                                  levels = c(0,1),
                                  labels = c("Absent","Present")))|>
      apply_labels(
        age=    "Patients Age in years" ,
        sex=    "Gender",
        cp= "Type of chest pain",
       trestbps=    "Patient's level of BP at resting mode in mm/HG" ,
        chol=   "Serum cholesterol in mg/dl",
        fbs=    "Blood sugar levels on fasting > 120 mg/dl",
      restecg=  "Result of electrocardiogram while at rest" ,
      thalach=  "Maximum heart rate achieved" ,
     exang= "Angina induced by exercise" ,
     oldpeak=   "Exercise induced ST-depression" ,
     slope="ST segment measured in terms of slope",
     ca=    "The number of major vessels",
      thal= "A blood disorder called thalassemia",
      target=   "Heart Disease")|>
  mutate(
    Age_cat=case_when(
      age<35~1,
      age>=35 & age<40~2,
      age>=40 & age<45~3,
      age>=45 & age<50~4,
      age>=50 & age<55~5,
      age>=55 & age<60~6,
      age>=60 & age<65~7,
      age>=65~8
    ))|>
  mutate(
    Age_cat=factor(Age_cat,
                          levels = c(1,2,3,4,5,6,7,8),
                          labels = c("29-34","35-39","40-44","45-49",
                                     "50-54","55-59","60-64","65 and Above"),
                          
    ))

Selecting 8 columns for the purpose of demonstrating my skills in this analysis

df<-heart|>
  select(Age_cat,sex,ca,target,fbs,age,trestbps,chol)

# Viewing the first 10 observations
head(df,10)

# A tibble: 10 × 8
   Age_cat      sex    ca    target  fbs   age        trestbps   chol      
   <fct>        <fct>  <fct> <fct>   <fct> <labelled> <labelled> <labelled>
 1 65 and Above Male   3     Present False 70         130        322       
 2 65 and Above Female 0     Absent  False 67         115        564       
 3 55-59        Male   0     Present False 57         124        261       
 4 60-64        Male   1     Absent  False 64         128        263       
 5 65 and Above Female 1     Absent  False 74         120        269       
 6 65 and Above Male   0     Absent  False 65         120        177       
 7 55-59        Male   1     Present True  56         130        256       
 8 55-59        Male   1     Present False 59         110        239       
 9 60-64        Male   2     Present False 60         140        293       
10 60-64        Female 3     Present False 63         150        407

Study the distribution and pattern of heart disease in female patients

#Forming another data frame with only female patients
df_1<-df%>%
  select(Age_cat,sex,ca,target,fbs)%>%
  filter(sex=="Female")

Showing age group distribution among female patients using a bar graph

library(ggplot2)


Attaching package: 'ggplot2'

The following object is masked from 'package:expss':

    vars

data <- data.frame("Age" = c("29-34", "35-39","40-44","45-49","50-54","55-59","60-64","65+"),
                   "Freq" = c(1,3,10,9,15,12,21,16),
                   "Percent" = c("1.2%", "3.5%","11.5%","10.3%","17.2%","13.8%","24.1%","18.4%"))
#=====================================
ggplot(data, aes(x = Age, y = as.numeric(Freq))) + 
  geom_bar(stat = "identity", color = "black", fill = "dodgerblue1")+
  geom_text(label = with(data, paste(Freq, paste0('(', Percent, ')'))), vjust=-1) +
  ylim(0, 25)+
  labs(title = " Female Patients participated in the study",
       y= "Percentage distribution",
       x="Age group(years)")

Distribution of heart disease in female patients among age group using tables

library(table1)


Attaching package: 'table1'

The following objects are masked from 'package:base':

    units, units<-

table1(~Age_cat|target,data = df_1)

	Absent (N=67)	Present (N=20)	Overall (N=87)
Age_cat
29-34	1 (1.5%)	0 (0%)	1 (1.1%)
35-39	3 (4.5%)	0 (0%)	3 (3.4%)
40-44	9 (13.4%)	1 (5.0%)	10 (11.5%)
45-49	9 (13.4%)	0 (0%)	9 (10.3%)
50-54	14 (20.9%)	1 (5.0%)	15 (17.2%)
55-59	7 (10.4%)	5 (25.0%)	12 (13.8%)
60-64	10 (14.9%)	11 (55.0%)	21 (24.1%)
65 and Above	14 (20.9%)	2 (10.0%)	16 (18.4%)

N/B: The incidence rate of heart disease was high between 60-64 years among female patients

Study the distribution and pattern of heart disease in male patients

df_2<-df%>%
  select(Age_cat,sex,ca,target,fbs)%>%
  filter(sex=="Male")

# View the first observation
head(df_2,10)

# A tibble: 10 × 5
   Age_cat      sex   ca    target  fbs  
   <fct>        <fct> <fct> <fct>   <fct>
 1 65 and Above Male  3     Present False
 2 55-59        Male  0     Present False
 3 60-64        Male  1     Absent  False
 4 65 and Above Male  0     Absent  False
 5 55-59        Male  1     Present True 
 6 55-59        Male  1     Present False
 7 60-64        Male  2     Present False
 8 55-59        Male  0     Absent  False
 9 50-54        Male  0     Absent  False
10 40-44        Male  0     Absent  False

Presenting the age distribution in male patients using bar graph

dat <- data.frame("Age" = c("29-34", "35-39","40-44","45-49","50-54","55-59","60-64","65+"),
                   "Freq" = c(2,6,27,21,38,42,25,22),
                   "Percent" = c("1.1%", "3.3%","14.8%","11.5%","20.8%","23.0%","13.7%","12.0%"))
#=====================================
ggplot(dat, aes(x = Age, y = as.numeric(Freq))) + 
  geom_bar(stat = "identity", color = "black", fill = "green")+
  geom_text(label = with(dat, paste(Freq, paste0('(', Percent, ')'))), vjust=-1) +
  ylim(0, 50)+
  labs(title = "Male Patients participated in the study",
       y= "Percentage distribution",
       x="Age group(years)")

Distribution of heart disease in male patients

table1(~Age_cat|target,data = df_2)

	Absent (N=83)	Present (N=100)	Overall (N=183)
Age_cat
29-34	2 (2.4%)	0 (0%)	2 (1.1%)
35-39	2 (2.4%)	4 (4.0%)	6 (3.3%)
40-44	20 (24.1%)	7 (7.0%)	27 (14.8%)
45-49	10 (12.0%)	11 (11.0%)	21 (11.5%)
50-54	22 (26.5%)	16 (16.0%)	38 (20.8%)
55-59	15 (18.1%)	27 (27.0%)	42 (23.0%)
60-64	6 (7.2%)	19 (19.0%)	25 (13.7%)
65 and Above	6 (7.2%)	16 (16.0%)	22 (12.0%)

N/B: The incidence rate of heart disease was high between 55-59 years in male patients

Heart disease Analysis

# Load packages
library(gtsummary)


Attaching package: 'gtsummary'

The following objects are masked from 'package:expss':

    contains, vars, where

library(flextable)


Attaching package: 'flextable'

The following object is masked from 'package:gtsummary':

    continuous_summary

The following object is masked from 'package:expss':

    set_caption

library(officer)


Attaching package: 'officer'

The following object is masked from 'package:readxl':

    read_xlsx

library(broom)
library(gt)


Attaching package: 'gt'

The following objects are masked from 'package:expss':

    contains, gt, tab_caption, vars, where

#library(smd)
#========Drop Age_Cat from df data frame
df<-df|>
  select(-Age_cat)
#================================
Tab_1 <-df |>
  tbl_summary(
    by = target,  # Uncomment if you want group-wise summary
    statistic = list(
      all_continuous() ~ "{mean} ± {sd}",
      all_categorical() ~ "{n} ({p}%)"
    ),
    percent = "column",
    missing = "no"
  ) |>
  add_overall() |>
  add_p(pvalue_fun = ~style_pvalue(.x, digits = 2)) |>
  modify_footnote(all_stat_cols() ~ "Mean(SD)") |>
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Heart Disease **") |>
  modify_caption("Table 1:Bivariate Analysis") |>
  bold_labels() |>
  add_n() |>
  as_flex_table()
sect_properties <- prop_section(page_size = page_size(orient = "portrait"))#, width = 8.3, height = 11.7)
save_as_docx(Tab_1,path="Table1c.docx", pr_section = sect_properties)
Tab_1

			Heart Disease
Characteristic	N	Overall N = 2701	Absent N = 1501	Present N = 1201	p-value2
Gender	270				<0.001
Female		87 (32%)	67 (45%)	20 (17%)
Male		183 (68%)	83 (55%)	100 (83%)
The number of major vessels	270				<0.001
0		160 (59%)	120 (80%)	40 (33%)
1		58 (21%)	20 (13%)	38 (32%)
2		33 (12%)	7 (4.7%)	26 (22%)
3		19 (7.0%)	3 (2.0%)	16 (13%)
Blood sugar levels on fasting > 120 mg/dl	270				0.79
False		230 (85%)	127 (85%)	103 (86%)
True		40 (15%)	23 (15%)	17 (14%)
Patients Age in years	270	54 ± 9	53 ± 10	57 ± 8	<0.001
Patient's level of BP at resting mode in mm/HG	270	131 ± 18	129 ± 16	134 ± 19	0.032
Serum cholesterol in mg/dl	270	250 ± 52	244 ± 54	256 ± 48	0.008
1Mean(SD)
2Pearson's Chi-squared test; Wilcoxon rank sum test

df %>%
  select(sex,target,trestbps) %>%
  tbl_summary(by = target) %>%
  add_overall() %>%
  add_p(test = all_continuous() ~ "t.test",
        pvalue_fun = ~style_pvalue(., digits = 2))%>%
  as_gt() %>%
  gt::tab_header(
    title = gt::md("**Table 1. Characteristics of Heart Disease**"),
    subtitle = gt::md("_Highly Confidential_")
  ) %>%
  gt::tab_source_note("Data updated by Ojala on Feb 24, 2026")

Characteristic	Overall N = 270¹	Absent N = 150¹	Present N = 120¹	p-value²
Table 1. Characteristics of Heart Disease
Highly Confidential
Gender				<0.001
Female	87 (32%)	67 (45%)	20 (17%)
Male	183 (68%)	83 (55%)	100 (83%)
Patient's level of BP at resting mode in mm/HG	130 (120, 140)	130 (120, 140)	130 (120, 145)	0.012
¹ n (%); Median (Q1, Q3)
² Pearson’s Chi-squared test; Welch Two Sample t-test
Data updated by Ojala on Feb 24, 2026

Measures of Associations

#Un Adjusted Odd Ratios
Model1<-df|>
  tbl_uvregression(method = glm,y=target,
                   method.args = list(family=binomial()),
                   exponentiate = TRUE,pvalue_fun = ~style_pvalue(.x,digits = 3))|>
  modify_column_merge(pattern = "{estimate} ({ci})",rows = ! is.na(estimate))|>
  modify_header(estimate~"**OR(95%C.I)**")|>
  bold_labels()
Model1

Characteristic	N	OR(95%C.I)	95% CI	p-value
Gender	270
Female		—	—
Male		4.04 (2.30, 7.34)	2.30, 7.34	<0.001
The number of major vessels	270
0		—	—
1		5.70 (3.01, 11.1)	3.01, 11.1	<0.001
2		11.1 (4.71, 29.7)	4.71, 29.7	<0.001
3		16.0 (5.02, 71.4)	5.02, 71.4	<0.001
Blood sugar levels on fasting > 120 mg/dl	270
False		—	—
True		0.91 (0.46, 1.79)	0.46, 1.79	0.789
Patients Age in years	270	1.05 (1.02, 1.08)	1.02, 1.08	<0.001
Patient's level of BP at resting mode in mm/HG	270	1.02 (1.00, 1.03)	1.00, 1.03	0.012
Serum cholesterol in mg/dl	270	1.00 (1.00, 1.01)	1.00, 1.01	0.056
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Fitting logistic regression model to control confounder variables in model 2

#Adjusted Odds ratio
Model2<- glm(target~sex+
          fbs+ca,data =df ,family = binomial())|>
  tbl_regression(
    exponentiate = TRUE,pvalue_fun = ~style_pvalue(.x,digits = 3))|>
  modify_column_merge(pattern = "{estimate} ({ci})",rows = ! is.na(estimate))|>
  modify_header(estimate~"**OR(95%C.I)**")|>
  bold_labels()

Model2

Characteristic	OR(95%C.I)	95% CI	p-value
Gender
Female	—	—
Male	4.87 (2.50, 10.0)	2.50, 10.0	<0.001
Blood sugar levels on fasting > 120 mg/dl
False	—	—
True	0.55 (0.23, 1.23)	0.23, 1.23	0.151
The number of major vessels
0	—	—
1	5.45 (2.78, 11.1)	2.78, 11.1	<0.001
2	16.3 (6.31, 48.3)	6.31, 48.3	<0.001
3	18.6 (5.38, 89.0)	5.38, 89.0	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

#=========================================================

Merging two tables

# Merging Tables----
Table_2<-tbl_merge(
  tbls = list(Model1,Model2),
  tab_spanner = c("**Unadjusted**","**Adjusted**")
)

The number rows in the tables to be merged do not match, which may result in
rows appearing out of order.
ℹ See `tbl_merge()` (`?gtsummary::tbl_merge()`) help file for details. Use
  `quiet=TRUE` to silence message.

Table_2

Characteristic	Unadjusted				Adjusted
Characteristic	N	OR(95%C.I)	95% CI	p-value	OR(95%C.I)	95% CI	p-value
Gender	270
Female		—	—		—	—
Male		4.04 (2.30, 7.34)	2.30, 7.34	<0.001	4.87 (2.50, 10.0)	2.50, 10.0	<0.001
The number of major vessels	270
0		—	—		—	—
1		5.70 (3.01, 11.1)	3.01, 11.1	<0.001	5.45 (2.78, 11.1)	2.78, 11.1	<0.001
2		11.1 (4.71, 29.7)	4.71, 29.7	<0.001	16.3 (6.31, 48.3)	6.31, 48.3	<0.001
3		16.0 (5.02, 71.4)	5.02, 71.4	<0.001	18.6 (5.38, 89.0)	5.38, 89.0	<0.001
Blood sugar levels on fasting > 120 mg/dl	270
False		—	—		—	—
True		0.91 (0.46, 1.79)	0.46, 1.79	0.789	0.55 (0.23, 1.23)	0.23, 1.23	0.151
Patients Age in years	270	1.05 (1.02, 1.08)	1.02, 1.08	<0.001
Patient's level of BP at resting mode in mm/HG	270	1.02 (1.00, 1.03)	1.00, 1.03	0.012
Serum cholesterol in mg/dl	270	1.00 (1.00, 1.01)	1.00, 1.01	0.056
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Plotting a 95% Cl(OR) using a forest plot though it is used mostly in meta analysis

#Fitting 95%Cl( Odd ratios) using forest plot
library(finalfit)
library(kernlab)


Attaching package: 'kernlab'

The following object is masked from 'package:ggplot2':

    alpha

#===========================================
# Outcome 
dependent <- "target"

# Explanatory variables
explanatory <- c(
  "age",
  "sex",
  "trestbps",
  "fbs",
  "ca"
)
#3.2.3 Forest plot---- 
plot1=df %>% or_plot(dependent, explanatory, remove_ref = TRUE, , table_text_size = 3, title_text_size = 14,
                         dependent_label = "Predictors of heart disease", prefix = "Forest Plot: - ", suffix = "")

Waiting for profiling to be done...

Waiting for profiling to be done...
Waiting for profiling to be done...

Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_errorbarh()`).

plot1

TableGrob (2 x 2) "arrange": 3 grobs
  z     cells    name                grob
1 1 (2-2,1-1) arrange      gtable[layout]
2 2 (2-2,2-2) arrange      gtable[layout]
3 3 (1-1,1-2) arrange text[GRID.text.145]

#ggsave("forest2_mdr.png", plot = plot1, width = 10, height = 6, dpi = 300)

Diagnostic Analysis: Sensitivity, Specificity, NPV,PPV etc.

N/B: fbs: Blood sugar levels on fasting > 120 mg/dl represents as 1 in case of true and 0 as false (Nominal)

target: It is the target variable which we have to predict 1 means patient is suffering from heart disease and 0 means patient is normal.

#===Load packages
library(DescTools)


Attaching package: 'DescTools'

The following object is masked from 'package:maditr':

    %like%

library(epiDisplay)

Loading required package: foreign

Loading required package: survival


Attaching package: 'survival'

The following object is masked _by_ '.GlobalEnv':

    heart

Loading required package: MASS


Attaching package: 'MASS'

The following object is masked from 'package:gtsummary':

    select

The following object is masked from 'package:dplyr':

    select

Loading required package: nnet


Attaching package: 'epiDisplay'

The following objects are masked from 'package:kernlab':

    alpha, csi

The following object is masked from 'package:ggplot2':

    alpha

library(epiR)

Package epiR 2.0.81 is loaded

Type help(epi.about) for summary information

Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses

library(summarytools)


Attaching package: 'summarytools'

The following objects are masked from 'package:table1':

    label, label<-

#===================================
Table1<-with(heart,
             ctable(x=fbs,
                    y=target,
                    prop = "n",
                    totals = FALSE))
Table1

Cross-Tabulation  
fbs * target  
Data Frame: heart  

------- -------- -------- ---------
          target   Absent   Present
    fbs                            
  False               127       103
   True                23        17
------- -------- -------- ---------

# Form a 2 by 2 table
table1<-as.table(rbind(c(127,103),c(23,17)))
table1

    A   B
A 127 103
B  23  17

dimnames(table1)<-list(
  fbs=c("FALSE","TRUE"),
  Heartdisease=c("Absent","Present")
)
table1

       Heartdisease
fbs     Absent Present
  FALSE    127     103
  TRUE      23      17

epi.tests(table1,conf.level = 0.95)

          Outcome +    Outcome -      Total
Test +          127          103        230
Test -           23           17         40
Total           150          120        270

Point estimates and 95% CIs:
--------------------------------------------------------------
Apparent prevalence *                  0.85 (0.80, 0.89)
True prevalence *                      0.56 (0.49, 0.62)
Sensitivity *                          0.85 (0.78, 0.90)
Specificity *                          0.14 (0.08, 0.22)
Positive predictive value *            0.55 (0.49, 0.62)
Negative predictive value *            0.42 (0.27, 0.59)
Positive likelihood ratio              0.99 (0.89, 1.09)
Negative likelihood ratio              1.08 (0.61, 1.93)
False T+ proportion for true D- *      0.86 (0.78, 0.92)
False T- proportion for true D+ *      0.15 (0.10, 0.22)
False T+ proportion for T+ *           0.45 (0.38, 0.51)
False T- proportion for T- *           0.57 (0.41, 0.73)
Correctly classified proportion *      0.53 (0.47, 0.59)
--------------------------------------------------------------
* Exact CIs

#Hypothesis test of Association Using Relative Risk and Odds Ratio
epi.2by2(table1,method = "cohort.count",conf.level = 0.95)

             Outcome +    Outcome -      Total                 Inc risk *
Exposed +          127          103        230     55.22 (48.54 to 61.76)
Exposed -           23           17         40     57.50 (40.89 to 72.96)
Total              150          120        270     55.56 (49.41 to 61.58)

Point estimates and 95% CIs:
-------------------------------------------------------------------
Inc risk ratio                                 0.96 (0.72, 1.28)
Inc odds ratio                                 0.91 (0.46, 1.80)
Attrib risk in the exposed *                   -2.28 (-18.90, 14.33)
Attrib fraction in the exposed (%)            -4.13 (-39.27, 22.14)
Attrib risk in the population *                -1.94 (-18.37, 14.48)
Attrib fraction in the population (%)         -3.50 (-32.39, 19.08)
-------------------------------------------------------------------
Uncorrected chi2 test that OR = 1: chi2(1) = 0.072 Pr>chi2 = 0.789
Fisher exact test that OR = 1: Pr>chi2 = 0.864
 Wald confidence limits
 CI: confidence interval
 * Outcomes per 100 population units

PART TWO: MACHINE LEARNING

Data Set used: Birth data set

Supervised: Logistic regression

# Clear R environment
rm(list = ls())
# Set working directory
setwd("C:/Machine Learning")
# Import data set
library(readxl)
birth <- read_excel("birth.xls")
View(birth)

#Load packages
library(caret)

Loading required package: lattice


Attaching package: 'lattice'

The following object is masked from 'package:epiDisplay':

    dotplot


Attaching package: 'caret'

The following object is masked from 'package:survival':

    cluster

The following objects are masked from 'package:DescTools':

    MAE, RMSE

library(doSNOW)

Loading required package: foreach


Attaching package: 'foreach'

The following object is masked from 'package:DescTools':

    %:%

The following object is masked from 'package:expss':

    when

Loading required package: iterators

Loading required package: snow

library(xgboost)


Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice

# Data manipulation
birth<-birth%>%
  mutate(race=as.factor(race),
         smoke=as.factor(smoke),
         low=as.factor(low),
         ht=as.factor(ht),
         ui=as.factor(ui),
         ftv=as.factor(ftv))
#====drop id
Dropvar<-names(birth)%in%c("id")
birth<-birth[!Dropvar]

# Value label 
birth$low<-ordered(birth$low,levels=c(0,1),
                   labels=c("Normal","Low"))

#Splitting the data set(80/20)
library(caret)
library(doSNOW)
library(xgboost)
set.seed(543)
Indexes<-createDataPartition(birth$low,
                             times = 1,
                             p=0.7,
                             list = FALSE)
birth.train<-birth[Indexes,]
birth.test<-birth[-Indexes,]

#Examine the proportion of the birth label across the data sets
#================================================
library(summarytools)
#==============================
freq(birth$low,report.nas = FALSE)

Frequencies  
birth$low  
Type: Ordered Factor  

               Freq        %   % Cum.
------------ ------ -------- --------
      Normal    130    68.78    68.78
         Low     59    31.22   100.00
       Total    189   100.00   100.00

freq(birth.train$low,report.nas = FALSE)

Frequencies  
birth.train$low  
Type: Ordered Factor  

               Freq        %   % Cum.
------------ ------ -------- --------
      Normal     91    68.42    68.42
         Low     42    31.58   100.00
       Total    133   100.00   100.00

freq(birth.test$low,report.nas = FALSE)

Frequencies  
birth.test$low  
Type: Ordered Factor  

               Freq        %   % Cum.
------------ ------ -------- --------
      Normal     39    69.64    69.64
         Low     17    30.36   100.00
       Total     56   100.00   100.00

Setting up caret to perform 10-fold cross validation repeated 3 times and use a grid search for optimal model hyperparameter values.

train.control<-trainControl(method = "repeatedCV",
                            number = 10,
                            repeats = 3,
                            search = "grid")

Warning: `repeats` has no meaning for this resampling method.

##Then Leverage a grid search of hyper parameter for xgboost.
tune.grid<-expand.grid(eta=c(0.05,0.075,0.1),
                       nrounds=c(50,75,100),
                       max_depth=6:8,
                       min_child.weight=c(0.3,2.25,2.5),
                       colsample_bytree=c(0.3,0.4,0.5),
                       gamma=0,
                       subsample=1)

Using doSNOW package to enable caret to train in parallel

Cl<-makeCluster(10,type="SOCK")

Registering clusters so that caret will know train in parallel

registerDoSNOW(Cl)

Train the xgboost model using 10-fold cv and allowing hyparameter for grid search to train the optimal model.

caret.cv<-train(low~.,
                data = birth.train,
                method="xgbTree",
                tunegrid=tune.grid,
                trcontrol=train.control)

[13:51:06] WARNING: src/learner.cc:767: 
Parameters: { "trcontrol", "tunegrid" } are not used.

stopCluster(Cl)

Make predictions on the test set using a xgboost model prediction

pred<-predict(caret.cv,birth.test)
pred

 [1] Normal Normal Low    Low    Normal Normal Normal Normal Normal Normal
[11] Normal Normal Low    Normal Normal Normal Normal Normal Normal Normal
[21] Normal Low    Normal Normal Normal Normal Low    Normal Normal Normal
[31] Normal Normal Normal Normal Normal Normal Normal Normal Normal Low   
[41] Low    Low    Normal Normal Normal Normal Normal Low    Normal Normal
[51] Low    Normal Normal Low    Normal Normal
Levels: Normal < Low

# Examine caret's processing result
#caret.cv

Using caret’s confusionmatrix() to estimate the effectiveness of the model

confusionMatrix(pred,birth.test$low)

Confusion Matrix and Statistics

          Reference
Prediction Normal Low
    Normal     34  11
    Low         5   6
                                         
               Accuracy : 0.7143         
                 95% CI : (0.5779, 0.827)
    No Information Rate : 0.6964         
    P-Value [Acc > NIR] : 0.4498         
                                         
                  Kappa : 0.2496         
                                         
 Mcnemar's Test P-Value : 0.2113         
                                         
            Sensitivity : 0.8718         
            Specificity : 0.3529         
         Pos Pred Value : 0.7556         
         Neg Pred Value : 0.5455         
             Prevalence : 0.6964         
         Detection Rate : 0.6071         
   Detection Prevalence : 0.8036         
      Balanced Accuracy : 0.6124         
                                         
       'Positive' Class : Normal

Please note: Other supervised machine learning like SVM,Random forest, PCA, linear regression and others have not covered in this demo