# CLEAR WORKSPACE
rm(list = ls(all.names = TRUE))
#==================================-
# Working directory
setwd("C:/Epidemiology")
#1.1 Import data
library(readxl)
heart <- read_excel("heart.xlsx",sheet = "heart")Epidemiological Analysis and Machine Learning Demo
Data Set Used: Heart Disease Data set
About Data set:
The data set is the Stat log Heart Disease data set taken from the UCI repository. The dataset consists of 270 individuals’ data. There are 14 columns in the dataset (which have been extracted from a larger set of 75). No missing values. The classification task is to predict whether an individual is suffering from heart disease or not. (0: absence, 1: presence)
Please note : This to show some of my skills in epidemiological data analysis and machine learning skills in R
Data processing
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(expss)Loading required package: maditr
To select columns from data: columns(mtcars, mpg, vs:carb)
Attaching package: 'maditr'
The following objects are masked from 'package:dplyr':
between, coalesce, first, last
Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
To return to the console output, use 'expss_output_default()'.
Attaching package: 'expss'
The following objects are masked from 'package:dplyr':
compute, contains, na_if, recode, vars, where
heart<-heart|>
mutate(
sex= factor(sex,
levels = c(0,1),
labels = c("Female","Male"),
exclude = NA),
cp=factor(cp,
levels = c(0,1,2,3),
labels = c("typical angina","atypical angina",
"non- anginal pain","asymptomatic"),
exclude = NA),
fbs=factor(fbs,
levels = c(0,1),
labels = c("False","True"),
exclude = NA),
restecg=factor(restecg,
levels = c(0,1,2),
labels = c("Normal","having ST-T wave abnormality",
"showing probable"),
exclude = NA),
exang=factor(exang,
levels = c(0,1),
labels=c("No","Yes"),
exclude = NA),
slope=factor(slope,
levels = c(0,1,2),
labels=c("up sloping","flat",
"down sloping"),
exclude = NA),
ca=factor(ca,
levels = c(0,1,2,3)),
thal=factor(thal,
levels = c(0,1,2,3),
labels = c("NULL","normal blood flow","fixed defect",
"reversible defect"),
exclude = NA),
target=factor(target,
levels = c(0,1),
labels = c("Absent","Present")))|>
apply_labels(
age= "Patients Age in years" ,
sex= "Gender",
cp= "Type of chest pain",
trestbps= "Patient's level of BP at resting mode in mm/HG" ,
chol= "Serum cholesterol in mg/dl",
fbs= "Blood sugar levels on fasting > 120 mg/dl",
restecg= "Result of electrocardiogram while at rest" ,
thalach= "Maximum heart rate achieved" ,
exang= "Angina induced by exercise" ,
oldpeak= "Exercise induced ST-depression" ,
slope="ST segment measured in terms of slope",
ca= "The number of major vessels",
thal= "A blood disorder called thalassemia",
target= "Heart Disease")|>
mutate(
Age_cat=case_when(
age<35~1,
age>=35 & age<40~2,
age>=40 & age<45~3,
age>=45 & age<50~4,
age>=50 & age<55~5,
age>=55 & age<60~6,
age>=60 & age<65~7,
age>=65~8
))|>
mutate(
Age_cat=factor(Age_cat,
levels = c(1,2,3,4,5,6,7,8),
labels = c("29-34","35-39","40-44","45-49",
"50-54","55-59","60-64","65 and Above"),
))Selecting 8 columns for the purpose of demonstrating my skills in this analysis
df<-heart|>
select(Age_cat,sex,ca,target,fbs,age,trestbps,chol)# Viewing the first 10 observations
head(df,10)# A tibble: 10 × 8
Age_cat sex ca target fbs age trestbps chol
<fct> <fct> <fct> <fct> <fct> <labelled> <labelled> <labelled>
1 65 and Above Male 3 Present False 70 130 322
2 65 and Above Female 0 Absent False 67 115 564
3 55-59 Male 0 Present False 57 124 261
4 60-64 Male 1 Absent False 64 128 263
5 65 and Above Female 1 Absent False 74 120 269
6 65 and Above Male 0 Absent False 65 120 177
7 55-59 Male 1 Present True 56 130 256
8 55-59 Male 1 Present False 59 110 239
9 60-64 Male 2 Present False 60 140 293
10 60-64 Female 3 Present False 63 150 407
Study the distribution and pattern of heart disease in female patients
#Forming another data frame with only female patients
df_1<-df%>%
select(Age_cat,sex,ca,target,fbs)%>%
filter(sex=="Female")Showing age group distribution among female patients using a bar graph
library(ggplot2)
Attaching package: 'ggplot2'
The following object is masked from 'package:expss':
vars
data <- data.frame("Age" = c("29-34", "35-39","40-44","45-49","50-54","55-59","60-64","65+"),
"Freq" = c(1,3,10,9,15,12,21,16),
"Percent" = c("1.2%", "3.5%","11.5%","10.3%","17.2%","13.8%","24.1%","18.4%"))
#=====================================
ggplot(data, aes(x = Age, y = as.numeric(Freq))) +
geom_bar(stat = "identity", color = "black", fill = "dodgerblue1")+
geom_text(label = with(data, paste(Freq, paste0('(', Percent, ')'))), vjust=-1) +
ylim(0, 25)+
labs(title = " Female Patients participated in the study",
y= "Percentage distribution",
x="Age group(years)")Distribution of heart disease in female patients among age group using tables
library(table1)
Attaching package: 'table1'
The following objects are masked from 'package:base':
units, units<-
table1(~Age_cat|target,data = df_1)| Absent (N=67) |
Present (N=20) |
Overall (N=87) |
|
|---|---|---|---|
| Age_cat | |||
| 29-34 | 1 (1.5%) | 0 (0%) | 1 (1.1%) |
| 35-39 | 3 (4.5%) | 0 (0%) | 3 (3.4%) |
| 40-44 | 9 (13.4%) | 1 (5.0%) | 10 (11.5%) |
| 45-49 | 9 (13.4%) | 0 (0%) | 9 (10.3%) |
| 50-54 | 14 (20.9%) | 1 (5.0%) | 15 (17.2%) |
| 55-59 | 7 (10.4%) | 5 (25.0%) | 12 (13.8%) |
| 60-64 | 10 (14.9%) | 11 (55.0%) | 21 (24.1%) |
| 65 and Above | 14 (20.9%) | 2 (10.0%) | 16 (18.4%) |
N/B: The incidence rate of heart disease was high between 60-64 years among female patients
Study the distribution and pattern of heart disease in male patients
df_2<-df%>%
select(Age_cat,sex,ca,target,fbs)%>%
filter(sex=="Male")# View the first observation
head(df_2,10)# A tibble: 10 × 5
Age_cat sex ca target fbs
<fct> <fct> <fct> <fct> <fct>
1 65 and Above Male 3 Present False
2 55-59 Male 0 Present False
3 60-64 Male 1 Absent False
4 65 and Above Male 0 Absent False
5 55-59 Male 1 Present True
6 55-59 Male 1 Present False
7 60-64 Male 2 Present False
8 55-59 Male 0 Absent False
9 50-54 Male 0 Absent False
10 40-44 Male 0 Absent False
Presenting the age distribution in male patients using bar graph
dat <- data.frame("Age" = c("29-34", "35-39","40-44","45-49","50-54","55-59","60-64","65+"),
"Freq" = c(2,6,27,21,38,42,25,22),
"Percent" = c("1.1%", "3.3%","14.8%","11.5%","20.8%","23.0%","13.7%","12.0%"))
#=====================================
ggplot(dat, aes(x = Age, y = as.numeric(Freq))) +
geom_bar(stat = "identity", color = "black", fill = "green")+
geom_text(label = with(dat, paste(Freq, paste0('(', Percent, ')'))), vjust=-1) +
ylim(0, 50)+
labs(title = "Male Patients participated in the study",
y= "Percentage distribution",
x="Age group(years)")Distribution of heart disease in male patients
table1(~Age_cat|target,data = df_2)| Absent (N=83) |
Present (N=100) |
Overall (N=183) |
|
|---|---|---|---|
| Age_cat | |||
| 29-34 | 2 (2.4%) | 0 (0%) | 2 (1.1%) |
| 35-39 | 2 (2.4%) | 4 (4.0%) | 6 (3.3%) |
| 40-44 | 20 (24.1%) | 7 (7.0%) | 27 (14.8%) |
| 45-49 | 10 (12.0%) | 11 (11.0%) | 21 (11.5%) |
| 50-54 | 22 (26.5%) | 16 (16.0%) | 38 (20.8%) |
| 55-59 | 15 (18.1%) | 27 (27.0%) | 42 (23.0%) |
| 60-64 | 6 (7.2%) | 19 (19.0%) | 25 (13.7%) |
| 65 and Above | 6 (7.2%) | 16 (16.0%) | 22 (12.0%) |
N/B: The incidence rate of heart disease was high between 55-59 years in male patients
Heart disease Analysis
# Load packages
library(gtsummary)
Attaching package: 'gtsummary'
The following objects are masked from 'package:expss':
contains, vars, where
library(flextable)
Attaching package: 'flextable'
The following object is masked from 'package:gtsummary':
continuous_summary
The following object is masked from 'package:expss':
set_caption
library(officer)
Attaching package: 'officer'
The following object is masked from 'package:readxl':
read_xlsx
library(broom)
library(gt)
Attaching package: 'gt'
The following objects are masked from 'package:expss':
contains, gt, tab_caption, vars, where
#library(smd)
#========Drop Age_Cat from df data frame
df<-df|>
select(-Age_cat)
#================================
Tab_1 <-df |>
tbl_summary(
by = target, # Uncomment if you want group-wise summary
statistic = list(
all_continuous() ~ "{mean} ± {sd}",
all_categorical() ~ "{n} ({p}%)"
),
percent = "column",
missing = "no"
) |>
add_overall() |>
add_p(pvalue_fun = ~style_pvalue(.x, digits = 2)) |>
modify_footnote(all_stat_cols() ~ "Mean(SD)") |>
modify_spanning_header(c("stat_1", "stat_2") ~ "**Heart Disease **") |>
modify_caption("Table 1:Bivariate Analysis") |>
bold_labels() |>
add_n() |>
as_flex_table()
sect_properties <- prop_section(page_size = page_size(orient = "portrait"))#, width = 8.3, height = 11.7)
save_as_docx(Tab_1,path="Table1c.docx", pr_section = sect_properties)
Tab_1
| Heart Disease |
| |||
|---|---|---|---|---|---|
Characteristic | N | Overall | Absent | Present | p-value2 |
Gender | 270 | <0.001 | |||
Female | 87 (32%) | 67 (45%) | 20 (17%) | ||
Male | 183 (68%) | 83 (55%) | 100 (83%) | ||
The number of major vessels | 270 | <0.001 | |||
0 | 160 (59%) | 120 (80%) | 40 (33%) | ||
1 | 58 (21%) | 20 (13%) | 38 (32%) | ||
2 | 33 (12%) | 7 (4.7%) | 26 (22%) | ||
3 | 19 (7.0%) | 3 (2.0%) | 16 (13%) | ||
Blood sugar levels on fasting > 120 mg/dl | 270 | 0.79 | |||
False | 230 (85%) | 127 (85%) | 103 (86%) | ||
True | 40 (15%) | 23 (15%) | 17 (14%) | ||
Patients Age in years | 270 | 54 ± 9 | 53 ± 10 | 57 ± 8 | <0.001 |
Patient's level of BP at resting mode in mm/HG | 270 | 131 ± 18 | 129 ± 16 | 134 ± 19 | 0.032 |
Serum cholesterol in mg/dl | 270 | 250 ± 52 | 244 ± 54 | 256 ± 48 | 0.008 |
1Mean(SD) | |||||
2Pearson's Chi-squared test; Wilcoxon rank sum test | |||||
df %>%
select(sex,target,trestbps) %>%
tbl_summary(by = target) %>%
add_overall() %>%
add_p(test = all_continuous() ~ "t.test",
pvalue_fun = ~style_pvalue(., digits = 2))%>%
as_gt() %>%
gt::tab_header(
title = gt::md("**Table 1. Characteristics of Heart Disease**"),
subtitle = gt::md("_Highly Confidential_")
) %>%
gt::tab_source_note("Data updated by Ojala on Feb 24, 2026")| Table 1. Characteristics of Heart Disease | ||||
| Highly Confidential | ||||
| Characteristic | Overall N = 2701 |
Absent N = 1501 |
Present N = 1201 |
p-value2 |
|---|---|---|---|---|
| Gender | <0.001 | |||
| Female | 87 (32%) | 67 (45%) | 20 (17%) | |
| Male | 183 (68%) | 83 (55%) | 100 (83%) | |
| Patient's level of BP at resting mode in mm/HG | 130 (120, 140) | 130 (120, 140) | 130 (120, 145) | 0.012 |
| 1 n (%); Median (Q1, Q3) | ||||
| 2 Pearson’s Chi-squared test; Welch Two Sample t-test | ||||
| Data updated by Ojala on Feb 24, 2026 | ||||
Measures of Associations
#Un Adjusted Odd Ratios
Model1<-df|>
tbl_uvregression(method = glm,y=target,
method.args = list(family=binomial()),
exponentiate = TRUE,pvalue_fun = ~style_pvalue(.x,digits = 3))|>
modify_column_merge(pattern = "{estimate} ({ci})",rows = ! is.na(estimate))|>
modify_header(estimate~"**OR(95%C.I)**")|>
bold_labels()
Model1| Characteristic | N | OR(95%C.I) | 95% CI | p-value |
|---|---|---|---|---|
| Gender | 270 | |||
| Female | — | — | ||
| Male | 4.04 (2.30, 7.34) | 2.30, 7.34 | <0.001 | |
| The number of major vessels | 270 | |||
| 0 | — | — | ||
| 1 | 5.70 (3.01, 11.1) | 3.01, 11.1 | <0.001 | |
| 2 | 11.1 (4.71, 29.7) | 4.71, 29.7 | <0.001 | |
| 3 | 16.0 (5.02, 71.4) | 5.02, 71.4 | <0.001 | |
| Blood sugar levels on fasting > 120 mg/dl | 270 | |||
| False | — | — | ||
| True | 0.91 (0.46, 1.79) | 0.46, 1.79 | 0.789 | |
| Patients Age in years | 270 | 1.05 (1.02, 1.08) | 1.02, 1.08 | <0.001 |
| Patient's level of BP at resting mode in mm/HG | 270 | 1.02 (1.00, 1.03) | 1.00, 1.03 | 0.012 |
| Serum cholesterol in mg/dl | 270 | 1.00 (1.00, 1.01) | 1.00, 1.01 | 0.056 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | ||||
Fitting logistic regression model to control confounder variables in model 2
#Adjusted Odds ratio
Model2<- glm(target~sex+
fbs+ca,data =df ,family = binomial())|>
tbl_regression(
exponentiate = TRUE,pvalue_fun = ~style_pvalue(.x,digits = 3))|>
modify_column_merge(pattern = "{estimate} ({ci})",rows = ! is.na(estimate))|>
modify_header(estimate~"**OR(95%C.I)**")|>
bold_labels()
Model2| Characteristic | OR(95%C.I) | 95% CI | p-value |
|---|---|---|---|
| Gender | |||
| Female | — | — | |
| Male | 4.87 (2.50, 10.0) | 2.50, 10.0 | <0.001 |
| Blood sugar levels on fasting > 120 mg/dl | |||
| False | — | — | |
| True | 0.55 (0.23, 1.23) | 0.23, 1.23 | 0.151 |
| The number of major vessels | |||
| 0 | — | — | |
| 1 | 5.45 (2.78, 11.1) | 2.78, 11.1 | <0.001 |
| 2 | 16.3 (6.31, 48.3) | 6.31, 48.3 | <0.001 |
| 3 | 18.6 (5.38, 89.0) | 5.38, 89.0 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
#=========================================================Merging two tables
# Merging Tables----
Table_2<-tbl_merge(
tbls = list(Model1,Model2),
tab_spanner = c("**Unadjusted**","**Adjusted**")
)The number rows in the tables to be merged do not match, which may result in
rows appearing out of order.
ℹ See `tbl_merge()` (`?gtsummary::tbl_merge()`) help file for details. Use
`quiet=TRUE` to silence message.
Table_2| Characteristic |
Unadjusted
|
Adjusted
|
|||||
|---|---|---|---|---|---|---|---|
| N | OR(95%C.I) | 95% CI | p-value | OR(95%C.I) | 95% CI | p-value | |
| Gender | 270 | ||||||
| Female | — | — | — | — | |||
| Male | 4.04 (2.30, 7.34) | 2.30, 7.34 | <0.001 | 4.87 (2.50, 10.0) | 2.50, 10.0 | <0.001 | |
| The number of major vessels | 270 | ||||||
| 0 | — | — | — | — | |||
| 1 | 5.70 (3.01, 11.1) | 3.01, 11.1 | <0.001 | 5.45 (2.78, 11.1) | 2.78, 11.1 | <0.001 | |
| 2 | 11.1 (4.71, 29.7) | 4.71, 29.7 | <0.001 | 16.3 (6.31, 48.3) | 6.31, 48.3 | <0.001 | |
| 3 | 16.0 (5.02, 71.4) | 5.02, 71.4 | <0.001 | 18.6 (5.38, 89.0) | 5.38, 89.0 | <0.001 | |
| Blood sugar levels on fasting > 120 mg/dl | 270 | ||||||
| False | — | — | — | — | |||
| True | 0.91 (0.46, 1.79) | 0.46, 1.79 | 0.789 | 0.55 (0.23, 1.23) | 0.23, 1.23 | 0.151 | |
| Patients Age in years | 270 | 1.05 (1.02, 1.08) | 1.02, 1.08 | <0.001 | |||
| Patient's level of BP at resting mode in mm/HG | 270 | 1.02 (1.00, 1.03) | 1.00, 1.03 | 0.012 | |||
| Serum cholesterol in mg/dl | 270 | 1.00 (1.00, 1.01) | 1.00, 1.01 | 0.056 | |||
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||||||
Plotting a 95% Cl(OR) using a forest plot though it is used mostly in meta analysis
#Fitting 95%Cl( Odd ratios) using forest plot
library(finalfit)
library(kernlab)
Attaching package: 'kernlab'
The following object is masked from 'package:ggplot2':
alpha
#===========================================
# Outcome
dependent <- "target"
# Explanatory variables
explanatory <- c(
"age",
"sex",
"trestbps",
"fbs",
"ca"
)
#3.2.3 Forest plot----
plot1=df %>% or_plot(dependent, explanatory, remove_ref = TRUE, , table_text_size = 3, title_text_size = 14,
dependent_label = "Predictors of heart disease", prefix = "Forest Plot: - ", suffix = "")Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_errorbarh()`).
plot1TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[layout]
2 2 (2-2,2-2) arrange gtable[layout]
3 3 (1-1,1-2) arrange text[GRID.text.145]
#ggsave("forest2_mdr.png", plot = plot1, width = 10, height = 6, dpi = 300)Diagnostic Analysis: Sensitivity, Specificity, NPV,PPV etc.
N/B: fbs: Blood sugar levels on fasting > 120 mg/dl represents as 1 in case of true and 0 as false (Nominal)
target: It is the target variable which we have to predict 1 means patient is suffering from heart disease and 0 means patient is normal.
#===Load packages
library(DescTools)
Attaching package: 'DescTools'
The following object is masked from 'package:maditr':
%like%
library(epiDisplay)Loading required package: foreign
Loading required package: survival
Attaching package: 'survival'
The following object is masked _by_ '.GlobalEnv':
heart
Loading required package: MASS
Attaching package: 'MASS'
The following object is masked from 'package:gtsummary':
select
The following object is masked from 'package:dplyr':
select
Loading required package: nnet
Attaching package: 'epiDisplay'
The following objects are masked from 'package:kernlab':
alpha, csi
The following object is masked from 'package:ggplot2':
alpha
library(epiR)Package epiR 2.0.81 is loaded
Type help(epi.about) for summary information
Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
library(summarytools)
Attaching package: 'summarytools'
The following objects are masked from 'package:table1':
label, label<-
#===================================
Table1<-with(heart,
ctable(x=fbs,
y=target,
prop = "n",
totals = FALSE))
Table1Cross-Tabulation
fbs * target
Data Frame: heart
------- -------- -------- ---------
target Absent Present
fbs
False 127 103
True 23 17
------- -------- -------- ---------
# Form a 2 by 2 table
table1<-as.table(rbind(c(127,103),c(23,17)))
table1 A B
A 127 103
B 23 17
dimnames(table1)<-list(
fbs=c("FALSE","TRUE"),
Heartdisease=c("Absent","Present")
)
table1 Heartdisease
fbs Absent Present
FALSE 127 103
TRUE 23 17
epi.tests(table1,conf.level = 0.95) Outcome + Outcome - Total
Test + 127 103 230
Test - 23 17 40
Total 150 120 270
Point estimates and 95% CIs:
--------------------------------------------------------------
Apparent prevalence * 0.85 (0.80, 0.89)
True prevalence * 0.56 (0.49, 0.62)
Sensitivity * 0.85 (0.78, 0.90)
Specificity * 0.14 (0.08, 0.22)
Positive predictive value * 0.55 (0.49, 0.62)
Negative predictive value * 0.42 (0.27, 0.59)
Positive likelihood ratio 0.99 (0.89, 1.09)
Negative likelihood ratio 1.08 (0.61, 1.93)
False T+ proportion for true D- * 0.86 (0.78, 0.92)
False T- proportion for true D+ * 0.15 (0.10, 0.22)
False T+ proportion for T+ * 0.45 (0.38, 0.51)
False T- proportion for T- * 0.57 (0.41, 0.73)
Correctly classified proportion * 0.53 (0.47, 0.59)
--------------------------------------------------------------
* Exact CIs
#Hypothesis test of Association Using Relative Risk and Odds Ratio
epi.2by2(table1,method = "cohort.count",conf.level = 0.95) Outcome + Outcome - Total Inc risk *
Exposed + 127 103 230 55.22 (48.54 to 61.76)
Exposed - 23 17 40 57.50 (40.89 to 72.96)
Total 150 120 270 55.56 (49.41 to 61.58)
Point estimates and 95% CIs:
-------------------------------------------------------------------
Inc risk ratio 0.96 (0.72, 1.28)
Inc odds ratio 0.91 (0.46, 1.80)
Attrib risk in the exposed * -2.28 (-18.90, 14.33)
Attrib fraction in the exposed (%) -4.13 (-39.27, 22.14)
Attrib risk in the population * -1.94 (-18.37, 14.48)
Attrib fraction in the population (%) -3.50 (-32.39, 19.08)
-------------------------------------------------------------------
Uncorrected chi2 test that OR = 1: chi2(1) = 0.072 Pr>chi2 = 0.789
Fisher exact test that OR = 1: Pr>chi2 = 0.864
Wald confidence limits
CI: confidence interval
* Outcomes per 100 population units
PART TWO: MACHINE LEARNING
Data Set used: Birth data set
Supervised: Logistic regression
# Clear R environment
rm(list = ls())
# Set working directory
setwd("C:/Machine Learning")
# Import data set
library(readxl)
birth <- read_excel("birth.xls")
View(birth)#Load packages
library(caret)Loading required package: lattice
Attaching package: 'lattice'
The following object is masked from 'package:epiDisplay':
dotplot
Attaching package: 'caret'
The following object is masked from 'package:survival':
cluster
The following objects are masked from 'package:DescTools':
MAE, RMSE
library(doSNOW)Loading required package: foreach
Attaching package: 'foreach'
The following object is masked from 'package:DescTools':
%:%
The following object is masked from 'package:expss':
when
Loading required package: iterators
Loading required package: snow
library(xgboost)
Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':
slice
# Data manipulation
birth<-birth%>%
mutate(race=as.factor(race),
smoke=as.factor(smoke),
low=as.factor(low),
ht=as.factor(ht),
ui=as.factor(ui),
ftv=as.factor(ftv))
#====drop id
Dropvar<-names(birth)%in%c("id")
birth<-birth[!Dropvar]# Value label
birth$low<-ordered(birth$low,levels=c(0,1),
labels=c("Normal","Low"))#Splitting the data set(80/20)
library(caret)
library(doSNOW)
library(xgboost)
set.seed(543)
Indexes<-createDataPartition(birth$low,
times = 1,
p=0.7,
list = FALSE)
birth.train<-birth[Indexes,]
birth.test<-birth[-Indexes,]#Examine the proportion of the birth label across the data sets
#================================================
library(summarytools)
#==============================
freq(birth$low,report.nas = FALSE)Frequencies
birth$low
Type: Ordered Factor
Freq % % Cum.
------------ ------ -------- --------
Normal 130 68.78 68.78
Low 59 31.22 100.00
Total 189 100.00 100.00
freq(birth.train$low,report.nas = FALSE)Frequencies
birth.train$low
Type: Ordered Factor
Freq % % Cum.
------------ ------ -------- --------
Normal 91 68.42 68.42
Low 42 31.58 100.00
Total 133 100.00 100.00
freq(birth.test$low,report.nas = FALSE)Frequencies
birth.test$low
Type: Ordered Factor
Freq % % Cum.
------------ ------ -------- --------
Normal 39 69.64 69.64
Low 17 30.36 100.00
Total 56 100.00 100.00
Setting up caret to perform 10-fold cross validation repeated 3 times and use a grid search for optimal model hyperparameter values.
train.control<-trainControl(method = "repeatedCV",
number = 10,
repeats = 3,
search = "grid")Warning: `repeats` has no meaning for this resampling method.
##Then Leverage a grid search of hyper parameter for xgboost.
tune.grid<-expand.grid(eta=c(0.05,0.075,0.1),
nrounds=c(50,75,100),
max_depth=6:8,
min_child.weight=c(0.3,2.25,2.5),
colsample_bytree=c(0.3,0.4,0.5),
gamma=0,
subsample=1)Using doSNOW package to enable caret to train in parallel
Cl<-makeCluster(10,type="SOCK")Registering clusters so that caret will know train in parallel
registerDoSNOW(Cl)Train the xgboost model using 10-fold cv and allowing hyparameter for grid search to train the optimal model.
caret.cv<-train(low~.,
data = birth.train,
method="xgbTree",
tunegrid=tune.grid,
trcontrol=train.control)[13:51:06] WARNING: src/learner.cc:767:
Parameters: { "trcontrol", "tunegrid" } are not used.
stopCluster(Cl)Make predictions on the test set using a xgboost model prediction
pred<-predict(caret.cv,birth.test)
pred [1] Normal Normal Low Low Normal Normal Normal Normal Normal Normal
[11] Normal Normal Low Normal Normal Normal Normal Normal Normal Normal
[21] Normal Low Normal Normal Normal Normal Low Normal Normal Normal
[31] Normal Normal Normal Normal Normal Normal Normal Normal Normal Low
[41] Low Low Normal Normal Normal Normal Normal Low Normal Normal
[51] Low Normal Normal Low Normal Normal
Levels: Normal < Low
# Examine caret's processing result
#caret.cvUsing caret’s confusionmatrix() to estimate the effectiveness of the model
confusionMatrix(pred,birth.test$low) Confusion Matrix and Statistics
Reference
Prediction Normal Low
Normal 34 11
Low 5 6
Accuracy : 0.7143
95% CI : (0.5779, 0.827)
No Information Rate : 0.6964
P-Value [Acc > NIR] : 0.4498
Kappa : 0.2496
Mcnemar's Test P-Value : 0.2113
Sensitivity : 0.8718
Specificity : 0.3529
Pos Pred Value : 0.7556
Neg Pred Value : 0.5455
Prevalence : 0.6964
Detection Rate : 0.6071
Detection Prevalence : 0.8036
Balanced Accuracy : 0.6124
'Positive' Class : Normal
Please note: Other supervised machine learning like SVM,Random forest, PCA, linear regression and others have not covered in this demo