#load libraries
library(tidyverse)
library(viridis)
library(corrplot)
library(randomForest)
library(e1071)
library(nnet)
library(caret)Fetal Health Status Prediction in R
About the Dataset
Context
Reduction of child mortality is reflected in several of the United Nations’ Sustainable Development Goals and is a key indicator of human progress.
The UN expects that by 2030, countries end preventable deaths of newborns and children under 5 years of age, with all countries aiming to reduce under‑5 mortality to at least as low as 25 per 1,000 live births.
Parallel to notion of child mortality is of course maternal mortality, which accounts for 295 000 deaths during and following pregnancy and childbirth (as of 2017). The vast majority of these deaths (94%) occurred in low-resource settings, and most could have been prevented.
In light of what was mentioned above, Cardiotocograms (CTGs) are a simple and cost accessible option to assess fetal health, allowing healthcare professionals to take action in order to prevent child and maternal mortality. The equipment itself works by sending ultrasound pulses and reading its response, thus shedding light on fetal heart rate (FHR), fetal movements, uterine contractions and more.
Data
This dataset contains 2126 records of features extracted from Cardiotocogram exams, which were then classified by three expert obstetricians into 3 classes:
Normal
Suspect
Pathological
Methodology
Four models have been used to predict the fetal health status. Their accuracy has been compared to identify the best model among the four used for prediction of fetal health status.
Code
#load data
data <- read.csv("fetal_health.csv")
glimpse(data)Rows: 2,126
Columns: 22
$ baseline.value <dbl> 120, 132, 133, …
$ accelerations <dbl> 0.000, 0.006, 0…
$ fetal_movement <dbl> 0.000, 0.000, 0…
$ uterine_contractions <dbl> 0.000, 0.006, 0…
$ light_decelerations <dbl> 0.000, 0.003, 0…
$ severe_decelerations <dbl> 0, 0, 0, 0, 0, …
$ prolongued_decelerations <dbl> 0.000, 0.000, 0…
$ abnormal_short_term_variability <dbl> 73, 17, 16, 16,…
$ mean_value_of_short_term_variability <dbl> 0.5, 2.1, 2.1, …
$ percentage_of_time_with_abnormal_long_term_variability <dbl> 43, 0, 0, 0, 0,…
$ mean_value_of_long_term_variability <dbl> 2.4, 10.4, 13.4…
$ histogram_width <dbl> 64, 130, 130, 1…
$ histogram_min <dbl> 62, 68, 68, 53,…
$ histogram_max <dbl> 126, 198, 198, …
$ histogram_number_of_peaks <dbl> 2, 6, 5, 11, 9,…
$ histogram_number_of_zeroes <dbl> 0, 1, 1, 0, 0, …
$ histogram_mode <dbl> 120, 141, 141, …
$ histogram_mean <dbl> 137, 136, 135, …
$ histogram_median <dbl> 121, 140, 138, …
$ histogram_variance <dbl> 73, 12, 13, 13,…
$ histogram_tendency <dbl> 1, 0, 0, 1, 1, …
$ fetal_health <dbl> 2, 1, 1, 1, 1, …
#converting fetal health to factor
data$fetal_health <- as.factor(data$fetal_health)
data$fetal_health <- factor(data$fetal_health, levels = c(1, 2, 3),labels = c("Normal","Suspect","Pathological"))Exploratory Data Analysis
summary(data) baseline.value accelerations fetal_movement uterine_contractions
Min. :106.0 Min. :0.000000 Min. :0.000000 Min. :0.000000
1st Qu.:126.0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.002000
Median :133.0 Median :0.002000 Median :0.000000 Median :0.004000
Mean :133.3 Mean :0.003178 Mean :0.009481 Mean :0.004366
3rd Qu.:140.0 3rd Qu.:0.006000 3rd Qu.:0.003000 3rd Qu.:0.007000
Max. :160.0 Max. :0.019000 Max. :0.481000 Max. :0.015000
light_decelerations severe_decelerations prolongued_decelerations
Min. :0.000000 Min. :0.000e+00 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.000e+00 1st Qu.:0.0000000
Median :0.000000 Median :0.000e+00 Median :0.0000000
Mean :0.001889 Mean :3.293e-06 Mean :0.0001585
3rd Qu.:0.003000 3rd Qu.:0.000e+00 3rd Qu.:0.0000000
Max. :0.015000 Max. :1.000e-03 Max. :0.0050000
abnormal_short_term_variability mean_value_of_short_term_variability
Min. :12.00 Min. :0.200
1st Qu.:32.00 1st Qu.:0.700
Median :49.00 Median :1.200
Mean :46.99 Mean :1.333
3rd Qu.:61.00 3rd Qu.:1.700
Max. :87.00 Max. :7.000
percentage_of_time_with_abnormal_long_term_variability
Min. : 0.000
1st Qu.: 0.000
Median : 0.000
Mean : 9.847
3rd Qu.:11.000
Max. :91.000
mean_value_of_long_term_variability histogram_width histogram_min
Min. : 0.000 Min. : 3.00 Min. : 50.00
1st Qu.: 4.600 1st Qu.: 37.00 1st Qu.: 67.00
Median : 7.400 Median : 67.50 Median : 93.00
Mean : 8.188 Mean : 70.45 Mean : 93.58
3rd Qu.:10.800 3rd Qu.:100.00 3rd Qu.:120.00
Max. :50.700 Max. :180.00 Max. :159.00
histogram_max histogram_number_of_peaks histogram_number_of_zeroes
Min. :122 Min. : 0.000 Min. : 0.0000
1st Qu.:152 1st Qu.: 2.000 1st Qu.: 0.0000
Median :162 Median : 3.000 Median : 0.0000
Mean :164 Mean : 4.068 Mean : 0.3236
3rd Qu.:174 3rd Qu.: 6.000 3rd Qu.: 0.0000
Max. :238 Max. :18.000 Max. :10.0000
histogram_mode histogram_mean histogram_median histogram_variance
Min. : 60.0 Min. : 73.0 Min. : 77.0 Min. : 0.00
1st Qu.:129.0 1st Qu.:125.0 1st Qu.:129.0 1st Qu.: 2.00
Median :139.0 Median :136.0 Median :139.0 Median : 7.00
Mean :137.5 Mean :134.6 Mean :138.1 Mean : 18.81
3rd Qu.:148.0 3rd Qu.:145.0 3rd Qu.:148.0 3rd Qu.: 24.00
Max. :187.0 Max. :182.0 Max. :186.0 Max. :269.00
histogram_tendency fetal_health
Min. :-1.0000 Normal :1655
1st Qu.: 0.0000 Suspect : 295
Median : 0.0000 Pathological: 176
Mean : 0.3203
3rd Qu.: 1.0000
Max. : 1.0000
sum(is.na(data))[1] 0
No NAs were found in data.
#Frequency of each health status
#getting the count and percentage of abnormal status
counts <- table(data$fetal_health)
total <- sum(counts)
percent_suspect <- (counts["Suspect"] / total) * 100
percent_pathological <- (counts["Pathological"] / total) * 100
title_text <- sprintf("Fetal Health Status was %.1f%% Suspect, %.1f%% Pathological", percent_suspect, percent_pathological)
#creating a bar plot with the values
ggplot(data, aes(x = fetal_health, fill = fetal_health)) +
geom_bar() +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
labs(title = title_text, x = NULL, y = NULL) +
theme_minimal() +
scale_fill_viridis_d() +
theme(legend.position = "none") Correlation analysis
#converting to numerical data
data$fetal_health_numeric <- as.numeric(data$fetal_health)
numeric_data <- data[sapply(data, is.numeric)]
#get the correlation values and save in a data frame
correlations <- sapply(numeric_data,
function(x) cor(x, numeric_data$fetal_health_numeric, use = "complete.obs"))
correlation_df <- data.frame(
variable = names(correlations),
correlation = correlations
)
correlation_df <- correlation_df[correlation_df$variable != "fetal_health_numeric", ]
#create a bra plot visulaizing positive and negative correlation
ggplot(correlation_df, aes(x = reorder(variable, correlation),
y = correlation,
fill = correlation)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_viridis(name = "Correlation") +
theme(legend.position = "none") +
labs(x = NULL, y = NULL,
title = "Correlation with Fetal Health")Split data into training and testing set
data <- subset(data, select = -fetal_health_numeric)
set.seed(523)
train_indices <- sample(1:nrow(data), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]Random Forest Model
#train the model
rf_model <- randomForest(fetal_health ~ .,
data = train_data, importance = TRUE)
#predict on the test set
predictions <- predict(rf_model, test_data)
#confusion matrix
rf_cf_matrix <- confusionMatrix(predictions, test_data$fetal_health)
rf_cf_matrixConfusion Matrix and Statistics
Reference
Prediction Normal Suspect Pathological
Normal 485 25 3
Suspect 5 68 5
Pathological 2 2 43
Overall Statistics
Accuracy : 0.9342
95% CI : (0.9121, 0.9521)
No Information Rate : 0.7712
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.815
Mcnemar's Test P-Value : 0.001978
Statistics by Class:
Class: Normal Class: Suspect Class: Pathological
Sensitivity 0.9858 0.7158 0.84314
Specificity 0.8082 0.9816 0.99319
Pos Pred Value 0.9454 0.8718 0.91489
Neg Pred Value 0.9440 0.9518 0.98646
Prevalence 0.7712 0.1489 0.07994
Detection Rate 0.7602 0.1066 0.06740
Detection Prevalence 0.8041 0.1223 0.07367
Balanced Accuracy 0.8970 0.8487 0.91816
Support Vector Machine Model
#train the model
svm_model <- svm(fetal_health ~ ., data = train_data, kernel = "radial")
#predict on the test set
predictions <- predict(svm_model, test_data)
#confusion matrix
svm_cf_matrix <- confusionMatrix(predictions, test_data$fetal_health)
svm_cf_matrixConfusion Matrix and Statistics
Reference
Prediction Normal Suspect Pathological
Normal 473 35 4
Suspect 16 58 9
Pathological 3 2 38
Overall Statistics
Accuracy : 0.8918
95% CI : (0.8651, 0.9149)
No Information Rate : 0.7712
P-Value [Acc > NIR] : 2.879e-15
Kappa : 0.6965
Mcnemar's Test P-Value : 0.00858
Statistics by Class:
Class: Normal Class: Suspect Class: Pathological
Sensitivity 0.9614 0.61053 0.74510
Specificity 0.7329 0.95396 0.99148
Pos Pred Value 0.9238 0.69880 0.88372
Neg Pred Value 0.8492 0.93333 0.97815
Prevalence 0.7712 0.14890 0.07994
Detection Rate 0.7414 0.09091 0.05956
Detection Prevalence 0.8025 0.13009 0.06740
Balanced Accuracy 0.8471 0.78224 0.86829
Neural Network
#train the model
nn_model <- nnet(fetal_health ~ ., data = train_data, size = 1, maxit = 200)# weights: 28
initial value 1639.919106
iter 10 value 997.581643
iter 10 value 997.581643
iter 10 value 997.581643
final value 997.581643
converged
#predict on the test set
predictions <- predict(nn_model, test_data, type = "class")
predictions <- factor(predictions, levels = levels(test_data$fetal_health))
#confusion matrix
nn_cf_matrix <- confusionMatrix(predictions,test_data$fetal_health)
nn_cf_matrixConfusion Matrix and Statistics
Reference
Prediction Normal Suspect Pathological
Normal 492 95 51
Suspect 0 0 0
Pathological 0 0 0
Overall Statistics
Accuracy : 0.7712
95% CI : (0.7366, 0.8032)
No Information Rate : 0.7712
P-Value [Acc > NIR] : 0.5222
Kappa : 0
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Normal Class: Suspect Class: Pathological
Sensitivity 1.0000 0.0000 0.00000
Specificity 0.0000 1.0000 1.00000
Pos Pred Value 0.7712 NaN NaN
Neg Pred Value NaN 0.8511 0.92006
Prevalence 0.7712 0.1489 0.07994
Detection Rate 0.7712 0.0000 0.00000
Detection Prevalence 1.0000 0.0000 0.00000
Balanced Accuracy 0.5000 0.5000 0.50000
Logistic Regression
#train the model
multinom_model <- multinom(fetal_health ~ ., data = test_data)# weights: 69 (44 variable)
initial value 700.914640
iter 10 value 319.851059
iter 20 value 306.167183
iter 30 value 197.193020
iter 40 value 169.358487
iter 50 value 167.426511
iter 60 value 166.841217
iter 70 value 159.215004
iter 80 value 148.639495
iter 90 value 147.197302
iter 100 value 147.189051
final value 147.189051
stopped after 100 iterations
#predict on the test data
predictions <- predict(multinom_model, test_data)
#confusion matrix
lr_cf_matrix <- confusionMatrix(predictions, test_data$fetal_health)
lr_cf_matrixConfusion Matrix and Statistics
Reference
Prediction Normal Suspect Pathological
Normal 470 30 3
Suspect 19 61 5
Pathological 3 4 43
Overall Statistics
Accuracy : 0.8997
95% CI : (0.8737, 0.9219)
No Information Rate : 0.7712
P-Value [Acc > NIR] : <2e-16
Kappa : 0.7259
Mcnemar's Test P-Value : 0.4609
Statistics by Class:
Class: Normal Class: Suspect Class: Pathological
Sensitivity 0.9553 0.64211 0.84314
Specificity 0.7740 0.95580 0.98807
Pos Pred Value 0.9344 0.71765 0.86000
Neg Pred Value 0.8370 0.93852 0.98639
Prevalence 0.7712 0.14890 0.07994
Detection Rate 0.7367 0.09561 0.06740
Detection Prevalence 0.7884 0.13323 0.07837
Balanced Accuracy 0.8646 0.79895 0.91561
Model Accuracy Comparison
#create data frame of all the accuracy values
accuracy_df <- data.frame(
Model = c("Random Forest", "Support Vector Machine",
"Neural Network", "Logistic Regression (Multinomial)"),
Accuracy = c(rf_cf_matrix$overall['Accuracy'], svm_cf_matrix$overall['Accuracy'],
nn_cf_matrix$overall['Accuracy'], lr_cf_matrix$overall['Accuracy'])
)
#visualize the results
ggplot(accuracy_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
scale_fill_viridis(discrete = TRUE) +
ylim(0, 1) +
ggtitle("Random Forest performed the best") +
theme(legend.position = "none") +
labs(x = NULL, y = NULL)Result: Random Forest model performed the best with 93.57% accuracy.
Citation
Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of Cardiotocograms. J Matern Fetal Med 5:311-318 (link)