The project, Diabetes Prediction Analysis Using Machine Learning in R, utilized the Pima Indians Diabetes dataset from Kaggle. The dataset consists of 768 rows and 9 columns: eight predictors (pregnancies, glucose level, blood pressure, skin thickness, insulin level, BMI, diabetes pedigree function, and age) plus an outcome variable indicating the presence of diabetes. The goal was to evaluate and compare the performance of several machine learning models for diabetes prediction and to identify the key predictive features.
Exploratory Data Analysis (EDA): Analyzed feature distributions and correlations and identified the variables most strongly associated with the diabetes outcome. Standardized the Glucose and BMI features and inspected Age for outliers with a boxplot.
Model Implementation and Evaluation:
Logistic Regression: Achieved an accuracy of 79.13% and an AUC of 0.8436, indicating strong performance and discrimination capability.
Support Vector Machine (SVM): Delivered the highest accuracy of the four models (80%) using a linear kernel, making it well suited to this binary classification task.
Random Forest: Provided an accuracy of 74.78%, along with feature importance insights for interpretability.
XGBoost: Achieved an accuracy of 73.04% and an AUC of 0.8124, with room for improvement through hyperparameter tuning.
This analysis successfully demonstrates the application of machine learning techniques to predict diabetes outcomes using the Pima dataset. Logistic Regression stood out with a balance of high accuracy (79.13%) and AUC (0.8436), while SVM achieved the highest accuracy (80%). Random Forest and XGBoost offered complementary insights through feature importance analysis and robust ensemble learning techniques. The project underscores the importance of data-driven modeling in healthcare analytics and the predictive significance of features such as glucose levels and BMI in diagnosing diabetes.
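Every model below is evaluated the same way: a confusion matrix on the 30% hold-out set, accuracy as the share of correct predictions, and, where probability scores are available, the AUC from pROC. The following is a minimal sketch of that pattern wrapped in a reusable helper; the function evaluate_model and its arguments are illustrative and not part of the original script.

# Minimal sketch of the evaluation pattern used throughout (illustrative helper,
# not part of the original script): accuracy from a confusion matrix plus AUC.
evaluate_model <- function(actual, predicted_prob, threshold = 0.5) {
  predicted_class <- ifelse(predicted_prob > threshold, 1, 0)
  cm <- table(Actual = actual, Predicted = predicted_class)
  acc <- sum(diag(cm)) / sum(cm)
  auc_value <- pROC::auc(pROC::roc(actual, predicted_prob, quiet = TRUE))
  list(confusion = cm, accuracy = acc, auc = auc_value)
}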
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages(c("dplyr", "ggplot2", "caret", "e1071", "randomForest", "pROC", "xgboost", "ggcorrplot"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(e1071)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
data <- read.csv("diabetes.csv")
head(data)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
sum(is.na(data))
## [1] 0
# Standardize Glucose and BMI to z-scores
data$Glucose <- scale(data$Glucose)
data$BMI <- scale(data$BMI)
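Only Glucose and BMI are standardized above. If every numeric predictor were to be normalized in one step, a hedged sketch using dplyr (already loaded) could look like the following; data_scaled is an illustrative name and this variant is not used in the analysis.

# Illustrative alternative (not applied in this analysis):
# standardize every predictor except the Outcome column in one pass.
data_scaled <- data %>%
  mutate(across(-Outcome, ~ as.numeric(scale(.x))))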
boxplot(data$Age, main = "Age Outliers", horizontal=TRUE)
library(ggcorrplot)
cor_matrix <- cor(data[,-ncol(data)]) # Exclude the target column
ggcorrplot(cor_matrix, method = "circle")
# Glucose was standardized above, so the y-axis is in z-score units
ggplot(data, aes(x = as.factor(Outcome), y = Glucose, fill = as.factor(Outcome))) +
  geom_boxplot() +
  labs(title = "Glucose Levels by Outcome", x = "Outcome", y = "Glucose (standardized)") +
  theme_minimal()
if (!require(caret)) install.packages("caret")
library(caret)
# Split the data (70% train, 30% test)
set.seed(123)
index <- createDataPartition(data$Outcome, p = 0.7, list = FALSE)
train <- data[index, ]
test <- data[-index, ]
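createDataPartition samples so that the outcome distribution is preserved, meaning the proportion of diabetic cases in the train and test sets should stay close to the overall rate of roughly 35%. A quick sanity check, not part of the original script, would be:

# Sanity check (illustrative): class proportions should be similar in both splits
prop.table(table(train$Outcome))
prop.table(table(test$Outcome))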
log_model <- glm(Outcome ~ ., data = train, family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.255871 0.511144 -2.457 0.01401 *
## Pregnancies 0.103402 0.037988 2.722 0.00649 **
## Glucose 1.144409 0.145945 7.841 4.46e-15 ***
## BloodPressure -0.012665 0.006058 -2.091 0.03657 *
## SkinThickness 0.003591 0.008093 0.444 0.65721
## Insulin -0.001726 0.001060 -1.629 0.10335
## BMI 0.700192 0.141634 4.944 7.67e-07 ***
## DiabetesPedigreeFunction 0.699484 0.334854 2.089 0.03671 *
## Age 0.017113 0.011068 1.546 0.12206
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 695.03 on 537 degrees of freedom
## Residual deviance: 510.06 on 529 degrees of freedom
## AIC: 528.06
##
## Number of Fisher Scoring iterations: 5
log_pred <- predict(log_model, test, type = "response")
log_class <- ifelse(log_pred > 0.5, 1, 0)
conf_matrix <- table(test$Outcome, log_class)
conf_matrix
## log_class
## 0 1
## 0 137 12
## 1 36 45
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Logistic Regression Accuracy:", accuracy)
## Logistic Regression Accuracy: 0.7913043
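Because roughly 65% of the rows are non-diabetic, accuracy alone can flatter a classifier; sensitivity and specificity are worth reporting alongside it. A hedged sketch using caret's confusionMatrix (caret is already loaded; this output is not part of the original report) would be:

# Illustrative only: sensitivity and specificity for the logistic model via caret
confusionMatrix(factor(log_class, levels = c(0, 1)),
                factor(test$Outcome, levels = c(0, 1)),
                positive = "1")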
if (!require(pROC)) install.packages("pROC")
library(pROC)
roc_curve <- roc(test$Outcome, log_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.8436
train$Outcome <- as.factor(train$Outcome)
test$Outcome <- as.factor(test$Outcome)
if (!require(randomForest)) install.packages("randomForest")
library(randomForest)
rf_model <- randomForest(Outcome ~ ., data = train, ntree = 100, importance = TRUE)
rf_pred <- predict(rf_model, test)
conf_matrix_rf <- table(test$Outcome, rf_pred)
print(conf_matrix_rf)
## rf_pred
## 0 1
## 0 128 21
## 1 37 44
accuracy_rf <- sum(diag(conf_matrix_rf)) / sum(conf_matrix_rf)
cat("Random Forest Accuracy:", accuracy_rf)
## Random Forest Accuracy: 0.7478261
importance(rf_model)
## 0 1 MeanDecreaseAccuracy
## Pregnancies 2.2039268 -0.04281905 1.7609905
## Glucose 13.8313610 12.17217549 18.7840554
## BloodPressure 1.3734283 0.71549313 1.6741987
## SkinThickness 0.5786392 0.34728745 0.7102666
## Insulin 2.5160861 -0.43223298 1.6246152
## BMI 5.7419414 7.80288341 9.0527052
## DiabetesPedigreeFunction 2.5389156 1.41219476 2.7718543
## Age 5.2459279 2.59371531 6.2453208
## MeanDecreaseGini
## Pregnancies 19.39819
## Glucose 60.60511
## BloodPressure 23.00512
## SkinThickness 17.95806
## Insulin 18.16868
## BMI 43.28194
## DiabetesPedigreeFunction 30.05861
## Age 30.05910
varImpPlot(rf_model)
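The forest above uses the default mtry. A hedged sketch of searching for a better value with randomForest::tuneRF, guided by the out-of-bag error, is shown below; it was not run for the reported 74.78% result, and tuned_mtry is an illustrative name.

# Illustrative only: tune mtry via the out-of-bag error (not part of the results above)
set.seed(123)
tuned_mtry <- tuneRF(x = as.matrix(train[, -ncol(train)]), y = train$Outcome,
                     ntreeTry = 100, stepFactor = 1.5, improve = 0.01, trace = FALSE)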
if (!require(e1071)) install.packages("e1071")
library(e1071)
svm_model <- svm(Outcome ~ ., data = train, kernel = "linear")
svm_pred <- predict(svm_model, test)
conf_matrix_svm <- table(test$Outcome, svm_pred)
conf_matrix_svm
## svm_pred
## 0 1
## 0 138 11
## 1 35 46
accuracy_svm <- sum(diag(conf_matrix_svm)) / sum(conf_matrix_svm)
cat("SVM Accuracy:",accuracy_svm)
## SVM Accuracy: 0.8
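The linear-kernel SVM uses the default cost of 1. A hedged sketch of a small cross-validated grid search with e1071::tune.svm follows; it is not part of the reported 80% result.

# Illustrative only: grid-search the cost parameter with cross-validation
set.seed(123)
svm_tune <- tune.svm(Outcome ~ ., data = train, kernel = "linear",
                     cost = c(0.01, 0.1, 1, 10))
summary(svm_tune)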
train_x <- as.matrix(train[, -ncol(train)])
train_y <- as.numeric(train$Outcome) - 1 # Convert to numeric and ensure labels are 0 or 1
test_x <- as.matrix(test[, -ncol(test)])
test_y <- as.numeric(test$Outcome) - 1 # Convert to numeric and ensure labels are 0 or 1
if (!require(xgboost)) install.packages("xgboost")
library(xgboost)
xgb_model <- xgboost(data = train_x, label = train_y, max_depth = 6, eta = 0.3, nrounds = 100, objective = "binary:logistic")
## [1] train-logloss:0.558520
## [2] train-logloss:0.474518
## [3] train-logloss:0.413514
## ...
## [50] train-logloss:0.059335
## ...
## [100] train-logloss:0.028823
xgb_pred <- predict(xgb_model, test_x)
xgb_class <- ifelse(xgb_pred > 0.5, 1, 0)
conf_matrix_xgb <- table(test_y, xgb_class)
conf_matrix_xgb
## xgb_class
## test_y 0 1
## 0 122 27
## 1 35 46
accuracy_xgb <- sum(diag(conf_matrix_xgb)) / sum(conf_matrix_xgb)
cat("XGBoost Accuracy:", accuracy_xgb)
## XGBoost Accuracy: 0.7304348
roc_curve_xgb <- roc(test_y, xgb_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve_xgb)
## Area under the curve: 0.8124
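The training log above shows the log-loss falling to about 0.03 on the training data while test accuracy stays at 73%, which points to overfitting at 100 rounds. A hedged sketch of choosing nrounds by 5-fold cross-validation with early stopping via xgb.cv (not run for the reported results) would be:

# Illustrative only: pick nrounds via 5-fold CV with early stopping
dtrain <- xgb.DMatrix(data = train_x, label = train_y)
cv <- xgb.cv(params = list(objective = "binary:logistic", max_depth = 6, eta = 0.3),
             data = dtrain, nrounds = 200, nfold = 5,
             early_stopping_rounds = 10, verbose = 0)
cv$best_iteration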
accuracy_table <- data.frame(
Model = c("Logistic Regression", "Random Forest", "SVM", "XGBoost"),
Accuracy = c(accuracy, accuracy_rf, accuracy_svm, accuracy_xgb)
)
print(accuracy_table)
## Model Accuracy
## 1 Logistic Regression 0.7913043
## 2 Random Forest 0.7478261
## 3 SVM 0.8000000
## 4 XGBoost 0.7304348
plot(roc_curve, col = "red", main = "ROC Curves")
plot(roc_curve_xgb, col = "blue", add = TRUE)
legend("bottomright", legend = c("Logistic", "XGBoost"), col = c("red", "blue"), lwd = 2)