options(repos = c(CRAN = "https://cloud.r-project.org"))

Install Required Packages

install.packages(c("dplyr", "ggplot2", "caret", "e1071", "randomForest", "pROC", "xgboost", "ggcorrplot"))
## Installing packages into 'C:/Users/OMPRAKASH/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
## package 'ggplot2' successfully unpacked and MD5 sums checked
## package 'caret' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'caret'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\00LOCK\caret\libs\x64\caret.dll
## to C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\caret\libs\x64\caret.dll:
## Permission denied
## Warning: restored 'caret'
## package 'e1071' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'e1071'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\00LOCK\e1071\libs\x64\e1071.dll
## to C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\e1071\libs\x64\e1071.dll:
## Permission denied
## Warning: restored 'e1071'
## package 'randomForest' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'randomForest'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\00LOCK\randomForest\libs\x64\randomForest.dll
## to
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\randomForest\libs\x64\randomForest.dll:
## Permission denied
## Warning: restored 'randomForest'
## package 'pROC' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'pROC'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\00LOCK\pROC\libs\x64\pROC.dll
## to C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\pROC\libs\x64\pROC.dll:
## Permission denied
## Warning: restored 'pROC'
## package 'xgboost' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'xgboost'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\00LOCK\xgboost\libs\x64\xgboost.dll
## to
## C:\Users\OMPRAKASH\AppData\Local\R\win-library\4.4\xgboost\libs\x64\xgboost.dll:
## Permission denied
## Warning: restored 'xgboost'
## package 'ggcorrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\OMPRAKASH\AppData\Local\Temp\Rtmp0UfCaX\downloaded_packages

Load the installed packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(e1071)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice

Load the dataset

data <- read.csv("diabetes.csv")

Inspect the dataset

head(data)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
summary(data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
str(data)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...

Check for missing values

sum(is.na(data))
## [1] 0

Normalize numerical columns (if needed)

data$Glucose <- scale(data$Glucose)
data$BMI <- scale(data$BMI)

Check for outliers using boxplots

boxplot(data$Age, main = "Age Outliers", horizontal=TRUE)

Correlation Matrix

library(ggcorrplot)
cor_matrix <- cor(data[,-ncol(data)]) # Exclude the target column
ggcorrplot(cor_matrix, method = "circle")

Glucose levels by Outcome

ggplot(data, aes(x = Outcome, y = Glucose, fill = as.factor(Outcome))) +
  geom_boxplot() +
  labs(title = "Glucose Levels by Outcome", x = "Outcome", y = "Glucose") +
  theme_minimal()

Install and load the caret package

if (!require(caret)) install.packages("caret")
library(caret)

# Split the data (70% train, 30% test)

set.seed(123)
index <- createDataPartition(data$Outcome, p = 0.7, list = FALSE)
train <- data[index, ]
test <- data[-index, ]

Logistic Regression Model

log_model <- glm(Outcome ~ ., data = train, family = "binomial")
summary(log_model)
## 
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = train)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.255871   0.511144  -2.457  0.01401 *  
## Pregnancies               0.103402   0.037988   2.722  0.00649 ** 
## Glucose                   1.144409   0.145945   7.841 4.46e-15 ***
## BloodPressure            -0.012665   0.006058  -2.091  0.03657 *  
## SkinThickness             0.003591   0.008093   0.444  0.65721    
## Insulin                  -0.001726   0.001060  -1.629  0.10335    
## BMI                       0.700192   0.141634   4.944 7.67e-07 ***
## DiabetesPedigreeFunction  0.699484   0.334854   2.089  0.03671 *  
## Age                       0.017113   0.011068   1.546  0.12206    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 695.03  on 537  degrees of freedom
## Residual deviance: 510.06  on 529  degrees of freedom
## AIC: 528.06
## 
## Number of Fisher Scoring iterations: 5

Predict and Evaluate

log_pred <- predict(log_model, test, type = "response")
log_class <- ifelse(log_pred > 0.5, 1, 0)

Confusion Matrix

conf_matrix <- table(test$Outcome, log_class)
conf_matrix
##    log_class
##       0   1
##   0 137  12
##   1  36  45

Accuracy

accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Logistic Regression Accuracy:", accuracy)
## Logistic Regression Accuracy: 0.7913043

AUC

if (!require(pROC)) install.packages("pROC")
library(pROC)

roc_curve <- roc(test$Outcome, log_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.8436
train$Outcome <- as.factor(train$Outcome)
test$Outcome <- as.factor(test$Outcome)

Train Random Forest model

if (!require(randomForest)) install.packages("randomForest")
library(randomForest)

rf_model <- randomForest(Outcome ~ ., data = train, ntree = 100, importance = TRUE)

Predict

rf_pred <- predict(rf_model, test)

Confusion Matrix

conf_matrix_rf <- table(test$Outcome, rf_pred)
print(conf_matrix_rf)
##    rf_pred
##       0   1
##   0 128  21
##   1  37  44

Accuracy

accuracy_rf <- sum(diag(conf_matrix_rf)) / sum(conf_matrix_rf)
cat("Random Forest Accuracy:", accuracy_rf)
## Random Forest Accuracy: 0.7478261

Feature Importance

importance(rf_model)
##                                   0           1 MeanDecreaseAccuracy
## Pregnancies               2.2039268 -0.04281905            1.7609905
## Glucose                  13.8313610 12.17217549           18.7840554
## BloodPressure             1.3734283  0.71549313            1.6741987
## SkinThickness             0.5786392  0.34728745            0.7102666
## Insulin                   2.5160861 -0.43223298            1.6246152
## BMI                       5.7419414  7.80288341            9.0527052
## DiabetesPedigreeFunction  2.5389156  1.41219476            2.7718543
## Age                       5.2459279  2.59371531            6.2453208
##                          MeanDecreaseGini
## Pregnancies                      19.39819
## Glucose                          60.60511
## BloodPressure                    23.00512
## SkinThickness                    17.95806
## Insulin                          18.16868
## BMI                              43.28194
## DiabetesPedigreeFunction         30.05861
## Age                              30.05910
varImpPlot(rf_model)

SVM Model

if (!require(e1071)) install.packages("e1071")
library(e1071)

svm_model <- svm(Outcome ~ ., data = train, kernel = "linear")
svm_pred <- predict(svm_model, test)

Confusion Matrix

conf_matrix_svm <- table(test$Outcome, svm_pred)
conf_matrix_svm
##    svm_pred
##       0   1
##   0 138  11
##   1  35  46

Accuracy

accuracy_svm <- sum(diag(conf_matrix_svm)) / sum(conf_matrix_svm)
cat("SVM Accuracy:",accuracy_svm)
## SVM Accuracy: 0.8

Prepare data for XGBoost

train_x <- as.matrix(train[, -ncol(train)])
train_y <- as.numeric(train$Outcome) - 1  # Convert to numeric and ensure labels are 0 or 1
test_x <- as.matrix(test[, -ncol(test)])
test_y <- as.numeric(test$Outcome) - 1    # Convert to numeric and ensure labels are 0 or 1

XGBoost Model

if (!require(xgboost)) install.packages("xgboost")
library(xgboost)

xgb_model <- xgboost(data = train_x, label = train_y, max_depth = 6, eta = 0.3, nrounds = 100, objective = "binary:logistic")
## [1]  train-logloss:0.558520 
## [2]  train-logloss:0.474518 
## [3]  train-logloss:0.413514 
## [4]  train-logloss:0.365560 
## [5]  train-logloss:0.330533 
## [6]  train-logloss:0.304606 
## [7]  train-logloss:0.281495 
## [8]  train-logloss:0.263482 
## [9]  train-logloss:0.240190 
## [10] train-logloss:0.225045 
## [11] train-logloss:0.214879 
## [12] train-logloss:0.202567 
## [13] train-logloss:0.191022 
## [14] train-logloss:0.183963 
## [15] train-logloss:0.176286 
## [16] train-logloss:0.168178 
## [17] train-logloss:0.163284 
## [18] train-logloss:0.153815 
## [19] train-logloss:0.147593 
## [20] train-logloss:0.143519 
## [21] train-logloss:0.140936 
## [22] train-logloss:0.136445 
## [23] train-logloss:0.131629 
## [24] train-logloss:0.125565 
## [25] train-logloss:0.122269 
## [26] train-logloss:0.119444 
## [27] train-logloss:0.113924 
## [28] train-logloss:0.111292 
## [29] train-logloss:0.107750 
## [30] train-logloss:0.104868 
## [31] train-logloss:0.100684 
## [32] train-logloss:0.097544 
## [33] train-logloss:0.095504 
## [34] train-logloss:0.092745 
## [35] train-logloss:0.088544 
## [36] train-logloss:0.085663 
## [37] train-logloss:0.083809 
## [38] train-logloss:0.080562 
## [39] train-logloss:0.077822 
## [40] train-logloss:0.075478 
## [41] train-logloss:0.073023 
## [42] train-logloss:0.070330 
## [43] train-logloss:0.068061 
## [44] train-logloss:0.067163 
## [45] train-logloss:0.065625 
## [46] train-logloss:0.064068 
## [47] train-logloss:0.062812 
## [48] train-logloss:0.061643 
## [49] train-logloss:0.060666 
## [50] train-logloss:0.059335 
## [51] train-logloss:0.058149 
## [52] train-logloss:0.057072 
## [53] train-logloss:0.055474 
## [54] train-logloss:0.054254 
## [55] train-logloss:0.052966 
## [56] train-logloss:0.052247 
## [57] train-logloss:0.051810 
## [58] train-logloss:0.050690 
## [59] train-logloss:0.049826 
## [60] train-logloss:0.049205 
## [61] train-logloss:0.048163 
## [62] train-logloss:0.047480 
## [63] train-logloss:0.046795 
## [64] train-logloss:0.045722 
## [65] train-logloss:0.044930 
## [66] train-logloss:0.044132 
## [67] train-logloss:0.043807 
## [68] train-logloss:0.043040 
## [69] train-logloss:0.042120 
## [70] train-logloss:0.041507 
## [71] train-logloss:0.040606 
## [72] train-logloss:0.040271 
## [73] train-logloss:0.039667 
## [74] train-logloss:0.039080 
## [75] train-logloss:0.038386 
## [76] train-logloss:0.038004 
## [77] train-logloss:0.037651 
## [78] train-logloss:0.037095 
## [79] train-logloss:0.036700 
## [80] train-logloss:0.036346 
## [81] train-logloss:0.035912 
## [82] train-logloss:0.035452 
## [83] train-logloss:0.034965 
## [84] train-logloss:0.034525 
## [85] train-logloss:0.034048 
## [86] train-logloss:0.033538 
## [87] train-logloss:0.033192 
## [88] train-logloss:0.032766 
## [89] train-logloss:0.032383 
## [90] train-logloss:0.031963 
## [91] train-logloss:0.031637 
## [92] train-logloss:0.031239 
## [93] train-logloss:0.030814 
## [94] train-logloss:0.030491 
## [95] train-logloss:0.030148 
## [96] train-logloss:0.029865 
## [97] train-logloss:0.029523 
## [98] train-logloss:0.029290 
## [99] train-logloss:0.029075 
## [100]    train-logloss:0.028823

Predict and Evaluate

xgb_pred <- predict(xgb_model, test_x)
xgb_class <- ifelse(xgb_pred > 0.5, 1, 0)

Confusion Matrix

conf_matrix_xgb <- table(test_y, xgb_class)
conf_matrix_xgb
##       xgb_class
## test_y   0   1
##      0 122  27
##      1  35  46

Accuracy

accuracy_xgb <- sum(diag(conf_matrix_xgb)) / sum(conf_matrix_xgb)
cat("XGBoost Accuracy:", accuracy_xgb)
## XGBoost Accuracy: 0.7304348

AUC

roc_curve_xgb <- roc(test_y, xgb_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve_xgb)
## Area under the curve: 0.8124

Accuracy comparison

accuracy_table <- data.frame(
  Model = c("Logistic Regression", "Random Forest", "SVM", "XGBoost"),
  Accuracy = c(accuracy, accuracy_rf, accuracy_svm, accuracy_xgb)
)
print(accuracy_table)
##                 Model  Accuracy
## 1 Logistic Regression 0.7913043
## 2       Random Forest 0.7478261
## 3                 SVM 0.8000000
## 4             XGBoost 0.7304348

Plot AUC

plot(roc_curve, col = "red", main = "ROC Curves")
plot(roc_curve_xgb, col = "blue", add = TRUE)
legend("bottomright", legend = c("Logistic", "XGBoost"), col = c("red", "blue"), lwd = 2)