options(repos = c(CRAN = "https://cloud.r-project.org"))
Install Required Packages
install.packages(c("dplyr", "ggplot2", "caret", "e1071", "randomForest", "pROC", "xgboost", "ggcorrplot"))
Load the installed packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(e1071)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
Load the dataset
data <- read.csv("diabetes.csv")
Inspect the dataset
head(data)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
Check for missing values
sum(is.na(data))
## [1] 0
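Note: is.na() reports zero missing values, but this is the well-known Pima Indians Diabetes dataset, where zeros in Glucose, BloodPressure, SkinThickness, Insulin, and BMI are physiologically implausible and conventionally encode missing measurements. A quick count, as a sketch (the recoding is commented out and not applied in the rest of this analysis):
zero_cols <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
sapply(data[zero_cols], function(x) sum(x == 0)) # implausible zeros per column
# Optional: recode zeros to NA, then impute (e.g., with column medians)
# data[zero_cols] <- lapply(data[zero_cols], function(x) replace(x, x == 0, NA))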
Normalize selected numerical columns
data$Glucose <- as.numeric(scale(data$Glucose)) # scale() returns a 1-column matrix; as.numeric() keeps a plain vector
data$BMI <- as.numeric(scale(data$BMI))
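Caveat: scaling the full dataset before the train/test split lets test-set statistics leak into training. A leakage-free alternative (a sketch only, not applied below) fits the scaler on the training rows created in the splitting step further down, e.g. with caret::preProcess:
# Hypothetical leakage-free scaling, to be run after the split below:
# pp <- preProcess(train[, -9], method = c("center", "scale")) # fit on train only
# train <- predict(pp, train) # apply the training means/SDs to both sets
# test <- predict(pp, test)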
Check for outliers using boxplots
boxplot(data$Age, main = "Age Outliers", horizontal=TRUE)

Correlation Matrix
library(ggcorrplot)
cor_matrix <- cor(data[,-ncol(data)]) # Exclude the target column
ggcorrplot(cor_matrix, method = "circle")

Glucose levels by Outcome
ggplot(data, aes(x = as.factor(Outcome), y = Glucose, fill = as.factor(Outcome))) +
  geom_boxplot() +
  labs(title = "Glucose Levels by Outcome", x = "Outcome", y = "Glucose (scaled)") +
  theme_minimal()

Install and load the caret package
if (!require(caret)) install.packages("caret")
library(caret)
# Split the data (70% train, 30% test)
set.seed(123)
index <- createDataPartition(data$Outcome, p = 0.7, list = FALSE)
train <- data[index, ]
test <- data[-index, ]
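createDataPartition stratifies on Outcome, so both splits should show roughly the original 65/35 class balance; a quick check:
prop.table(table(train$Outcome)) # proportion of 0s and 1s in the training set
prop.table(table(test$Outcome))  # and in the test set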
Logistic Regression Model
log_model <- glm(Outcome ~ ., data = train, family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.255871 0.511144 -2.457 0.01401 *
## Pregnancies 0.103402 0.037988 2.722 0.00649 **
## Glucose 1.144409 0.145945 7.841 4.46e-15 ***
## BloodPressure -0.012665 0.006058 -2.091 0.03657 *
## SkinThickness 0.003591 0.008093 0.444 0.65721
## Insulin -0.001726 0.001060 -1.629 0.10335
## BMI 0.700192 0.141634 4.944 7.67e-07 ***
## DiabetesPedigreeFunction 0.699484 0.334854 2.089 0.03671 *
## Age 0.017113 0.011068 1.546 0.12206
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 695.03 on 537 degrees of freedom
## Residual deviance: 510.06 on 529 degrees of freedom
## AIC: 528.06
##
## Number of Fisher Scoring iterations: 5
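The coefficients are on the log-odds scale; exponentiating gives odds ratios, which are easier to read. A sketch using Wald confidence intervals via confint.default():
# Odds ratios with 95% Wald confidence intervals
exp(cbind(OR = coef(log_model), confint.default(log_model)))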
Predict and Evaluate
log_pred <- predict(log_model, test, type = "response")
log_class <- ifelse(log_pred > 0.5, 1, 0)
Confusion Matrix
conf_matrix <- table(test$Outcome, log_class)
conf_matrix
## log_class
## 0 1
## 0 137 12
## 1 36 45
Accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Logistic Regression Accuracy:", accuracy)
## Logistic Regression Accuracy: 0.7913043
AUC
if (!require(pROC)) install.packages("pROC")
library(pROC)
roc_curve <- roc(test$Outcome, log_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.8436
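The 0.5 cutoff used above is a convention rather than an optimum. pROC can suggest the threshold that maximizes Youden's J (sensitivity + specificity - 1), as a sketch:
# Threshold maximizing Youden's J, with the sensitivity/specificity it achieves
coords(roc_curve, "best", ret = c("threshold", "sensitivity", "specificity"))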
Convert Outcome to a factor so the following models treat this as classification
train$Outcome <- as.factor(train$Outcome)
test$Outcome <- as.factor(test$Outcome)
Train Random Forest model
if (!require(randomForest)) install.packages("randomForest")
library(randomForest)
rf_model <- randomForest(Outcome ~ ., data = train, ntree = 100, importance = TRUE)
Predict
rf_pred <- predict(rf_model, test)
Confusion Matrix
conf_matrix_rf <- table(test$Outcome, rf_pred)
print(conf_matrix_rf)
## rf_pred
## 0 1
## 0 128 21
## 1 37 44
Accuracy
accuracy_rf <- sum(diag(conf_matrix_rf)) / sum(conf_matrix_rf)
cat("Random Forest Accuracy:", accuracy_rf)
## Random Forest Accuracy: 0.7478261
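The forest above uses the default mtry; randomForest::tuneRF searches for a better value using out-of-bag error, as a sketch (the step size and improvement threshold are arbitrary choices):
set.seed(123)
tuneRF(train[, -ncol(train)], train$Outcome, # predictors and factor response
       ntreeTry = 100, stepFactor = 1.5, improve = 0.01)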
Feature Importance
importance(rf_model)
## 0 1 MeanDecreaseAccuracy
## Pregnancies 2.2039268 -0.04281905 1.7609905
## Glucose 13.8313610 12.17217549 18.7840554
## BloodPressure 1.3734283 0.71549313 1.6741987
## SkinThickness 0.5786392 0.34728745 0.7102666
## Insulin 2.5160861 -0.43223298 1.6246152
## BMI 5.7419414 7.80288341 9.0527052
## DiabetesPedigreeFunction 2.5389156 1.41219476 2.7718543
## Age 5.2459279 2.59371531 6.2453208
## MeanDecreaseGini
## Pregnancies 19.39819
## Glucose 60.60511
## BloodPressure 23.00512
## SkinThickness 17.95806
## Insulin 18.16868
## BMI 43.28194
## DiabetesPedigreeFunction 30.05861
## Age 30.05910
varImpPlot(rf_model)

SVM Model
if (!require(e1071)) install.packages("e1071")
library(e1071)
svm_model <- svm(Outcome ~ ., data = train, kernel = "linear")
svm_pred <- predict(svm_model, test)
Confusion Matrix
conf_matrix_svm <- table(test$Outcome, svm_pred)
conf_matrix_svm
## svm_pred
## 0 1
## 0 138 11
## 1 35 46
Accuracy
accuracy_svm <- sum(diag(conf_matrix_svm)) / sum(conf_matrix_svm)
cat("SVM Accuracy:",accuracy_svm)
## SVM Accuracy: 0.8
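The SVM above uses the default cost parameter; e1071::tune runs a cross-validated grid search, as a sketch (the grid is an arbitrary choice):
set.seed(123)
svm_tuned <- tune(svm, Outcome ~ ., data = train, kernel = "linear",
                  ranges = list(cost = 2^(-2:3))) # 10-fold CV by default
summary(svm_tuned)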
Prepare data for XGBoost
train_x <- as.matrix(train[, -ncol(train)])
train_y <- as.numeric(train$Outcome) - 1 # factor levels "0"/"1" become 1/2; subtract 1 for 0/1 labels
test_x <- as.matrix(test[, -ncol(test)])
test_y <- as.numeric(test$Outcome) - 1
XGBoost Model
if (!require(xgboost)) install.packages("xgboost")
library(xgboost)
xgb_model <- xgboost(data = train_x, label = train_y, max_depth = 6, eta = 0.3, nrounds = 100, objective = "binary:logistic")
## [1] train-logloss:0.558520
## [2] train-logloss:0.474518
## [3] train-logloss:0.413514
## [4] train-logloss:0.365560
## [5] train-logloss:0.330533
## ...
## [99] train-logloss:0.029075
## [100] train-logloss:0.028823
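Training log-loss falling to about 0.03 on 538 rows is a strong sign of overfitting. One guard is xgb.train with a watchlist and early stopping, sketched below with illustrative parameters (monitoring the test set here is itself optimistic; a separate validation split would be cleaner):
# Early stopping on held-out log-loss; max_depth/eta values are illustrative
dtrain <- xgb.DMatrix(train_x, label = train_y)
dtest <- xgb.DMatrix(test_x, label = test_y)
xgb_es <- xgb.train(params = list(objective = "binary:logistic", max_depth = 3, eta = 0.1),
                    data = dtrain, nrounds = 200,
                    watchlist = list(train = dtrain, eval = dtest),
                    early_stopping_rounds = 10, verbose = 0)
xgb_es$best_iteration # number of rounds actually worth keeping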
Predict and Evaluate
xgb_pred <- predict(xgb_model, test_x)
xgb_class <- ifelse(xgb_pred > 0.5, 1, 0)
Confusion Matrix
conf_matrix_xgb <- table(test_y, xgb_class)
conf_matrix_xgb
## xgb_class
## test_y 0 1
## 0 122 27
## 1 35 46
Accuracy
accuracy_xgb <- sum(diag(conf_matrix_xgb)) / sum(conf_matrix_xgb)
cat("XGBoost Accuracy:", accuracy_xgb)
## XGBoost Accuracy: 0.7304348
AUC
roc_curve_xgb <- roc(test_y, xgb_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve_xgb)
## Area under the curve: 0.8124
Accuracy comparison
accuracy_table <- data.frame(
  Model = c("Logistic Regression", "Random Forest", "SVM", "XGBoost"),
  Accuracy = c(accuracy, accuracy_rf, accuracy_svm, accuracy_xgb)
)
print(accuracy_table)
## Model Accuracy
## 1 Logistic Regression 0.7913043
## 2 Random Forest 0.7478261
## 3 SVM 0.8000000
## 4 XGBoost 0.7304348
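With a roughly 65/35 class split, accuracy can flatter models that under-call the positive class; all four models above miss 35 or more of the 81 diabetic cases in the test set. caret::confusionMatrix adds sensitivity and specificity, sketched here for the logistic model:
# Per-class metrics; "1" (diabetic) is taken as the positive class
confusionMatrix(as.factor(log_class), test$Outcome, positive = "1")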
Plot ROC curves
plot(roc_curve, col = "red", main = "ROC Curves")
plot(roc_curve_xgb, col = "blue", add = TRUE)
legend("bottomright", legend = c("Logistic", "XGBoost"), col = c("red", "blue"), lwd = 2)
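The random forest can be overlaid the same way by asking it for class probabilities rather than labels (extend the legend call above to three entries accordingly), as a sketch:
# Probability of class "1" from the forest, added to the same axes
rf_prob <- predict(rf_model, test, type = "prob")[, "1"]
roc_rf <- roc(test$Outcome, rf_prob)
plot(roc_rf, col = "darkgreen", add = TRUE)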
