if (!requireNamespace("needs", quietly = TRUE)) install.packages("needs")
library(needs)
needs( shiny, readxl, ggplot2, dplyr, corrplot, gridExtra, rpart.plot, e1071, mice, DMwR2, pROC, caTools, caret, doMC )
if (!requireNamespace("doMC", quietly = TRUE)) install.packages("doMC")
library(doMC)
registerDoMC(cores = detectCores() - 1)
df <- read.csv("diabetes.csv")
str(df)
'data.frame': 768 obs. of 9 variables:
$ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
$ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
$ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
$ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
$ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
$ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
$ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
$ Age : int 50 31 32 21 33 30 26 29 53 54 ...
$ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
df$Outcome <- factor(make.names(df$Outcome))
summary(df)
Pregnancies Glucose BloodPressure SkinThickness
Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
Insulin BMI DiabetesPedigreeFunction Age
Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
Outcome
X0:500
X1:268
biological_df <- df[,setdiff(names(df), c('Outcome', 'Pregnancies'))]
features_with_missing_values <- apply(biological_df, 2, function(x) sum(x<=0))
features_miss <- names(biological_df)[ features_with_missing_values > 0]
features_with_missing_values
Glucose BloodPressure SkinThickness
5 35 227
Insulin BMI DiabetesPedigreeFunction
374 11 0
Age
0
rows_with_errors <- apply(biological_df, 1, function(x) sum(x<=0)>1)
cat("Number of rows affected by this problem:", sum(rows_with_errors), "\n")
Number of rows affected by this problem: 234
cat("Total rows in dataset affected by this :", (sum(rows_with_errors)/nrow(df) * 100), "%", "\n")
Total rows in dataset affected by this : 30.46875 %
biological_df[biological_df<=0] <- NA
df[, names(biological_df)] <- biological_df
df_original <- df
df[,-9] <- knnImputation(df[,-9], k=5)
cat("Performed KNN imputation with k=5 on all columns except the 9th column.\n")
Performed KNN imputation with k=5 on all columns except the 9th column.
cat("Outcome Variable proportion", "\n")
Outcome Variable proportion
prop.table(table(df$Outcome))
X0 X1
0.6510417 0.3489583
correlation_between_variables <- cor(df[, setdiff(names(df), 'Outcome')])
cat("Calculated the correlation matrix for variables excluding 'Outcome':\n")
Calculated the correlation matrix for variables excluding 'Outcome':
print(correlation_between_variables)
Pregnancies Glucose BloodPressure SkinThickness
Pregnancies 1.00000000 0.1297037 0.210406811 0.1325765
Glucose 0.12970373 1.0000000 0.230511935 0.2270679
BloodPressure 0.21040681 0.2305119 1.000000000 0.2220024
SkinThickness 0.13257655 0.2270679 0.222002442 1.0000000
Insulin 0.08960516 0.6016978 0.151431318 0.2430054
BMI 0.02313687 0.2355170 0.293127018 0.6635971
DiabetesPedigreeFunction -0.03352267 0.1376799 -0.002122848 0.1355200
Age 0.54434123 0.2684005 0.332688642 0.1553945
Insulin BMI DiabetesPedigreeFunction
Pregnancies 0.08960516 0.02313687 -0.033522673
Glucose 0.60169777 0.23551696 0.137679934
BloodPressure 0.15143132 0.29312702 -0.002122848
SkinThickness 0.24300544 0.66359709 0.135519953
Insulin 1.00000000 0.27883783 0.153760380
BMI 0.27883783 1.00000000 0.151787668
DiabetesPedigreeFunction 0.15376038 0.15178767 1.000000000
Age 0.28355464 0.02571874 0.033561312
Age
Pregnancies 0.54434123
Glucose 0.26840049
BloodPressure 0.33268864
SkinThickness 0.15539450
Insulin 0.28355464
BMI 0.02571874
DiabetesPedigreeFunction 0.03356131
Age 1.00000000
corrplot(
correlation_between_variables,
method = "color",
col = colorRampPalette(c("blue", "white", "red"))(200),
type = "full",
diag = FALSE,
addCoef.col = "black",
number.cex = 0.7,
tl.col = "black",
tl.srt = 45,
cl.lim = c(-1, 1),
title = "Correlation Plot of Variables Excluding 'Outcome'"
)
correlation_between_variables
Pregnancies Glucose BloodPressure SkinThickness
Pregnancies 1.00000000 0.1297037 0.210406811 0.1325765
Glucose 0.12970373 1.0000000 0.230511935 0.2270679
BloodPressure 0.21040681 0.2305119 1.000000000 0.2220024
SkinThickness 0.13257655 0.2270679 0.222002442 1.0000000
Insulin 0.08960516 0.6016978 0.151431318 0.2430054
BMI 0.02313687 0.2355170 0.293127018 0.6635971
DiabetesPedigreeFunction -0.03352267 0.1376799 -0.002122848 0.1355200
Age 0.54434123 0.2684005 0.332688642 0.1553945
Insulin BMI DiabetesPedigreeFunction
Pregnancies 0.08960516 0.02313687 -0.033522673
Glucose 0.60169777 0.23551696 0.137679934
BloodPressure 0.15143132 0.29312702 -0.002122848
SkinThickness 0.24300544 0.66359709 0.135519953
Insulin 1.00000000 0.27883783 0.153760380
BMI 0.27883783 1.00000000 0.151787668
DiabetesPedigreeFunction 0.15376038 0.15178767 1.000000000
Age 0.28355464 0.02571874 0.033561312
Age
Pregnancies 0.54434123
Glucose 0.26840049
BloodPressure 0.33268864
SkinThickness 0.15539450
Insulin 0.28355464
BMI 0.02571874
DiabetesPedigreeFunction 0.03356131
Age 1.00000000
univariate_graph <- function(univariate_name, univar, data, output_variable) {
g_1 <- ggplot(data, aes(x=univar)) + geom_density() + xlab(univariate_name)
g_2 <- ggplot(data, aes(x=univar, fill=output_variable)) + geom_density(alpha=0.4) + xlab(univariate_name)
grid.arrange(g_1, g_2, ncol=2, top=paste(univariate_name,"variable", "/ [ Skew:",skewness(univar),"]"))
}
for (x in 1:(ncol(df)-1)) {
univariate_graph(names(df)[x], df[,x], df, df[,'Outcome'])
}
boxplot_graph <- function(variable_name, variable, data, group_variable) {
g_1 <- ggplot(data, aes(x = variable)) +
geom_boxplot() +
xlab(variable_name) +
ggtitle(paste(variable_name, "Boxplot"))
g_2 <- ggplot(data, aes(x = group_variable, y = variable)) +
geom_boxplot() +
xlab("Group") +
ylab(variable_name) +
ggtitle(paste(variable_name, "Boxplot by", deparse(substitute(group_variable))))
grid.arrange(g_1, g_2, ncol = 2, top = paste(variable_name, "Boxplot"))
}
# Example usage for boxplots
for (x in 1:(ncol(df) - 1)) {
boxplot_graph(names(df)[x], df[, x], df, df$Outcome)
}
histogram_graph <- function(variable_name, variable, data, group_variable) {
g_1 <- ggplot(data, aes(x = variable)) +
geom_histogram(binwidth = 1) +
xlab(variable_name) +
ggtitle(paste(variable_name, "Histogram"))
g_2 <- ggplot(data, aes(x = variable, fill = group_variable)) +
geom_histogram(binwidth = 1, alpha = 0.4) +
xlab(variable_name) +
ggtitle(paste(variable_name, "Histogram by", deparse(substitute(group_variable))))
grid.arrange(g_1, g_2, ncol = 2, top = paste(variable_name, "Histogram"))
}
# Example usage for histograms
for (x in 1:(ncol(df) - 1)) {
histogram_graph(names(df)[x], df[, x], df, df$Outcome)
}
needs( caret, mice, pROC, ROCR )
set.seed(123) # For reproducibility
# Split the data: 70% training and 30% testing
split <- createDataPartition(df$Outcome, p = 0.7, list = FALSE)
train_data <- df[split, ]
test_data <- df[-split, ]
cat("Training set size:", nrow(train_data), "\n")
Training set size: 538
cat("Testing set size:", nrow(test_data), "\n")
Testing set size: 230
log_model <- glm(Outcome ~ ., data = train_data, family = binomial)
summary(log_model)
Call:
glm(formula = Outcome ~ ., family = binomial, data = train_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.9844117 0.9789049 -9.178 < 2e-16 ***
Pregnancies 0.0877094 0.0392027 2.237 0.02526 *
Glucose 0.0345225 0.0048440 7.127 1.03e-12 ***
BloodPressure -0.0109310 0.0100405 -1.089 0.27629
SkinThickness -0.0101824 0.0161604 -0.630 0.52864
Insulin 0.0008184 0.0014130 0.579 0.56245
BMI 0.1007873 0.0237692 4.240 2.23e-05 ***
DiabetesPedigreeFunction 1.0834061 0.3592497 3.016 0.00256 **
Age 0.0231499 0.0119500 1.937 0.05272 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 696.28 on 537 degrees of freedom
Residual deviance: 503.72 on 529 degrees of freedom
AIC: 521.72
Number of Fisher Scoring iterations: 5
# Predict probabilities
test_prob <- predict(log_model, newdata = test_data, type = "response")
# Convert probabilities to class labels
test_pred <- ifelse(test_prob > 0.5, "X1", "X0") # Assuming "X1" is diabetes and "X0" is no diabetes
test_pred <- factor(test_pred, levels = levels(df$Outcome))
conf_matrix <- confusionMatrix(test_pred, test_data$Outcome)
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 127 33
X1 23 47
Accuracy : 0.7565
95% CI : (0.6958, 0.8105)
No Information Rate : 0.6522
P-Value [Acc > NIR] : 0.000421
Kappa : 0.4472
Mcnemar's Test P-Value : 0.229102
Sensitivity : 0.8467
Specificity : 0.5875
Pos Pred Value : 0.7938
Neg Pred Value : 0.6714
Prevalence : 0.6522
Detection Rate : 0.5522
Detection Prevalence : 0.6957
Balanced Accuracy : 0.7171
'Positive' Class : X0
cat("Confusion Matrix:\n")
Confusion Matrix:
print(conf_matrix$table)
Reference
Prediction X0 X1
X0 127 33
X1 23 47
cat("\nOverall Statistics:\n")
Overall Statistics:
print(conf_matrix$overall)
Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
0.7565217391 0.4472103004 0.6957743690 0.8105281717 0.6521739130
AccuracyPValue McnemarPValue
0.0004209657 0.2291018841
cat("\nClass Statistics:\n")
Class Statistics:
print(conf_matrix$byClass)
Sensitivity Specificity Pos Pred Value
0.8466667 0.5875000 0.7937500
Neg Pred Value Precision Recall
0.6714286 0.7937500 0.8466667
F1 Prevalence Detection Rate
0.8193548 0.6521739 0.5521739
Detection Prevalence Balanced Accuracy
0.6956522 0.7170833
set.seed(1234)
# Calculate total within-cluster sum of squares for different numbers of clusters
wss <- (nrow(train_data) - 1) * sum(apply(train_data[, -ncol(train_data)], 2, var))
for (i in 2:10) wss[i] <- sum(kmeans(train_data[, -ncol(train_data)], centers = i, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
xlab = "Number of Clusters",
ylab = "Total Within-Cluster Sum of Squares",
main = "Elbow Method for Optimal Number of Clusters")
abline(v = which.min(wss), col = "red", lty = 2) # Mark the elbow point
roc_curve <- roc(test_data$Outcome, test_prob)
auc_value <- auc(roc_curve)
plot(roc_curve, col = "blue", main = paste("ROC Curve (AUC =", round(auc_value, 2), ")"))
abline(a = 0, b = 1, col = "red", lty = 2)
cat("AUC:", auc_value, "\n")
AUC: 0.85
The logistic regression model developed for predicting diabetes among Pima Indians has shown a respectable performance. The key takeaways are:
Accuracy: The model correctly predicts the outcome 75.65% of the time, which is a strong indicator that the model is reliable but still leaves some room for improvement.
AUC: With an AUC of 0.85, the model demonstrates good discriminative ability, meaning it can distinguish between positive and negative cases effectively.
Sensitivity (Recall): The model has a sensitivity of 84.67%, showing that it is quite effective at identifying true positive cases (patients with diabetes).
Specificity: The specificity is 58.75%, indicating that the model is less effective at identifying true negative cases (patients without diabetes), which suggests a potential area for improvement.
Kappa Score: The Kappa score of 0.4472 reflects moderate agreement between the predicted and actual outcomes, indicating the model’s predictions are more than just chance but not exceptionally high.
Elbow Chart: The Elbow chart suggests that the chosen number of clusters for K-means clustering is optimal, which is critical for effective segmentation or categorization within the data.
Overall, the model performs well, with a solid AUC and accuracy, though the imbalance between sensitivity and specificity suggests it might be more cautious in predicting the absence of diabetes. The Elbow chart confirms that the clustering approach is sound. This model is a strong starting point, with opportunities for refinement in future iterations.