# Loading the neccessary packages
library(rpart)
install.packages("rpart.plot", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmpove3s5/downloaded_packages
library(rpart.plot)
library(nnet)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.95 loaded
install.packages("GGally", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmpove3s5/downloaded_packages
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
General
# Getting the working directory
getwd()
## [1] "/Users/ursulapodosenin/Desktop"
# Reading the file
data<- read.csv("/Users/ursulapodosenin/Desktop/diabetes.csv")
head(data)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
Exploratory Data Analysis
# Getting summary statistics
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Checking the structure of the data
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
# Looking for missing values
sapply(data, function(x) sum(is.na(x)))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
# Checking the distribution of the diabetes outcome
ggplot(data, aes(x = factor(Outcome))) +
geom_bar(fill = c("steelblue", "firebrick")) +
labs(title = "Diabetes Outcome Distribution", x = "Outcome", y = "Count") +
theme_minimal()

# Creating a histogram for all numeric values
data_long <- melt(data, id.vars = "Outcome")
ggplot(data_long, aes(x = value)) +
facet_wrap(~variable, scales = "free", ncol = 3) +
geom_histogram(bins = 30, fill = "darkcyan", color = "white") +
labs(title = "Distribution of All Features") +
theme_minimal()

# Creating box plots grouped by the outcome
ggplot(data_long, aes(x = factor(Outcome), y = value, fill = factor(Outcome))) +
geom_boxplot() +
facet_wrap(~variable, scales = "free", ncol = 3) +
labs(title = "Feature Distributions by Diabetes Outcome", x = "Outcome") +
theme_minimal()

# Creating a correlation matrix
corr_matrix <- cor(data[, -9]) # exclude Outcome
corrplot(corr_matrix, method = "color", addCoef.col = "black",
tl.cex = 0.8, number.cex = 0.7, title = "Correlation Matrix",
mar = c(0, 0, 1, 0))

# Plotting BMI vs. Age by diabetes outcome
ggplot(data, aes(x = Age, y = BMI, color = factor(Outcome))) +
geom_point(alpha = 0.7) +
labs(title = "BMI vs Age Colored by Diabetes Outcome", color = "Outcome") +
theme_minimal()

# Setting the seed so my code can be reproduced
set.seed(745)
# Splitting the data into training and test sets
trainIndex <- createDataPartition(data$Outcome, p = 0.7, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
Decision Tree Model
# Creating a decision tree model
dt_model <- rpart(Outcome ~ ., data = trainData, method = "class")
rpart.plot(dt_model, main = "Decision Tree for Diabetes Prediction")

# Creating predictions on the test data
dt_probs <- predict(dt_model, testData)[, 2] # probability of class 1
dt_preds <- ifelse(dt_probs > 0.5, 1, 0)
# Evaluating my model using a confusion matrix and checking accuracy
dt_cm <- confusionMatrix(as.factor(dt_preds), as.factor(testData$Outcome))
dt_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 133 33
## 1 19 45
##
## Accuracy : 0.7739
## 95% CI : (0.7143, 0.8263)
## No Information Rate : 0.6609
## P-Value [Acc > NIR] : 0.0001244
##
## Kappa : 0.4726
##
## Mcnemar's Test P-Value : 0.0714235
##
## Sensitivity : 0.8750
## Specificity : 0.5769
## Pos Pred Value : 0.8012
## Neg Pred Value : 0.7031
## Prevalence : 0.6609
## Detection Rate : 0.5783
## Detection Prevalence : 0.7217
## Balanced Accuracy : 0.7260
##
## 'Positive' Class : 0
##
Neural Network
# Normalizing the data
maxs <- apply(trainData[, -9], 2, max)
mins <- apply(trainData[, -9], 2, min)
# Scaling the train and test data
scaled_train <- as.data.frame(scale(trainData[, -9], center = mins, scale = maxs - mins))
scaled_test <- as.data.frame(scale(testData[, -9], center = mins, scale = maxs - mins))
# Adding in the diabetes outcome
scaled_train$Outcome <- trainData$Outcome
scaled_test$Outcome <- testData$Outcome
# Creating the neural network model
nn_model <- nnet(Outcome ~ ., data = scaled_train, size = 5, maxit = 200, linout = FALSE)
## # weights: 51
## initial value 124.828431
## iter 10 value 87.461269
## iter 20 value 80.995413
## iter 30 value 76.956100
## iter 40 value 75.229606
## iter 50 value 72.159678
## iter 60 value 70.813523
## iter 70 value 67.521810
## iter 80 value 64.514593
## iter 90 value 63.404226
## iter 100 value 62.806918
## iter 110 value 62.468376
## iter 120 value 62.334312
## iter 130 value 62.264210
## iter 140 value 62.217958
## iter 150 value 62.172566
## iter 160 value 62.004130
## iter 170 value 61.836623
## iter 180 value 61.744515
## iter 190 value 61.729184
## iter 200 value 61.671448
## final value 61.671448
## stopped after 200 iterations
# Making predictions on the test data
nn_probs <- predict(nn_model, scaled_test[, -9])
nn_preds <- ifelse(nn_probs > 0.5, 1, 0)
# Evaluating my model using a confusion matrix and checking accuracy
nn_cm <- confusionMatrix(as.factor(nn_preds), as.factor(scaled_test$Outcome))
nn_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 123 34
## 1 29 44
##
## Accuracy : 0.7261
## 95% CI : (0.6636, 0.7826)
## No Information Rate : 0.6609
## P-Value [Acc > NIR] : 0.02037
##
## Kappa : 0.3792
##
## Mcnemar's Test P-Value : 0.61429
##
## Sensitivity : 0.8092
## Specificity : 0.5641
## Pos Pred Value : 0.7834
## Neg Pred Value : 0.6027
## Prevalence : 0.6609
## Detection Rate : 0.5348
## Detection Prevalence : 0.6826
## Balanced Accuracy : 0.6867
##
## 'Positive' Class : 0
##
ROC Curve
# Checking the ROC for both models
dt_roc <- roc(testData$Outcome, dt_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
nn_roc <- roc(scaled_test$Outcome, nn_probs)
## Setting levels: control = 0, case = 1
## Warning in roc.default(scaled_test$Outcome, nn_probs): Deprecated use a matrix
## as predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
# Plotting the ROC curve for both models
# Plot both ROC curves
plot(dt_roc, col = "blue", main = "ROC Curve: Decision Tree vs Neural Network")
lines(nn_roc, col = "red")
legend("bottomright", legend = c("Decision Tree", "Neural Network"),
col = c("blue", "red"), lwd = 2)
