# Loading the neccessary packages
library(rpart)
install.packages("rpart.plot", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmpove3s5/downloaded_packages
library(rpart.plot)
library(nnet)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)
library(reshape2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.95 loaded
install.packages("GGally", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmpove3s5/downloaded_packages
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

General

# Getting the working directory
getwd()
## [1] "/Users/ursulapodosenin/Desktop"
# Reading the file
data<- read.csv("/Users/ursulapodosenin/Desktop/diabetes.csv")
head(data)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

Exploratory Data Analysis

# Getting summary statistics
summary(data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# Checking the structure of the data
str(data)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
# Looking for missing values 
sapply(data, function(x) sum(is.na(x))) 
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0
# Checking the distribution of the diabetes outcome
ggplot(data, aes(x = factor(Outcome))) +
  geom_bar(fill = c("steelblue", "firebrick")) +
  labs(title = "Diabetes Outcome Distribution", x = "Outcome", y = "Count") +
  theme_minimal()

# Creating a histogram for all numeric values
data_long <- melt(data, id.vars = "Outcome")
ggplot(data_long, aes(x = value)) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  geom_histogram(bins = 30, fill = "darkcyan", color = "white") +
  labs(title = "Distribution of All Features") +
  theme_minimal()

# Creating box plots grouped by the outcome
ggplot(data_long, aes(x = factor(Outcome), y = value, fill = factor(Outcome))) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  labs(title = "Feature Distributions by Diabetes Outcome", x = "Outcome") +
  theme_minimal()

# Creating a correlation matrix
corr_matrix <- cor(data[, -9])  # exclude Outcome
corrplot(corr_matrix, method = "color", addCoef.col = "black",
         tl.cex = 0.8, number.cex = 0.7, title = "Correlation Matrix",
         mar = c(0, 0, 1, 0))

# Plotting BMI vs. Age by diabetes outcome
ggplot(data, aes(x = Age, y = BMI, color = factor(Outcome))) +
  geom_point(alpha = 0.7) +
  labs(title = "BMI vs Age Colored by Diabetes Outcome", color = "Outcome") +
  theme_minimal()

# Setting the seed so my code can be reproduced
set.seed(745)
# Splitting the data into training and test sets
trainIndex <- createDataPartition(data$Outcome, p = 0.7, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

Decision Tree Model

# Creating a decision tree model
dt_model <- rpart(Outcome ~ ., data = trainData, method = "class")
rpart.plot(dt_model, main = "Decision Tree for Diabetes Prediction")

# Creating predictions on the test data
dt_probs <- predict(dt_model, testData)[, 2]  # probability of class 1
dt_preds <- ifelse(dt_probs > 0.5, 1, 0)
# Evaluating my model using a confusion matrix and checking accuracy
dt_cm <- confusionMatrix(as.factor(dt_preds), as.factor(testData$Outcome))
dt_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 133  33
##          1  19  45
##                                           
##                Accuracy : 0.7739          
##                  95% CI : (0.7143, 0.8263)
##     No Information Rate : 0.6609          
##     P-Value [Acc > NIR] : 0.0001244       
##                                           
##                   Kappa : 0.4726          
##                                           
##  Mcnemar's Test P-Value : 0.0714235       
##                                           
##             Sensitivity : 0.8750          
##             Specificity : 0.5769          
##          Pos Pred Value : 0.8012          
##          Neg Pred Value : 0.7031          
##              Prevalence : 0.6609          
##          Detection Rate : 0.5783          
##    Detection Prevalence : 0.7217          
##       Balanced Accuracy : 0.7260          
##                                           
##        'Positive' Class : 0               
## 

Neural Network

# Normalizing the data
maxs <- apply(trainData[, -9], 2, max)
mins <- apply(trainData[, -9], 2, min)

# Scaling the train and test data
scaled_train <- as.data.frame(scale(trainData[, -9], center = mins, scale = maxs - mins))
scaled_test <- as.data.frame(scale(testData[, -9], center = mins, scale = maxs - mins))
# Adding in the diabetes outcome
scaled_train$Outcome <- trainData$Outcome
scaled_test$Outcome <- testData$Outcome
# Creating the neural network model
nn_model <- nnet(Outcome ~ ., data = scaled_train, size = 5, maxit = 200, linout = FALSE)
## # weights:  51
## initial  value 124.828431 
## iter  10 value 87.461269
## iter  20 value 80.995413
## iter  30 value 76.956100
## iter  40 value 75.229606
## iter  50 value 72.159678
## iter  60 value 70.813523
## iter  70 value 67.521810
## iter  80 value 64.514593
## iter  90 value 63.404226
## iter 100 value 62.806918
## iter 110 value 62.468376
## iter 120 value 62.334312
## iter 130 value 62.264210
## iter 140 value 62.217958
## iter 150 value 62.172566
## iter 160 value 62.004130
## iter 170 value 61.836623
## iter 180 value 61.744515
## iter 190 value 61.729184
## iter 200 value 61.671448
## final  value 61.671448 
## stopped after 200 iterations
# Making predictions on the test data
nn_probs <- predict(nn_model, scaled_test[, -9])
nn_preds <- ifelse(nn_probs > 0.5, 1, 0)
# Evaluating my model using a confusion matrix and checking accuracy
nn_cm <- confusionMatrix(as.factor(nn_preds), as.factor(scaled_test$Outcome))
nn_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 123  34
##          1  29  44
##                                           
##                Accuracy : 0.7261          
##                  95% CI : (0.6636, 0.7826)
##     No Information Rate : 0.6609          
##     P-Value [Acc > NIR] : 0.02037         
##                                           
##                   Kappa : 0.3792          
##                                           
##  Mcnemar's Test P-Value : 0.61429         
##                                           
##             Sensitivity : 0.8092          
##             Specificity : 0.5641          
##          Pos Pred Value : 0.7834          
##          Neg Pred Value : 0.6027          
##              Prevalence : 0.6609          
##          Detection Rate : 0.5348          
##    Detection Prevalence : 0.6826          
##       Balanced Accuracy : 0.6867          
##                                           
##        'Positive' Class : 0               
## 

ROC Curve

# Checking the ROC for both models
dt_roc <- roc(testData$Outcome, dt_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
nn_roc <- roc(scaled_test$Outcome, nn_probs)
## Setting levels: control = 0, case = 1
## Warning in roc.default(scaled_test$Outcome, nn_probs): Deprecated use a matrix
## as predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
# Plotting the ROC curve for both models
# Plot both ROC curves
plot(dt_roc, col = "blue", main = "ROC Curve: Decision Tree vs Neural Network")
lines(nn_roc, col = "red")
legend("bottomright", legend = c("Decision Tree", "Neural Network"),
       col = c("blue", "red"), lwd = 2)