R Markdown

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.1
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

Loading the data.

# Read the diabetes CSV into a tibble named `diabetes`.
# NOTE(review): the path below is absolute and machine-specific, so it will not
# resolve on another computer; consider a project-relative path instead.
library(readr)
diabetes <- read_csv(
  file = "C:/Users/sivak/OneDrive/Desktop/ITAUMA/machine learning/shahin final project/diabetes.csv"
)
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Checking for missing data.

# Number of NA entries in each column of `diabetes` (named numeric vector).
missing_counts <- colSums(is.na(diabetes))

# Pair every column name with its NA count
missing_data <- data.frame(
  Variable = names(diabetes),
  Missing_Count = missing_counts
)

# NOTE(review): only NA is counted here. Columns such as BloodPressure,
# SkinThickness and Insulin contain 0 values; presumably those zeros stand for
# unrecorded measurements -- confirm, and treat them as missing if so.
print(missing_data)
##                                          Variable Missing_Count
## Pregnancies                           Pregnancies             0
## Glucose                                   Glucose             0
## BloodPressure                       BloodPressure             0
## SkinThickness                       SkinThickness             0
## Insulin                                   Insulin             0
## BMI                                           BMI             0
## DiabetesPedigreeFunction DiabetesPedigreeFunction             0
## Age                                           Age             0
## Outcome                                   Outcome             0
# Show each column's type together with its first few values.
diabetes %>% glimpse()
## Rows: 768
## Columns: 9
## $ Pregnancies              <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose                  <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure            <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness            <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin                  <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI                      <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age                      <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome                  <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

Converting the outcome variable into a factor.

# Store the 0/1 Outcome column as a factor with levels "0" and "1".
diabetes[["Outcome"]] <- factor(diabetes[["Outcome"]])

# Echo the converted column to confirm its levels
diabetes[["Outcome"]]
##   [1] 1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0
##  [38] 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0
##  [75] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1
## [112] 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## [149] 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
## [186] 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1
## [223] 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0
## [260] 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0
## [297] 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1
## [334] 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1
## [371] 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 1
## [408] 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1
## [445] 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
## [482] 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0
## [519] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0 1 0
## [593] 1 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
## [630] 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0
## [667] 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1
## [704] 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1
## [741] 1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 0
## Levels: 0 1

Preprocessing: standardising the predictor columns (centring and scaling).

# Predictor columns to standardise (every column except Outcome)
columns_to_normalize <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "Age", "Pregnancies","DiabetesPedigreeFunction")

# Centre each listed column on its mean and divide by its standard deviation
# (the defaults of scale()), keeping the result as a plain numeric column.
# NOTE(review): the means and standard deviations come from the full data set,
# i.e. before the train/test split below. Estimating them on the training rows
# only (e.g. with step_normalize() in the recipe) would avoid using test-set
# information during preprocessing.
diabetes <- diabetes %>%
  mutate(
    across(
      all_of(columns_to_normalize),
      function(col) as.numeric(scale(col))
    )
  )

# Preview the standardised data
head(diabetes)
## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin    BMI
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl>  <dbl>
## 1       0.640   0.848         0.150         0.907  -0.692  0.204
## 2      -0.844  -1.12         -0.160         0.531  -0.692 -0.684
## 3       1.23    1.94         -0.264        -1.29   -0.692 -1.10 
## 4      -0.844  -0.998        -0.160         0.154   0.123 -0.494
## 5      -1.14    0.504        -1.50          0.907   0.765  1.41 
## 6       0.343  -0.153         0.253        -1.29   -0.692 -0.811
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <fct>

Splitting the data into training and testing sets.

# Fix the RNG seed so the random partition below is repeatable.
set.seed(20230712)

# 80% of the rows go to training; sampling is stratified on Outcome.
train_test_split <- diabetes %>%
  initial_split(prop = 0.80, strata = "Outcome")

# Materialise the two partitions as data frames
data_train <- train_test_split %>% training()
data_test <- train_test_split %>% testing()

Creating the recipe and workflow for the logistic regression model.

# Logistic regression pipeline: recipe + model specification in one workflow.

# Recipe: predict Outcome from every other column of the training data.
my_rec <- recipe(Outcome ~ ., data = data_train)

# Model specification. Engine and mode are written out explicitly (they are
# the parsnip defaults for logistic_reg()) so this block reads like the
# decision-tree and random-forest specifications further down.
my_mod <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# Bundle preprocessing and model so they are always fitted together
my_wf <- workflow() %>%
  add_recipe(my_rec) %>%
  add_model(my_mod)

# Workflow fitted on the training rows only; kept so the fitted model can be
# inspected. It is not used by last_fit(), which refits internally.
fitted_model <- fit(my_wf, data = data_train)

# Fit on the training portion of the split and evaluate on the test portion.
# The workflow itself is passed (rather than the model and recipe separately)
# so that the object evaluated here is exactly the one defined above.
final_fit <- last_fit(my_wf, split = train_test_split)
final_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits            id               .metrics .notes   .predictions .workflow 
##   <list>            <chr>            <list>   <list>   <list>       <list>    
## 1 <split [614/154]> train/test split <tibble> <tibble> <tibble>     <workflow>

Predictions from the final fit.

# Test-set predictions (class probabilities and predicted class) from the fit
collect_predictions(final_fit)
## # A tibble: 154 × 7
##    id               .pred_0 .pred_1  .row .pred_class Outcome .config           
##    <chr>              <dbl>   <dbl> <int> <fct>       <fct>   <chr>             
##  1 train/test split   0.111  0.889      5 1           1       Preprocessor1_Mod…
##  2 train/test split   0.754  0.246     11 0           0       Preprocessor1_Mod…
##  3 train/test split   0.804  0.196     18 0           1       Preprocessor1_Mod…
##  4 train/test split   0.628  0.372     19 0           0       Preprocessor1_Mod…
##  5 train/test split   0.754  0.246     20 0           1       Preprocessor1_Mod…
##  6 train/test split   0.744  0.256     24 0           1       Preprocessor1_Mod…
##  7 train/test split   0.243  0.757     27 1           1       Preprocessor1_Mod…
##  8 train/test split   0.473  0.527     32 1           1       Preprocessor1_Mod…
##  9 train/test split   0.620  0.380     38 0           1       Preprocessor1_Mod…
## 10 train/test split   0.960  0.0399    48 0           0       Preprocessor1_Mod…
## # ℹ 144 more rows
# Keep the test-set predictions in a tibble for the steps that follow
final_predictions <- collect_predictions(final_fit)

# Preview the first rows
head(final_predictions)
## # A tibble: 6 × 7
##   id               .pred_0 .pred_1  .row .pred_class Outcome .config            
##   <chr>              <dbl>   <dbl> <int> <fct>       <fct>   <chr>              
## 1 train/test split   0.111   0.889     5 1           1       Preprocessor1_Mode…
## 2 train/test split   0.754   0.246    11 0           0       Preprocessor1_Mode…
## 3 train/test split   0.804   0.196    18 0           1       Preprocessor1_Mode…
## 4 train/test split   0.628   0.372    19 0           0       Preprocessor1_Mode…
## 5 train/test split   0.754   0.246    20 0           1       Preprocessor1_Mode…
## 6 train/test split   0.744   0.256    24 0           1       Preprocessor1_Mode…
# Tally how many test-set predictions agree (TRUE) or disagree (FALSE) with
# the true Outcome.
final_fit %>%
  collect_predictions() %>%
  transmute(correct = .pred_class == Outcome) %>%
  count(correct, name = "count")
## # A tibble: 2 × 2
##   correct count
##   <lgl>   <int>
## 1 FALSE      30
## 2 TRUE      124

Calculating accuracy, precision, and recall for the logistic regression model.

library(dplyr)

# Test-set predictions reduced to the predicted and the true class
final_predictions <- collect_predictions(final_fit) %>%
  select(.pred_class, Outcome)

# Accuracy: share of all test rows whose predicted class equals the truth
accuracy <- final_predictions %>%
  summarize(accuracy = mean(.pred_class == Outcome))

# Precision: among rows predicted as "1", the share that really are "1"
precision <- final_predictions %>%
  filter(.pred_class == "1") %>%
  summarize(precision = mean(Outcome == "1"))

# Recall: among rows that really are "1", the share predicted as "1"
recall <- final_predictions %>%
  filter(Outcome == "1") %>%
  summarize(recall = mean(.pred_class == "1"))

# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## # A tibble: 1 × 1
##   accuracy
##      <dbl>
## 1    0.805
print("Precision:")
## [1] "Precision:"
print(precision)
## # A tibble: 1 × 1
##   precision
##       <dbl>
## 1       0.8
print("Recall:")
## [1] "Recall:"
print(recall)
## # A tibble: 1 × 1
##   recall
##    <dbl>
## 1  0.593

Creating a decision tree model and calculating its accuracy, precision, and recall.

library(tidyverse)
library(tidymodels)
library(readr)
library(rpart)
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
# Classification tree specification using the rpart engine
my_mod <- decision_tree(mode = "classification", engine = "rpart")

# Train on the training rows, using every other column to predict Outcome
fitted_model <- my_mod %>%
  fit(Outcome ~ ., data = data_train)

# Predicted classes for the held-out test rows
predictions <- predict(fitted_model, new_data = data_test)

# Confusion matrix: rows are the true class, columns the predicted class
conf_matrix <- table(
  Actual = data_test[["Outcome"]],
  Predicted = predictions[[".pred_class"]]
)

# Cell [2, 2] holds the rows that are truly "1" and were predicted "1"
true_positives <- conf_matrix[2, 2]

accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- true_positives / colSums(conf_matrix)[[2]] # over predicted "1"
recall <- true_positives / rowSums(conf_matrix)[[2]]    # over actual "1"

# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7532468
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.6818182
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.5555556

Creating a random forest model and calculating its accuracy, precision, and recall.

# Random forest specification: 1000 trees, 3 candidate predictors per split,
# fitted with the randomForest engine.
my_mod <- rand_forest(
  mode = "classification",
  engine = "randomForest",
  mtry = 3,
  trees = 1000
)

# Train on the training rows, using every other column to predict Outcome
fitted_model <- my_mod %>%
  fit(Outcome ~ ., data = data_train)

# Predicted classes for the held-out test rows
predictions <- predict(fitted_model, new_data = data_test)

# Confusion matrix: rows are the true class, columns the predicted class
conf_matrix <- table(
  Actual = data_test[["Outcome"]],
  Predicted = predictions[[".pred_class"]]
)

# Cell [2, 2] holds the rows that are truly "1" and were predicted "1"
true_positives <- conf_matrix[2, 2]

accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- true_positives / colSums(conf_matrix)[[2]] # over predicted "1"
recall <- true_positives / rowSums(conf_matrix)[[2]]    # over actual "1"

# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7857143
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.7234043
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.6296296

Creating a Naive Bayes model and calculating its accuracy, precision, and recall.

library(tidyverse)
library(tidymodels)
library(readr)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
## 
##     tune
## The following object is masked from 'package:rsample':
## 
##     permutations
## The following object is masked from 'package:parsnip':
## 
##     tune
# Fit e1071's Naive Bayes classifier on the training rows, using every other
# column to predict Outcome.
fitted_model <- naiveBayes(Outcome ~ ., data = data_train)

# Predicted classes for the held-out test rows (a factor, not a tibble)
predictions <- predict(fitted_model, newdata = data_test)

# Confusion matrix: rows are the true class, columns the predicted class
conf_matrix <- table(
  Actual = data_test[["Outcome"]],
  Predicted = predictions
)

# Cell [2, 2] holds the rows that are truly "1" and were predicted "1"
true_positives <- conf_matrix[2, 2]

accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- true_positives / colSums(conf_matrix)[[2]] # over predicted "1"
recall <- true_positives / rowSums(conf_matrix)[[2]]    # over actual "1"

# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7792208
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.7083333
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.6296296