R Markdown
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.1
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
Loading the data.
library(readr)
diabetes <- read_csv("C:/Users/sivak/OneDrive/Desktop/ITAUMA/machine learning/shahin final project/diabetes.csv")
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Checking for missing data.
missing_counts <- map_dbl(diabetes, ~sum(is.na(.)))
# Create a data frame with the missing value counts
missing_data <- data.frame(Variable = names(diabetes), Missing_Count = missing_counts)
# Print the missing value counts
print(missing_data)
## Variable Missing_Count
## Pregnancies Pregnancies 0
## Glucose Glucose 0
## BloodPressure BloodPressure 0
## SkinThickness SkinThickness 0
## Insulin Insulin 0
## BMI BMI 0
## DiabetesPedigreeFunction DiabetesPedigreeFunction 0
## Age Age 0
## Outcome Outcome 0
glimpse(diabetes)
## Rows: 768
## Columns: 9
## $ Pregnancies <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
summary(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
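Although the NA check above reports no missing values, the summary shows minimums of zero for Glucose, BloodPressure, SkinThickness, Insulin, and BMI, which are physiologically implausible and most likely encode missing measurements. A minimal sketch of recoding them to NA (an optional step, not applied in the analysis below):
# Hedged sketch: treat zeros in these measurements as missing values.
# This recoding is illustrative only and is not used in the models below.
zero_as_na <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
diabetes_na <- diabetes %>%
  mutate(across(all_of(zero_as_na), ~ na_if(., 0)))
map_dbl(diabetes_na, ~ sum(is.na(.)))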
Converting the outcome variable into a factor.
diabetes$Outcome <- as.factor(diabetes$Outcome)
diabetes$Outcome
## [1] 1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0
## [38] 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0
## [75] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1
## [112] 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## [149] 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
## [186] 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1
## [223] 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0
## [260] 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0
## [297] 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1
## [334] 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1
## [371] 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 1
## [408] 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1
## [445] 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
## [482] 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0
## [519] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0 1 0
## [593] 1 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
## [630] 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0
## [667] 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1
## [704] 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1
## [741] 1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 0
## Levels: 0 1
Pre-processing: applying normalization (z-score standardization) to the data with scale().
# Select the columns to normalize
columns_to_normalize <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "Age", "Pregnancies","DiabetesPedigreeFunction")
# Normalize the selected columns using the scale() function
diabetes[columns_to_normalize] <- scale(diabetes[columns_to_normalize])
# Print the first few rows of the normalized dataset
head(diabetes)
## # A tibble: 6 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.640 0.848 0.150 0.907 -0.692 0.204
## 2 -0.844 -1.12 -0.160 0.531 -0.692 -0.684
## 3 1.23 1.94 -0.264 -1.29 -0.692 -1.10
## 4 -0.844 -0.998 -0.160 0.154 0.123 -0.494
## 5 -1.14 0.504 -1.50 0.907 0.765 1.41
## 6 0.343 -0.153 0.253 -1.29 -0.692 -0.811
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <fct>
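One caveat: scale() above computes means and standard deviations from the full dataset, before the train/test split. A hedged alternative (a sketch only, not used in the models below) is to express the scaling as a recipe step, so the statistics are estimated from the training data when the workflow is fit:
# Hedged alternative to scale(): a recipe step. When used inside a workflow,
# the centering/scaling statistics are estimated during fit(), i.e. from the
# training data only, which avoids test-set leakage.
rec_alt <- recipe(Outcome ~ ., data = diabetes) %>%
  step_normalize(all_numeric_predictors())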
Splitting the data into training and testing sets.
set.seed(20230712)
train_test_split <- initial_split(diabetes, prop = .80, strata = "Outcome")
data_train <- training(train_test_split)
data_test <- testing(train_test_split)
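A quick sanity check, not in the original analysis, that the stratified split preserved the class balance:
# Compare the Outcome class proportions in the training and test sets.
data_train %>% count(Outcome) %>% mutate(prop = n / sum(n))
data_test %>% count(Outcome) %>% mutate(prop = n / sum(n))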
Creating a recipe and workflow for the model.
# Define recipe and model
my_rec <- recipe(Outcome ~ ., data = data_train)
my_mod <- logistic_reg()
# Create a workflow
my_wf <- workflow() %>%
add_model(my_mod) %>%
add_recipe(my_rec)
# Fit the workflow on the training data
fitted_model <- fit(my_wf, data = data_train)
# last_fit() refits on the training set and evaluates on the held-out test
# set; passing the model and recipe here is equivalent to passing my_wf
final_fit <- last_fit(my_mod, my_rec, split = train_test_split)
final_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [614/154]> train/test split <tibble> <tibble> <tibble> <workflow>
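last_fit() also records default metrics (accuracy and ROC AUC) on the held-out test set; a minimal sketch to retrieve them:
# Default test-set metrics computed by last_fit().
final_fit %>% collect_metrics()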
Collecting the test-set predictions from the final fit.
final_fit %>%
collect_predictions()
## # A tibble: 154 × 7
## id .pred_0 .pred_1 .row .pred_class Outcome .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.111 0.889 5 1 1 Preprocessor1_Mod…
## 2 train/test split 0.754 0.246 11 0 0 Preprocessor1_Mod…
## 3 train/test split 0.804 0.196 18 0 1 Preprocessor1_Mod…
## 4 train/test split 0.628 0.372 19 0 0 Preprocessor1_Mod…
## 5 train/test split 0.754 0.246 20 0 1 Preprocessor1_Mod…
## 6 train/test split 0.744 0.256 24 0 1 Preprocessor1_Mod…
## 7 train/test split 0.243 0.757 27 1 1 Preprocessor1_Mod…
## 8 train/test split 0.473 0.527 32 1 1 Preprocessor1_Mod…
## 9 train/test split 0.620 0.380 38 0 1 Preprocessor1_Mod…
## 10 train/test split 0.960 0.0399 48 0 0 Preprocessor1_Mod…
## # ℹ 144 more rows
final_predictions <- final_fit %>%
collect_predictions()
# View the final predictions
head(final_predictions)
## # A tibble: 6 × 7
## id .pred_0 .pred_1 .row .pred_class Outcome .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.111 0.889 5 1 1 Preprocessor1_Mode…
## 2 train/test split 0.754 0.246 11 0 0 Preprocessor1_Mode…
## 3 train/test split 0.804 0.196 18 0 1 Preprocessor1_Mode…
## 4 train/test split 0.628 0.372 19 0 0 Preprocessor1_Mode…
## 5 train/test split 0.754 0.246 20 0 1 Preprocessor1_Mode…
## 6 train/test split 0.744 0.256 24 0 1 Preprocessor1_Mode…
# Tally how many test-set predictions were correct
final_fit %>%
collect_predictions() %>% # Collect test set predictions
select(.pred_class, Outcome) %>% # Select relevant columns
mutate(correct = .pred_class == Outcome) %>% # Create a new variable 'correct'
group_by(correct) %>% # Group by correct/incorrect predictions
summarize(count = n()) # Count the number of correct and incorrect predictions
## # A tibble: 2 × 2
## correct count
## <lgl> <int>
## 1 FALSE 30
## 2 TRUE 124
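The full confusion matrix behind these counts can be produced with yardstick's conf_mat(); a minimal sketch:
# Confusion matrix for the test-set predictions.
final_fit %>%
  collect_predictions() %>%
  conf_mat(truth = Outcome, estimate = .pred_class)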
Calculating accuracy, precision, and recall for the logistic regression model.
library(dplyr)
# Collect the test-set predictions from final_fit
final_predictions <- final_fit %>%
collect_predictions() %>%
select(.pred_class, Outcome)
# Calculate Accuracy
accuracy <- final_predictions %>%
mutate(correct = .pred_class == Outcome) %>%
summarize(accuracy = mean(correct))
# Calculate Precision
precision <- final_predictions %>%
filter(.pred_class == "1") %>%
mutate(correct = .pred_class == Outcome) %>%
summarize(precision = sum(correct) / n())
# Calculate Recall
recall <- final_predictions %>%
filter(Outcome == "1") %>%
mutate(correct = .pred_class == Outcome) %>%
summarize(recall = sum(correct) / n())
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## # A tibble: 1 × 1
## accuracy
## <dbl>
## 1 0.805
print("Precision:")
## [1] "Precision:"
print(precision)
## # A tibble: 1 × 1
## precision
## <dbl>
## 1 0.8
print("Recall:")
## [1] "Recall:"
print(recall)
## # A tibble: 1 × 1
## recall
## <dbl>
## 1 0.593
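The same three metrics can also be computed with yardstick, which is attached via tidymodels. A minimal sketch: the functions are namespaced because the precision and recall tibbles created above would otherwise shadow them, and event_level = "second" makes class "1" the positive class, matching the manual calculation.
# Hedged yardstick equivalent of the manual metric calculations above.
cls_metrics <- metric_set(yardstick::accuracy, yardstick::precision, yardstick::recall)
final_fit %>%
  collect_predictions() %>%
  cls_metrics(truth = Outcome, estimate = .pred_class, event_level = "second")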
Creating a decision tree model and calculating accuracy, precision, and recall.
library(tidyverse)
library(tidymodels)
library(readr)
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
# Define and train a decision tree model
my_mod <- decision_tree() %>%
set_engine("rpart") %>%
set_mode("classification")
fitted_model <- fit(my_mod, Outcome ~ ., data = data_train)
# Make predictions on the test set
predictions <- predict(fitted_model, data_test)
# Calculate Accuracy, Precision, and Recall
conf_matrix <- table(Actual = data_test$Outcome, Predicted = predictions$.pred_class)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7532468
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.6818182
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.5555556
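The confusion-matrix arithmetic is repeated verbatim for each model; a small hypothetical helper (eval_binary, not part of the original code) would keep it in one place:
# Hypothetical helper: accuracy, precision, and recall from actual vs.
# predicted factors, treating the second level ("1") as the positive class.
eval_binary <- function(actual, predicted) {
  cm <- table(Actual = actual, Predicted = predicted)
  tibble(
    accuracy = sum(diag(cm)) / sum(cm),
    precision = cm[2, 2] / sum(cm[, 2]),
    recall = cm[2, 2] / sum(cm[2, ])
  )
}
# Example usage: eval_binary(data_test$Outcome, predictions$.pred_class)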
Creating a random forest model and calculating accuracy, precision, and recall.
# Define and train a random forest model (the "randomForest" engine
# requires the randomForest package to be installed)
my_mod <- rand_forest(mtry = 3, trees = 1000) %>%
set_engine("randomForest") %>%
set_mode("classification")
fitted_model <- fit(my_mod, Outcome ~ ., data = data_train)
# Make predictions on the test set
predictions <- predict(fitted_model, data_test)
# Calculate Accuracy, Precision, and Recall
conf_matrix <- table(Actual = data_test$Outcome, Predicted = predictions$.pred_class)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7857143
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.7234043
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.6296296
Creating a Naive Bayes model and calculating accuracy, precision, and recall.
library(tidyverse)
library(tidymodels)
library(readr)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
##
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
##
## tune
## The following object is masked from 'package:rsample':
##
## permutations
## The following object is masked from 'package:parsnip':
##
## tune
# Train a Naive Bayes model
fitted_model <- naiveBayes(Outcome ~ ., data = data_train)
# Make predictions on the test set
predictions <- predict(fitted_model, data_test)
# Calculate Accuracy, Precision, and Recall
conf_matrix <- table(Actual = data_test$Outcome, Predicted = predictions)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7792208
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.7083333
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.6296296
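Collecting the scores reported above side by side (values copied from the outputs, rounded to three decimals) makes the comparison easier to read:
# Summary of the metrics reported above, for side-by-side comparison.
results <- tribble(
  ~model, ~accuracy, ~precision, ~recall,
  "Logistic regression", 0.805, 0.800, 0.593,
  "Decision tree", 0.753, 0.682, 0.556,
  "Random forest", 0.786, 0.723, 0.630,
  "Naive Bayes", 0.779, 0.708, 0.630
)
results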