R Markdown
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.1
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
Loading the data.
library(readr)
diabetes <- read_csv("C:/Users/sivak/OneDrive/Desktop/ITAUMA/machine learning/shahin final project/diabetes.csv")
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Checking for missing data.
missing_counts <- map_dbl(diabetes, ~sum(is.na(.)))
# Create a data frame with the missing value counts
missing_data <- data.frame(Variable = names(diabetes), Missing_Count = missing_counts)
# Print the missing value counts
print(missing_data)
## Variable Missing_Count
## Pregnancies Pregnancies 0
## Glucose Glucose 0
## BloodPressure BloodPressure 0
## SkinThickness SkinThickness 0
## Insulin Insulin 0
## BMI BMI 0
## DiabetesPedigreeFunction DiabetesPedigreeFunction 0
## Age Age 0
## Outcome Outcome 0
glimpse(diabetes)
## Rows: 768
## Columns: 9
## $ Pregnancies <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
summary(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
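Although the NA check above reports no missing values, the summary shows minimums of zero for Glucose, BloodPressure, SkinThickness, Insulin, and BMI, which are physiologically implausible and most likely encode missing measurements. A minimal sketch of recoding them to NA (an optional step, not applied in the analysis below):
# Hedged sketch: treat zeros in these measurements as missing values.
# This recoding is illustrative only and is not used in the models below.
zero_as_na <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
diabetes_na <- diabetes %>%
  mutate(across(all_of(zero_as_na), ~ na_if(., 0)))
map_dbl(diabetes_na, ~ sum(is.na(.)))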
Converting the outcome variable into a factor.
diabetes$Outcome <- as.factor(diabetes$Outcome)
diabetes$Outcome
## [1] 1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0
## [38] 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0
## [75] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1
## [112] 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## [149] 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
## [186] 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1
## [223] 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0
## [260] 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0
## [297] 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1
## [334] 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1
## [371] 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 1
## [408] 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1
## [445] 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
## [482] 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0
## [519] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0 1 0
## [593] 1 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
## [630] 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0
## [667] 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1
## [704] 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1
## [741] 1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 0
## Levels: 0 1
Pre-processing: applying normalization (z-score standardization) to the data with scale().
# Select the columns to normalize
columns_to_normalize <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "Age", "Pregnancies","DiabetesPedigreeFunction")
# Normalize the selected columns using the scale() function
diabetes[columns_to_normalize] <- scale(diabetes[columns_to_normalize])
# Print the first few rows of the normalized dataset
head(diabetes)
## # A tibble: 6 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.640 0.848 0.150 0.907 -0.692 0.204
## 2 -0.844 -1.12 -0.160 0.531 -0.692 -0.684
## 3 1.23 1.94 -0.264 -1.29 -0.692 -1.10
## 4 -0.844 -0.998 -0.160 0.154 0.123 -0.494
## 5 -1.14 0.504 -1.50 0.907 0.765 1.41
## 6 0.343 -0.153 0.253 -1.29 -0.692 -0.811
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <fct>
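One caveat: scale() above computes means and standard deviations from the full dataset, before the train/test split. A hedged alternative (a sketch only, not used in the models below) is to express the scaling as a recipe step, so the statistics are estimated from the training data when the workflow is fit:
# Hedged alternative to scale(): a recipe step. When used inside a workflow,
# the centering/scaling statistics are estimated during fit(), i.e. from the
# training data only, which avoids test-set leakage.
rec_alt <- recipe(Outcome ~ ., data = diabetes) %>%
  step_normalize(all_numeric_predictors())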
Splitting the data into training and testing sets.
set.seed(20230712)
train_test_split <- initial_split(diabetes, prop = .80, strata = "Outcome")
data_train <- training(train_test_split)
data_test <- testing(train_test_split)
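A quick sanity check, not in the original analysis, that the stratified split preserved the class balance:
# Compare the Outcome class proportions in the training and test sets.
data_train %>% count(Outcome) %>% mutate(prop = n / sum(n))
data_test %>% count(Outcome) %>% mutate(prop = n / sum(n))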
Creating a recipe and workflow for the model.
# Define recipe and model
my_rec <- recipe(Outcome ~ ., data = data_train)
my_mod <- logistic_reg()
# Create a workflow
my_wf <- workflow() %>%
add_model(my_mod) %>%
add_recipe(my_rec)
# Fit the workflow on the training data
fitted_model <- fit(my_wf, data = data_train)
# last_fit() refits on the training set and evaluates on the held-out test
# set; passing the model and recipe here is equivalent to passing my_wf
final_fit <- last_fit(my_mod, my_rec, split = train_test_split)
final_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [614/154]> train/test split <tibble> <tibble> <tibble> <workflow>
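last_fit() also records default metrics (accuracy and ROC AUC) on the held-out test set; a minimal sketch to retrieve them:
# Default test-set metrics computed by last_fit().
final_fit %>% collect_metrics()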
Collecting the test-set predictions from the final fit.
final_fit %>%
collect_predictions()
## # A tibble: 154 × 7
## id .pred_0 .pred_1 .row .pred_class Outcome .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.111 0.889 5 1 1 Preprocessor1_Mod…
## 2 train/test split 0.754 0.246 11 0 0 Preprocessor1_Mod…
## 3 train/test split 0.804 0.196 18 0 1 Preprocessor1_Mod…
## 4 train/test split 0.628 0.372 19 0 0 Preprocessor1_Mod…
## 5 train/test split 0.754 0.246 20 0 1 Preprocessor1_Mod…
## 6 train/test split 0.744 0.256 24 0 1 Preprocessor1_Mod…
## 7 train/test split 0.243 0.757 27 1 1 Preprocessor1_Mod…
## 8 train/test split 0.473 0.527 32 1 1 Preprocessor1_Mod…
## 9 train/test split 0.620 0.380 38 0 1 Preprocessor1_Mod…
## 10 train/test split 0.960 0.0399 48 0 0 Preprocessor1_Mod…
## # ℹ 144 more rows
final_predictions <- final_fit %>%
collect_predictions()
# View the final predictions
head(final_predictions)
## # A tibble: 6 × 7
## id .pred_0 .pred_1 .row .pred_class Outcome .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.111 0.889 5 1 1 Preprocessor1_Mode…
## 2 train/test split 0.754 0.246 11 0 0 Preprocessor1_Mode…
## 3 train/test split 0.804 0.196 18 0 1 Preprocessor1_Mode…
## 4 train/test split 0.628 0.372 19 0 0 Preprocessor1_Mode…
## 5 train/test split 0.754 0.246 20 0 1 Preprocessor1_Mode…
## 6 train/test split 0.744 0.256 24 0 1 Preprocessor1_Mode…
# Tally how many test-set predictions were correct
final_fit %>%
collect_predictions() %>% # Collect test set predictions
select(.pred_class, Outcome) %>% # Select relevant columns
mutate(correct = .pred_class == Outcome) %>% # Create a new variable 'correct'
group_by(correct) %>% # Group by correct/incorrect predictions
summarize(count = n()) # Count the number of correct and incorrect predictions
## # A tibble: 2 × 2
## correct count
## <lgl> <int>
## 1 FALSE 30
## 2 TRUE 124
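The full confusion matrix behind these counts can be produced with yardstick's conf_mat(); a minimal sketch:
# Confusion matrix for the test-set predictions.
final_fit %>%
  collect_predictions() %>%
  conf_mat(truth = Outcome, estimate = .pred_class)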
Calculating accuracy, precision, and recall for the logistic regression model.
library(dplyr)
# Collect the test-set predictions from final_fit
final_predictions <- final_fit %>%
collect_predictions() %>%
select(.pred_class, Outcome)
# Calculate Accuracy
accuracy <- final_predictions %>%
mutate(correct = .pred_class == Outcome) %>%
summarize(accuracy = mean(correct))
# Calculate Precision
precision <- final_predictions %>%
filter(.pred_class == "1") %>%
mutate(correct = .pred_class == Outcome) %>%
summarize(precision = sum(correct) / n())
# Calculate Recall
recall <- final_predictions %>%
filter(Outcome == "1") %>%
mutate(correct = .pred_class == Outcome) %>%
summarize(recall = sum(correct) / n())
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## # A tibble: 1 × 1
## accuracy
## <dbl>
## 1 0.805
print("Precision:")
## [1] "Precision:"
print(precision)
## # A tibble: 1 × 1
## precision
## <dbl>
## 1 0.8
print("Recall:")
## [1] "Recall:"
print(recall)
## # A tibble: 1 × 1
## recall
## <dbl>
## 1 0.593
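The same three metrics can also be computed with yardstick, which is attached via tidymodels. A minimal sketch: the functions are namespaced because the precision and recall tibbles created above would otherwise shadow them, and event_level = "second" makes class "1" the positive class, matching the manual calculation.
# Hedged yardstick equivalent of the manual metric calculations above.
cls_metrics <- metric_set(yardstick::accuracy, yardstick::precision, yardstick::recall)
final_fit %>%
  collect_predictions() %>%
  cls_metrics(truth = Outcome, estimate = .pred_class, event_level = "second")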
Creating a decision tree model and calculating accuracy, precision, and recall.
library(tidyverse)
library(tidymodels)
library(readr)
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
# Define and train a decision tree model
my_mod <- decision_tree() %>%
set_engine("rpart") %>%
set_mode("classification")
fitted_model <- fit(my_mod, Outcome ~ ., data = data_train)
# Make predictions on the test set
predictions <- predict(fitted_model, data_test)
# Calculate Accuracy, Precision, and Recall
conf_matrix <- table(Actual = data_test$Outcome, Predicted = predictions$.pred_class)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7532468
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.6818182
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.5555556
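The confusion-matrix arithmetic is repeated verbatim for each model; a small hypothetical helper (eval_binary, not part of the original code) would keep it in one place:
# Hypothetical helper: accuracy, precision, and recall from actual vs.
# predicted factors, treating the second level ("1") as the positive class.
eval_binary <- function(actual, predicted) {
  cm <- table(Actual = actual, Predicted = predicted)
  tibble(
    accuracy = sum(diag(cm)) / sum(cm),
    precision = cm[2, 2] / sum(cm[, 2]),
    recall = cm[2, 2] / sum(cm[2, ])
  )
}
# Example usage: eval_binary(data_test$Outcome, predictions$.pred_class)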
Creating a random forest model and calculating accuracy, precision, and recall.
# Define and train a random forest model (the "randomForest" engine
# requires the randomForest package to be installed)
my_mod <- rand_forest(mtry = 3, trees = 1000) %>%
set_engine("randomForest") %>%
set_mode("classification")
fitted_model <- fit(my_mod, Outcome ~ ., data = data_train)
# Make predictions on the test set
predictions <- predict(fitted_model, data_test)
# Calculate Accuracy, Precision, and Recall
conf_matrix <- table(Actual = data_test$Outcome, Predicted = predictions$.pred_class)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7857143
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.7234043
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.6296296
Creating a Naive Bayes model and calculating accuracy, precision, and recall.
library(tidyverse)
library(tidymodels)
library(readr)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
##
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
##
## tune
## The following object is masked from 'package:rsample':
##
## permutations
## The following object is masked from 'package:parsnip':
##
## tune
# Train a Naive Bayes model
fitted_model <- naiveBayes(Outcome ~ ., data = data_train)
# Make predictions on the test set
predictions <- predict(fitted_model, data_test)
# Calculate Accuracy, Precision, and Recall
conf_matrix <- table(Actual = data_test$Outcome, Predicted = predictions)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
# Print the results
print("Accuracy:")
## [1] "Accuracy:"
print(accuracy)
## [1] 0.7792208
print("Precision:")
## [1] "Precision:"
print(precision)
## [1] 0.7083333
print("Recall:")
## [1] "Recall:"
print(recall)
## [1] 0.6296296
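Collecting the scores reported above side by side (values copied from the outputs, rounded to three decimals) makes the comparison easier to read:
# Summary of the metrics reported above, for side-by-side comparison.
results <- tribble(
  ~model, ~accuracy, ~precision, ~recall,
  "Logistic regression", 0.805, 0.800, 0.593,
  "Decision tree", 0.753, 0.682, 0.556,
  "Random forest", 0.786, 0.723, 0.630,
  "Naive Bayes", 0.779, 0.708, 0.630
)
results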