knitr::opts_chunk$set(echo = TRUE)

# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
# Read the data
diabetes_data <- read.csv("diabetes_prediction_dataset.csv")

# Take 30 rows of data at random (without replacement, reproducibly seeded).
sample_size <- 30

# Fail early with a clear message if the file has fewer rows than requested.
stopifnot(nrow(diabetes_data) >= sample_size)

set.seed(123)
# seq_len() instead of 1:nrow(): for an empty data frame 1:0 would produce
# c(1, 0) rather than an empty index pool. For non-empty data both forms give
# the same index vector, so the seeded draw is unchanged.
random_indices <- sample(
  seq_len(nrow(diabetes_data)),
  size = sample_size,
  replace = FALSE
)
data_30_random <- diabetes_data[random_indices, ]

# Create the three scatter plots; each one overlays a linear-model fit line.

# Build one scatter plot with an lm smoother drawn on top of the points.
#   data:         data frame holding the plotted columns
#   mapping:      aes() mapping that names the x and y columns
#   point_colour: colour of the scatter points
#   line_colour:  colour of the fitted line
#   title, x_label, y_label: plot title and axis labels
scatter_with_fit <- function(data, mapping, point_colour, line_colour,
                             title, x_label, y_label) {
  ggplot(data, mapping) +
    geom_point(color = point_colour, size = 3) +
    geom_smooth(method = lm, color = line_colour) +
    labs(title = title, x = x_label, y = y_label) +
    theme_minimal()
}

# 1. HbA1c vs Diabetes
plot1 <- scatter_with_fit(
  data_30_random,
  aes(x = HbA1c_level, y = diabetes),
  point_colour = "#FF6B6B",
  line_colour = "#4ECDC4",
  title = "HbA1c vs Diabetes",
  x_label = "HbA1c",
  y_label = "Diabetes"
)

# 2. Age vs BMI
plot2 <- scatter_with_fit(
  data_30_random,
  aes(x = age, y = bmi),
  point_colour = "#45B7D1",
  line_colour = "#96CEB4",
  title = "Age vs BMI",
  x_label = "Age",
  y_label = "BMI"
)

# 3. BMI vs Glucose
plot3 <- scatter_with_fit(
  data_30_random,
  aes(x = bmi, y = blood_glucose_level),
  point_colour = "#D65076",
  line_colour = "#EEB868",
  title = "BMI vs Blood Glucose Level",
  x_label = "BMI",
  y_label = "Blood Glucose Level"
)

# Display plots in a two-column grid (the third plot starts the second row)
grid.arrange(plot1, plot2, plot3, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Calculate correlations for the same three variable pairs that are plotted.
# cor() is called with its defaults (Pearson; no special handling of NA).
cor_HbA1c_diabetes <- cor(data_30_random$HbA1c_level, data_30_random$diabetes)
cor_age_bmi <- cor(data_30_random$age, data_30_random$bmi)
cor_bmi_glucose <- cor(data_30_random$bmi, data_30_random$blood_glucose_level)

# Print correlations, one per line, rounded to 3 decimals.
# sep = "" stops cat() from inserting its default " " separator around every
# "\n" (which indented each line after the first and left trailing blanks);
# the spaces that belong in the text are written into the labels instead.
# The final "\n" terminates the last line of the report.
cat(
  "Correlations:\n",
  "HbA1c-Diabetes: ", round(cor_HbA1c_diabetes, 3), "\n",
  "Age-BMI: ", round(cor_age_bmi, 3), "\n",
  "BMI-Glucose: ", round(cor_bmi_glucose, 3), "\n",
  sep = ""
)
## Correlations:
## HbA1c-Diabetes: 0.157
## Age-BMI: 0.343
## BMI-Glucose: -0.257