R Markdown

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(naniar)
library(readr)
library(ggplot2)

Loading the data:

library(knitr)
# Read the Pima Indians diabetes CSV straight from GitHub. The file has no
# header row, so with col_names = FALSE readr assigns placeholder column
# names (X1, X2, ...).
diab_data <- read_csv(
  file = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
  col_names = FALSE
)
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): X1, X2, X3, X4, X5, X6, X7, X8, X9
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens an interactive data viewer, which is not available when the
# document is knitted or run non-interactively; only call it in a live session.
if (interactive()) {
  View(diab_data)
}

Assigning column names:

# Replace the placeholder column names with descriptive ones, in file order.
names(diab_data) <- c(
  "Pregnancies",
  "Glucose",
  "BloodPressure",
  "SkinThickness",
  "Insulin",
  "BMI",
  "DiabetesPedigreeFunction",
  "Age",
  "Outcome"
)
# Show the first six rows to confirm the renaming.
head(diab_data, n = 6)
## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1           6     148            72            35       0  33.6
## 2           1      85            66            29       0  26.6
## 3           8     183            64             0       0  23.3
## 4           1      89            66            23      94  28.1
## 5           0     137            40            35     168  43.1
## 6           5     116            74             0       0  25.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
# Per-column five-number summary plus mean for every variable.
summary(object = diab_data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# pct_complete() only counts cells that are literally NA, so report what it
# measures and derive the follow-up message from the data instead of
# hard-coding the conclusion.
complete_pct <- pct_complete(diab_data)
print(paste0("The percentage of complete data is ", complete_pct, "%"))
## [1] "The percentage of complete data is 100%"
if (anyNA(diab_data)) {
  print("There are NA values in this data set")
} else {
  print("There were no missing values in this data set")
}
## [1] "There were no missing values in this data set"
# NOTE(review): summary() above shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI. Zeros in these measurements
# presumably stand in for unrecorded values; count them so the claim of
# complete data can be checked before relying on the t-tests below.
zero_coded_cols <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
print(colSums(diab_data[zero_coded_cols] == 0))

Creating new samples based on the Outcome and Glucose levels:

# Glucose values for the Outcome == 1 group and for the Outcome == 0 group.
diabetics <- diab_data %>%
  filter(Outcome == 1) %>%
  pull(Glucose)
non_diabetics <- diab_data %>%
  filter(Outcome == 0) %>%
  pull(Glucose)

Performing a t-test:

# Two-sample t-test on glucose between the two outcome groups.
# var.equal = FALSE is t.test()'s default and yields the Welch test.
t_test_result_glucose <- t.test(
  x = diabetics,
  y = non_diabetics,
  var.equal = FALSE
)
print(t_test_result_glucose)
## 
##  Welch Two Sample t-test
## 
## data:  diabetics and non_diabetics
## t = 13.752, df = 461.33, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  26.80786 35.74707
## sample estimates:
## mean of x mean of y 
##  141.2575  109.9800

Making results look better:

library(broom)
# Flatten the t-test result into a one-row tibble and list its columns.
tidy_result_glucose <- t_test_result_glucose %>% tidy()
tidy_result_glucose %>% glimpse()
## Rows: 1
## Columns: 10
## $ estimate    <dbl> 31.27746
## $ estimate1   <dbl> 141.2575
## $ estimate2   <dbl> 109.98
## $ statistic   <dbl> 13.75154
## $ p.value     <dbl> 2.644161e-36
## $ parameter   <dbl> 461.3317
## $ conf.low    <dbl> 26.80786
## $ conf.high   <dbl> 35.74707
## $ method      <chr> "Welch Two Sample t-test"
## $ alternative <chr> "two.sided"
# Extract the glucose p-value from the tidied result and display it.
p_glucose <- tidy_result_glucose[["p.value"]]
print(p_glucose)
## [1] 2.644161e-36

The p-value is very close to zero, providing extremely strong evidence against the null hypothesis that the mean glucose level is the same in the diabetic and non-diabetic groups.

Conducting t-test based on BMI score:

# BMI values for each outcome group, compared with a two-sample t-test;
# the result is tidied into a one-row tibble and its columns listed.
diabetics1 <- diab_data %>%
  filter(Outcome == 1) %>%
  pull(BMI)
non_diabetics1 <- diab_data %>%
  filter(Outcome == 0) %>%
  pull(BMI)
t_test_result_BMI <- t.test(x = diabetics1, y = non_diabetics1)
tidy_result_BMI <- t_test_result_BMI %>% tidy()
tidy_result_BMI %>% glimpse()
## Rows: 1
## Columns: 10
## $ estimate    <dbl> 4.838337
## $ estimate1   <dbl> 35.14254
## $ estimate2   <dbl> 30.3042
## $ statistic   <dbl> 8.619317
## $ p.value     <dbl> 6.566238e-17
## $ parameter   <dbl> 573.4725
## $ conf.low    <dbl> 3.735811
## $ conf.high   <dbl> 5.940864
## $ method      <chr> "Welch Two Sample t-test"
## $ alternative <chr> "two.sided"
# Extract the BMI p-value from the tidied result and display it.
p_BMI <- tidy_result_BMI[["p.value"]]
print(p_BMI)
## [1] 6.566238e-17

Conducting a t-test based on Blood Pressure:

# Blood-pressure values for each outcome group, compared with a two-sample
# t-test; only the p-value is kept and printed.
diabetics2 <- diab_data %>%
  filter(Outcome == 1) %>%
  pull(BloodPressure)
non_diabetics2 <- diab_data %>%
  filter(Outcome == 0) %>%
  pull(BloodPressure)
t_test_result_BP <- t.test(x = diabetics2, y = non_diabetics2)
p_BP <- t_test_result_BP %>%
  tidy() %>%
  pull(p.value)
print(p_BP)
## [1] 0.08735425

Conducting a t-test based on number of Pregnancies:

# Pregnancy counts for each outcome group, compared with a two-sample t-test;
# only the p-value is kept and printed.
diabetics3 <- diab_data %>%
  filter(Outcome == 1) %>%
  pull(Pregnancies)
non_diabetics3 <- diab_data %>%
  filter(Outcome == 0) %>%
  pull(Pregnancies)
t_test_result_preg <- t.test(x = diabetics3, y = non_diabetics3)
p_preg <- t_test_result_preg %>%
  tidy() %>%
  pull(p.value)
print(p_preg)
## [1] 6.821926e-09

Conducting a t-test based on Age:

# Ages for each outcome group, compared with a two-sample t-test; only the
# p-value is kept and printed.
diabetics4 <- diab_data %>%
  filter(Outcome == 1) %>%
  pull(Age)
non_diabetics4 <- diab_data %>%
  filter(Outcome == 0) %>%
  pull(Age)
t_test_result_age <- t.test(x = diabetics4, y = non_diabetics4)
p_age <- t_test_result_age %>%
  tidy() %>%
  pull(p.value)
print(p_age)
## [1] 1.201513e-11

Conducting a t-test based on Skin Thickness:

# Skin-thickness values for each outcome group, compared with a two-sample
# t-test; only the p-value is kept and printed.
diabetics5 <- diab_data %>%
  filter(Outcome == 1) %>%
  pull(SkinThickness)
non_diabetics5 <- diab_data %>%
  filter(Outcome == 0) %>%
  pull(SkinThickness)
t_test_result_ST <- t.test(x = diabetics5, y = non_diabetics5)
p_ST <- t_test_result_ST %>%
  tidy() %>%
  pull(p.value)
print(p_ST)
## [1] 0.04935586
# Gather the six raw p-values. Element order: glucose, age, skin thickness,
# pregnancies, BMI, blood pressure. The printed lines follow the same order.
p_values <- c(
  p_glucose,
  p_age,
  p_ST,
  p_preg,
  p_BMI,
  p_BP
)
print(paste("Original p-value:", p_values, sep = " "))
## [1] "Original p-value: 2.6441613495396e-36" 
## [2] "Original p-value: 1.20151252731136e-11"
## [3] "Original p-value: 0.0493558600864805"  
## [4] "Original p-value: 6.82192560045718e-09"
## [5] "Original p-value: 6.56623762470887e-17"
## [6] "Original p-value: 0.0873542482146159"
# Correct the raw p-values for multiple testing with two methods:
# Benjamini-Hochberg ("BH") and Bonferroni. Element order matches p_values.
p_adjusted_bh <- p.adjust(p = p_values, method = "BH")
p_adjusted_bonferroni <- p.adjust(p = p_values, method = "bonferroni")
print(paste("BH adjusted p-values:", p_adjusted_bh, sep = " "))
## [1] "BH adjusted p-values: 1.58649680972376e-35"
## [2] "BH adjusted p-values: 2.40302505462273e-11"
## [3] "BH adjusted p-values: 0.0592270321037766"  
## [4] "BH adjusted p-values: 1.02328884006858e-08"
## [5] "BH adjusted p-values: 1.96987128741266e-16"
## [6] "BH adjusted p-values: 0.0873542482146159"
# Display the Bonferroni-adjusted p-values, one labelled string per test.
print(
  paste("Bonferroni adjusted p-value:", p_adjusted_bonferroni, sep = " ")
)
## [1] "Bonferroni adjusted p-value: 1.58649680972376e-35"
## [2] "Bonferroni adjusted p-value: 7.20907516386818e-11"
## [3] "Bonferroni adjusted p-value: 0.296135160518883"   
## [4] "Bonferroni adjusted p-value: 4.09315536027431e-08"
## [5] "Bonferroni adjusted p-value: 3.93974257482532e-16"
## [6] "Bonferroni adjusted p-value: 0.524125489287695"

Adjusted p-values help identify which tests remain significant after correcting for multiple comparisons. If, for example, a p-value is still less than 0.05 after adjustment, it is considered statistically significant at that threshold. Generally, Bonferroni is more stringent, which reduces Type I errors (false positives) but may increase Type II errors (false negatives). In this case the p-values for Glucose, Age, Number of Pregnancies and BMI remained far below 0.05 after the Bonferroni adjustment, indicating a strong association between these variables and diabetes. The same cannot be said for Skin Thickness and Blood Pressure, whose p-values exceed 0.05 after the adjustment.

Plotting boxplots of glucose and BMI levels for diabetics and non-diabetics:

# Boxplot of glucose by outcome group. Outcome is stored as a number (0/1),
# so it is converted to a factor for both x and fill; mapping the raw numeric
# column to fill would produce a continuous colour bar instead of two
# discrete legend keys.
ggplot(
  diab_data,
  aes(x = factor(Outcome), y = Glucose, fill = factor(Outcome))
) +
  geom_boxplot() +
  labs(
    x = "Outcome",
    y = "Glucose",
    fill = "Outcome",
    title = "Glucose Levels by Outcome"
  )

# Boxplot of BMI by outcome group. Outcome is stored as a number (0/1), so it
# is converted to a factor for both x and fill; mapping the raw numeric column
# to fill would produce a continuous colour bar instead of two discrete
# legend keys.
ggplot(
  diab_data,
  aes(x = factor(Outcome), y = BMI, fill = factor(Outcome))
) +
  geom_boxplot() +
  labs(
    x = "Outcome",
    y = "BMI",
    fill = "Outcome",
    title = "BMI Levels by Outcome"
  )

The significant differences in glucose and BMI levels between the diabetic (Outcome 1) and non-diabetic (Outcome 0) groups, as shown in the boxplots, highlight important public health considerations. Diabetic individuals had notably higher median glucose and BMI levels, which is consistent with these being established risk factors for Type 2 diabetes (both p-values remain below 0.05 after the Bonferroni adjustment). These findings emphasize the importance of regular glucose monitoring and weight management in diabetes prevention. Public health strategies should focus on early detection, promoting healthier diets and physical activity, and raising awareness about the strong connection between obesity, glucose levels, and diabetes. Targeted interventions for high-risk populations, especially those with higher BMI, are crucial for reducing the incidence of Type 2 diabetes.