Packages

Loading required packages:

# Install only the packages that are not already available, so that
# re-running the script does not reinstall all of them every time.
required_packages <- c(
  "readr", "dplyr", "psych", "ggplot2",
  "tidyr", "naniar", "visdat", "mice"
)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Attach the packages used in the analysis.
library(readr)
library(dplyr)
library(psych)
library(ggplot2)
library(tidyr)
library(naniar)
library(visdat)
library(mice)

Import the Data

# Read the diabetes data set from the Data/ folder and preview its first rows.
Student <- readr::read_csv(file = "Data/PimaIndiansDiabeteDatabase.csv")
head(Student, n = 6L)
## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1           6     148            72            35       0  33.6
## 2           1      85            66            29       0  26.6
## 3           8     183            64             0       0  23.3
## 4           1      89            66            23      94  28.1
## 5           0     137            40            35     168  43.1
## 6           5     116            74             0       0  25.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Here is what the columns in the data mean:

Column Details
Pregnancies Number of times pregnant
Glucose Plasma glucose concentration at 2 hours in an oral glucose tolerance test
BloodPressure Diastolic blood pressure (mm Hg)
SkinThickness Triceps skin fold thickness (mm)
Insulin 2-Hour serum insulin (mu U/ml)
BMI Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction Diabetes pedigree function
Age Age (years)
Outcome Class variable (0=No or 1=Yes)

Question 1 (10 marks)

Outcome is a categorical variable: 1 means Yes and 0 means No, so convert it into a factor data type. Now calculate the mean, standard deviation, minimum, 1st quartile, median, 3rd quartile, maximum, and inter-quartile range of all the numeric columns in the dataset. Do you see any anomalies? Write your comment.

# Recode Outcome from 0/1 to a factor labelled "No"/"Yes", then inspect the
# structure of the data to confirm the new column type.
Student$Outcome <- factor(
  Student[["Outcome"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
str(Student)
## spc_tbl_ [768 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : Factor w/ 2 levels "No","Yes": 2 1 2 1 2 1 2 1 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Pregnancies = col_double(),
##   ..   Glucose = col_double(),
##   ..   BloodPressure = col_double(),
##   ..   SkinThickness = col_double(),
##   ..   Insulin = col_double(),
##   ..   BMI = col_double(),
##   ..   DiabetesPedigreeFunction = col_double(),
##   ..   Age = col_double(),
##   ..   Outcome = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Tabulate the Outcome classes and express the counts as shares of the total.
outcome_counts <- table(Student[["Outcome"]])
outcome_proportions <- outcome_counts / sum(outcome_counts)

print(outcome_counts)
## 
##  No Yes 
## 500 268
print(outcome_proportions)
## 
##        No       Yes 
## 0.6510417 0.3489583
summary(Student[["Outcome"]])
##  No Yes 
## 500 268
# Keep only the numeric columns. vapply() (rather than sapply()) guarantees
# that the column mask is a logical vector.
numeric_cols <- Student[vapply(Student, is.numeric, logical(1))]

# Apply one summary function to every numeric column, ignoring NAs.
# Working column by column avoids apply(), which would first coerce the data
# frame to a matrix.
#   stat_fn: function returning a single number for a numeric vector
#   ...:     further arguments forwarded to stat_fn (e.g. probs for quantile)
# Returns a numeric vector named by column.
column_stat <- function(stat_fn, ...) {
  vapply(numeric_cols, stat_fn, numeric(1), ..., na.rm = TRUE)
}

# One row per numeric column, one column per statistic.
summary_stats <- data.frame(
  mean = column_stat(mean),
  sd = column_stat(sd),
  min = column_stat(min),
  q1 = column_stat(quantile, probs = 0.25),
  median = column_stat(median),
  q3 = column_stat(quantile, probs = 0.75),
  max = column_stat(max),
  iqr = column_stat(IQR)
)

print(summary_stats)
##                                 mean          sd    min       q1   median
## Pregnancies                3.8450521   3.3695781  0.000  1.00000   3.0000
## Glucose                  120.8945312  31.9726182  0.000 99.00000 117.0000
## BloodPressure             69.1054688  19.3558072  0.000 62.00000  72.0000
## SkinThickness             20.5364583  15.9522176  0.000  0.00000  23.0000
## Insulin                   79.7994792 115.2440024  0.000  0.00000  30.5000
## BMI                       31.9925781   7.8841603  0.000 27.30000  32.0000
## DiabetesPedigreeFunction   0.4718763   0.3313286  0.078  0.24375   0.3725
## Age                       33.2408854  11.7602315 21.000 24.00000  29.0000
##                                 q3    max      iqr
## Pregnancies                6.00000  17.00   5.0000
## Glucose                  140.25000 199.00  41.2500
## BloodPressure             80.00000 122.00  18.0000
## SkinThickness             32.00000  99.00  32.0000
## Insulin                  127.25000 846.00 127.2500
## BMI                       36.60000  67.10   9.3000
## DiabetesPedigreeFunction   0.62625   2.42   0.3825
## Age                       41.00000  81.00  17.0000

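Your comment:

Yes. Glucose, BloodPressure, SkinThickness, Insulin, and BMI all have a minimum of 0, which is not physiologically possible for a living person, so these zeros most likely stand for missing measurements rather than real values. For SkinThickness and Insulin even the 1st quartile is 0, so at least a quarter of the observations are affected. Insulin is also strongly right-skewed (mean 79.8 vs. median 30.5, maximum 846). A maximum of 17 pregnancies is unusual but possible.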

How many people have Outcome = Yes?

# Number of observations whose Outcome is "Yes".

table(Student[["Outcome"]])["Yes"]
## Yes 
## 268

Question 2 (40 marks)

Create histograms for all the numeric variables in the data using ggplot2.

# Reshape the numeric columns to long format (one row per variable/value
# pair) so that a single ggplot call can draw one panel per variable.
# select(where(...)) replaces the superseded select_if().
df_long <- Student %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")

# Density-scaled histograms, one facet per variable, each with its own axes.
# bins = 30 is the ggplot2 default; stating it explicitly silences the
# "stat_bin() using bins = 30" message without changing the plot.
ggplot(df_long, aes(x = value)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "skyblue",
    color = "black"
  ) +
  facet_wrap(~ variable, scales = "free") +
  labs(
    title = "Histograms of Numeric Variables",
    x = "Value",
    y = "Density"
  ) +
  theme_bw() +
  theme(strip.text = element_text(size = 12)) # larger facet labels

Question 3 (20 marks)

Create boxplots of Glucose, Insulin, BMI, and Age by Outcome using ggplot2.

# Variables to compare between the two Outcome groups.
variables_to_plot <- c("Glucose", "Insulin", "BMI", "Age")

# Build a boxplot of one column of Student, split by Outcome.
#   variable:   name of the column to plot, as a string
#   fill_color: fill colour of the boxes
# Returns a ggplot object.
make_outcome_boxplot <- function(variable, fill_color) {
  ggplot(Student, aes(x = Outcome, y = .data[[variable]])) +
    geom_boxplot(fill = fill_color, color = "black") +
    labs(
      title = paste("Boxplot of", variable, "by Outcome"),
      x = "Outcome",
      y = variable
    ) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5)) # centre the title
}

# One plot per variable, each with its own fill colour.
p1 <- make_outcome_boxplot(variables_to_plot[1], "skyblue")
p2 <- make_outcome_boxplot(variables_to_plot[2], "lightgreen")
p3 <- make_outcome_boxplot(variables_to_plot[3], "lightcoral")
p4 <- make_outcome_boxplot(variables_to_plot[4], "lightyellow")

# Print the plots
p1

p2

p3

p4

Question 4 (10 marks)

Replace 0 with NA in the variables where a value of 0 does not make sense.

# Columns in which a value of 0 does not make sense and is therefore treated
# as a missing measurement.
cols_to_replace <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)

# Replace 0 with NA in those columns in a single pass. The original ran the
# same replacement three times (loop, lapply, mutate); one pass gives the
# same result.
Student <- Student %>%
  mutate(across(all_of(cols_to_replace), ~ na_if(.x, 0)))

# df is kept as a second name for the cleaned data.
df <- Student

summary(df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  NA's   :374      NA's   :11                                              
##  Outcome  
##  No :500  
##  Yes:268  
##           
##           
##           
##           
## 
Student %>% is.na() %>% colSums() # NA count per column
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        5                       35 
##            SkinThickness                  Insulin                      BMI 
##                      227                      374                       11 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

Question 5 (10 marks)

Use the naniar package to inspect the number of missing values in the data after replacing 0s with NAs.

# Plot of the missing-value patterns across variables (naniar).
Student %>% gg_miss_upset()

# Count and percentage of missing values for each variable (naniar).
Student %>% miss_var_summary()
## # A tibble: 9 × 3
##   variable                 n_miss pct_miss
##   <chr>                     <int>    <num>
## 1 Insulin                     374   48.7  
## 2 SkinThickness               227   29.6  
## 3 BloodPressure                35    4.56 
## 4 BMI                          11    1.43 
## 5 Glucose                       5    0.651
## 6 Pregnancies                   0    0    
## 7 DiabetesPedigreeFunction      0    0    
## 8 Age                           0    0    
## 9 Outcome                       0    0