Loading required packages:
# Install only the packages that are not available yet, so that re-running or
# knitting this document does not reinstall all of them every time.
required_pkgs <- c(
  "readr", "dplyr", "psych", "ggplot2",
  "tidyr", "naniar", "visdat", "mice"
)
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(readr)
library(dplyr)
library(psych)
library(ggplot2)
library(tidyr)
library(naniar)
library(visdat)
library(mice)
# Read the Pima Indians diabetes CSV into a tibble. It is stored under the
# name `Student`, which the rest of the analysis refers to. Then preview the
# first six rows.
Student <- read_csv(file = "Data/PimaIndiansDiabeteDatabase.csv")
head(Student, n = 6L)
## # A tibble: 6 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
Here is what the columns in the data mean:
| Column | Details |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| BloodPressure | Diastolic blood pressure (mm Hg) |
| SkinThickness | Triceps skin fold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| DiabetesPedigreeFunction | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Class variable (0=No or 1=Yes) |
Outcome is a categorical variable (1 means Yes, 0 means No),
so convert it into a factor data type. Then calculate the mean,
standard deviation, minimum, 1st quartile, median, 3rd quartile,
maximum, and inter-quartile range of all the numeric columns in the
dataset. Do you see any anomalies? Write your comment.
# Your code here
# Recode the 0/1 outcome as a labelled factor (0 -> "No", 1 -> "Yes"),
# then inspect the structure of the data to confirm the new column type.
Student[["Outcome"]] <- factor(
  Student[["Outcome"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
str(Student)
## spc_tbl_ [768 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : Factor w/ 2 levels "No","Yes": 2 1 2 1 2 1 2 1 2 2 ...
## - attr(*, "spec")=
## .. cols(
## .. Pregnancies = col_double(),
## .. Glucose = col_double(),
## .. BloodPressure = col_double(),
## .. SkinThickness = col_double(),
## .. Insulin = col_double(),
## .. BMI = col_double(),
## .. DiabetesPedigreeFunction = col_double(),
## .. Age = col_double(),
## .. Outcome = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Tabulate the outcome classes and express the counts as shares of the total.
outcome_counts <- table(Student[["Outcome"]])
outcome_proportions <- outcome_counts / sum(outcome_counts)
print(outcome_counts)
##
## No Yes
## 500 268
print(outcome_proportions)
##
## No Yes
## 0.6510417 0.3489583
# summary() on a factor reports the per-level counts as well.
summary(Student[["Outcome"]])
## No Yes
## 500 268
# Keep only the numeric columns; Outcome has been converted to a factor above
# and is therefore left out.
# vapply() replaces sapply()/apply(): it fixes the result type up front and
# works column by column instead of coercing the data frame to a matrix.
numeric_cols <- Student[vapply(Student, is.numeric, logical(1))]

# One row per variable, one column per statistic. Missing values are ignored
# in every statistic. `names = FALSE` drops the "25%"/"75%" labels that
# quantile() would otherwise attach to each value.
summary_stats <- data.frame(
  mean = vapply(numeric_cols, mean, numeric(1), na.rm = TRUE),
  sd = vapply(numeric_cols, sd, numeric(1), na.rm = TRUE),
  min = vapply(numeric_cols, min, numeric(1), na.rm = TRUE),
  q1 = vapply(
    numeric_cols, quantile, numeric(1),
    probs = 0.25, na.rm = TRUE, names = FALSE
  ),
  median = vapply(numeric_cols, median, numeric(1), na.rm = TRUE),
  q3 = vapply(
    numeric_cols, quantile, numeric(1),
    probs = 0.75, na.rm = TRUE, names = FALSE
  ),
  max = vapply(numeric_cols, max, numeric(1), na.rm = TRUE),
  iqr = vapply(numeric_cols, IQR, numeric(1), na.rm = TRUE)
)
print(summary_stats)
## mean sd min q1 median
## Pregnancies 3.8450521 3.3695781 0.000 1.00000 3.0000
## Glucose 120.8945312 31.9726182 0.000 99.00000 117.0000
## BloodPressure 69.1054688 19.3558072 0.000 62.00000 72.0000
## SkinThickness 20.5364583 15.9522176 0.000 0.00000 23.0000
## Insulin 79.7994792 115.2440024 0.000 0.00000 30.5000
## BMI 31.9925781 7.8841603 0.000 27.30000 32.0000
## DiabetesPedigreeFunction 0.4718763 0.3313286 0.078 0.24375 0.3725
## Age 33.2408854 11.7602315 21.000 24.00000 29.0000
## q3 max iqr
## Pregnancies 6.00000 17.00 5.0000
## Glucose 140.25000 199.00 41.2500
## BloodPressure 80.00000 122.00 18.0000
## SkinThickness 32.00000 99.00 32.0000
## Insulin 127.25000 846.00 127.2500
## BMI 36.60000 67.10 9.3000
## DiabetesPedigreeFunction 0.62625 2.42 0.3825
## Age 41.00000 81.00 17.0000
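Your comment: Glucose, BloodPressure, SkinThickness, Insulin, and BMI all have a minimum of 0, which is not a plausible measurement for these variables, so the zeros most likely stand for missing values. SkinThickness and Insulin even have a 1st quartile of 0, meaning at least a quarter of their values are affected. The maxima of Insulin (846) and SkinThickness (99) also lie far above the 3rd quartile and look like outliers.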
How many people have Outcome = Yes?
# Your code here
# Pick the "Yes" entry out of the outcome frequency table.
outcome_counts[names(outcome_counts) == "Yes"]
## Yes
## 268
Create histograms for all the numeric variables in the data using ggplot2.
# Your code here
# Reshape the numeric columns to long format (one row per variable/value
# pair) so that a single ggplot call can draw one histogram per variable.
# select(where(...)) replaces the superseded select_if().
df_long <- Student %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")

# Density-scaled histograms, one facet per variable. Each facet gets its own
# axes (scales = "free"). `bins = 30` spells out the geom_histogram() default,
# so the plot is unchanged but the "pick better value" message is not emitted.
ggplot(df_long, aes(x = value)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "skyblue",
    color = "black"
  ) +
  facet_wrap(~ variable, scales = "free") +
  labs(
    title = "Histograms of Numeric Variables",
    x = "Value",
    y = "Density"
  ) +
  theme_bw() +
  theme(strip.text = element_text(size = 12)) # Larger facet labels
Create boxplots of Glucose, Insulin, BMI, and Age by Outcome using ggplot2.
# Your code here
# Build a boxplot of one variable split by Outcome.
#
# data:     data frame containing an `Outcome` column and the column `variable`
# variable: name of the column to plot on the y-axis (character scalar)
# fill:     fill colour of the boxes
# Returns the ggplot object without printing it.
boxplot_by_outcome <- function(data, variable, fill) {
  ggplot(data, aes(x = Outcome, y = .data[[variable]])) +
    geom_boxplot(fill = fill, color = "black") +
    labs(
      title = paste("Boxplot of", variable, "by Outcome"),
      x = "Outcome",
      y = variable
    ) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5)) # Centre the title
}

# Variables to plot and the fill colour used for each of them
variables_to_plot <- c("Glucose", "Insulin", "BMI", "Age")
box_fills <- c("skyblue", "lightgreen", "lightcoral", "lightyellow")

# One plot per variable, kept under the names p1..p4
p1 <- boxplot_by_outcome(Student, variables_to_plot[1], box_fills[1])
p2 <- boxplot_by_outcome(Student, variables_to_plot[2], box_fills[2])
p3 <- boxplot_by_outcome(Student, variables_to_plot[3], box_fills[3])
p4 <- boxplot_by_outcome(Student, variables_to_plot[4], box_fills[4])

# Print the plots
p1
p2
p3
p4
Replace 0 with NA in the variables where a value of 0 does not make sense.
# Your code here
# Columns where a value of 0 does not make sense as a measurement, so a 0 is
# treated as a missing value. Pregnancies is not listed: 0 is a valid count.
cols_to_replace <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)

# Recode 0 -> NA once, in a single vectorised step. This replaces three
# equivalent passes (a for loop, lapply(), and mutate()) over the same columns.
Student <- Student %>%
  mutate(across(all_of(cols_to_replace), ~ na_if(.x, 0)))

# `df` is kept as a copy of the cleaned data so that code referring to either
# name keeps working.
df <- Student
summary(df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## Outcome
## No :500
## Yes:268
##
##
##
##
##
# Count the missing values in each column.
Student %>%
  is.na() %>%
  colSums()
## Pregnancies Glucose BloodPressure
## 0 5 35
## SkinThickness Insulin BMI
## 227 374 11
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
Use the naniar package to inspect the number of missing values in the data after replacing 0s with NAs.
# Your code here
# naniar: plot of the combinations of variables that are missing together.
Student %>%
  gg_miss_upset()
# naniar: number and percentage of missing values for each variable.
Student %>%
  miss_var_summary()
## # A tibble: 9 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 Insulin 374 48.7
## 2 SkinThickness 227 29.6
## 3 BloodPressure 35 4.56
## 4 BMI 11 1.43
## 5 Glucose 5 0.651
## 6 Pregnancies 0 0
## 7 DiabetesPedigreeFunction 0 0
## 8 Age 0 0
## 9 Outcome 0 0