library(haven)
library(sas7bdat)
library(ggplot2)
library(dplyr)
library(pubh)
library(ggpubr)
library(data.table)
library(gridExtra)
library(grid)
library(vcd)
library(PASSED)
#Presets for graphs
conf.plot <- theme(
plot.title = element_text(size = 12),
axis.text = element_text(size = 9),
axis.title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(size = 0.5),
legend.position = c(.9, .2),
legend.text = element_text(size = 8)
)
link to access the codes: CODES
The data considered in this assignment are longitudinal measurements of the body mass index (BMI, kg/m2 ) for a sample of Belgian employees who are invited for yearly medical examinations. The employees are measured at a maximum of 10 occasions. The aim of the study is:
To investigate how BMI evolves over time.
Whether the evolution is different for males and females.
Whether the evolution depends on the age of the employee at the start of the study.
Information about smoking behavior is collected as well.
Describe the data, and use graphical techniques to explore the mean structure, the variance structure and the correlation structure.
Summarize your conclusions.
What are the implications with respect to statistical modeling ?
The dataset under consideration comprises a collection of longitudinal measurements of the Body Mass Index (BMI) for a group of Belgian employees who undergo annual medical examinations for a period of maximum 10 years.
The dataset consists of:
14,975 observations, which correspond to 4,425 unique subjects.
25% female and 75% male subjects.
For females, the mean age is 30.73 years (ranging from 15 to 69), with an average BMI of 24.01 \(kg/m^2\).
Male subjects have a mean age of 32.36 years (ranging from 14 to 66) and an average BMI of 24.95 $kg/m^2
Furthermore, 38% of females and 45% of males (1495) reported smoking at least once throughout the study.
# Reads dataset
data <- read_sas("bmilda.sas7bdat")
# Changing the sex / smoking to factor variables
data[c("smoking", "sex")] <- lapply(data[c("smoking", "sex")], factor)
# Part 1: Summary statistics for original data
result <- data %>%
group_by(sex) %>%
summarise(
unique_id_count = n_distinct(id),
percent = round(n_distinct(id) / length(unique(data$id)), 2),
min_age = min(fage),
max_age = max(fage),
min_bmi = min(bmi),
max_bmi = max(bmi)
)
# Print the summary statistics for the original data
print(data.frame(result))
sex unique_id_count percent min_age max_age min_bmi max_bmi
1 0 1091 0.25 15 69 14.51837 60.59557
2 1 3334 0.75 14 66 12.48400 49.48097
# Part 2: Creating a new dataframe with only unique measurements per subject
uniquedf <- data.frame(matrix(ncol = 6, nrow = 0))
# Loop through unique subject IDs and extract the first row for each
for (i in unique(data$id)) {
new_row <- data[data$id == i, ][1, ]
uniquedf <- rbind(uniquedf, new_row)
}
# Part 3: Summary statistics for unique data
result2 <- uniquedf %>%
group_by(sex) %>%
summarise(
mean_age = mean(fage),
mean_bmi = mean(bmi)
)
# Print the summary statistics for the unique data
print(data.frame(result2))
sex mean_age mean_bmi
1 0 30.73236 24.00837
2 1 32.36203 24.95177
TABLE: people who have smoked at least once
# Subset data for individuals who smoked at least once
resultsm <- data[data$smoking == 1, ] %>%
group_by(sex) %>%
summarise(
unique_id_count = n_distinct(id)
)
# Print the summary statistics for individuals who smoked at least once
print(data.frame(resultsm))
sex unique_id_count
1 0 412
2 1 1495
3 <NA> 1
Check if the data set is balanced
# Check if all individuals have the same time point
individuals_same_time <- tapply(data$time, data$id, function(x) length(unique(x)) == 1)
# Check if all entries are TRUE, indicating that all individuals have the same time point
all_same_time <- all(individuals_same_time)
# Print the result
if (all_same_time) {
cat("All individuals have their BMI measured at the same time point.\n")
} else {
cat("Not all individuals have their BMI measured at the same time point.\n")
}
Not all individuals have their BMI measured at the same time point.
Explore data per age group and sex
HISTOGRAM of initial age divided by gender.
# Create a histogram using ggplot2 for unique data
ggplot(uniquedf, aes(x = fage, fill = factor(sex))) +
geom_histogram(binwidth = 5, alpha = 0.5, position = "stack") +
# Add labels to the axes
labs(x = "Age", y = "Frequency") +
# Customize the fill colors based on the "sex" variable
scale_fill_manual(values = c("0" = "red", "1" = "blue")) +
# Add a title to the plot
ggtitle("Histogram of Ages by Gender") + conf.plot
# Group the data into age groups and calculate summary statistics
# Summary per group and gender
summary_stats <- uniquedf %>%
mutate(Age_Group = cut(fage, breaks = c(0, 19, 39, 59, Inf))) %>%
group_by(Age_Group, sex) %>%
summarise(
Count = n(),
mean_age = mean(fage),
Mean_BMI = mean(bmi, na.rm = TRUE),
Median_BMI = median(bmi, na.rm = TRUE),
SD_BMI = sd(bmi, na.rm = TRUE),
Min_BMI = min(bmi, na.rm = TRUE),
Max_BMI = max(bmi, na.rm = TRUE)
)
print("Summary per group and gender")
[1] "Summary per group and gender"
data.frame(summary_stats)
| Age_Group | sex | Count | mean_age | Mean_BMI | Median_BMI | SD_BMI | Min_BMI | Max_BMI |
|---|---|---|---|---|---|---|---|---|
| (0,19] | 0 | 98 | 17.9 | 22.2 | 21.9 | 3.46 | 16.2 | 36.7 |
| (0,19] | 1 | 209 | 17.9 | 22.6 | 22.2 | 3.43 | 14.3 | 38.4 |
| (19,39] | 0 | 779 | 28.3 | 23.8 | 23Â Â | 4.33 | 17.1 | 60.6 |
| (19,39] | 1 | 2328 | 28.2 | 24.5 | 24.1 | 3.57 | 14.5 | 49.5 |
| (39,59] | 0 | 209 | 45Â Â | 25.7 | 25.2 | 4.27 | 17.3 | 44.5 |
| (39,59] | 1 | 773 | 47.9 | 26.8 | 26.3 | 3.74 | 18.1 | 44.6 |
| (59,Inf] | 0 | 5 | 63.2 | 26.4 | 24.6 | 5.45 | 20.2 | 33.1 |
| (59,Inf] | 1 | 24 | 62.1 | 27.1 | 26.8 | 2.88 | 22.7 | 32.2 |
# Summary ONLY per group
summary_stats_overal <- uniquedf %>%
mutate(Age_Group = cut(fage, breaks = c(0, 19, 39, 59, Inf))) %>%
group_by(Age_Group) %>%
summarise(
Count = n(),
mean_age = mean(fage),
Mean_BMI = mean(bmi, na.rm = TRUE),
Median_BMI = median(bmi, na.rm = TRUE),
SD_BMI = sd(bmi, na.rm = TRUE),
Min_BMI = min(bmi, na.rm = TRUE),
Max_BMI = max(bmi, na.rm = TRUE)
)
print("Summary per group")
[1] "Summary per group"
data.frame(summary_stats_overal)
| Age_Group | Count | mean_age | Mean_BMI | Median_BMI | SD_BMI | Min_BMI | Max_BMI |
|---|---|---|---|---|---|---|---|
| (0,19] | 307 | 17.9 | 22.5 | 22.1 | 3.44 | 14.3 | 38.4 |
| (19,39] | 3107 | 28.2 | 24.3 | 23.8 | 3.79 | 14.5 | 60.6 |
| (39,59] | 982 | 47.3 | 26.6 | 26.2 | 3.88 | 17.3 | 44.6 |
| (59,Inf] | 29 | 62.3 | 27Â Â | 26.3 | 3.34 | 20.2 | 33.1 |
# Combine the two summary datasets
combined_summary <- rbind(summary_stats, summary_stats_overal)
combined_summary <- combined_summary[, c("Age_Group", "sex", "Mean_BMI")]
# Handle missing sex values
combined_summary$sex <- ifelse(is.na(combined_summary$sex), 3, combined_summary$sex)
combined_summary$sex <- as.factor(combined_summary$sex)
# Create a plot using ggplot2
ggplot(combined_summary, aes(x = Age_Group, y = Mean_BMI, color = factor(sex))) +
geom_point() +
geom_line(aes(group = factor(sex))) +
labs(x = "Age Group", y = "Mean BMI (kg/m^2)") +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "black"), labels = c("Female", "Male", "Overall")) +
ggtitle("Mean BMI vs Age Group and Gender") + conf.plot
Mean Structure The main focus of this study revolved around the time and age variables. Nevertheless, because the data exhibited an unbalance, it was impractical to present the mean bmi at every time point. Consequently, a viable strategy was employed to segment the time and age variables into intervals and compute the average bmi within each group. The time interval was divided into 5 groups of 2 years intervals. 0-2 years with mid point 1, 2-4 years with mid point 1, 4-6 years with mid point 1 .
# Define your time intervals (modify as needed)
intervals <- c(0, 2, 4, 6, 8, 10)
class_ranges <- c(0, 19, 39, 59, Inf)
class_labels <- c("Youth",
"Early adult",
"Middle adult",
"Senior")
# Create a new variable 'TimeGroup' based on the intervals for the original dataset
data <- data %>%
mutate(TimeGroup = cut(time, breaks = intervals, labels = FALSE, include.lowest = TRUE))
# Create a new variable 'age_class' based on the age ranges
data$age_class <- cut(data$fage, breaks = class_ranges, labels = class_labels, include.lowest = TRUE)
# Adjust the 'TimeGroup' variable
data$TimeGroup <- data$TimeGroup * 2
# Create a new variable 'TimeGroup' based on the intervals for the unique dataset
uniquedf <- uniquedf %>%
mutate(TimeGroup = cut(time, breaks = intervals, labels = FALSE, include.lowest = TRUE))
# Create a new variable 'age_class' based on the age ranges for the unique dataset
uniquedf$age_class <- cut(uniquedf$fage, breaks = class_ranges, labels = class_labels, include.lowest = TRUE)
# Calculate summary statistics for Meanbmi and SEbmi
summary_data1 <- data %>%
group_by(TimeGroup) %>%
summarize(Meanbmi = mean(bmi), SEbmi = sd(bmi) / sqrt(n()))
# Merge summary_data1 back into the original data
summary_data1 <- merge(data, summary_data1)
# Calculate summary statistics for Meanbmi_sex and SEbmi_sex
summary_data2 <- data %>%
group_by(sex, TimeGroup) %>%
summarize(Meanbmi_sex = mean(bmi), SEbmi_sex = sd(bmi) / sqrt(n()))
# Merge summary_data2 back into the data
data <- merge(summary_data1, summary_data2, by = c("sex", "TimeGroup"), all = FALSE)
# Ggplot
ggplot(data, aes(x = TimeGroup, y = Meanbmi, group = age_class)) +
geom_errorbar(aes(ymin = Meanbmi - SEbmi, ymax = Meanbmi + SEbmi), width = 0.2, color = "black") +
geom_line(color = "black") +
geom_point(size = 3, color = "black") +
labs(x = "Time Intervals", y = "Mean BMI") +
theme_minimal() +
# Adding two lines for Meanbmi_sex by sex
geom_line(data = subset(data, sex == 0), aes(x = TimeGroup, y = Meanbmi_sex), color = "red") +
geom_line(data = subset(data, sex == 1), aes(x = TimeGroup, y = Meanbmi_sex), color = "blue") +
geom_point(data = subset(data, sex == 0), aes(x = TimeGroup, y = Meanbmi_sex), color = "red", size = 3) +
geom_point(data = subset(data, sex == 1), aes(x = TimeGroup, y = Meanbmi_sex), color = "blue", size = 3) +
# Design of the plot
labs(x = "Time Interval (years)", y = "Mean BMI (kg/m^2)") +
ggtitle("Mean BMI vs Time Intervals") +
scale_color_manual(values = c("Male" = "blue", "Female" = "red", "Overall" = "black")) +
labs(color = "Sex") +
conf.plot
Age.df<-data %>%
group_by(age_class) %>%
summarise(meanbmi_age = mean(bmi, na.rm = TRUE),
sebmi_age = sd(bmi, na.rm = TRUE) / sqrt(n()))
# Plot the results
ggplot(data = Age.df, aes(x = age_class, y = meanbmi_age)) +
geom_errorbar(aes(ymin = meanbmi_age - sebmi_age, ymax = meanbmi_age + sebmi_age), width = 0.3, color = "darkblue") +
labs(title = "Mean BMI by AgeGroup with SE",
x = "Age class",
y = "Mean BMI (kg/m^2)")+
conf.plot
head(data)
| sex | TimeGroup | id | time | bmi | fage | smoking | age_class | Meanbmi | SEbmi | Meanbmi_sex | SEbmi_sex |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | 4.21e+03 | 9 | 21.2 | 33 | 0 | Early adult | 26.2 | 0.149 | 25.1 | 0.358 |
| 0 | 10 | 3.92e+03 | 9 | 23.5 | 27 | Early adult | 26.2 | 0.149 | 25.1 | 0.358 | |
| 0 | 10 | 4.84e+03 | 9 | 42.9 | 26 | 0 | Early adult | 26.2 | 0.149 | 25.1 | 0.358 |
| 0 | 10 | 2.07e+03 | 9 | 24.7 | 21 | 0 | Early adult | 26.2 | 0.149 | 25.1 | 0.358 |
| 0 | 10 | 4.94e+03 | 9 | 30.9 | 30 | 0 | Early adult | 26.2 | 0.149 | 25.1 | 0.358 |
| 0 | 10 | 4.59e+03 | 9 | 28.7 | 37 | 0 | Early adult | 26.2 | 0.149 | 25.1 | 0.358 |
set.seed(123)
# Randomly select 500 unique IDs (ADDITION: sample per group with more than )
df.random <- list()
for (i in unique(data$age_class)) {
temp <- data[data$age_class == i & data$time > 6, ]
smpl <-ifelse(length(unique(temp$id)) > 49, 50, length(unique(temp$id)))
# Randomly select 50
sampled_ids <- sample(unique(temp$id), smpl)
sampled_data <- data %>% filter(id %in% sampled_ids)
df.random <- rbind(df.random, sampled_data)
}
ggplot(df.random, aes(x = time, y = bmi, color = as.factor(id))) +
geom_line() +
labs(x = "Time (years)", y = "BMI (kg/m^2)") +
ggtitle("Individual Profiles of BMI over Time by Age Class") +
facet_wrap(~ age_class, ncol = 4, scales = "free_y") +
theme(legend.position = "none") # Hide the legend
# Create a ggplot object for BMI changes over time
gg <- ggplot(data, aes(x = TimeGroup, y = bmi, color = as.factor(sex))) +
geom_smooth(method = "loess", span = 0.8, se = FALSE, color = "darkblue", size = 0.7) + # Add Lowess trend line
geom_point(stat = "summary", fun = mean, color = "red", size = 1) + # Add points for mean sqr.res
facet_grid(sex ~ age_class) +
labs(x = "Time Interval (years)", y = "BMI (kg/m^2)") +
ggtitle("BMI Changes Over Time by Gender and Age Group") + theme_bw()
# Display the plot
gg
# Define age classes .
age_classes <- c("Youth", "Early adult", "Middle adult", "Senior")
# Create an empty list to store the plots
plot_list <- list()
# Initialize a counter
i <- 1
lab.y <- "Squared residuals"
lab.x <- "Time"
# Loop through age classes
for (sex in 0:1) {
lab.y <- "Squared residuals"
for (age in age_classes) {
# Subset the data for the specific sex and age class
temp <- data[data$sex == sex & data$age_class == age,]
# Fit a Loess model to the data
loess_fit <- loess(bmi ~ TimeGroup, temp, span = 0.8) #Predict of temp data
loess_fit <- predict(loess_fit)
sqrdres <- as.data.frame((temp[,"bmi"]- loess_fit)^2)
colnames(sqrdres) <- "sqr.res"
temp <- cbind(temp, loess_fit, sqrdres)
# Create a ggplot for the residuals
g.plot <- ggplot(temp, aes(x = TimeGroup, y = sqr.res)) +
geom_smooth(method = "loess", span = 0.8, se = FALSE, color = "black", size = 0.7) +
geom_point(stat = "summary", fun = mean, color = "red", size = 1) + # Add points for mean sqr.res
ggtitle(paste0('Sex: ', sex, " Age Group: ", age)) +
labs(x = lab.x, y = lab.y) +
theme_bw()
# Store the ggplot in the list
plot_list[i] <- grob(g.plot)
# Increment the counter
i <- i + 1
lab.y <-""
}
}
gridExtra::grid.arrange(grobs = plot_list, nrow = 2)
*** MEAN BMI VS TIME and SQUARED RESIDUALES VS TIME no groupping***
# Fit a Loess model to the data
loess_fit <- loess(bmi ~ TimeGroup, data, span = 0.8) #Predict of temp data
loess_fit <- predict(loess_fit)
sqrdres <- as.data.frame((data[,"bmi"]- loess_fit)^2)
colnames(sqrdres) <- "sqr.res"
temp <- cbind(data, loess_fit, sqrdres)
# Create a ggplot for the residuals
g.plot <- ggplot(temp, aes(x = TimeGroup, y = sqr.res)) +
geom_smooth(method = "loess", span = 0.7, se = FALSE, color = "black", size = 0.7) +
geom_point(stat = "summary", fun = mean, color = "red", size = 2) + # Add points for mean sqr.res
theme_bw()
g.plot