BMIDATA

BMI DATA

link to access the codes: CODES

The data considered in this assignment are longitudinal measurements of the body mass index (BMI, kg/m2 ) for a sample of Belgian employees who are invited for yearly medical examinations. The employees are measured at a maximum of 10 occasions. The aim of the study is:

To investigate how BMI evolves over time.
Whether the evolution is different for males and females.
Whether the evolution depends on the age of the employee at the start of the study.

Information about smoking behavior is collected as well.

Task 1

Describe the data, and use graphical techniques to explore the mean structure, the variance structure and the correlation structure.
Summarize your conclusions.
What are the implications with respect to statistical modeling ?

DATA DESCRIPTION

The dataset under consideration comprises a collection of longitudinal measurements of the Body Mass Index (BMI) for a group of Belgian employees who undergo annual medical examinations for a period of maximum 10 years.

The dataset consists of:

14,975 observations, which correspond to 4,425 unique subjects.
25% female and 75% male subjects.
For females, the mean age is 30.73 years (ranging from 15 to 69), with an average BMI of 24.01 $kg/m^2$.
Male subjects have a mean age of 32.36 years (ranging from 14 to 66) and an average BMI of 24.95 $kg/m^2
Furthermore, 38% of females and 45% of males (1495) reported smoking at least once throughout the study.

# Reads dataset
data <- read_sas("bmilda.sas7bdat")

# Changing the sex / smoking to factor variables
data[c("smoking", "sex")] <- lapply(data[c("smoking", "sex")], factor)

# Part 1: Summary statistics for original data
result <- data %>%
  group_by(sex) %>%
  summarise(
    unique_id_count = n_distinct(id),
    percent = round(n_distinct(id) / length(unique(data$id)), 2),
    min_age = min(fage),
    max_age = max(fage),
    min_bmi = min(bmi),
    max_bmi = max(bmi)
  )

# Print the summary statistics for the original data
print(data.frame(result))

  sex unique_id_count percent min_age max_age  min_bmi  max_bmi
1   0            1091    0.25      15      69 14.51837 60.59557
2   1            3334    0.75      14      66 12.48400 49.48097

# Part 2: Creating a new dataframe with only unique measurements per subject
uniquedf <- data.frame(matrix(ncol = 6, nrow = 0))

# Loop through unique subject IDs and extract the first row for each
for (i in unique(data$id)) {
  new_row <- data[data$id == i, ][1, ]
  uniquedf <- rbind(uniquedf, new_row)
}

# Part 3: Summary statistics for unique data
result2 <- uniquedf %>%
  group_by(sex) %>%
  summarise(
    mean_age = mean(fage),
    mean_bmi = mean(bmi)
  )

# Print the summary statistics for the unique data
print(data.frame(result2))

  sex mean_age mean_bmi
1   0 30.73236 24.00837
2   1 32.36203 24.95177

TABLE: people who have smoked at least once

# Subset data for individuals who smoked at least once
resultsm <- data[data$smoking == 1, ] %>%
  group_by(sex) %>%
  summarise(
    unique_id_count = n_distinct(id)
  )

# Print the summary statistics for individuals who smoked at least once
print(data.frame(resultsm))

   sex unique_id_count
1    0             412
2    1            1495
3 <NA>               1

Check if the data set is balanced

# Check if all individuals have the same time point
individuals_same_time <- tapply(data$time, data$id, function(x) length(unique(x)) == 1)

# Check if all entries are TRUE, indicating that all individuals have the same time point
all_same_time <- all(individuals_same_time)

# Print the result
if (all_same_time) {
  cat("All individuals have their BMI measured at the same time point.\n")
} else {
  cat("Not all individuals have their BMI measured at the same time point.\n")
}

Not all individuals have their BMI measured at the same time point.

Explore data per age group and sex

HISTOGRAM of initial age divided by gender.

# Create a histogram using ggplot2 for unique data
ggplot(uniquedf, aes(x = fage, fill = factor(sex))) +
  geom_histogram(binwidth = 5, alpha = 0.5, position = "stack") +
  
  # Add labels to the axes
  labs(x = "Age", y = "Frequency") +
  
  # Customize the fill colors based on the "sex" variable
  scale_fill_manual(values = c("0" = "red", "1" = "blue")) +
  
  # Add a title to the plot
  ggtitle("Histogram of Ages by Gender") + conf.plot

# Group the data into age groups and calculate summary statistics
# Summary per group and gender
summary_stats <- uniquedf %>%
  mutate(Age_Group = cut(fage, breaks = c(0, 19, 39, 59, Inf))) %>%
  group_by(Age_Group, sex) %>%
  summarise(
    Count = n(),
    mean_age = mean(fage),
    Mean_BMI = mean(bmi, na.rm = TRUE),
    Median_BMI = median(bmi, na.rm = TRUE),
    SD_BMI = sd(bmi, na.rm = TRUE),
    Min_BMI = min(bmi, na.rm = TRUE),
    Max_BMI = max(bmi, na.rm = TRUE)
  )
print("Summary per group and gender")

[1] "Summary per group and gender"

data.frame(summary_stats)

Age_Group	sex	Count	mean_age	Mean_BMI	Median_BMI	SD_BMI	Min_BMI	Max_BMI
(0,19]	0	98	17.9	22.2	21.9	3.46	16.2	36.7
(0,19]	1	209	17.9	22.6	22.2	3.43	14.3	38.4
(19,39]	0	779	28.3	23.8	23	4.33	17.1	60.6
(19,39]	1	2328	28.2	24.5	24.1	3.57	14.5	49.5
(39,59]	0	209	45	25.7	25.2	4.27	17.3	44.5
(39,59]	1	773	47.9	26.8	26.3	3.74	18.1	44.6
(59,Inf]	0	5	63.2	26.4	24.6	5.45	20.2	33.1
(59,Inf]	1	24	62.1	27.1	26.8	2.88	22.7	32.2

# Summary ONLY per group
summary_stats_overal <- uniquedf %>%
  mutate(Age_Group = cut(fage, breaks = c(0, 19, 39, 59, Inf))) %>%
  group_by(Age_Group) %>%
  summarise(
    Count = n(),
    mean_age = mean(fage),
    Mean_BMI = mean(bmi, na.rm = TRUE),
    Median_BMI = median(bmi, na.rm = TRUE),
    SD_BMI = sd(bmi, na.rm = TRUE),
    Min_BMI = min(bmi, na.rm = TRUE),
    Max_BMI = max(bmi, na.rm = TRUE)
  )
print("Summary per group")

[1] "Summary per group"

data.frame(summary_stats_overal)

Age_Group	Count	mean_age	Mean_BMI	Median_BMI	SD_BMI	Min_BMI	Max_BMI
(0,19]	307	17.9	22.5	22.1	3.44	14.3	38.4
(19,39]	3107	28.2	24.3	23.8	3.79	14.5	60.6
(39,59]	982	47.3	26.6	26.2	3.88	17.3	44.6
(59,Inf]	29	62.3	27	26.3	3.34	20.2	33.1

# Combine the two summary datasets
combined_summary <- rbind(summary_stats, summary_stats_overal)
combined_summary <- combined_summary[, c("Age_Group", "sex", "Mean_BMI")]

# Handle missing sex values
combined_summary$sex <- ifelse(is.na(combined_summary$sex), 3, combined_summary$sex)
combined_summary$sex <- as.factor(combined_summary$sex)

# Create a plot using ggplot2
ggplot(combined_summary, aes(x = Age_Group, y = Mean_BMI, color = factor(sex))) +
  geom_point() +
  geom_line(aes(group = factor(sex))) +
  labs(x = "Age Group", y = "Mean BMI (kg/m^2)") +
  scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "black"), labels = c("Female", "Male", "Overall")) +
  ggtitle("Mean BMI vs Age Group and Gender") + conf.plot

Mean Structure The main focus of this study revolved around the time and age variables. Nevertheless, because the data exhibited an unbalance, it was impractical to present the mean bmi at every time point. Consequently, a viable strategy was employed to segment the time and age variables into intervals and compute the average bmi within each group. The time interval was divided into 5 groups of 2 years intervals. 0-2 years with mid point 1, 2-4 years with mid point 1, 4-6 years with mid point 1 .

# Define your time intervals (modify as needed)
intervals <- c(0, 2, 4, 6, 8, 10)
class_ranges <- c(0, 19, 39, 59, Inf)
class_labels <- c("Youth",
                  "Early adult", 
                  "Middle adult", 
                  "Senior")

# Create a new variable 'TimeGroup' based on the intervals for the original dataset
data <- data %>%
  mutate(TimeGroup = cut(time, breaks = intervals, labels = FALSE, include.lowest = TRUE))

# Create a new variable 'age_class' based on the age ranges
data$age_class <- cut(data$fage, breaks = class_ranges, labels = class_labels, include.lowest = TRUE)

# Adjust the 'TimeGroup' variable
data$TimeGroup <- data$TimeGroup * 2

# Create a new variable 'TimeGroup' based on the intervals for the unique dataset
uniquedf <- uniquedf %>%
  mutate(TimeGroup = cut(time, breaks = intervals, labels = FALSE, include.lowest = TRUE))

# Create a new variable 'age_class' based on the age ranges for the unique dataset
uniquedf$age_class <- cut(uniquedf$fage, breaks = class_ranges, labels = class_labels, include.lowest = TRUE)

Mean Bmi vs Time intervals

  # Calculate summary statistics for Meanbmi and SEbmi
summary_data1 <- data %>%
  group_by(TimeGroup) %>%
  summarize(Meanbmi = mean(bmi), SEbmi = sd(bmi) / sqrt(n()))

# Merge summary_data1 back into the original data
summary_data1 <- merge(data, summary_data1)

# Calculate summary statistics for Meanbmi_sex and SEbmi_sex
summary_data2 <- data %>%
  group_by(sex, TimeGroup) %>%
  summarize(Meanbmi_sex = mean(bmi), SEbmi_sex = sd(bmi) / sqrt(n()))

# Merge summary_data2 back into the data
data <- merge(summary_data1, summary_data2, by = c("sex", "TimeGroup"), all = FALSE)

# Ggplot
ggplot(data, aes(x = TimeGroup, y = Meanbmi, group = age_class)) +
  geom_errorbar(aes(ymin = Meanbmi - SEbmi, ymax = Meanbmi + SEbmi), width = 0.2, color = "black") +
  geom_line(color = "black") +
  geom_point(size = 3, color = "black") +
  labs(x = "Time Intervals", y = "Mean BMI") +
  theme_minimal() +
  
  # Adding two lines for Meanbmi_sex by sex
  geom_line(data = subset(data, sex == 0), aes(x = TimeGroup, y = Meanbmi_sex), color = "red") +
  geom_line(data = subset(data, sex == 1), aes(x = TimeGroup, y = Meanbmi_sex), color = "blue") +
  geom_point(data = subset(data, sex == 0), aes(x = TimeGroup, y = Meanbmi_sex), color = "red", size = 3) +
  geom_point(data = subset(data, sex == 1), aes(x = TimeGroup, y = Meanbmi_sex), color = "blue", size = 3) +
  
  # Design of the plot
  labs(x = "Time Interval (years)", y = "Mean BMI (kg/m^2)") +
  ggtitle("Mean BMI vs Time Intervals") +
  scale_color_manual(values = c("Male" = "blue", "Female" = "red", "Overall" = "black")) +
  labs(color = "Sex") +
  conf.plot

Age.df<-data %>%
  group_by(age_class) %>%
  summarise(meanbmi_age = mean(bmi, na.rm = TRUE),
            sebmi_age = sd(bmi, na.rm = TRUE) / sqrt(n()))

 # Plot the results
  ggplot(data = Age.df, aes(x = age_class, y = meanbmi_age)) +
  geom_errorbar(aes(ymin = meanbmi_age - sebmi_age, ymax = meanbmi_age + sebmi_age), width = 0.3, color = "darkblue") +
  labs(title = "Mean BMI by AgeGroup with SE",
       x = "Age class",
       y = "Mean BMI (kg/m^2)")+
    conf.plot

Individual profiles

head(data)

TimeGroup	id	time	bmi	fage	smoking	age_class	Meanbmi	SEbmi	Meanbmi_sex	SEbmi_sex
10	4.21e+03	9	21.2	33	0	Early adult	26.2	0.149	25.1	0.358
10	3.92e+03	9	23.5	27		Early adult	26.2	0.149	25.1	0.358
10	4.84e+03	9	42.9	26	0	Early adult	26.2	0.149	25.1	0.358
10	2.07e+03	9	24.7	21	0	Early adult	26.2	0.149	25.1	0.358
10	4.94e+03	9	30.9	30	0	Early adult	26.2	0.149	25.1	0.358
10	4.59e+03	9	28.7	37	0	Early adult	26.2	0.149	25.1	0.358

set.seed(123)

# Randomly select 500 unique IDs (ADDITION: sample per group with more than )
df.random <- list()
for (i in unique(data$age_class)) {
  temp <- data[data$age_class == i & data$time > 6, ]
  smpl <-ifelse(length(unique(temp$id)) > 49, 50, length(unique(temp$id)))

# Randomly select 50
sampled_ids <- sample(unique(temp$id), smpl)
sampled_data <- data %>% filter(id %in% sampled_ids)
df.random <- rbind(df.random, sampled_data)
}

ggplot(df.random, aes(x = time, y = bmi, color = as.factor(id))) +
  geom_line() +
  labs(x = "Time (years)", y = "BMI (kg/m^2)") +
  ggtitle("Individual Profiles of BMI over Time by Age Class") +
  facet_wrap(~ age_class, ncol = 4, scales = "free_y") +
  theme(legend.position = "none")   # Hide the legend

VARIANCE STRUCTURE

# Create a ggplot object for BMI changes over time
gg <- ggplot(data, aes(x = TimeGroup, y = bmi, color = as.factor(sex))) +
  geom_smooth(method = "loess", span = 0.8, se = FALSE, color = "darkblue", size = 0.7) +  # Add Lowess trend line
  
  geom_point(stat = "summary", fun = mean, color = "red", size = 1) +  # Add points for mean sqr.res
  facet_grid(sex ~ age_class) +
  labs(x = "Time Interval (years)", y = "BMI (kg/m^2)") +
  ggtitle("BMI Changes Over Time by Gender and Age Group") + theme_bw()

# Display the plot
gg

# Define age classes  .
age_classes <- c("Youth", "Early adult", "Middle adult", "Senior")

# Create an empty list to store the plots
plot_list <- list()

# Initialize a counter
i <- 1

lab.y <- "Squared residuals"
lab.x <- "Time"

# Loop through age classes
for (sex in 0:1) {
  lab.y <- "Squared residuals"
  
  for (age in age_classes) {
    # Subset the data for the specific sex and age class
    temp <- data[data$sex == sex & data$age_class == age,]

    # Fit a Loess model to the data
    loess_fit <- loess(bmi ~ TimeGroup, temp, span = 0.8) #Predict of temp data
    loess_fit <- predict(loess_fit)
    
    sqrdres <- as.data.frame((temp[,"bmi"]- loess_fit)^2)
    colnames(sqrdres) <- "sqr.res"
    temp <- cbind(temp, loess_fit, sqrdres)
    
    # Create a ggplot for the residuals
g.plot <- ggplot(temp, aes(x = TimeGroup, y = sqr.res)) +
  geom_smooth(method = "loess", span = 0.8, se = FALSE, color = "black", size = 0.7) +
  geom_point(stat = "summary", fun = mean, color = "red", size = 1) +  # Add points for mean sqr.res
  ggtitle(paste0('Sex: ', sex, " Age Group: ", age)) +
  labs(x = lab.x, y = lab.y) +
  theme_bw()

    
    # Store the ggplot in the list
    plot_list[i] <- grob(g.plot)
    
    # Increment the counter
    i <- i + 1
    lab.y <-""
  }
}

gridExtra::grid.arrange(grobs = plot_list, nrow = 2)

*** MEAN BMI VS TIME and SQUARED RESIDUALES VS TIME no groupping***

    # Fit a Loess model to the data
    loess_fit <- loess(bmi ~ TimeGroup, data, span = 0.8) #Predict of temp data
    loess_fit <- predict(loess_fit)
    
    sqrdres <- as.data.frame((data[,"bmi"]- loess_fit)^2)
    colnames(sqrdres) <- "sqr.res"
    temp <- cbind(data, loess_fit, sqrdres)
    
    # Create a ggplot for the residuals
    g.plot <- ggplot(temp, aes(x = TimeGroup, y = sqr.res)) +
      geom_smooth(method = "loess", span = 0.7, se = FALSE, color = "black", size = 0.7) +
  geom_point(stat = "summary", fun = mean, color = "red", size = 2) +  # Add points for mean sqr.res
      theme_bw()
    g.plot