library(haven)
library(sas7bdat)
library(ggplot2)
library(dplyr)
library(pubh)
library(ggpubr)
library(data.table)
library(gridExtra)
library(grid)
library(vcd)
library(PASSED)
library(lme4)
library(nlme)

Working dataframe

# Reads dataset
data <- read_sas("bmilda.sas7bdat")

# Changing the sex / smoking to factor variables
data[c("smoking", "sex")] <- lapply(data[c("smoking", "sex")], factor)

# Creating a new variable 'age' by adding 'fage' and 'time'
data$age <- data$fage + data$time

# Creating a new variable 'two_year_index' based on the 'time' variable
data <- data %>%
  mutate(TimeGroup = case_when(
    time %in% c(0, 1) ~ 0,
    time %in% c(2, 3) ~ 1,
    time %in% c(4, 5) ~ 2,
    time %in% c(6, 7) ~ 3,
    time %in% c(8, 9) ~ 4
  ))

# Grouping by 'id' and 'two_year_index' and calculating the mean of 'bmi'
data <- data %>% group_by(id, TimeGroup) %>%
  mutate(bmi_mean = mean(bmi))

# Define your age class intervals
class_ranges <- c(0, 19, 39, 59, Inf)
class_labels <- c("Youth",
                  "Early adult", 
                  "Middle adult", 
                  "Senior")

# Create a new variable 'age_class' based on the age ranges
data$age_class <- cut(data$fage, breaks = class_ranges, labels = class_labels, include.lowest = TRUE)

# Display the first few rows of the dataset
head(as.data.frame(data))
idtimebmifagesexsmokingageTimeGroupbmi_meanage_class
1024.2361136024.9Early adult
1125.5361137024.9Early adult
1225.8361138125.8Early adult
1325.8361139125.8Early adult
1425.8361140225.8Early adult
1525.8361141225.8Early adult

Two Stage Approach

Stage 1

To model the individual subject specific profiles the following model was used

\[\begin{equation} BMI_{ij} = \beta_{1i} + \beta_{2i } t_{ij } + \beta_{3i} t^2_{ij} + \epsilon_{ij}, \quad j = 2,...,n_i, \end{equation}\]
data <-data.frame(data)
data$TimeGroup2 <- data$TimeGroup^2
temp <- unique(data[data$TimeGroup > 0, "id"])
temp <- data %>% filter(id %in% temp) %>% as.data.frame()

#Removes NAs
missing_rows <- which(!complete.cases(temp))
temp<- temp[-missing_rows, ]

temp <- temp %>% group_by(id, TimeGroup) %>%
     summarise(id = first(id),
               bmi_mean = mean(bmi_mean),
               TimeGroup = first(TimeGroup),
               TimeGroup2 = first(TimeGroup2),
               fage =first(fage),
               sex = first(sex),
               age_class = first(age_class))

res.list <- lmList(bmi_mean ~ TimeGroup| id, data = temp)

# Extract the slopes from the lmList object
slopes <- sapply(res.list, function(model) coef(model)[2]) 
intercepts <- sapply(res.list, function(model) coef(model)[1])

# Assuming TimeGroup is the second coefficient
names(slopes)<- as.numeric(gsub("[^0-9.]", "", names(slopes)))

# Create a data frame for plotting
stage1<- data.frame(id = as.numeric(names(slopes)), slope = slopes, intercept = intercepts)

# Merge with the original data to get additional information if needed
stage1 <- merge(stage1, temp, by = "id")
# Quadratic term
res.list.2 <- lmList(bmi_mean ~ TimeGroup + TimeGroup2| id, data = temp)

# Extract the slopes from the lmList object
slopes2 <- sapply(res.list.2, function(model) coef(model)[2]) 
intercepts2 <- sapply(res.list.2, function(model) coef(model)[1])
quadr <- sapply(res.list.2, function(model) coef(model)[3])

# Assuming TimeGroup is the second coefficient
names(slopes2)<- as.numeric(gsub("[^0-9.]", "", names(slopes2)))

# Create a data frame for plotting
stage1.2<- data.frame(id = as.numeric(names(slopes2)), slope = slopes2, intercept = intercepts2, quadr = as.numeric(quadr))

# Merge with the original data to get additional information if needed
stage1.2 <- merge(stage1.2, temp, by = "id")

Plot linear

set.seed(200)

# Randomly select 1 subject per age_class
selected_subjects <- temp[temp$TimeGroup>0,] %>%
  group_by(age_class) %>%
  sample_n(1) %>%
  ungroup() %>%
  pull(id)

plot_list <- list()
i<- 1
time.seq <- seq(0,4,length.out = 10)

 titles <- stage1.2 %>% filter(id %in% selected_subjects) %>% as.data.frame()
 titles <- unique(titles$age_class)

# predictions plot subject id = 1
for (index in selected_subjects){


tot.res <- stage1[stage1$id == index, "intercept"][1] +   stage1[stage1$id == index, "slope"][1]*time.seq 

quad.plot <- data.frame(x = time.seq, y = tot.res)
id.points <- stage1[stage1$id == index,c("TimeGroup", "bmi_mean")]
#index not exactly rows
gg <- ggplot(quad.plot, aes(x = x, y = y)) + geom_line() + 
  geom_point(data = id.points, aes(x = TimeGroup, y = bmi_mean),color = "red", size = 1)+theme_bw() + ggtitle(as.character(titles[i]))

    # Store the ggplot in the list
    plot_list[i] <- grob(gg)
    i <- i + 1

}

gridExtra::grid.arrange(grobs = plot_list, nrow = 1) 

Plot quadratic

set.seed(200)

plot_list <- list()
i<- 1

# predictions plot subject id = 1
for (index in selected_subjects){

tot.res <- stage1.2[stage1.2$id == index, "intercept"][1] + 
  stage1.2[stage1.2$id == index, "slope"][1]*time.seq + 
  stage1.2[stage1.2$id == index, "quadr"][1]*time.seq^2
quad.plot <- data.frame(x = time.seq, y = tot.res)
id.points <- stage1.2[stage1.2$id == index,c("TimeGroup", "bmi_mean")]
#index not exactly rows
gg <- ggplot(quad.plot, aes(x = x, y = y)) + geom_line() + 
  geom_point(data = id.points, aes(x = TimeGroup, y = bmi_mean),color = "red", size = 1)  + ggtitle(as.character(titles[i]))

    # Store the ggplot in the list
    plot_list[i] <- grob(gg)
    i <- i + 1

}

gridExtra::grid.arrange(grobs = plot_list, nrow = 1) 

Stage 2

\[\begin{equation} \beta_{1i} = \beta_1Gender + \beta_2Youth + \beta_3Early + \beta_4Middle + \beta_5{Senior} \\ \beta_{2i} = \beta_6Gender + \beta_7Youth + \beta_8Early + \beta_9Middle + \beta_{10}{Senior} \end{equation}\]
# Create the dummyvariable
stage1.2$Youth <- as.factor(as.numeric(stage1.2$age_class == "Youth"))
stage1.2$Early <- as.factor(as.numeric(stage1.2$age_class == "Early adult"))
stage1.2$Middle <- as.factor(as.numeric(stage1.2$age_class == "Middle adult"))
stage1.2$Senior <- as.factor(as.numeric(stage1.2$age_class == "Senior"))
Stage 2 model including gender
stage2 <- stage1.2 %>%
  group_by(id, TimeGroup) %>%
  summarise(
    bmi = first(bmi_mean),
    slope = first(slope),
    intercept = first(intercept),
    quad = first(quadr),
    sex = first(sex),         
    fage = first(fage),       
    age_class = first(age_class),
    Youth = first(Youth),
    Early = first(Early),
    Middle = first(Middle),
    Senior = first(Senior)
  )

mlm1 <- lm(cbind(slope, intercept, quadr) ~ sex + Youth + Early + Middle + Senior, data = stage2 )
summary(mlm1)
Response slope :

Call:
lm(formula = slope ~ sex + Youth + Early + Middle + Senior, data = stage2)

Residuals:
    Min      1Q  Median      3Q     Max 
-16.416  -0.780  -0.090   0.644  47.550 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.44998    0.67320   0.668    0.504
sex1         0.03584    0.06845   0.524    0.601
Youth1      -0.10268    0.69985  -0.147    0.883
Early1      -0.04210    0.67198  -0.063    0.950
Middle1     -0.03668    0.67383  -0.054    0.957
Senior1           NA         NA      NA       NA

Residual standard error: 2.013 on 4629 degrees of freedom
  (3349 observations deleted due to missingness)
Multiple R-squared:  8.584e-05, Adjusted R-squared:  -0.0007782 
F-statistic: 0.09935 on 4 and 4629 DF,  p-value: 0.9827


Response intercept :

Call:
lm(formula = intercept ~ sex + Youth + Early + Middle + Senior, 
    data = stage2)

Residuals:
    Min      1Q  Median      3Q     Max 
-61.967  -2.457  -0.412   2.095  25.545 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  24.5009     1.3511  18.134   <2e-16 ***
sex1          1.2106     0.1374   8.812   <2e-16 ***
Youth1       -1.5349     1.4046  -1.093    0.275    
Early1       -0.9743     1.3486  -0.722    0.470    
Middle1       0.7632     1.3523   0.564    0.573    
Senior1           NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.041 on 4629 degrees of freedom
  (3349 observations deleted due to missingness)
Multiple R-squared:  0.05199,   Adjusted R-squared:  0.05117 
F-statistic: 63.47 on 4 and 4629 DF,  p-value: < 2.2e-16


Response quadr :

Call:
lm(formula = quadr ~ sex + Youth + Early + Middle + Senior, data = stage2)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.7502 -0.2141 -0.0003  0.2057  5.5343 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)  0.06414    0.20201   0.318   0.7509  
sex1        -0.03730    0.02054  -1.816   0.0695 .
Youth1      -0.15787    0.21000  -0.752   0.4522  
Early1      -0.05108    0.20164  -0.253   0.8000  
Middle1     -0.05785    0.20220  -0.286   0.7748  
Senior1           NA         NA      NA       NA  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6041 on 4629 degrees of freedom
  (3349 observations deleted due to missingness)
Multiple R-squared:  0.00138,   Adjusted R-squared:  0.000517 
F-statistic: 1.599 on 4 and 4629 DF,  p-value: 0.1716
Stage 2 model not inlcuding gender
mlm2 <- lm(cbind(slope, intercept, quadr) ~ Youth + Early + Middle + Senior, data = stage2 )
summary(mlm2)
Response slope :

Call:
lm(formula = slope ~ Youth + Early + Middle + Senior, data = stage2)

Residuals:
    Min      1Q  Median      3Q     Max 
-16.443  -0.780  -0.087   0.647  47.557 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.47786    0.67104   0.712    0.476
Youth1      -0.10724    0.69974  -0.153    0.878
Early1      -0.04360    0.67192  -0.065    0.948
Middle1     -0.03584    0.67378  -0.053    0.958
Senior1           NA         NA      NA       NA

Residual standard error: 2.013 on 4630 degrees of freedom
  (3349 observations deleted due to missingness)
Multiple R-squared:  2.661e-05, Adjusted R-squared:  -0.0006213 
F-statistic: 0.04108 on 3 and 4630 DF,  p-value: 0.9889


Response intercept :

Call:
lm(formula = intercept ~ Youth + Early + Middle + Senior, data = stage2)

Residuals:
    Min      1Q  Median      3Q     Max 
-61.726  -2.543  -0.364   2.195  25.865 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  25.4425     1.3579  18.736   <2e-16 ***
Youth1       -1.6890     1.4160  -1.193    0.233    
Early1       -1.0250     1.3597  -0.754    0.451    
Middle1       0.7916     1.3635   0.581    0.562    
Senior1           NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.074 on 4630 degrees of freedom
  (3349 observations deleted due to missingness)
Multiple R-squared:  0.03609,   Adjusted R-squared:  0.03547 
F-statistic: 57.78 on 3 and 4630 DF,  p-value: < 2.2e-16


Response quadr :

Call:
lm(formula = quadr ~ Youth + Early + Middle + Senior, data = stage2)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.7601 -0.2111  0.0038  0.2017  5.5617 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.03513    0.20142   0.174    0.862
Youth1      -0.15312    0.21004  -0.729    0.466
Early1      -0.04952    0.20169  -0.246    0.806
Middle1     -0.05872    0.20225  -0.290    0.772
Senior1           NA         NA      NA       NA

Residual standard error: 0.6043 on 4630 degrees of freedom
  (3349 observations deleted due to missingness)
Multiple R-squared:  0.0006688, Adjusted R-squared:  2.125e-05 
F-statistic: 1.033 on 3 and 4630 DF,  p-value: 0.3768