library(haven)
library(sas7bdat)
library(ggplot2)
library(dplyr)
library(pubh)
library(ggpubr)
library(data.table)
library(gridExtra)
library(grid)
library(vcd)
library(PASSED)
library(lme4)
library(nlme)
# Reads dataset
data <- read_sas("bmilda.sas7bdat")
# Changing the sex / smoking to factor variables
data[c("smoking", "sex")] <- lapply(data[c("smoking", "sex")], factor)
# Creating a new variable 'age' by adding 'fage' and 'time'
data$age <- data$fage + data$time
# Creating a new variable 'two_year_index' based on the 'time' variable
data <- data %>%
mutate(TimeGroup = case_when(
time %in% c(0, 1) ~ 0,
time %in% c(2, 3) ~ 1,
time %in% c(4, 5) ~ 2,
time %in% c(6, 7) ~ 3,
time %in% c(8, 9) ~ 4
))
# Grouping by 'id' and 'two_year_index' and calculating the mean of 'bmi'
data <- data %>% group_by(id, TimeGroup) %>%
mutate(bmi_mean = mean(bmi))
# Define your age class intervals
class_ranges <- c(0, 19, 39, 59, Inf)
class_labels <- c("Youth",
"Early adult",
"Middle adult",
"Senior")
# Create a new variable 'age_class' based on the age ranges
data$age_class <- cut(data$fage, breaks = class_ranges, labels = class_labels, include.lowest = TRUE)
# Display the first few rows of the dataset
head(as.data.frame(data))
| id | time | bmi | fage | sex | smoking | age | TimeGroup | bmi_mean | age_class |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 24.2 | 36 | 1 | 1 | 36 | 0 | 24.9 | Early adult |
| 1 | 1 | 25.5 | 36 | 1 | 1 | 37 | 0 | 24.9 | Early adult |
| 1 | 2 | 25.8 | 36 | 1 | 1 | 38 | 1 | 25.8 | Early adult |
| 1 | 3 | 25.8 | 36 | 1 | 1 | 39 | 1 | 25.8 | Early adult |
| 1 | 4 | 25.8 | 36 | 1 | 1 | 40 | 2 | 25.8 | Early adult |
| 1 | 5 | 25.8 | 36 | 1 | 1 | 41 | 2 | 25.8 | Early adult |
To model the individual subject specific profiles the following model was used
\[\begin{equation} BMI_{ij} = \beta_{1i} + \beta_{2i } t_{ij } + \beta_{3i} t^2_{ij} + \epsilon_{ij}, \quad j = 2,...,n_i, \end{equation}\]data <-data.frame(data)
data$TimeGroup2 <- data$TimeGroup^2
temp <- unique(data[data$TimeGroup > 0, "id"])
temp <- data %>% filter(id %in% temp) %>% as.data.frame()
#Removes NAs
missing_rows <- which(!complete.cases(temp))
temp<- temp[-missing_rows, ]
temp <- temp %>% group_by(id, TimeGroup) %>%
summarise(id = first(id),
bmi_mean = mean(bmi_mean),
TimeGroup = first(TimeGroup),
TimeGroup2 = first(TimeGroup2),
fage =first(fage),
sex = first(sex),
age_class = first(age_class))
res.list <- lmList(bmi_mean ~ TimeGroup| id, data = temp)
# Extract the slopes from the lmList object
slopes <- sapply(res.list, function(model) coef(model)[2])
intercepts <- sapply(res.list, function(model) coef(model)[1])
# Assuming TimeGroup is the second coefficient
names(slopes)<- as.numeric(gsub("[^0-9.]", "", names(slopes)))
# Create a data frame for plotting
stage1<- data.frame(id = as.numeric(names(slopes)), slope = slopes, intercept = intercepts)
# Merge with the original data to get additional information if needed
stage1 <- merge(stage1, temp, by = "id")
# Quadratic term
res.list.2 <- lmList(bmi_mean ~ TimeGroup + TimeGroup2| id, data = temp)
# Extract the slopes from the lmList object
slopes2 <- sapply(res.list.2, function(model) coef(model)[2])
intercepts2 <- sapply(res.list.2, function(model) coef(model)[1])
quadr <- sapply(res.list.2, function(model) coef(model)[3])
# Assuming TimeGroup is the second coefficient
names(slopes2)<- as.numeric(gsub("[^0-9.]", "", names(slopes2)))
# Create a data frame for plotting
stage1.2<- data.frame(id = as.numeric(names(slopes2)), slope = slopes2, intercept = intercepts2, quadr = as.numeric(quadr))
# Merge with the original data to get additional information if needed
stage1.2 <- merge(stage1.2, temp, by = "id")
set.seed(200)
# Randomly select 1 subject per age_class
selected_subjects <- temp[temp$TimeGroup>0,] %>%
group_by(age_class) %>%
sample_n(1) %>%
ungroup() %>%
pull(id)
plot_list <- list()
i<- 1
time.seq <- seq(0,4,length.out = 10)
titles <- stage1.2 %>% filter(id %in% selected_subjects) %>% as.data.frame()
titles <- unique(titles$age_class)
# predictions plot subject id = 1
for (index in selected_subjects){
tot.res <- stage1[stage1$id == index, "intercept"][1] + stage1[stage1$id == index, "slope"][1]*time.seq
quad.plot <- data.frame(x = time.seq, y = tot.res)
id.points <- stage1[stage1$id == index,c("TimeGroup", "bmi_mean")]
#index not exactly rows
gg <- ggplot(quad.plot, aes(x = x, y = y)) + geom_line() +
geom_point(data = id.points, aes(x = TimeGroup, y = bmi_mean),color = "red", size = 1)+theme_bw() + ggtitle(as.character(titles[i]))
# Store the ggplot in the list
plot_list[i] <- grob(gg)
i <- i + 1
}
gridExtra::grid.arrange(grobs = plot_list, nrow = 1)
set.seed(200)
plot_list <- list()
i<- 1
# predictions plot subject id = 1
for (index in selected_subjects){
tot.res <- stage1.2[stage1.2$id == index, "intercept"][1] +
stage1.2[stage1.2$id == index, "slope"][1]*time.seq +
stage1.2[stage1.2$id == index, "quadr"][1]*time.seq^2
quad.plot <- data.frame(x = time.seq, y = tot.res)
id.points <- stage1.2[stage1.2$id == index,c("TimeGroup", "bmi_mean")]
#index not exactly rows
gg <- ggplot(quad.plot, aes(x = x, y = y)) + geom_line() +
geom_point(data = id.points, aes(x = TimeGroup, y = bmi_mean),color = "red", size = 1) + ggtitle(as.character(titles[i]))
# Store the ggplot in the list
plot_list[i] <- grob(gg)
i <- i + 1
}
gridExtra::grid.arrange(grobs = plot_list, nrow = 1)
# Create the dummyvariable
stage1.2$Youth <- as.factor(as.numeric(stage1.2$age_class == "Youth"))
stage1.2$Early <- as.factor(as.numeric(stage1.2$age_class == "Early adult"))
stage1.2$Middle <- as.factor(as.numeric(stage1.2$age_class == "Middle adult"))
stage1.2$Senior <- as.factor(as.numeric(stage1.2$age_class == "Senior"))
stage2 <- stage1.2 %>%
group_by(id, TimeGroup) %>%
summarise(
bmi = first(bmi_mean),
slope = first(slope),
intercept = first(intercept),
quad = first(quadr),
sex = first(sex),
fage = first(fage),
age_class = first(age_class),
Youth = first(Youth),
Early = first(Early),
Middle = first(Middle),
Senior = first(Senior)
)
mlm1 <- lm(cbind(slope, intercept, quadr) ~ sex + Youth + Early + Middle + Senior, data = stage2 )
summary(mlm1)
Response slope :
Call:
lm(formula = slope ~ sex + Youth + Early + Middle + Senior, data = stage2)
Residuals:
Min 1Q Median 3Q Max
-16.416 -0.780 -0.090 0.644 47.550
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.44998 0.67320 0.668 0.504
sex1 0.03584 0.06845 0.524 0.601
Youth1 -0.10268 0.69985 -0.147 0.883
Early1 -0.04210 0.67198 -0.063 0.950
Middle1 -0.03668 0.67383 -0.054 0.957
Senior1 NA NA NA NA
Residual standard error: 2.013 on 4629 degrees of freedom
(3349 observations deleted due to missingness)
Multiple R-squared: 8.584e-05, Adjusted R-squared: -0.0007782
F-statistic: 0.09935 on 4 and 4629 DF, p-value: 0.9827
Response intercept :
Call:
lm(formula = intercept ~ sex + Youth + Early + Middle + Senior,
data = stage2)
Residuals:
Min 1Q Median 3Q Max
-61.967 -2.457 -0.412 2.095 25.545
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.5009 1.3511 18.134 <2e-16 ***
sex1 1.2106 0.1374 8.812 <2e-16 ***
Youth1 -1.5349 1.4046 -1.093 0.275
Early1 -0.9743 1.3486 -0.722 0.470
Middle1 0.7632 1.3523 0.564 0.573
Senior1 NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.041 on 4629 degrees of freedom
(3349 observations deleted due to missingness)
Multiple R-squared: 0.05199, Adjusted R-squared: 0.05117
F-statistic: 63.47 on 4 and 4629 DF, p-value: < 2.2e-16
Response quadr :
Call:
lm(formula = quadr ~ sex + Youth + Early + Middle + Senior, data = stage2)
Residuals:
Min 1Q Median 3Q Max
-7.7502 -0.2141 -0.0003 0.2057 5.5343
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.06414 0.20201 0.318 0.7509
sex1 -0.03730 0.02054 -1.816 0.0695 .
Youth1 -0.15787 0.21000 -0.752 0.4522
Early1 -0.05108 0.20164 -0.253 0.8000
Middle1 -0.05785 0.20220 -0.286 0.7748
Senior1 NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6041 on 4629 degrees of freedom
(3349 observations deleted due to missingness)
Multiple R-squared: 0.00138, Adjusted R-squared: 0.000517
F-statistic: 1.599 on 4 and 4629 DF, p-value: 0.1716
mlm2 <- lm(cbind(slope, intercept, quadr) ~ Youth + Early + Middle + Senior, data = stage2 )
summary(mlm2)
Response slope :
Call:
lm(formula = slope ~ Youth + Early + Middle + Senior, data = stage2)
Residuals:
Min 1Q Median 3Q Max
-16.443 -0.780 -0.087 0.647 47.557
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.47786 0.67104 0.712 0.476
Youth1 -0.10724 0.69974 -0.153 0.878
Early1 -0.04360 0.67192 -0.065 0.948
Middle1 -0.03584 0.67378 -0.053 0.958
Senior1 NA NA NA NA
Residual standard error: 2.013 on 4630 degrees of freedom
(3349 observations deleted due to missingness)
Multiple R-squared: 2.661e-05, Adjusted R-squared: -0.0006213
F-statistic: 0.04108 on 3 and 4630 DF, p-value: 0.9889
Response intercept :
Call:
lm(formula = intercept ~ Youth + Early + Middle + Senior, data = stage2)
Residuals:
Min 1Q Median 3Q Max
-61.726 -2.543 -0.364 2.195 25.865
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.4425 1.3579 18.736 <2e-16 ***
Youth1 -1.6890 1.4160 -1.193 0.233
Early1 -1.0250 1.3597 -0.754 0.451
Middle1 0.7916 1.3635 0.581 0.562
Senior1 NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.074 on 4630 degrees of freedom
(3349 observations deleted due to missingness)
Multiple R-squared: 0.03609, Adjusted R-squared: 0.03547
F-statistic: 57.78 on 3 and 4630 DF, p-value: < 2.2e-16
Response quadr :
Call:
lm(formula = quadr ~ Youth + Early + Middle + Senior, data = stage2)
Residuals:
Min 1Q Median 3Q Max
-7.7601 -0.2111 0.0038 0.2017 5.5617
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.03513 0.20142 0.174 0.862
Youth1 -0.15312 0.21004 -0.729 0.466
Early1 -0.04952 0.20169 -0.246 0.806
Middle1 -0.05872 0.20225 -0.290 0.772
Senior1 NA NA NA NA
Residual standard error: 0.6043 on 4630 degrees of freedom
(3349 observations deleted due to missingness)
Multiple R-squared: 0.0006688, Adjusted R-squared: 2.125e-05
F-statistic: 1.033 on 3 and 4630 DF, p-value: 0.3768