BIS628 HW2 National Cooperative Gallstone Study (NCGS)

knitr

knitr::opts_chunk$set(echo = TRUE)

on your mark, get set

rm(list=ls())
graphics.off()
closeAllConnections()

a few of my favorite packages

library(tidyverse)
library(xfun)
library(tidyr)
library(nlme)

director fury

setwd(
  "~/Yale University/2semester/BIS628 Longitudinal Multilevel Analysis/HW/2")

the import-export business

cholesterol.data <- read.table(
  "~/Yale University/2semester/BIS628 Longitudinal Multilevel Analysis/HW/2/cholesterol-data.txt", 
  quote="\"", comment.char="")
names(cholesterol.data) <- c("Group", "ID", "Y0", "Y6", "Y12", "Y18", "Y24")
head(cholesterol.data)

as numbers

cholesterol.data$Y0 <- as.numeric(cholesterol.data$Y0)
cholesterol.data$Y6 <- as.numeric(cholesterol.data$Y6)
cholesterol.data$Y12 <- as.numeric(cholesterol.data$Y12)

## Warning: NAs introduced by coercion

cholesterol.data$Y18 <- as.numeric(cholesterol.data$Y18)

## Warning: NAs introduced by coercion

cholesterol.data$Y24 <- as.numeric(cholesterol.data$Y24)

## Warning: NAs introduced by coercion

assigning groups

cholesterol.data$Group <- as.factor(cholesterol.data$Group)
levels(cholesterol.data$Group)[2] <- c("High-dose Treatment")
levels(cholesterol.data$Group)[1] <- c("Placebo")
levels(cholesterol.data$Group)

## [1] "Placebo"             "High-dose Treatment"

Question #1

1.1 Read ‘cholesterol-data.txt’ file into SAS or R (depending upon your preference) and put the data in a ‘long’ format.

long_chol.d <- cholesterol.data %>%
  pivot_longer(cols = starts_with("Y"), 
               names_to = "Time", 
               values_to = "Serum Cholesterol")
cholesterol.data <- long_chol.d
rm(long_chol.d)

time as a number

time_number <- substr(cholesterol.data$Time, 2, nchar(cholesterol.data$Time))
time_number <- as.numeric(time_number)
cbind(cholesterol.data, time_number)

cholesterol.data$Time_number <- time_number
rm(time_number)

1.2 Create a summary table of missingness in the outcome over time by group. Describe what you observe.

#number of na's 
na_counts <- cholesterol.data %>%
  group_by(Group, Time_number) %>%
  summarise(NA_Count = sum(is.na(`Serum Cholesterol`)))

## `summarise()` has grouped output by 'Group'. You can override using the
## `.groups` argument.

print(na_counts)

## # A tibble: 10 × 3
## # Groups:   Group [2]
##    Group               Time_number NA_Count
##    <fct>                     <dbl>    <int>
##  1 Placebo                       0        0
##  2 Placebo                       6        0
##  3 Placebo                      12        3
##  4 Placebo                      18        6
##  5 Placebo                      24       10
##  6 High-dose Treatment           0        0
##  7 High-dose Treatment           6        0
##  8 High-dose Treatment          12        7
##  9 High-dose Treatment          18       18
## 10 High-dose Treatment          24       24

#switch me
na_c_w <- na_counts %>%
  pivot_wider(id_cols = "Group",
              names_from = "Time_number",
              values_from = "NA_Count"
              )
print(na_c_w)

## # A tibble: 2 × 6
## # Groups:   Group [2]
##   Group                 `0`   `6`  `12`  `18`  `24`
##   <fct>               <int> <int> <int> <int> <int>
## 1 Placebo                 0     0     3     6    10
## 2 High-dose Treatment     0     0     7    18    24

#number of responses
followup <- cholesterol.data %>%
  group_by(Group, Time_number) %>%
  summarise(responses = sum(!is.na(`Serum Cholesterol`))) %>%
  arrange(Time_number)

## `summarise()` has grouped output by 'Group'. You can override using the
## `.groups` argument.

print(followup)

## # A tibble: 10 × 3
## # Groups:   Group [2]
##    Group               Time_number responses
##    <fct>                     <dbl>     <int>
##  1 Placebo                       0        41
##  2 High-dose Treatment           0        62
##  3 Placebo                       6        41
##  4 High-dose Treatment           6        62
##  5 Placebo                      12        38
##  6 High-dose Treatment          12        55
##  7 Placebo                      18        35
##  8 High-dose Treatment          18        44
##  9 Placebo                      24        31
## 10 High-dose Treatment          24        38

#switch me
followup <- followup %>%
  pivot_wider(id_cols = "Group",
              names_from = "Time_number",
              values_from = "responses"
              )
print(followup)

## # A tibble: 2 × 6
## # Groups:   Group [2]
##   Group                 `0`   `6`  `12`  `18`  `24`
##   <fct>               <int> <int> <int> <int> <int>
## 1 Placebo                41    41    38    35    31
## 2 High-dose Treatment    62    62    55    44    38

#combined for a ratio of na's
na_ratio <- merge(na_c_w, followup, all = TRUE, sort = TRUE)
print(na_ratio)

##                 Group  0  6 12 18 24
## 1             Placebo  0  0  3  6 10
## 2             Placebo 41 41 38 35 31
## 3 High-dose Treatment  0  0  7 18 24
## 4 High-dose Treatment 62 62 55 44 38

#ratio
missingness <- na_ratio
for (i in 2:6) {
  missingness[1, i] <- na_ratio[1, i] / na_ratio[2, i]
  missingness[3, i] <- na_ratio[3, i] / na_ratio[4, i]
}
missingness <- missingness[c(1, 3), ]

the table below shows the ratio of missing values to observed follow-ups increases over time for both groups, but starts at a higher baseline for the treatment group.

print(missingness)

##                 Group 0 6         12        18        24
## 1             Placebo 0 0 0.07894737 0.1714286 0.3225806
## 3 High-dose Treatment 0 0 0.12727273 0.4090909 0.6315789

1.3 In a Single Figure, create a panel (2 rows by 3 columns) of 5 plots (one for each time point) for distribution of cholesterol at each follow up timepoint by group (High-dose versus Placebo) overlaid with empirical probability curves. Hint: using the ggplot code in R Lab#1 which produced histograms by group with overlaid empirical probability curves and ADD to the code + facet_wrap(~FU_Time) where FU_Time is the follow up time “0 months”, “6 months”, etc.:

Comment on the Normality of the observed cholesterol data at each follow up time point and explain how you might justify using a model for the mean response that relies on the assumption of MVN for Yi of cholesterol.

basic.hist <- ggplot(data = cholesterol.data, aes(x = `Serum Cholesterol`))
basic.hist + 
  geom_histogram(color = "black", 
                 fill = "lightblue") +
  labs(title = "Distribution of Serum Cholesterol",
       x = "Serum Cholesterol Measurments (mg/dL)",
       y = "Count of Observations")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).

with a normal distribution

basic.hist + 
  geom_histogram(aes(y = ..density..),  
                 color = "black", 
                 fill = "lightblue", 
                 binwidth = density(cholesterol.data$`Serum Cholesterol`, na.rm = TRUE)$bw) + 
  labs(title = "Distribution of Serum Cholesterol",
       x = "Serum Cholesterol",
       y = "Density of Observations") +
  stat_function(color = "red", size = .8, fun = dnorm, 
                args = list(mean = mean(cholesterol.data$`Serum Cholesterol`, na.rm = TRUE), 
                            sd = sd(cholesterol.data$`Serum Cholesterol`, na.rm = TRUE)))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).

layered_hist <- ggplot(data = cholesterol.data, aes(x = `Serum Cholesterol`, fill = Group))
layered_hist + 
  geom_histogram(color = "black", 
                 alpha = 0.5, 
                 position="identity") + geom_density(linetype = "dashed") +
  labs(title = "Distribution of Serum Cholesterol by Group", 
       x = "Serum Cholesterol (mg/dL)", fill="Group", y = "Count of Observations") + facet_wrap(~ Time_number)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_density()`).

layered_hist <- ggplot(data = cholesterol.data, aes(x = `Serum Cholesterol`, fill = Group))
layered_hist + 
  geom_histogram(color = "black", 
                 alpha = 0.5) + geom_density(linetype = "dashed") +
  labs(title = "Distribution of Serum Cholesterol by Group", 
       x = "Serum Cholesterol (mg/dL)", fill="Group", y = "Count of Observations")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_density()`).

p2 <- ggplot(data = cholesterol.data, aes(x = `Serum Cholesterol`, na.rm = TRUE, color = Group))

p2 + 
  geom_histogram(aes(y = ..density.., fill = Group), , alpha = 0.2,
                 binwidth = density(cholesterol.data$`Serum Cholesterol`, na.rm = TRUE)$bw) +
  geom_density(linetype = "dashed") + 
  labs(title = "Distribution of Serum Cholesterol",
       x = "Serum Cholesterol",
       y = "Density") + facet_wrap(~ Time_number)

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_density()`).

we’re plotting the distribution of the response variables yi at each of the time points, and using the property of each of the distributions at each of the time points to make a determination on the normality of the variances. and that variance is the random error. so if the response variable distribution looks normal at each time point, then we can use that to make a determination on the normality of random errors at each time point. this is the MVN.

Question #2

2.1 Variogram g is defined as \[g(u)=E[1/2{Y(t) – Y(t-u)}^2]\] with t > u, where Y(t) is a longitudinal response measured at time t, u is the lapsed time between the two observations within a same subject at time t and t-u. Using slides 7-8 of Lecture 3, where g(u) was defined:

Assuming that the residual in Y(t) is defined by the second-order stationary process (C(u)=0) with a negligible measurement error \(\tau^2 =0\), show that \(g(u)=\sigma^2[1-\rho(u)]\) where \(\sigma^2\) is the variance of \(Y(t)\) and \(𝝆(u)\) is the correlation coefficient between \(Y(t)\) and \(Y(t-u)\). Please show all steps in your expansion of the Expectation. (Hint: Use the definition of the second-order stationary process to be able to add some terms in the expansion together and set some terms in the expansion to 0; but you need to say which properties of the stationary process you used.)

Explain in words and show that the correlation coefficient, \(𝝆(u)\) between \(Y(t)\) and \(Y(t-u)\) only depends on lapsed time or lag ‘u’ but not the actual time t.

the covariance structure of the variogram is a stochastic random process that’s second-order stationary, meaning the covariance (and therefore correlation) only depends on distance, not time. See this is true, as when we plug in for \(Y(t)=0\) the only thing we’re left with is \(Y(u)\) , which stands for the lag \(u\)

#light data work handling na's
cholesterol.data_c <- cholesterol.data[is.na(cholesterol.data$`Serum Cholesterol`)==F,]
#light data work handling names
cholesterol.data_c <- rename(cholesterol.data_c, S_Cholesterol = `Serum Cholesterol`)
#light data work handling time
#time as a factor t_as_factor
cholesterol.data_c$t_as_factor <- as.factor(cholesterol.data_c$Time) %>% as.numeric() %>% as.factor()
#time as a number (1:5) t_enumerated
cholesterol.data_c$t_enumerated <- as.factor(cholesterol.data_c$Time_number) %>% as.numeric()

2.2 Back to our HW dataset, using the “maximal” model for the mean response, plot sample or empirical variogram for the dataset:

Which ‘maximal’ model for the mean did you chose? Why? Hint: consider your study design features.

###Sample variogram: use lmensssp package in R; OR use "Empirical or Sample Variogram Function.r" provided on canvas for Lab 3
#install.packages("lmenssp")
#library(lmenssp)
source("Empirical or Sample Variogram Function.r")
#Using OLS, fit a piece-wise linear model with independent covariance structure
#cholesterol.data_c <- cholesterol.data[is.na(cholesterol.data$`Serum Cholesterol`)==F,] #only select rows where na is false



# Remove rows with NA in 'Serum Cholesterol', 'Group', or 'Time_number'
#cholesterol.data_c_clean <- 
  #cholesterol.data_c[complete.cases(cholesterol.data_c$`Serum Cholesterol`,
                                    #cholesterol.data_c$Group,
                                    #cholesterol.data_c$Time_number), ]
#row goals is 447, after cleaning

lm_variogram <- lm(S_Cholesterol ~ Group * Time_number, data = cholesterol.data_c)

#Plot sample variogram
#Note: In literature "semivariogram" is sometimes used interchangeably with "variogram", but they actually mean different things 
variogram(resid = resid(lm_variogram),
          timeVar = cholesterol.data_c$Time_number,
          id = cholesterol.data_c$ID,
          irregular = FALSE) #this line of code refers to whether the study design time is balanced or not

## $lags
## [1]  6 12 18 24
## 
## $means
##        6       12       18       24 
## 529.6383 542.3154 646.0807 935.8719 
## 
## $sizes
##   6  12  18  24 
## 342 239 148  69 
## 
## $process.var
## [1] 1996.968

abline(h=529.6383, col="red", lty=2) #around the smallest value in $means in Console Output
abline(h=937.9797, col="blue", lty=2) #around the largest value in $means in Console Output

for my maximal model, I chose a linear regression where \(Y_i\) is a function of an interaction between \(Group x Time_number\), which accounts for all possible beta coefficients in the regression

2.2.ii Describe the different sources of variability in the response variable using the plot of the empirical variogram (see slide 8 in Lecture 3).

we have the variance of the random effect, which in this model we say is \(C(u)=0\), then the variance of the serial process which we described above, and the measurement error variance \(\tau^2\) is also negligible, as per stated above in the question.

Question #3

Consider the following covariance structures (Ri): Independent, Unstructured, Compound Symmetry, Autoregressive(AR1), Toeplitz, and Heterogeneous Compound Symmetry:

3.1 Write out \(R_i\) for each covariance model using the study’s time points and identify all nested covariance models by showing what constraints you can apply to an Ri to get nested models and identify All nested models. (For example, what constraints would you apply to the Unstructured covariance to get to the Compound Symmetry covariance?)

Summary of Constraints and Nested Models:
Unstructured (UN): No constraints, all variances and covariances freely estimated

Compound Symmetry (CS): Variances are equal across time points, covariances are equal across time points. Constraint: σ12=σ22=⋯=σn2σ12=σ22=⋯=σn2 and ρij=θρij=θ for all pairs.

Heterogeneous Compound Symmetry (CSH): Covariances are equal, but variances are allowed to differ across time points. Constraint: σ12≠σ22≠⋯≠σn2σ12=σ22=⋯=σn2, and ρij=θρij=θ for all pairs.

Autoregressive (AR1): Variances are equal, covariances depend on the lag. Constraint: ρij=θ×ρ∣i−j∣ρij=θ×ρ∣i−j∣.

Toeplitz (TOEP): Variances are equal, but covariances depend on the lag and can vary for each lag. Constraint: ρij=θ∣i−j∣ρij=θ∣i−j∣ for each lag ∣i−j∣∣i

These models can be interpreted in a hierarchy, with Unstructured being the most flexible and other models being more constrained versions.

3.2 Using the “maximal” model for the mean response, fit (using R or SAS) the following models for the covariance: Independent, Unstructured, Compound Symmetry, Autoregressive(AR1), Toeplitz, and Heterogeneous Compound Symmetry. (For each model, simply show your code – no output, not yet.)

Independent Covariance

### Fit piece-wise linear models assuming different covariance patterns- use REML, which is the default, NOT method='ML'!!!
#Independent covariance, i.e. all rows of data are considered independent and identically distributed
#this is the maximal model
lm_indi <- gls(S_Cholesterol ~ Group * t_as_factor, data = cholesterol.data_c)
#Get estimates of the fixed effects and correlation matrix
summary(lm_indi)

Unstructured Covariance

##suppress the summary statements
#mean response for maximal model
#time as a factor t_as_factor
#time as a number (1:5) t_enumerated
#factor time var goes for variances, second line

#Unstructured covariance
lm_unstructured <- gls(S_Cholesterol ~ Group * t_as_factor, data = cholesterol.data_c, 
          corr = corSymm(form = ~ t_enumerated| ID), 
          weights = varIdent(form = ~ 1 | t_as_factor))
#also factor #ends up as variances. diff variances at each time point, where cat time goes. c_time goes for weights. also goes for mean response (always, otherwise models as constant slope. only gives one parameter for tiem)
summary(lm_unstructured)
#Get variance covariance matrix via built-in R function:
getVarCov(lm_unstructured)
#conjunction junction
corandcov2 <- function(glsob,cov=T,...){
  corm <- corMatrix(glsob$modelStruct$corStruct)[[1]]
  corv<-print(getVarCov(glsob))
  return(corm)
}
#Get variance covariance matrix via 'corandcov2' function above:
corandcov2(lm_unstructured)
#week number, then week n as factor, then time points enumerated

Compound Symmetry Covariance

#Compound symmetry covariance
lm_comp_sym_cov <- gls(S_Cholesterol ~ Group * t_as_factor, data = cholesterol.data_c, 
          corr = corCompSymm(form = ~ t_enumerated | ID))
summary(lm_comp_sym_cov)
getVarCov(lm_comp_sym_cov)
#Get variance covariance matrix via 'corandcov' function above:
corandcov2(lm_comp_sym_cov)

Autoregressive (AR1) Covariance

#First-order autoregressive covariance
lm_auto_cov <- gls(S_Cholesterol ~ Group * t_as_factor, data = cholesterol.data_c, 
          corr = corAR1(form = ~ t_enumerated | ID))
summary(lm_auto_cov)
getVarCov(lm_auto_cov)
#Get variance covariance matrix via 'corandcov' function above:
corandcov2(lm_auto_cov)

Toeplitz

#Toeplitz covariance
#Using ARMA structure in corARMA: q is the value of the moving average, which is q=zero
#Using ARMA structure in corARMA: p is the number of successive residuals after the first one, which is p=n-1 (dimension of Covariance matrix minus 1); since the maximum n which is the number of time points within a person is 6 in this dataset, then p=6-1=5
lm_toeplitz <- gls(S_Cholesterol ~ Group * t_as_factor, data = cholesterol.data_c,
                   corr = corARMA(form = ~ t_enumerated | ID, p=3, q=0))
summary(lm_toeplitz)
#Note 1: if you type 'summary(m5) into Console to the right directly, you can get phi estimates
#Note 2: Read "Extracting Autocorrelations.pdf" on Canvas to see how to get from phi estimates to rho (correlation) estimates
vector.phi <-corARMA(
  value=c(0.3227926, 0.3449167, 0.2128423),
  form = ~Time, 
  p = 3, 
  q = 0,
  fixed = T )#5 lags
vector.phi

getVarCov(lm_toeplitz)
#conjunction junction function
corandcov2 <- function(glsob,cov=T,...){
  corm <- corMatrix(glsob$modelStruct$corStruct)[[1]]
  corv<-print(getVarCov(glsob))
  return(corm)
}
#Get variance covariance matrix via 'corandcov' function above:
corandcov2(lm_toeplitz)

Heterogeneous Compound Symmetry

#Heterogeneous compound symmetry covariance
lm_het_com_sym <- gls(S_Cholesterol ~ Group * t_as_factor, data = cholesterol.data_c, 
          corr = corCompSymm(form = ~ t_enumerated | ID),
          weights = varIdent(form = ~ 1 | t_as_factor))
summary(lm_het_com_sym)
getVarCov(lm_het_com_sym)
#Get variance covariance matrix via 'corandcov' function above:
corandcov2 <- function(glsob,cov=T,...){
  corm <- corMatrix(glsob$modelStruct$corStruct)[[1]]
  corv<-print(getVarCov(glsob))
  return(corm)
}
corandcov2(lm_het_com_sym)
##couldn't find function corandcov2

#Compare corMatrix in M3 (CS) to CorMatrix in M7 (CS with Non-Constant Variance)
corMatrix(lm_het_com_sym$modelStruct$corStruct)[[1]]

3.3.i Without conducting any statistical tests, are all covariance models being considered applicable to the design of the study? State which one(s) are not applicable.

no, not all models are applicable for this study. we’re working with a balanced design here, so that throws away exponential. we can throw away the IND model, (observations within a subject might be correlated). unstructured could work, it just vastly increases the number of parameters estimated. toss out compound symmetry since it assumes correlation is the same regardless of the distance between time points. the AR(1) model works with balanced designs (which we have) since for any two timepoints, \(i,j\) correlation exponent is \(|i-j|\). Toeplitz works in a similar way. Heterogeneous compound symmetry works here because there’s different variances and constant correlation between the time points.

3.3.ii After you have eliminated some covariance(s) from consideration due to study design: when choosing the best covariance based on statistical hypothesis testing, show your work for selecting the best covariance for nested models (Hint: use LRT or AIC?)

I chose the heterogeneous compound symmetry model since it has both the lowest AIC and highest log likelihood out of all the models.

3.3.iii After you have eliminated some covariance(s) from consideration due to study design: when choosing the best covariance based on statistical hypothesis testing, show your work for selecting the best covariance for NOT nested models. (Hint: use LRT or AIC?)

the autoregressive (AR1) covariance model has the highest AIC and smallest log likelihood of all the models we didn’t throw away due to study design.

3.3.iv Present your conclusion for the best covariance model.

therefore, the best covariance matrix to pick here is the heterogeneous compound symmetry model

Question #4

After you have chosen the ‘best’ covariance model in Question 3, it’s time to get back to the model for the mean of the outcome and answer this research question: is there a differential effect of group on the change in mean levels of cholesterol across the follow up time?

4.1 Using R or SAS, choose between two models for the mean response: (i) mean response profile (MRP) with group, follow up time and their interaction (in that order), and (ii) non-constant slope of change in mean cholesterol over time with group, follow up time (polynomial with a linear and a quadratic slope), and their statistical interaction (in that order). For both models use the best model for covariance you choose in Question 3.

lm_het_com_sym <- gls(S_Cholesterol ~ Group * t_as_factor, data = cholesterol.data_c, method = "ML", 
          corr = corCompSymm(form = ~ t_enumerated | ID),
          weights = varIdent(form = ~ 1 | t_as_factor))
summary(lm_het_com_sym)

## Generalized least squares fit by maximum likelihood
##   Model: S_Cholesterol ~ Group * t_as_factor 
##   Data: cholesterol.data_c 
##        AIC      BIC    logLik
##   4362.003 4427.644 -2165.001
## 
## Correlation Structure: Compound symmetry
##  Formula: ~t_enumerated | ID 
##  Parameter estimate(s):
##       Rho 
## 0.7224242 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | t_as_factor 
##  Parameter estimates:
##         1         5         2         3         4 
## 1.0000000 0.9106011 0.8726776 0.8956395 1.0744466 
## 
## Coefficients:
##                                           Value Std.Error  t-value p-value
## (Intercept)                           235.92683  7.332592 32.17509  0.0000
## GroupHigh-dose Treatment               -9.91070  9.451055 -1.04863  0.2949
## t_as_factor2                            8.80301  5.309768  1.65789  0.0981
## t_as_factor3                           23.26903  5.485912  4.24160  0.0000
## t_as_factor4                           20.65028  6.289838  3.28312  0.0011
## t_as_factor5                            7.24390  5.254526  1.37860  0.1687
## GroupHigh-dose Treatment:t_as_factor2  16.50528  6.878002  2.39972  0.0168
## GroupHigh-dose Treatment:t_as_factor3   4.88838  7.241304  0.67507  0.5000
## GroupHigh-dose Treatment:t_as_factor4   7.69924  8.380344  0.91873  0.3587
## GroupHigh-dose Treatment:t_as_factor5  12.27223  6.772614  1.81204  0.0707
## 
##  Correlation: 
##                                       (Intr) GrpH-T t_s_f2 t_s_f3 t_s_f4 t_s_f5
## GroupHigh-dose Treatment              -0.776                                   
## t_as_factor2                          -0.510  0.396                            
## t_as_factor3                          -0.472  0.366  0.543                     
## t_as_factor4                          -0.261  0.202  0.450  0.449              
## t_as_factor5                          -0.477  0.370  0.551  0.530  0.444       
## GroupHigh-dose Treatment:t_as_factor2  0.394 -0.508 -0.772 -0.419 -0.347 -0.425
## GroupHigh-dose Treatment:t_as_factor3  0.357 -0.461 -0.412 -0.758 -0.340 -0.402
## GroupHigh-dose Treatment:t_as_factor4  0.196 -0.252 -0.338 -0.337 -0.751 -0.333
## GroupHigh-dose Treatment:t_as_factor5  0.370 -0.477 -0.427 -0.411 -0.344 -0.776
##                                       GH-T:__2 GH-T:__3 GH-T:__4
## GroupHigh-dose Treatment                                        
## t_as_factor2                                                    
## t_as_factor3                                                    
## t_as_factor4                                                    
## t_as_factor5                                                    
## GroupHigh-dose Treatment:t_as_factor2                           
## GroupHigh-dose Treatment:t_as_factor3  0.529                    
## GroupHigh-dose Treatment:t_as_factor4  0.432    0.433           
## GroupHigh-dose Treatment:t_as_factor5  0.548    0.518    0.429  
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.39326246 -0.68107026 -0.02769445  0.62493643  3.92201785 
## 
## Residual standard error: 46.42334 
## Degrees of freedom: 447 total; 437 residual

#Center week_num and create a quadratic term of week_num: we center to decorrelate time with time^2 variables
cholesterol.data_c$average_week <- cholesterol.data_c$Time_number - mean(cholesterol.data_c$Time_number) 

cholesterol.data_c$average_week_2 <- (cholesterol.data_c$average_week)^2


#Create the quadratic term
cholesterol.data_c$t_enumerated_2 <- cholesterol.data_c$t_enumerated^2
#quadratic model
non_constant_slope_model <- gls(S_Cholesterol ~ Group*(average_week + average_week_2), data=cholesterol.data_c, method = "ML", 
          corr = corCompSymm(form = ~ t_enumerated | ID),
          weights = varIdent(form = ~ 1 | t_as_factor))
summary(non_constant_slope_model)$coefficients

##                             (Intercept)                GroupHigh-dose Treatment 
##                           247.744735108                             3.106266341 
##                            average_week                          average_week_2 
##                             1.008567100                            -0.006826771 
##   GroupHigh-dose Treatment:average_week GroupHigh-dose Treatment:average_week_2 
##                             0.265027160                            -0.080054032

anova(lm_het_com_sym, non_constant_slope_model)

based on the table of model performance above, with the ANOVA comparison p-value output as .3047 (above .05), the linear model works better. even though the quadratic has a low p-value, the linear model is less complex, with better AIC and LogLik values.

4.2 Using the best-fit model for the mean response from 4.1 and the best-fit model for the covariance from Question 3, determine whether the mean trajectory of change in cholesterol over time is different between those randomized to High-does versus Placebo:

the null hypothesis is that there’s no difference between the placebo and treatment group, with the placebo group as a baseline, when measuring mean difference between observed time points/lags. the alternative hypothesis is that there’s is a difference.

4.3 Write out the final model for the mean response: \(E(Y_{ij})=\mu_{ij}\)

\[ E(Y_{ij})=\beta_0+\beta_{0,1}+\beta_{1,2,3,4,5}+\epsilon_{ij} \]

4.4 Plot the observed mean levels of cholesterol and estimated by the finalized model for the mean response across the follow up by group; do not smooth but connect the means. By visual examination, how well does the final model for the mean response fit the observed data?

observed_means <- cholesterol.data_c %>%
  group_by(Group, t_enumerated) %>%
  summarise(S_Cholesterol = mean(S_Cholesterol)) %>%
  ungroup()

## `summarise()` has grouped output by 'Group'. You can override using the
## `.groups` argument.

# Predict the fitted values using the model
cholesterol.data_c$fitted_values <- predict(lm_het_com_sym)

# Summarize the fitted data
fitted_means <- cholesterol.data_c %>%
  group_by(Group, t_enumerated) %>%
  summarise(estimated_mean_cholesterol = mean(fitted_values)) %>%
  ungroup()

## `summarise()` has grouped output by 'Group'. You can override using the
## `.groups` argument.

# Merge the observed and fitted summaries for plotting
plot_data <- observed_means %>%
  left_join(fitted_means, by = c("Group", "t_enumerated"))

# Plot observed and estimated mean cholesterol levels
ggplot(plot_data, aes(x = t_enumerated, color = Group)) +
  geom_line(aes(y = S_Cholesterol), linetype = "dashed") +  # Dashed line for observed means
  geom_line(aes(y = estimated_mean_cholesterol)) +              # Solid line for estimated means
  geom_point(aes(y = S_Cholesterol), shape = 1) +            # Points for observed means
  geom_point(aes(y = estimated_mean_cholesterol), shape = 2) +  # Points for estimated means
  labs(title = "Observed and Estimated Mean Cholesterol Levels",
       x = "Follow-up Time (six month increments)",
       y = "Mean Cholesterol Level", color = "Group") +
  theme_minimal()

just looking at the graph, it looks like the model fits the data pretty well, more than I would have thought if only examining the p-values of each interaction. but it does make sense that the first follow up period most closely follows the difference between the observed and predicted values, since that returned the lowest p-value in the model.

BIS628 HW2 National Cooperative Gallstone Study (NCGS)

Samuele Petruccelli

2025-02-07

Question #1

Question #2

Question #3

Question #4