BIS628 HW1 Vlagtwedde-Vlaardingen Study

mark, get set

rm(list=ls())
graphics.off()
closeAllConnections()

knitr

knitr::opts_chunk$set(echo = TRUE)

a few of my favorite packages

library(tidyverse)
library(xfun)
library(tidyr)
library(nlme)

director fury

setwd("~/Yale University/2semester/BIS628 Longitudinal Multilevel Analysis/HW/1")

the import business

HW1_data <- 
  read.delim("~/Yale University/2semester/BIS628 Longitudinal Multilevel Analysis/HW/1/HW1_data.txt", 
             header=FALSE)

new name

names(HW1_data) <- c("ID", "Smoker", "Y0", "Y3", "Y6", "Y9", "Y12", "Y15")

Question #1

1.1 the long way home

HW1_data_long <- HW1_data %>%
  pivot_longer(cols = starts_with("Y"), 
               names_to = "Time", 
               values_to = "FEV1")

level up

HW1_data_long$Time <- factor(HW1_data_long$Time, 
                             levels = c("Y0", "Y3", "Y6", "Y9", "Y12", "Y15"))
print(HW1_data_long)

## # A tibble: 798 × 4
##       ID Smoker Time   FEV1
##    <int>  <int> <fct> <dbl>
##  1     1      0 Y0     3.4 
##  2     1      0 Y3     3.4 
##  3     1      0 Y6     3.45
##  4     1      0 Y9     3.2 
##  5     1      0 Y12   NA   
##  6     1      0 Y15    2.95
##  7     2      1 Y0     3.1 
##  8     2      1 Y3     3.15
##  9     2      1 Y6     3.5 
## 10     2      1 Y9     2.95
## # ℹ 788 more rows

1.2 long responses

na_count <- HW1_data_long %>%
  group_by(Smoker, Time) %>%
  summarise(NA_Count = sum(!is.na(FEV1)), .groups = 'drop') %>%
  arrange(Time)
print(na_count)

## # A tibble: 12 × 3
##    Smoker Time  NA_Count
##     <int> <fct>    <int>
##  1      0 Y0          23
##  2      1 Y0          85
##  3      0 Y3          27
##  4      1 Y3          95
##  5      0 Y6          28
##  6      1 Y6          89
##  7      0 Y9          30
##  8      1 Y9          85
##  9      0 Y12         29
## 10      1 Y12         81
## 11      0 Y15         24
## 12      1 Y15         73

wide responses

widetable <- na_count %>%
  pivot_wider(id_cols = "Smoker",
              names_from = "Time",
              values_from = "NA_Count"
              )
print(widetable)

## # A tibble: 2 × 7
##   Smoker    Y0    Y3    Y6    Y9   Y12   Y15
##    <int> <int> <int> <int> <int> <int> <int>
## 1      0    23    27    28    30    29    24
## 2      1    85    95    89    85    81    73

wide not responses

notthere <- HW1_data_long %>%
  group_by(Smoker, Time) %>%
  summarise(NA_Count = sum(is.na(FEV1)), .groups = 'drop') %>%
  arrange(Time)
print(notthere)

## # A tibble: 12 × 3
##    Smoker Time  NA_Count
##     <int> <fct>    <int>
##  1      0 Y0           9
##  2      1 Y0          16
##  3      0 Y3           5
##  4      1 Y3           6
##  5      0 Y6           4
##  6      1 Y6          12
##  7      0 Y9           2
##  8      1 Y9          16
##  9      0 Y12          3
## 10      1 Y12         20
## 11      0 Y15          8
## 12      1 Y15         28

#switch me
notthere <- notthere %>%
  pivot_wider(id_cols = "Smoker",
              names_from = "Time",
              values_from = "NA_Count"
              )
print(notthere)

## # A tibble: 2 × 7
##   Smoker    Y0    Y3    Y6    Y9   Y12   Y15
##    <int> <int> <int> <int> <int> <int> <int>
## 1      0     9     5     4     2     3     8
## 2      1    16     6    12    16    20    28

combined NA and responses in one wide table

merged_table <- merge(widetable, notthere, all = TRUE)
print(merged_table)

##   Smoker Y0 Y3 Y6 Y9 Y12 Y15
## 1      0  9  5  4  2   3   8
## 2      0 23 27 28 30  29  24
## 3      1 16  6 12 16  20  28
## 4      1 85 95 89 85  81  73

percent missing

missingness <- merged_table

for (i in 2:7) {
  missingness[1, i] <- merged_table[1, i] / merged_table[2, i]
  missingness[3, i] <- merged_table[3, i] / merged_table[4, i]
}

missingness <- missingness[c(1, 3), ]

print(missingness)

##   Smoker        Y0         Y3        Y6         Y9       Y12       Y15
## 1      0 0.3913043 0.18518519 0.1428571 0.06666667 0.1034483 0.3333333
## 3      1 0.1882353 0.06315789 0.1348315 0.18823529 0.2469136 0.3835616

1.3 In a Single Figure, create Boxplots of FEV1 across the follow up time (in years, starting from baseline) by Group (Smoker Status):

Describe the observed variability in the outcome over time: output observed variances and standard deviation by Group and follow up time
Comment on whether the mean and the median coincide

ggplot(data = HW1_data_long, aes(x = FEV1, y = Time)) + 
  geom_boxplot(fill = "gold") +
  facet_wrap(~ Smoker, ncol = 2 ) +
  labs(x = "smoker status", y = "Time")

## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

the observed variance and standard deviation appears similar across all time points and between smoker status, though there are more outliers among former smokers. often the mean and median do not coincide

1.4 In a Single Figure, create a panel (2 rows by 3 columns) of 6 plots (one for each time point) for distribution of FEV1 at each follow up timepoint by group (Smoker Status) overlaid with empirical probability curves. Hint: using the ggplot code in R Lab#1 which produced histograms by group with overlaid empirical probability curves and ADD to the code + facet_wrap(~FU_Time) where FU_Time is the follow up time “0 years”, “3 years”, etc. - Comment on the Normality of the observed FEV1 data at each follow up time point and explain how you might justify using a model for the mean response that relies on the assumption of MVN for Yi of FEV1.

bigplot <- ggplot(data = HW1_data_long, aes(x = FEV1)) + geom_histogram(
  binwidth = .1) + facet_wrap(~ Time) +
    geom_histogram(aes(color = Smoker == 1)) + 
      geom_density(aes(color = Smoker == 1), 
               linetype="dashed") + 
  labs(title = "Distribution of FEV1 for Smokers and non-Smokers Across Time",
       x = "FEV1", 
       color = "Currently Smoking", 
       fill = "Baseline Height >= 1.3 m")

print(bigplot)

## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_bin()`).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_density()`).

generally, the FEV1 for both smokers and non-smokers is normally distributed without skew, which would indicate mean-response as an appropriate model to find any differences along with using the MVN since covariance would be balanced

1.5 In a Single Figure, plot the observed mean FEV1 over time (years of follow up) by Group (Smoker Status), and connect the group means across timepoints starting from the baseline (0 years). DO NOT SMOOTH. - Describe the general characteristics of the time trends for the two groups: what is going on with the group means over time? What might you guess the best-fit model for the mean response will be?

#create a mean data frame
mean_data <- HW1_data_long %>%
  group_by(Smoker, Time) %>%
  summarise(Mean_Value = mean(FEV1, na.rm = TRUE))

## `summarise()` has grouped output by 'Smoker'. You can override using the
## `.groups` argument.

print(mean_data)

## # A tibble: 12 × 3
## # Groups:   Smoker [2]
##    Smoker Time  Mean_Value
##     <int> <fct>      <dbl>
##  1      0 Y0          3.52
##  2      0 Y3          3.58
##  3      0 Y6          3.26
##  4      0 Y9          3.17
##  5      0 Y12         3.14
##  6      0 Y15         2.87
##  7      1 Y0          3.23
##  8      1 Y3          3.12
##  9      1 Y6          3.09
## 10      1 Y9          2.87
## 11      1 Y12         2.80
## 12      1 Y15         2.68

#factor
HW1_data_long$Smoker <- as.factor(HW1_data_long$Smoker)
levels(HW1_data_long$Smoker) <- c("Current Smoker", "Former Smoker")

#Create average trend lines
lineup <- ggplot(data = mean_data, aes(x = Time, y = Mean_Value, group = Smoker)) + 
  geom_line(aes(color=Smoker)) + geom_point(aes(color=Smoker)) + 
  labs(title="Average Trend of Observed Mean FEV1 by Treatment Group",
       x="Time", y="FEV1")
print(lineup)

the mean FEV1 for both past and current smokers decreases over time, indicating the best model might be a simple linear regression

Question #2

Using the notation for Lecture#2 and Lab#2, but with the relevant variables from the current dataset, write out the model for the mean response E(Yij) for:

2.1 The MRP model, and write out the Null hypothesis of interest using your beta coefficients. Explain your Xijp variables.

sum(is.na(HW1_data_long$FEV1))

## [1] 129

sum(is.na(HW1_data_long$Time))

## [1] 0

sum(is.na(HW1_data_long$ID))

## [1] 0

HW1_data_long$Time_enumerated  = as.numeric(as.factor(HW1_data_long$Time))
#Check that the code above worked

table(HW1_data_long$Time_enumerated,HW1_data_long$Time)

##    
##      Y0  Y3  Y6  Y9 Y12 Y15
##   1 133   0   0   0   0   0
##   2   0 133   0   0   0   0
##   3   0   0 133   0   0   0
##   4   0   0   0 133   0   0
##   5   0   0   0   0 133   0
##   6   0   0   0   0   0 133

### Fit a response profile model using the gls function
#The default estimation method is REML
#m1 is Model 1 in the accompanying Word document: 
  #MRP not assuming equal group means at baseline
HW1_data_long_clean <- HW1_data_long %>%
  filter(!is.na(FEV1), !is.na(Time), !is.na(ID))


HW1_data_long_clean <- na.omit(HW1_data_long)

m_1 <- gls(FEV1 ~ Smoker*Time, data = HW1_data_long_clean, 
          corr=corSymm(form= ~ Time_enumerated | ID), 
        weights=varIdent(form = ~ 1 | Time), na.action = na.omit)
#varIdent allows for different variances at each time point
#Get estimates of the fixed effects and model fit statistics (AIC, BIC, logLik)
print(summary(m_1)) ###have a look of the coefficients!!!

## Generalized least squares fit by REML
##   Model: FEV1 ~ Smoker * Time 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   347.1353 495.2289 -140.5677
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.864                        
## 3 0.849 0.893                  
## 4 0.837 0.833 0.848            
## 5 0.855 0.860 0.889 0.892      
## 6 0.841 0.871 0.872 0.874 0.931
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9696520 0.9435027 0.9528324 0.9665003 0.9387173 
## 
## Coefficients:
##                                 Value  Std.Error  t-value p-value
## (Intercept)                  3.487793 0.10946055 31.86347  0.0000
## SmokerFormer Smoker         -0.279122 0.12502607 -2.23251  0.0259
## TimeY3                      -0.010901 0.06346478 -0.17177  0.8637
## TimeY6                      -0.210907 0.06554868 -3.21756  0.0014
## TimeY9                      -0.320675 0.06706319 -4.78168  0.0000
## TimeY12                     -0.380604 0.06341193 -6.00209  0.0000
## TimeY15                     -0.501443 0.06862330 -7.30718  0.0000
## SmokerFormer Smoker:TimeY3  -0.070556 0.07163060 -0.98499  0.3250
## SmokerFormer Smoker:TimeY6   0.071464 0.07420000  0.96313  0.3358
## SmokerFormer Smoker:TimeY9  -0.051547 0.07644132 -0.67433  0.5003
## SmokerFormer Smoker:TimeY12 -0.029820 0.07216006 -0.41325  0.6796
## SmokerFormer Smoker:TimeY15 -0.067079 0.07798420 -0.86016  0.3900
## 
##  Correlation: 
##                             (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker         -0.876                                          
## TimeY3                      -0.378  0.331                                   
## TimeY6                      -0.430  0.377  0.678                            
## TimeY9                      -0.427  0.374  0.546  0.616                     
## TimeY12                     -0.441  0.386  0.596  0.700  0.724              
## TimeY15                     -0.394  0.345  0.608  0.646  0.668  0.782       
## SmokerFormer Smoker:TimeY3   0.335 -0.372 -0.886 -0.600 -0.484 -0.528 -0.539
## SmokerFormer Smoker:TimeY6   0.380 -0.422 -0.599 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9   0.375 -0.417 -0.479 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12  0.387 -0.430 -0.524 -0.615 -0.636 -0.879 -0.687
## SmokerFormer Smoker:TimeY15  0.347 -0.385 -0.535 -0.569 -0.587 -0.688 -0.880
##                             SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker                                         
## TimeY3                                                      
## TimeY6                                                      
## TimeY9                                                      
## TimeY12                                                     
## TimeY15                                                     
## SmokerFormer Smoker:TimeY3                                  
## SmokerFormer Smoker:TimeY6   0.674                          
## SmokerFormer Smoker:TimeY9   0.536   0.603                  
## SmokerFormer Smoker:TimeY12  0.588   0.689   0.710          
## SmokerFormer Smoker:TimeY15  0.603   0.637   0.653   0.772  
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.92624003 -0.65632030  0.05787637  0.65630627  3.20305033 
## 
## Residual standard error: 0.5962603 
## Degrees of freedom: 669 total; 657 residual

m_1.1 <- gls(FEV1 ~ Smoker*Time, data = HW1_data_long_clean, 
          corr=corSymm(form= ~ Time_enumerated | ID),
        weights=varIdent(form = ~ 1 | Time), method = "ML", na.action = na.omit)
print(summary(m_1.1))

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker * Time 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   291.8494 440.5402 -112.9247
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.864                        
## 3 0.850 0.893                  
## 4 0.837 0.833 0.848            
## 5 0.856 0.860 0.889 0.893      
## 6 0.841 0.871 0.873 0.874 0.932
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9698924 0.9436551 0.9529209 0.9665721 0.9388369 
## 
## Coefficients:
##                                 Value  Std.Error  t-value p-value
## (Intercept)                  3.487811 0.10957322 31.83087  0.0000
## SmokerFormer Smoker         -0.279157 0.12515650 -2.23047  0.0261
## TimeY3                      -0.010918 0.06345065 -0.17207  0.8634
## TimeY6                      -0.210914 0.06553080 -3.21855  0.0014
## TimeY9                      -0.320710 0.06704313 -4.78364  0.0000
## TimeY12                     -0.380608 0.06337998 -6.00518  0.0000
## TimeY15                     -0.501412 0.06858235 -7.31110  0.0000
## SmokerFormer Smoker:TimeY3  -0.070504 0.07161507 -0.98448  0.3252
## SmokerFormer Smoker:TimeY6   0.071488 0.07417996  0.96371  0.3355
## SmokerFormer Smoker:TimeY9  -0.051500 0.07641892 -0.67392  0.5006
## SmokerFormer Smoker:TimeY12 -0.029831 0.07212325 -0.41362  0.6793
## SmokerFormer Smoker:TimeY15 -0.067102 0.07793732 -0.86097  0.3896
## 
##  Correlation: 
##                             (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker         -0.875                                          
## TimeY3                      -0.377  0.330                                   
## TimeY6                      -0.430  0.376  0.677                            
## TimeY9                      -0.427  0.374  0.545  0.616                     
## TimeY12                     -0.440  0.385  0.595  0.700  0.724              
## TimeY15                     -0.394  0.345  0.608  0.646  0.668  0.783       
## SmokerFormer Smoker:TimeY3   0.334 -0.371 -0.886 -0.600 -0.483 -0.527 -0.539
## SmokerFormer Smoker:TimeY6   0.380 -0.422 -0.598 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9   0.374 -0.416 -0.478 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12  0.387 -0.430 -0.523 -0.615 -0.636 -0.879 -0.688
## SmokerFormer Smoker:TimeY15  0.347 -0.384 -0.535 -0.569 -0.588 -0.689 -0.880
##                             SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker                                         
## TimeY3                                                      
## TimeY6                                                      
## TimeY9                                                      
## TimeY12                                                     
## TimeY15                                                     
## SmokerFormer Smoker:TimeY3                                  
## SmokerFormer Smoker:TimeY6   0.674                          
## SmokerFormer Smoker:TimeY9   0.535   0.603                  
## SmokerFormer Smoker:TimeY12  0.587   0.689   0.710          
## SmokerFormer Smoker:TimeY15  0.603   0.637   0.653   0.773  
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.94932081 -0.66141929  0.05836135  0.66154229  3.22802665 
## 
## Residual standard error: 0.591567 
## Degrees of freedom: 669 total; 657 residual

m_1.2 <- gls(FEV1 ~ Smoker + Time, data = HW1_data_long_clean, 
          corr=corSymm(form= ~ Time_enumerated | ID),
        weights=varIdent(form = ~ 1 | Time), method = "ML", na.action = na.omit)
print(summary(m_1.2))

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker + Time 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   289.5773 415.7393 -116.7887
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.863                        
## 3 0.850 0.888                  
## 4 0.836 0.835 0.842            
## 5 0.856 0.862 0.887 0.893      
## 6 0.839 0.872 0.869 0.874 0.931
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9748603 0.9474614 0.9525876 0.9633347 0.9389191 
## 
## Coefficients:
##                         Value  Std.Error   t-value p-value
## (Intercept)          3.498971 0.09834206  35.57960  0.0000
## SmokerFormer Smoker -0.293386 0.10923274  -2.68588  0.0074
## TimeY3              -0.067006 0.02948543  -2.27251  0.0234
## TimeY6              -0.156902 0.03066146  -5.11723  0.0000
## TimeY9              -0.359571 0.03206646 -11.21331  0.0000
## TimeY12             -0.403248 0.03009547 -13.39895  0.0000
## TimeY15             -0.552308 0.03253095 -16.97792  0.0000
## 
##  Correlation: 
##                     (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12
## SmokerFormer Smoker -0.844                                   
## TimeY3              -0.188  0.001                            
## TimeY6              -0.215  0.002  0.648                     
## TimeY9              -0.216  0.007  0.516  0.557              
## TimeY12             -0.223  0.006  0.571  0.656  0.678       
## TimeY15             -0.200  0.003  0.596  0.605  0.622  0.748
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.9283352 -0.6550817  0.0819973  0.6668006  3.2487553 
## 
## Residual standard error: 0.5915045 
## Degrees of freedom: 669 total; 662 residual

# now everybody from the LRT ...
anova(m_1.1, m_1.2)

the $N_0$ is that the effects of smoking status, time, and their interaction are not statistically significant and have no effect on FEV1. we can fail to reject the $N_0$ hypothesis. None of the $X_{ijp}$ variables had a p-value below .05

2.2 The constant slope of change in mean FEV1, and write out the Null hypothesis of interest using the beta coefficients. Explain your Xijp variables.

the values for TimeY3:TImeY6. These represent the differences in FEV1 from the baseline from Time0. Each coefficient shows how much FEV1 changes from Time 0 at each of these specific time points, and apart from $Y_3$, are all statistically significant.

2.3 The non-constant slope of change in mean FEV1 (polynomial up to quadratic effect of follow up time), and write out the Null hypothesis of interest using the beta coefficients. Explain your Xijp variables.

we need a polynomial model for this, as below

HW1_data_long_clean$ctime <- HW1_data_long_clean$Time_enumerated - 
  mean(HW1_data_long_clean$Time_enumerated)

HW1_data_long_clean$ctime2 <- (HW1_data_long_clean$ctime)^2

#looks similar
summary(HW1_data_long$ctime)

## Warning: Unknown or uninitialised column: `ctime`.

## Length  Class   Mode 
##      0   NULL   NULL

### Fit a quadratic trend model
m_i6 <- gls(FEV1 ~ Smoker * ctime + Smoker * ctime2, 
            data = HW1_data_long_clean,
          corr=corSymm(form= ~ Time_enumerated | ID),
          weights=varIdent(form= ~ 1 | Time))
summary(m_i6)

## Generalized least squares fit by REML
##   Model: FEV1 ~ Smoker * ctime + Smoker * ctime2 
##   Data: HW1_data_long_clean 
##       AIC      BIC    logLik
##   337.991 459.4039 -141.9955
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.862                        
## 3 0.849 0.889                  
## 4 0.832 0.830 0.833            
## 5 0.855 0.863 0.890 0.883      
## 6 0.842 0.873 0.873 0.868 0.932
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9770804 0.9493417 0.9513921 0.9693235 0.9398637 
## 
## Coefficients:
##                                Value  Std.Error  t-value p-value
## (Intercept)                 3.251352 0.09564734 33.99313  0.0000
## SmokerFormer Smoker        -0.280808 0.10984322 -2.55645  0.0108
## ctime                      -0.106446 0.01215982 -8.75390  0.0000
## ctime2                      0.000331 0.00669002  0.04947  0.9606
## SmokerFormer Smoker:ctime  -0.007274 0.01382671 -0.52609  0.5990
## SmokerFormer Smoker:ctime2 -0.006571 0.00766256 -0.85751  0.3915
## 
##  Correlation: 
##                            (Intr) SmkrFS ctime  ctime2 SmkFS:
## SmokerFormer Smoker        -0.871                            
## ctime                      -0.003  0.003                     
## ctime2                     -0.086  0.075 -0.257              
## SmokerFormer Smoker:ctime   0.003 -0.001 -0.879  0.226       
## SmokerFormer Smoker:ctime2  0.075 -0.090  0.224 -0.873 -0.244
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.90912802 -0.67271976  0.03459019  0.67478636  3.24118537 
## 
## Residual standard error: 0.5957773 
## Degrees of freedom: 669 total; 663 residual

m_i6.1 <- gls(FEV1 ~ Smoker * ctime + Smoker * ctime2, 
            data = HW1_data_long_clean,
            method = "ML",
          corr=corSymm(form= ~ Time_enumerated | ID),
          weights=varIdent(form= ~ 1 | Time))
summary(m_i6.1)

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker * ctime + Smoker * ctime2 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   298.2473 419.9035 -122.1237
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.861                        
## 3 0.849 0.888                  
## 4 0.832 0.828 0.831            
## 5 0.856 0.862 0.889 0.881      
## 6 0.842 0.873 0.873 0.868 0.932
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9781922 0.9502159 0.9521886 0.9693064 0.9404409 
## 
## Coefficients:
##                                Value  Std.Error  t-value p-value
## (Intercept)                 3.251337 0.09534491 34.10079  0.0000
## SmokerFormer Smoker        -0.280869 0.10949654 -2.56510  0.0105
## ctime                      -0.106453 0.01211448 -8.78728  0.0000
## ctime2                      0.000321 0.00665361  0.04830  0.9615
## SmokerFormer Smoker:ctime  -0.007255 0.01377484 -0.52666  0.5986
## SmokerFormer Smoker:ctime2 -0.006563 0.00762113 -0.86110  0.3895
## 
##  Correlation: 
##                            (Intr) SmkrFS ctime  ctime2 SmkFS:
## SmokerFormer Smoker        -0.871                            
## ctime                      -0.003  0.003                     
## ctime2                     -0.086  0.075 -0.257              
## SmokerFormer Smoker:ctime   0.003 -0.001 -0.879  0.226       
## SmokerFormer Smoker:ctime2  0.075 -0.089  0.224 -0.873 -0.244
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.93205436 -0.67752242  0.03503597  0.67965261  3.26494799 
## 
## Residual standard error: 0.5910947 
## Degrees of freedom: 669 total; 663 residual

m_i6.2 <- gls(FEV1 ~ Smoker + ctime + ctime2, 
            data = HW1_data_long_clean,
            method = "ML",
          corr=corSymm(form= ~ Time_enumerated | ID),
          weights=varIdent(form= ~ 1 | Time))
summary(m_i6.2)

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker + ctime + ctime2 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   295.5166 408.1612 -122.7583
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.862                        
## 3 0.848 0.887                  
## 4 0.833 0.828 0.830            
## 5 0.855 0.862 0.888 0.881      
## 6 0.841 0.873 0.869 0.868 0.931
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9771470 0.9512045 0.9527234 0.9656179 0.9407500 
## 
## Coefficients:
##                         Value  Std.Error   t-value p-value
## (Intercept)          3.260015 0.09493655  34.33888  0.0000
## SmokerFormer Smoker -0.292784 0.10886049  -2.68954  0.0073
## ctime               -0.111878 0.00575943 -19.42526  0.0000
## ctime2              -0.004516 0.00324973  -1.38950  0.1651
## 
##  Correlation: 
##                     (Intr) SmkrFS ctime 
## SmokerFormer Smoker -0.870              
## ctime               -0.005  0.005       
## ctime2              -0.047 -0.006 -0.210
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.88845683 -0.68441123  0.05266598  0.68565849  3.28560179 
## 
## Residual standard error: 0.5909879 
## Degrees of freedom: 669 total; 665 residual

anova(m_i6.1, m_i6.2)

the null hypothesis is if $\beta_3$ and $\beta_5$ are $ = 0$, which they both are, though the p-values are above .05, indicating non-statistical significance. regarding the $X_{ijp}$ variables, Smoker × ctime: The interaction between smoker status and linear time, representing how the rate of change in FEV1 differs for smokers compared to non-smokers. Smoker × ctime2: The interaction between smoker status and quadratic time, representing how the non-linear effect of time on FEV1 differs for smokers compared to non-smokers

table(HW1_data_long$Time_enumerated,HW1_data_long$Time)

##    
##      Y0  Y3  Y6  Y9 Y12 Y15
##   1 133   0   0   0   0   0
##   2   0 133   0   0   0   0
##   3   0   0 133   0   0   0
##   4   0   0   0 133   0   0
##   5   0   0   0   0 133   0
##   6   0   0   0   0   0 133

2.4 The piece-wise linear change in FEV1 with a knot at the 3-year follow up, and write out the Null hypothesis of interest using the beta coefficients. Explain your Xijp variables.

# threat level 7
HW1_data_long_clean$y3 <- (HW1_data_long_clean$Time_enumerated - 3) * 
  I(HW1_data_long_clean$Time_enumerated >= 3)
head(HW1_data_long_clean)

### Fit a piece-wise linear model
m7 <- gls(FEV1 ~ Smoker*Time_enumerated + Smoker*y3, data = HW1_data_long_clean, 
          corr=corSymm(form= ~ Time_enumerated | ID),
          weights=varIdent(form= ~ 1 | Time))
summary(m7)

## Generalized least squares fit by REML
##   Model: FEV1 ~ Smoker * Time_enumerated + Smoker * y3 
##   Data: HW1_data_long_clean 
##       AIC    BIC    logLik
##   328.877 450.29 -137.4385
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.862                        
## 3 0.850 0.891                  
## 4 0.831 0.830 0.837            
## 5 0.855 0.862 0.891 0.881      
## 6 0.841 0.873 0.873 0.869 0.932
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9761671 0.9466063 0.9505686 0.9693499 0.9397061 
## 
## Coefficients:
##                                         Value  Std.Error   t-value p-value
## (Intercept)                          3.651597 0.12323654 29.630801  0.0000
## SmokerFormer Smoker                 -0.364707 0.14047701 -2.596206  0.0096
## Time_enumerated                     -0.120938 0.02970930 -4.070727  0.0001
## y3                                   0.021471 0.04015482  0.534704  0.5930
## SmokerFormer Smoker:Time_enumerated  0.036321 0.03368764  1.078167  0.2814
## SmokerFormer Smoker:y3              -0.069127 0.04581959 -1.508674  0.1319
## 
##  Correlation: 
##                                     (Intr) SmkrFS Tm_nmr y3     SFS:T_
## SmokerFormer Smoker                 -0.877                            
## Time_enumerated                     -0.626  0.549                     
## y3                                   0.532 -0.466 -0.918              
## SmokerFormer Smoker:Time_enumerated  0.552 -0.619 -0.882  0.810       
## SmokerFormer Smoker:y3              -0.466  0.524  0.805 -0.876 -0.917
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.92575818 -0.67492471  0.01886003  0.68115799  3.23190067 
## 
## Residual standard error: 0.596026 
## Degrees of freedom: 669 total; 663 residual

m7.1 <- gls(FEV1 ~ Smoker*Time_enumerated + Smoker*y3, data = HW1_data_long_clean, 
          corr=corSymm(form= ~ Time_enumerated | ID),
          weights=varIdent(form= ~ 1 | Time), method = "ML")
summary(m7.1)

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker * Time_enumerated + Smoker * y3 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   296.2794 417.9355 -121.1397
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.860                        
## 3 0.850 0.890                  
## 4 0.831 0.828 0.835            
## 5 0.855 0.861 0.890 0.880      
## 6 0.841 0.873 0.873 0.868 0.932
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9773222 0.9472690 0.9514751 0.9694814 0.9403262 
## 
## Coefficients:
##                                         Value  Std.Error   t-value p-value
## (Intercept)                          3.651249 0.12275745 29.743599  0.0000
## SmokerFormer Smoker                 -0.364219 0.13993059 -2.602858  0.0095
## Time_enumerated                     -0.120827 0.02956910 -4.086247  0.0000
## y3                                   0.021293 0.03997864  0.532607  0.5945
## SmokerFormer Smoker:Time_enumerated  0.036125 0.03352788  1.077463  0.2817
## SmokerFormer Smoker:y3              -0.068828 0.04561868 -1.508761  0.1318
## 
##  Correlation: 
##                                     (Intr) SmkrFS Tm_nmr y3     SFS:T_
## SmokerFormer Smoker                 -0.877                            
## Time_enumerated                     -0.625  0.548                     
## y3                                   0.531 -0.466 -0.918              
## SmokerFormer Smoker:Time_enumerated  0.551 -0.618 -0.882  0.810       
## SmokerFormer Smoker:y3              -0.465  0.523  0.804 -0.876 -0.917
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.94832301 -0.68040038  0.01913402  0.68620816  3.25579117 
## 
## Residual standard error: 0.59131 
## Degrees of freedom: 669 total; 663 residual

m7.2 <- gls(FEV1 ~ Smoker + Time_enumerated + y3, data = HW1_data_long_clean, 
          corr=corSymm(form= ~ Time_enumerated | ID),
          weights=varIdent(form= ~ 1 | Time), method = "ML")
summary(m7.2)

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker + Time_enumerated + y3 
##   Data: HW1_data_long_clean 
##       AIC      BIC   logLik
##   295.018 407.6626 -122.509
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.862                        
## 3 0.849 0.887                  
## 4 0.833 0.829 0.832            
## 5 0.855 0.862 0.888 0.881      
## 6 0.841 0.873 0.869 0.869 0.931
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9772319 0.9498477 0.9522841 0.9654843 0.9410908 
## 
## Coefficients:
##                         Value  Std.Error  t-value p-value
## (Intercept)          3.596282 0.10156759 35.40777  0.0000
## SmokerFormer Smoker -0.292423 0.10885487 -2.68636  0.0074
## Time_enumerated     -0.093584 0.01396281 -6.70236  0.0000
## y3                  -0.030262 0.01934509 -1.56433  0.1182
## 
##  Correlation: 
##                     (Intr) SmkrFS Tm_nmr
## SmokerFormer Smoker -0.815              
## Time_enumerated     -0.352  0.006       
## y3                   0.294 -0.004 -0.915
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.88539075 -0.67488182  0.05781599  0.67338844  3.29366461 
## 
## Residual standard error: 0.5910889 
## Degrees of freedom: 669 total; 665 residual

The $H_0$ tests whether the interaction at the 3-year follow-up (y3 x smoker) has any significant effect on the change in FEV1. $H_0 = $ has to be $\beta_{y3} = 0$ and $\beta_{Smoker:y3} = 0$, which they are, but not statistically significant. $\beta_{y3}$ indicates the knot at year 3. $\beta_{Smoker:y3}$ shows the differential effect of the knot at year 3 on FEV1 for smokers versus non-smokers. It indicates whether the slope change at year 3 is different for smokers compared to non-smokers.

2.5 Using R or SAS statistical software, implement each of the 4 models for the mean response (NO OUTPUT IS NEEDED at this point BUT show your programming code) and pick among them the best-fit model for the mean of FEV1: - First chose among the nested models: Which models are nested and why? Which test should you use to choose among them?

the nested models are the non-integer models (m_1.1, m_1.2, m_i6.1, etc), since they are special cases of the simpler model. nested models are either less constrained or have more parameters

Compare the best-fit model among the nested models to the not-nested model: Which test should you use chose among them?

we can exclude the quadratic model (non-nested) since it has the highest AIC above 300. all the other models have roughly the same AIC and all share insignificant $\beta_i$ coefficients. so we’ll favor the linear model (m1 series), since it uses the fewest predictors to achieve similar results. even though the p-values for the LRT decrease for the quadratic and piecewise functions, the LRT decreases too. so we still favor the linear model with it’s much higher LRT and still insignificant p-value.

Show your R or SAS output for the summary statistics from your tests (e.g., from ANOVA in R) and use that as evidence for selecting the best-fit model for the mean response.

anova(m7.1, m7.2)

Question #3

Using the best-fit model for the mean response you chose in Question #2:

3.1 Provide your R or SAS output ONLY for the estimated regression coefficients.

summary(m_1.1)

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker * Time 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   291.8494 440.5402 -112.9247
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.864                        
## 3 0.850 0.893                  
## 4 0.837 0.833 0.848            
## 5 0.856 0.860 0.889 0.893      
## 6 0.841 0.871 0.873 0.874 0.932
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9698924 0.9436551 0.9529209 0.9665721 0.9388369 
## 
## Coefficients:
##                                 Value  Std.Error  t-value p-value
## (Intercept)                  3.487811 0.10957322 31.83087  0.0000
## SmokerFormer Smoker         -0.279157 0.12515650 -2.23047  0.0261
## TimeY3                      -0.010918 0.06345065 -0.17207  0.8634
## TimeY6                      -0.210914 0.06553080 -3.21855  0.0014
## TimeY9                      -0.320710 0.06704313 -4.78364  0.0000
## TimeY12                     -0.380608 0.06337998 -6.00518  0.0000
## TimeY15                     -0.501412 0.06858235 -7.31110  0.0000
## SmokerFormer Smoker:TimeY3  -0.070504 0.07161507 -0.98448  0.3252
## SmokerFormer Smoker:TimeY6   0.071488 0.07417996  0.96371  0.3355
## SmokerFormer Smoker:TimeY9  -0.051500 0.07641892 -0.67392  0.5006
## SmokerFormer Smoker:TimeY12 -0.029831 0.07212325 -0.41362  0.6793
## SmokerFormer Smoker:TimeY15 -0.067102 0.07793732 -0.86097  0.3896
## 
##  Correlation: 
##                             (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker         -0.875                                          
## TimeY3                      -0.377  0.330                                   
## TimeY6                      -0.430  0.376  0.677                            
## TimeY9                      -0.427  0.374  0.545  0.616                     
## TimeY12                     -0.440  0.385  0.595  0.700  0.724              
## TimeY15                     -0.394  0.345  0.608  0.646  0.668  0.783       
## SmokerFormer Smoker:TimeY3   0.334 -0.371 -0.886 -0.600 -0.483 -0.527 -0.539
## SmokerFormer Smoker:TimeY6   0.380 -0.422 -0.598 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9   0.374 -0.416 -0.478 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12  0.387 -0.430 -0.523 -0.615 -0.636 -0.879 -0.688
## SmokerFormer Smoker:TimeY15  0.347 -0.384 -0.535 -0.569 -0.588 -0.689 -0.880
##                             SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker                                         
## TimeY3                                                      
## TimeY6                                                      
## TimeY9                                                      
## TimeY12                                                     
## TimeY15                                                     
## SmokerFormer Smoker:TimeY3                                  
## SmokerFormer Smoker:TimeY6   0.674                          
## SmokerFormer Smoker:TimeY9   0.535   0.603                  
## SmokerFormer Smoker:TimeY12  0.587   0.689   0.710          
## SmokerFormer Smoker:TimeY15  0.603   0.637   0.653   0.773  
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.94932081 -0.66141929  0.05836135  0.66154229  3.22802665 
## 
## Residual standard error: 0.591567 
## Degrees of freedom: 669 total; 657 residual

#a summary of the coefficients
print(summary(m_1.1$coefficients))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.5014 -0.2895 -0.0688  0.1364 -0.0251  3.4878

3.2 Conduct an appropriate hypothesis test for the differences in mean FEV1 levels over time between the two groups of smokers (current versus past smoker). Make sure you include the following in your response: - State your Null and Alternative Hypotheses using the regression coefficients.

the $H_0$ is that the $\beta_{smoker:time} = 0$ over time, which it is, roughly by one-tenth, but with an insignificant p-value. the alternative hypothesis is that $\beta_{smoker:time} \neq 0$

What test did you use?

anova(m_1,type = "marginal")

using the f-test we see that the interaction between smoker status and time is limited, and without a significant p-value. it’s nested model, m_1.1, with a maximum likelihood, does not change either of these results

What is your conclusion in words? (Do not simply state whether you failed to reject or rejected the Null BUT also explain in one or two sentences your conclusion.)

my conclusion is that we failed to reject the $N_0$. Our $\beta_{interactions}$ failed to yield statistically significant p-values, had low F-statistic values, and the optimal model did not have much of a big AIC difference from unfit models

3.3 Using the estimated regression coefficients from your output, write out and hand-calculate time-specific estimates of FEV1 means at baseline separately in Current and Former Smokers. Provide 95% Confidence Intervals (95%CI) for these two estimated means of FEV1 (you can use software for these).

summary(m_1.1)

## Generalized least squares fit by maximum likelihood
##   Model: FEV1 ~ Smoker * Time 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   291.8494 440.5402 -112.9247
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.864                        
## 3 0.850 0.893                  
## 4 0.837 0.833 0.848            
## 5 0.856 0.860 0.889 0.893      
## 6 0.841 0.871 0.873 0.874 0.932
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9698924 0.9436551 0.9529209 0.9665721 0.9388369 
## 
## Coefficients:
##                                 Value  Std.Error  t-value p-value
## (Intercept)                  3.487811 0.10957322 31.83087  0.0000
## SmokerFormer Smoker         -0.279157 0.12515650 -2.23047  0.0261
## TimeY3                      -0.010918 0.06345065 -0.17207  0.8634
## TimeY6                      -0.210914 0.06553080 -3.21855  0.0014
## TimeY9                      -0.320710 0.06704313 -4.78364  0.0000
## TimeY12                     -0.380608 0.06337998 -6.00518  0.0000
## TimeY15                     -0.501412 0.06858235 -7.31110  0.0000
## SmokerFormer Smoker:TimeY3  -0.070504 0.07161507 -0.98448  0.3252
## SmokerFormer Smoker:TimeY6   0.071488 0.07417996  0.96371  0.3355
## SmokerFormer Smoker:TimeY9  -0.051500 0.07641892 -0.67392  0.5006
## SmokerFormer Smoker:TimeY12 -0.029831 0.07212325 -0.41362  0.6793
## SmokerFormer Smoker:TimeY15 -0.067102 0.07793732 -0.86097  0.3896
## 
##  Correlation: 
##                             (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker         -0.875                                          
## TimeY3                      -0.377  0.330                                   
## TimeY6                      -0.430  0.376  0.677                            
## TimeY9                      -0.427  0.374  0.545  0.616                     
## TimeY12                     -0.440  0.385  0.595  0.700  0.724              
## TimeY15                     -0.394  0.345  0.608  0.646  0.668  0.783       
## SmokerFormer Smoker:TimeY3   0.334 -0.371 -0.886 -0.600 -0.483 -0.527 -0.539
## SmokerFormer Smoker:TimeY6   0.380 -0.422 -0.598 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9   0.374 -0.416 -0.478 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12  0.387 -0.430 -0.523 -0.615 -0.636 -0.879 -0.688
## SmokerFormer Smoker:TimeY15  0.347 -0.384 -0.535 -0.569 -0.588 -0.689 -0.880
##                             SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker                                         
## TimeY3                                                      
## TimeY6                                                      
## TimeY9                                                      
## TimeY12                                                     
## TimeY15                                                     
## SmokerFormer Smoker:TimeY3                                  
## SmokerFormer Smoker:TimeY6   0.674                          
## SmokerFormer Smoker:TimeY9   0.535   0.603                  
## SmokerFormer Smoker:TimeY12  0.587   0.689   0.710          
## SmokerFormer Smoker:TimeY15  0.603   0.637   0.653   0.773  
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.94932081 -0.66141929  0.05836135  0.66154229  3.22802665 
## 
## Residual standard error: 0.591567 
## Degrees of freedom: 669 total; 657 residual

current smokers’ baseline equation $Y_i = 3.49 + (-.27 * 1) = 3.22$ former smokers’ baseline equation $Y_i = 3.49 + (-.27 * 0) = 3.49$

confit_baseline <- confint(m_1)
print(confit_baseline)

##                                   2.5 %      97.5 %
## (Intercept)                  3.27325428  3.70233175
## SmokerFormer Smoker         -0.52416898 -0.03407580
## TimeY3                      -0.13529010  0.11348727
## TimeY6                      -0.33937981 -0.08243370
## TimeY9                      -0.45211611 -0.18923324
## TimeY12                     -0.50488907 -0.25631886
## TimeY15                     -0.63594181 -0.36694341
## SmokerFormer Smoker:TimeY3  -0.21094914  0.06983766
## SmokerFormer Smoker:TimeY6  -0.07396496  0.21689369
## SmokerFormer Smoker:TimeY9  -0.20136899  0.09827546
## SmokerFormer Smoker:TimeY12 -0.17125125  0.11161097
## SmokerFormer Smoker:TimeY15 -0.21992477  0.08576768

print(paste("the 95% CI for the estimated FEV1 baseline means are",
            round(confit_baseline[1,1], 2), "-", round(confit_baseline[1,2], 2),
            "for current smokers and", round((confit_baseline[1,1]) + 
                                               confit_baseline[2,1]),
            "-", round((confit_baseline[1,2]) + confit_baseline[2,2], 2), 
            "for former smokers"))

## [1] "the 95% CI for the estimated FEV1 baseline means are 3.27 - 3.7 for current smokers and 3 - 3.67 for former smokers"

3.4 On the same Figure, plot the observed and estimated mean FEV1 levels for each group (Current and Former Smokers) across the follow up time from the best-fit model, connecting the means to produce the mean profiles (i.e. connect the means without smoothing). What are your observations of how the model fits the observed data?

lsmeans_mod1 <- unique(data.frame(Smoker = HW1_data_long_clean$Smoker, Time_enumerated = HW1_data_long_clean$Time_enumerated, FEV1 = m_1$fitted))
lsmeans_mod1

write.csv(lsmeans_mod1, "mod.csv", row.names = FALSE)
mod <- read.csv("mod.csv", header = TRUE)
write.csv(mean_data, "mean.csv", row.names = FALSE)
mean_data <- (read.csv("mean.csv", header = TRUE))
trends.meanprofile <- read.csv("all.csv", header = TRUE)
trends.meanprofile

#trend lines
trends <- ggplot(data = mod, aes(
  x = Time_enumerated,
  y = FEV1,
  group = Smoker))
trends + geom_line(aes(color = Smoker)) + geom_point(aes(color = Smoker)) +
  labs(title="Average Trend of FEV1 by Smoker Group Fitted by m_1",
       x="Time", y="FEV1", colour="Smoker Group")

#Plot on one figure observed means and fitted means from m1
#Create a grouping factor to distinguish predicted values from observed means
mod$group <- "Predicted"
mean_data$group <- "Observed"
#combine the dataframe with mlogcd4 by row #trying to find mean 

trends.meanprofile

trends.meanprofile$Group <- as.factor(trends.meanprofile$Group)

mean_data

trends_b <- ggplot(trends.meanprofile, aes(x = Time_enumerated,
                                           y = FEV1,
                                           color=Smoker,
                                           linetype=Group,
                                           group=interaction(Smoker,Group)))
trends_b + geom_line() + 
  geom_point() +
  labs(title="Predicted and Observed Mean Trajectories of FEV1 by Smoker Group",
       x="Time Point", y="FEV1", color="Treatment Group", linetype="Predicted/Observed") +
  scale_linetype_manual(values=c("dashed","solid"), labels=c("Observed","Predicted"))

the model and the observed values are fitting fairly close together, indicating a close fit and making me surprised that the beta coefficients

print(summary(m_1))

## Generalized least squares fit by REML
##   Model: FEV1 ~ Smoker * Time 
##   Data: HW1_data_long_clean 
##        AIC      BIC    logLik
##   347.1353 495.2289 -140.5677
## 
## Correlation Structure: General
##  Formula: ~Time_enumerated | ID 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5    
## 2 0.864                        
## 3 0.849 0.893                  
## 4 0.837 0.833 0.848            
## 5 0.855 0.860 0.889 0.892      
## 6 0.841 0.871 0.872 0.874 0.931
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Time 
##  Parameter estimates:
##        Y0        Y3        Y6        Y9       Y15       Y12 
## 1.0000000 0.9696520 0.9435027 0.9528324 0.9665003 0.9387173 
## 
## Coefficients:
##                                 Value  Std.Error  t-value p-value
## (Intercept)                  3.487793 0.10946055 31.86347  0.0000
## SmokerFormer Smoker         -0.279122 0.12502607 -2.23251  0.0259
## TimeY3                      -0.010901 0.06346478 -0.17177  0.8637
## TimeY6                      -0.210907 0.06554868 -3.21756  0.0014
## TimeY9                      -0.320675 0.06706319 -4.78168  0.0000
## TimeY12                     -0.380604 0.06341193 -6.00209  0.0000
## TimeY15                     -0.501443 0.06862330 -7.30718  0.0000
## SmokerFormer Smoker:TimeY3  -0.070556 0.07163060 -0.98499  0.3250
## SmokerFormer Smoker:TimeY6   0.071464 0.07420000  0.96313  0.3358
## SmokerFormer Smoker:TimeY9  -0.051547 0.07644132 -0.67433  0.5003
## SmokerFormer Smoker:TimeY12 -0.029820 0.07216006 -0.41325  0.6796
## SmokerFormer Smoker:TimeY15 -0.067079 0.07798420 -0.86016  0.3900
## 
##  Correlation: 
##                             (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker         -0.876                                          
## TimeY3                      -0.378  0.331                                   
## TimeY6                      -0.430  0.377  0.678                            
## TimeY9                      -0.427  0.374  0.546  0.616                     
## TimeY12                     -0.441  0.386  0.596  0.700  0.724              
## TimeY15                     -0.394  0.345  0.608  0.646  0.668  0.782       
## SmokerFormer Smoker:TimeY3   0.335 -0.372 -0.886 -0.600 -0.484 -0.528 -0.539
## SmokerFormer Smoker:TimeY6   0.380 -0.422 -0.599 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9   0.375 -0.417 -0.479 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12  0.387 -0.430 -0.524 -0.615 -0.636 -0.879 -0.687
## SmokerFormer Smoker:TimeY15  0.347 -0.385 -0.535 -0.569 -0.587 -0.688 -0.880
##                             SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker                                         
## TimeY3                                                      
## TimeY6                                                      
## TimeY9                                                      
## TimeY12                                                     
## TimeY15                                                     
## SmokerFormer Smoker:TimeY3                                  
## SmokerFormer Smoker:TimeY6   0.674                          
## SmokerFormer Smoker:TimeY9   0.536   0.603                  
## SmokerFormer Smoker:TimeY12  0.588   0.689   0.710          
## SmokerFormer Smoker:TimeY15  0.603   0.637   0.653   0.772  
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.92624003 -0.65632030  0.05787637  0.65630627  3.20305033 
## 
## Residual standard error: 0.5962603 
## Degrees of freedom: 669 total; 657 residual

Question #4

Using the best-fit model for the mean response and the worst-fit model for the mean response from your solution in Question #2:

4.1 Why do we say that the parameters inside the Covariance $\sum_!$ are ‘nuisance’ in this example?

because these parameters inside the matrix capture the dependencies in the data that we’re not interested in but necessary for understanding. they describe how the data points within the same subject (ID) are related (correlation over time) and how the variability in measurements changes over time. But we’re interested in the relationship between smoker status and time, which these parameters aren’t measuring.

4.2 Output the estimated Covariance matrix from each model (should have 2 matrices).

#m1 is the best model (linear), m_i6 is the worst model (quadratic)
cov_matrix_m_1 <- getVarCov(m_1)
cov_matrix_m_i6 <- getVarCov(m_i6)

print(cov_matrix_m_1)

## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]    [,5]
## [1,] 0.35553 0.29779 0.28484 0.28338 0.28884
## [2,] 0.29779 0.33427 0.29045 0.27354 0.29020
## [3,] 0.28484 0.29045 0.31649 0.27091 0.28284
## [4,] 0.28338 0.27354 0.27091 0.32278 0.28606
## [5,] 0.28884 0.29020 0.28284 0.28606 0.33211
##   Standard Deviations: 0.59626 0.57816 0.56257 0.56814 0.57629

print(cov_matrix_m_i6)

## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]    [,5]
## [1,] 0.35495 0.29912 0.28607 0.28104 0.28964
## [2,] 0.29912 0.33887 0.29268 0.27389 0.29354
## [3,] 0.28607 0.29268 0.31990 0.26706 0.28508
## [4,] 0.28104 0.27389 0.26706 0.32128 0.28427
## [5,] 0.28964 0.29354 0.28508 0.28427 0.33351
##   Standard Deviations: 0.59578 0.58212 0.5656 0.56682 0.5775

4.3 Comment on the variances and covariances: which model has larger/smaller variances and covariances? Why?

the linear model has slightly larger variances and slightly smaller covariances compared to the quadratic model. The difference is small but consistent across the matrices. The smaller variances and slightly larger covariances in the quadratic may indicate that it’s is better at capturing the correlation structure or the relationships between time points, which leads to a tighter fit with lower variability.

4.4 Output the estimated Correlation matrix from each model (should have 2 matrices).

std_devs_m_1 <- sqrt(diag(cov_matrix_m_1))
std_devs_m_i6 <- sqrt(diag(cov_matrix_m_i6))

# 2. Normalize the covariance matrices by the standard deviations
cor_matrix_m_1 <- cov_matrix_m_1 / (std_devs_m_1 %*% t(std_devs_m_1))
cor_matrix_m_i6 <- cov_matrix_m_i6 / (std_devs_m_i6 %*% t(std_devs_m_i6))

print(cor_matrix_m_1)

## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]    [,5]
## [1,] 1.00000 0.86382 0.84915 0.83654 0.84058
## [2,] 0.86382 1.00000 0.89296 0.83276 0.87098
## [3,] 0.84915 0.89296 1.00000 0.84760 0.87243
## [4,] 0.83654 0.83276 0.84760 1.00000 0.87371
## [5,] 0.84058 0.87098 0.87243 0.87371 1.00000
##   Standard Deviations: 1 1 1 1 1

print(cor_matrix_m_i6)

## Marginal variance covariance matrix
##         [,1]    [,2]    [,3]    [,4]    [,5]
## [1,] 1.00000 0.86248 0.84896 0.83222 0.84183
## [2,] 0.86248 1.00000 0.88894 0.83007 0.87317
## [3,] 0.84896 0.88894 1.00000 0.83302 0.87280
## [4,] 0.83222 0.83007 0.83302 1.00000 0.86844
## [5,] 0.84183 0.87317 0.87280 0.86844 1.00000
##   Standard Deviations: 1 1 1 1 1

4.5 Comment on the correlations: which model has larger/smaller correlations? Why? Basically, I am asking you about the effect of how we specify the mean response $E(Y_{ij}) =𝜇_{ij}$ on the estimated Covariance $(\sum_i)$ and Correlation $(R_i)$ matrices. Why do we care about the model for the mean response?

the linear model has slightly larger correlations than the quadratic model. This is because the linear model does not capture the non-linear relationships in the data as effectively as the quadratic model, resulting in larger residuals and stronger correlations. The specification of the mean response is important because it determines how well the model fits the data. A more accurate specification (like the quadratic model) leads to smaller residuals and smaller correlations, while a less accurate specification leads to larger residuals and stronger correlations.

BIS628 HW1 Vlagtwedde-Vlaardingen Study

Samuele Petruccelli

2025-02-01

Question #1

Question #2

Question #3

Question #4