mark, get set
rm(list=ls())
graphics.off()
closeAllConnections()
knitr
knitr::opts_chunk$set(echo = TRUE)
a few of my favorite packages
library(tidyverse)
library(xfun)
library(tidyr)
library(nlme)
director fury
setwd("~/Yale University/2semester/BIS628 Longitudinal Multilevel Analysis/HW/1")
the import business
HW1_data <-
read.delim("~/Yale University/2semester/BIS628 Longitudinal Multilevel Analysis/HW/1/HW1_data.txt",
header=FALSE)
new name
names(HW1_data) <- c("ID", "Smoker", "Y0", "Y3", "Y6", "Y9", "Y12", "Y15")
1.1 the long way home
HW1_data_long <- HW1_data %>%
pivot_longer(cols = starts_with("Y"),
names_to = "Time",
values_to = "FEV1")
level up
HW1_data_long$Time <- factor(HW1_data_long$Time,
levels = c("Y0", "Y3", "Y6", "Y9", "Y12", "Y15"))
print(HW1_data_long)
## # A tibble: 798 × 4
## ID Smoker Time FEV1
## <int> <int> <fct> <dbl>
## 1 1 0 Y0 3.4
## 2 1 0 Y3 3.4
## 3 1 0 Y6 3.45
## 4 1 0 Y9 3.2
## 5 1 0 Y12 NA
## 6 1 0 Y15 2.95
## 7 2 1 Y0 3.1
## 8 2 1 Y3 3.15
## 9 2 1 Y6 3.5
## 10 2 1 Y9 2.95
## # ℹ 788 more rows
1.2 long responses
na_count <- HW1_data_long %>%
group_by(Smoker, Time) %>%
summarise(NA_Count = sum(!is.na(FEV1)), .groups = 'drop') %>%
arrange(Time)
print(na_count)
## # A tibble: 12 × 3
## Smoker Time NA_Count
## <int> <fct> <int>
## 1 0 Y0 23
## 2 1 Y0 85
## 3 0 Y3 27
## 4 1 Y3 95
## 5 0 Y6 28
## 6 1 Y6 89
## 7 0 Y9 30
## 8 1 Y9 85
## 9 0 Y12 29
## 10 1 Y12 81
## 11 0 Y15 24
## 12 1 Y15 73
wide responses
widetable <- na_count %>%
pivot_wider(id_cols = "Smoker",
names_from = "Time",
values_from = "NA_Count"
)
print(widetable)
## # A tibble: 2 × 7
## Smoker Y0 Y3 Y6 Y9 Y12 Y15
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 23 27 28 30 29 24
## 2 1 85 95 89 85 81 73
wide not responses
notthere <- HW1_data_long %>%
group_by(Smoker, Time) %>%
summarise(NA_Count = sum(is.na(FEV1)), .groups = 'drop') %>%
arrange(Time)
print(notthere)
## # A tibble: 12 × 3
## Smoker Time NA_Count
## <int> <fct> <int>
## 1 0 Y0 9
## 2 1 Y0 16
## 3 0 Y3 5
## 4 1 Y3 6
## 5 0 Y6 4
## 6 1 Y6 12
## 7 0 Y9 2
## 8 1 Y9 16
## 9 0 Y12 3
## 10 1 Y12 20
## 11 0 Y15 8
## 12 1 Y15 28
#switch me
notthere <- notthere %>%
pivot_wider(id_cols = "Smoker",
names_from = "Time",
values_from = "NA_Count"
)
print(notthere)
## # A tibble: 2 × 7
## Smoker Y0 Y3 Y6 Y9 Y12 Y15
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 9 5 4 2 3 8
## 2 1 16 6 12 16 20 28
combined NA and responses in one wide table
merged_table <- merge(widetable, notthere, all = TRUE)
print(merged_table)
## Smoker Y0 Y3 Y6 Y9 Y12 Y15
## 1 0 9 5 4 2 3 8
## 2 0 23 27 28 30 29 24
## 3 1 16 6 12 16 20 28
## 4 1 85 95 89 85 81 73
percent missing
missingness <- merged_table
for (i in 2:7) {
missingness[1, i] <- merged_table[1, i] / merged_table[2, i]
missingness[3, i] <- merged_table[3, i] / merged_table[4, i]
}
missingness <- missingness[c(1, 3), ]
print(missingness)
## Smoker Y0 Y3 Y6 Y9 Y12 Y15
## 1 0 0.3913043 0.18518519 0.1428571 0.06666667 0.1034483 0.3333333
## 3 1 0.1882353 0.06315789 0.1348315 0.18823529 0.2469136 0.3835616
1.3 In a Single Figure, create Boxplots of FEV1 across the follow up time (in years, starting from baseline) by Group (Smoker Status):
Describe the observed variability in the outcome over time: output observed variances and standard deviation by Group and follow up time
Comment on whether the mean and the median coincide
ggplot(data = HW1_data_long, aes(x = FEV1, y = Time)) +
geom_boxplot(fill = "gold") +
facet_wrap(~ Smoker, ncol = 2 ) +
labs(x = "smoker status", y = "Time")
## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
the observed variance and standard deviation appears similar across all time points and between smoker status, though there are more outliers among former smokers. often the mean and median do not coincide
1.4 In a Single Figure, create a panel (2 rows by 3 columns) of 6 plots (one for each time point) for distribution of FEV1 at each follow up timepoint by group (Smoker Status) overlaid with empirical probability curves. Hint: using the ggplot code in R Lab#1 which produced histograms by group with overlaid empirical probability curves and ADD to the code + facet_wrap(~FU_Time) where FU_Time is the follow up time “0 years”, “3 years”, etc. - Comment on the Normality of the observed FEV1 data at each follow up time point and explain how you might justify using a model for the mean response that relies on the assumption of MVN for Yi of FEV1.
bigplot <- ggplot(data = HW1_data_long, aes(x = FEV1)) + geom_histogram(
binwidth = .1) + facet_wrap(~ Time) +
geom_histogram(aes(color = Smoker == 1)) +
geom_density(aes(color = Smoker == 1),
linetype="dashed") +
labs(title = "Distribution of FEV1 for Smokers and non-Smokers Across Time",
x = "FEV1",
color = "Currently Smoking",
fill = "Baseline Height >= 1.3 m")
print(bigplot)
## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 129 rows containing non-finite outside the scale range
## (`stat_density()`).
generally, the FEV1 for both smokers and non-smokers is normally distributed without skew, which would indicate mean-response as an appropriate model to find any differences along with using the MVN since covariance would be balanced
1.5 In a Single Figure, plot the observed mean FEV1 over time (years of follow up) by Group (Smoker Status), and connect the group means across timepoints starting from the baseline (0 years). DO NOT SMOOTH. - Describe the general characteristics of the time trends for the two groups: what is going on with the group means over time? What might you guess the best-fit model for the mean response will be?
#create a mean data frame
mean_data <- HW1_data_long %>%
group_by(Smoker, Time) %>%
summarise(Mean_Value = mean(FEV1, na.rm = TRUE))
## `summarise()` has grouped output by 'Smoker'. You can override using the
## `.groups` argument.
print(mean_data)
## # A tibble: 12 × 3
## # Groups: Smoker [2]
## Smoker Time Mean_Value
## <int> <fct> <dbl>
## 1 0 Y0 3.52
## 2 0 Y3 3.58
## 3 0 Y6 3.26
## 4 0 Y9 3.17
## 5 0 Y12 3.14
## 6 0 Y15 2.87
## 7 1 Y0 3.23
## 8 1 Y3 3.12
## 9 1 Y6 3.09
## 10 1 Y9 2.87
## 11 1 Y12 2.80
## 12 1 Y15 2.68
#factor
HW1_data_long$Smoker <- as.factor(HW1_data_long$Smoker)
levels(HW1_data_long$Smoker) <- c("Current Smoker", "Former Smoker")
#Create average trend lines
lineup <- ggplot(data = mean_data, aes(x = Time, y = Mean_Value, group = Smoker)) +
geom_line(aes(color=Smoker)) + geom_point(aes(color=Smoker)) +
labs(title="Average Trend of Observed Mean FEV1 by Treatment Group",
x="Time", y="FEV1")
print(lineup)
the mean FEV1 for both past and current smokers decreases over time, indicating the best model might be a simple linear regression
Using the notation for Lecture#2 and Lab#2, but with the relevant variables from the current dataset, write out the model for the mean response E(Yij) for:
2.1 The MRP model, and write out the Null hypothesis of interest using your beta coefficients. Explain your Xijp variables.
sum(is.na(HW1_data_long$FEV1))
## [1] 129
sum(is.na(HW1_data_long$Time))
## [1] 0
sum(is.na(HW1_data_long$ID))
## [1] 0
HW1_data_long$Time_enumerated = as.numeric(as.factor(HW1_data_long$Time))
#Check that the code above worked
table(HW1_data_long$Time_enumerated,HW1_data_long$Time)
##
## Y0 Y3 Y6 Y9 Y12 Y15
## 1 133 0 0 0 0 0
## 2 0 133 0 0 0 0
## 3 0 0 133 0 0 0
## 4 0 0 0 133 0 0
## 5 0 0 0 0 133 0
## 6 0 0 0 0 0 133
### Fit a response profile model using the gls function
#The default estimation method is REML
#m1 is Model 1 in the accompanying Word document:
#MRP not assuming equal group means at baseline
HW1_data_long_clean <- HW1_data_long %>%
filter(!is.na(FEV1), !is.na(Time), !is.na(ID))
HW1_data_long_clean <- na.omit(HW1_data_long)
m_1 <- gls(FEV1 ~ Smoker*Time, data = HW1_data_long_clean,
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form = ~ 1 | Time), na.action = na.omit)
#varIdent allows for different variances at each time point
#Get estimates of the fixed effects and model fit statistics (AIC, BIC, logLik)
print(summary(m_1)) ###have a look of the coefficients!!!
## Generalized least squares fit by REML
## Model: FEV1 ~ Smoker * Time
## Data: HW1_data_long_clean
## AIC BIC logLik
## 347.1353 495.2289 -140.5677
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.864
## 3 0.849 0.893
## 4 0.837 0.833 0.848
## 5 0.855 0.860 0.889 0.892
## 6 0.841 0.871 0.872 0.874 0.931
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9696520 0.9435027 0.9528324 0.9665003 0.9387173
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.487793 0.10946055 31.86347 0.0000
## SmokerFormer Smoker -0.279122 0.12502607 -2.23251 0.0259
## TimeY3 -0.010901 0.06346478 -0.17177 0.8637
## TimeY6 -0.210907 0.06554868 -3.21756 0.0014
## TimeY9 -0.320675 0.06706319 -4.78168 0.0000
## TimeY12 -0.380604 0.06341193 -6.00209 0.0000
## TimeY15 -0.501443 0.06862330 -7.30718 0.0000
## SmokerFormer Smoker:TimeY3 -0.070556 0.07163060 -0.98499 0.3250
## SmokerFormer Smoker:TimeY6 0.071464 0.07420000 0.96313 0.3358
## SmokerFormer Smoker:TimeY9 -0.051547 0.07644132 -0.67433 0.5003
## SmokerFormer Smoker:TimeY12 -0.029820 0.07216006 -0.41325 0.6796
## SmokerFormer Smoker:TimeY15 -0.067079 0.07798420 -0.86016 0.3900
##
## Correlation:
## (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker -0.876
## TimeY3 -0.378 0.331
## TimeY6 -0.430 0.377 0.678
## TimeY9 -0.427 0.374 0.546 0.616
## TimeY12 -0.441 0.386 0.596 0.700 0.724
## TimeY15 -0.394 0.345 0.608 0.646 0.668 0.782
## SmokerFormer Smoker:TimeY3 0.335 -0.372 -0.886 -0.600 -0.484 -0.528 -0.539
## SmokerFormer Smoker:TimeY6 0.380 -0.422 -0.599 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9 0.375 -0.417 -0.479 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12 0.387 -0.430 -0.524 -0.615 -0.636 -0.879 -0.687
## SmokerFormer Smoker:TimeY15 0.347 -0.385 -0.535 -0.569 -0.587 -0.688 -0.880
## SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker
## TimeY3
## TimeY6
## TimeY9
## TimeY12
## TimeY15
## SmokerFormer Smoker:TimeY3
## SmokerFormer Smoker:TimeY6 0.674
## SmokerFormer Smoker:TimeY9 0.536 0.603
## SmokerFormer Smoker:TimeY12 0.588 0.689 0.710
## SmokerFormer Smoker:TimeY15 0.603 0.637 0.653 0.772
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.92624003 -0.65632030 0.05787637 0.65630627 3.20305033
##
## Residual standard error: 0.5962603
## Degrees of freedom: 669 total; 657 residual
m_1.1 <- gls(FEV1 ~ Smoker*Time, data = HW1_data_long_clean,
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form = ~ 1 | Time), method = "ML", na.action = na.omit)
print(summary(m_1.1))
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker * Time
## Data: HW1_data_long_clean
## AIC BIC logLik
## 291.8494 440.5402 -112.9247
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.864
## 3 0.850 0.893
## 4 0.837 0.833 0.848
## 5 0.856 0.860 0.889 0.893
## 6 0.841 0.871 0.873 0.874 0.932
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9698924 0.9436551 0.9529209 0.9665721 0.9388369
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.487811 0.10957322 31.83087 0.0000
## SmokerFormer Smoker -0.279157 0.12515650 -2.23047 0.0261
## TimeY3 -0.010918 0.06345065 -0.17207 0.8634
## TimeY6 -0.210914 0.06553080 -3.21855 0.0014
## TimeY9 -0.320710 0.06704313 -4.78364 0.0000
## TimeY12 -0.380608 0.06337998 -6.00518 0.0000
## TimeY15 -0.501412 0.06858235 -7.31110 0.0000
## SmokerFormer Smoker:TimeY3 -0.070504 0.07161507 -0.98448 0.3252
## SmokerFormer Smoker:TimeY6 0.071488 0.07417996 0.96371 0.3355
## SmokerFormer Smoker:TimeY9 -0.051500 0.07641892 -0.67392 0.5006
## SmokerFormer Smoker:TimeY12 -0.029831 0.07212325 -0.41362 0.6793
## SmokerFormer Smoker:TimeY15 -0.067102 0.07793732 -0.86097 0.3896
##
## Correlation:
## (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker -0.875
## TimeY3 -0.377 0.330
## TimeY6 -0.430 0.376 0.677
## TimeY9 -0.427 0.374 0.545 0.616
## TimeY12 -0.440 0.385 0.595 0.700 0.724
## TimeY15 -0.394 0.345 0.608 0.646 0.668 0.783
## SmokerFormer Smoker:TimeY3 0.334 -0.371 -0.886 -0.600 -0.483 -0.527 -0.539
## SmokerFormer Smoker:TimeY6 0.380 -0.422 -0.598 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9 0.374 -0.416 -0.478 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12 0.387 -0.430 -0.523 -0.615 -0.636 -0.879 -0.688
## SmokerFormer Smoker:TimeY15 0.347 -0.384 -0.535 -0.569 -0.588 -0.689 -0.880
## SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker
## TimeY3
## TimeY6
## TimeY9
## TimeY12
## TimeY15
## SmokerFormer Smoker:TimeY3
## SmokerFormer Smoker:TimeY6 0.674
## SmokerFormer Smoker:TimeY9 0.535 0.603
## SmokerFormer Smoker:TimeY12 0.587 0.689 0.710
## SmokerFormer Smoker:TimeY15 0.603 0.637 0.653 0.773
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.94932081 -0.66141929 0.05836135 0.66154229 3.22802665
##
## Residual standard error: 0.591567
## Degrees of freedom: 669 total; 657 residual
m_1.2 <- gls(FEV1 ~ Smoker + Time, data = HW1_data_long_clean,
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form = ~ 1 | Time), method = "ML", na.action = na.omit)
print(summary(m_1.2))
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker + Time
## Data: HW1_data_long_clean
## AIC BIC logLik
## 289.5773 415.7393 -116.7887
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.863
## 3 0.850 0.888
## 4 0.836 0.835 0.842
## 5 0.856 0.862 0.887 0.893
## 6 0.839 0.872 0.869 0.874 0.931
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9748603 0.9474614 0.9525876 0.9633347 0.9389191
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.498971 0.09834206 35.57960 0.0000
## SmokerFormer Smoker -0.293386 0.10923274 -2.68588 0.0074
## TimeY3 -0.067006 0.02948543 -2.27251 0.0234
## TimeY6 -0.156902 0.03066146 -5.11723 0.0000
## TimeY9 -0.359571 0.03206646 -11.21331 0.0000
## TimeY12 -0.403248 0.03009547 -13.39895 0.0000
## TimeY15 -0.552308 0.03253095 -16.97792 0.0000
##
## Correlation:
## (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12
## SmokerFormer Smoker -0.844
## TimeY3 -0.188 0.001
## TimeY6 -0.215 0.002 0.648
## TimeY9 -0.216 0.007 0.516 0.557
## TimeY12 -0.223 0.006 0.571 0.656 0.678
## TimeY15 -0.200 0.003 0.596 0.605 0.622 0.748
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.9283352 -0.6550817 0.0819973 0.6668006 3.2487553
##
## Residual standard error: 0.5915045
## Degrees of freedom: 669 total; 662 residual
# now everybody from the LRT ...
anova(m_1.1, m_1.2)
the \(N_0\) is that the effects of smoking status, time, and their interaction are not statistically significant and have no effect on FEV1. we can fail to reject the \(N_0\) hypothesis. None of the \(X_{ijp}\) variables had a p-value below .05
2.2 The constant slope of change in mean FEV1, and write out the Null hypothesis of interest using the beta coefficients. Explain your Xijp variables.
the values for TimeY3:TImeY6. These represent the differences in FEV1 from the baseline from Time0. Each coefficient shows how much FEV1 changes from Time 0 at each of these specific time points, and apart from \(Y_3\), are all statistically significant.
2.3 The non-constant slope of change in mean FEV1 (polynomial up to quadratic effect of follow up time), and write out the Null hypothesis of interest using the beta coefficients. Explain your Xijp variables.
we need a polynomial model for this, as below
HW1_data_long_clean$ctime <- HW1_data_long_clean$Time_enumerated -
mean(HW1_data_long_clean$Time_enumerated)
HW1_data_long_clean$ctime2 <- (HW1_data_long_clean$ctime)^2
#looks similar
summary(HW1_data_long$ctime)
## Warning: Unknown or uninitialised column: `ctime`.
## Length Class Mode
## 0 NULL NULL
### Fit a quadratic trend model
m_i6 <- gls(FEV1 ~ Smoker * ctime + Smoker * ctime2,
data = HW1_data_long_clean,
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form= ~ 1 | Time))
summary(m_i6)
## Generalized least squares fit by REML
## Model: FEV1 ~ Smoker * ctime + Smoker * ctime2
## Data: HW1_data_long_clean
## AIC BIC logLik
## 337.991 459.4039 -141.9955
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.862
## 3 0.849 0.889
## 4 0.832 0.830 0.833
## 5 0.855 0.863 0.890 0.883
## 6 0.842 0.873 0.873 0.868 0.932
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9770804 0.9493417 0.9513921 0.9693235 0.9398637
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.251352 0.09564734 33.99313 0.0000
## SmokerFormer Smoker -0.280808 0.10984322 -2.55645 0.0108
## ctime -0.106446 0.01215982 -8.75390 0.0000
## ctime2 0.000331 0.00669002 0.04947 0.9606
## SmokerFormer Smoker:ctime -0.007274 0.01382671 -0.52609 0.5990
## SmokerFormer Smoker:ctime2 -0.006571 0.00766256 -0.85751 0.3915
##
## Correlation:
## (Intr) SmkrFS ctime ctime2 SmkFS:
## SmokerFormer Smoker -0.871
## ctime -0.003 0.003
## ctime2 -0.086 0.075 -0.257
## SmokerFormer Smoker:ctime 0.003 -0.001 -0.879 0.226
## SmokerFormer Smoker:ctime2 0.075 -0.090 0.224 -0.873 -0.244
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.90912802 -0.67271976 0.03459019 0.67478636 3.24118537
##
## Residual standard error: 0.5957773
## Degrees of freedom: 669 total; 663 residual
m_i6.1 <- gls(FEV1 ~ Smoker * ctime + Smoker * ctime2,
data = HW1_data_long_clean,
method = "ML",
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form= ~ 1 | Time))
summary(m_i6.1)
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker * ctime + Smoker * ctime2
## Data: HW1_data_long_clean
## AIC BIC logLik
## 298.2473 419.9035 -122.1237
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.861
## 3 0.849 0.888
## 4 0.832 0.828 0.831
## 5 0.856 0.862 0.889 0.881
## 6 0.842 0.873 0.873 0.868 0.932
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9781922 0.9502159 0.9521886 0.9693064 0.9404409
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.251337 0.09534491 34.10079 0.0000
## SmokerFormer Smoker -0.280869 0.10949654 -2.56510 0.0105
## ctime -0.106453 0.01211448 -8.78728 0.0000
## ctime2 0.000321 0.00665361 0.04830 0.9615
## SmokerFormer Smoker:ctime -0.007255 0.01377484 -0.52666 0.5986
## SmokerFormer Smoker:ctime2 -0.006563 0.00762113 -0.86110 0.3895
##
## Correlation:
## (Intr) SmkrFS ctime ctime2 SmkFS:
## SmokerFormer Smoker -0.871
## ctime -0.003 0.003
## ctime2 -0.086 0.075 -0.257
## SmokerFormer Smoker:ctime 0.003 -0.001 -0.879 0.226
## SmokerFormer Smoker:ctime2 0.075 -0.089 0.224 -0.873 -0.244
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.93205436 -0.67752242 0.03503597 0.67965261 3.26494799
##
## Residual standard error: 0.5910947
## Degrees of freedom: 669 total; 663 residual
m_i6.2 <- gls(FEV1 ~ Smoker + ctime + ctime2,
data = HW1_data_long_clean,
method = "ML",
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form= ~ 1 | Time))
summary(m_i6.2)
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker + ctime + ctime2
## Data: HW1_data_long_clean
## AIC BIC logLik
## 295.5166 408.1612 -122.7583
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.862
## 3 0.848 0.887
## 4 0.833 0.828 0.830
## 5 0.855 0.862 0.888 0.881
## 6 0.841 0.873 0.869 0.868 0.931
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9771470 0.9512045 0.9527234 0.9656179 0.9407500
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.260015 0.09493655 34.33888 0.0000
## SmokerFormer Smoker -0.292784 0.10886049 -2.68954 0.0073
## ctime -0.111878 0.00575943 -19.42526 0.0000
## ctime2 -0.004516 0.00324973 -1.38950 0.1651
##
## Correlation:
## (Intr) SmkrFS ctime
## SmokerFormer Smoker -0.870
## ctime -0.005 0.005
## ctime2 -0.047 -0.006 -0.210
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.88845683 -0.68441123 0.05266598 0.68565849 3.28560179
##
## Residual standard error: 0.5909879
## Degrees of freedom: 669 total; 665 residual
anova(m_i6.1, m_i6.2)
the null hypothesis is if \(\beta_3\) and \(\beta_5\) are $ = 0$, which they both are, though the p-values are above .05, indicating non-statistical significance. regarding the \(X_{ijp}\) variables, Smoker × ctime: The interaction between smoker status and linear time, representing how the rate of change in FEV1 differs for smokers compared to non-smokers. Smoker × ctime2: The interaction between smoker status and quadratic time, representing how the non-linear effect of time on FEV1 differs for smokers compared to non-smokers
table(HW1_data_long$Time_enumerated,HW1_data_long$Time)
##
## Y0 Y3 Y6 Y9 Y12 Y15
## 1 133 0 0 0 0 0
## 2 0 133 0 0 0 0
## 3 0 0 133 0 0 0
## 4 0 0 0 133 0 0
## 5 0 0 0 0 133 0
## 6 0 0 0 0 0 133
2.4 The piece-wise linear change in FEV1 with a knot at the 3-year follow up, and write out the Null hypothesis of interest using the beta coefficients. Explain your Xijp variables.
# threat level 7
HW1_data_long_clean$y3 <- (HW1_data_long_clean$Time_enumerated - 3) *
I(HW1_data_long_clean$Time_enumerated >= 3)
head(HW1_data_long_clean)
### Fit a piece-wise linear model
m7 <- gls(FEV1 ~ Smoker*Time_enumerated + Smoker*y3, data = HW1_data_long_clean,
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form= ~ 1 | Time))
summary(m7)
## Generalized least squares fit by REML
## Model: FEV1 ~ Smoker * Time_enumerated + Smoker * y3
## Data: HW1_data_long_clean
## AIC BIC logLik
## 328.877 450.29 -137.4385
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.862
## 3 0.850 0.891
## 4 0.831 0.830 0.837
## 5 0.855 0.862 0.891 0.881
## 6 0.841 0.873 0.873 0.869 0.932
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9761671 0.9466063 0.9505686 0.9693499 0.9397061
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.651597 0.12323654 29.630801 0.0000
## SmokerFormer Smoker -0.364707 0.14047701 -2.596206 0.0096
## Time_enumerated -0.120938 0.02970930 -4.070727 0.0001
## y3 0.021471 0.04015482 0.534704 0.5930
## SmokerFormer Smoker:Time_enumerated 0.036321 0.03368764 1.078167 0.2814
## SmokerFormer Smoker:y3 -0.069127 0.04581959 -1.508674 0.1319
##
## Correlation:
## (Intr) SmkrFS Tm_nmr y3 SFS:T_
## SmokerFormer Smoker -0.877
## Time_enumerated -0.626 0.549
## y3 0.532 -0.466 -0.918
## SmokerFormer Smoker:Time_enumerated 0.552 -0.619 -0.882 0.810
## SmokerFormer Smoker:y3 -0.466 0.524 0.805 -0.876 -0.917
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.92575818 -0.67492471 0.01886003 0.68115799 3.23190067
##
## Residual standard error: 0.596026
## Degrees of freedom: 669 total; 663 residual
m7.1 <- gls(FEV1 ~ Smoker*Time_enumerated + Smoker*y3, data = HW1_data_long_clean,
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form= ~ 1 | Time), method = "ML")
summary(m7.1)
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker * Time_enumerated + Smoker * y3
## Data: HW1_data_long_clean
## AIC BIC logLik
## 296.2794 417.9355 -121.1397
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.860
## 3 0.850 0.890
## 4 0.831 0.828 0.835
## 5 0.855 0.861 0.890 0.880
## 6 0.841 0.873 0.873 0.868 0.932
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9773222 0.9472690 0.9514751 0.9694814 0.9403262
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.651249 0.12275745 29.743599 0.0000
## SmokerFormer Smoker -0.364219 0.13993059 -2.602858 0.0095
## Time_enumerated -0.120827 0.02956910 -4.086247 0.0000
## y3 0.021293 0.03997864 0.532607 0.5945
## SmokerFormer Smoker:Time_enumerated 0.036125 0.03352788 1.077463 0.2817
## SmokerFormer Smoker:y3 -0.068828 0.04561868 -1.508761 0.1318
##
## Correlation:
## (Intr) SmkrFS Tm_nmr y3 SFS:T_
## SmokerFormer Smoker -0.877
## Time_enumerated -0.625 0.548
## y3 0.531 -0.466 -0.918
## SmokerFormer Smoker:Time_enumerated 0.551 -0.618 -0.882 0.810
## SmokerFormer Smoker:y3 -0.465 0.523 0.804 -0.876 -0.917
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.94832301 -0.68040038 0.01913402 0.68620816 3.25579117
##
## Residual standard error: 0.59131
## Degrees of freedom: 669 total; 663 residual
m7.2 <- gls(FEV1 ~ Smoker + Time_enumerated + y3, data = HW1_data_long_clean,
corr=corSymm(form= ~ Time_enumerated | ID),
weights=varIdent(form= ~ 1 | Time), method = "ML")
summary(m7.2)
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker + Time_enumerated + y3
## Data: HW1_data_long_clean
## AIC BIC logLik
## 295.018 407.6626 -122.509
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.862
## 3 0.849 0.887
## 4 0.833 0.829 0.832
## 5 0.855 0.862 0.888 0.881
## 6 0.841 0.873 0.869 0.869 0.931
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9772319 0.9498477 0.9522841 0.9654843 0.9410908
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.596282 0.10156759 35.40777 0.0000
## SmokerFormer Smoker -0.292423 0.10885487 -2.68636 0.0074
## Time_enumerated -0.093584 0.01396281 -6.70236 0.0000
## y3 -0.030262 0.01934509 -1.56433 0.1182
##
## Correlation:
## (Intr) SmkrFS Tm_nmr
## SmokerFormer Smoker -0.815
## Time_enumerated -0.352 0.006
## y3 0.294 -0.004 -0.915
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.88539075 -0.67488182 0.05781599 0.67338844 3.29366461
##
## Residual standard error: 0.5910889
## Degrees of freedom: 669 total; 665 residual
The \(H_0\) tests whether the interaction at the 3-year follow-up (y3 x smoker) has any significant effect on the change in FEV1. $H_0 = $ has to be \(\beta_{y3} = 0\) and \(\beta_{Smoker:y3} = 0\), which they are, but not statistically significant. \(\beta_{y3}\) indicates the knot at year 3. \(\beta_{Smoker:y3}\) shows the differential effect of the knot at year 3 on FEV1 for smokers versus non-smokers. It indicates whether the slope change at year 3 is different for smokers compared to non-smokers.
2.5 Using R or SAS statistical software, implement each of the 4 models for the mean response (NO OUTPUT IS NEEDED at this point BUT show your programming code) and pick among them the best-fit model for the mean of FEV1: - First chose among the nested models: Which models are nested and why? Which test should you use to choose among them?
the nested models are the non-integer models (m_1.1, m_1.2, m_i6.1, etc), since they are special cases of the simpler model. nested models are either less constrained or have more parameters
Compare the best-fit model among the nested models to the not-nested model: Which test should you use chose among them?
we can exclude the quadratic model (non-nested) since it has the highest AIC above 300. all the other models have roughly the same AIC and all share insignificant \(\beta_i\) coefficients. so we’ll favor the linear model (m1 series), since it uses the fewest predictors to achieve similar results. even though the p-values for the LRT decrease for the quadratic and piecewise functions, the LRT decreases too. so we still favor the linear model with it’s much higher LRT and still insignificant p-value.
Show your R or SAS output for the summary statistics from your tests (e.g., from ANOVA in R) and use that as evidence for selecting the best-fit model for the mean response.
anova(m7.1, m7.2)
Using the best-fit model for the mean response you chose in Question #2:
3.1 Provide your R or SAS output ONLY for the estimated regression coefficients.
summary(m_1.1)
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker * Time
## Data: HW1_data_long_clean
## AIC BIC logLik
## 291.8494 440.5402 -112.9247
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.864
## 3 0.850 0.893
## 4 0.837 0.833 0.848
## 5 0.856 0.860 0.889 0.893
## 6 0.841 0.871 0.873 0.874 0.932
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9698924 0.9436551 0.9529209 0.9665721 0.9388369
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.487811 0.10957322 31.83087 0.0000
## SmokerFormer Smoker -0.279157 0.12515650 -2.23047 0.0261
## TimeY3 -0.010918 0.06345065 -0.17207 0.8634
## TimeY6 -0.210914 0.06553080 -3.21855 0.0014
## TimeY9 -0.320710 0.06704313 -4.78364 0.0000
## TimeY12 -0.380608 0.06337998 -6.00518 0.0000
## TimeY15 -0.501412 0.06858235 -7.31110 0.0000
## SmokerFormer Smoker:TimeY3 -0.070504 0.07161507 -0.98448 0.3252
## SmokerFormer Smoker:TimeY6 0.071488 0.07417996 0.96371 0.3355
## SmokerFormer Smoker:TimeY9 -0.051500 0.07641892 -0.67392 0.5006
## SmokerFormer Smoker:TimeY12 -0.029831 0.07212325 -0.41362 0.6793
## SmokerFormer Smoker:TimeY15 -0.067102 0.07793732 -0.86097 0.3896
##
## Correlation:
## (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker -0.875
## TimeY3 -0.377 0.330
## TimeY6 -0.430 0.376 0.677
## TimeY9 -0.427 0.374 0.545 0.616
## TimeY12 -0.440 0.385 0.595 0.700 0.724
## TimeY15 -0.394 0.345 0.608 0.646 0.668 0.783
## SmokerFormer Smoker:TimeY3 0.334 -0.371 -0.886 -0.600 -0.483 -0.527 -0.539
## SmokerFormer Smoker:TimeY6 0.380 -0.422 -0.598 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9 0.374 -0.416 -0.478 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12 0.387 -0.430 -0.523 -0.615 -0.636 -0.879 -0.688
## SmokerFormer Smoker:TimeY15 0.347 -0.384 -0.535 -0.569 -0.588 -0.689 -0.880
## SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker
## TimeY3
## TimeY6
## TimeY9
## TimeY12
## TimeY15
## SmokerFormer Smoker:TimeY3
## SmokerFormer Smoker:TimeY6 0.674
## SmokerFormer Smoker:TimeY9 0.535 0.603
## SmokerFormer Smoker:TimeY12 0.587 0.689 0.710
## SmokerFormer Smoker:TimeY15 0.603 0.637 0.653 0.773
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.94932081 -0.66141929 0.05836135 0.66154229 3.22802665
##
## Residual standard error: 0.591567
## Degrees of freedom: 669 total; 657 residual
#a summary of the coefficients
print(summary(m_1.1$coefficients))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5014 -0.2895 -0.0688 0.1364 -0.0251 3.4878
3.2 Conduct an appropriate hypothesis test for the differences in mean FEV1 levels over time between the two groups of smokers (current versus past smoker). Make sure you include the following in your response: - State your Null and Alternative Hypotheses using the regression coefficients.
the \(H_0\) is that the \(\beta_{smoker:time} = 0\) over time, which it is, roughly by one-tenth, but with an insignificant p-value. the alternative hypothesis is that \(\beta_{smoker:time} \neq 0\)
anova(m_1,type = "marginal")
using the f-test we see that the interaction between smoker status and time is limited, and without a significant p-value. it’s nested model, m_1.1, with a maximum likelihood, does not change either of these results
What is your conclusion in words? (Do not simply state whether you failed to reject or rejected the Null BUT also explain in one or two sentences your conclusion.)
my conclusion is that we failed to reject the \(N_0\). Our \(\beta_{interactions}\) failed to yield statistically significant p-values, had low F-statistic values, and the optimal model did not have much of a big AIC difference from unfit models
3.3 Using the estimated regression coefficients from your output, write out and hand-calculate time-specific estimates of FEV1 means at baseline separately in Current and Former Smokers. Provide 95% Confidence Intervals (95%CI) for these two estimated means of FEV1 (you can use software for these).
summary(m_1.1)
## Generalized least squares fit by maximum likelihood
## Model: FEV1 ~ Smoker * Time
## Data: HW1_data_long_clean
## AIC BIC logLik
## 291.8494 440.5402 -112.9247
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.864
## 3 0.850 0.893
## 4 0.837 0.833 0.848
## 5 0.856 0.860 0.889 0.893
## 6 0.841 0.871 0.873 0.874 0.932
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9698924 0.9436551 0.9529209 0.9665721 0.9388369
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.487811 0.10957322 31.83087 0.0000
## SmokerFormer Smoker -0.279157 0.12515650 -2.23047 0.0261
## TimeY3 -0.010918 0.06345065 -0.17207 0.8634
## TimeY6 -0.210914 0.06553080 -3.21855 0.0014
## TimeY9 -0.320710 0.06704313 -4.78364 0.0000
## TimeY12 -0.380608 0.06337998 -6.00518 0.0000
## TimeY15 -0.501412 0.06858235 -7.31110 0.0000
## SmokerFormer Smoker:TimeY3 -0.070504 0.07161507 -0.98448 0.3252
## SmokerFormer Smoker:TimeY6 0.071488 0.07417996 0.96371 0.3355
## SmokerFormer Smoker:TimeY9 -0.051500 0.07641892 -0.67392 0.5006
## SmokerFormer Smoker:TimeY12 -0.029831 0.07212325 -0.41362 0.6793
## SmokerFormer Smoker:TimeY15 -0.067102 0.07793732 -0.86097 0.3896
##
## Correlation:
## (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker -0.875
## TimeY3 -0.377 0.330
## TimeY6 -0.430 0.376 0.677
## TimeY9 -0.427 0.374 0.545 0.616
## TimeY12 -0.440 0.385 0.595 0.700 0.724
## TimeY15 -0.394 0.345 0.608 0.646 0.668 0.783
## SmokerFormer Smoker:TimeY3 0.334 -0.371 -0.886 -0.600 -0.483 -0.527 -0.539
## SmokerFormer Smoker:TimeY6 0.380 -0.422 -0.598 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9 0.374 -0.416 -0.478 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12 0.387 -0.430 -0.523 -0.615 -0.636 -0.879 -0.688
## SmokerFormer Smoker:TimeY15 0.347 -0.384 -0.535 -0.569 -0.588 -0.689 -0.880
## SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker
## TimeY3
## TimeY6
## TimeY9
## TimeY12
## TimeY15
## SmokerFormer Smoker:TimeY3
## SmokerFormer Smoker:TimeY6 0.674
## SmokerFormer Smoker:TimeY9 0.535 0.603
## SmokerFormer Smoker:TimeY12 0.587 0.689 0.710
## SmokerFormer Smoker:TimeY15 0.603 0.637 0.653 0.773
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.94932081 -0.66141929 0.05836135 0.66154229 3.22802665
##
## Residual standard error: 0.591567
## Degrees of freedom: 669 total; 657 residual
current smokers’ baseline equation \(Y_i = 3.49 + (-.27 * 1) = 3.22\) former smokers’ baseline equation \(Y_i = 3.49 + (-.27 * 0) = 3.49\)
confit_baseline <- confint(m_1)
print(confit_baseline)
## 2.5 % 97.5 %
## (Intercept) 3.27325428 3.70233175
## SmokerFormer Smoker -0.52416898 -0.03407580
## TimeY3 -0.13529010 0.11348727
## TimeY6 -0.33937981 -0.08243370
## TimeY9 -0.45211611 -0.18923324
## TimeY12 -0.50488907 -0.25631886
## TimeY15 -0.63594181 -0.36694341
## SmokerFormer Smoker:TimeY3 -0.21094914 0.06983766
## SmokerFormer Smoker:TimeY6 -0.07396496 0.21689369
## SmokerFormer Smoker:TimeY9 -0.20136899 0.09827546
## SmokerFormer Smoker:TimeY12 -0.17125125 0.11161097
## SmokerFormer Smoker:TimeY15 -0.21992477 0.08576768
print(paste("the 95% CI for the estimated FEV1 baseline means are",
round(confit_baseline[1,1], 2), "-", round(confit_baseline[1,2], 2),
"for current smokers and", round((confit_baseline[1,1]) +
confit_baseline[2,1]),
"-", round((confit_baseline[1,2]) + confit_baseline[2,2], 2),
"for former smokers"))
## [1] "the 95% CI for the estimated FEV1 baseline means are 3.27 - 3.7 for current smokers and 3 - 3.67 for former smokers"
3.4 On the same Figure, plot the observed and estimated mean FEV1 levels for each group (Current and Former Smokers) across the follow up time from the best-fit model, connecting the means to produce the mean profiles (i.e. connect the means without smoothing). What are your observations of how the model fits the observed data?
lsmeans_mod1 <- unique(data.frame(Smoker = HW1_data_long_clean$Smoker, Time_enumerated = HW1_data_long_clean$Time_enumerated, FEV1 = m_1$fitted))
lsmeans_mod1
write.csv(lsmeans_mod1, "mod.csv", row.names = FALSE)
mod <- read.csv("mod.csv", header = TRUE)
write.csv(mean_data, "mean.csv", row.names = FALSE)
mean_data <- (read.csv("mean.csv", header = TRUE))
trends.meanprofile <- read.csv("all.csv", header = TRUE)
trends.meanprofile
#trend lines
trends <- ggplot(data = mod, aes(
x = Time_enumerated,
y = FEV1,
group = Smoker))
trends + geom_line(aes(color = Smoker)) + geom_point(aes(color = Smoker)) +
labs(title="Average Trend of FEV1 by Smoker Group Fitted by m_1",
x="Time", y="FEV1", colour="Smoker Group")
#Plot on one figure observed means and fitted means from m1
#Create a grouping factor to distinguish predicted values from observed means
mod$group <- "Predicted"
mean_data$group <- "Observed"
#combine the dataframe with mlogcd4 by row #trying to find mean
trends.meanprofile
trends.meanprofile$Group <- as.factor(trends.meanprofile$Group)
mean_data
trends_b <- ggplot(trends.meanprofile, aes(x = Time_enumerated,
y = FEV1,
color=Smoker,
linetype=Group,
group=interaction(Smoker,Group)))
trends_b + geom_line() +
geom_point() +
labs(title="Predicted and Observed Mean Trajectories of FEV1 by Smoker Group",
x="Time Point", y="FEV1", color="Treatment Group", linetype="Predicted/Observed") +
scale_linetype_manual(values=c("dashed","solid"), labels=c("Observed","Predicted"))
the model and the observed values are fitting fairly close together, indicating a close fit and making me surprised that the beta coefficients
print(summary(m_1))
## Generalized least squares fit by REML
## Model: FEV1 ~ Smoker * Time
## Data: HW1_data_long_clean
## AIC BIC logLik
## 347.1353 495.2289 -140.5677
##
## Correlation Structure: General
## Formula: ~Time_enumerated | ID
## Parameter estimate(s):
## Correlation:
## 1 2 3 4 5
## 2 0.864
## 3 0.849 0.893
## 4 0.837 0.833 0.848
## 5 0.855 0.860 0.889 0.892
## 6 0.841 0.871 0.872 0.874 0.931
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Time
## Parameter estimates:
## Y0 Y3 Y6 Y9 Y15 Y12
## 1.0000000 0.9696520 0.9435027 0.9528324 0.9665003 0.9387173
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 3.487793 0.10946055 31.86347 0.0000
## SmokerFormer Smoker -0.279122 0.12502607 -2.23251 0.0259
## TimeY3 -0.010901 0.06346478 -0.17177 0.8637
## TimeY6 -0.210907 0.06554868 -3.21756 0.0014
## TimeY9 -0.320675 0.06706319 -4.78168 0.0000
## TimeY12 -0.380604 0.06341193 -6.00209 0.0000
## TimeY15 -0.501443 0.06862330 -7.30718 0.0000
## SmokerFormer Smoker:TimeY3 -0.070556 0.07163060 -0.98499 0.3250
## SmokerFormer Smoker:TimeY6 0.071464 0.07420000 0.96313 0.3358
## SmokerFormer Smoker:TimeY9 -0.051547 0.07644132 -0.67433 0.5003
## SmokerFormer Smoker:TimeY12 -0.029820 0.07216006 -0.41325 0.6796
## SmokerFormer Smoker:TimeY15 -0.067079 0.07798420 -0.86016 0.3900
##
## Correlation:
## (Intr) SmkrFS TimeY3 TimeY6 TimeY9 TimY12 TimY15
## SmokerFormer Smoker -0.876
## TimeY3 -0.378 0.331
## TimeY6 -0.430 0.377 0.678
## TimeY9 -0.427 0.374 0.546 0.616
## TimeY12 -0.441 0.386 0.596 0.700 0.724
## TimeY15 -0.394 0.345 0.608 0.646 0.668 0.782
## SmokerFormer Smoker:TimeY3 0.335 -0.372 -0.886 -0.600 -0.484 -0.528 -0.539
## SmokerFormer Smoker:TimeY6 0.380 -0.422 -0.599 -0.883 -0.544 -0.618 -0.571
## SmokerFormer Smoker:TimeY9 0.375 -0.417 -0.479 -0.540 -0.877 -0.635 -0.586
## SmokerFormer Smoker:TimeY12 0.387 -0.430 -0.524 -0.615 -0.636 -0.879 -0.687
## SmokerFormer Smoker:TimeY15 0.347 -0.385 -0.535 -0.569 -0.587 -0.688 -0.880
## SFS:TY3 SFS:TY6 SFS:TY9 SFS:TY12
## SmokerFormer Smoker
## TimeY3
## TimeY6
## TimeY9
## TimeY12
## TimeY15
## SmokerFormer Smoker:TimeY3
## SmokerFormer Smoker:TimeY6 0.674
## SmokerFormer Smoker:TimeY9 0.536 0.603
## SmokerFormer Smoker:TimeY12 0.588 0.689 0.710
## SmokerFormer Smoker:TimeY15 0.603 0.637 0.653 0.772
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.92624003 -0.65632030 0.05787637 0.65630627 3.20305033
##
## Residual standard error: 0.5962603
## Degrees of freedom: 669 total; 657 residual
Using the best-fit model for the mean response and the worst-fit model for the mean response from your solution in Question #2:
4.1 Why do we say that the parameters inside the Covariance \(\sum_!\) are ‘nuisance’ in this example?
because these parameters inside the matrix capture the dependencies in the data that we’re not interested in but necessary for understanding. they describe how the data points within the same subject (ID) are related (correlation over time) and how the variability in measurements changes over time. But we’re interested in the relationship between smoker status and time, which these parameters aren’t measuring.
4.2 Output the estimated Covariance matrix from each model (should have 2 matrices).
#m1 is the best model (linear), m_i6 is the worst model (quadratic)
cov_matrix_m_1 <- getVarCov(m_1)
cov_matrix_m_i6 <- getVarCov(m_i6)
print(cov_matrix_m_1)
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.35553 0.29779 0.28484 0.28338 0.28884
## [2,] 0.29779 0.33427 0.29045 0.27354 0.29020
## [3,] 0.28484 0.29045 0.31649 0.27091 0.28284
## [4,] 0.28338 0.27354 0.27091 0.32278 0.28606
## [5,] 0.28884 0.29020 0.28284 0.28606 0.33211
## Standard Deviations: 0.59626 0.57816 0.56257 0.56814 0.57629
print(cov_matrix_m_i6)
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.35495 0.29912 0.28607 0.28104 0.28964
## [2,] 0.29912 0.33887 0.29268 0.27389 0.29354
## [3,] 0.28607 0.29268 0.31990 0.26706 0.28508
## [4,] 0.28104 0.27389 0.26706 0.32128 0.28427
## [5,] 0.28964 0.29354 0.28508 0.28427 0.33351
## Standard Deviations: 0.59578 0.58212 0.5656 0.56682 0.5775
4.3 Comment on the variances and covariances: which model has larger/smaller variances and covariances? Why?
the linear model has slightly larger variances and slightly smaller covariances compared to the quadratic model. The difference is small but consistent across the matrices. The smaller variances and slightly larger covariances in the quadratic may indicate that it’s is better at capturing the correlation structure or the relationships between time points, which leads to a tighter fit with lower variability.
4.4 Output the estimated Correlation matrix from each model (should have 2 matrices).
std_devs_m_1 <- sqrt(diag(cov_matrix_m_1))
std_devs_m_i6 <- sqrt(diag(cov_matrix_m_i6))
# 2. Normalize the covariance matrices by the standard deviations
cor_matrix_m_1 <- cov_matrix_m_1 / (std_devs_m_1 %*% t(std_devs_m_1))
cor_matrix_m_i6 <- cov_matrix_m_i6 / (std_devs_m_i6 %*% t(std_devs_m_i6))
print(cor_matrix_m_1)
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.00000 0.86382 0.84915 0.83654 0.84058
## [2,] 0.86382 1.00000 0.89296 0.83276 0.87098
## [3,] 0.84915 0.89296 1.00000 0.84760 0.87243
## [4,] 0.83654 0.83276 0.84760 1.00000 0.87371
## [5,] 0.84058 0.87098 0.87243 0.87371 1.00000
## Standard Deviations: 1 1 1 1 1
print(cor_matrix_m_i6)
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.00000 0.86248 0.84896 0.83222 0.84183
## [2,] 0.86248 1.00000 0.88894 0.83007 0.87317
## [3,] 0.84896 0.88894 1.00000 0.83302 0.87280
## [4,] 0.83222 0.83007 0.83302 1.00000 0.86844
## [5,] 0.84183 0.87317 0.87280 0.86844 1.00000
## Standard Deviations: 1 1 1 1 1
4.5 Comment on the correlations: which model has larger/smaller correlations? Why? Basically, I am asking you about the effect of how we specify the mean response \(E(Y_{ij}) =𝜇_{ij}\) on the estimated Covariance \((\sum_i)\) and Correlation \((R_i)\) matrices. Why do we care about the model for the mean response?
the linear model has slightly larger correlations than the quadratic model. This is because the linear model does not capture the non-linear relationships in the data as effectively as the quadratic model, resulting in larger residuals and stronger correlations. The specification of the mean response is important because it determines how well the model fits the data. A more accurate specification (like the quadratic model) leads to smaller residuals and smaller correlations, while a less accurate specification leads to larger residuals and stronger correlations.