regression with correlations over time and in clusters

set significant decimal points

don’t print silly stars

options(digits=4, show.signif.stars=FALSE)

handle package loading automatically

#install.package(pacman)

load packages

pacman::p_load(tidyverse, nlme, broom, magrittr, MuMIn)

load data set

#data(Oxboys, package="nlme")
girl_hight<-read.csv("C:/Users/Ching-Fang Wu/Documents/data/girl_height.csv",header = T)

exam data structure

str(girl_hight)
## 'data.frame':    100 obs. of  4 variables:
##  $ Height: num  111 116 122 126 130 ...
##  $ Girl  : chr  "G1" "G1" "G1" "G1" ...
##  $ Age   : int  6 7 8 9 10 6 7 8 9 10 ...
##  $ Mother: chr  "S" "S" "S" "S" ...

this data structure goes with lattice plot

plot(girl_hight)

view first 6 lines

head(girl_hight)
##   Height Girl Age Mother
## 1  111.0   G1   6      S
## 2  116.4   G1   7      S
## 3  121.7   G1   8      S
## 4  126.3   G1   9      S
## 5  130.5   G1  10      S
## 6  110.0   G2   6      S

compute correlations of heights among Age

cor(girl_hight$Height,girl_hight$Age)
## [1] 0.8551

format data from long to wide format first

dta <- as.data.frame(girl_hight) %>%
 select(Girl, Age , Height) %>%
 spread(Age, Height) %>%
 select(-Girl)

get correlation matrix

dta %>% cor
##         6      7      8      9     10
## 6  1.0000 0.9809 0.9816 0.9740 0.9540
## 7  0.9809 1.0000 0.9858 0.9803 0.9584
## 8  0.9816 0.9858 1.0000 0.9881 0.9751
## 9  0.9740 0.9803 0.9881 1.0000 0.9920
## 10 0.9540 0.9584 0.9751 0.9920 1.0000

get variance at each Age

dta %>% var %>% diag
##     6     7     8     9    10 
## 15.81 19.73 27.16 31.22 32.01

set plot theme to black-n-white

ot <- theme_set(theme_bw())

one regression line fits all

ggplot(girl_hight, aes(x = Age, y = Height, color = Mother)) +
 geom_point() +
 stat_smooth(aes(group = 1), method = "lm") +
 labs(x = "Age (scaled)", y = "Height (cm)") +
 theme(legend.position = "NONE")
## `geom_smooth()` using formula 'y ~ x'

ordinary regression

m0 <- gls(Height ~ Age, data = girl_hight)

regression with autocorrelation

m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML")

regression with unequal variances for occasions

m2 <- update(m0, weights = varIdent(form = ~ 1 | Mother))

regression with autocorrelation and unequal variances

m3 <- update(m1, weights = varIdent(form = ~ 1 | Mother))

model selection with AICc

model.sel(m0, m1, m2, m3)
## Model selection table 
##    (Intrc)   Age             family     correlation             weights  REML
## m3   82.47 5.558 gaussian(identity) corAR1(~1|Girl) varIdent(~1|Mother) FALSE
## m1   82.47 5.684 gaussian(identity) corAR1(~1|Girl)                     FALSE
## m2   82.46 5.549 gaussian(identity)                 varIdent(~1|Mother)      
## m0   82.52 5.716 gaussian(identity)                                          
##    df logLik  AICc  delta weight
## m3  6 -162.8 338.5   0.00      1
## m1  4 -172.7 353.8  15.26      0
## m2  5 -292.0 594.6 256.14      0
## m0  3 -300.8 607.8 269.27      0
## Models ranked by AICc(x)

優先考慮的模型應是AIC值最小的那一個:m3

summary model output

lapply(list(m0, m1, m2, m3), summary)
## [[1]]
## Generalized least squares fit by REML
##   Model: Height ~ Age 
##   Data: girl_hight 
##     AIC   BIC logLik
##   607.5 615.3 -300.8
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept) 82.52    2.8439   29.02       0
## Age          5.72    0.3501   16.33       0
## 
##  Correlation: 
##     (Intr)
## Age -0.985
## 
## Standardized residuals:
##     Min      Q1     Med      Q3     Max 
## -1.8561 -0.7402 -0.1696  0.7974  2.6348 
## 
## Residual standard error: 4.951 
## Degrees of freedom: 100 total; 98 residual
## 
## [[2]]
## Generalized least squares fit by maximum likelihood
##   Model: Height ~ Age 
##   Data: girl_hight 
##     AIC   BIC logLik
##   353.3 363.8 -172.7
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | Girl 
##  Parameter estimate(s):
##    Phi 
## 0.9782 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept) 82.47     1.380   59.75       0
## Age          5.68     0.111   51.21       0
## 
##  Correlation: 
##     (Intr)
## Age -0.643
## 
## Standardized residuals:
##     Min      Q1     Med      Q3     Max 
## -1.8432 -0.7005 -0.1174  0.8831  2.7934 
## 
## Residual standard error: 4.781 
## Degrees of freedom: 100 total; 98 residual
## 
## [[3]]
## Generalized least squares fit by REML
##   Model: Height ~ Age 
##   Data: girl_hight 
##   AIC   BIC logLik
##   594 606.9   -292
## 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Mother 
##  Parameter estimates:
##      S      M      T 
## 1.0000 0.8014 1.8171 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept) 82.46    2.3331   35.34       0
## Age          5.55    0.2872   19.32       0
## 
##  Correlation: 
##     (Intr)
## Age -0.985
## 
## Standardized residuals:
##      Min       Q1      Med       Q3      Max 
## -1.88260 -0.64836  0.07568  0.75427  2.40785 
## 
## Residual standard error: 3.961 
## Degrees of freedom: 100 total; 98 residual
## 
## [[4]]
## Generalized least squares fit by maximum likelihood
##   Model: Height ~ Age 
##   Data: girl_hight 
##     AIC   BIC logLik
##   337.6 353.2 -162.8
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | Girl 
##  Parameter estimate(s):
##    Phi 
## 0.9787 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Mother 
##  Parameter estimates:
##      S      M      T 
## 1.0000 0.6923 1.5494 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept) 82.47    1.1305   72.95       0
## Age          5.56    0.0903   61.55       0
## 
##  Correlation: 
##     (Intr)
## Age -0.639
## 
## Standardized residuals:
##      Min       Q1      Med       Q3      Max 
## -1.77189 -0.71727  0.06484  0.80091  2.56095 
## 
## Residual standard error: 4.265 
## Degrees of freedom: 100 total; 98 residual

autocorrelations

acf(resid(m0, type = "normalized"), main = "m0")

acf(resid(m1, type = "normalized"), main = "m1")

acf(resid(m2, type = "normalized"), main = "m2")

acf(resid(m3, type = "normalized"), main = "m3")

The end