1 Data information

The data consist of weight gains of 68 Asian children in a British community. Measurements of weight were recorded for children on up to five occasions visiting a clinic. The ages at which the measurements were taken are roughly to target examination dates of 6 weeks, then 8, 12, 27 months.

pacman::p_load(tidyverse, afex, segmented, foreign, haven, lme4, lmerTest)

# data location and input data
fL <- "http://www.stata-press.com/data/mlmus3/asian.dta"
dta <- read.dta(fL)

#show data structure
str(dta)

## 'data.frame':    198 obs. of  6 variables:
##  $ id    : int  45 45 45 45 45 258 258 258 258 287 ...
##  $ occ   : int  1 2 3 4 5 1 2 3 4 1 ...
##  $ age   : num  0.137 0.657 1.218 1.429 2.272 ...
##  $ weight: num  5.17 10.86 13.15 13.2 15.88 ...
##  $ brthwt: int  4140 4140 4140 4140 4140 3155 3155 3155 3155 3850 ...
##  $ gender: int  1 1 1 1 1 2 2 2 2 1 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr "29 Jan 2005 17:33"
##  - attr(*, "formats")= chr [1:6] "%8.0g" "%9.0g" "%8.0g" "%8.0g" ...
##  - attr(*, "types")= int [1:6] 252 251 254 254 252 252
##  - attr(*, "val.labels")= chr [1:6] "" "" "" "" ...
##  - attr(*, "var.labels")= chr [1:6] "" "" "" "" ...
##  - attr(*, "expansion.fields")=List of 5
##   ..$ : chr [1:3] "_dta" "ReS_i" "id"
##   ..$ : chr [1:3] "_dta" "ReS_ver" "v.2"
##   ..$ : chr [1:3] "_dta" "ReS_j" "occ"
##   ..$ : chr [1:3] "_dta" "ReS_str" "0"
##   ..$ : chr [1:3] "_dta" "ReS_Xij" "weight age"
##  - attr(*, "version")= int 8

不知道為什麼$ gender下會出現那一串訊息?

2 data management

dta$id <- factor(dta$id) #將ID資料屬性轉變為factor
dta$gender <- factor(dta$gender, 1:2, labels = c("M", "F")) #將男性1改為M，女性2改為F
dta$brthwt <- dta$brthwt/1000 #將出生率的單位改為千分比
dta$occ <- factor(dta$occ) #將第幾次就診occ資料結構轉為factor

glimpse(dta) #瀏覽資料

## Rows: 198
## Columns: 6
## $ id     <fct> 45, 45, 45, 45, 45, 258, 258, 258, 258, 287, 287, 287, 287, ...
## $ occ    <fct> 1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 2, ...
## $ age    <dbl> 0.1368925, 0.6570842, 1.2183436, 1.4291581, 2.2724161, 0.191...
## $ weight <dbl> 5.171, 10.860, 13.150, 13.200, 15.880, 5.300, 9.740, 9.980, ...
## $ brthwt <dbl> 4.140, 4.140, 4.140, 4.140, 4.140, 3.155, 3.155, 3.155, 3.15...
## $ gender <fct> M, M, M, M, M, F, F, F, F, M, M, M, M, F, F, F, F, F, M, M, ...

glimpse()的語法類似str()，可總覽資料框的欄位及欄位類別。

執行結果回報有6個變數(variables)，每個變數有198個觀察值。

經過上個步驟，資料型態也已經轉變。

head(dta,10) # show first 10 lines

##     id occ       age weight brthwt gender
## 1   45   1 0.1368925  5.171  4.140      M
## 2   45   2 0.6570842 10.860  4.140      M
## 3   45   3 1.2183436 13.150  4.140      M
## 4   45   4 1.4291581 13.200  4.140      M
## 5   45   5 2.2724161 15.880  4.140      M
## 6  258   1 0.1916496  5.300  3.155      F
## 7  258   2 0.6872005  9.740  3.155      F
## 8  258   3 1.1279945  9.980  3.155      F
## 9  258   4 2.3052704 11.340  3.155      F
## 10 287   1 0.1341547  4.820  3.850      M

3 Data visualization

3.1 plot data

#看一下raw data的樣子
ggplot(dta, aes(age, weight, group = id, color=gender))+ #x軸為age，Y軸為weight，按照id分群，按照gender不同給不同顏色
 geom_point()+ 
 geom_line() + 
 facet_wrap( ~ gender)+ #依照gender來分割圖
 labs(x = "Age (year)", y = "Weight (kg)") + 
 theme(legend.position = c(.9, .2))

不論男女，隨著年紀增加，體重也會增加，但不是直線上升，大約0.75至1歲前會快速增加，1歲左右後增加幅度稍微趨緩。此外，男生的斜率較女生的斜率陡，表示男性體重增加的幅度大於女性。

#用平滑曲線來配適資料
ggplot(dta, aes(age, weight, color = gender))+ #x軸為age，Y軸為weight，按照gender不同給不同顏色
 geom_point()+
 stat_smooth(method = "lm", formula = y ~ poly(x, 2)) + #poly(x, 2)中的2是指二次多項式
 labs(x = "Age (year)", y = "Weight (kg)")+
 theme(legend.position = c(.9, .2))

尾端還有很多點沒有在線上，不確定用二次多項式formula = y ~ poly(x, 2)是否為最佳。

4 Modeling

#在原始data set中產生一個新變數age2，現在有7個variables
dta$age2 <- dta$age^2

4.1 m0

# t-tests use Satterthwaite's method
summary(m0 <- lmer(weight ~ brthwt + gender + age + age2 + (age |id),data = dta))#brthwt、gender、age、age2為固定效果；(age |id)為隨機效果，表示每個id的age都是隨機截距項

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: weight ~ brthwt + gender + age + age2 + (age | id)
##    Data: dta
## 
## REML criterion at convergence: 496.6
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.81353 -0.43471 -0.03652  0.45573  2.79813 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  id       (Intercept) 0.1920   0.4381       
##           age         0.2702   0.5198   0.07
##  Residual             0.3346   0.5784       
## Number of obs: 198, groups:  id, 68
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   1.02225    0.54924  57.65949   1.861  0.06782 .  
## brthwt        0.87755    0.16616  53.16001   5.281 2.43e-06 ***
## genderF      -0.51922    0.16698  52.23221  -3.109  0.00303 ** 
## age           7.73933    0.23674 135.03625  32.691  < 2e-16 ***
## age2         -1.67125    0.08762 114.78740 -19.074  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) brthwt gendrF age   
## brthwt  -0.963                     
## genderF -0.240  0.095              
## age     -0.221  0.058 -0.003       
## age2     0.199 -0.051  0.002 -0.927

解釋：出生時的體重brthwt、女性genderF、年齡age、age2對每個人目前的體重有顯著影響，其中體重與年齡的關係是二次多項式的關係。

5

# Type 3 tests, KR-method - This is better
afex::mixed(weight ~ brthwt + gender + age + age2 + (age | id), data = dta)

## Contrasts set to contr.sum for the following variables: gender, id

## Numerical variables NOT centered on 0: brthwt, age, age2
## If in interactions, interpretation of lower order (e.g., main) effects difficult.

## Fitting one lmer() model. [DONE]
## Calculating p-values. [DONE]

## Mixed Model Anova Table (Type 3 tests, KR-method)
## 
## Model: weight ~ brthwt + gender + age + age2 + (age | id)
## Data: dta
##   Effect        df           F p.value
## 1 brthwt  1, 62.42   26.79 ***   <.001
## 2 gender  1, 61.58     9.31 **    .003
## 3    age 1, 141.54 1046.79 ***   <.001
## 4   age2 1, 122.93  355.47 ***   <.001
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1

# construct piecewise lines defined by knots
dta$d1 <- dta$age - 0.4; dta$d1[dta$d1 < 0] <- 0
dta$d2 <- dta$age - 0.9; dta$d2[dta$d2 < 0] <- 0
dta$d3 <- dta$age - 1.5; dta$d3[dta$d3 < 0] <- 0

除了用二次多項式曲線配適所有的資料，還可以設定不同斷點，在這裡每個id以不同年齡作為切點(age=0.4、0.9、1.5)，來分段配適資料，因此新增三個變數d1、d2、d3。

5.1 m1

summary(m1 <- lmer(weight ~ brthwt + gender + age + d1 + d2 + d3 + (age | id), data = dta))

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: weight ~ brthwt + gender + age + d1 + d2 + d3 + (age | id)
##    Data: dta
## 
## REML criterion at convergence: 465
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.81475 -0.41746 -0.01623  0.42323  2.51409 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  id       (Intercept) 0.2476   0.4976        
##           age         0.3273   0.5721   -0.12
##  Residual             0.2523   0.5023        
## Number of obs: 198, groups:  id, 68
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)   0.4941     0.5600 65.7487   0.882 0.380826    
## brthwt        0.8949     0.1617 51.4586   5.533 1.07e-06 ***
## genderF      -0.5459     0.1627 50.8735  -3.356 0.001504 ** 
## age          10.3595     1.0792 83.2345   9.599 4.03e-15 ***
## d1           -6.1891     1.8004 80.6638  -3.438 0.000931 ***
## d2           -1.6641     1.3071 82.8518  -1.273 0.206547    
## d3           -0.3684     0.9920 80.3757  -0.371 0.711367    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) brthwt gendrF age    d1     d2    
## brthwt  -0.927                                   
## genderF -0.231  0.099                            
## age     -0.339  0.045 -0.002                     
## d1       0.313 -0.039 -0.001 -0.985              
## d2      -0.239  0.032  0.012  0.816 -0.884       
## d3       0.167 -0.031 -0.019 -0.533  0.606 -0.886

解釋： 1.對體重顯著影響的變數有：brthwt、genderF、age、d1

2.從Correlation of Fixed Effects可知： brthwt和d1、d2、d3的相關係數為分別為-0.985,0.816,-0.533，表示出生體重越重，0.9歲至1.5歲之間成長越快。

dtafm1 <- data.frame(dta, yhat=fitted(m1)) #yhat=fitted(m1)的意思是提取符合m1模型的資料，將它命名為yhat

6

ggplot(dtafm1, aes(age, weight, group=id, color = gender))+
 geom_point()+
 geom_line(aes(age, yhat))+
 facet_wrap(~ gender)+
 labs(x = "Age (year)", y = "Weight (kg)")

quantile(dta$age) #將年齡切分四分位數

##        0%       25%       50%       75%      100% 
## 0.1149897 0.6461328 0.9965777 1.4709103 2.5462012

7

summary(m0 <- lm(weight ~ age, data=dta))

## 
## Call:
## lm(formula = weight ~ age, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2994 -1.0341 -0.2524  1.0947  4.2614 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.2021     0.1779   29.24   <2e-16 ***
## age           3.3640     0.1332   25.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.471 on 196 degrees of freedom
## Multiple R-squared:  0.765,  Adjusted R-squared:  0.7638 
## F-statistic: 637.9 on 1 and 196 DF,  p-value: < 2.2e-16

8

#segmented提供斷點估計
summary(m0z <- segmented::segmented(m0, seg.Z= ~ age, 
                            psi=list(age=c(0.6, 1.4)),
                            control=seg.control(display=FALSE)))

## 
##  ***Regression Model with Segmented Relationship(s)***
## 
## Call: 
## segmented.lm(obj = m0, seg.Z = ~age, psi = list(age = c(0.6, 
##     1.4)), control = seg.control(display = FALSE))
## 
## Estimated Break-Point(s):
##            Est. St.Err
## psi1.age 0.317  0.138
## psi2.age 2.300  0.047
## 
## Meaningful coefficients of the linear terms:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    2.262      1.391   1.626    0.106
## age           16.425     10.311   1.593    0.113
## U1.age       -13.786     10.312  -1.337       NA
## U2.age        -8.539      3.457  -2.470       NA
## 
## Residual standard error: 1.176 on 192 degrees of freedom
## Multiple R-Squared: 0.8528,  Adjusted R-squared: 0.849 
## 
## Convergence attained in 5 iter. (rel. change 2.0429e-06)

結果：得到Estimated Break-Point(s)兩個：0.317及2.3

9

plot(m0z)
with(dta, points(age, weight))
abline(v = m0z$psi[,2], lty = 3)
grid()

依據Break-Point劃出分段圖形。

#The END

Week6 In-class exercises 1

Ching-Fang Wu

Mon Oct 19 11:48:10 2020