• toc: true
  • toc_float: true
  • theme: cerulean
  • highlight: kate

1. Import library

suppressMessages(library(ggplot2))
suppressMessages(library(gridExtra))
suppressMessages(library(lme4))
suppressMessages(library(lmerTest))

2. Import data set

pop = read.csv("~/Desktop/R-dir/R studying/dataset/popular.csv")
head(pop)
##   pupil class extrov sex texp popular popteach    Zextrav       Zsex    Ztexp
## 1     1     1      5   F   24     6.3        6 -0.1703149  0.9888125 1.486153
## 2     2     1      7   M   24     4.9        5  1.4140098 -1.0108084 1.486153
## 3     3     1      4   F   24     5.3        6 -0.9624772  0.9888125 1.486153
## 4     4     1      3   F   24     4.7        5 -1.7546396  0.9888125 1.486153
## 5     5     1      5   F   24     6.0        6 -0.1703149  0.9888125 1.486153
## 6     6     1      4   M   24     4.7        5 -0.9624772 -1.0108084 1.486153
##     Zpopular   Zpopteach Cextrav Ctexp Csex
## 1  0.8850133  0.66905609  -0.215 9.737  0.5
## 2 -0.1276291 -0.04308451   1.785 9.737 -0.5
## 3  0.1616973  0.66905609  -1.215 9.737  0.5
## 4 -0.2722923 -0.04308451  -2.215 9.737  0.5
## 5  0.6680185  0.66905609  -0.215 9.737  0.5
## 6 -0.2722923 -0.04308451  -1.215 9.737 -0.5

3. Spagetti plot

ggplot(data = pop, aes(x = factor(class), col = class)) + geom_bar()

ggplot(data = pop, aes(x = factor(class), fill = class)) + 
  geom_bar() + 
  coord_flip() + 
  theme(legend.position="none")

4. Describe several plot in same graph by applying library(gridExtra)

# Phân bố biến popular
p1 = ggplot(data = pop, aes(popular)) + geom_histogram(fill = "blue", col = "white")

# Ảnh hưởng của sex
p2 = ggplot(data = pop, aes(x = sex, y = popular, col = sex)) + geom_boxplot() + 
  geom_jitter(alpha = 0.1) + 
  theme(legend.position = "none")

# Ảnh hưởng của extroversion
p3 = ggplot(data = pop, aes(x = factor(extrov), y = popular, col = factor(extrov))) + 
  geom_boxplot() + 
  geom_jitter(alpha = 0.1) + 
  theme(legend.position = "none")

# Liên quan đến Texp
p4 = ggplot(data = pop, aes(x = texp, y = popular, col = sex)) + 
  geom_point() + 
  geom_smooth()

grid.arrange(p1, p2, p3, p4, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

5. Mô hình đa tầng (multilevel models)

  • Dữ liệu đa tầng (multilevel data) cần đến mô hình đa tầng
  • Mô hình đa tầng rất linh hoạt trong việc phát biểu giả thuyết
  • Phát biểu giả thuyết theo ‘tầng’ giúp chúng ta suy nghĩ rõ ràng hơn

5.1 Model 1

# Xem class là biến factor
pop$class = as.factor(pop$class) 

m1 = lmer(popular ~ 1 + (1 | class), data = pop)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + (1 | class)
##    Data: pop
## 
## REML criterion at convergence: 6330.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5655 -0.6975  0.0020  0.6758  3.3175 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  class    (Intercept) 0.7021   0.8379  
##  Residual             1.2218   1.1053  
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  5.07786    0.08739 98.90973    58.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Tham số • Ảnh hưởng cố định (fixed effect): Y = 5.07 (SE 0.087) • Ảnh hưởng ngẫu nhiên (random effects): v2 = 0.702, s2 = 1.222 • Hệ số tương quan (intra-class correlation) = 0.702 / (0.702 + 1.222) = 0.365 • Khoảng 1/3 phương sai của popular là do khác biệt giữa điểm các lớp học (class)

5.2 Model 2

m2 = lmer(popular ~ 1 + Cextrav + (1 | class), data = pop) 
summary(m2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + (1 | class)
##    Data: pop
## 
## REML criterion at convergence: 5832.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0644 -0.7267  0.0165  0.7088  3.3587 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  class    (Intercept) 0.8406   0.9168  
##  Residual             0.9304   0.9646  
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 5.078e+00  9.421e-02 9.830e+01   53.90   <2e-16 ***
## Cextrav     4.863e-01  2.015e-02 1.965e+03   24.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## Cextrav 0.000
  • Tham số:
    • Fixed effect: Y = 5.07 + 0.486 x C_extrav
    • Random effects: v2 = 0.841, s2 = 0.930
    • Intra-class correlation = 0.841 / (0.841 + 0.930) = 0.475
    • 47.5% phương sai của popular là do class và extroversion

5.3 Model 3

m3 = lmer(popular ~ 1 + Cextrav + (1 + Cextrav | class), data = pop)
summary(m3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + (1 + Cextrav | class)
##    Data: pop
## 
## REML criterion at convergence: 5779.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1961 -0.7291  0.0146  0.6816  3.2217 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  class    (Intercept) 0.89178  0.9443        
##           Cextrav     0.02599  0.1612   -0.88
##  Residual             0.89492  0.9460        
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  5.03127    0.09702 97.07723   51.86   <2e-16 ***
## Cextrav      0.49286    0.02546 89.69832   19.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## Cextrav -0.552
  • Tham số:
    • Fixed effect: Y = 5.03 + 0.493 x C_extrav
    • Random effects: v2 = 0.892, u2 = 0.026, s2 = 0.895
    • Intra-class correlation = (0.892 + 0.026) / (0.892 + 0.026 + 0.895) = 0.506
    • 51% of phương sai của popular là do ảnh hưởng của class và extroversion

5.4 Model 4

m4 = lmer(popular ~ 1 + Cextrav + Csex + (1 + Cextrav + Csex | class), data = pop)
## boundary (singular) fit: see help('isSingular')
summary(m4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + Csex + (1 + Cextrav + Csex | class)
##    Data: pop
## 
## REML criterion at convergence: 4870.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.01904 -0.64955 -0.01055  0.67101  3.11763 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  class    (Intercept) 0.673895 0.82091             
##           Cextrav     0.029851 0.17277  -0.73      
##           Csex        0.005371 0.07329  -0.65 -0.04
##  Residual             0.552881 0.74356             
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   5.02051    0.08410  95.87616   59.69   <2e-16 ***
## Cextrav       0.44300    0.02343  91.03883   18.91   <2e-16 ***
## Csex          1.24483    0.03728 503.72929   33.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) Cextrv
## Cextrav -0.533       
## Csex    -0.127 -0.065
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
  • Tham số:
    • Fixed effect: Y = 5.02 + 0.443xCextrav + 1.244xCsex
    • Random effects: v2 = 0.674, u2 = 0.030, w2 = 0.005, s2 = 0.553
    • Intra-class correlation = (0.674 + 0.03 + 0.005) / (0.674 + 0.03 + 0.005 + 0.553) = 0.562
    • 56.2% phương sai của popular là do ảnh hưởng của class, extroversion và sex

5.5 Model 5

m5 = lmer(popular ~ 1 + Cextrav + Csex + Ctexp + (1 + Cextrav + Csex | class), data = pop)
## boundary (singular) fit: see help('isSingular')
summary(m5)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + Csex + Ctexp + (1 + Cextrav + Csex |  
##     class)
##    Data: pop
## 
## REML criterion at convergence: 4833.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1642 -0.6554 -0.0246  0.6711  2.9571 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  class    (Intercept) 0.284587 0.53347             
##           Cextrav     0.034740 0.18639  -0.09      
##           Csex        0.002404 0.04903  -0.98 -0.09
##  Residual             0.551435 0.74259             
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 5.022e+00  5.645e-02 9.507e+01   88.96   <2e-16 ***
## Cextrav     4.529e-01  2.465e-02 9.620e+01   18.38   <2e-16 ***
## Csex        1.251e+00  3.694e-02 9.860e+02   33.86   <2e-16 ***
## Ctexp       8.951e-02  8.618e-03 1.013e+02   10.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) Cextrv Csex  
## Cextrav -0.060              
## Csex    -0.126 -0.066       
## Ctexp   -0.024  0.089 -0.039
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
  • Tham số:
    • Fixed effect: Y = 5.02 + 0.453xCextrav + 1.251xCsex + 0.089xCtexp
    • Random effects: v2 = 0.284, u2 = 0.035, w2 = 0.0024, 𝜎2 = 0.551
    • Intra-class correlation = (0.285 + 0.035 + 0.002) / (0.285 + 0.035 + 0.002 + 0.551) = 0.368
    • 37% phương sai của popular là do ảnh hưởng của class, extroversion và sex

5.6 Model 6

m6 = lmer(popular ~ 1 + Cextrav + Csex + Ctexp + Cextrav*Ctexp + Csex*Ctexp + 
            (1 + Cextrav + Csex | class), data = pop) 
summary(m6)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + Csex + Ctexp + Cextrav * Ctexp + Csex *  
##     Ctexp + (1 + Cextrav + Csex | class)
##    Data: pop
## 
## REML criterion at convergence: 4786.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.12085 -0.64852 -0.01955  0.68699  3.05065 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  class    (Intercept) 0.286806 0.53554             
##           Cextrav     0.005583 0.07472  -0.09      
##           Csex        0.004142 0.06436  -0.85 -0.45
##  Residual             0.552081 0.74302             
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     4.991462   0.056717  99.885225  88.007  < 2e-16 ***
## Cextrav         0.450533   0.017465  82.661686  25.797  < 2e-16 ***
## Csex            1.240359   0.036858 571.466897  33.652  < 2e-16 ***
## Ctexp           0.097171   0.008704 103.180129  11.164  < 2e-16 ***
## Cextrav:Ctexp  -0.024704   0.002574  71.872436  -9.597 1.66e-14 ***
## Csex:Ctexp     -0.001772   0.005924 616.292865  -0.299    0.765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Cextrv Csex   Ctexp  Cxtr:C
## Cextrav     -0.037                            
## Csex        -0.140 -0.113                     
## Ctexp       -0.021  0.120 -0.038              
## Cextrv:Ctxp  0.120  0.015  0.032 -0.119       
## Csex:Ctexp  -0.040  0.030 -0.015 -0.145 -0.142
  • Tham số
    • Fixed effect: Y = 5.00 + 0.450xCextrav + 1.24xCsex + 0.097xCtexp – 0.- 25CextravCtexp – 0.0018CexCtexp
    • Random effects: v2 = 0.287, u2 = 0.0056, w2 = 0.004, s2 = 0.552