toc: true
toc_float: true
theme: cerulean
highlight: kate

1. Import library

suppressMessages(library(ggplot2))
suppressMessages(library(gridExtra))
suppressMessages(library(lme4))
suppressMessages(library(lmerTest))

2. Import data set

pop = read.csv("~/Desktop/R-dir/R studying/dataset/popular.csv")
head(pop)

##   pupil class extrov sex texp popular popteach    Zextrav       Zsex    Ztexp
## 1     1     1      5   F   24     6.3        6 -0.1703149  0.9888125 1.486153
## 2     2     1      7   M   24     4.9        5  1.4140098 -1.0108084 1.486153
## 3     3     1      4   F   24     5.3        6 -0.9624772  0.9888125 1.486153
## 4     4     1      3   F   24     4.7        5 -1.7546396  0.9888125 1.486153
## 5     5     1      5   F   24     6.0        6 -0.1703149  0.9888125 1.486153
## 6     6     1      4   M   24     4.7        5 -0.9624772 -1.0108084 1.486153
##     Zpopular   Zpopteach Cextrav Ctexp Csex
## 1  0.8850133  0.66905609  -0.215 9.737  0.5
## 2 -0.1276291 -0.04308451   1.785 9.737 -0.5
## 3  0.1616973  0.66905609  -1.215 9.737  0.5
## 4 -0.2722923 -0.04308451  -2.215 9.737  0.5
## 5  0.6680185  0.66905609  -0.215 9.737  0.5
## 6 -0.2722923 -0.04308451  -1.215 9.737 -0.5

3. Spagetti plot

ggplot(data = pop, aes(x = factor(class), col = class)) + geom_bar()

ggplot(data = pop, aes(x = factor(class), fill = class)) + 
  geom_bar() + 
  coord_flip() + 
  theme(legend.position="none")

4. Describe several plot in same graph by applying library(gridExtra)

# Phân bố biến popular
p1 = ggplot(data = pop, aes(popular)) + geom_histogram(fill = "blue", col = "white")

# Ảnh hưởng của sex
p2 = ggplot(data = pop, aes(x = sex, y = popular, col = sex)) + geom_boxplot() + 
  geom_jitter(alpha = 0.1) + 
  theme(legend.position = "none")

# Ảnh hưởng của extroversion
p3 = ggplot(data = pop, aes(x = factor(extrov), y = popular, col = factor(extrov))) + 
  geom_boxplot() + 
  geom_jitter(alpha = 0.1) + 
  theme(legend.position = "none")

# Liên quan đến Texp
p4 = ggplot(data = pop, aes(x = texp, y = popular, col = sex)) + 
  geom_point() + 
  geom_smooth()

grid.arrange(p1, p2, p3, p4, ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

5. Mô hình đa tầng (multilevel models)

Dữ liệu đa tầng (multilevel data) cần đến mô hình đa tầng
Mô hình đa tầng rất linh hoạt trong việc phát biểu giả thuyết
Phát biểu giả thuyết theo ‘tầng’ giúp chúng ta suy nghĩ rõ ràng hơn

5.1 Model 1

# Xem class là biến factor
pop$class = as.factor(pop$class) 

m1 = lmer(popular ~ 1 + (1 | class), data = pop)
summary(m1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + (1 | class)
##    Data: pop
## 
## REML criterion at convergence: 6330.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5655 -0.6975  0.0020  0.6758  3.3175 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  class    (Intercept) 0.7021   0.8379  
##  Residual             1.2218   1.1053  
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  5.07786    0.08739 98.90973    58.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Tham số • Ảnh hưởng cố định (fixed effect): Y = 5.07 (SE 0.087) • Ảnh hưởng ngẫu nhiên (random effects): v2 = 0.702, s2 = 1.222 • Hệ số tương quan (intra-class correlation) = 0.702 / (0.702 + 1.222) = 0.365 • Khoảng 1/3 phương sai của popular là do khác biệt giữa điểm các lớp học (class)

5.2 Model 2

m2 = lmer(popular ~ 1 + Cextrav + (1 | class), data = pop) 
summary(m2)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + (1 | class)
##    Data: pop
## 
## REML criterion at convergence: 5832.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0644 -0.7267  0.0165  0.7088  3.3587 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  class    (Intercept) 0.8406   0.9168  
##  Residual             0.9304   0.9646  
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 5.078e+00  9.421e-02 9.830e+01   53.90   <2e-16 ***
## Cextrav     4.863e-01  2.015e-02 1.965e+03   24.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## Cextrav 0.000

Tham số:
- Fixed effect: Y = 5.07 + 0.486 x C_extrav
- Random effects: v2 = 0.841, s2 = 0.930
- Intra-class correlation = 0.841 / (0.841 + 0.930) = 0.475
- 47.5% phương sai của popular là do class và extroversion

5.3 Model 3

m3 = lmer(popular ~ 1 + Cextrav + (1 + Cextrav | class), data = pop)
summary(m3)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + (1 + Cextrav | class)
##    Data: pop
## 
## REML criterion at convergence: 5779.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1961 -0.7291  0.0146  0.6816  3.2217 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  class    (Intercept) 0.89178  0.9443        
##           Cextrav     0.02599  0.1612   -0.88
##  Residual             0.89492  0.9460        
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  5.03127    0.09702 97.07723   51.86   <2e-16 ***
## Cextrav      0.49286    0.02546 89.69832   19.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## Cextrav -0.552

Tham số:
- Fixed effect: Y = 5.03 + 0.493 x C_extrav
- Random effects: v2 = 0.892, u2 = 0.026, s2 = 0.895
- Intra-class correlation = (0.892 + 0.026) / (0.892 + 0.026 + 0.895) = 0.506
- 51% of phương sai của popular là do ảnh hưởng của class và extroversion

5.4 Model 4

m4 = lmer(popular ~ 1 + Cextrav + Csex + (1 + Cextrav + Csex | class), data = pop)

## boundary (singular) fit: see help('isSingular')

summary(m4)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + Csex + (1 + Cextrav + Csex | class)
##    Data: pop
## 
## REML criterion at convergence: 4870.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.01904 -0.64955 -0.01055  0.67101  3.11763 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  class    (Intercept) 0.673895 0.82091             
##           Cextrav     0.029851 0.17277  -0.73      
##           Csex        0.005371 0.07329  -0.65 -0.04
##  Residual             0.552881 0.74356             
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   5.02051    0.08410  95.87616   59.69   <2e-16 ***
## Cextrav       0.44300    0.02343  91.03883   18.91   <2e-16 ***
## Csex          1.24483    0.03728 503.72929   33.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) Cextrv
## Cextrav -0.533       
## Csex    -0.127 -0.065
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

Tham số:
- Fixed effect: Y = 5.02 + 0.443xCextrav + 1.244xCsex
- Random effects: v2 = 0.674, u2 = 0.030, w2 = 0.005, s2 = 0.553
- Intra-class correlation = (0.674 + 0.03 + 0.005) / (0.674 + 0.03 + 0.005 + 0.553) = 0.562
- 56.2% phương sai của popular là do ảnh hưởng của class, extroversion và sex

5.5 Model 5

m5 = lmer(popular ~ 1 + Cextrav + Csex + Ctexp + (1 + Cextrav + Csex | class), data = pop)

## boundary (singular) fit: see help('isSingular')

summary(m5)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + Csex + Ctexp + (1 + Cextrav + Csex |  
##     class)
##    Data: pop
## 
## REML criterion at convergence: 4833.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1642 -0.6554 -0.0246  0.6711  2.9571 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  class    (Intercept) 0.284587 0.53347             
##           Cextrav     0.034740 0.18639  -0.09      
##           Csex        0.002404 0.04903  -0.98 -0.09
##  Residual             0.551435 0.74259             
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 5.022e+00  5.645e-02 9.507e+01   88.96   <2e-16 ***
## Cextrav     4.529e-01  2.465e-02 9.620e+01   18.38   <2e-16 ***
## Csex        1.251e+00  3.694e-02 9.860e+02   33.86   <2e-16 ***
## Ctexp       8.951e-02  8.618e-03 1.013e+02   10.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) Cextrv Csex  
## Cextrav -0.060              
## Csex    -0.126 -0.066       
## Ctexp   -0.024  0.089 -0.039
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

Tham số:
- Fixed effect: Y = 5.02 + 0.453xCextrav + 1.251xCsex + 0.089xCtexp
- Random effects: v2 = 0.284, u2 = 0.035, w2 = 0.0024, 𝜎2 = 0.551
- Intra-class correlation = (0.285 + 0.035 + 0.002) / (0.285 + 0.035 + 0.002 + 0.551) = 0.368
- 37% phương sai của popular là do ảnh hưởng của class, extroversion và sex

5.6 Model 6

m6 = lmer(popular ~ 1 + Cextrav + Csex + Ctexp + Cextrav*Ctexp + Csex*Ctexp + 
            (1 + Cextrav + Csex | class), data = pop) 
summary(m6)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: popular ~ 1 + Cextrav + Csex + Ctexp + Cextrav * Ctexp + Csex *  
##     Ctexp + (1 + Cextrav + Csex | class)
##    Data: pop
## 
## REML criterion at convergence: 4786.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.12085 -0.64852 -0.01955  0.68699  3.05065 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  class    (Intercept) 0.286806 0.53554             
##           Cextrav     0.005583 0.07472  -0.09      
##           Csex        0.004142 0.06436  -0.85 -0.45
##  Residual             0.552081 0.74302             
## Number of obs: 2000, groups:  class, 100
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     4.991462   0.056717  99.885225  88.007  < 2e-16 ***
## Cextrav         0.450533   0.017465  82.661686  25.797  < 2e-16 ***
## Csex            1.240359   0.036858 571.466897  33.652  < 2e-16 ***
## Ctexp           0.097171   0.008704 103.180129  11.164  < 2e-16 ***
## Cextrav:Ctexp  -0.024704   0.002574  71.872436  -9.597 1.66e-14 ***
## Csex:Ctexp     -0.001772   0.005924 616.292865  -0.299    0.765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Cextrv Csex   Ctexp  Cxtr:C
## Cextrav     -0.037                            
## Csex        -0.140 -0.113                     
## Ctexp       -0.021  0.120 -0.038              
## Cextrv:Ctxp  0.120  0.015  0.032 -0.119       
## Csex:Ctexp  -0.040  0.030 -0.015 -0.145 -0.142

Tham số
- Fixed effect: Y = 5.00 + 0.450xCextrav + 1.24xCsex + 0.097xCtexp – 0.- 25CextravCtexp – 0.0018CexCtexp
- Random effects: v2 = 0.287, u2 = 0.0056, w2 = 0.004, s2 = 0.552

Panel analysis

Vu Thien

2023-04-11