1. Import library

suppressMessages(library(ggplot2))

2. Import data set

wil = read.csv("~/Desktop/R-dir/R studying/dataset/willett.csv")
colnames(wil)
## [1] "id"    "time"  "cons"  "covar" "y"
dim(wil)
## [1] 140   5
head(wil)
##   id time cons covar   y
## 1  1    0    1   137 205
## 2  1    1    1   137 217
## 3  1    2    1   137 268
## 4  1    3    1   137 302
## 5  2    0    1   123 219
## 6  2    1    1   123 243
tail(wil)
##     id time cons covar   y
## 135 34    2    1   110 150
## 136 34    3    1   110 214
## 137 35    0    1   110 166
## 138 35    1    1   110 197
## 139 35    2    1   110 203
## 140 35    3    1   110 233
str(wil)
## 'data.frame':    140 obs. of  5 variables:
##  $ id   : int  1 1 1 1 2 2 2 2 3 3 ...
##  $ time : int  0 1 2 3 0 1 2 3 0 1 ...
##  $ cons : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ covar: int  137 137 137 137 123 123 123 123 129 129 ...
##  $ y    : int  205 217 268 302 219 243 279 302 142 212 ...

3. Visualization

One graph for all id

ggplot(data = wil, aes(x = time, y = y, group = id, col = id)) + geom_line() 

ggplot(data = wil, aes(x = time, y = y, group = id, col = id)) + geom_line() + 
  theme(legend.position = "none")

ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_line() 

ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_line() + 
  theme(legend.position = "none")

Each graph for each id

ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_point() +
  facet_wrap(~id)

ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_line() + 
  facet_wrap(~id)

4. Linear regression model

4.1 Linear model for whole students

mod_linear = lm(y ~ time, data = wil)
summary(mod_linear)
## 
## Call:
## lm(formula = y ~ time, data = wil)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -88.374 -25.584   1.186  28.926  64.746 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  164.374      5.035   32.65   <2e-16 ***
## time          26.960      2.691   10.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.6 on 138 degrees of freedom
## Multiple R-squared:  0.421,  Adjusted R-squared:  0.4168 
## F-statistic: 100.3 on 1 and 138 DF,  p-value: < 2.2e-16

4.2 Linear model for each student

suppressMessages(library(tidyverse))
wil %>% group_by(id) %>% mutate(beta = lm(y ~ time)$coefficients[2]) %>% mutate(alpha = lm(y ~ time)$coefficients[1]) %>% summarize(mbeta=mean(beta), malpha=mean(alpha))
## # A tibble: 35 × 3
##       id mbeta malpha
##    <int> <dbl>  <dbl>
##  1     1  34.2  197. 
##  2     2  28.5  218  
##  3     3  47.9  151. 
##  4     4  21.9  206. 
##  5     5   9.9  200. 
##  6     6  29.6  166. 
##  7     7  32.6  160. 
##  8     8  37.9   92.9
##  9     9  22.7  139. 
## 10    10  26.8  220. 
## # … with 25 more rows

4.3 Mix-effect model

suppressMessages(library(lme4))
mod_mix_effect = lmer(y ~ 1 + (1 | id), data = wil)
summary(mod_mix_effect)
## Linear mixed model fit by REML ['lmerMod']
## Formula: y ~ 1 + (1 | id)
##    Data: wil
## 
## REML criterion at convergence: 1455
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.32587 -0.63434 -0.03562  0.60466  1.92746 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept)  602.8   24.55   
##  Residual             1583.7   39.80   
## Number of obs: 140, groups:  id, 35
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  204.814      5.342   38.34

5. How does the previous school year’s grades affect a student’s progress?

5.1 Linear regression model

wil$ccovar = wil$covar - mean(wil$covar)
mod_linear2 = lm(y ~ time + ccovar, data = wil)
summary(mod_linear2)
## 
## Call:
## lm(formula = y ~ time + ccovar, data = wil)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -86.522 -24.591   3.165  28.033  70.018 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 164.3743     4.9646  33.109   <2e-16 ***
## time         26.9600     2.6537  10.159   <2e-16 ***
## ccovar        0.5357     0.2410   2.223   0.0278 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.11 on 137 degrees of freedom
## Multiple R-squared:  0.4412, Adjusted R-squared:  0.433 
## F-statistic: 54.08 on 2 and 137 DF,  p-value: < 2.2e-16

5.2 Linear model with interaction term

mod_linear_interact = lm(y ~ time + ccovar + time:ccovar, data = wil)
summary(mod_linear_interact)
## 
## Call:
## lm(formula = y ~ time + ccovar + time:ccovar, data = wil)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -88.767 -26.040   3.676  26.224  73.433 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 164.3743     4.9089  33.485   <2e-16 ***
## time         26.9600     2.6239  10.275   <2e-16 ***
## ccovar       -0.1136     0.3987  -0.285   0.7762    
## time:ccovar   0.4329     0.2131   2.031   0.0442 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.71 on 136 degrees of freedom
## Multiple R-squared:  0.4576, Adjusted R-squared:  0.4457 
## F-statistic: 38.25 on 3 and 136 DF,  p-value: < 2.2e-16

5.3 Mix effect

mod_mix_effect2 = lmer(y ~ time + ccovar + time:ccovar + ( 1 + time | id), data = wil)
summary(mod_mix_effect2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: y ~ time + ccovar + time:ccovar + (1 + time | id)
##    Data: wil
## 
## REML criterion at convergence: 1260.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.24817 -0.61873  0.00428  0.61472  1.55688 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  id       (Intercept) 1236.4   35.16         
##           time         107.2   10.36    -0.49
##  Residual              159.5   12.63         
## Number of obs: 140, groups:  id, 35
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 164.3743     6.2061  26.486
## time         26.9600     1.9939  13.521
## ccovar       -0.1136     0.5040  -0.225
## time:ccovar   0.4329     0.1619   2.673
## 
## Correlation of Fixed Effects:
##             (Intr) time   ccovar
## time        -0.522              
## ccovar       0.000  0.000       
## time:ccovar  0.000  0.000 -0.522