1. Import library
suppressMessages(library(ggplot2))
2. Import data set
wil = read.csv("~/Desktop/R-dir/R studying/dataset/willett.csv")
colnames(wil)
## [1] "id" "time" "cons" "covar" "y"
dim(wil)
## [1] 140 5
head(wil)
## id time cons covar y
## 1 1 0 1 137 205
## 2 1 1 1 137 217
## 3 1 2 1 137 268
## 4 1 3 1 137 302
## 5 2 0 1 123 219
## 6 2 1 1 123 243
tail(wil)
## id time cons covar y
## 135 34 2 1 110 150
## 136 34 3 1 110 214
## 137 35 0 1 110 166
## 138 35 1 1 110 197
## 139 35 2 1 110 203
## 140 35 3 1 110 233
str(wil)
## 'data.frame': 140 obs. of 5 variables:
## $ id : int 1 1 1 1 2 2 2 2 3 3 ...
## $ time : int 0 1 2 3 0 1 2 3 0 1 ...
## $ cons : int 1 1 1 1 1 1 1 1 1 1 ...
## $ covar: int 137 137 137 137 123 123 123 123 129 129 ...
## $ y : int 205 217 268 302 219 243 279 302 142 212 ...
- This is the data of 35 students (id variable) who are
tracked for progress (by score y) over time with
factors affecting score such as cons and covar.
3. Visualization
One graph for all id
ggplot(data = wil, aes(x = time, y = y, group = id, col = id)) + geom_line()

ggplot(data = wil, aes(x = time, y = y, group = id, col = id)) + geom_line() +
theme(legend.position = "none")

ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_line()

ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_line() +
theme(legend.position = "none")

Each graph for each id
ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_point() +
facet_wrap(~id)

ggplot(data = wil, aes(x = time, y = y, col = factor(id))) + geom_line() +
facet_wrap(~id)

4. Linear regression model
4.1 Linear model for whole students
mod_linear = lm(y ~ time, data = wil)
summary(mod_linear)
##
## Call:
## lm(formula = y ~ time, data = wil)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.374 -25.584 1.186 28.926 64.746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164.374 5.035 32.65 <2e-16 ***
## time 26.960 2.691 10.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.6 on 138 degrees of freedom
## Multiple R-squared: 0.421, Adjusted R-squared: 0.4168
## F-statistic: 100.3 on 1 and 138 DF, p-value: < 2.2e-16
4.2 Linear model for each student
suppressMessages(library(tidyverse))
wil %>% group_by(id) %>% mutate(beta = lm(y ~ time)$coefficients[2]) %>% mutate(alpha = lm(y ~ time)$coefficients[1]) %>% summarize(mbeta=mean(beta), malpha=mean(alpha))
## # A tibble: 35 × 3
## id mbeta malpha
## <int> <dbl> <dbl>
## 1 1 34.2 197.
## 2 2 28.5 218
## 3 3 47.9 151.
## 4 4 21.9 206.
## 5 5 9.9 200.
## 6 6 29.6 166.
## 7 7 32.6 160.
## 8 8 37.9 92.9
## 9 9 22.7 139.
## 10 10 26.8 220.
## # … with 25 more rows
4.3 Mix-effect model
suppressMessages(library(lme4))
mod_mix_effect = lmer(y ~ 1 + (1 | id), data = wil)
summary(mod_mix_effect)
## Linear mixed model fit by REML ['lmerMod']
## Formula: y ~ 1 + (1 | id)
## Data: wil
##
## REML criterion at convergence: 1455
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.32587 -0.63434 -0.03562 0.60466 1.92746
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 602.8 24.55
## Residual 1583.7 39.80
## Number of obs: 140, groups: id, 35
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 204.814 5.342 38.34
5. How does the previous school year’s grades affect a student’s
progress?
5.1 Linear regression model
wil$ccovar = wil$covar - mean(wil$covar)
mod_linear2 = lm(y ~ time + ccovar, data = wil)
summary(mod_linear2)
##
## Call:
## lm(formula = y ~ time + ccovar, data = wil)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.522 -24.591 3.165 28.033 70.018
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164.3743 4.9646 33.109 <2e-16 ***
## time 26.9600 2.6537 10.159 <2e-16 ***
## ccovar 0.5357 0.2410 2.223 0.0278 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.11 on 137 degrees of freedom
## Multiple R-squared: 0.4412, Adjusted R-squared: 0.433
## F-statistic: 54.08 on 2 and 137 DF, p-value: < 2.2e-16
5.2 Linear model with interaction term
mod_linear_interact = lm(y ~ time + ccovar + time:ccovar, data = wil)
summary(mod_linear_interact)
##
## Call:
## lm(formula = y ~ time + ccovar + time:ccovar, data = wil)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.767 -26.040 3.676 26.224 73.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164.3743 4.9089 33.485 <2e-16 ***
## time 26.9600 2.6239 10.275 <2e-16 ***
## ccovar -0.1136 0.3987 -0.285 0.7762
## time:ccovar 0.4329 0.2131 2.031 0.0442 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.71 on 136 degrees of freedom
## Multiple R-squared: 0.4576, Adjusted R-squared: 0.4457
## F-statistic: 38.25 on 3 and 136 DF, p-value: < 2.2e-16
5.3 Mix effect
mod_mix_effect2 = lmer(y ~ time + ccovar + time:ccovar + ( 1 + time | id), data = wil)
summary(mod_mix_effect2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: y ~ time + ccovar + time:ccovar + (1 + time | id)
## Data: wil
##
## REML criterion at convergence: 1260.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.24817 -0.61873 0.00428 0.61472 1.55688
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## id (Intercept) 1236.4 35.16
## time 107.2 10.36 -0.49
## Residual 159.5 12.63
## Number of obs: 140, groups: id, 35
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 164.3743 6.2061 26.486
## time 26.9600 1.9939 13.521
## ccovar -0.1136 0.5040 -0.225
## time:ccovar 0.4329 0.1619 2.673
##
## Correlation of Fixed Effects:
## (Intr) time ccovar
## time -0.522
## ccovar 0.000 0.000
## time:ccovar 0.000 0.000 -0.522