Refer to the questions here.
library(alr4)
# simple linear regression info
m0 <- lm(dheight ~ 1, data = Heights) ; m0
Call:
lm(formula = dheight ~ 1, data = Heights)
Coefficients:
(Intercept)
63.75
m1 <- update(m0, . ~ . + mheight)
summary(m1)
Call:
lm(formula = dheight ~ mheight, data = Heights)
Residuals:
Min 1Q Median 3Q Max
-7.397 -1.529 0.036 1.492 9.053
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 29.91744 1.62247 18.44 <2e-16 ***
mheight 0.54175 0.02596 20.87 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.266 on 1373 degrees of freedom
Multiple R-squared: 0.2408, Adjusted R-squared: 0.2402
F-statistic: 435.5 on 1 and 1373 DF, p-value: < 2.2e-16
anova(m0, m1)
Analysis of Variance Table
Model 1: dheight ~ 1
Model 2: dheight ~ mheight
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1374 9288.6
2 1373 7052.0 1 2236.7 435.47 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# diagnostics
#dev.off()
ggplot(m1, aes(sample = scale(resid(m1, "pearson")))) +
stat_qq() +
geom_abline(intercept = 0, slope = 1)+
labs(x = "Normal quantiles",
y = "Quantiles of standardized residuals")
# centering
coef(lm(dheight ~ mheight, data = Heights))
(Intercept) mheight
29.917437 0.541747
Heights %>% mutate(mheight_c = mheight - mean(mheight)) %>%
lm(dheight ~ mheight_c, data = .) %>% coef(.)
(Intercept) mheight_c
63.751055 0.541747
# plot
with(Heights, plot(mheight, dheight))
abline(lm(dheight ~ mheight, data = Heights), col = "blue")
# scaling
coef(lm(scale(dheight) ~ scale(mheight), data = Heights))
(Intercept) scale(mheight)
2.199083e-15 4.907094e-01
with(Heights, (sd(dheight)/sd(mheight)* cor(dheight, mheight)))
[1] 0.541747
What the Rscript does :
1.Pick three regions : africa, asia, europe , and randomly pick 72 countries. Draw regression line of these three regions.
2.Draw another weighted regression line(brown) to show the correlation of gdp and fertility rate.
setwd("E:/201709/multilevel_analysis/1002")
girl_dat <- read.csv("girl_height.csv")
dat <- read.csv("girl_height.csv")
# plot
xyplot( Height ~ Age | Girl, data = girl_dat,
xlab = "Age", ylab = "Height(cm)")
# compute correlations of heights among occasions
dat %<>% mutate(groupt=substr(dat$Girl, 1,1), ID=as.numeric(substr(dat$Girl,2,3)))
dat <- as.data.frame(dat) %>%
select(Height, Age, ID) %>%
spread(Age, Height) %>%
select(-ID)
# get correlation matrix
dat %>% cor
6 7 8 9 10
6 1.0000000 0.9809247 0.9815512 0.9740140 0.9540426
7 0.9809247 1.0000000 0.9858051 0.9803015 0.9584083
8 0.9815512 0.9858051 1.0000000 0.9880770 0.9751260
9 0.9740140 0.9803015 0.9880770 1.0000000 0.9920014
10 0.9540426 0.9584083 0.9751260 0.9920014 1.0000000
# get variance at each occasion (diag : extract or replace the diagonal of a matrix)
dat %>% var %>% diag
6 7 8 9 10
15.81095 19.73063 27.15589 31.22408 32.00997
# set plot theme to black-n-white
ot <- theme_set(theme_bw())
# one regression line fits all
ggplot(girl_dat, aes(x = Age, y = Height, color = Girl)) +
geom_point() +
stat_smooth(aes(group = 1), method = "lm") +
labs(x = "Age (scaled)", y = "Height (cm)") +
theme(legend.position = "NONE")
# ordinary regression
m0 <- gls(Height ~ Age, data = girl_dat)
# regression with autocorrelation
m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML");m1
Generalized least squares fit by maximum likelihood
Model: Height ~ Age
Data: girl_dat
Log-likelihood: -172.6705
Coefficients:
(Intercept) Age
82.474556 5.683788
Correlation Structure: AR(1)
Formula: ~1 | Girl
Parameter estimate(s):
Phi
0.9781674
Degrees of freedom: 100 total; 98 residual
Residual standard error: 4.780948
# regression with unequal variances for occasions
m2 <- update(m0, weights = varIdent(form = ~ 1 | Age));m2
Generalized least squares fit by REML
Model: Height ~ Age
Data: girl_dat
Log-restricted-likelihood: -299.0226
Coefficients:
(Intercept) Age
82.229398 5.751755
Variance function:
Structure: Different standard deviations per stratum
Formula: ~1 | Age
Parameter estimates:
6 7 8 9 10
1.000000 1.104837 1.297424 1.391427 1.421762
Degrees of freedom: 100 total; 98 residual
Residual standard error: 3.947016
# regression with autocorrelation and unequal variances
m3 <- update(m1, weights = varIdent(form = ~ 1 | Age));m3
Generalized least squares fit by maximum likelihood
Model: Height ~ Age
Data: girl_dat
Log-likelihood: -159.3523
Coefficients:
(Intercept) Age
81.431037 5.533068
Correlation Structure: AR(1)
Formula: ~1 | Girl
Parameter estimate(s):
Phi
0.9886847
Variance function:
Structure: Different standard deviations per stratum
Formula: ~1 | Age
Parameter estimates:
6 7 8 9 10
1.000000 1.135107 1.343258 1.410275 1.374144
Degrees of freedom: 100 total; 98 residual
Residual standard error: 4.37343
# model selection with AICc
model.sel(m0, m1, m2, m3)
Model selection table
(Intrc) Age correlation weights REML df logLik AICc delta weight
m3 81.43 5.533 crAR1(Girl) vrI(Age) F 8 -159.352 336.3 0.00 1
m1 82.47 5.684 crAR1(Girl) F 4 -172.671 353.8 17.48 0
m0 82.52 5.716 3 -300.761 607.8 271.48 0
m2 82.23 5.752 vrI(Age) 7 -299.023 613.3 276.98 0
Abbreviations:
correlation: crAR1(Girl) = 'corAR1(~1|Girl)'
weights: vrI(Age) = 'varIdent(~1|Age)'
REML: F = 'FALSE'
Models ranked by AICc(x)
data(Stevens)
dta <- Stevens %>% rename(Length = y, Loudness = loudness, Subject = subject)
# one model fits all
m0 <- lm(Length ~ Loudness, data = dta)
# plot
ot <- theme_set(theme_bw())
ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
geom_point() +
geom_line() +
stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
coord_cartesian(ylim = c(-1, 5)) +
labs(x = "Loudness (db)", y = "Average log(Length)") +
theme(legend.position = "NONE")
# tidy summary model output
broom::tidy(summary(m0))
term estimate std.error statistic p.value
1 (Intercept) -3.203770 0.344285528 -9.305561 2.531325e-12
2 Loudness 0.079013 0.004820962 16.389469 2.693170e-21
# weighted model
m1 <- Stevens %>%
group_by(loudness) %>%
mutate(y_m = mean(y), y_n = n()) %>%
lm(y_m ~ loudness, weight = y_n, data = .) %>%
coef(.)
#plot
ggplot(Stevens, aes(loudness, y, group = subject, label = subject)) +
geom_point() +
geom_abline(intercept = m1[1], slope = m1[2], col = "firebrick") +
stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
labs(x = "Loudness (db)", y = "Average log(Length)") +
theme(legend.position = "NONE")
# tidy summary model output
broom::tidy(summary(m1))
minimum q1 median mean q3 maximum
1 -3.20377 -2.383074 -1.562379 -1.562379 -0.7416828 0.079013