# file location
fL<-"https://math.montana.edu/shancock/data/Framingham.txt"
#
dta <- read.table(fL, header=T)
# recode gender variable
dta$sex <- factor(dta$sex,
levels=c(1, 2),
labels=c("M", "F"))
# lattice plot
library(lattice)
xyplot(sbp ~ dbp | sex,
data=dta, cex=.5,
type=c("p","g","r"),
xlab="Diastolic pressure (mmHg)",
ylab="Systolic pressure (mmHg)")
m0 <- lm(sbp ~ dbp, data=dta)
m1 <- lm(sbp ~ dbp + sex, data=dta)
m2 <- lm(sbp ~ dbp + sex + sex:dbp, data=dta)
anova(m0, m1, m2)
Analysis of Variance Table
Model 1: sbp ~ dbp
Model 2: sbp ~ dbp + sex
Model 3: sbp ~ dbp + sex + sex:dbp
Res.Df RSS Df Sum of Sq F Pr(>F)
1 4697 941778
2 4696 927853 1 13925 71.8 <2e-16
3 4695 910543 1 17310 89.3 <2e-16
plot(rstandard(m2) ~ fitted(m2),
cex=.5,
xlab="Fitted values",
ylab="Standardized residuals")
grid()
options(digits=5, show.signif.stars=FALSE)
pacman::p_load(alr4, tidyverse, magrittr)
# make sure data set is active
data(Stevens, package="alr4")
# save a copy
dta <- Stevens %>%
rename(Length=y, Loudness=loudness, Subject=subject)
# 1 regression fits all
ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
geom_point() +
geom_line() +
stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
coord_cartesian(ylim = c(-1, 5)) +
labs(x = "Loudness (db)", y = "Average log(Length)") +
theme(legend.position = "NONE")
# one model fits all
m0 <- lm(Length ~ Loudness, data = dta)
# tidy summary model output
broom::tidy(summary(m0))
# A tibble: 2 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -3.20 0.344 -9.31 2.53e-12
2 Loudness 0.0790 0.00482 16.4 2.69e-21
# 1 regression model per individual
ggplot(dta, aes(Loudness, Length, group = Subject, color = Subject))+
geom_point() +
stat_smooth(method = "lm", se = F, size = rel(.5)) +
scale_color_grey()+
coord_cartesian(ylim = c(-1, 5)) +
labs(x = "Loudness (db)", y = "Average log(Length)") +
theme(legend.position = "NONE")
# fancy way of conducting 1 regression model per subject
# grouped by subject before doing regression
dta %>%
group_by(Subject) %>%
do(broom::tidy(lm(Length ~ Loudness, data = .)))
# A tibble: 20 x 6
# Groups: Subject [10]
Subject term estimate std.error statistic p.value
<fct> <chr> <dbl> <dbl> <dbl> <dbl>
1 A (Intercept) -4.49 0.748 -6.01 0.00923
2 A Loudness 0.103 0.0105 9.80 0.00225
3 B (Intercept) -4.59 0.462 -9.92 0.00218
4 B Loudness 0.0935 0.00647 14.5 0.000718
5 C (Intercept) -2.29 0.253 -9.05 0.00285
6 C Loudness 0.0695 0.00355 19.6 0.000291
7 D (Intercept) -2.92 0.338 -8.63 0.00327
8 D Loudness 0.0751 0.00474 15.8 0.000547
9 E (Intercept) -5.39 0.743 -7.24 0.00542
10 E Loudness 0.104 0.0104 9.98 0.00214
11 F (Intercept) -2.42 0.523 -4.63 0.0190
12 F Loudness 0.0774 0.00732 10.6 0.00181
13 G (Intercept) -2.81 0.312 -9.01 0.00288
14 G Loudness 0.0650 0.00437 14.9 0.000659
15 H (Intercept) -3.42 0.401 -8.53 0.00338
16 H Loudness 0.0822 0.00561 14.6 0.000691
17 I (Intercept) -1.95 0.512 -3.80 0.0319
18 I Loudness 0.0656 0.00718 9.14 0.00277
19 J (Intercept) -1.76 0.265 -6.63 0.00699
20 J Loudness 0.0552 0.00372 14.9 0.000660
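# note: dplyr::do() is superseded; an equivalent sketch using group_modify()
# (assuming dplyr >= 1.0) is
dta %>%
group_by(Subject) %>%
group_modify(~ broom::tidy(lm(Length ~ Loudness, data = .x)))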
# fitting a model treating subject as fixed effects
summary(m1 <- lm(Length ~ Subject/Loudness - 1, data = dta))
Call:
lm(formula = Length ~ Subject/Loudness - 1, data = dta)
Residuals:
Min 1Q Median 3Q Max
-0.3318 -0.1073 0.0023 0.1485 0.3242
Coefficients:
Estimate Std. Error t value Pr(>|t|)
SubjectA -4.49370 0.48664 -9.23 2.8e-10
SubjectB -4.58530 0.48664 -9.42 1.8e-10
SubjectC -2.29360 0.48664 -4.71 5.2e-05
SubjectD -2.91940 0.48664 -6.00 1.4e-06
SubjectE -5.38510 0.48664 -11.07 4.1e-12
SubjectF -2.42120 0.48664 -4.98 2.5e-05
SubjectG -2.81090 0.48664 -5.78 2.6e-06
SubjectH -3.42070 0.48664 -7.03 8.2e-08
SubjectI -1.94870 0.48664 -4.00 0.00038
SubjectJ -1.75910 0.48664 -3.61 0.00109
SubjectA:Loudness 0.10265 0.00681 15.06 1.6e-15
SubjectB:Loudness 0.09355 0.00681 13.73 1.8e-14
SubjectC:Loudness 0.06950 0.00681 10.20 2.9e-11
SubjectD:Loudness 0.07506 0.00681 11.02 4.6e-12
SubjectE:Loudness 0.10391 0.00681 15.25 1.1e-15
SubjectF:Loudness 0.07740 0.00681 11.36 2.2e-12
SubjectG:Loudness 0.06497 0.00681 9.53 1.4e-10
SubjectH:Loudness 0.08223 0.00681 12.07 4.9e-13
SubjectI:Loudness 0.06561 0.00681 9.63 1.1e-10
SubjectJ:Loudness 0.05525 0.00681 8.11 4.7e-09
Residual standard error: 0.215 on 30 degrees of freedom
Multiple R-squared: 0.996, Adjusted R-squared: 0.993
F-statistic: 369 on 20 and 30 DF, p-value: <2e-16
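# note (a sketch): the nested coding Subject/Loudness - 1 reproduces the
# per-subject coefficient estimates above, e.g. for subject A
coef(lm(Length ~ Loudness, data = subset(dta, Subject == "A")))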
# augment data with model fitted values
dta <- dta %>% mutate(fit1 = fitted(m1))
# 1 regression model for all with individual parameters
ggplot(dta, aes(Loudness, fit1, group = Subject, alpha = Subject))+
geom_path() +
geom_point(aes(Loudness, Length, group = Subject, alpha = Subject)) +
coord_cartesian(ylim = c(-1, 5)) +
labs(x = "Loudness (dB)", y = "Average log(Length)") +
theme(legend.position = "none")
library(Brq)
data(USgirl, package="Brq")
plot(USgirl, pch='.', bty="n",
xlim=c(0, 25),
ylim=c(0, 150),
xlab="Age (in years)",
ylab="Body weight (in kg)")
ggplot(USgirl, aes(Age, Weight)) +
geom_point(alpha=.5, size=rel(.3)) +
geom_quantile(quantiles=1:9/10,
formula=y ~ x)+
labs(x="Age (in years)",
y="Body weight (in kg)") +
theme_minimal()
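# note: geom_quantile() fits quantile regression lines via the quantreg package;
# a single median (tau = 0.5) fit could be obtained directly, as a sketch
quantreg::rq(Weight ~ Age, tau = 0.5, data = USgirl)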
# Data
library(MASS)
data(cats, package="MASS")
# structure of data
str(cats)
'data.frame': 144 obs. of 3 variables:
$ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
$ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
$ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
head(cats)
Sex Bwt Hwt
1 F 2.0 7.0
2 F 2.0 7.4
3 F 2.0 9.5
4 F 2.1 7.2
5 F 2.1 7.3
6 F 2.1 7.6
library(lattice)
# conditional by gender - panels
xyplot(Hwt ~ Bwt | Sex, data=cats, type=c("p","g","r"),
xlab="Body weight (kg)",
ylab="Heart weight (g)")
# grouped by sex - in one panel
xyplot(Hwt ~ Bwt, groups=Sex, data=cats, type=c("p","g","r"),
xlab="Body weight (Kilograms)",
ylab="Heart weight (Grams)",
auto.key=TRUE)
summary(cats_1 <- lm(Hwt ~ Bwt, data=cats))
Call:
lm(formula = Hwt ~ Bwt, data = cats)
Residuals:
Min 1Q Median 3Q Max
-3.569 -0.963 -0.092 1.043 5.124
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.357 0.692 -0.52 0.61
Bwt 4.034 0.250 16.12 <2e-16
Residual standard error: 1.45 on 142 degrees of freedom
Multiple R-squared: 0.647, Adjusted R-squared: 0.644
F-statistic: 260 on 1 and 142 DF, p-value: <2e-16
The fitted model is
Hwt = -0.36 + 4.03 Bwt
Next we include the cats' sex in the regression model. The term ‘Sex’ is a factor with two levels. R's default dummy coding follows alphanumeric order, so ‘F’ (of the levels ‘F’ and ‘M’) is treated as the reference level.
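If a different reference level were preferred, it could be inspected or changed; a minimal sketch (not run here, so the analyses below are unchanged):
contrasts(cats$Sex)              # shows the dummy coding, with 'F' as the reference
# relevel(cats$Sex, ref = "M")   # would make 'M' the reference level instead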
summary(cats_2 <- lm(Hwt ~ Bwt + Sex, data=cats))
Call:
lm(formula = Hwt ~ Bwt + Sex, data = cats)
Residuals:
Min 1Q Median 3Q Max
-3.583 -0.970 -0.095 1.043 5.102
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.4150 0.7273 -0.57 0.57
Bwt 4.0758 0.2948 13.83 <2e-16
SexM -0.0821 0.3040 -0.27 0.79
Residual standard error: 1.46 on 141 degrees of freedom
Multiple R-squared: 0.647, Adjusted R-squared: 0.642
F-statistic: 129 on 2 and 141 DF, p-value: <2e-16
The fitted model is
Hwt = -0.42 + 4.08 Bwt for female cats,
Hwt = (-0.42 - 0.08) + 4.08 Bwt for male cats
The above model allows different intercepts for female and male cats but assumes the same slope for both. To allow the slopes to differ by sex as well, we add the interaction term ‘Bwt:Sex’ to the model.
# For convenience: lm(Hwt ~ Bwt*Sex, data=cats)
summary(cats_3 <- lm(Hwt ~ Bwt + Sex + Bwt:Sex, data=cats))
Call:
lm(formula = Hwt ~ Bwt + Sex + Bwt:Sex, data = cats)
Residuals:
Min 1Q Median 3Q Max
-3.773 -1.012 -0.120 0.927 4.865
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.981 1.843 1.62 0.10796
Bwt 2.636 0.776 3.40 0.00088
SexM -4.165 2.062 -2.02 0.04526
Bwt:SexM 1.676 0.837 2.00 0.04722
Residual standard error: 1.44 on 140 degrees of freedom
Multiple R-squared: 0.657, Adjusted R-squared: 0.649
F-statistic: 89.2 on 3 and 140 DF, p-value: <2e-16
The fitted model is
Hwt = 2.98 + 2.64 Bwt for female cats,
Hwt = (2.98 - 4.17) + (2.64+1.68) Bwt for male cats
The statistically significant interaction term ‘Bwt:Sex’ means that the rate of change in heart weight with body weight is larger for male cats than for female cats.
The three models above are nested: each later model reduces to the earlier one when terms are removed. We can compare them with F-tests in the full-versus-reduced model framework via the ‘anova’ function in R.
anova(cats_1, cats_2, cats_3)
Analysis of Variance Table
Model 1: Hwt ~ Bwt
Model 2: Hwt ~ Bwt + Sex
Model 3: Hwt ~ Bwt + Sex + Bwt:Sex
Res.Df RSS Df Sum of Sq F Pr(>F)
1 142 300
2 141 299 1 0.15 0.07 0.785
3 140 291 1 8.33 4.01 0.047
We see that model 2 does not improve on model 1, but model 3 does improve on model 2; the difference is the sex-by-body-weight interaction term. The F-test gives the same p-value as the t-test for that term in the model 3 summary. This is not a coincidence: an F statistic with one numerator degree of freedom is the square of the corresponding t statistic.
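A quick numerical check, as a sketch using the fitted objects above:
# F for the 1-df interaction term equals the squared t statistic from the model 3 summary
anova(cats_2, cats_3)$F[2]
summary(cats_3)$coefficients["Bwt:SexM", "t value"]^2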
Conclusions
For the cats data example, the linear relationship between heart weight and body weight is positive and stronger for male cats than for female cats. For each additional kilogram of body weight, heart weight increases by about \(4.31 \pm 0.31\) grams for male cats and about \(2.64 \pm 0.78\) grams for female cats.
Note that the standard error for the slope estimate for male cats is smaller than that for female cats. Because the male slope is the sum of the ‘Bwt’ and ‘Bwt:SexM’ coefficients, its variance is \(\mathrm{Var}(b_{Bwt}) + \mathrm{Var}(b_{Bwt:SexM}) + 2\,\mathrm{Cov}(b_{Bwt}, b_{Bwt:SexM})\); it can be obtained from the coefficient covariance matrix as follows:
show(vm <- vcov(cats_3)[c(2,4), c(2,4)])
Bwt Bwt:SexM
Bwt 0.60202 -0.60202
Bwt:SexM -0.60202 0.70111
sqrt(vm[1,1]+vm[2,2]+2*vm[1,2])
[1] 0.31479
We can also get a rough estimate of this standard error by considering the data for male cats alone.
summary(lm(Hwt ~ Bwt, data=cats, subset=Sex=='M'))
Call:
lm(formula = Hwt ~ Bwt, data = cats, subset = Sex == "M")
Residuals:
Min 1Q Median 3Q Max
-3.773 -1.048 -0.298 0.984 4.865
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.184 0.998 -1.19 0.24
Bwt 4.313 0.340 12.69 <2e-16
Residual standard error: 1.56 on 95 degrees of freedom
Multiple R-squared: 0.629, Adjusted R-squared: 0.625
F-statistic: 161 on 1 and 95 DF, p-value: <2e-16
Further questions
What is the difference between the slope estimate for male cats in Model 3 used earlier and the one from the analysis of the male cats alone?
Why is it that the standard error for the slope estimate of female cats is much larger than that for male cats?
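As a hint for the second question, a parallel fit for the female cats alone can be compared with the male-only fit above (a sketch; output not shown):
summary(lm(Hwt ~ Bwt, data=cats, subset=Sex=='F'))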
# set some options
options(digits=3, show.signif.stars=FALSE)
# package management
# install.packages("pacman")
# load packages
pacman::p_load(alr4, tidyverse)
# load data
data(UN11, package="alr4")
# seed the random number generator to get the same sample
set.seed(6102)
# pick 81 countries from three regions
# arrange the rows by alphabetical order
dta <- UN11 %>%
filter(region %in% c("Africa", "Asia", "Europe")) %>%
sample_n(81) %>%
arrange(region)
# first 6 lines of data frame
head(dta)
region group fertility ppgdp lifeExpF pctUrban
1 Africa africa 3.99 1333 65.8 52
2 Africa africa 2.34 11451 78.0 56
3 Africa africa 3.19 12469 64.3 86
4 Africa africa 2.41 11321 77.9 78
5 Africa africa 5.08 741 58.7 42
6 Africa africa 5.75 520 57.0 27
# data dimensions - rows and columns
dim(dta)
[1] 81 6
# how many countries in each of the three regions
R3 <- table(dta$region)
# sampling fraction of countries selected from each of the three regions
w <- R3/table(UN11$region)
# add the sampling weights variable to data
# skip over countries in regions not selected
dta$wt <- rep(1/w[w != 0], R3[R3 != 0])
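# sanity check (a sketch): the weights should add up to the total number of
# countries in the three sampled regions
sum(dta$wt)
sum(table(UN11$region)[c("Africa", "Asia", "Europe")])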
# simple regression
summary(m0 <- lm(fertility ~ log(ppgdp), data=dta))
Call:
lm(formula = fertility ~ log(ppgdp), data = dta)
Residuals:
Min 1Q Median 3Q Max
-2.2686 -0.7716 0.0497 0.6811 2.6292
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.313 0.575 14.46 < 2e-16
log(ppgdp) -0.652 0.068 -9.58 7.2e-15
Residual standard error: 1.07 on 79 degrees of freedom
Multiple R-squared: 0.537, Adjusted R-squared: 0.532
F-statistic: 91.8 on 1 and 79 DF, p-value: 7.15e-15
# weighted regression
summary(m1 <- update(m0, weights=wt))
Call:
lm(formula = fertility ~ log(ppgdp), data = dta, weights = wt)
Weighted Residuals:
Min 1Q Median 3Q Max
-3.031 -1.001 0.063 0.921 3.425
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.210 0.577 14.22 < 2e-16
log(ppgdp) -0.642 0.068 -9.44 1.3e-14
Residual standard error: 1.42 on 79 degrees of freedom
Multiple R-squared: 0.53, Adjusted R-squared: 0.524
F-statistic: 89.1 on 1 and 79 DF, p-value: 1.35e-14
# plot
ggplot(dta,
aes(log(ppgdp), fertility, label=region)) +
stat_smooth(method="lm", formula=y ~ x, se=F, col="peru", lwd=rel(.5)) +
stat_smooth(aes(weight=wt), method="lm", formula=y ~ x, se=F, lwd=rel(.5), col="gray")+
geom_text(check_overlap=TRUE, size=rel(2.3), aes(color=region))+
labs(x="GDP (US$ in log unit)",
y="Number of children per woman") +
theme_minimal() +
theme(legend.position="none")
# The end
# load packages
pacman::p_load(languageR, lattice, tidyverse, nlme, lmtest)
# load data
data(heid, package="languageR")
# see first 6 lines of data
head(heid)
Subject Word RT BaseFrequency
1 pp1 basaalheid 6.69 3.56
2 pp1 markantheid 6.81 5.16
3 pp1 ontroerdheid 6.51 5.55
4 pp1 contentheid 6.58 4.50
5 pp1 riantheid 6.86 4.53
6 pp1 tembaarheid 6.35 0.00
# check data structure
str(heid)
'data.frame': 832 obs. of 4 variables:
$ Subject : Factor w/ 26 levels "pp1","pp10","pp11",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Word : Factor w/ 40 levels "aftandsheid",..: 4 24 28 10 33 38 25 5 16 30 ...
$ RT : num 6.69 6.81 6.51 6.58 6.86 6.35 6.69 7.17 6.58 6.98 ...
$ BaseFrequency: num 3.56 5.16 5.55 4.5 4.53 0 1.61 3.61 2.56 5.86 ...
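# add a trial index (1, 2, ...) within each subject
# (assumes rows are grouped by subject)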
heid$Trial <- as.numeric(unlist(apply(as.matrix(table(heid$Subject)), 1, seq)))
# select a subset of variables for current example
dta_ex2 <- heid[,-2]
# word frequency effect
ggplot(data = dta_ex2, aes(x = BaseFrequency, y = RT)) +
geom_point() +
stat_smooth(method="lm") +
labs(x = "Word Frequency", y = "Reaction Time (transformed)") +
theme_bw()
# trial effect by subject
ggplot(data = dta_ex2, aes(x = Trial, y = RT)) +
geom_point(pch =1, size = rel(.5)) +
geom_line() +
facet_wrap(~ Subject) +
labs(x = "Trial ID", y = "Reaction Time (transformed)")
# autocorrelation by individual
acf.fnc(dta_ex2, group = "Subject", time = "Trial", x = "RT", plot = TRUE)
# simple linear model ignoring trial dependence - least-squares
summary(m0 <- lm(RT ~ BaseFrequency, data = dta_ex2))
Call:
lm(formula = RT ~ BaseFrequency, data = dta_ex2)
Residuals:
Min 1Q Median 3Q Max
-0.5166 -0.2158 -0.0525 0.1599 0.8903
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.6417 0.0207 320.39 <2e-16
BaseFrequency -0.0155 0.0047 -3.29 0.001
Residual standard error: 0.282 on 830 degrees of freedom
Multiple R-squared: 0.0129, Adjusted R-squared: 0.0117
F-statistic: 10.8 on 1 and 830 DF, p-value: 0.00104
# MLE of the model above
summary(m0a <- gls(RT ~ BaseFrequency, data = dta_ex2, method = "ML"))
Generalized least squares fit by maximum likelihood
Model: RT ~ BaseFrequency
Data: dta_ex2
AIC BIC logLik
258 273 -126
Coefficients:
Value Std.Error t-value p-value
(Intercept) 6.64 0.0207 320 0.000
BaseFrequency -0.02 0.0047 -3 0.001
Correlation:
(Intr)
BaseFrequency -0.882
Standardized residuals:
Min Q1 Med Q3 Max
-1.835 -0.766 -0.186 0.568 3.161
Residual standard error: 0.282
Degrees of freedom: 832 total; 830 residual
# simple linear model with autocorrelation error using gls
summary(m1 <- update(m0a, correlation = corAR1(form = ~ Trial | Subject)))
Generalized least squares fit by maximum likelihood
Model: RT ~ BaseFrequency
Data: dta_ex2
AIC BIC logLik
64.3 83.2 -28.1
Correlation Structure: AR(1)
Formula: ~Trial | Subject
Parameter estimate(s):
Phi
0.462
Coefficients:
Value Std.Error t-value p-value
(Intercept) 6.64 0.02193 302.5 0e+00
BaseFrequency -0.01 0.00387 -3.6 4e-04
Correlation:
(Intr)
BaseFrequency -0.699
Standardized residuals:
Min Q1 Med Q3 Max
-1.835 -0.765 -0.184 0.586 3.159
Residual standard error: 0.281
Degrees of freedom: 832 total; 830 residual
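# note (a sketch): under the fitted AR(1) structure, the implied residual
# correlation between trials k apart within a subject is Phi^k
0.462^(1:3)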
# compare models
anova(m0a, m1)
Model df AIC BIC logLik Test L.Ratio p-value
m0a 1 3 258.5 272.6 -126.2
m1 2 4 64.3 83.2 -28.1 1 vs 2 196 <.0001
# collect residuals from the two models above
dta_ex2 <- dta_ex2 %>% mutate(res0 = rstandard(m0),
res1 = resid(m1, type = "normalized"))
# Durbin-Watson test of serial correlation
dwtest(res0 ~ BaseFrequency, data = dta_ex2)
Durbin-Watson test
data: res0 ~ BaseFrequency
DW = 1, p-value <2e-16
alternative hypothesis: true autocorrelation is greater than 0
#
dwtest(res1 ~ BaseFrequency, data = dta_ex2)
Durbin-Watson test
data: res1 ~ BaseFrequency
DW = 2, p-value = 1
alternative hypothesis: true autocorrelation is greater than 0
# check acf for residuals
acf.fnc(dta_ex2, group = "Subject", time = "Trial", x = "res1", plot = TRUE)
# regression with correlations over time and in clusters
# set number of significant digits for printing
# don't print silly stars
options(digits=4, show.signif.stars=FALSE)
pacman::p_load(tidyverse, nlme, broom, magrittr, MuMIn)
dta_ex3<- read.table('girl_height.csv', header=T, sep=",")
str(dta_ex3)
'data.frame': 100 obs. of 4 variables:
$ Height: num 111 116 122 126 130 ...
$ Girl : chr "G1" "G1" "G1" "G1" ...
$ Age : int 6 7 8 9 10 6 7 8 9 10 ...
$ Mother: chr "S" "S" "S" "S" ...
class(dta_ex3)
[1] "data.frame"
head(dta_ex3)
Height Girl Age Mother
1 111.0 G1 6 S
2 116.4 G1 7 S
3 121.7 G1 8 S
4 126.3 G1 9 S
5 130.5 G1 10 S
6 110.0 G2 6 S
# scatterplot matrix of the data
plot(dta_ex3)
# compute correlations
cor(dta_ex3$Age, dta_ex3$Height)
[1] 0.8551
# get variance
library(dplyr)
dta_ex3 %>% dplyr::select(Height, Age)%>% var %>% diag
Height Age
90.28 2.02
# one regression line fits all
ggplot(dta_ex3, aes(x = Age, y = Height, color = Mother)) +
geom_point() +
stat_smooth(aes(group = 1), method = "lm") +
labs(x = "Age (scaled)", y = "Height (cm)") +
theme(legend.position = "NONE")
# ordinary regression
m0 <- gls(Height ~ Age, data = dta_ex3)
# regression with autocorrelation
m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML")
# regression with unequal variances for occasions
m2 <- update(m0, weights = varIdent(form = ~ 1 | Mother))
# regression with autocorrelation and unequal variances
m3 <- update(m1, weights = varIdent(form = ~ 1 | Mother))
# model selection with AICc
model.sel(m0, m1, m2, m3)
Model selection table
(Intrc) Age family correlation weights REML
m3 82.47 5.558 gaussian(identity) corAR1(~1|Girl) varIdent(~1|Mother) FALSE
m1 82.47 5.684 gaussian(identity) corAR1(~1|Girl) FALSE
m2 82.46 5.549 gaussian(identity) varIdent(~1|Mother)
m0 82.52 5.716 gaussian(identity)
df logLik AICc delta weight
m3 6 -162.8 338.5 0.00 1
m1 4 -172.7 353.8 15.26 0
m2 5 -292.0 594.6 256.14 0
m0 3 -300.8 607.8 269.27 0
Models ranked by AICc(x)
# summary model output
lapply(list(m0, m1, m2, m3), summary)
[[1]]
Generalized least squares fit by REML
Model: Height ~ Age
Data: dta_ex3
AIC BIC logLik
607.5 615.3 -300.8
Coefficients:
Value Std.Error t-value p-value
(Intercept) 82.52 2.8439 29.02 0
Age 5.72 0.3501 16.33 0
Correlation:
(Intr)
Age -0.985
Standardized residuals:
Min Q1 Med Q3 Max
-1.8561 -0.7402 -0.1696 0.7974 2.6348
Residual standard error: 4.951
Degrees of freedom: 100 total; 98 residual
[[2]]
Generalized least squares fit by maximum likelihood
Model: Height ~ Age
Data: dta_ex3
AIC BIC logLik
353.3 363.8 -172.7
Correlation Structure: AR(1)
Formula: ~1 | Girl
Parameter estimate(s):
Phi
0.9782
Coefficients:
Value Std.Error t-value p-value
(Intercept) 82.47 1.380 59.75 0
Age 5.68 0.111 51.21 0
Correlation:
(Intr)
Age -0.643
Standardized residuals:
Min Q1 Med Q3 Max
-1.8432 -0.7005 -0.1174 0.8831 2.7934
Residual standard error: 4.781
Degrees of freedom: 100 total; 98 residual
[[3]]
Generalized least squares fit by REML
Model: Height ~ Age
Data: dta_ex3
AIC BIC logLik
594 606.9 -292
Variance function:
Structure: Different standard deviations per stratum
Formula: ~1 | Mother
Parameter estimates:
S M T
1.0000 0.8014 1.8171
Coefficients:
Value Std.Error t-value p-value
(Intercept) 82.46 2.3331 35.34 0
Age 5.55 0.2872 19.32 0
Correlation:
(Intr)
Age -0.985
Standardized residuals:
Min Q1 Med Q3 Max
-1.88260 -0.64836 0.07568 0.75427 2.40785
Residual standard error: 3.961
Degrees of freedom: 100 total; 98 residual
[[4]]
Generalized least squares fit by maximum likelihood
Model: Height ~ Age
Data: dta_ex3
AIC BIC logLik
337.6 353.2 -162.8
Correlation Structure: AR(1)
Formula: ~1 | Girl
Parameter estimate(s):
Phi
0.9787
Variance function:
Structure: Different standard deviations per stratum
Formula: ~1 | Mother
Parameter estimates:
S M T
1.0000 0.6923 1.5494
Coefficients:
Value Std.Error t-value p-value
(Intercept) 82.47 1.1305 72.95 0
Age 5.56 0.0903 61.55 0
Correlation:
(Intr)
Age -0.639
Standardized residuals:
Min Q1 Med Q3 Max
-1.77189 -0.71727 0.06484 0.80091 2.56095
Residual standard error: 4.265
Degrees of freedom: 100 total; 98 residual
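# note (a sketch): intervals() gives approximate confidence intervals for the
# AR(1) parameter and the per-stratum variance ratios of the selected model m3
intervals(m3)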
# autocorrelations
acf(resid(m0, type = "normalized"), main = "m0")
acf(resid(m1, type = "normalized"), main = "m1")
acf(resid(m2, type = "normalized"), main = "m2")
acf(resid(m3, type = "normalized"), main = "m3")
# The end
#Data
dta_ex4<- read.table("course_eval.txt", header=T)
dta_ex4$Course <- as.factor(dta_ex4$Course)
head(dta_ex4)
Score Beauty Gender Age Minority Tenure Course
1 4.3 0.2016 1 36 1 0 3
2 4.5 -0.8261 0 59 0 1 0
3 3.7 -0.6603 0 51 0 1 4
4 4.3 -0.7663 1 40 0 1 2
5 4.4 1.4214 1 31 0 0 0
6 4.2 0.5002 0 62 0 1 0
library(lattice)
xyplot(Score ~ Beauty | Course, data=dta_ex4,
ylab="Average course evaluation score",
xlab="Beauty judgment score",
type=c("p", "g", "r"))
dtag <- # create a groupedData object from the data frame
groupedData(Score ~ Beauty | Course,
data=as.data.frame( dta_ex4 ),
FUN=mean,
labels=list(x="Beauty score",
y="Couse evaluation score" ))
plot(dtag, asp=1)
#
m0 <- lm(Score ~ Beauty, data=dta_ex4)
summary(m0)
Call:
lm(formula = Score ~ Beauty, data = dta_ex4)
Residuals:
Min 1Q Median 3Q Max
-1.8002 -0.3630 0.0725 0.4021 1.1037
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.0100 0.0255 157.21 < 2e-16
Beauty 0.1330 0.0322 4.13 4.2e-05
Residual standard error: 0.545 on 461 degrees of freedom
Multiple R-squared: 0.0357, Adjusted R-squared: 0.0336
F-statistic: 17.1 on 1 and 461 DF, p-value: 4.25e-05
#
with(dta_ex4, plot(Beauty, Score, bty="n",
xlab="Beauty score",
ylab="Average course evaluation"))
grid()
abline(m0)
#
t.test(coef(lmList(Score ~ Beauty | Course, data = dta_ex4))["Beauty"])
One Sample t-test
data: coef(lmList(Score ~ Beauty | Course, data = dta_ex4))["Beauty"]
t = 0.54, df = 30, p-value = 0.6
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
-2.188 3.769
sample estimates:
mean of x
0.7905
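# note (a sketch): the wide confidence interval reflects a few extreme per-course
# slope estimates from courses with very few observations (e.g. Course16, Course25)
summary(coef(lmList(Score ~ Beauty | Course, data = dta_ex4))[, "Beauty"])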
#
m1 <- lm(Score ~ Course/Beauty - 1, data=dta_ex4)
#
summary(m1)
Call:
lm(formula = Score ~ Course/Beauty - 1, data = dta_ex4)
Residuals:
Min 1Q Median 3Q Max
-1.8108 -0.2739 0.0135 0.3314 1.0935
Coefficients:
Estimate Std. Error t value Pr(>|t|)
Course0 4.0316 0.0304 132.83 < 2e-16
Course1 3.7049 1.2199 3.04 0.00254
Course2 4.4852 0.4880 9.19 < 2e-16
Course3 3.8806 0.2535 15.31 < 2e-16
Course4 3.8359 0.1207 31.79 < 2e-16
Course5 4.0863 0.3073 13.30 < 2e-16
Course6 4.2195 0.3775 11.18 < 2e-16
Course7 3.4491 0.4324 7.98 1.6e-14
Course8 5.0785 1.0201 4.98 9.5e-07
Course9 3.9284 0.2150 18.27 < 2e-16
Course10 4.8393 1.8830 2.57 0.01053
Course11 4.1322 0.4060 10.18 < 2e-16
Course12 3.3081 0.6667 4.96 1.0e-06
Course13 4.1574 0.6147 6.76 4.8e-11
Course14 3.6171 0.4442 8.14 4.9e-15
Course15 2.4200 0.4187 5.78 1.5e-08
Course16 -16.9734 22.0939 -0.77 0.44280
Course17 4.4669 0.2982 14.98 < 2e-16
Course18 4.2042 0.3147 13.36 < 2e-16
Course19 3.5362 0.2520 14.03 < 2e-16
Course20 4.4540 0.4957 8.99 < 2e-16
Course21 3.6620 0.1425 25.70 < 2e-16
Course22 3.7286 0.2064 18.06 < 2e-16
Course23 3.5310 0.5096 6.93 1.7e-11
Course24 3.7810 0.3552 10.65 < 2e-16
Course25 20.7529 13.6132 1.52 0.12818
Course26 4.2462 0.3138 13.53 < 2e-16
Course27 4.5522 0.6720 6.77 4.5e-11
Course28 3.6051 1.3974 2.58 0.01024
Course29 3.8039 0.3909 9.73 < 2e-16
Course30 4.3076 0.3272 13.17 < 2e-16
Course0:Beauty 0.1462 0.0378 3.87 0.00013
Course1:Beauty 0.9109 1.3378 0.68 0.49632
Course2:Beauty 0.2416 0.8978 0.27 0.78795
Course3:Beauty -0.0458 1.4114 -0.03 0.97412
Course4:Beauty 0.5401 0.2909 1.86 0.06405
Course5:Beauty 0.2275 0.4206 0.54 0.58888
Course6:Beauty 0.7948 0.6393 1.24 0.21453
Course7:Beauty -0.1954 0.4447 -0.44 0.66051
Course8:Beauty 1.8980 1.4103 1.35 0.17913
Course9:Beauty -1.9625 0.7006 -2.80 0.00534
Course10:Beauty 0.5348 1.9240 0.28 0.78117
Course11:Beauty -0.4049 0.5014 -0.81 0.41987
Course12:Beauty -1.9471 1.6706 -1.17 0.24452
Course13:Beauty 0.5055 0.9294 0.54 0.58682
Course14:Beauty -0.1386 0.8917 -0.16 0.87659
Course15:Beauty -0.3753 0.5578 -0.67 0.50140
Course16:Beauty -18.2149 19.1411 -0.95 0.34187
Course17:Beauty 0.3354 0.4471 0.75 0.45349
Course18:Beauty -0.1303 0.4932 -0.26 0.79172
Course19:Beauty -0.3202 0.3246 -0.99 0.32451
Course20:Beauty 0.0692 0.8893 0.08 0.93801
Course21:Beauty -0.0380 0.1891 -0.20 0.84096
Course22:Beauty 0.1641 0.2424 0.68 0.49881
Course23:Beauty -1.3347 0.7649 -1.74 0.08178
Course24:Beauty 0.0756 0.9732 0.08 0.93811
Course25:Beauty 40.5972 32.6559 1.24 0.21453
Course26:Beauty 0.2253 0.3363 0.67 0.50332
Course27:Beauty 0.9815 1.2156 0.81 0.41987
Course28:Beauty 0.7384 0.9699 0.76 0.44693
Course29:Beauty 0.4722 0.2924 1.61 0.10711
Course30:Beauty 0.1544 0.2311 0.67 0.50451
Residual standard error: 0.525 on 401 degrees of freedom
Multiple R-squared: 0.985, Adjusted R-squared: 0.983
F-statistic: 434 on 62 and 401 DF, p-value: <2e-16
#
# per-course Beauty slopes with p < 0.05
b <- summary(m1)$coef[-c(1:31), ]
b[b[, 4] < 0.05, ]
Estimate Std. Error t value Pr(>|t|)
Course0:Beauty 0.1462 0.0378 3.87 0.00013
Course9:Beauty -1.9625 0.7006 -2.80 0.00534
# The end