Homework

In class exercise 1

# file location
fL<-"https://math.montana.edu/shancock/data/Framingham.txt"
# 
dta <- read.table(fL, header=T)

# recode gender variable
dta$sex <- factor(dta$sex, 
                  levels=c(1, 2), 
                  labels=c("M", "F"))

# lattice plot
library(lattice)
xyplot(sbp ~ dbp | sex, 
       data=dta, cex=.5,
       type=c("p","g","r"),
       xlab="Diastolic pressure (mmHg)",
       ylab="Systolic pressure (mmHg)")

m0 <- lm(sbp ~ dbp, data=dta)
m1 <- lm(sbp ~ dbp + sex, data=dta)
m2 <- lm(sbp ~ dbp + sex + sex:dbp, data=dta)
anova(m0, m1, m2)

Analysis of Variance Table

Model 1: sbp ~ dbp
Model 2: sbp ~ dbp + sex
Model 3: sbp ~ dbp + sex + sex:dbp
  Res.Df    RSS Df Sum of Sq    F Pr(>F)
1   4697 941778                         
2   4696 927853  1     13925 71.8 <2e-16
3   4695 910543  1     17310 89.3 <2e-16

plot(rstandard(m2) ~ fitted(m2), 
     cex=.5, 
     xlab="Fitted values", 
     ylab="Standardized residuals")
grid()

In class exercise 2

options(digits=5, show.signif.stars=FALSE)
pacman::p_load(alr4, tidyverse, magrittr)
# make sure data set is active
data(Stevens, package="alr4")
# save a copy
dta <- Stevens %>% 
 rename(Length=y, Loudness=loudness, Subject=subject)

# 1 regression fits all
ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
 geom_point() +
 geom_line() +
 stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

# one model fits all
m0 <- lm(Length ~ Loudness, data = dta)
# tidy summary model output
broom::tidy(summary(m0))

# A tibble: 2 x 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)  -3.20     0.344       -9.31 2.53e-12
2 Loudness      0.0790   0.00482     16.4  2.69e-21

# 1 regression model per individual 
ggplot(dta, aes(Loudness, Length, group = Subject, color = Subject))+
 geom_point() +
 stat_smooth(method = "lm", se = F, size = rel(.5)) +
 scale_color_grey()+
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

# fancy way of conducting 1 regression model per subject
# grouped by subject before doing regression
dta %>% 
 group_by(Subject) %>% 
 do(broom::tidy(lm(Length ~ Loudness, data = .)))

# A tibble: 20 x 6
# Groups:   Subject [10]
   Subject term        estimate std.error statistic  p.value
   <fct>   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
 1 A       (Intercept)  -4.49     0.748       -6.01 0.00923 
 2 A       Loudness      0.103    0.0105       9.80 0.00225 
 3 B       (Intercept)  -4.59     0.462       -9.92 0.00218 
 4 B       Loudness      0.0935   0.00647     14.5  0.000718
 5 C       (Intercept)  -2.29     0.253       -9.05 0.00285 
 6 C       Loudness      0.0695   0.00355     19.6  0.000291
 7 D       (Intercept)  -2.92     0.338       -8.63 0.00327 
 8 D       Loudness      0.0751   0.00474     15.8  0.000547
 9 E       (Intercept)  -5.39     0.743       -7.24 0.00542 
10 E       Loudness      0.104    0.0104       9.98 0.00214 
11 F       (Intercept)  -2.42     0.523       -4.63 0.0190  
12 F       Loudness      0.0774   0.00732     10.6  0.00181 
13 G       (Intercept)  -2.81     0.312       -9.01 0.00288 
14 G       Loudness      0.0650   0.00437     14.9  0.000659
15 H       (Intercept)  -3.42     0.401       -8.53 0.00338 
16 H       Loudness      0.0822   0.00561     14.6  0.000691
17 I       (Intercept)  -1.95     0.512       -3.80 0.0319  
18 I       Loudness      0.0656   0.00718      9.14 0.00277 
19 J       (Intercept)  -1.76     0.265       -6.63 0.00699 
20 J       Loudness      0.0552   0.00372     14.9  0.000660

# fitting a model treating subject as fixed effects
summary(m1 <- lm(Length ~ Subject/Loudness - 1, data = dta))


Call:
lm(formula = Length ~ Subject/Loudness - 1, data = dta)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3318 -0.1073  0.0023  0.1485  0.3242 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)
SubjectA          -4.49370    0.48664   -9.23  2.8e-10
SubjectB          -4.58530    0.48664   -9.42  1.8e-10
SubjectC          -2.29360    0.48664   -4.71  5.2e-05
SubjectD          -2.91940    0.48664   -6.00  1.4e-06
SubjectE          -5.38510    0.48664  -11.07  4.1e-12
SubjectF          -2.42120    0.48664   -4.98  2.5e-05
SubjectG          -2.81090    0.48664   -5.78  2.6e-06
SubjectH          -3.42070    0.48664   -7.03  8.2e-08
SubjectI          -1.94870    0.48664   -4.00  0.00038
SubjectJ          -1.75910    0.48664   -3.61  0.00109
SubjectA:Loudness  0.10265    0.00681   15.06  1.6e-15
SubjectB:Loudness  0.09355    0.00681   13.73  1.8e-14
SubjectC:Loudness  0.06950    0.00681   10.20  2.9e-11
SubjectD:Loudness  0.07506    0.00681   11.02  4.6e-12
SubjectE:Loudness  0.10391    0.00681   15.25  1.1e-15
SubjectF:Loudness  0.07740    0.00681   11.36  2.2e-12
SubjectG:Loudness  0.06497    0.00681    9.53  1.4e-10
SubjectH:Loudness  0.08223    0.00681   12.07  4.9e-13
SubjectI:Loudness  0.06561    0.00681    9.63  1.1e-10
SubjectJ:Loudness  0.05525    0.00681    8.11  4.7e-09

Residual standard error: 0.215 on 30 degrees of freedom
Multiple R-squared:  0.996, Adjusted R-squared:  0.993 
F-statistic:  369 on 20 and 30 DF,  p-value: <2e-16

# augument data with model fit
dta <- dta %>% mutate(fit1 = fitted(m1))

# 1 regression model for all with individual parameters 
ggplot(dta, aes(Loudness, fit1, group = Subject, alpha = Subject))+
 geom_path() +
 geom_point(aes(Loudness, Length, group = Subject, alpha = Subject)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (dB)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

In class exercise 3

library(Brq)
data(USgirl, package="Brq")
plot(USgirl, pch='.', bty="n",
     xlim=c(0, 25),
     ylim=c(0, 150),
     xlab="Age (in years)",
     ylab="Body weight (in kg)")

ggplot(USgirl, aes(Age, Weight)) + 
 geom_point(alpha=.5, size=rel(.3)) + 
 geom_quantile(quantiles=1:9/10, 
               formula=y ~ x)+
 labs(x="Age (in years)", 
      y="Body weight (in kg)") +
 theme_minimal()

In class exercise 4

# Data
library(MASS)
data(cats, package="MASS")
# structure of data 
str(cats)

'data.frame':   144 obs. of  3 variables:
 $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
 $ Bwt: num  2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
 $ Hwt: num  7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...

head(cats)

  Sex Bwt Hwt
1   F 2.0 7.0
2   F 2.0 7.4
3   F 2.0 9.5
4   F 2.1 7.2
5   F 2.1 7.3
6   F 2.1 7.6

Visualization

library(lattice)
# conditional by gender - panels
xyplot(Hwt ~ Bwt | Sex, data=cats, type=c("p","g","r"),
       xlab="Body weight (kg)",
       ylab="Heart weight (g)")

# grouped by sex - in one panel
xyplot(Hwt ~ Bwt, groups=Sex, data=cats, type=c("p","g","r"),
       xlab="Body weight (Kilograms)",
       ylab="Heart weight (Grams)",
       auto.key=TRUE)

# Linear regression models

summary(cats_1 <- lm(Hwt ~ Bwt, data=cats))


Call:
lm(formula = Hwt ~ Bwt, data = cats)

Residuals:
   Min     1Q Median     3Q    Max 
-3.569 -0.963 -0.092  1.043  5.124 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   -0.357      0.692   -0.52     0.61
Bwt            4.034      0.250   16.12   <2e-16

Residual standard error: 1.45 on 142 degrees of freedom
Multiple R-squared:  0.647, Adjusted R-squared:  0.644 
F-statistic:  260 on 1 and 142 DF,  p-value: <2e-16

The fitted model is

Hwt = -0.36 + 4.03 Bwt

Next we include cats gender into the regression model. The term ‘Sex’ is a factor variable having two levels. The default coding in R follows the alphanumeric rule by treating ‘F’ in (‘F’ and ‘M’ for Sex) as the reference level.

summary(cats_2 <- lm(Hwt ~ Bwt + Sex, data=cats))


Call:
lm(formula = Hwt ~ Bwt + Sex, data = cats)

Residuals:
   Min     1Q Median     3Q    Max 
-3.583 -0.970 -0.095  1.043  5.102 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  -0.4150     0.7273   -0.57     0.57
Bwt           4.0758     0.2948   13.83   <2e-16
SexM         -0.0821     0.3040   -0.27     0.79

Residual standard error: 1.46 on 141 degrees of freedom
Multiple R-squared:  0.647, Adjusted R-squared:  0.642 
F-statistic:  129 on 2 and 141 DF,  p-value: <2e-16

The fitted model is

Hwt = -0.42 + 4.08 Bwt for female cats,

Hwt = (-0.42 - 0.08) + 4.08 Bwt for male cats

The above model assumes different intercepts for female and male cats but the same slope for both. To assume also different slopes between cats gender, we add the ‘interaction’ term ‘Sex:Bwt’ to the model.

# For convenience: lm(Hwt ~ Bwt*Sex, data=cats)
summary(cats_3 <- lm(Hwt ~ Bwt + Sex + Bwt:Sex, data=cats))


Call:
lm(formula = Hwt ~ Bwt + Sex + Bwt:Sex, data = cats)

Residuals:
   Min     1Q Median     3Q    Max 
-3.773 -1.012 -0.120  0.927  4.865 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)    2.981      1.843    1.62  0.10796
Bwt            2.636      0.776    3.40  0.00088
SexM          -4.165      2.062   -2.02  0.04526
Bwt:SexM       1.676      0.837    2.00  0.04722

Residual standard error: 1.44 on 140 degrees of freedom
Multiple R-squared:  0.657, Adjusted R-squared:  0.649 
F-statistic: 89.2 on 3 and 140 DF,  p-value: <2e-16

The fitted model is

Hwt = 2.98 + 2.64 Bwt for female cats,

Hwt = (2.98 - 4.17) + (2.64+1.68) Bwt for male cats

The ‘interaction’ term ‘Sex:Bwt’ is statistically significant translates to that rate of change in heart weight in relation body weight for male cats is larger than that for female cats.

The above three models are nested models, i.e., the latter models can be simplified to former models by removing terms in them. We can compare these models by the F-test for the full-reduced model framework via the ‘anova’ command in R.

anova(cats_1, cats_2, cats_3)

Analysis of Variance Table

Model 1: Hwt ~ Bwt
Model 2: Hwt ~ Bwt + Sex
Model 3: Hwt ~ Bwt + Sex + Bwt:Sex
  Res.Df RSS Df Sum of Sq    F Pr(>F)
1    142 300                         
2    141 299  1      0.15 0.07  0.785
3    140 291  1      8.33 4.01  0.047

We see that model 2 is not different from model 1 but model 3 is different from model 2. The difference is in the gender by body weight term. The F-test gives the same p-value as in the output following the t-value of the model 3 summary. This is not a coincidence.

Conclusions

For the cats data example, the linear relationship between heart weight and body weight is positive and stronger for male cats than for female cats. For each additional increase in body weight by one kilogram, the increase in heart weight is about \(4.32 \pm 0.31\) grams for male cats and about \(2.64 \pm 0.78\) grams for female cats.

Note that the standard error for the slope coefficient estimate for male cats is smaller than that of the female cats. It can be obtained as follows:

show(vm <- vcov(cats_3)[c(2,4), c(2,4)])

              Bwt Bwt:SexM
Bwt       0.60202 -0.60202
Bwt:SexM -0.60202  0.70111

sqrt(vm[1,1]+vm[2,2]+2*vm[1,2])

[1] 0.31479

We can also get a rough estimate of this standard error by considering the data for male cats alone.

summary(lm(Hwt ~ Bwt, data=cats, subset=Sex=='M'))


Call:
lm(formula = Hwt ~ Bwt, data = cats, subset = Sex == "M")

Residuals:
   Min     1Q Median     3Q    Max 
-3.773 -1.048 -0.298  0.984  4.865 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   -1.184      0.998   -1.19     0.24
Bwt            4.313      0.340   12.69   <2e-16

Residual standard error: 1.56 on 95 degrees of freedom
Multiple R-squared:  0.629, Adjusted R-squared:  0.625 
F-statistic:  161 on 1 and 95 DF,  p-value: <2e-16

Further questions

What is the difference between the slope estimate for male cats in Model 3 we have used earlier and that in the analysis of the male cats alone?
Why is that the standard error for the slope estimate of female cats is much larger than that for male cats?

Exercise 1

# set some options
options(digits=3, show.signif.stars=FALSE)


# packagage management
# install.packages(pacman)

# load packages
pacman::p_load(alr4, tidyverse)

# load data
data(UN11, package="alr4")

# seed the random number generator to get the same sample
set.seed(6102)

# pick 81 countries from three regions 
# arrange the rows by alphabetical order
dta <- UN11 %>%
       filter(region %in% c("Africa", "Asia", "Europe")) %>%
       sample_n(81) %>%
       arrange(region)

# first 6 lines of data frame
head(dta)

  region  group fertility ppgdp lifeExpF pctUrban
1 Africa africa      3.99  1333     65.8       52
2 Africa africa      2.34 11451     78.0       56
3 Africa africa      3.19 12469     64.3       86
4 Africa africa      2.41 11321     77.9       78
5 Africa africa      5.08   741     58.7       42
6 Africa africa      5.75   520     57.0       27

# data dimensions - rows and columns
dim(dta)

[1] 81  6

# how many countries in each of the three regions
R3 <- table(dta$region)

# percentage of countries from each of the three regions selected
w <- R3/table(UN11$region)

# add the sampling weights variable to data
# skip over countries in regions not selected
dta$wt <- rep(1/w[w != 0], R3[R3 != 0])

# simple regression
summary(m0 <- lm(fertility ~ log(ppgdp), data=dta))


Call:
lm(formula = fertility ~ log(ppgdp), data = dta)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.2686 -0.7716  0.0497  0.6811  2.6292 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)    8.313      0.575   14.46  < 2e-16
log(ppgdp)    -0.652      0.068   -9.58  7.2e-15

Residual standard error: 1.07 on 79 degrees of freedom
Multiple R-squared:  0.537, Adjusted R-squared:  0.532 
F-statistic: 91.8 on 1 and 79 DF,  p-value: 7.15e-15

# weighted regression
summary(m1 <- update(m0, weights=wt))


Call:
lm(formula = fertility ~ log(ppgdp), data = dta, weights = wt)

Weighted Residuals:
   Min     1Q Median     3Q    Max 
-3.031 -1.001  0.063  0.921  3.425 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)    8.210      0.577   14.22  < 2e-16
log(ppgdp)    -0.642      0.068   -9.44  1.3e-14

Residual standard error: 1.42 on 79 degrees of freedom
Multiple R-squared:  0.53,  Adjusted R-squared:  0.524 
F-statistic: 89.1 on 1 and 79 DF,  p-value: 1.35e-14

# plot
ggplot(dta, 
       aes(log(ppgdp), fertility, label=region)) +
 stat_smooth(method="lm", formula=y ~ x, se=F, col="peru", lwd=rel(.5)) +
 stat_smooth(aes(weight=wt), method="lm", formula=y ~ x, se=F, lwd=rel(.5), col="gray")+
 geom_text(check_overlap=TRUE, size=rel(2.3), aes(color=region))+
 labs(x="GDP (US$ in log unit)", 
      y="Number of children per woman") +
 theme_minimal() +
 theme(legend.position="NONE")

# The end

Exercise 2

# load packages
pacman::p_load(languageR, lattice, tidyverse, nlme, lmtest)
# load data
data(heid, package="languageR")

# see first 6 lines of data
head(heid)

  Subject         Word   RT BaseFrequency
1     pp1   basaalheid 6.69          3.56
2     pp1  markantheid 6.81          5.16
3     pp1 ontroerdheid 6.51          5.55
4     pp1  contentheid 6.58          4.50
5     pp1    riantheid 6.86          4.53
6     pp1  tembaarheid 6.35          0.00

# check data structure
str(heid)

'data.frame':   832 obs. of  4 variables:
 $ Subject      : Factor w/ 26 levels "pp1","pp10","pp11",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Word         : Factor w/ 40 levels "aftandsheid",..: 4 24 28 10 33 38 25 5 16 30 ...
 $ RT           : num  6.69 6.81 6.51 6.58 6.86 6.35 6.69 7.17 6.58 6.98 ...
 $ BaseFrequency: num  3.56 5.16 5.55 4.5 4.53 0 1.61 3.61 2.56 5.86 ...

heid$Trial <- as.numeric(unlist(apply(as.matrix(table(heid$Subject)), 1, seq)))

# select a subset of variables for current example
dta_ex2 <- heid[,-2]
# word frequency effect
ggplot(data = dta_ex2, aes(x = BaseFrequency, y = RT)) +
 geom_point() +
 stat_smooth(method="lm") +
 labs(x = "Word Frequency", y = "Reaction Time (transformed)") +
 theme_bw()

# trial effect by subject
ggplot(data = dta_ex2, aes(x = Trial, y = RT)) +
 geom_point(pch =1, size = rel(.5)) +
 geom_line() +
 facet_wrap(~ Subject) +
 labs(x = "Trial ID", y = "Reaction Time (transformed)")

# autocorrelation by individual
acf.fnc(dta_ex2, group = "Subject", time = "Trial", x = "RT", plot = TRUE)

# simple linear model ignoring trial dependence - least-squares
summary(m0 <- lm(RT ~ BaseFrequency, data = dta_ex2))


Call:
lm(formula = RT ~ BaseFrequency, data = dta_ex2)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5166 -0.2158 -0.0525  0.1599  0.8903 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)     6.6417     0.0207  320.39   <2e-16
BaseFrequency  -0.0155     0.0047   -3.29    0.001

Residual standard error: 0.282 on 830 degrees of freedom
Multiple R-squared:  0.0129,    Adjusted R-squared:  0.0117 
F-statistic: 10.8 on 1 and 830 DF,  p-value: 0.00104

# MLE of the model above
summary(m0a <- gls(RT ~ BaseFrequency, data = dta_ex2, method = "ML"))

Generalized least squares fit by maximum likelihood
  Model: RT ~ BaseFrequency 
  Data: dta_ex2 
  AIC BIC logLik
  258 273   -126

Coefficients:
              Value Std.Error t-value p-value
(Intercept)    6.64    0.0207     320   0.000
BaseFrequency -0.02    0.0047      -3   0.001

 Correlation: 
              (Intr)
BaseFrequency -0.882

Standardized residuals:
   Min     Q1    Med     Q3    Max 
-1.835 -0.766 -0.186  0.568  3.161 

Residual standard error: 0.282 
Degrees of freedom: 832 total; 830 residual

# simple linear model with autocorrelation error using gls
summary(m1 <- update(m0a, correlation = corAR1(form = ~ Trial | Subject)))

Generalized least squares fit by maximum likelihood
  Model: RT ~ BaseFrequency 
  Data: dta_ex2 
   AIC  BIC logLik
  64.3 83.2  -28.1

Correlation Structure: AR(1)
 Formula: ~Trial | Subject 
 Parameter estimate(s):
  Phi 
0.462 

Coefficients:
              Value Std.Error t-value p-value
(Intercept)    6.64   0.02193   302.5   0e+00
BaseFrequency -0.01   0.00387    -3.6   4e-04

 Correlation: 
              (Intr)
BaseFrequency -0.699

Standardized residuals:
   Min     Q1    Med     Q3    Max 
-1.835 -0.765 -0.184  0.586  3.159 

Residual standard error: 0.281 
Degrees of freedom: 832 total; 830 residual

# compare models
anova(m0a, m1)

    Model df   AIC   BIC logLik   Test L.Ratio p-value
m0a     1  3 258.5 272.6 -126.2                       
m1      2  4  64.3  83.2  -28.1 1 vs 2     196  <.0001

# collect residuals from the two models above
dta_ex2 <- dta_ex2 %>% mutate(res0 = rstandard(m0),
                      res1 = resid(m1, type = "normalized"))

# Durbin-Watson test of serial correlation
dwtest(res0 ~ BaseFrequency, data = dta_ex2)


    Durbin-Watson test

data:  res0 ~ BaseFrequency
DW = 1, p-value <2e-16
alternative hypothesis: true autocorrelation is greater than 0

#
dwtest(res1 ~ BaseFrequency, data = dta_ex2)


    Durbin-Watson test

data:  res1 ~ BaseFrequency
DW = 2, p-value = 1
alternative hypothesis: true autocorrelation is greater than 0

# check acf for residuals
acf.fnc(dta_ex2, group = "Subject", time = "Trial", x = "res1", plot = TRUE)

Exercise 3

# regression with correlations over time and in clusters
# set significant decimal points
# don't print silly stars
options(digits=4, show.signif.stars=FALSE)
pacman::p_load(tidyverse, nlme, broom, magrittr, MuMIn)
dta_ex3<- read.table('girl_height.csv', header=T, sep=",")
str(dta_ex3)

'data.frame':   100 obs. of  4 variables:
 $ Height: num  111 116 122 126 130 ...
 $ Girl  : chr  "G1" "G1" "G1" "G1" ...
 $ Age   : int  6 7 8 9 10 6 7 8 9 10 ...
 $ Mother: chr  "S" "S" "S" "S" ...

class(dta_ex3)

[1] "data.frame"

head(dta_ex3)

  Height Girl Age Mother
1  111.0   G1   6      S
2  116.4   G1   7      S
3  121.7   G1   8      S
4  126.3   G1   9      S
5  130.5   G1  10      S
6  110.0   G2   6      S

# this data structure goes with lattice plot
plot(dta_ex3)

# compute correlations 

cor(dta_ex3$Age, dta_ex3$Height)

[1] 0.8551

# get variance
library(dplyr)
dta_ex3 %>% dplyr::select(Height, Age)%>% var %>% diag

Height    Age 
 90.28   2.02

# one regression line fits all
ggplot(dta_ex3, aes(x = Age, y = Height, color = Mother)) +
 geom_point() +
 stat_smooth(aes(group = 1), method = "lm") +
 labs(x = "Age (scaled)", y = "Height (cm)") +
 theme(legend.position = "NONE")

# ordinary regression
m0 <- gls(Height ~ Age, data = dta_ex3)

# regression with autocorrelation
m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML")

# regression with unequal variances for occasions
m2 <- update(m0, weights = varIdent(form = ~ 1 | Mother))
                          
# regression with autocorrelation and unequal variances
m3 <- update(m1, weights = varIdent(form = ~ 1 | Mother))

# model selection with AICc
model.sel(m0, m1, m2, m3)

Model selection table 
   (Intrc)   Age             family     correlation             weights  REML
m3   82.47 5.558 gaussian(identity) corAR1(~1|Girl) varIdent(~1|Mother) FALSE
m1   82.47 5.684 gaussian(identity) corAR1(~1|Girl)                     FALSE
m2   82.46 5.549 gaussian(identity)                 varIdent(~1|Mother)      
m0   82.52 5.716 gaussian(identity)                                          
   df logLik  AICc  delta weight
m3  6 -162.8 338.5   0.00      1
m1  4 -172.7 353.8  15.26      0
m2  5 -292.0 594.6 256.14      0
m0  3 -300.8 607.8 269.27      0
Models ranked by AICc(x)

# summary model output
lapply(list(m0, m1, m2, m3), summary)

[[1]]
Generalized least squares fit by REML
  Model: Height ~ Age 
  Data: dta_ex3 
    AIC   BIC logLik
  607.5 615.3 -300.8

Coefficients:
            Value Std.Error t-value p-value
(Intercept) 82.52    2.8439   29.02       0
Age          5.72    0.3501   16.33       0

 Correlation: 
    (Intr)
Age -0.985

Standardized residuals:
    Min      Q1     Med      Q3     Max 
-1.8561 -0.7402 -0.1696  0.7974  2.6348 

Residual standard error: 4.951 
Degrees of freedom: 100 total; 98 residual

[[2]]
Generalized least squares fit by maximum likelihood
  Model: Height ~ Age 
  Data: dta_ex3 
    AIC   BIC logLik
  353.3 363.8 -172.7

Correlation Structure: AR(1)
 Formula: ~1 | Girl 
 Parameter estimate(s):
   Phi 
0.9782 

Coefficients:
            Value Std.Error t-value p-value
(Intercept) 82.47     1.380   59.75       0
Age          5.68     0.111   51.21       0

 Correlation: 
    (Intr)
Age -0.643

Standardized residuals:
    Min      Q1     Med      Q3     Max 
-1.8432 -0.7005 -0.1174  0.8831  2.7934 

Residual standard error: 4.781 
Degrees of freedom: 100 total; 98 residual

[[3]]
Generalized least squares fit by REML
  Model: Height ~ Age 
  Data: dta_ex3 
  AIC   BIC logLik
  594 606.9   -292

Variance function:
 Structure: Different standard deviations per stratum
 Formula: ~1 | Mother 
 Parameter estimates:
     S      M      T 
1.0000 0.8014 1.8171 

Coefficients:
            Value Std.Error t-value p-value
(Intercept) 82.46    2.3331   35.34       0
Age          5.55    0.2872   19.32       0

 Correlation: 
    (Intr)
Age -0.985

Standardized residuals:
     Min       Q1      Med       Q3      Max 
-1.88260 -0.64836  0.07568  0.75427  2.40785 

Residual standard error: 3.961 
Degrees of freedom: 100 total; 98 residual

[[4]]
Generalized least squares fit by maximum likelihood
  Model: Height ~ Age 
  Data: dta_ex3 
    AIC   BIC logLik
  337.6 353.2 -162.8

Correlation Structure: AR(1)
 Formula: ~1 | Girl 
 Parameter estimate(s):
   Phi 
0.9787 
Variance function:
 Structure: Different standard deviations per stratum
 Formula: ~1 | Mother 
 Parameter estimates:
     S      M      T 
1.0000 0.6923 1.5494 

Coefficients:
            Value Std.Error t-value p-value
(Intercept) 82.47    1.1305   72.95       0
Age          5.56    0.0903   61.55       0

 Correlation: 
    (Intr)
Age -0.639

Standardized residuals:
     Min       Q1      Med       Q3      Max 
-1.77189 -0.71727  0.06484  0.80091  2.56095 

Residual standard error: 4.265 
Degrees of freedom: 100 total; 98 residual

# autocorrelations
acf(resid(m0, type = "normalized"), main = "m0")

acf(resid(m1, type = "normalized"), main = "m1")

acf(resid(m2, type = "normalized"), main = "m2")

acf(resid(m3, type = "normalized"), main = "m3")

# The end

Exercise 4

#Data
dta_ex4<- read.table("course_eval.txt", header=T)
dta_ex4$Course <- as.factor(dta_ex4$Course)
head(dta_ex4)

  Score  Beauty Gender Age Minority Tenure Course
1   4.3  0.2016      1  36        1      0      3
2   4.5 -0.8261      0  59        0      1      0
3   3.7 -0.6603      0  51        0      1      4
4   4.3 -0.7663      1  40        0      1      2
5   4.4  1.4214      1  31        0      0      0
6   4.2  0.5002      0  62        0      1      0

library(lattice)
xyplot(Score ~ Beauty | Course, data=dta_ex4,
       ylab="Average course evaluation score", 
       xlab="Beauty judgment score",
       type=c("p", "g", "r"))

dtag <-  # create a new copy of the groupedData object
  groupedData(Score ~ Beauty | Course,
              data=as.data.frame( dta_ex4 ),
              FUN=mean,
              labels=list(x="Beauty score",
                y="Couse evaluation score" ))

plot(dtag, asp=1)

#
m0 <- lm(Score ~ Beauty, data=dta_ex4)
summary(m0)


Call:
lm(formula = Score ~ Beauty, data = dta_ex4)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8002 -0.3630  0.0725  0.4021  1.1037 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   4.0100     0.0255  157.21  < 2e-16
Beauty        0.1330     0.0322    4.13  4.2e-05

Residual standard error: 0.545 on 461 degrees of freedom
Multiple R-squared:  0.0357,    Adjusted R-squared:  0.0336 
F-statistic: 17.1 on 1 and 461 DF,  p-value: 4.25e-05

#
with(dta_ex4, plot(Beauty, Score, bty="n",
     xlab="Beauty score", 
     ylab="Average course evaluation"))
grid()
abline(m0)

#
t.test(coef(lmList(Score ~ Beauty | Course, data = dta_ex4))["Beauty"])


    One Sample t-test

data:  coef(lmList(Score ~ Beauty | Course, data = dta_ex4))["Beauty"]
t = 0.54, df = 30, p-value = 0.6
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 -2.188  3.769
sample estimates:
mean of x 
   0.7905

#
m1 <- lm(Score ~ Course/Beauty - 1, data=dta_ex4)

#
summary(m1)


Call:
lm(formula = Score ~ Course/Beauty - 1, data = dta_ex4)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8108 -0.2739  0.0135  0.3314  1.0935 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)
Course0           4.0316     0.0304  132.83  < 2e-16
Course1           3.7049     1.2199    3.04  0.00254
Course2           4.4852     0.4880    9.19  < 2e-16
Course3           3.8806     0.2535   15.31  < 2e-16
Course4           3.8359     0.1207   31.79  < 2e-16
Course5           4.0863     0.3073   13.30  < 2e-16
Course6           4.2195     0.3775   11.18  < 2e-16
Course7           3.4491     0.4324    7.98  1.6e-14
Course8           5.0785     1.0201    4.98  9.5e-07
Course9           3.9284     0.2150   18.27  < 2e-16
Course10          4.8393     1.8830    2.57  0.01053
Course11          4.1322     0.4060   10.18  < 2e-16
Course12          3.3081     0.6667    4.96  1.0e-06
Course13          4.1574     0.6147    6.76  4.8e-11
Course14          3.6171     0.4442    8.14  4.9e-15
Course15          2.4200     0.4187    5.78  1.5e-08
Course16        -16.9734    22.0939   -0.77  0.44280
Course17          4.4669     0.2982   14.98  < 2e-16
Course18          4.2042     0.3147   13.36  < 2e-16
Course19          3.5362     0.2520   14.03  < 2e-16
Course20          4.4540     0.4957    8.99  < 2e-16
Course21          3.6620     0.1425   25.70  < 2e-16
Course22          3.7286     0.2064   18.06  < 2e-16
Course23          3.5310     0.5096    6.93  1.7e-11
Course24          3.7810     0.3552   10.65  < 2e-16
Course25         20.7529    13.6132    1.52  0.12818
Course26          4.2462     0.3138   13.53  < 2e-16
Course27          4.5522     0.6720    6.77  4.5e-11
Course28          3.6051     1.3974    2.58  0.01024
Course29          3.8039     0.3909    9.73  < 2e-16
Course30          4.3076     0.3272   13.17  < 2e-16
Course0:Beauty    0.1462     0.0378    3.87  0.00013
Course1:Beauty    0.9109     1.3378    0.68  0.49632
Course2:Beauty    0.2416     0.8978    0.27  0.78795
Course3:Beauty   -0.0458     1.4114   -0.03  0.97412
Course4:Beauty    0.5401     0.2909    1.86  0.06405
Course5:Beauty    0.2275     0.4206    0.54  0.58888
Course6:Beauty    0.7948     0.6393    1.24  0.21453
Course7:Beauty   -0.1954     0.4447   -0.44  0.66051
Course8:Beauty    1.8980     1.4103    1.35  0.17913
Course9:Beauty   -1.9625     0.7006   -2.80  0.00534
Course10:Beauty   0.5348     1.9240    0.28  0.78117
Course11:Beauty  -0.4049     0.5014   -0.81  0.41987
Course12:Beauty  -1.9471     1.6706   -1.17  0.24452
Course13:Beauty   0.5055     0.9294    0.54  0.58682
Course14:Beauty  -0.1386     0.8917   -0.16  0.87659
Course15:Beauty  -0.3753     0.5578   -0.67  0.50140
Course16:Beauty -18.2149    19.1411   -0.95  0.34187
Course17:Beauty   0.3354     0.4471    0.75  0.45349
Course18:Beauty  -0.1303     0.4932   -0.26  0.79172
Course19:Beauty  -0.3202     0.3246   -0.99  0.32451
Course20:Beauty   0.0692     0.8893    0.08  0.93801
Course21:Beauty  -0.0380     0.1891   -0.20  0.84096
Course22:Beauty   0.1641     0.2424    0.68  0.49881
Course23:Beauty  -1.3347     0.7649   -1.74  0.08178
Course24:Beauty   0.0756     0.9732    0.08  0.93811
Course25:Beauty  40.5972    32.6559    1.24  0.21453
Course26:Beauty   0.2253     0.3363    0.67  0.50332
Course27:Beauty   0.9815     1.2156    0.81  0.41987
Course28:Beauty   0.7384     0.9699    0.76  0.44693
Course29:Beauty   0.4722     0.2924    1.61  0.10711
Course30:Beauty   0.1544     0.2311    0.67  0.50451

Residual standard error: 0.525 on 401 degrees of freedom
Multiple R-squared:  0.985, Adjusted R-squared:  0.983 
F-statistic:  434 on 62 and 401 DF,  p-value: <2e-16

#
summary(m1)$coef[which(summary(m1)$coef[-c(1:31),4] <0.05),]

        Estimate Std. Error t value  Pr(>|t|)
Course0    4.032    0.03035  132.83 0.000e+00
Course9    3.928    0.21504   18.27 1.102e-54

# The end

Homework_0921

yuwenchen

2020-09-27

In class exercise 1

In class exercise 2

In class exercise 3

In class exercise 4

Visualization

# Linear regression models

Exercise 1

Exercise 2

Exercise 3

Exercise 4