1002_HW

Refer to the questions here.

Q1

library(alr4)
# simple linear regression info
m0 <- lm(dheight ~ 1, data = Heights) ; m0


Call:
lm(formula = dheight ~ 1, data = Heights)

Coefficients:
(Intercept)  
      63.75

m1 <- update(m0, . ~ . + mheight)
summary(m1)


Call:
lm(formula = dheight ~ mheight, data = Heights)

Residuals:
   Min     1Q Median     3Q    Max 
-7.397 -1.529  0.036  1.492  9.053 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 29.91744    1.62247   18.44   <2e-16 ***
mheight      0.54175    0.02596   20.87   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.266 on 1373 degrees of freedom
Multiple R-squared:  0.2408,    Adjusted R-squared:  0.2402 
F-statistic: 435.5 on 1 and 1373 DF,  p-value: < 2.2e-16

anova(m0, m1)

Analysis of Variance Table

Model 1: dheight ~ 1
Model 2: dheight ~ mheight
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1   1374 9288.6                                  
2   1373 7052.0  1    2236.7 435.47 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# diagnostics
#dev.off()
ggplot(m1, aes(sample = scale(resid(m1, "pearson")))) +
  stat_qq() +
  geom_abline(intercept = 0, slope = 1)+
  labs(x = "Normal quantiles",
       y = "Quantiles of standardized residuals")

# centering
coef(lm(dheight ~ mheight, data = Heights))

(Intercept)     mheight 
  29.917437    0.541747

Heights %>% mutate(mheight_c = mheight - mean(mheight)) %>%
  lm(dheight ~ mheight_c, data = .) %>% coef(.)

(Intercept)   mheight_c 
  63.751055    0.541747

# plot
with(Heights, plot(mheight, dheight))
abline(lm(dheight ~ mheight, data = Heights), col = "blue")

# scaling
coef(lm(scale(dheight) ~ scale(mheight), data = Heights))

   (Intercept) scale(mheight) 
  2.199083e-15   4.907094e-01

with(Heights, (sd(dheight)/sd(mheight)* cor(dheight, mheight)))

[1] 0.541747

Q2

What the Rscript does :

1.Pick three regions : africa, asia, europe , and randomly pick 72 countries. Draw regression line of these three regions.
2.Draw another weighted regression line(brown) to show the correlation of gdp and fertility rate.

Q3

setwd("E:/201709/multilevel_analysis/1002")
girl_dat <- read.csv("girl_height.csv")
dat <- read.csv("girl_height.csv")

# plot 
xyplot( Height ~ Age | Girl, data = girl_dat,
        xlab = "Age", ylab = "Height(cm)")

# compute correlations of heights among occasions
dat %<>% mutate(groupt=substr(dat$Girl, 1,1), ID=as.numeric(substr(dat$Girl,2,3)))
dat <- as.data.frame(dat) %>%
  select(Height, Age, ID) %>%
  spread(Age, Height) %>%
  select(-ID)

# get correlation matrix
dat %>% cor

           6         7         8         9        10
6  1.0000000 0.9809247 0.9815512 0.9740140 0.9540426
7  0.9809247 1.0000000 0.9858051 0.9803015 0.9584083
8  0.9815512 0.9858051 1.0000000 0.9880770 0.9751260
9  0.9740140 0.9803015 0.9880770 1.0000000 0.9920014
10 0.9540426 0.9584083 0.9751260 0.9920014 1.0000000

# get variance at each occasion (diag : extract or replace the diagonal of a matrix)
dat %>% var %>% diag

       6        7        8        9       10 
15.81095 19.73063 27.15589 31.22408 32.00997

# set plot theme to black-n-white
ot <- theme_set(theme_bw())

# one regression line fits all
ggplot(girl_dat, aes(x = Age, y = Height, color = Girl)) +
  geom_point() +
  stat_smooth(aes(group = 1), method = "lm") +
  labs(x = "Age (scaled)", y = "Height (cm)") +
  theme(legend.position = "NONE")

# ordinary regression
m0 <- gls(Height ~ Age, data = girl_dat)

# regression with autocorrelation
m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML");m1

Generalized least squares fit by maximum likelihood
  Model: Height ~ Age 
  Data: girl_dat 
  Log-likelihood: -172.6705

Coefficients:
(Intercept)         Age 
  82.474556    5.683788 

Correlation Structure: AR(1)
 Formula: ~1 | Girl 
 Parameter estimate(s):
      Phi 
0.9781674 
Degrees of freedom: 100 total; 98 residual
Residual standard error: 4.780948

# regression with unequal variances for occasions
m2 <- update(m0, weights = varIdent(form = ~ 1 | Age));m2

Generalized least squares fit by REML
  Model: Height ~ Age 
  Data: girl_dat 
  Log-restricted-likelihood: -299.0226

Coefficients:
(Intercept)         Age 
  82.229398    5.751755 

Variance function:
 Structure: Different standard deviations per stratum
 Formula: ~1 | Age 
 Parameter estimates:
       6        7        8        9       10 
1.000000 1.104837 1.297424 1.391427 1.421762 
Degrees of freedom: 100 total; 98 residual
Residual standard error: 3.947016

# regression with autocorrelation and unequal variances
m3 <- update(m1, weights = varIdent(form = ~ 1 | Age));m3

Generalized least squares fit by maximum likelihood
  Model: Height ~ Age 
  Data: girl_dat 
  Log-likelihood: -159.3523

Coefficients:
(Intercept)         Age 
  81.431037    5.533068 

Correlation Structure: AR(1)
 Formula: ~1 | Girl 
 Parameter estimate(s):
      Phi 
0.9886847 
Variance function:
 Structure: Different standard deviations per stratum
 Formula: ~1 | Age 
 Parameter estimates:
       6        7        8        9       10 
1.000000 1.135107 1.343258 1.410275 1.374144 
Degrees of freedom: 100 total; 98 residual
Residual standard error: 4.37343

# model selection with AICc
model.sel(m0, m1, m2, m3)

Model selection table 
   (Intrc)   Age correlation  weights REML df   logLik  AICc  delta weight
m3   81.43 5.533 crAR1(Girl) vrI(Age)    F  8 -159.352 336.3   0.00      1
m1   82.47 5.684 crAR1(Girl)             F  4 -172.671 353.8  17.48      0
m0   82.52 5.716                            3 -300.761 607.8 271.48      0
m2   82.23 5.752             vrI(Age)       7 -299.023 613.3 276.98      0
Abbreviations:
correlation: crAR1(Girl) = 'corAR1(~1|Girl)'
weights: vrI(Age) = 'varIdent(~1|Age)'
REML: F = 'FALSE'
Models ranked by AICc(x)

Q5

data(Stevens)
dta <- Stevens %>% rename(Length = y, Loudness = loudness, Subject = subject)

# one model fits all
m0 <- lm(Length ~ Loudness, data = dta)

# plot
ot <- theme_set(theme_bw())
ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
  geom_point() +
  geom_line() +
  stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
  coord_cartesian(ylim = c(-1, 5)) +
  labs(x = "Loudness (db)", y = "Average log(Length)") +
  theme(legend.position = "NONE")

# tidy summary model output
broom::tidy(summary(m0))

         term  estimate   std.error statistic      p.value
1 (Intercept) -3.203770 0.344285528 -9.305561 2.531325e-12
2    Loudness  0.079013 0.004820962 16.389469 2.693170e-21

# weighted model
m1 <- Stevens %>% 
  group_by(loudness) %>%
  mutate(y_m = mean(y), y_n = n()) %>%
  lm(y_m ~ loudness, weight = y_n, data = .) %>%
  coef(.)

#plot
ggplot(Stevens, aes(loudness, y, group = subject, label = subject)) +
  geom_point() +
  geom_abline(intercept = m1[1], slope = m1[2], col = "firebrick") +
  stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
  labs(x = "Loudness (db)", y = "Average log(Length)") +
  theme(legend.position = "NONE")

# tidy summary model output
broom::tidy(summary(m1))

   minimum        q1    median      mean         q3  maximum
1 -3.20377 -2.383074 -1.562379 -1.562379 -0.7416828 0.079013