Q3
#
# regression with correlations over time and in clusters
#
# set path to working directory
setwd("C:\\Users\\USER\\Desktop\\Multilevel Statistical\\w3 exercise")
#regression with correlations over time and in clusters
# set significant decimal points
options(digits = 2, show.signif.stars = FALSE)
# handle package loading automatically
library(pacman)
# load packages
pacman::p_load(tidyverse, nlme, broom, magrittr, MuMIn)
setwd("C:\\Users\\USER\\Desktop\\Multilevel Statistical\\w3 exercise")
# load data set
dta <- read.csv("girl_height.csv", header = TRUE)
# compute correlations of heights among occasions
# format data from long to wide format first
dta0 <- dta %>% select(-Mother)
dta0 <- as.data.frame(dta) %>%
select(Height, Girl, Age) %>%
spread(Age, Height) %>%
#spread以指定的項目(該項目會變成變成欄)分離想要的數值
select(-Girl)
#選定想要的變項(整欄)
# get correlation matrix
dta0 %>% cor
## 6 7 8 9 10
## 6 1.00 0.98 0.98 0.97 0.95
## 7 0.98 1.00 0.99 0.98 0.96
## 8 0.98 0.99 1.00 0.99 0.98
## 9 0.97 0.98 0.99 1.00 0.99
## 10 0.95 0.96 0.98 0.99 1.00
# get variance at each occasion
dta0 %>% var %>% diag
## 6 7 8 9 10
## 16 20 27 31 32
# set plot theme to black-n-white
ot <- theme_set(theme_bw())
# one regression line fits all
ggplot(dta, aes(x = Age, y = Height, color = Girl)) +
geom_point() +
stat_smooth(aes(group = 1), method = "lm") +
labs(x = "Age", y = "Height") +
theme(legend.position = "NONE")

# ordinary regression
m0 <- gls(Height ~ Age, data = dta)
# regression with autocorrelation
m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML")
# regression with unequal variances for occasions
m2 <- update(m0, weights = varIdent(form = ~ 1 | Girl))
# regression with autocorrelation and unequal variances
m3 <- update(m1, weights = varIdent(form = ~ 1 | Girl))
# model selection with AICc
model.sel(m0, m1, m2, m3)
## Model selection table
## (Intrc) Age correlation weights REML df logLik AICc delta weight
## m1 82 5.7 crAR1(Girl) F 4 -173 354 0 0.994
## m3 81 5.6 crAR1(Girl) vrI(Grl) F 23 -152 364 10 0.006
## m2 81 5.5 vrI(Grl) 22 -242 541 188 0.000
## m0 83 5.7 3 -301 608 254 0.000
## Abbreviations:
## correlation: crAR1(Girl) = 'corAR1(~1|Girl)'
## weights: vrI(Grl) = 'varIdent(~1|Girl)'
## REML: F = 'FALSE'
## Models ranked by AICc(x)
# summary model output
lapply(list(m0, m1, m2, m3), summary)
## [[1]]
## Generalized least squares fit by REML
## Model: Height ~ Age
## Data: dta
## AIC BIC logLik
## 608 615 -301
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 83 2.84 29 0
## Age 6 0.35 16 0
##
## Correlation:
## (Intr)
## Age -0.98
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.86 -0.74 -0.17 0.80 2.63
##
## Residual standard error: 5
## Degrees of freedom: 100 total; 98 residual
##
## [[2]]
## Generalized least squares fit by maximum likelihood
## Model: Height ~ Age
## Data: dta
## AIC BIC logLik
## 353 364 -173
##
## Correlation Structure: AR(1)
## Formula: ~1 | Girl
## Parameter estimate(s):
## Phi
## 0.98
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82 1.38 60 0
## Age 6 0.11 51 0
##
## Correlation:
## (Intr)
## Age -0.64
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.84 -0.70 -0.12 0.88 2.79
##
## Residual standard error: 4.8
## Degrees of freedom: 100 total; 98 residual
##
## [[3]]
## Generalized least squares fit by REML
## Model: Height ~ Age
## Data: dta
## AIC BIC logLik
## 528 585 -242
##
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Girl
## Parameter estimates:
## G1 G2 G3 G4 G5 G6 G7 G8 G9 G10 G11 G12
## 1.000 1.004 0.076 0.289 0.379 0.373 0.530 0.977 2.250 0.183 1.030 0.077
## G13 G14 G15 G16 G17 G18 G19 G20
## 0.076 1.560 2.408 1.833 2.254 2.229 0.901 3.784
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 81 0.41 195 0
## Age 6 0.05 108 0
##
## Correlation:
## (Intr)
## Age -0.98
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.78 -0.57 0.84 1.07 1.76
##
## Residual standard error: 3.9
## Degrees of freedom: 100 total; 98 residual
##
## [[4]]
## Generalized least squares fit by maximum likelihood
## Model: Height ~ Age
## Data: dta
## AIC BIC logLik
## 349 409 -152
##
## Correlation Structure: AR(1)
## Formula: ~1 | Girl
## Parameter estimate(s):
## Phi
## 0.98
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | Girl
## Parameter estimates:
## G1 G10 G11 G12 G13 G14 G15 G16 G17 G18 G19 G2 G20 G3 G4
## 1.00 0.68 0.57 0.45 0.23 1.31 1.64 1.22 1.32 1.43 1.13 0.71 2.44 0.46 0.73
## G5 G6 G7 G8 G9
## 1.45 0.89 0.63 0.71 1.20
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 81 0.79 102 0
## Age 6 0.07 85 0
##
## Correlation:
## (Intr)
## Age -0.66
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.82 -0.45 0.43 1.21 1.75
##
## Residual standard error: 4.2
## Degrees of freedom: 100 total; 98 residual
# autocorrelations
acf(resid(m0, type = "normalized"), main = "m0")

acf(resid(m1, type = "normalized"), main = "m1")

acf(resid(m2, type = "normalized"), main = "m2")

acf(resid(m3, type = "normalized"), main = "m3")

Q5
#The standard deviation of length estimates of each subject is available in the loudness data example.
#Perform a weighted regression and compare the results against an ordinary regression of length onto loudness.
# handle package loading automatically
library(pacman)
# load the packages
pacman::p_load(alr4, tidyverse, magrittr,RColorBrewer)
# make sure data set is active
data(Stevens)
# save a copy
dta <- Stevens %>% rename(Length = y, Loudness = loudness, Subject = subject)
# ordinal regression
or <- lm(Length ~ Loudness, dta)
summary(or)
##
## Call:
## lm(formula = Length ~ Loudness, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2669 -0.3179 -0.0093 0.3707 0.9219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.20377 0.34429 -9.31 2.5e-12
## Loudness 0.07901 0.00482 16.39 < 2e-16
##
## Residual standard error: 0.48 on 48 degrees of freedom
## Multiple R-squared: 0.848, Adjusted R-squared: 0.845
## F-statistic: 269 on 1 and 48 DF, p-value: <2e-16
#weighted lm
wr= lm(Length ~Loudness, data = dta, weights = 1/sd^2)
summary(wr)
##
## Call:
## lm(formula = Length ~ Loudness, data = dta, weights = 1/sd^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -7.428 0.024 1.286 2.189 11.169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.85942 0.38359 -7.45 1.5e-09
## Loudness 0.06939 0.00489 14.18 < 2e-16
##
## Residual standard error: 3.8 on 48 degrees of freedom
## Multiple R-squared: 0.807, Adjusted R-squared: 0.803
## F-statistic: 201 on 1 and 48 DF, p-value: <2e-16
# compare
ggplot(dta, aes(x = Loudness, y = Length, group = Subject, color = Subject))+
geom_point()+
stat_smooth(aes(group = 1),method = "lm", col = "#d95f0e", se = F)+ #lm
#stat_smooth just one line
geom_path(aes(x = Loudness, fitted(wr)), size = rel(1.0), col = "#c51b8a")+ #weighted
#geom_path connects the observations in the order in which they appear in the data
labs(x = "Loudness (db)", y = "Average log(Length)")

Q6
#Extract data on Taiwan from the data set gapminder{gapminder} to examine the linear relationship
#between life expectancy (lifeExp) and gross domestic product per capita (gdpPercap) in logarithmic unit.
#Should the analysis take into account that the observations are time series?
# load the packages
pacman::p_load(gapminder,car)
dat = gapminder
#select the data of Taiwan
dat = dat[dat$country=="Taiwan",]
#add new variable "log.gdp"
dat$log.gdp = log(dat$gdpPercap)
summary(dat)
## country continent year lifeExp
## Taiwan :12 Africa : 0 Min. :1952 Min. :58
## Afghanistan: 0 Americas: 0 1st Qu.:1966 1st Qu.:67
## Albania : 0 Asia :12 Median :1980 Median :71
## Algeria : 0 Europe : 0 Mean :1980 Mean :70
## Angola : 0 Oceania : 0 3rd Qu.:1993 3rd Qu.:75
## Argentina : 0 Max. :2007 Max. :78
## (Other) : 0
## pop gdpPercap log.gdp
## Min. : 8550362 Min. : 1207 Min. : 7.1
## 1st Qu.:13216254 1st Qu.: 2439 1st Qu.: 7.8
## Median :17643293 Median : 6511 Median : 8.8
## Mean :16874724 Mean :10225 Mean : 8.7
## 3rd Qu.:20922340 3rd Qu.:16463 3rd Qu.: 9.7
## Max. :23174294 Max. :28718 Max. :10.3
##
#plot(year,lifeExp)
require(stats)
plot(dat$year,dat$lifeExp)
lines(lowess(dat$year,dat$lifeExp))

#plot(year,log.gdp)
plot(dat$year,dat$log.gdp)
lines(lowess(dat$year,dat$log.gdp))

#plot(lifeExp,log.gdp)
plot(dat$lifeExp,dat$log.gdp)
lines(lowess(dat$lifeExp,dat$log.gdp))

#simple lm
summary(m0 <- lm(log.gdp~lifeExp, data = dat))
##
## Call:
## lm(formula = log.gdp ~ lifeExp, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.343 -0.178 0.019 0.134 0.498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.9684 0.9373 -4.23 0.0017
## lifeExp 0.1806 0.0133 13.60 8.9e-08
##
## Residual standard error: 0.27 on 10 degrees of freedom
## Multiple R-squared: 0.949, Adjusted R-squared: 0.944
## F-statistic: 185 on 1 and 10 DF, p-value: 8.93e-08
#r square=0.9487 have high correlation
# plot m0 residual
plot(dat$lifeExp,m0$residual)

#have autocorrelation
#DW
require(car)
durbinWatsonTest(m0)
## lag Autocorrelation D-W Statistic p-value
## 1 0.5 0.64 0.002
## Alternative hypothesis: rho != 0
#P<.05 reject H0 ,有自我相關存在
#correct
# model with autocorrelation error using gls
require(nlme)
# ordinary regression
#correct
# regression with autocorrelation
summary(m1 <- gls(log.gdp~lifeExp, correlation = corAR1(form = ~ year),
data = dat, method = "ML") )
## Generalized least squares fit by maximum likelihood
## Model: log.gdp ~ lifeExp
## Data: dat
## AIC BIC logLik
## 8.1 10 -0.025
##
## Correlation Structure: ARMA(1,0)
## Formula: ~year
## Parameter estimate(s):
## Phi1
## 0
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -4.0 0.94 -4.2 0.0017
## lifeExp 0.2 0.01 13.6 0.0000
##
## Correlation:
## (Intr)
## lifeExp -1
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.416 -0.734 0.078 0.551 2.054
##
## Residual standard error: 0.24
## Degrees of freedom: 12 total; 10 residual