regression Q3Q5Q6

Q3

#
# regression with correlations over time and in clusters
#

# set path to working directory
setwd("C:\\Users\\USER\\Desktop\\Multilevel Statistical\\w3 exercise")

#regression with correlations over time and in clusters
# set significant decimal points
options(digits = 2, show.signif.stars = FALSE)

# handle package loading automatically
library(pacman)

# load packages
pacman::p_load(tidyverse, nlme, broom, magrittr, MuMIn)

setwd("C:\\Users\\USER\\Desktop\\Multilevel Statistical\\w3 exercise")

# load data set
dta <- read.csv("girl_height.csv", header = TRUE)

# compute correlations of heights among occasions
# format data from long to wide format first
dta0 <- dta %>% select(-Mother)
dta0 <- as.data.frame(dta) %>%
  select(Height, Girl, Age) %>%
  spread(Age, Height) %>% 
  #spread以指定的項目(該項目會變成變成欄)分離想要的數值
  select(-Girl)
  #選定想要的變項(整欄)

# get correlation matrix
dta0 %>% cor

##       6    7    8    9   10
## 6  1.00 0.98 0.98 0.97 0.95
## 7  0.98 1.00 0.99 0.98 0.96
## 8  0.98 0.99 1.00 0.99 0.98
## 9  0.97 0.98 0.99 1.00 0.99
## 10 0.95 0.96 0.98 0.99 1.00

# get variance at each occasion
dta0 %>% var %>% diag

##  6  7  8  9 10 
## 16 20 27 31 32

# set plot theme to black-n-white
ot <- theme_set(theme_bw())

# one regression line fits all
ggplot(dta, aes(x = Age, y = Height, color = Girl)) +
  geom_point() +
  stat_smooth(aes(group = 1), method = "lm") +
  labs(x = "Age", y = "Height") +
  theme(legend.position = "NONE")

# ordinary regression
m0 <- gls(Height ~ Age, data = dta)

# regression with autocorrelation
m1 <- update(m0, corr = corAR1(form = ~ 1 | Girl), method = "ML")

# regression with unequal variances for occasions
m2 <- update(m0, weights = varIdent(form = ~ 1 | Girl))

# regression with autocorrelation and unequal variances
m3 <- update(m1, weights = varIdent(form = ~ 1 | Girl))

# model selection with AICc
model.sel(m0, m1, m2, m3)

## Model selection table 
##    (Intrc) Age correlation  weights REML df logLik AICc delta weight
## m1      82 5.7 crAR1(Girl)             F  4   -173  354     0  0.994
## m3      81 5.6 crAR1(Girl) vrI(Grl)    F 23   -152  364    10  0.006
## m2      81 5.5             vrI(Grl)      22   -242  541   188  0.000
## m0      83 5.7                            3   -301  608   254  0.000
## Abbreviations:
## correlation: crAR1(Girl) = 'corAR1(~1|Girl)'
## weights: vrI(Grl) = 'varIdent(~1|Girl)'
## REML: F = 'FALSE'
## Models ranked by AICc(x)

# summary model output
lapply(list(m0, m1, m2, m3), summary)

## [[1]]
## Generalized least squares fit by REML
##   Model: Height ~ Age 
##   Data: dta 
##   AIC BIC logLik
##   608 615   -301
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)    83      2.84      29       0
## Age             6      0.35      16       0
## 
##  Correlation: 
##     (Intr)
## Age -0.98 
## 
## Standardized residuals:
##   Min    Q1   Med    Q3   Max 
## -1.86 -0.74 -0.17  0.80  2.63 
## 
## Residual standard error: 5 
## Degrees of freedom: 100 total; 98 residual
## 
## [[2]]
## Generalized least squares fit by maximum likelihood
##   Model: Height ~ Age 
##   Data: dta 
##   AIC BIC logLik
##   353 364   -173
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | Girl 
##  Parameter estimate(s):
##  Phi 
## 0.98 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)    82      1.38      60       0
## Age             6      0.11      51       0
## 
##  Correlation: 
##     (Intr)
## Age -0.64 
## 
## Standardized residuals:
##   Min    Q1   Med    Q3   Max 
## -1.84 -0.70 -0.12  0.88  2.79 
## 
## Residual standard error: 4.8 
## Degrees of freedom: 100 total; 98 residual
## 
## [[3]]
## Generalized least squares fit by REML
##   Model: Height ~ Age 
##   Data: dta 
##   AIC BIC logLik
##   528 585   -242
## 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Girl 
##  Parameter estimates:
##    G1    G2    G3    G4    G5    G6    G7    G8    G9   G10   G11   G12 
## 1.000 1.004 0.076 0.289 0.379 0.373 0.530 0.977 2.250 0.183 1.030 0.077 
##   G13   G14   G15   G16   G17   G18   G19   G20 
## 0.076 1.560 2.408 1.833 2.254 2.229 0.901 3.784 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)    81      0.41     195       0
## Age             6      0.05     108       0
## 
##  Correlation: 
##     (Intr)
## Age -0.98 
## 
## Standardized residuals:
##   Min    Q1   Med    Q3   Max 
## -1.78 -0.57  0.84  1.07  1.76 
## 
## Residual standard error: 3.9 
## Degrees of freedom: 100 total; 98 residual
## 
## [[4]]
## Generalized least squares fit by maximum likelihood
##   Model: Height ~ Age 
##   Data: dta 
##   AIC BIC logLik
##   349 409   -152
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | Girl 
##  Parameter estimate(s):
##  Phi 
## 0.98 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Girl 
##  Parameter estimates:
##   G1  G10  G11  G12  G13  G14  G15  G16  G17  G18  G19   G2  G20   G3   G4 
## 1.00 0.68 0.57 0.45 0.23 1.31 1.64 1.22 1.32 1.43 1.13 0.71 2.44 0.46 0.73 
##   G5   G6   G7   G8   G9 
## 1.45 0.89 0.63 0.71 1.20 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)    81      0.79     102       0
## Age             6      0.07      85       0
## 
##  Correlation: 
##     (Intr)
## Age -0.66 
## 
## Standardized residuals:
##   Min    Q1   Med    Q3   Max 
## -1.82 -0.45  0.43  1.21  1.75 
## 
## Residual standard error: 4.2 
## Degrees of freedom: 100 total; 98 residual

# autocorrelations
acf(resid(m0, type = "normalized"), main = "m0")

acf(resid(m1, type = "normalized"), main = "m1")

acf(resid(m2, type = "normalized"), main = "m2")

acf(resid(m3, type = "normalized"), main = "m3")

Q5

#The standard deviation of length estimates of each subject is available in the loudness data example. 
#Perform a weighted regression and compare the results against an ordinary regression of length onto loudness.

# handle package loading automatically
library(pacman)

# load the packages
pacman::p_load(alr4, tidyverse, magrittr,RColorBrewer)

# make sure data set is active
data(Stevens)

# save a copy
dta <- Stevens %>% rename(Length = y, Loudness = loudness, Subject = subject)

# ordinal regression
or <- lm(Length ~ Loudness, dta)
summary(or)

## 
## Call:
## lm(formula = Length ~ Loudness, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2669 -0.3179 -0.0093  0.3707  0.9219 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.20377    0.34429   -9.31  2.5e-12
## Loudness     0.07901    0.00482   16.39  < 2e-16
## 
## Residual standard error: 0.48 on 48 degrees of freedom
## Multiple R-squared:  0.848,  Adjusted R-squared:  0.845 
## F-statistic:  269 on 1 and 48 DF,  p-value: <2e-16

#weighted lm
wr= lm(Length ~Loudness, data = dta, weights = 1/sd^2)
summary(wr)

## 
## Call:
## lm(formula = Length ~ Loudness, data = dta, weights = 1/sd^2)
## 
## Weighted Residuals:
##    Min     1Q Median     3Q    Max 
## -7.428  0.024  1.286  2.189 11.169 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.85942    0.38359   -7.45  1.5e-09
## Loudness     0.06939    0.00489   14.18  < 2e-16
## 
## Residual standard error: 3.8 on 48 degrees of freedom
## Multiple R-squared:  0.807,  Adjusted R-squared:  0.803 
## F-statistic:  201 on 1 and 48 DF,  p-value: <2e-16

# compare
ggplot(dta, aes(x = Loudness, y = Length, group = Subject, color = Subject))+
  geom_point()+
  stat_smooth(aes(group = 1),method = "lm", col = "#d95f0e", se = F)+             #lm
  #stat_smooth just one line           
  geom_path(aes(x = Loudness, fitted(wr)), size = rel(1.0), col = "#c51b8a")+     #weighted 
  #geom_path   connects the observations in the order in which they appear in the data
  labs(x = "Loudness (db)", y = "Average log(Length)")

Q6

#Extract data on Taiwan from the data set gapminder{gapminder} to examine the linear relationship 
    #between life expectancy (lifeExp) and gross domestic product per capita (gdpPercap) in logarithmic unit. 
    #Should the analysis take into account that the observations are time series?

# load the packages
pacman::p_load(gapminder,car)

dat = gapminder

#select the data of Taiwan
dat = dat[dat$country=="Taiwan",]

#add new variable "log.gdp"
dat$log.gdp = log(dat$gdpPercap)
summary(dat)

##         country      continent       year         lifeExp  
##  Taiwan     :12   Africa  : 0   Min.   :1952   Min.   :58  
##  Afghanistan: 0   Americas: 0   1st Qu.:1966   1st Qu.:67  
##  Albania    : 0   Asia    :12   Median :1980   Median :71  
##  Algeria    : 0   Europe  : 0   Mean   :1980   Mean   :70  
##  Angola     : 0   Oceania : 0   3rd Qu.:1993   3rd Qu.:75  
##  Argentina  : 0                 Max.   :2007   Max.   :78  
##  (Other)    : 0                                            
##       pop             gdpPercap        log.gdp    
##  Min.   : 8550362   Min.   : 1207   Min.   : 7.1  
##  1st Qu.:13216254   1st Qu.: 2439   1st Qu.: 7.8  
##  Median :17643293   Median : 6511   Median : 8.8  
##  Mean   :16874724   Mean   :10225   Mean   : 8.7  
##  3rd Qu.:20922340   3rd Qu.:16463   3rd Qu.: 9.7  
##  Max.   :23174294   Max.   :28718   Max.   :10.3  
##

#plot(year,lifeExp)
require(stats)
plot(dat$year,dat$lifeExp)
lines(lowess(dat$year,dat$lifeExp))

#plot(year,log.gdp)
plot(dat$year,dat$log.gdp)
lines(lowess(dat$year,dat$log.gdp))

#plot(lifeExp,log.gdp)
plot(dat$lifeExp,dat$log.gdp)
lines(lowess(dat$lifeExp,dat$log.gdp))

#simple lm
summary(m0 <- lm(log.gdp~lifeExp, data = dat))

## 
## Call:
## lm(formula = log.gdp ~ lifeExp, data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.343 -0.178  0.019  0.134  0.498 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -3.9684     0.9373   -4.23   0.0017
## lifeExp       0.1806     0.0133   13.60  8.9e-08
## 
## Residual standard error: 0.27 on 10 degrees of freedom
## Multiple R-squared:  0.949,  Adjusted R-squared:  0.944 
## F-statistic:  185 on 1 and 10 DF,  p-value: 8.93e-08

#r square=0.9487 have high correlation

# plot m0 residual
plot(dat$lifeExp,m0$residual)

#have autocorrelation

#DW
require(car)
durbinWatsonTest(m0)

##  lag Autocorrelation D-W Statistic p-value
##    1             0.5          0.64   0.002
##  Alternative hypothesis: rho != 0

#P<.05 reject H0 ，有自我相關存在


#correct
# model with autocorrelation error using gls
require(nlme)
# ordinary regression

#correct
# regression with autocorrelation
summary(m1 <- gls(log.gdp~lifeExp, correlation = corAR1(form = ~ year),
                  data = dat, method = "ML") )

## Generalized least squares fit by maximum likelihood
##   Model: log.gdp ~ lifeExp 
##   Data: dat 
##   AIC BIC logLik
##   8.1  10 -0.025
## 
## Correlation Structure: ARMA(1,0)
##  Formula: ~year 
##  Parameter estimate(s):
## Phi1 
##    0 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)  -4.0      0.94    -4.2  0.0017
## lifeExp       0.2      0.01    13.6  0.0000
## 
##  Correlation: 
##         (Intr)
## lifeExp -1    
## 
## Standardized residuals:
##    Min     Q1    Med     Q3    Max 
## -1.416 -0.734  0.078  0.551  2.054 
## 
## Residual standard error: 0.24 
## Degrees of freedom: 12 total; 10 residual

regression Q3Q5Q6

Li

2017年10月22日

Q3

Q5

Q6