p = c("nlme", "dplyr", "magrittr", "tidyr", "ggplot2", "lmtest", "gridExtra","MuMIn")
lapply(p, library, character.only = TRUE)
dat = read.table("D:/Multilevel_publish/Multilevel/week3/P211.txt", h = TRUE)
str(dat)
## 'data.frame': 20 obs. of 4 variables:
## $ Year : int 1952 1952 1952 1952 1953 1953 1953 1953 1954 1954 ...
## $ Quarter : int 1 2 3 4 1 2 3 4 1 2 ...
## $ Expenditure: num 215 218 220 227 231 ...
## $ Stock : num 159 161 163 165 166 ...
head(dat)
## Year Quarter Expenditure Stock
## 1 1952 1 214.6 159.3
## 2 1952 2 217.7 161.2
## 3 1952 3 219.6 162.8
## 4 1952 4 227.2 164.6
## 5 1953 1 230.9 165.9
## 6 1953 2 233.3 167.9
summary(dat)
## Year Quarter Expenditure Stock
## Min. :1952 Min. :1.00 Min. :214.6 Min. :159.3
## 1st Qu.:1953 1st Qu.:1.75 1st Qu.:231.9 1st Qu.:167.4
## Median :1954 Median :2.50 Median :237.6 Median :172.8
## Mean :1954 Mean :2.50 Mean :243.5 Mean :173.1
## 3rd Qu.:1955 3rd Qu.:3.25 3rd Qu.:261.5 3rd Qu.:180.4
## Max. :1956 Max. :4.00 Max. :275.6 Max. :184.3
dat %<>% mutate(year_52 = Year - 1952)
dat$year_52 = as.integer(dat$year_52)
##simple lm
summary(rst.e <- lm(Expenditure~year_52, data = dat))
##
## Call:
## lm(formula = Expenditure ~ year_52, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7750 -3.0925 -0.1588 2.4925 8.6600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 218.5400 1.9658 111.17 < 2e-16 ***
## year_52 12.4675 0.8025 15.54 7.16e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.076 on 18 degrees of freedom
## Multiple R-squared: 0.9306, Adjusted R-squared: 0.9267
## F-statistic: 241.3 on 1 and 18 DF, p-value: 7.161e-12
##gls
rst.e1 = gls(Expenditure ~ year_52, correlation = corAR1(form = ~year_52|Quarter), method = "ML",data = dat)
summary(rst.e1)
## Generalized least squares fit by maximum likelihood
## Model: Expenditure ~ year_52
## Data: dat
## AIC BIC logLik
## 126.14 130.1229 -59.07
##
## Correlation Structure: AR(1)
## Formula: ~year_52 | Quarter
## Parameter estimate(s):
## Phi
## 0.2888505
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 218.67959 2.2338875 97.89195 0
## year_52 12.49466 0.8647971 14.44809 0
##
## Correlation:
## (Intr)
## year_52 -0.774
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.0752090 -0.6897819 -0.0762420 0.4841451 1.7736752
##
## Residual standard error: 4.803815
## Degrees of freedom: 20 total; 18 residual
#Collect Residuals
dat %<>% mutate(res.e = rstandard(rst.e),
res.e1 = resid(rst.e1, type = "normalized"))
#Test two models
dwtest(res.e~year_52, data = dat)
##
## Durbin-Watson test
##
## data: res.e ~ year_52
## DW = 1.2322, p-value = 0.01823
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(res.e1~year_52, data = dat)
##
## Durbin-Watson test
##
## data: res.e1 ~ year_52
## DW = 1.6463, p-value = 0.1427
## alternative hypothesis: true autocorrelation is greater than 0
#Plot
##regression plots
ggplot(dat, aes(x = year_52, y = Expenditure))+
geom_point(col = "gray")+
stat_smooth(method = "lm", se = F, linetype = "dashed")+
geom_path(aes(year_52, fitted(rst.e1)), size = rel(1.0), col = "gray")+
labs(x = "Years since 1952", y = "Expenditure")+
theme_bw()
##residual plots
ggplot(dat, aes(x = year_52, res.e))+
geom_point(col = "skyblue")+
geom_point(aes(x = year_52, res.e1), col = "gray")+
geom_hline(yintercept = 0)+
labs(x = "Years since 1952", y = "Standardized Residuals")+
theme_bw()
#ACF Plot
##bound
ci = 2/sqrt(dim(dat)[1])
##acf for lm residuals
dat %$% acf(res.e, plot = FALSE) %$%
data.frame(lag,acf) %>%
ggplot()+
aes(x = lag, y = acf)+
geom_segment(mapping = aes(xend = lag, yend = 0))+
geom_hline(aes(yintercept = 0))+
geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
ylim(-1,1)+
ggtitle("acf of lm")+
theme_bw()
##acf for gls model residuals
dat %$% acf(res.e1, plot = FALSE) %$%
data.frame(lag,acf) %>%
ggplot()+
aes(x = lag, y = acf)+
geom_segment(mapping = aes(xend = lag, yend = 0))+
geom_hline(aes(yintercept = 0))+
geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
ylim(-1,1)+
ggtitle("acf of gls")+
theme_bw()
##simple lm
summary(rst.s <- lm(Stock~year_52, data = dat))
##
## Call:
## lm(formula = Stock ~ year_52, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0850 -1.3613 0.2863 1.0356 3.0000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 162.3850 0.7102 228.64 < 2e-16 ***
## year_52 5.3575 0.2899 18.48 3.76e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.834 on 18 degrees of freedom
## Multiple R-squared: 0.9499, Adjusted R-squared: 0.9471
## F-statistic: 341.4 on 1 and 18 DF, p-value: 3.76e-13
##gls
rst.s1 = gls(Stock ~ year_52, correlation = corAR1(form = ~year_52|Quarter), method = "ML",data = dat)
summary(rst.s1)
## Generalized least squares fit by maximum likelihood
## Model: Stock ~ year_52
## Data: dat
## AIC BIC logLik
## 78.14275 82.12568 -35.07138
##
## Correlation Structure: AR(1)
## Formula: ~year_52 | Quarter
## Parameter estimate(s):
## Phi
## 0.6434015
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 162.29881 0.8879946 182.77005 0
## year_52 5.26436 0.2929325 17.97125 0
##
## Correlation:
## (Intr)
## year_52 -0.66
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.7329651 -0.6535811 0.2421401 0.7693828 1.8911093
##
## Residual standard error: 1.730449
## Degrees of freedom: 20 total; 18 residual
#Residual
dat %<>% mutate(res.s = rstandard(rst.s),
res.s1 = resid(rst.s1, type = "normalized"))
#Test two models
dwtest(res.s~year_52, data = dat)
##
## Durbin-Watson test
##
## data: res.s ~ year_52
## DW = 1.7441, p-value = 0.2008
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(res.s1~year_52, data = dat)
##
## Durbin-Watson test
##
## data: res.s1 ~ year_52
## DW = 2.4588, p-value = 0.7927
## alternative hypothesis: true autocorrelation is greater than 0
#Plot
##regression plots
ggplot(dat, aes(x = year_52, y = Stock))+
geom_point(col = "gray")+
stat_smooth(method = "lm", se = F, linetype = "dashed")+
geom_path(aes(year_52, fitted(rst.s1)), size = rel(1.0), col = "gray")+
labs(x = "Years since 1952", y = "Stock")+
theme_bw()
##residual plots
ggplot(dat, aes(x = year_52, res.s))+
geom_point(col = "skyblue")+
geom_point(aes(x = year_52, res.s1), col = "gray")+
labs(x = "Years since 1952", y = "Standardized Residuals")+
geom_hline(yintercept = 0)+
theme_bw()
#ACF Plot
##bound
ci = 2/sqrt(dim(dat)[1])
##acf for lm residuals
dat %$% acf(res.s, plot = FALSE) %$%
data.frame(lag,acf) %>%
ggplot()+
aes(x = lag, y = acf)+
geom_segment(mapping = aes(xend = lag, yend = 0))+
geom_hline(aes(yintercept = 0))+
geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
ylim(-1,1)+
theme_bw()
##acf for gls model residuals
dat %$% acf(res.s1, plot = FALSE) %$%
data.frame(lag,acf) %>%
ggplot()+
aes(x = lag, y = acf)+
geom_segment(mapping = aes(xend = lag, yend = 0))+
geom_hline(aes(yintercept = 0))+
geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
ylim(-1,1)+
theme_bw()
dat = read.csv("D:/Multilevel_publish/Multilevel/week3/reading_time.csv", h = TRUE)
str(dat)
## 'data.frame': 70 obs. of 6 variables:
## $ Snt : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Sp : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Wrds : int 13 13 13 13 13 13 13 13 13 13 ...
## $ New : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Subject: Factor w/ 10 levels "S01","S02","S03",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Time : num 3.43 2.79 4.16 3.07 3.62 ...
head(dat)
## Snt Sp Wrds New Subject Time
## 1 1 1 13 1 S01 3.429
## 2 1 1 13 1 S02 2.795
## 3 1 1 13 1 S03 4.161
## 4 1 1 13 1 S04 3.071
## 5 1 1 13 1 S05 3.625
## 6 1 1 13 1 S06 3.161
# 1 model fit all
m0 = lm(Time~Sp+Wrds+New, data = dat)
summary(m0)
##
## Call:
## lm(formula = Time ~ Sp + Wrds + New, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9261 -0.9403 -0.3808 0.5332 4.9136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.58695 0.87010 -2.973 0.00411 **
## Sp 0.33337 0.12080 2.760 0.00748 **
## Wrds 0.45859 0.08308 5.520 6.14e-07 ***
## New 0.15163 0.31254 0.485 0.62917
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.557 on 66 degrees of freedom
## Multiple R-squared: 0.6054, Adjusted R-squared: 0.5874
## F-statistic: 33.75 on 3 and 66 DF, p-value: 2.433e-13
# 1 regression model per individual
m1 = lm(Time~Subject/(Sp+Wrds+New)-1, data = dat)
summary(m1)
##
## Call:
## lm(formula = Time ~ Subject/(Sp + Wrds + New) - 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10020 -0.46047 0.06511 0.40878 2.88204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## SubjectS01 -1.8261 2.4659 -0.741 0.46471
## SubjectS02 -3.1898 2.4659 -1.294 0.20569
## SubjectS03 -1.1433 2.4659 -0.464 0.64626
## SubjectS04 -3.3292 2.4659 -1.350 0.18708
## SubjectS05 0.4200 2.4659 0.170 0.86591
## SubjectS06 -7.8901 2.4659 -3.200 0.00324 **
## SubjectS07 -4.2632 2.4659 -1.729 0.09411 .
## SubjectS08 4.2679 2.4659 1.731 0.09376 .
## SubjectS09 -6.0982 2.4659 -2.473 0.01928 *
## SubjectS10 -2.8176 2.4659 -1.143 0.26223
## SubjectS01:Sp 0.2312 0.3424 0.675 0.50457
## SubjectS02:Sp 0.3053 0.3424 0.892 0.37957
## SubjectS03:Sp 0.2064 0.3424 0.603 0.55117
## SubjectS04:Sp 0.4830 0.3424 1.411 0.16859
## SubjectS05:Sp -0.0621 0.3424 -0.181 0.85728
## SubjectS06:Sp 1.1098 0.3424 3.242 0.00291 **
## SubjectS07:Sp 0.2545 0.3424 0.743 0.46308
## SubjectS08:Sp -0.3315 0.3424 -0.968 0.34068
## SubjectS09:Sp 0.6679 0.3424 1.951 0.06048 .
## SubjectS10:Sp 0.4692 0.3424 1.371 0.18069
## SubjectS01:Wrds 0.3910 0.2355 1.661 0.10718
## SubjectS02:Wrds 0.4341 0.2355 1.844 0.07510 .
## SubjectS03:Wrds 0.4036 0.2355 1.714 0.09682 .
## SubjectS04:Wrds 0.5020 0.2355 2.132 0.04130 *
## SubjectS05:Wrds 0.2878 0.2355 1.222 0.23114
## SubjectS06:Wrds 0.8085 0.2355 3.434 0.00176 **
## SubjectS07:Wrds 0.5750 0.2355 2.442 0.02072 *
## SubjectS08:Wrds 0.1134 0.2355 0.482 0.63356
## SubjectS09:Wrds 0.5008 0.2355 2.127 0.04177 *
## SubjectS10:Wrds 0.5696 0.2355 2.419 0.02182 *
## SubjectS01:New 0.2216 0.8857 0.250 0.80414
## SubjectS02:New 0.3464 0.8857 0.391 0.69852
## SubjectS03:New -0.2529 0.8857 -0.286 0.77717
## SubjectS04:New -0.2768 0.8857 -0.313 0.75679
## SubjectS05:New 0.9268 0.8857 1.046 0.30375
## SubjectS06:New -0.2334 0.8857 -0.263 0.79399
## SubjectS07:New 0.7964 0.8857 0.899 0.37572
## SubjectS08:New 0.3312 0.8857 0.374 0.71105
## SubjectS09:New 0.1632 0.8857 0.184 0.85506
## SubjectS10:New -0.5062 0.8857 -0.572 0.57190
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.395 on 30 degrees of freedom
## Multiple R-squared: 0.9668, Adjusted R-squared: 0.9226
## F-statistic: 21.86 on 40 and 30 DF, p-value: 6.582e-14
#Plot
##SP
##1 fit all
SP.0 = ggplot(dat, aes(x = Sp, y = Time, group = Subject, alpha = Subject))+
geom_point()+
geom_line()+
stat_smooth(aes(group = 1), method = "lm", size = .7)+
labs(x = "SP (all)")+
theme_bw()+
theme(legend.position = "NONE")
##1 regression per individual
SP.1 = ggplot(dat, aes(x = Sp, y = Time, group = Subject, alpha = Subject))+
geom_point()+
stat_smooth(method = "lm", se = F, size = .7)+
scale_color_grey()+
labs(x = "SP (per individual)")+
theme_bw()+
theme(legend.position = "NONE")
##Wrds
##1 fit all
Wrd.0 = ggplot(dat, aes(x = Wrds, y = Time, group = Subject, alpha = Subject))+
geom_point()+
geom_line()+
stat_smooth(aes(group = 1), method = "lm", size = .7)+
labs(x = "Wrd (all)")+
theme_bw()+
theme(legend.position = "NONE")
##1 regression per individual
Wrd.1 = ggplot(dat, aes(x = Wrds, y = Time, group = Subject, alpha = Subject))+
geom_point()+
stat_smooth(method = "lm", se = F, size = .7)+
scale_color_grey()+
labs(x = "Wrd (per individual")+
theme_bw()+
theme(legend.position = "NONE")
##New
##1 fit all
New.0 = ggplot(dat, aes(x = New, y = Time, group = Subject, alpha = Subject))+
geom_point()+
geom_line()+
stat_smooth(aes(group = 1), method = "lm", size = .7)+
labs(x = "New (all)")+
theme_bw()+
theme(legend.position = "NONE")
##1 regression per individual
New.1 = ggplot(dat, aes(x = New, y = Time, group = Subject, alpha = Subject))+
geom_point()+
stat_smooth(method = "lm", se = F, size = .7)+
scale_color_grey()+
labs(x = "New (per individual)")+
theme_bw()+
theme(legend.position = "NONE")
#arrange the plots
grid.arrange(SP.0, SP.1, Wrd.0, Wrd.1, New.0, New.1, nrow = 3, ncol = 2)
data("Stevens", package = "alr4")
dat = Stevens
str(dat)
## 'data.frame': 50 obs. of 5 variables:
## $ subject : Factor w/ 10 levels "A","B","C","D",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ loudness: num 50 60 70 80 90 50 60 70 80 90 ...
## $ y : num 0.379 1.727 3.016 3.924 4.413 ...
## $ sd : num 0.507 0.904 0.553 0.363 0.092 0.077 0.287 0.396 0.124 0.068 ...
## $ n : num 3 3 3 3 3 3 3 3 3 3 ...
#ordinal lm
rst.0 = lm(y~loudness, data = dat)
summary(rst.0)
##
## Call:
## lm(formula = y ~ loudness, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26688 -0.31789 -0.00934 0.37067 0.92186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.203770 0.344286 -9.306 2.53e-12 ***
## loudness 0.079013 0.004821 16.389 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4821 on 48 degrees of freedom
## Multiple R-squared: 0.8484, Adjusted R-squared: 0.8452
## F-statistic: 268.6 on 1 and 48 DF, p-value: < 2.2e-16
#weighted lm
rst.1= lm(y~loudness, data = dat, weights = 1/sd^2)
summary(rst.1)
##
## Call:
## lm(formula = y ~ loudness, data = dat, weights = 1/sd^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -7.4279 0.0238 1.2860 2.1889 11.1692
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.859423 0.383589 -7.454 1.48e-09 ***
## loudness 0.069387 0.004892 14.183 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.785 on 48 degrees of freedom
## Multiple R-squared: 0.8073, Adjusted R-squared: 0.8033
## F-statistic: 201.1 on 1 and 48 DF, p-value: < 2.2e-16
#residual of two models
dat %<>% mutate(res.0 = rstandard(rst.0),
res.1 = rstandard(rst.1))
#Plot
ggplot(dat, aes(x = loudness, y = y, group = subject, alpha = subject))+
geom_point(alpha = .5)+
stat_smooth(aes(group = 1),method = "lm", col = "gray", se = F)+ #lm
geom_path(aes(x = loudness, fitted(rst.1)), size = rel(1.0), col = "skyblue")+ #weighted
labs(x = "Loudness (db)", y = "Average log(Length)") +
theme_bw()+
theme(legend.position = "NONE")
#Residuals plot
ggplot(dat, aes(x = loudness, y = res.0, group = subject))+
geom_point(col = "gray", alpha = .5)+ #lm residual
geom_point(aes(x = loudness, res.1), col = "skyblue", alpha = .5)+ #weighted lm residual
geom_hline(yintercept = 0)+
labs(x = "loudness", y = "Standardized residuals")+
theme_bw()
#test residuals of two model
t.test(dat$res.0,dat$res.1)
##
## Welch Two Sample t-test
##
## data: dat$res.0 and dat$res.1
## t = -1.5326, df = 96.786, p-value = 0.1286
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.75396225 0.09692547
## sample estimates:
## mean of x mean of y
## -0.001012561 0.327505827
The two models didn’t perform differently.
data("gapminder", package = "gapminder")
dat = gapminder
dat = dat[dat$country=="Taiwan",]
dat$log.gdp = log(dat$gdpPercap)
summary(dat)
## country continent year lifeExp
## Taiwan :12 Africa : 0 Min. :1952 Min. :58.50
## Afghanistan: 0 Americas: 0 1st Qu.:1966 1st Qu.:66.92
## Albania : 0 Asia :12 Median :1980 Median :71.38
## Algeria : 0 Europe : 0 Mean :1980 Mean :70.34
## Angola : 0 Oceania : 0 3rd Qu.:1993 3rd Qu.:74.51
## Argentina : 0 Max. :2007 Max. :78.40
## (Other) : 0
## pop gdpPercap log.gdp
## Min. : 8550362 Min. : 1207 Min. : 7.096
## 1st Qu.:13216254 1st Qu.: 2439 1st Qu.: 7.787
## Median :17643293 Median : 6511 Median : 8.771
## Mean :16874724 Mean :10225 Mean : 8.736
## 3rd Qu.:20922340 3rd Qu.:16463 3rd Qu.: 9.701
## Max. :23174294 Max. :28718 Max. :10.265
##
dat %<>% mutate(year_52 = year-1952)
##simple lm
summary(m0 <- lm(lifeExp~year_52, data = dat))
##
## Call:
## lm(formula = lifeExp ~ year_52, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8374 -0.7356 0.2114 1.0218 1.5077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.33744 0.71159 86.20 1.08e-15 ***
## year_52 0.32724 0.02192 14.93 3.65e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.31 on 10 degrees of freedom
## Multiple R-squared: 0.9571, Adjusted R-squared: 0.9528
## F-statistic: 222.9 on 1 and 10 DF, p-value: 3.654e-08
##GLS
summary(m1 <-lm(log.gdp ~ year_52,data = dat))
##
## Call:
## lm(formula = log.gdp ~ year_52, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16949 -0.07515 0.02998 0.06870 0.12209
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.036535 0.056316 124.95 < 2e-16 ***
## year_52 0.061786 0.001735 35.62 7.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1037 on 10 degrees of freedom
## Multiple R-squared: 0.9922, Adjusted R-squared: 0.9914
## F-statistic: 1269 on 1 and 10 DF, p-value: 7.216e-12
##Collesct residuals
dat %<>% mutate(res.0 = rstandard(m0),
res.1 = rstandard(m1))
dwtest(res.0~year_52, data = dat)
##
## Durbin-Watson test
##
## data: res.0 ~ year_52
## DW = 0.51333, p-value = 4.727e-05
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(res.1~year_52, data = dat)
##
## Durbin-Watson test
##
## data: res.1 ~ year_52
## DW = 0.81032, p-value = 0.002029
## alternative hypothesis: true autocorrelation is greater than 0
According to the test results, both log.gdp and life expectacy are influenced by time.
The data might be time series.
#bound
ci = 2/sqrt(dim(dat)[1])
##acf for lm residuals on life expectacy
dat %$% acf(res.0, plot = FALSE) %$%
data.frame(lag,acf) %>%
ggplot()+
aes(x = lag, y = acf)+
geom_segment(mapping = aes(xend = lag, yend = 0))+
geom_hline(aes(yintercept = 0))+
geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
ylim(-1,1)+
ggtitle("acf of lm on life expactacy")+
theme_bw()
##acf for lm residauls on log.gdp
dat %$% acf(res.1, plot = FALSE) %$%
data.frame(lag,acf) %>%
ggplot()+
aes(x = lag, y = acf)+
geom_segment(mapping = aes(xend = lag, yend = 0))+
geom_hline(aes(yintercept = 0))+
geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
ylim(-1,1)+
ggtitle("acf of lm on log.gdp")+
theme_bw()
dat = read.table("D:/Multilevel_publish/Multilevel/week3/girlHeights.txt", h = T)
str(dat)
## 'data.frame': 100 obs. of 4 variables:
## $ height: num 111 116 122 126 130 ...
## $ girl : int 1 1 1 1 1 2 2 2 2 2 ...
## $ age : int 6 7 8 9 10 6 7 8 9 10 ...
## $ mom : Factor w/ 3 levels "M","S","T": 2 2 2 2 2 2 2 2 2 2 ...
head(dat)
## height girl age mom
## 1 111.0 1 6 S
## 2 116.4 1 7 S
## 3 121.7 1 8 S
## 4 126.3 1 9 S
## 5 130.5 1 10 S
## 6 110.0 2 6 S
#Correlation matrix
dat[,-4] %>% cor
## height girl age
## height 1.0000000 0.3986156 0.8551367
## girl 0.3986156 1.0000000 0.0000000
## age 0.8551367 0.0000000 1.0000000
# 1 model fit all
summary(m0 <- lm(height~age, data = dat))
##
## Call:
## lm(formula = height ~ age, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1890 -3.6643 -0.8395 3.9479 13.0440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.5240 2.8439 29.02 <2e-16 ***
## age 5.7165 0.3501 16.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.951 on 98 degrees of freedom
## Multiple R-squared: 0.7313, Adjusted R-squared: 0.7285
## F-statistic: 266.7 on 1 and 98 DF, p-value: < 2.2e-16
ggplot(dat, aes(x = age, y = height, alpha = girl))+
geom_point(col = "navy")+
stat_smooth(method = "lm", se = F)+
labs(x = "Age", y = "Height")+
theme_bw()+
theme(legend.position = "NONE")
#autocorrelation
summary(m1 <- gls(height~age, data = dat, corr = corAR1(form = ~1|girl), method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: height ~ age
## Data: dat
## AIC BIC logLik
## 353.3411 363.7618 -172.6705
##
## Correlation Structure: AR(1)
## Formula: ~1 | girl
## Parameter estimate(s):
## Phi
## 0.9781674
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.47456 1.3803440 59.74927 0
## age 5.68379 0.1109941 51.20803 0
##
## Correlation:
## (Intr)
## age -0.643
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.8432405 -0.7004707 -0.1173561 0.8830952 2.7934077
##
## Residual standard error: 4.780948
## Degrees of freedom: 100 total; 98 residual
#unequal variance
summary(m2 <- gls(height ~ age, weights = varIdent(form = ~ 1 | mom), method = "ML", data = dat))
## Generalized least squares fit by maximum likelihood
## Model: height ~ age
## Data: dat
## AIC BIC logLik
## 593.349 606.3749 -291.6745
##
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | mom
## Parameter estimates:
## S M T
## 1.0000000 0.8011027 1.8430778
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.46082 2.324905 35.46847 0
## age 5.54718 0.286176 19.38381 0
##
## Correlation:
## (Intr)
## age -0.985
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.90495116 -0.65254923 0.08002491 0.76101422 2.45121455
##
## Residual standard error: 3.901737
## Degrees of freedom: 100 total; 98 residual
#both
summary(m3 <- gls(height ~ age, corr = corAR1(form = ~ 1 | girl),
weights = varIdent(form = ~ 1 | mom), method = "ML",
data = dat))
## Generalized least squares fit by maximum likelihood
## Model: height ~ age
## Data: dat
## AIC BIC logLik
## 337.597 353.228 -162.7985
##
## Correlation Structure: AR(1)
## Formula: ~1 | girl
## Parameter estimate(s):
## Phi
## 0.9786546
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | mom
## Parameter estimates:
## S M T
## 1.0000000 0.6922724 1.5493547
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.47292 1.1305458 72.94965 0
## age 5.55833 0.0902996 61.55430 0
##
## Correlation:
## (Intr)
## age -0.639
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.77188602 -0.71727072 0.06483833 0.80091331 2.56095242
##
## Residual standard error: 4.264505
## Degrees of freedom: 100 total; 98 residual
#model selection
model.sel(m0,m1,m2,m3)
## Model selection table
## (Intrc) age class correlation weights REML df logLik AICc delta
## m3 82.47 5.558 gls crAR1(girl) vrI(mom) F 6 -162.798 338.5 0.00
## m1 82.47 5.684 gls crAR1(girl) F 4 -172.671 353.8 15.26
## m2 82.46 5.547 gls vrI(mom) F 5 -291.675 594.0 255.49
## m0 82.52 5.716 lm 3 -300.836 607.9 269.42
## weight
## m3 1
## m1 0
## m2 0
## m0 0
## Abbreviations:
## correlation: crAR1(girl) = 'corAR1(~1|girl)'
## weights: vrI(mom) = 'varIdent(~1|mom)'
## REML: F = 'FALSE'
## Models ranked by AICc(x)
According to the results of AIC,
model with autocorrelation and unequal variance correction is the most appropriate
# seed the random number generator to get the same sample
set.seed(6102)
# pick 75 countries from three regions & arrange the rows by alphabetical order
data("UN11", package = "alr4")
dta <- UN11 %>%
filter(region %in% c("Africa", "Asia", "Europe")) %>%
sample_n(72) %>%
arrange(region)
# how many countries in each of the three regions
n3 <- table(dta$region)
n3
##
## Africa Asia Caribbean Europe Latin Amer
## 25 30 0 17 0
## North America NorthAtlantic Oceania
## 0 0 0
# percentage of countries from each region
w <- n3/table(UN11$region)
w
##
## Africa Asia Caribbean Europe Latin Amer
## 0.4716981 0.6000000 0.0000000 0.4358974 0.0000000
## North America NorthAtlantic Oceania
## 0.0000000 0.0000000 0.0000000
# add the weights variable to data
dta$wt <- rep(1/w[w != 0], n3[n3 != 0])
# plot
ggplot(dta, aes(log(ppgdp), fertility, group = region, label = region)) +
geom_text(check_overlap = TRUE, size = rel(2), col = "gray30")+
stat_smooth(aes(group = 1), method = "lm", se = F, col = "gray30") +##The overall regression line
stat_smooth(aes(colour = region), method = "lm", se = F)+ #I slightly changed the code
theme(legend.background = element_rect(linetype=1, colour = "black"))+ #To make the pattern more clear
theme(legend.position=c(.88, .82))+
geom_abline(intercept = coef(m1)[1], slope = coef(m1)[2], col = "firebrick") +
labs(x = "GDP (US$ in log units)", y = "Number of children per woman") +
theme_bw()
According to the different regression line:
In Africa and Asia, richer countries have lower number of children per woman.
In Europe, richer countries have higher number of children per woman.
The effects of economic condition on the birth rates of undeveloped countries and on the birth rates of well-developed contries are different.
data("boys7482", package = "AGD")
dat = boys7482
str(dat)
## 'data.frame': 7482 obs. of 9 variables:
## $ age: num 0.032 0.035 0.035 0.038 0.043 0.046 0.046 0.046 0.049 0.049 ...
## $ hgt: num 53.3 50.8 50.1 53.5 52.1 50.5 NA 52.5 55 52.9 ...
## $ wgt: num 3.76 3.21 3.65 3.37 4.16 ...
## $ bmi: num 13.2 12.4 14.5 11.8 15.3 ...
## $ hc : num 33.6 33.6 33.7 35 36.1 36.6 36.2 38 37.4 35.1 ...
## $ gen: Ord.factor w/ 5 levels "G1"<"G2"<"G3"<..: NA NA NA NA NA NA NA NA NA NA ...
## $ phb: Ord.factor w/ 6 levels "P1"<"P2"<"P3"<..: NA NA NA NA NA NA NA NA NA NA ...
## $ tv : int NA NA NA NA NA NA NA NA NA NA ...
## $ reg: Factor w/ 5 levels "north","east",..: 4 4 4 4 4 3 3 3 3 4 ...
head(dat)
## age hgt wgt bmi hc gen phb tv reg
## 1 0.032 53.3 3.760 13.23 33.6 <NA> <NA> NA south
## 2 0.035 50.8 3.205 12.41 33.6 <NA> <NA> NA south
## 3 0.035 50.1 3.650 14.54 33.7 <NA> <NA> NA south
## 4 0.038 53.5 3.370 11.77 35.0 <NA> <NA> NA south
## 5 0.043 52.1 4.160 15.32 36.1 <NA> <NA> NA south
## 6 0.046 50.5 3.370 13.21 36.6 <NA> <NA> NA west
dat$index = ifelse(dat$age<5, 0 , ifelse(dat$age>13,0,1))
dat = dat[dat$index==1,]
dat$log.bmi = log(dat$bmi)
#quantile
q = 1:9/10
#Plot
ggplot(dat, aes(x = age, y = log.bmi))+
geom_point(alpha = 0.5)+
scale_color_grey()+
geom_quantile(quantiles = q)+
labs(x = "Age", y = "BMI")+
theme_bw()