pkgs <- c("nlme", "dplyr", "magrittr", "tidyr", "ggplot2", "lmtest","gridExtra","MuMIn","AGD","gapminder","alr4","broom")
lapply(pkgs, library, character.only = TRUE)
dta1 <- read.table("http://www1.aucegypt.edu/faculty/hadi/RABE5/Data5/P211.txt",header = T)
str(dta1)
## 'data.frame': 20 obs. of 4 variables:
## $ Year : int 1952 1952 1952 1952 1953 1953 1953 1953 1954 1954 ...
## $ Quarter : int 1 2 3 4 1 2 3 4 1 2 ...
## $ Expenditure: num 215 218 220 227 231 ...
## $ Stock : num 159 161 163 165 166 ...
dta1 %<>% mutate(year_order = (Year-1952)*4+Quarter)
#不考慮時序因素
summary(test1 <- lm(Expenditure ~ Stock, data = dta1))
##
## Call:
## lm(formula = Expenditure ~ Stock, data = dta1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.176 -3.396 1.396 2.928 6.361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -154.7192 19.8500 -7.794 3.54e-07 ***
## Stock 2.3004 0.1146 20.080 8.99e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.983 on 18 degrees of freedom
## Multiple R-squared: 0.9573, Adjusted R-squared: 0.9549
## F-statistic: 403.2 on 1 and 18 DF, p-value: 8.988e-14
#考慮時序因素
summary(test2 <- gls(Expenditure ~ Stock, data = dta1,
corr = corAR1(form = ~ year_order), method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: Expenditure ~ Stock
## Data: dta1
## AIC BIC logLik
## 96.18197 100.1649 -44.09099
##
## Correlation Structure: AR(1)
## Formula: ~year_order
## Parameter estimate(s):
## Phi
## 0.8453629
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -156.53702 38.23433 -4.094148 7e-04
## Stock 2.32035 0.22095 10.501554 0e+00
##
## Correlation:
## (Intr)
## Stock -0.998
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.23034538 -1.25090869 -0.06133501 0.33448216 1.12981148
##
## Residual standard error: 3.979913
## Degrees of freedom: 20 total; 18 residual
dta1 %<>%
mutate( testr1 = rstandard(test1)
,testr2 = resid(test2, type = "normalized"))
#不考慮時序因素情形下的DWTEST
dwtest(testr1 ~ Year, data = dta1)
##
## Durbin-Watson test
##
## data: testr1 ~ Year
## DW = 0.32677, p-value = 2.198e-08
## alternative hypothesis: true autocorrelation is greater than 0
#考慮時序因素情形下的DWTEST
dwtest(testr2 ~ Year, data = dta1)
##
## Durbin-Watson test
##
## data: testr2 ~ Year
## DW = 1.5603, p-value = 0.1016
## alternative hypothesis: true autocorrelation is greater than 0
dta2 <- read.csv("reading_time.csv",header = T,sep = ",")
## One model fit all
summary(m0 <- lm(Time ~ Sp+Wrds+New, data = dta2))
##
## Call:
## lm(formula = Time ~ Sp + Wrds + New, data = dta2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9261 -0.9403 -0.3808 0.5332 4.9136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.58695 0.87010 -2.973 0.00411 **
## Sp 0.33337 0.12080 2.760 0.00748 **
## Wrds 0.45859 0.08308 5.520 6.14e-07 ***
## New 0.15163 0.31254 0.485 0.62917
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.557 on 66 degrees of freedom
## Multiple R-squared: 0.6054, Adjusted R-squared: 0.5874
## F-statistic: 33.75 on 3 and 66 DF, p-value: 2.433e-13
## One regression model per subject
dta2 %>% group_by(Subject) %>% do(tidy(lm(Time ~ Sp+Wrds+New, data = .)))
## Source: local data frame [40 x 6]
## Groups: Subject [10]
##
## Subject term estimate std.error statistic p.value
## <fctr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 S01 (Intercept) -1.8261379 1.7403933 -1.0492674 0.37112790
## 2 S01 Sp 0.2312387 0.2416325 0.9569850 0.40917399
## 3 S01 Wrds 0.3910316 0.1661836 2.3530090 0.10003217
## 4 S01 New 0.2216075 0.6251411 0.3544920 0.74642432
## 5 S02 (Intercept) -3.1897537 1.4384071 -2.2175597 0.11330038
## 6 S02 Sp 0.3053308 0.1997054 1.5289063 0.22375883
## 7 S02 Wrds 0.4341488 0.1373481 3.1609374 0.05083437
## 8 S02 New 0.3463725 0.5166691 0.6703952 0.55059813
## 9 S03 (Intercept) -1.1432555 0.7655852 -1.4933093 0.23219603
## 10 S03 Sp 0.2063712 0.1062922 1.9415459 0.14748837
## # ... with 30 more rows
dta3 <- Stevens %>% mutate(length = y) %>% select(-y)
## ordinary regression
summary(m0 <- lm(length ~ loudness, data = dta3))
##
## Call:
## lm(formula = length ~ loudness, data = dta3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26688 -0.31789 -0.00934 0.37067 0.92186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.203770 0.344286 -9.306 2.53e-12 ***
## loudness 0.079013 0.004821 16.389 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4821 on 48 degrees of freedom
## Multiple R-squared: 0.8484, Adjusted R-squared: 0.8452
## F-statistic: 268.6 on 1 and 48 DF, p-value: < 2.2e-16
## weighted regression
summary(m1 <- lm(length ~ loudness, data = dta3, weights = 1/sd^2))
##
## Call:
## lm(formula = length ~ loudness, data = dta3, weights = 1/sd^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -7.4279 0.0238 1.2860 2.1889 11.1692
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.859423 0.383589 -7.454 1.48e-09 ***
## loudness 0.069387 0.004892 14.183 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.785 on 48 degrees of freedom
## Multiple R-squared: 0.8073, Adjusted R-squared: 0.8033
## F-statistic: 201.1 on 1 and 48 DF, p-value: < 2.2e-16
dta4 <- gapminder%>%filter(country == "Taiwan")
dta4 %<>% mutate(year_num = seq(length(year)))
#不考慮時序因素
summary(test1 <- lm(lifeExp ~ log(gdpPercap), data = dta4))
##
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = dta4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2235 -0.5632 0.0362 0.9926 1.6577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.4523 3.3992 7.194 2.95e-05 ***
## log(gdpPercap) 5.2525 0.3862 13.600 8.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.432 on 10 degrees of freedom
## Multiple R-squared: 0.9487, Adjusted R-squared: 0.9436
## F-statistic: 184.9 on 1 and 10 DF, p-value: 8.933e-08
#考慮時序因素
summary(test2 <- gls(lifeExp ~ log(gdpPercap), data = dta4,
corr = corAR1(form = ~ year_num), method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: lifeExp ~ log(gdpPercap)
## Data: dta4
## AIC BIC logLik
## 43.63132 45.57095 -17.81566
##
## Correlation Structure: AR(1)
## Formula: ~year_num
## Parameter estimate(s):
## Phi
## 0.7023494
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 20.23711 5.830042 3.471177 0.006
## log(gdpPercap) 5.68422 0.659933 8.613334 0.000
##
## Correlation:
## (Intr)
## log(gdpPercap) -0.987
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.4204567 -0.3249835 0.2751865 0.9969390 1.6945944
##
## Residual standard error: 1.458322
## Degrees of freedom: 12 total; 10 residual
dta4 %<>%
mutate( testr1 = rstandard(test1)
,testr2 = resid(test2, type = "normalized"))
#不考慮時序因素情形下的DWTEST
dwtest(testr1 ~ year, data = dta4)
##
## Durbin-Watson test
##
## data: testr1 ~ year
## DW = 0.69441, p-value = 0.0006321
## alternative hypothesis: true autocorrelation is greater than 0
#考慮時序因素情形下的DWTEST
dwtest(testr2 ~ year, data = dta4)
##
## Durbin-Watson test
##
## data: testr2 ~ year
## DW = 1.5337, p-value = 0.1079
## alternative hypothesis: true autocorrelation is greater than 0
dta5 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/girlHeights.txt",header = T)
dta5_cor <- dta5%>%select(-mom,-girl)
# get correlation matrix
dta5_cor %>% cor
## height age
## height 1.0000000 0.8551367
## age 0.8551367 1.0000000
# get variance at each occasion
dta5_cor %>% var %>% diag
## height age
## 90.278448 2.020202
#ggplot
ggplot(dta5, aes(age, height, group = girl, alpha = girl)) +
geom_point(col = "navy") +
stat_smooth(method = "lm", se = F, size = rel(.5)) +
labs(x = "Age (scaled)", y = "Height (cm)") +
theme(legend.position = "NONE")
# ordinary regression
summary(m0 <- lm(height ~ age, data = dta5))
##
## Call:
## lm(formula = height ~ age, data = dta5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1890 -3.6643 -0.8395 3.9479 13.0440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.5240 2.8439 29.02 <2e-16 ***
## age 5.7165 0.3501 16.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.951 on 98 degrees of freedom
## Multiple R-squared: 0.7313, Adjusted R-squared: 0.7285
## F-statistic: 266.7 on 1 and 98 DF, p-value: < 2.2e-16
# regression with autocorrelation
summary(m1 <- gls(height ~ age, data = dta5,
corr = corAR1(form = ~ 1 | girl), method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: height ~ age
## Data: dta5
## AIC BIC logLik
## 353.3411 363.7618 -172.6705
##
## Correlation Structure: AR(1)
## Formula: ~1 | girl
## Parameter estimate(s):
## Phi
## 0.9781674
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.47456 1.3803440 59.74927 0
## age 5.68379 0.1109941 51.20803 0
##
## Correlation:
## (Intr)
## age -0.643
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.8432405 -0.7004707 -0.1173561 0.8830952 2.7934077
##
## Residual standard error: 4.780948
## Degrees of freedom: 100 total; 98 residual
# regression with unequal variances for occasions
summary(m2 <- gls(height ~ age, data = dta5,
weights = varIdent(form = ~ 1 | age), method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: height ~ age
## Data: dta5
## AIC BIC logLik
## 612.0471 630.2833 -299.0236
##
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | age
## Parameter estimates:
## 6 7 8 9 10
## 1.000000 1.115971 1.313829 1.405088 1.426068
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 82.22936 2.608213 31.52708 0
## age 5.75145 0.336893 17.07205 0
##
## Correlation:
## (Intr)
## age -0.983
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.7367518 -0.7608716 -0.1776154 0.9034705 2.5619617
##
## Residual standard error: 3.879704
## Degrees of freedom: 100 total; 98 residual
# regression with autocorrelation and unequal variances
summary(m3 <- gls(height ~ age, data = dta5,
weights = varIdent(form = ~ 1 | age),
corr = corAR1(form = ~ 1 | girl), method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: height ~ age
## Data: dta5
## AIC BIC logLik
## 334.7045 355.5459 -159.3523
##
## Correlation Structure: AR(1)
## Formula: ~1 | girl
## Parameter estimate(s):
## Phi
## 0.9886847
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | age
## Parameter estimates:
## 6 7 8 9 10
## 1.000000 1.135107 1.343258 1.410275 1.374144
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 81.43104 0.7226702 112.68077 0
## age 5.53307 0.1141829 48.45793 0
##
## Correlation:
## (Intr)
## age -0.429
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.0585383 -0.2125203 0.3019501 1.2743492 2.6562326
##
## Residual standard error: 4.37343
## Degrees of freedom: 100 total; 98 residual
# model selection with AICc
model.sel(m0, m1, m2, m3)
## Model selection table
## (Intrc) age class correlation weights REML df logLik AICc delta
## m3 81.43 5.533 gls crAR1(girl) vrI(age) F 8 -159.352 336.3 0.00
## m1 82.47 5.684 gls crAR1(girl) F 4 -172.671 353.8 17.48
## m0 82.52 5.716 lm 3 -300.836 607.9 271.63
## m2 82.23 5.751 gls vrI(age) F 7 -299.024 613.3 276.98
## weight
## m3 1
## m1 0
## m0 0
## m2 0
## Abbreviations:
## correlation: crAR1(girl) = 'corAR1(~1|girl)'
## weights: vrI(age) = 'varIdent(~1|age)'
## REML: F = 'FALSE'
## Models ranked by AICc(x)
# seed the random number generator to get the same sample
set.seed(6102)
# pick 75 countries from three regions & arrange the rows by alphabetical order
dta6 <- UN11 %>%
filter(region %in% c("Africa", "Asia", "Europe")) %>%
sample_n(72) %>%
arrange(region)
# how many countries in each of the three regions
n3 <- table(dta6$region)
# percentage of countries from each region
w <- n3/table(UN11$region)
# add the weights variable to data
dta6$wt <- rep(1/w[w != 0], n3[n3 != 0])
# simple regression
summary(m0 <- lm(fertility ~ log(ppgdp), data = dta6))
##
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8790 -0.7033 -0.1431 0.5331 3.0766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.34849 0.65721 11.181 < 2e-16 ***
## log(ppgdp) -0.55949 0.07855 -7.123 7.45e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.059 on 70 degrees of freedom
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4119
## F-statistic: 50.74 on 1 and 70 DF, p-value: 7.451e-10
# weighted regression
summary(m1 <- lm(fertility ~ log(ppgdp), data = dta6, weights = wt))
##
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta6, weights = wt)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.7165 -1.0269 -0.1952 0.7517 4.4589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.43570 0.64741 11.485 < 2e-16 ***
## log(ppgdp) -0.56699 0.07695 -7.368 2.65e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.481 on 70 degrees of freedom
## Multiple R-squared: 0.4368, Adjusted R-squared: 0.4288
## F-statistic: 54.29 on 1 and 70 DF, p-value: 2.65e-10
# plot
ggplot(dta6, aes(log(ppgdp), fertility, group = region, label = region)) +
geom_text(check_overlap = TRUE, size = rel(2), col = "gray30")+
stat_smooth(aes(group = 1), method = "lm", se = F, col = "gray30") +
stat_smooth(method = "lm", se = F)+
geom_abline(intercept = coef(m1)[1], slope = coef(m1)[2], col = "firebrick") +
labs(x = "GDP (US$ in log units)", y = "Number of children per woman") +
theme_bw()
In general, the higher the value of GDP, the lower the number of children parents have. But in Europe, richer country intends to have more children in a family. The effect of GDP on fertility will change by region.
# select age between 5~13
dta7 <- filter(boys7482,boys7482$age >=5 & boys7482$age<=13)
head(dta7)
## age hgt wgt bmi hc gen phb tv reg
## 1 5.015 117.9 21.1 15.17 53.1 <NA> <NA> NA east
## 2 5.081 111.0 17.9 14.52 51.0 <NA> <NA> NA south
## 3 5.130 113.3 19.0 14.80 49.5 <NA> <NA> NA west
## 4 5.180 112.5 20.4 16.11 50.0 <NA> <NA> NA west
## 5 5.196 111.1 20.9 16.93 54.0 <NA> <NA> NA east
## 6 5.204 122.9 22.2 14.69 54.5 <NA> <NA> NA west
qs <- 1:9/10
ggplot(dta7, aes(age, log(bmi))) +
geom_point(alpha=.5) +
geom_quantile(quantiles = qs)+
labs(x = "Age", y = "BMI") +
theme_bw()
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Smoothing formula not specified. Using: y ~ x