Library packages

pkgs <- c("nlme", "dplyr", "magrittr", "tidyr", "ggplot2", "lmtest","gridExtra","MuMIn","AGD","gapminder","alr4","broom")
lapply(pkgs, library, character.only = TRUE)

Question 1

dta1 <- read.table("http://www1.aucegypt.edu/faculty/hadi/RABE5/Data5/P211.txt",header = T)
str(dta1)
## 'data.frame':    20 obs. of  4 variables:
##  $ Year       : int  1952 1952 1952 1952 1953 1953 1953 1953 1954 1954 ...
##  $ Quarter    : int  1 2 3 4 1 2 3 4 1 2 ...
##  $ Expenditure: num  215 218 220 227 231 ...
##  $ Stock      : num  159 161 163 165 166 ...
dta1 %<>% mutate(year_order = (Year-1952)*4+Quarter)
#不考慮時序因素
summary(test1 <- lm(Expenditure ~ Stock, data = dta1))
## 
## Call:
## lm(formula = Expenditure ~ Stock, data = dta1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.176 -3.396  1.396  2.928  6.361 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -154.7192    19.8500  -7.794 3.54e-07 ***
## Stock          2.3004     0.1146  20.080 8.99e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.983 on 18 degrees of freedom
## Multiple R-squared:  0.9573, Adjusted R-squared:  0.9549 
## F-statistic: 403.2 on 1 and 18 DF,  p-value: 8.988e-14
#考慮時序因素
summary(test2 <- gls(Expenditure ~ Stock, data = dta1,
                     corr = corAR1(form = ~ year_order), method = "ML"))
## Generalized least squares fit by maximum likelihood
##   Model: Expenditure ~ Stock 
##   Data: dta1 
##        AIC      BIC    logLik
##   96.18197 100.1649 -44.09099
## 
## Correlation Structure: AR(1)
##  Formula: ~year_order 
##  Parameter estimate(s):
##       Phi 
## 0.8453629 
## 
## Coefficients:
##                  Value Std.Error   t-value p-value
## (Intercept) -156.53702  38.23433 -4.094148   7e-04
## Stock          2.32035   0.22095 10.501554   0e+00
## 
##  Correlation: 
##       (Intr)
## Stock -0.998
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.23034538 -1.25090869 -0.06133501  0.33448216  1.12981148 
## 
## Residual standard error: 3.979913 
## Degrees of freedom: 20 total; 18 residual
dta1 %<>%
  mutate( testr1 = rstandard(test1)
          ,testr2 = resid(test2, type = "normalized"))
#不考慮時序因素情形下的DWTEST
dwtest(testr1 ~ Year, data = dta1)
## 
##  Durbin-Watson test
## 
## data:  testr1 ~ Year
## DW = 0.32677, p-value = 2.198e-08
## alternative hypothesis: true autocorrelation is greater than 0
#考慮時序因素情形下的DWTEST
dwtest(testr2 ~ Year, data = dta1)
## 
##  Durbin-Watson test
## 
## data:  testr2 ~ Year
## DW = 1.5603, p-value = 0.1016
## alternative hypothesis: true autocorrelation is greater than 0

Question 2

dta2 <- read.csv("reading_time.csv",header = T,sep = ",")
## One model fit all
summary(m0 <- lm(Time ~ Sp+Wrds+New, data = dta2))
## 
## Call:
## lm(formula = Time ~ Sp + Wrds + New, data = dta2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9261 -0.9403 -0.3808  0.5332  4.9136 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.58695    0.87010  -2.973  0.00411 ** 
## Sp           0.33337    0.12080   2.760  0.00748 ** 
## Wrds         0.45859    0.08308   5.520 6.14e-07 ***
## New          0.15163    0.31254   0.485  0.62917    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.557 on 66 degrees of freedom
## Multiple R-squared:  0.6054, Adjusted R-squared:  0.5874 
## F-statistic: 33.75 on 3 and 66 DF,  p-value: 2.433e-13
## One regression model per subject
dta2 %>% group_by(Subject) %>% do(tidy(lm(Time ~ Sp+Wrds+New, data = .)))
## Source: local data frame [40 x 6]
## Groups: Subject [10]
## 
##    Subject        term   estimate std.error  statistic    p.value
##     <fctr>       <chr>      <dbl>     <dbl>      <dbl>      <dbl>
## 1      S01 (Intercept) -1.8261379 1.7403933 -1.0492674 0.37112790
## 2      S01          Sp  0.2312387 0.2416325  0.9569850 0.40917399
## 3      S01        Wrds  0.3910316 0.1661836  2.3530090 0.10003217
## 4      S01         New  0.2216075 0.6251411  0.3544920 0.74642432
## 5      S02 (Intercept) -3.1897537 1.4384071 -2.2175597 0.11330038
## 6      S02          Sp  0.3053308 0.1997054  1.5289063 0.22375883
## 7      S02        Wrds  0.4341488 0.1373481  3.1609374 0.05083437
## 8      S02         New  0.3463725 0.5166691  0.6703952 0.55059813
## 9      S03 (Intercept) -1.1432555 0.7655852 -1.4933093 0.23219603
## 10     S03          Sp  0.2063712 0.1062922  1.9415459 0.14748837
## # ... with 30 more rows

Question 3

dta3 <- Stevens %>% mutate(length = y) %>% select(-y)
## ordinary regression
summary(m0 <- lm(length ~ loudness, data = dta3))
## 
## Call:
## lm(formula = length ~ loudness, data = dta3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26688 -0.31789 -0.00934  0.37067  0.92186 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.203770   0.344286  -9.306 2.53e-12 ***
## loudness     0.079013   0.004821  16.389  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4821 on 48 degrees of freedom
## Multiple R-squared:  0.8484, Adjusted R-squared:  0.8452 
## F-statistic: 268.6 on 1 and 48 DF,  p-value: < 2.2e-16
## weighted regression
summary(m1 <- lm(length ~ loudness, data = dta3, weights = 1/sd^2))
## 
## Call:
## lm(formula = length ~ loudness, data = dta3, weights = 1/sd^2)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4279  0.0238  1.2860  2.1889 11.1692 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.859423   0.383589  -7.454 1.48e-09 ***
## loudness     0.069387   0.004892  14.183  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.785 on 48 degrees of freedom
## Multiple R-squared:  0.8073, Adjusted R-squared:  0.8033 
## F-statistic: 201.1 on 1 and 48 DF,  p-value: < 2.2e-16

Question 4

dta4 <- gapminder%>%filter(country == "Taiwan")
dta4 %<>% mutate(year_num = seq(length(year)))
#不考慮時序因素
summary(test1 <- lm(lifeExp ~ log(gdpPercap), data = dta4))
## 
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = dta4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2235 -0.5632  0.0362  0.9926  1.6577 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     24.4523     3.3992   7.194 2.95e-05 ***
## log(gdpPercap)   5.2525     0.3862  13.600 8.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.432 on 10 degrees of freedom
## Multiple R-squared:  0.9487, Adjusted R-squared:  0.9436 
## F-statistic: 184.9 on 1 and 10 DF,  p-value: 8.933e-08
#考慮時序因素
summary(test2 <- gls(lifeExp ~ log(gdpPercap), data = dta4,
                  corr = corAR1(form = ~ year_num), method = "ML"))
## Generalized least squares fit by maximum likelihood
##   Model: lifeExp ~ log(gdpPercap) 
##   Data: dta4 
##        AIC      BIC    logLik
##   43.63132 45.57095 -17.81566
## 
## Correlation Structure: AR(1)
##  Formula: ~year_num 
##  Parameter estimate(s):
##       Phi 
## 0.7023494 
## 
## Coefficients:
##                   Value Std.Error  t-value p-value
## (Intercept)    20.23711  5.830042 3.471177   0.006
## log(gdpPercap)  5.68422  0.659933 8.613334   0.000
## 
##  Correlation: 
##                (Intr)
## log(gdpPercap) -0.987
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.4204567 -0.3249835  0.2751865  0.9969390  1.6945944 
## 
## Residual standard error: 1.458322 
## Degrees of freedom: 12 total; 10 residual
dta4 %<>%
  mutate( testr1 = rstandard(test1)
          ,testr2 = resid(test2, type = "normalized"))
#不考慮時序因素情形下的DWTEST
dwtest(testr1 ~ year, data = dta4)
## 
##  Durbin-Watson test
## 
## data:  testr1 ~ year
## DW = 0.69441, p-value = 0.0006321
## alternative hypothesis: true autocorrelation is greater than 0
#考慮時序因素情形下的DWTEST
dwtest(testr2 ~ year, data = dta4)
## 
##  Durbin-Watson test
## 
## data:  testr2 ~ year
## DW = 1.5337, p-value = 0.1079
## alternative hypothesis: true autocorrelation is greater than 0

Question 5

dta5 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/girlHeights.txt",header = T)
dta5_cor <- dta5%>%select(-mom,-girl)
# get correlation matrix
dta5_cor %>% cor
##           height       age
## height 1.0000000 0.8551367
## age    0.8551367 1.0000000
# get variance at each occasion
dta5_cor %>% var %>% diag
##    height       age 
## 90.278448  2.020202
#ggplot
ggplot(dta5, aes(age, height, group = girl, alpha = girl)) +
  geom_point(col = "navy") +
  stat_smooth(method = "lm", se = F, size = rel(.5)) +
  labs(x = "Age (scaled)", y = "Height (cm)") +
  theme(legend.position = "NONE")

# ordinary regression
summary(m0 <- lm(height ~ age, data = dta5))
## 
## Call:
## lm(formula = height ~ age, data = dta5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1890 -3.6643 -0.8395  3.9479 13.0440 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  82.5240     2.8439   29.02   <2e-16 ***
## age           5.7165     0.3501   16.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.951 on 98 degrees of freedom
## Multiple R-squared:  0.7313, Adjusted R-squared:  0.7285 
## F-statistic: 266.7 on 1 and 98 DF,  p-value: < 2.2e-16
# regression with autocorrelation
summary(m1 <- gls(height ~ age, data = dta5,
                  corr = corAR1(form = ~ 1 | girl), method = "ML"))
## Generalized least squares fit by maximum likelihood
##   Model: height ~ age 
##   Data: dta5 
##        AIC      BIC    logLik
##   353.3411 363.7618 -172.6705
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | girl 
##  Parameter estimate(s):
##       Phi 
## 0.9781674 
## 
## Coefficients:
##                Value Std.Error  t-value p-value
## (Intercept) 82.47456 1.3803440 59.74927       0
## age          5.68379 0.1109941 51.20803       0
## 
##  Correlation: 
##     (Intr)
## age -0.643
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8432405 -0.7004707 -0.1173561  0.8830952  2.7934077 
## 
## Residual standard error: 4.780948 
## Degrees of freedom: 100 total; 98 residual
# regression with unequal variances for occasions
summary(m2 <- gls(height ~ age, data = dta5,
                  weights = varIdent(form = ~ 1 | age), method = "ML"))
## Generalized least squares fit by maximum likelihood
##   Model: height ~ age 
##   Data: dta5 
##        AIC      BIC    logLik
##   612.0471 630.2833 -299.0236
## 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | age 
##  Parameter estimates:
##        6        7        8        9       10 
## 1.000000 1.115971 1.313829 1.405088 1.426068 
## 
## Coefficients:
##                Value Std.Error  t-value p-value
## (Intercept) 82.22936  2.608213 31.52708       0
## age          5.75145  0.336893 17.07205       0
## 
##  Correlation: 
##     (Intr)
## age -0.983
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.7367518 -0.7608716 -0.1776154  0.9034705  2.5619617 
## 
## Residual standard error: 3.879704 
## Degrees of freedom: 100 total; 98 residual
# regression with autocorrelation and unequal variances
summary(m3 <- gls(height ~ age, data = dta5,
                  weights = varIdent(form = ~ 1 | age),
                  corr = corAR1(form = ~ 1 | girl), method = "ML"))
## Generalized least squares fit by maximum likelihood
##   Model: height ~ age 
##   Data: dta5 
##        AIC      BIC    logLik
##   334.7045 355.5459 -159.3523
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | girl 
##  Parameter estimate(s):
##       Phi 
## 0.9886847 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | age 
##  Parameter estimates:
##        6        7        8        9       10 
## 1.000000 1.135107 1.343258 1.410275 1.374144 
## 
## Coefficients:
##                Value Std.Error   t-value p-value
## (Intercept) 81.43104 0.7226702 112.68077       0
## age          5.53307 0.1141829  48.45793       0
## 
##  Correlation: 
##     (Intr)
## age -0.429
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.0585383 -0.2125203  0.3019501  1.2743492  2.6562326 
## 
## Residual standard error: 4.37343 
## Degrees of freedom: 100 total; 98 residual
# model selection with AICc
model.sel(m0, m1, m2, m3)
## Model selection table 
##    (Intrc)   age class correlation  weights REML df   logLik  AICc  delta
## m3   81.43 5.533   gls crAR1(girl) vrI(age)    F  8 -159.352 336.3   0.00
## m1   82.47 5.684   gls crAR1(girl)             F  4 -172.671 353.8  17.48
## m0   82.52 5.716    lm                            3 -300.836 607.9 271.63
## m2   82.23 5.751   gls             vrI(age)    F  7 -299.024 613.3 276.98
##    weight
## m3      1
## m1      0
## m0      0
## m2      0
## Abbreviations:
## correlation: crAR1(girl) = 'corAR1(~1|girl)'
## weights: vrI(age) = 'varIdent(~1|age)'
## REML: F = 'FALSE'
## Models ranked by AICc(x)

Question 6

# seed the random number generator to get the same sample
set.seed(6102)

# pick 75 countries from three regions & arrange the rows by alphabetical order
dta6 <- UN11 %>%
  filter(region %in% c("Africa", "Asia", "Europe")) %>%
  sample_n(72) %>%
  arrange(region)

# how many countries in each of the three regions
n3 <- table(dta6$region)

# percentage of countries from each region
w <- n3/table(UN11$region)

# add the weights variable to data
dta6$wt <- rep(1/w[w != 0], n3[n3 != 0])

# simple regression
summary(m0 <- lm(fertility ~ log(ppgdp), data = dta6))
## 
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8790 -0.7033 -0.1431  0.5331  3.0766 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.34849    0.65721  11.181  < 2e-16 ***
## log(ppgdp)  -0.55949    0.07855  -7.123 7.45e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.059 on 70 degrees of freedom
## Multiple R-squared:  0.4202, Adjusted R-squared:  0.4119 
## F-statistic: 50.74 on 1 and 70 DF,  p-value: 7.451e-10
# weighted regression
summary(m1 <- lm(fertility ~ log(ppgdp), data = dta6, weights = wt))
## 
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta6, weights = wt)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7165 -1.0269 -0.1952  0.7517  4.4589 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.43570    0.64741  11.485  < 2e-16 ***
## log(ppgdp)  -0.56699    0.07695  -7.368 2.65e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.481 on 70 degrees of freedom
## Multiple R-squared:  0.4368, Adjusted R-squared:  0.4288 
## F-statistic: 54.29 on 1 and 70 DF,  p-value: 2.65e-10
# plot
ggplot(dta6, aes(log(ppgdp), fertility, group = region, label = region)) +
  geom_text(check_overlap = TRUE, size = rel(2), col = "gray30")+
  stat_smooth(aes(group = 1), method = "lm", se = F, col = "gray30") +
  stat_smooth(method = "lm", se = F)+
  geom_abline(intercept = coef(m1)[1], slope = coef(m1)[2], col = "firebrick") +
  labs(x = "GDP (US$ in log units)", y = "Number of children per woman") +
  theme_bw()

In general, the higher the value of GDP, the lower the number of children parents have. But in Europe, richer country intends to have more children in a family. The effect of GDP on fertility will change by region.

Question 7

# select age between 5~13
dta7 <- filter(boys7482,boys7482$age >=5 & boys7482$age<=13)

head(dta7)
##     age   hgt  wgt   bmi   hc  gen  phb tv   reg
## 1 5.015 117.9 21.1 15.17 53.1 <NA> <NA> NA  east
## 2 5.081 111.0 17.9 14.52 51.0 <NA> <NA> NA south
## 3 5.130 113.3 19.0 14.80 49.5 <NA> <NA> NA  west
## 4 5.180 112.5 20.4 16.11 50.0 <NA> <NA> NA  west
## 5 5.196 111.1 20.9 16.93 54.0 <NA> <NA> NA  east
## 6 5.204 122.9 22.2 14.69 54.5 <NA> <NA> NA  west
qs <- 1:9/10
ggplot(dta7, aes(age, log(bmi))) + 
  geom_point(alpha=.5) + 
  geom_quantile(quantiles = qs)+
  labs(x = "Age", y = "BMI") +
  theme_bw()
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## Smoothing formula not specified. Using: y ~ x