HW_Regression

library packages

p = c("nlme", "dplyr", "magrittr", "tidyr", "ggplot2", "lmtest", "gridExtra","MuMIn")
lapply(p, library, character.only = TRUE)

Q1

Q1. Expenditure

dat = read.table("D:/Multilevel_publish/Multilevel/week3/P211.txt", h = TRUE)
str(dat)

## 'data.frame':    20 obs. of  4 variables:
##  $ Year       : int  1952 1952 1952 1952 1953 1953 1953 1953 1954 1954 ...
##  $ Quarter    : int  1 2 3 4 1 2 3 4 1 2 ...
##  $ Expenditure: num  215 218 220 227 231 ...
##  $ Stock      : num  159 161 163 165 166 ...

head(dat)

##   Year Quarter Expenditure Stock
## 1 1952       1       214.6 159.3
## 2 1952       2       217.7 161.2
## 3 1952       3       219.6 162.8
## 4 1952       4       227.2 164.6
## 5 1953       1       230.9 165.9
## 6 1953       2       233.3 167.9

summary(dat)

##       Year         Quarter      Expenditure        Stock      
##  Min.   :1952   Min.   :1.00   Min.   :214.6   Min.   :159.3  
##  1st Qu.:1953   1st Qu.:1.75   1st Qu.:231.9   1st Qu.:167.4  
##  Median :1954   Median :2.50   Median :237.6   Median :172.8  
##  Mean   :1954   Mean   :2.50   Mean   :243.5   Mean   :173.1  
##  3rd Qu.:1955   3rd Qu.:3.25   3rd Qu.:261.5   3rd Qu.:180.4  
##  Max.   :1956   Max.   :4.00   Max.   :275.6   Max.   :184.3

dat %<>% mutate(year_52 = Year - 1952)
dat$year_52 = as.integer(dat$year_52)


##simple lm
summary(rst.e <- lm(Expenditure~year_52, data = dat))

## 
## Call:
## lm(formula = Expenditure ~ year_52, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7750 -3.0925 -0.1588  2.4925  8.6600 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 218.5400     1.9658  111.17  < 2e-16 ***
## year_52      12.4675     0.8025   15.54 7.16e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.076 on 18 degrees of freedom
## Multiple R-squared:  0.9306, Adjusted R-squared:  0.9267 
## F-statistic: 241.3 on 1 and 18 DF,  p-value: 7.161e-12

##gls
rst.e1 = gls(Expenditure ~ year_52, correlation = corAR1(form = ~year_52|Quarter), method = "ML",data = dat)
summary(rst.e1)

## Generalized least squares fit by maximum likelihood
##   Model: Expenditure ~ year_52 
##   Data: dat 
##      AIC      BIC logLik
##   126.14 130.1229 -59.07
## 
## Correlation Structure: AR(1)
##  Formula: ~year_52 | Quarter 
##  Parameter estimate(s):
##       Phi 
## 0.2888505 
## 
## Coefficients:
##                 Value Std.Error  t-value p-value
## (Intercept) 218.67959 2.2338875 97.89195       0
## year_52      12.49466 0.8647971 14.44809       0
## 
##  Correlation: 
##         (Intr)
## year_52 -0.774
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.0752090 -0.6897819 -0.0762420  0.4841451  1.7736752 
## 
## Residual standard error: 4.803815 
## Degrees of freedom: 20 total; 18 residual

#Collect Residuals
dat %<>% mutate(res.e = rstandard(rst.e),
                res.e1 = resid(rst.e1, type = "normalized"))
#Test two models 
dwtest(res.e~year_52, data = dat)

## 
##  Durbin-Watson test
## 
## data:  res.e ~ year_52
## DW = 1.2322, p-value = 0.01823
## alternative hypothesis: true autocorrelation is greater than 0

dwtest(res.e1~year_52, data = dat)

## 
##  Durbin-Watson test
## 
## data:  res.e1 ~ year_52
## DW = 1.6463, p-value = 0.1427
## alternative hypothesis: true autocorrelation is greater than 0

#Plot
##regression plots
ggplot(dat, aes(x = year_52, y = Expenditure))+
  geom_point(col = "gray")+
  stat_smooth(method = "lm", se = F, linetype = "dashed")+
  geom_path(aes(year_52, fitted(rst.e1)), size = rel(1.0), col = "gray")+
  labs(x = "Years since 1952", y = "Expenditure")+
  theme_bw()

##residual plots
ggplot(dat, aes(x = year_52, res.e))+
  geom_point(col = "skyblue")+
  geom_point(aes(x = year_52, res.e1), col = "gray")+
  geom_hline(yintercept = 0)+
  labs(x = "Years since 1952", y = "Standardized Residuals")+
  theme_bw()

#ACF Plot
##bound
ci = 2/sqrt(dim(dat)[1])

##acf for lm residuals
dat %$% acf(res.e, plot = FALSE) %$%
  data.frame(lag,acf) %>%
  ggplot()+
  aes(x = lag, y = acf)+
  geom_segment(mapping = aes(xend = lag, yend = 0))+
  geom_hline(aes(yintercept = 0))+
  geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
  geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
  ylim(-1,1)+
  ggtitle("acf of lm")+
  theme_bw()

##acf for gls model residuals
dat %$% acf(res.e1, plot = FALSE) %$%
  data.frame(lag,acf) %>%
  ggplot()+
  aes(x = lag, y = acf)+
  geom_segment(mapping = aes(xend = lag, yend = 0))+
  geom_hline(aes(yintercept = 0))+
  geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
  geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
  ylim(-1,1)+
  ggtitle("acf of gls")+
  theme_bw()

Q1. Stock

##simple lm
summary(rst.s <- lm(Stock~year_52, data = dat))

## 
## Call:
## lm(formula = Stock ~ year_52, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0850 -1.3613  0.2863  1.0356  3.0000 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 162.3850     0.7102  228.64  < 2e-16 ***
## year_52       5.3575     0.2899   18.48 3.76e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.834 on 18 degrees of freedom
## Multiple R-squared:  0.9499, Adjusted R-squared:  0.9471 
## F-statistic: 341.4 on 1 and 18 DF,  p-value: 3.76e-13

##gls
rst.s1 = gls(Stock ~ year_52, correlation = corAR1(form = ~year_52|Quarter), method = "ML",data = dat)
summary(rst.s1)

## Generalized least squares fit by maximum likelihood
##   Model: Stock ~ year_52 
##   Data: dat 
##        AIC      BIC    logLik
##   78.14275 82.12568 -35.07138
## 
## Correlation Structure: AR(1)
##  Formula: ~year_52 | Quarter 
##  Parameter estimate(s):
##       Phi 
## 0.6434015 
## 
## Coefficients:
##                 Value Std.Error   t-value p-value
## (Intercept) 162.29881 0.8879946 182.77005       0
## year_52       5.26436 0.2929325  17.97125       0
## 
##  Correlation: 
##         (Intr)
## year_52 -0.66 
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.7329651 -0.6535811  0.2421401  0.7693828  1.8911093 
## 
## Residual standard error: 1.730449 
## Degrees of freedom: 20 total; 18 residual

#Residual
dat %<>% mutate(res.s = rstandard(rst.s),
                res.s1 = resid(rst.s1, type = "normalized"))
#Test two models
dwtest(res.s~year_52, data = dat)

## 
##  Durbin-Watson test
## 
## data:  res.s ~ year_52
## DW = 1.7441, p-value = 0.2008
## alternative hypothesis: true autocorrelation is greater than 0

dwtest(res.s1~year_52, data = dat)

## 
##  Durbin-Watson test
## 
## data:  res.s1 ~ year_52
## DW = 2.4588, p-value = 0.7927
## alternative hypothesis: true autocorrelation is greater than 0

#Plot
##regression plots
ggplot(dat, aes(x = year_52, y = Stock))+
  geom_point(col = "gray")+
  stat_smooth(method = "lm", se = F, linetype = "dashed")+
  geom_path(aes(year_52, fitted(rst.s1)), size = rel(1.0), col = "gray")+
  labs(x = "Years since 1952", y = "Stock")+
  theme_bw()

##residual plots
ggplot(dat, aes(x = year_52, res.s))+
  geom_point(col = "skyblue")+
  geom_point(aes(x = year_52, res.s1), col = "gray")+
  labs(x = "Years since 1952", y = "Standardized Residuals")+
  geom_hline(yintercept = 0)+
  theme_bw()

#ACF Plot
##bound
ci = 2/sqrt(dim(dat)[1])

##acf for lm residuals
dat %$% acf(res.s, plot = FALSE) %$%
  data.frame(lag,acf) %>%
  ggplot()+
  aes(x = lag, y = acf)+
  geom_segment(mapping = aes(xend = lag, yend = 0))+
  geom_hline(aes(yintercept = 0))+
  geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
  geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
  ylim(-1,1)+
  theme_bw()

##acf for gls model residuals
dat %$% acf(res.s1, plot = FALSE) %$%
  data.frame(lag,acf) %>%
  ggplot()+
  aes(x = lag, y = acf)+
  geom_segment(mapping = aes(xend = lag, yend = 0))+
  geom_hline(aes(yintercept = 0))+
  geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
  geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
  ylim(-1,1)+
  theme_bw()

Q2

dat = read.csv("D:/Multilevel_publish/Multilevel/week3/reading_time.csv", h = TRUE)
str(dat)

## 'data.frame':    70 obs. of  6 variables:
##  $ Snt    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Sp     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Wrds   : int  13 13 13 13 13 13 13 13 13 13 ...
##  $ New    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Subject: Factor w/ 10 levels "S01","S02","S03",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Time   : num  3.43 2.79 4.16 3.07 3.62 ...

head(dat)

##   Snt Sp Wrds New Subject  Time
## 1   1  1   13   1     S01 3.429
## 2   1  1   13   1     S02 2.795
## 3   1  1   13   1     S03 4.161
## 4   1  1   13   1     S04 3.071
## 5   1  1   13   1     S05 3.625
## 6   1  1   13   1     S06 3.161

# 1 model fit all
m0 = lm(Time~Sp+Wrds+New, data = dat)
summary(m0)

## 
## Call:
## lm(formula = Time ~ Sp + Wrds + New, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9261 -0.9403 -0.3808  0.5332  4.9136 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.58695    0.87010  -2.973  0.00411 ** 
## Sp           0.33337    0.12080   2.760  0.00748 ** 
## Wrds         0.45859    0.08308   5.520 6.14e-07 ***
## New          0.15163    0.31254   0.485  0.62917    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.557 on 66 degrees of freedom
## Multiple R-squared:  0.6054, Adjusted R-squared:  0.5874 
## F-statistic: 33.75 on 3 and 66 DF,  p-value: 2.433e-13

# 1 regression model per individual
m1 = lm(Time~Subject/(Sp+Wrds+New)-1, data = dat)
summary(m1)

## 
## Call:
## lm(formula = Time ~ Subject/(Sp + Wrds + New) - 1, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10020 -0.46047  0.06511  0.40878  2.88204 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## SubjectS01       -1.8261     2.4659  -0.741  0.46471   
## SubjectS02       -3.1898     2.4659  -1.294  0.20569   
## SubjectS03       -1.1433     2.4659  -0.464  0.64626   
## SubjectS04       -3.3292     2.4659  -1.350  0.18708   
## SubjectS05        0.4200     2.4659   0.170  0.86591   
## SubjectS06       -7.8901     2.4659  -3.200  0.00324 **
## SubjectS07       -4.2632     2.4659  -1.729  0.09411 . 
## SubjectS08        4.2679     2.4659   1.731  0.09376 . 
## SubjectS09       -6.0982     2.4659  -2.473  0.01928 * 
## SubjectS10       -2.8176     2.4659  -1.143  0.26223   
## SubjectS01:Sp     0.2312     0.3424   0.675  0.50457   
## SubjectS02:Sp     0.3053     0.3424   0.892  0.37957   
## SubjectS03:Sp     0.2064     0.3424   0.603  0.55117   
## SubjectS04:Sp     0.4830     0.3424   1.411  0.16859   
## SubjectS05:Sp    -0.0621     0.3424  -0.181  0.85728   
## SubjectS06:Sp     1.1098     0.3424   3.242  0.00291 **
## SubjectS07:Sp     0.2545     0.3424   0.743  0.46308   
## SubjectS08:Sp    -0.3315     0.3424  -0.968  0.34068   
## SubjectS09:Sp     0.6679     0.3424   1.951  0.06048 . 
## SubjectS10:Sp     0.4692     0.3424   1.371  0.18069   
## SubjectS01:Wrds   0.3910     0.2355   1.661  0.10718   
## SubjectS02:Wrds   0.4341     0.2355   1.844  0.07510 . 
## SubjectS03:Wrds   0.4036     0.2355   1.714  0.09682 . 
## SubjectS04:Wrds   0.5020     0.2355   2.132  0.04130 * 
## SubjectS05:Wrds   0.2878     0.2355   1.222  0.23114   
## SubjectS06:Wrds   0.8085     0.2355   3.434  0.00176 **
## SubjectS07:Wrds   0.5750     0.2355   2.442  0.02072 * 
## SubjectS08:Wrds   0.1134     0.2355   0.482  0.63356   
## SubjectS09:Wrds   0.5008     0.2355   2.127  0.04177 * 
## SubjectS10:Wrds   0.5696     0.2355   2.419  0.02182 * 
## SubjectS01:New    0.2216     0.8857   0.250  0.80414   
## SubjectS02:New    0.3464     0.8857   0.391  0.69852   
## SubjectS03:New   -0.2529     0.8857  -0.286  0.77717   
## SubjectS04:New   -0.2768     0.8857  -0.313  0.75679   
## SubjectS05:New    0.9268     0.8857   1.046  0.30375   
## SubjectS06:New   -0.2334     0.8857  -0.263  0.79399   
## SubjectS07:New    0.7964     0.8857   0.899  0.37572   
## SubjectS08:New    0.3312     0.8857   0.374  0.71105   
## SubjectS09:New    0.1632     0.8857   0.184  0.85506   
## SubjectS10:New   -0.5062     0.8857  -0.572  0.57190   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.395 on 30 degrees of freedom
## Multiple R-squared:  0.9668, Adjusted R-squared:  0.9226 
## F-statistic: 21.86 on 40 and 30 DF,  p-value: 6.582e-14

#Plot
##SP
##1 fit all
SP.0 = ggplot(dat, aes(x = Sp, y = Time, group = Subject, alpha = Subject))+
  geom_point()+
  geom_line()+
  stat_smooth(aes(group = 1), method = "lm", size = .7)+
  labs(x = "SP (all)")+
  theme_bw()+
  theme(legend.position = "NONE")

##1 regression per individual
SP.1 = ggplot(dat, aes(x = Sp, y = Time, group = Subject, alpha = Subject))+
  geom_point()+
  stat_smooth(method = "lm", se = F, size = .7)+
  scale_color_grey()+
  labs(x = "SP (per individual)")+
  theme_bw()+
  theme(legend.position = "NONE")

##Wrds
##1 fit all
Wrd.0 = ggplot(dat, aes(x = Wrds, y = Time, group = Subject, alpha = Subject))+
  geom_point()+
  geom_line()+
  stat_smooth(aes(group = 1), method = "lm", size = .7)+
  labs(x = "Wrd (all)")+
  theme_bw()+
  theme(legend.position = "NONE")

##1 regression per individual
Wrd.1 = ggplot(dat, aes(x = Wrds, y = Time, group = Subject, alpha = Subject))+
  geom_point()+
  stat_smooth(method = "lm", se = F, size = .7)+
  scale_color_grey()+
  labs(x = "Wrd (per individual")+
  theme_bw()+
  theme(legend.position = "NONE")

##New
##1 fit all
New.0 = ggplot(dat, aes(x = New, y = Time, group = Subject, alpha = Subject))+
  geom_point()+
  geom_line()+
  stat_smooth(aes(group = 1), method = "lm", size = .7)+
  labs(x = "New (all)")+
  theme_bw()+
  theme(legend.position = "NONE")

##1 regression per individual
New.1 = ggplot(dat, aes(x = New, y = Time, group = Subject, alpha = Subject))+
  geom_point()+
  stat_smooth(method = "lm", se = F, size = .7)+
  scale_color_grey()+
  labs(x = "New (per individual)")+
  theme_bw()+
  theme(legend.position = "NONE")

#arrange the plots
grid.arrange(SP.0, SP.1, Wrd.0, Wrd.1, New.0, New.1, nrow = 3, ncol = 2)

Q3

data("Stevens", package = "alr4")
dat = Stevens
str(dat)

## 'data.frame':    50 obs. of  5 variables:
##  $ subject : Factor w/ 10 levels "A","B","C","D",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ loudness: num  50 60 70 80 90 50 60 70 80 90 ...
##  $ y       : num  0.379 1.727 3.016 3.924 4.413 ...
##  $ sd      : num  0.507 0.904 0.553 0.363 0.092 0.077 0.287 0.396 0.124 0.068 ...
##  $ n       : num  3 3 3 3 3 3 3 3 3 3 ...

#ordinal lm
rst.0 = lm(y~loudness, data = dat)
summary(rst.0)

## 
## Call:
## lm(formula = y ~ loudness, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26688 -0.31789 -0.00934  0.37067  0.92186 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.203770   0.344286  -9.306 2.53e-12 ***
## loudness     0.079013   0.004821  16.389  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4821 on 48 degrees of freedom
## Multiple R-squared:  0.8484, Adjusted R-squared:  0.8452 
## F-statistic: 268.6 on 1 and 48 DF,  p-value: < 2.2e-16

#weighted lm
rst.1= lm(y~loudness, data = dat, weights = 1/sd^2)
summary(rst.1)

## 
## Call:
## lm(formula = y ~ loudness, data = dat, weights = 1/sd^2)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4279  0.0238  1.2860  2.1889 11.1692 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.859423   0.383589  -7.454 1.48e-09 ***
## loudness     0.069387   0.004892  14.183  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.785 on 48 degrees of freedom
## Multiple R-squared:  0.8073, Adjusted R-squared:  0.8033 
## F-statistic: 201.1 on 1 and 48 DF,  p-value: < 2.2e-16

#residual of two models
dat %<>% mutate(res.0 = rstandard(rst.0),
                res.1 = rstandard(rst.1))

#Plot
ggplot(dat, aes(x = loudness, y = y, group = subject, alpha = subject))+
  geom_point(alpha = .5)+
  stat_smooth(aes(group = 1),method = "lm", col = "gray", se = F)+ #lm
  geom_path(aes(x = loudness, fitted(rst.1)), size = rel(1.0), col = "skyblue")+ #weighted 
  labs(x = "Loudness (db)", y = "Average log(Length)") +
  theme_bw()+
  theme(legend.position = "NONE")

#Residuals plot
ggplot(dat, aes(x = loudness, y = res.0, group = subject))+
  geom_point(col = "gray", alpha = .5)+ #lm residual
  geom_point(aes(x = loudness, res.1), col = "skyblue", alpha = .5)+ #weighted lm residual
  geom_hline(yintercept = 0)+
  labs(x = "loudness", y = "Standardized residuals")+
  theme_bw()

#test residuals of two model
t.test(dat$res.0,dat$res.1)

## 
##  Welch Two Sample t-test
## 
## data:  dat$res.0 and dat$res.1
## t = -1.5326, df = 96.786, p-value = 0.1286
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.75396225  0.09692547
## sample estimates:
##    mean of x    mean of y 
## -0.001012561  0.327505827

The two models didn’t perform differently.

Q4

Conduct a regression analysis to explore data

data("gapminder", package = "gapminder")
dat = gapminder
dat = dat[dat$country=="Taiwan",]
dat$log.gdp = log(dat$gdpPercap)
summary(dat)

##         country      continent       year         lifeExp     
##  Taiwan     :12   Africa  : 0   Min.   :1952   Min.   :58.50  
##  Afghanistan: 0   Americas: 0   1st Qu.:1966   1st Qu.:66.92  
##  Albania    : 0   Asia    :12   Median :1980   Median :71.38  
##  Algeria    : 0   Europe  : 0   Mean   :1980   Mean   :70.34  
##  Angola     : 0   Oceania : 0   3rd Qu.:1993   3rd Qu.:74.51  
##  Argentina  : 0                 Max.   :2007   Max.   :78.40  
##  (Other)    : 0                                               
##       pop             gdpPercap        log.gdp      
##  Min.   : 8550362   Min.   : 1207   Min.   : 7.096  
##  1st Qu.:13216254   1st Qu.: 2439   1st Qu.: 7.787  
##  Median :17643293   Median : 6511   Median : 8.771  
##  Mean   :16874724   Mean   :10225   Mean   : 8.736  
##  3rd Qu.:20922340   3rd Qu.:16463   3rd Qu.: 9.701  
##  Max.   :23174294   Max.   :28718   Max.   :10.265  
##

dat %<>% mutate(year_52 = year-1952)

##simple lm
summary(m0 <- lm(lifeExp~year_52, data = dat))

## 
## Call:
## lm(formula = lifeExp ~ year_52, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8374 -0.7356  0.2114  1.0218  1.5077 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.33744    0.71159   86.20 1.08e-15 ***
## year_52      0.32724    0.02192   14.93 3.65e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.31 on 10 degrees of freedom
## Multiple R-squared:  0.9571, Adjusted R-squared:  0.9528 
## F-statistic: 222.9 on 1 and 10 DF,  p-value: 3.654e-08

##GLS
summary(m1 <-lm(log.gdp ~ year_52,data = dat))

## 
## Call:
## lm(formula = log.gdp ~ year_52, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16949 -0.07515  0.02998  0.06870  0.12209 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.036535   0.056316  124.95  < 2e-16 ***
## year_52     0.061786   0.001735   35.62 7.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1037 on 10 degrees of freedom
## Multiple R-squared:  0.9922, Adjusted R-squared:  0.9914 
## F-statistic:  1269 on 1 and 10 DF,  p-value: 7.216e-12

##Collesct residuals
dat %<>% mutate(res.0 = rstandard(m0),
                res.1 = rstandard(m1))

dwtest(res.0~year_52, data = dat)

## 
##  Durbin-Watson test
## 
## data:  res.0 ~ year_52
## DW = 0.51333, p-value = 4.727e-05
## alternative hypothesis: true autocorrelation is greater than 0

dwtest(res.1~year_52, data = dat)

## 
##  Durbin-Watson test
## 
## data:  res.1 ~ year_52
## DW = 0.81032, p-value = 0.002029
## alternative hypothesis: true autocorrelation is greater than 0

According to the test results, both log.gdp and life expectacy are influenced by time.
The data might be time series.

#bound
ci = 2/sqrt(dim(dat)[1])

##acf for lm residuals on life expectacy
dat %$% acf(res.0, plot = FALSE) %$%
  data.frame(lag,acf) %>%
  ggplot()+
  aes(x = lag, y = acf)+
  geom_segment(mapping = aes(xend = lag, yend = 0))+
  geom_hline(aes(yintercept = 0))+
  geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
  geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
  ylim(-1,1)+
  ggtitle("acf of lm on life expactacy")+
  theme_bw()

##acf for lm residauls on log.gdp
dat %$% acf(res.1, plot = FALSE) %$%
  data.frame(lag,acf) %>%
  ggplot()+
  aes(x = lag, y = acf)+
  geom_segment(mapping = aes(xend = lag, yend = 0))+
  geom_hline(aes(yintercept = 0))+
  geom_hline(aes(yintercept = ci), col = "blue", linetype = "dashed")+
  geom_hline(aes(yintercept = -ci), col = "blue", linetype = "dashed")+
  ylim(-1,1)+
  ggtitle("acf of lm on log.gdp")+
  theme_bw()

Q5

dat = read.table("D:/Multilevel_publish/Multilevel/week3/girlHeights.txt", h = T)
str(dat)

## 'data.frame':    100 obs. of  4 variables:
##  $ height: num  111 116 122 126 130 ...
##  $ girl  : int  1 1 1 1 1 2 2 2 2 2 ...
##  $ age   : int  6 7 8 9 10 6 7 8 9 10 ...
##  $ mom   : Factor w/ 3 levels "M","S","T": 2 2 2 2 2 2 2 2 2 2 ...

head(dat)

##   height girl age mom
## 1  111.0    1   6   S
## 2  116.4    1   7   S
## 3  121.7    1   8   S
## 4  126.3    1   9   S
## 5  130.5    1  10   S
## 6  110.0    2   6   S

#Correlation matrix
dat[,-4] %>% cor

##           height      girl       age
## height 1.0000000 0.3986156 0.8551367
## girl   0.3986156 1.0000000 0.0000000
## age    0.8551367 0.0000000 1.0000000

# 1 model fit all
summary(m0 <- lm(height~age, data = dat))

## 
## Call:
## lm(formula = height ~ age, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1890 -3.6643 -0.8395  3.9479 13.0440 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  82.5240     2.8439   29.02   <2e-16 ***
## age           5.7165     0.3501   16.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.951 on 98 degrees of freedom
## Multiple R-squared:  0.7313, Adjusted R-squared:  0.7285 
## F-statistic: 266.7 on 1 and 98 DF,  p-value: < 2.2e-16

ggplot(dat, aes(x = age, y = height, alpha = girl))+
  geom_point(col = "navy")+
  stat_smooth(method = "lm", se = F)+
  labs(x = "Age", y = "Height")+
  theme_bw()+
  theme(legend.position = "NONE")

#autocorrelation
summary(m1 <- gls(height~age, data = dat, corr = corAR1(form = ~1|girl), method = "ML"))

## Generalized least squares fit by maximum likelihood
##   Model: height ~ age 
##   Data: dat 
##        AIC      BIC    logLik
##   353.3411 363.7618 -172.6705
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | girl 
##  Parameter estimate(s):
##       Phi 
## 0.9781674 
## 
## Coefficients:
##                Value Std.Error  t-value p-value
## (Intercept) 82.47456 1.3803440 59.74927       0
## age          5.68379 0.1109941 51.20803       0
## 
##  Correlation: 
##     (Intr)
## age -0.643
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8432405 -0.7004707 -0.1173561  0.8830952  2.7934077 
## 
## Residual standard error: 4.780948 
## Degrees of freedom: 100 total; 98 residual

#unequal variance
summary(m2 <- gls(height ~ age, weights = varIdent(form = ~ 1 | mom), method = "ML", data = dat))

## Generalized least squares fit by maximum likelihood
##   Model: height ~ age 
##   Data: dat 
##       AIC      BIC    logLik
##   593.349 606.3749 -291.6745
## 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | mom 
##  Parameter estimates:
##         S         M         T 
## 1.0000000 0.8011027 1.8430778 
## 
## Coefficients:
##                Value Std.Error  t-value p-value
## (Intercept) 82.46082  2.324905 35.46847       0
## age          5.54718  0.286176 19.38381       0
## 
##  Correlation: 
##     (Intr)
## age -0.985
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -1.90495116 -0.65254923  0.08002491  0.76101422  2.45121455 
## 
## Residual standard error: 3.901737 
## Degrees of freedom: 100 total; 98 residual

#both
summary(m3 <- gls(height ~ age, corr = corAR1(form = ~ 1 | girl), 
                  weights = varIdent(form = ~ 1 | mom), method = "ML", 
                  data = dat))

## Generalized least squares fit by maximum likelihood
##   Model: height ~ age 
##   Data: dat 
##       AIC     BIC    logLik
##   337.597 353.228 -162.7985
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | girl 
##  Parameter estimate(s):
##       Phi 
## 0.9786546 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | mom 
##  Parameter estimates:
##         S         M         T 
## 1.0000000 0.6922724 1.5493547 
## 
## Coefficients:
##                Value Std.Error  t-value p-value
## (Intercept) 82.47292 1.1305458 72.94965       0
## age          5.55833 0.0902996 61.55430       0
## 
##  Correlation: 
##     (Intr)
## age -0.639
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -1.77188602 -0.71727072  0.06483833  0.80091331  2.56095242 
## 
## Residual standard error: 4.264505 
## Degrees of freedom: 100 total; 98 residual

#model selection
model.sel(m0,m1,m2,m3)

## Model selection table 
##    (Intrc)   age class correlation  weights REML df   logLik  AICc  delta
## m3   82.47 5.558   gls crAR1(girl) vrI(mom)    F  6 -162.798 338.5   0.00
## m1   82.47 5.684   gls crAR1(girl)             F  4 -172.671 353.8  15.26
## m2   82.46 5.547   gls             vrI(mom)    F  5 -291.675 594.0 255.49
## m0   82.52 5.716    lm                            3 -300.836 607.9 269.42
##    weight
## m3      1
## m1      0
## m2      0
## m0      0
## Abbreviations:
## correlation: crAR1(girl) = 'corAR1(~1|girl)'
## weights: vrI(mom) = 'varIdent(~1|mom)'
## REML: F = 'FALSE'
## Models ranked by AICc(x)

According to the results of AIC,
model with autocorrelation and unequal variance correction is the most appropriate

Q6

# seed the random number generator to get the same sample
set.seed(6102)

# pick 75 countries from three regions & arrange the rows by alphabetical order
data("UN11", package = "alr4")
dta <- UN11 %>%
  filter(region %in% c("Africa", "Asia", "Europe")) %>%
  sample_n(72) %>%
  arrange(region)

# how many countries in each of the three regions
n3 <- table(dta$region)
n3

## 
##        Africa          Asia     Caribbean        Europe    Latin Amer 
##            25            30             0            17             0 
## North America NorthAtlantic       Oceania 
##             0             0             0

# percentage of countries from each region
w <- n3/table(UN11$region)
w

## 
##        Africa          Asia     Caribbean        Europe    Latin Amer 
##     0.4716981     0.6000000     0.0000000     0.4358974     0.0000000 
## North America NorthAtlantic       Oceania 
##     0.0000000     0.0000000     0.0000000

# add the weights variable to data
dta$wt <- rep(1/w[w != 0], n3[n3 != 0])

# plot
ggplot(dta, aes(log(ppgdp), fertility, group = region, label = region)) +
  geom_text(check_overlap = TRUE, size = rel(2), col = "gray30")+
  stat_smooth(aes(group = 1), method = "lm", se = F, col = "gray30") +##The overall regression line
  stat_smooth(aes(colour = region), method = "lm", se = F)+               #I slightly changed the code 
  theme(legend.background = element_rect(linetype=1, colour = "black"))+  #To make the pattern more clear
  theme(legend.position=c(.88, .82))+
  geom_abline(intercept = coef(m1)[1], slope = coef(m1)[2], col = "firebrick") +
  labs(x = "GDP (US$ in log units)", y = "Number of children per woman") +
  theme_bw()

According to the different regression line:
In Africa and Asia, richer countries have lower number of children per woman.
In Europe, richer countries have higher number of children per woman.
The effects of economic condition on the birth rates of undeveloped countries and on the birth rates of well-developed contries are different.

Q7

data("boys7482", package = "AGD")
dat = boys7482
str(dat)

## 'data.frame':    7482 obs. of  9 variables:
##  $ age: num  0.032 0.035 0.035 0.038 0.043 0.046 0.046 0.046 0.049 0.049 ...
##  $ hgt: num  53.3 50.8 50.1 53.5 52.1 50.5 NA 52.5 55 52.9 ...
##  $ wgt: num  3.76 3.21 3.65 3.37 4.16 ...
##  $ bmi: num  13.2 12.4 14.5 11.8 15.3 ...
##  $ hc : num  33.6 33.6 33.7 35 36.1 36.6 36.2 38 37.4 35.1 ...
##  $ gen: Ord.factor w/ 5 levels "G1"<"G2"<"G3"<..: NA NA NA NA NA NA NA NA NA NA ...
##  $ phb: Ord.factor w/ 6 levels "P1"<"P2"<"P3"<..: NA NA NA NA NA NA NA NA NA NA ...
##  $ tv : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ reg: Factor w/ 5 levels "north","east",..: 4 4 4 4 4 3 3 3 3 4 ...

head(dat)

##     age  hgt   wgt   bmi   hc  gen  phb tv   reg
## 1 0.032 53.3 3.760 13.23 33.6 <NA> <NA> NA south
## 2 0.035 50.8 3.205 12.41 33.6 <NA> <NA> NA south
## 3 0.035 50.1 3.650 14.54 33.7 <NA> <NA> NA south
## 4 0.038 53.5 3.370 11.77 35.0 <NA> <NA> NA south
## 5 0.043 52.1 4.160 15.32 36.1 <NA> <NA> NA south
## 6 0.046 50.5 3.370 13.21 36.6 <NA> <NA> NA  west

dat$index = ifelse(dat$age<5, 0 , ifelse(dat$age>13,0,1))
dat = dat[dat$index==1,]
dat$log.bmi = log(dat$bmi)

#quantile
q = 1:9/10

#Plot
ggplot(dat, aes(x = age, y = log.bmi))+
  geom_point(alpha = 0.5)+
  scale_color_grey()+
  geom_quantile(quantiles = q)+
  labs(x = "Age", y = "BMI")+
  theme_bw()

HW_Regression

Yu-Han, Luo

2016年10月3日

library packages

Q1

Q1. Expenditure

Q1. Stock

Q2

Q3

Q4

Conduct a regression analysis to explore data

Q5

Q6

Q7