1 Exercise 1

Complete the regression analysis of the blood pressure data from the Framingham Heart Study in the R script . In there a gender difference in regression slopes?

1.1 file location

fL<-"https://math.montana.edu/shancock/data/Framingham.txt"
dta1 <- read.table(fL, header=T)

1.2 recode gender variable

dta1$sex <- factor(dta1$sex, 
                  levels=c(1, 2), 
                  labels=c("M", "F")) 

1.3 lattice plot

library(lattice)
xyplot(sbp ~ dbp | sex, 
       data=dta1, cex=.5,
       type=c("p","g","r"),
       xlab="Diastolic pressure (mmHg)",
       ylab="Systolic pressure (mmHg)")

1.4 regression analysis

m0 <- lm(sbp ~ dbp, data=dta1)
m1 <- lm(sbp ~ dbp + sex, data=dta1)
m2 <- lm(sbp ~ dbp + sex + sex:dbp, data=dta1)
anova(m0, m1, m2)
Analysis of Variance Table

Model 1: sbp ~ dbp
Model 2: sbp ~ dbp + sex
Model 3: sbp ~ dbp + sex + sex:dbp
  Res.Df    RSS Df Sum of Sq    F Pr(>F)
1   4697 941778                         
2   4696 927853  1     13925 71.8 <2e-16
3   4695 910543  1     17310 89.3 <2e-16

根據上述迴歸分析結果,性別因子對血壓模型有顯著影響,且性別與舒張壓間有交互作用。

(補充)

血壓模型 (m0) 加入性別因子比較 (m1)

anova(m0, m1)
Analysis of Variance Table

Model 1: sbp ~ dbp
Model 2: sbp ~ dbp + sex
  Res.Df    RSS Df Sum of Sq    F Pr(>F)
1   4697 941778                         
2   4696 927853  1     13925 70.5 <2e-16

加入與舒張壓有交互作用的性別因子模型 (m0 vs m2)

anova(m0,m2)
Analysis of Variance Table

Model 1: sbp ~ dbp
Model 2: sbp ~ dbp + sex + sex:dbp
  Res.Df    RSS Df Sum of Sq    F Pr(>F)
1   4697 941778                         
2   4695 910543  2     31235 80.5 <2e-16

一次比m0, m1, m2三個models,是一層一層比(m1比m0, m2比m1)

plot(rstandard(m2) ~ fitted(m2), 
     cex=.5, 
     xlab="Fitted values", 
     ylab="Standardized residuals")
grid()

m2 model的殘差 pattern,應為 random,若有 pattern 或有聚類,表示 model 有問題。

1.5 The end

2 Exercise 2

The standard deviation of length estimates of each subject is available in the loudness data example. Perform a weighted regression and compare the results against an ordinary regression of length onto loudness.

2.1 regression with clusters

options(digits=5, show.signif.stars=FALSE)

2.1.1 load the packages

# install.packages("pacman")
pacman::p_load(alr4, tidyverse, magrittr)

2.1.2 make sure data set is active

data(Stevens, package="alr4")

2.1.3 save a copy

dta2 <- Stevens %>% 
 rename(Length=y, Loudness=loudness, Subject=subject)
ot <- theme_set(theme_bw())

2.1.4 1 regression fits all

ggplot(dta2, aes(Loudness, Length, group = Subject, alpha = Subject))+
 geom_point() +
 geom_line() +
 stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

2.1.5 one model fits all

m0 <- lm(Length ~ Loudness, data = dta2)

2.1.6 tidy summary model output

broom::tidy(summary(m0))
# A tibble: 2 x 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)  -3.20     0.344       -9.31 2.53e-12
2 Loudness      0.0790   0.00482     16.4  2.69e-21

2.1.7 1 regression model per individual

ggplot(dta2, aes(Loudness, Length, group = Subject, color = Subject))+
 geom_point() +
 stat_smooth(method = "lm", se = F, size = rel(.5)) +
 scale_color_grey()+
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

2.2 fancy way of conducting 1 regression model per subject

2.2.1 grouped by subject before doing regression

dta2 %>% 
 group_by(Subject) %>% 
 do(broom::tidy(lm(Length ~ Loudness, data = .)))
# A tibble: 20 x 6
# Groups:   Subject [10]
   Subject term        estimate std.error statistic  p.value
   <fct>   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
 1 A       (Intercept)  -4.49     0.748       -6.01 0.00923 
 2 A       Loudness      0.103    0.0105       9.80 0.00225 
 3 B       (Intercept)  -4.59     0.462       -9.92 0.00218 
 4 B       Loudness      0.0935   0.00647     14.5  0.000718
 5 C       (Intercept)  -2.29     0.253       -9.05 0.00285 
 6 C       Loudness      0.0695   0.00355     19.6  0.000291
 7 D       (Intercept)  -2.92     0.338       -8.63 0.00327 
 8 D       Loudness      0.0751   0.00474     15.8  0.000547
 9 E       (Intercept)  -5.39     0.743       -7.24 0.00542 
10 E       Loudness      0.104    0.0104       9.98 0.00214 
11 F       (Intercept)  -2.42     0.523       -4.63 0.0190  
12 F       Loudness      0.0774   0.00732     10.6  0.00181 
13 G       (Intercept)  -2.81     0.312       -9.01 0.00288 
14 G       Loudness      0.0650   0.00437     14.9  0.000659
15 H       (Intercept)  -3.42     0.401       -8.53 0.00338 
16 H       Loudness      0.0822   0.00561     14.6  0.000691
17 I       (Intercept)  -1.95     0.512       -3.80 0.0319  
18 I       Loudness      0.0656   0.00718      9.14 0.00277 
19 J       (Intercept)  -1.76     0.265       -6.63 0.00699 
20 J       Loudness      0.0552   0.00372     14.9  0.000660

2.2.2 fitting a model treating subject as fixed effects

summary(m1 <- lm(Length ~ Subject/Loudness - 1, data = dta2)) 

Call:
lm(formula = Length ~ Subject/Loudness - 1, data = dta2)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3318 -0.1073  0.0023  0.1485  0.3242 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)
SubjectA          -4.49370    0.48664   -9.23  2.8e-10
SubjectB          -4.58530    0.48664   -9.42  1.8e-10
SubjectC          -2.29360    0.48664   -4.71  5.2e-05
SubjectD          -2.91940    0.48664   -6.00  1.4e-06
SubjectE          -5.38510    0.48664  -11.07  4.1e-12
SubjectF          -2.42120    0.48664   -4.98  2.5e-05
SubjectG          -2.81090    0.48664   -5.78  2.6e-06
SubjectH          -3.42070    0.48664   -7.03  8.2e-08
SubjectI          -1.94870    0.48664   -4.00  0.00038
SubjectJ          -1.75910    0.48664   -3.61  0.00109
SubjectA:Loudness  0.10265    0.00681   15.06  1.6e-15
SubjectB:Loudness  0.09355    0.00681   13.73  1.8e-14
SubjectC:Loudness  0.06950    0.00681   10.20  2.9e-11
SubjectD:Loudness  0.07506    0.00681   11.02  4.6e-12
SubjectE:Loudness  0.10391    0.00681   15.25  1.1e-15
SubjectF:Loudness  0.07740    0.00681   11.36  2.2e-12
SubjectG:Loudness  0.06497    0.00681    9.53  1.4e-10
SubjectH:Loudness  0.08223    0.00681   12.07  4.9e-13
SubjectI:Loudness  0.06561    0.00681    9.63  1.1e-10
SubjectJ:Loudness  0.05525    0.00681    8.11  4.7e-09

Residual standard error: 0.215 on 30 degrees of freedom
Multiple R-squared:  0.996, Adjusted R-squared:  0.993 
F-statistic:  369 on 20 and 30 DF,  p-value: <2e-16

2.2.3 augument data with model fit

fit1 <- fitted(m1)
dta2 %>% mutate(fit1)
   Subject Loudness Length    sd n    fit1
1        A       50  0.379 0.507 3  0.6388
2        A       60  1.727 0.904 3  1.6653
3        A       70  3.016 0.553 3  2.6918
4        A       80  3.924 0.363 3  3.7183
5        A       90  4.413 0.092 3  4.7448
6        B       50  0.260 0.077 3  0.0922
7        B       60  0.833 0.287 3  1.0277
8        B       70  2.009 0.396 3  1.9632
9        B       80  2.720 0.124 3  2.8987
10       B       90  3.994 0.068 3  3.8342
11       C       50  1.072 0.106 3  1.1814
12       C       60  1.942 0.324 3  1.8764
13       C       70  2.700 0.319 3  2.5714
14       C       80  3.250 0.137 3  3.2664
15       C       90  3.893 0.356 3  3.9614
16       D       50  0.697 0.288 3  0.8336
17       D       60  1.787 0.121 3  1.5842
18       D       70  2.283 0.139 3  2.3348
19       D       80  3.127 0.331 3  3.0854
20       D       90  3.780 0.314 3  3.8360
21       E       50 -0.520 0.168 3 -0.1896
22       E       60  1.124 0.550 3  0.8495
23       E       70  2.045 0.315 3  1.8886
24       E       80  3.113 0.247 3  2.9277
25       E       90  3.681 0.217 3  3.9668
26       F       50  1.348 0.842 3  1.4488
27       F       60  2.134 0.130 3  2.2228
28       F       70  3.249 0.626 3  2.9968
29       F       80  3.936 0.181 3  3.7708
30       F       90  4.317 0.107 3  4.5448
31       G       50  0.485 0.272 3  0.4376
32       G       60  1.158 0.413 3  1.0873
33       G       70  1.585 0.240 3  1.7370
34       G       80  2.289 0.102 3  2.3867
35       G       90  3.168 0.118 3  3.0364
36       H       50  0.682 0.189 3  0.6908
37       H       60  1.684 0.231 3  1.5131
38       H       70  2.090 0.540 3  2.3354
39       H       80  3.171 0.449 3  3.1577
40       H       90  4.050 0.309 3  3.9800
41       I       50  1.201 0.089 3  1.3318
42       I       60  2.142 0.514 3  1.9879
43       I       70  2.543 0.821 3  2.6440
44       I       80  3.563 0.283 3  3.3001
45       I       90  3.771 0.378 3  3.9562
46       J       50  1.111 0.174 3  1.0034
47       J       60  1.500 0.112 3  1.5559
48       J       70  2.010 0.555 3  2.1084
49       J       80  2.595 0.013 3  2.6609
50       J       90  3.326 0.289 3  3.2134

(註記)

找殘差用 residuals()

fitted value 配適值( 符合模型的y值 ) 與殘差

residuals(m1)
      1       2       3       4       5       6       7       8       9 
-0.2598  0.0617  0.3242  0.2057 -0.3318  0.1678 -0.1947  0.0458 -0.1787 
     10      11      12      13      14      15      16      17      18 
 0.1598 -0.1094  0.0656  0.1286 -0.0164 -0.0684 -0.1366  0.2028 -0.0518 
     19      20      21      22      23      24      25      26      27 
 0.0416 -0.0560 -0.3304  0.2745  0.1564  0.1853 -0.2858 -0.1008 -0.0888 
     28      29      30      31      32      33      34      35      36 
 0.2522  0.1652 -0.2278  0.0474  0.0707 -0.1520 -0.0977  0.1316 -0.0088 
     37      38      39      40      41      42      43      44      45 
 0.1709 -0.2454  0.0133  0.0700 -0.1308  0.1541 -0.1010  0.2629 -0.1852 
     46      47      48      49      50 
 0.1076 -0.0559 -0.0984 -0.0659  0.1126 

2.2.4 1 regression model for all with individual parameters

ggplot(dta2, aes(Loudness, fit1, group = Subject, alpha = Subject))+
 geom_path() +
 geom_point(aes(Loudness, Length, group = Subject, alpha = Subject)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (dB)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

3 Exercise 3

The plot below describes the relationship between body weight (in kg) and age (in years) for a sample of 4,011 US girls from the data set USgirl{Brq}. Edit the R script to perform a quantile regression of weight onto age.

3.1 data input

# install.packages("Brq")
library(Brq)
data(USgirl, package="Brq")

3.2 plot

plot(USgirl, pch='.', bty="n",
     xlim=c(0, 25),
     ylim=c(0, 150),
     xlab="Age (in years)",
     ylab="Body weight (in kg)")

ggplot(USgirl, aes(Age, Weight)) + 
 geom_point(alpha=.5, size=rel(.3)) + 
 geom_quantile(quantiles=1:9/10, # 加上 quantile regression
               formula=y ~ x)+
 labs(x="Age (in years)", 
      y="Body weight (in kg)") +
 theme_minimal()

4 Exercise 4

Create a markdown file to replicate the analysis reported in the box office data example.

4.1 data input & management

dta4 <- read.table("https://gattonweb.uky.edu/sheather/book/docs/datasets/boxoffice.txt", header = T)
str(dta4)
'data.frame':   32 obs. of  2 variables:
 $ GrossBoxOffice: num  95.3 86.4 119.4 124.4 154.2 ...
 $ year          : int  1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...
dta4 <- dta4 %>% 
  mutate(Year_75 = 1:32 ) %>%
  rename(Box_Office = GrossBoxOffice)
str(dta4)
'data.frame':   32 obs. of  3 variables:
 $ Box_Office: num  95.3 86.4 119.4 124.4 154.2 ...
 $ year      : int  1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...
 $ Year_75   : int  1 2 3 4 5 6 7 8 9 10 ...

4.2 Model 1

m1 <- lm(formula = Box_Office ~ Year_75, data = dta4)
summary(m1)

Call:
lm(formula = Box_Office ~ Year_75, data = dta4)

Residuals:
    Min      1Q  Median      3Q     Max 
-116.38  -79.20    6.08   62.26  121.70 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   -55.93      28.03    -2.0    0.055
Year_75        29.53       1.48    19.9   <2e-16

Residual standard error: 77.4 on 30 degrees of freedom
Multiple R-squared:  0.93,  Adjusted R-squared:  0.927 
F-statistic:  397 on 1 and 30 DF,  p-value: <2e-16
knitr::kable(summary(m1)$coefficients)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -55.931 28.0344 -1.9951 0.05519
Year_75 29.534 1.4827 19.9194 0.00000
library(nlme)
knitr::kable(as.data.frame(summary(m1g <- gls(Box_Office ~ Year_75, data = dta4))$tTable))
Value Std.Error t-value p-value
(Intercept) -55.931 28.0344 -1.9951 0.05519
Year_75 29.534 1.4827 19.9194 0.00000
dta4m1 <- dta4 %>%
  mutate(res0 = resid(m1g, type = "normalized"))
# install.packages("lmtest")
lmtest::dwtest(res0 ~ year, data = dta4m1)

    Durbin-Watson test

data:  res0 ~ year
DW = 0.248, p-value = 2.3e-13
alternative hypothesis: true autocorrelation is greater than 0
# install.packages("languageR")
library(languageR)
acf(resid(m1g, type = "normalized"), main = "LM", ylim = c(-1, 1))

4.3 Model 2

m2 <- gls(Box_Office ~ Year_75, data = dta4, method = "ML",correlation=corAR1(form = ~ Year_75))
summary(m2)
Generalized least squares fit by maximum likelihood
  Model: Box_Office ~ Year_75 
  Data: dta4 
     AIC    BIC  logLik
  330.39 336.25 -161.19

Correlation Structure: AR(1)
 Formula: ~Year_75 
 Parameter estimate(s):
    Phi 
0.87821 

Coefficients:
              Value Std.Error t-value p-value
(Intercept)  4.5141    72.744  0.0621  0.9509
Year_75     27.0754     3.448  7.8533  0.0000

 Correlation: 
        (Intr)
Year_75 -0.782

Standardized residuals:
      Min        Q1       Med        Q3       Max 
-1.934208 -1.385920  0.018223  0.332026  1.542698 

Residual standard error: 76.165 
Degrees of freedom: 32 total; 30 residual
ggplot(dta4, aes(Year_75, Box_Office)) +
  geom_point(aes(Year_75, Box_Office), color = "grey70") +
  stat_smooth(method = "lm", se = F, size = rel(1), lty = 2, color = "blue") +
  labs(x = "Year since 1975", y = "Gross Box Office (in million $)") +
  theme(legend.position = "NONE")+
  theme_bw()

不知道灰色的線是什麼QQ

2020/9/25 老師給的提示為 The box office example is about auto-correlated errors in time series data.

做了以下調整:

# 加入model的配適值
dta4m2 <- dta4 %>%
  mutate(fitm2 = fitted(m2)) 
ggplot(dta4m2, aes(Year_75, Box_Office)) +
  geom_point(aes(Year_75, Box_Office), color = "grey70") +
  geom_path(aes(Year_75, fitm2), color = "grey70", size = rel(1))+ 
  stat_smooth(method = "lm", se = F, size = rel(1), lty = 2, color = "blue") +
  labs(x = "Year since 1975", y = "Gross Box Office (in million $)") +
  theme(legend.position = "NONE")+
  theme_bw()

適配値連成的線和自動計算的迴歸線有差,而自動計算的迴歸分析相當於 model 1。

dta4m2 <- dta4m2 %>%
  mutate(res0 = resid(m1g, type = "normalized")) %>%
  mutate(res1 = resid(m2, type = "normalized"))
ggplot(dta4m2, aes(Year_75, res1)) +
  geom_point(aes(Year_75, res1), color = "grey70") +
  geom_point(aes(Year_75, res0), color = "skyblue")+
  geom_hline(yintercept = 0, size = rel(1), color = "grey60")+
  labs(x = "Year since 1975", y = "Standarized Residuals") +
  theme(legend.position = "NONE")+
  theme_bw()

lmtest::dwtest(res1 ~ year, data = dta4m2)

    Durbin-Watson test

data:  res1 ~ year
DW = 2.2, p-value = 0.65
alternative hypothesis: true autocorrelation is greater than 0
acf(resid(m2, type = "normalized"), main = "GLS", ylim = c(-1, 1))