1 in class exercise 1

1.1 file location

fL<-"https://math.montana.edu/shancock/data/Framingham.txt"

dta <- read.table(fL, header=T)

1.2 recode gender variable

dta$sex <- factor(dta$sex, 
                  levels=c(1, 2), 
                  labels=c("M", "F"))

1.3 lattice plot

library(lattice)
xyplot(sbp ~ dbp | sex, 
       data=dta, cex=.5,
       type=c("p","g","r"),
       xlab="Diastolic pressure (mmHg)",
       ylab="Systolic pressure (mmHg)")

由lattice plot 可知，Female的slope略高於Male，在相同的dbp下，女生的sbp略高於男生

m0 <- lm(sbp ~ dbp, data=dta)

m1 <- lm(sbp ~ dbp + sex, data=dta)

m2 <- lm(sbp ~ dbp + sex + sex:dbp, data=dta)

anova(m0, m1, m2)

## Analysis of Variance Table
## 
## Model 1: sbp ~ dbp
## Model 2: sbp ~ dbp + sex
## Model 3: sbp ~ dbp + sex + sex:dbp
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   4697 941778                                  
## 2   4696 927853  1     13925 71.800 < 2.2e-16 ***
## 3   4695 910543  1     17310 89.256 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

我們可以看到m1及m2的model是顯著的，即代表性別對血壓有影響，且DBP與性別有交互作用。

plot(rstandard(m2) ~ fitted(m2), 
     cex=.5, 
     xlab="Fitted values", 
     ylab="Standardized residuals")
grid()

## The end

2 in class exercise 2

2.1 regression with clusters

#
options(digits=5, show.signif.stars=FALSE)

2.2 load the packages

# install.packages("pacman")
pacman::p_load(alr4, tidyverse, magrittr)

2.3 make sure data set is active

data(Stevens, package="alr4")

2.4 save a copy

dta <- Stevens %>% 
 rename(Length=y, Loudness=loudness, Subject=subject)

#
ot <- theme_set(theme_bw())

2.5 1 regression fits all

ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
 geom_point() +
 geom_line() +
 stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

## Warning: Using alpha for a discrete variable is not advised.

## `geom_smooth()` using formula 'y ~ x'

## one model fits all

m0 <- lm(Length ~ Loudness, data = dta)

2.6 tidy summary model output

broom::tidy(summary(m0))

## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -3.20     0.344       -9.31 2.53e-12
## 2 Loudness      0.0790   0.00482     16.4  2.69e-21

2.7 1 regression model per individual

ggplot(dta, aes(Loudness, Length, group = Subject, color = Subject))+
 geom_point() +
 stat_smooth(method = "lm", se = F, size = rel(.5)) +
 scale_color_grey()+
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

## `geom_smooth()` using formula 'y ~ x'

2.8 fancy way of conducting 1 regression model per subject

##grouped by subject before doing regression
dta %>% 
 group_by(Subject) %>% 
 do(broom::tidy(lm(Length ~ Loudness, data = .)))

## # A tibble: 20 x 6
## # Groups:   Subject [10]
##    Subject term        estimate std.error statistic  p.value
##    <fct>   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 A       (Intercept)  -4.49     0.748       -6.01 0.00923 
##  2 A       Loudness      0.103    0.0105       9.80 0.00225 
##  3 B       (Intercept)  -4.59     0.462       -9.92 0.00218 
##  4 B       Loudness      0.0935   0.00647     14.5  0.000718
##  5 C       (Intercept)  -2.29     0.253       -9.05 0.00285 
##  6 C       Loudness      0.0695   0.00355     19.6  0.000291
##  7 D       (Intercept)  -2.92     0.338       -8.63 0.00327 
##  8 D       Loudness      0.0751   0.00474     15.8  0.000547
##  9 E       (Intercept)  -5.39     0.743       -7.24 0.00542 
## 10 E       Loudness      0.104    0.0104       9.98 0.00214 
## 11 F       (Intercept)  -2.42     0.523       -4.63 0.0190  
## 12 F       Loudness      0.0774   0.00732     10.6  0.00181 
## 13 G       (Intercept)  -2.81     0.312       -9.01 0.00288 
## 14 G       Loudness      0.0650   0.00437     14.9  0.000659
## 15 H       (Intercept)  -3.42     0.401       -8.53 0.00338 
## 16 H       Loudness      0.0822   0.00561     14.6  0.000691
## 17 I       (Intercept)  -1.95     0.512       -3.80 0.0319  
## 18 I       Loudness      0.0656   0.00718      9.14 0.00277 
## 19 J       (Intercept)  -1.76     0.265       -6.63 0.00699 
## 20 J       Loudness      0.0552   0.00372     14.9  0.000660

2.9 fitting a model treating subject as fixed effects

summary(m1 <- lm(Length ~ Subject/Loudness - 1, data = dta))

## 
## Call:
## lm(formula = Length ~ Subject/Loudness - 1, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3318 -0.1073  0.0023  0.1485  0.3242 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## SubjectA          -4.49370    0.48664   -9.23  2.8e-10
## SubjectB          -4.58530    0.48664   -9.42  1.8e-10
## SubjectC          -2.29360    0.48664   -4.71  5.2e-05
## SubjectD          -2.91940    0.48664   -6.00  1.4e-06
## SubjectE          -5.38510    0.48664  -11.07  4.1e-12
## SubjectF          -2.42120    0.48664   -4.98  2.5e-05
## SubjectG          -2.81090    0.48664   -5.78  2.6e-06
## SubjectH          -3.42070    0.48664   -7.03  8.2e-08
## SubjectI          -1.94870    0.48664   -4.00  0.00038
## SubjectJ          -1.75910    0.48664   -3.61  0.00109
## SubjectA:Loudness  0.10265    0.00681   15.06  1.6e-15
## SubjectB:Loudness  0.09355    0.00681   13.73  1.8e-14
## SubjectC:Loudness  0.06950    0.00681   10.20  2.9e-11
## SubjectD:Loudness  0.07506    0.00681   11.02  4.6e-12
## SubjectE:Loudness  0.10391    0.00681   15.25  1.1e-15
## SubjectF:Loudness  0.07740    0.00681   11.36  2.2e-12
## SubjectG:Loudness  0.06497    0.00681    9.53  1.4e-10
## SubjectH:Loudness  0.08223    0.00681   12.07  4.9e-13
## SubjectI:Loudness  0.06561    0.00681    9.63  1.1e-10
## SubjectJ:Loudness  0.05525    0.00681    8.11  4.7e-09
## 
## Residual standard error: 0.215 on 30 degrees of freedom
## Multiple R-squared:  0.996,  Adjusted R-squared:  0.993 
## F-statistic:  369 on 20 and 30 DF,  p-value: <2e-16

2.10 augument data with model fit

fit1 <- fitted(m1)
dta %>% mutate(fit1)

##    Subject Loudness Length    sd n    fit1
## 1        A       50  0.379 0.507 3  0.6388
## 2        A       60  1.727 0.904 3  1.6653
## 3        A       70  3.016 0.553 3  2.6918
## 4        A       80  3.924 0.363 3  3.7183
## 5        A       90  4.413 0.092 3  4.7448
## 6        B       50  0.260 0.077 3  0.0922
## 7        B       60  0.833 0.287 3  1.0277
## 8        B       70  2.009 0.396 3  1.9632
## 9        B       80  2.720 0.124 3  2.8987
## 10       B       90  3.994 0.068 3  3.8342
## 11       C       50  1.072 0.106 3  1.1814
## 12       C       60  1.942 0.324 3  1.8764
## 13       C       70  2.700 0.319 3  2.5714
## 14       C       80  3.250 0.137 3  3.2664
## 15       C       90  3.893 0.356 3  3.9614
## 16       D       50  0.697 0.288 3  0.8336
## 17       D       60  1.787 0.121 3  1.5842
## 18       D       70  2.283 0.139 3  2.3348
## 19       D       80  3.127 0.331 3  3.0854
## 20       D       90  3.780 0.314 3  3.8360
## 21       E       50 -0.520 0.168 3 -0.1896
## 22       E       60  1.124 0.550 3  0.8495
## 23       E       70  2.045 0.315 3  1.8886
## 24       E       80  3.113 0.247 3  2.9277
## 25       E       90  3.681 0.217 3  3.9668
## 26       F       50  1.348 0.842 3  1.4488
## 27       F       60  2.134 0.130 3  2.2228
## 28       F       70  3.249 0.626 3  2.9968
## 29       F       80  3.936 0.181 3  3.7708
## 30       F       90  4.317 0.107 3  4.5448
## 31       G       50  0.485 0.272 3  0.4376
## 32       G       60  1.158 0.413 3  1.0873
## 33       G       70  1.585 0.240 3  1.7370
## 34       G       80  2.289 0.102 3  2.3867
## 35       G       90  3.168 0.118 3  3.0364
## 36       H       50  0.682 0.189 3  0.6908
## 37       H       60  1.684 0.231 3  1.5131
## 38       H       70  2.090 0.540 3  2.3354
## 39       H       80  3.171 0.449 3  3.1577
## 40       H       90  4.050 0.309 3  3.9800
## 41       I       50  1.201 0.089 3  1.3318
## 42       I       60  2.142 0.514 3  1.9879
## 43       I       70  2.543 0.821 3  2.6440
## 44       I       80  3.563 0.283 3  3.3001
## 45       I       90  3.771 0.378 3  3.9562
## 46       J       50  1.111 0.174 3  1.0034
## 47       J       60  1.500 0.112 3  1.5559
## 48       J       70  2.010 0.555 3  2.1084
## 49       J       80  2.595 0.013 3  2.6609
## 50       J       90  3.326 0.289 3  3.2134

2.11 1 regression model for all with individual parameters

ggplot(dta, aes(Loudness, fit1, group = Subject, alpha = Subject))+
 geom_path() +
 geom_point(aes(Loudness, Length, group = Subject, alpha = Subject)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (dB)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

## Warning: Using alpha for a discrete variable is not advised.

## The end

3 in class exercise 3

3.1 load data

library(Brq)

#see it
data(USgirl, package="Brq")
str(USgirl)

## 'data.frame':    4011 obs. of  2 variables:
##  $ Age   : num  14.74 1.76 15.38 12.76 15.73 ...
##  $ Weight: num  85 12.2 59.4 38.6 47.9 ...

3.2 plot

plot(USgirl, pch='.', bty="n",
     xlim=c(0, 25),
     ylim=c(0, 150),
     xlab="Age (in years)",
     ylab="Body weight (in kg)")

3.3 quantile regression

ggplot(USgirl, aes(Age, Weight)) + 
 geom_point(alpha=.5, size=rel(.3)) + #point alpha & size
 geom_quantile(quantiles=1:9/10, #the nine regreesion line as below
               formula=y ~ x)+
 labs(x="Age (in years)", 
      y="Body weight (in kg)") +
 theme_minimal()

## The end

4 In class exercise 4

4.1 load data

indta <- read.table("https://gattonweb.uky.edu/sheather/book/docs/datasets/boxoffice.txt", header = T)

str(indta)

## 'data.frame':    32 obs. of  2 variables:
##  $ GrossBoxOffice: num  95.3 86.4 119.4 124.4 154.2 ...
##  $ year          : int  1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...

4.2 add variable and rename

indta4<-mutate(indta,Year_75=1:32)  #1976~2007共32

inclass4<-rename(indta4,Box_Office = GrossBoxOffice)
head(inclass4)

##   Box_Office year Year_75
## 1       95.3 1976       1
## 2       86.4 1977       2
## 3      119.4 1978       3
## 4      124.4 1979       4
## 5      154.2 1980       5
## 6      174.3 1981       6

4.3 The Model A

Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt ~ N(0, σ).

summary(m1<-lm(formula = Box_Office ~ Year_75, data = inclass4))

## 
## Call:
## lm(formula = Box_Office ~ Year_75, data = inclass4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -116.38  -79.20    6.08   62.26  121.70 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -55.93      28.03    -2.0    0.055
## Year_75        29.53       1.48    19.9   <2e-16
## 
## Residual standard error: 77.4 on 30 degrees of freedom
## Multiple R-squared:  0.93,   Adjusted R-squared:  0.927 
## F-statistic:  397 on 1 and 30 DF,  p-value: <2e-16

(summary(lm(formula = Box_Office ~ Year_75, data = inclass4))$coefficients)

##             Estimate Std. Error t value   Pr(>|t|)
## (Intercept)  -55.931    28.0344 -1.9951 5.5186e-02
## Year_75       29.534     1.4827 19.9194 7.5559e-19

Generalized least squares #lecture note p.15

library(nlme)

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

knitr::kable(as.data.frame(summary(m1g <- gls(Box_Office ~ Year_75, data=inclass4, method="ML"))$tTable))

	Value	Std.Error	t-value	p-value
(Intercept)	-55.931	28.0344	-1.9951	0.05519
Year_75	29.534	1.4827	19.9194	0.00000

Residual correlations

library(languageR)
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

inclass4 <- inclass4 %>% 
  mutate(res0=resid(m1g, 
                    type="normalized"))
acf(resid(m1g, type = "normalized"), main = "LM", ylim = c(-1, 1))

lmtest::dwtest(res0 ~ year, 
               data=inclass4)

## 
##  Durbin-Watson test
## 
## data:  res0 ~ year
## DW = 0.248, p-value = 2.3e-13
## alternative hypothesis: true autocorrelation is greater than 0

4.4 The Model B

Model: Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt = εt-1 + νt, and νt ~ N(0, σ).

m2 <- gls(Box_Office ~ Year_75, data = inclass4, method = "ML",correlation=corAR1(form = ~ Year_75))

summary(m2)

## Generalized least squares fit by maximum likelihood
##   Model: Box_Office ~ Year_75 
##   Data: inclass4 
##      AIC    BIC  logLik
##   330.39 336.25 -161.19
## 
## Correlation Structure: AR(1)
##  Formula: ~Year_75 
##  Parameter estimate(s):
##     Phi 
## 0.87821 
## 
## Coefficients:
##               Value Std.Error t-value p-value
## (Intercept)  4.5141    72.744  0.0621  0.9509
## Year_75     27.0754     3.448  7.8533  0.0000
## 
##  Correlation: 
##         (Intr)
## Year_75 -0.782
## 
## Standardized residuals:
##       Min        Q1       Med        Q3       Max 
## -1.934208 -1.385920  0.018223  0.332026  1.542698 
## 
## Residual standard error: 76.165 
## Degrees of freedom: 32 total; 30 residual

inclass4 <- inclass4 %>%
       mutate(fit1 = fitted(m2))

ggplot(inclass4, aes(Year_75, Box_Office)) +
  geom_point(aes(Year_75, Box_Office), color = "grey") +
  geom_path(aes(Year_75, fit1), col = "grey", size =rel(1))+
  stat_smooth(method = "lm", se = F, size = rel(1), lty = 2, color = "blue")+
  labs(x = "Year since 1975", y = "Gross Box Office (in million $)") +
  theme(legend.position = "NONE")+
  theme_bw()

## `geom_smooth()` using formula 'y ~ x'

inclass4m2 <- inclass4 %>%
  mutate(res1 = resid(m2, type = "normalized"))
ggplot(inclass4m2, aes(Year_75, res1)) +
  geom_point(aes(Year_75, res0), color = "skyblue") +
  geom_point(aes(Year_75, res1), color = "grey70")+
  geom_hline(yintercept = 0, size = rel(1), color = "grey60")+
 
  labs(x = "Year since 1975", y = "Standarized Residuals") +
  theme(legend.position = "NONE")+
  theme_bw()

lmtest::dwtest(res1 ~ year, data = inclass4m2)

## 
##  Durbin-Watson test
## 
## data:  res1 ~ year
## DW = 2.2, p-value = 0.65
## alternative hypothesis: true autocorrelation is greater than 0

acf(resid(m2, type = "normalized"), main = "GLS", ylim = c(-1, 1))

##從白天嘗試到黑夜~

In Class Exercise

tjlee

2020-09-27

1 in class exercise 1

1.1 file location

1.2 recode gender variable

1.3 lattice plot

2 in class exercise 2

2.1 regression with clusters

2.2 load the packages

2.3 make sure data set is active

2.4 save a copy

2.5 1 regression fits all

2.6 tidy summary model output

2.7 1 regression model per individual

2.8 fancy way of conducting 1 regression model per subject

2.9 fitting a model treating subject as fixed effects

2.10 augument data with model fit

2.11 1 regression model for all with individual parameters

3 in class exercise 3

3.1 load data

3.2 plot

3.3 quantile regression

4 In class exercise 4

4.1 load data

4.2 add variable and rename

4.3 The Model A

4.4 The Model B