1 Inclass exercise 1: “Framingham example”

1.1 File location

fL<-"https://math.montana.edu/shancock/data/Framingham.txt"
dta <- read.table(fL, header=T)
knitr::kable(head(dta))

sex	sbp	dbp	scl	chdfate	followup	age	bmi	month	id
1	120	80	267	1	18	55	25.0	8	2642
1	130	78	192	1	35	53	28.4	12	4627
1	144	90	207	1	109	61	25.1	8	2568
1	92	66	231	1	147	48	26.2	11	4192
1	162	98	271	1	169	39	28.4	11	3977
2	212	118	182	1	199	61	33.3	2	659

sex sbp dbp scl chdfate followup age bmi month id 1 120 80 267 1 18 55 25.0 8 2642 1 130 78 192 1 35 53 28.4 12 4627 1 144 90 207 1 109 61 25.1 8 2568 1 92 66 231 1 147 48 26.2 11 4192 1 162 98 271 1 169 39 28.4 11 3977 2 212 118 182 1 199 61 33.3 2 659

1.2 Recode gender variable

dta$sex <- factor(dta$sex, 
                  levels=c(1, 2), 
                  labels=c("Male", "Female"))

1.3 Lattice plot

library(lattice)
xyplot(sbp ~ dbp | sex, 
       data=dta, cex=.5,
       type=c("p","g","r"),
       xlab="Diastolic pressure (mmHg)",
       ylab="Systolic pressure (mmHg)")

1.4 Gender difference in regression slopes

#install.packages("lsmeans")

library(lsmeans)

## Loading required package: emmeans

## The 'lsmeans' package is now basically a front end for 'emmeans'.
## Users are encouraged to switch the rest of the way.
## See help('transition') for more information, including how to
## convert old 'lsmeans' objects and scripts to work with 'emmeans'.

genderdiffer <- lm(sbp ~ dbp*sex, data = dta)
knitr::kable(anova(genderdiffer))

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
dbp	1	1500666.80	1500666.8003	7737.83283	0
sex	1	13924.79	13924.7877	71.79987	0
dbp:sex	1	17310.17	17310.1696	89.25579	0
Residuals	4695	910543.14	193.9389	NA	NA

# Obtain slopes
genderdiffer$coefficients

##   (Intercept)           dbp     sexFemale dbp:sexFemale 
##    29.8539356     1.2251337   -22.1005883     0.3088544

m.lst <- lstrends(genderdiffer, "sex", var="dbp")
knitr::kable(m.lst)

sex	dbp.trend	SE	df	lower.CL	upper.CL
Male	1.225134	0.0254189	4695	1.175301	1.274967
Female	1.533988	0.0205576	4695	1.493685	1.574291

# Compare slopes
knitr::kable(pairs(m.lst))

contrast	estimate	SE	df	t.ratio	p.value
Male - Female	-0.3088544	0.0326916	4695	-9.447528	0

#slope male<female

1.5 Origin

m0 <- lm(sbp ~ dbp , data = dta)
m1 <- lm(sbp ~ dbp + sex, data = dta)
m2 <- lm(sbp ~ dbp + sex + sex:dbp, data=dta)
anova(m0, m1, m2)

## Analysis of Variance Table
## 
## Model 1: sbp ~ dbp
## Model 2: sbp ~ dbp + sex
## Model 3: sbp ~ dbp + sex + sex:dbp
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   4697 941778                                  
## 2   4696 927853  1     13925 71.800 < 2.2e-16 ***
## 3   4695 910543  1     17310 89.256 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot(rstandard(m2) ~ fitted(m2), 
     cex=.5, 
     xlab="Fitted values", 
     ylab="Standardized residuals")
grid()

## The end

2 Inclass exercise 2: “Loudness data example”

2.1 Problem

weighted regression and compare the results against an ordinary regression of “length” onto “loudness”

2.2 Load the packages and input data

options(digits=5, show.signif.stars=FALSE)
# load the packages
# install.packages("pacman")
pacman::p_load(alr4, tidyverse, magrittr)

# make sure data set is active
data(Stevens, package="alr4")

# save a copy
dta <- Stevens %>% 
 rename(Length=y, Loudness=loudness, Subject=subject)

#data structure
str(dta)

## 'data.frame':    50 obs. of  5 variables:
##  $ Subject : Factor w/ 10 levels "A","B","C","D",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ Loudness: num  50 60 70 80 90 50 60 70 80 90 ...
##  $ Length  : num  0.379 1.727 3.016 3.924 4.413 ...
##  $ sd      : num  0.507 0.904 0.553 0.363 0.092 0.077 0.287 0.396 0.124 0.068 ...
##  $ n       : num  3 3 3 3 3 3 3 3 3 3 ...

knitr::kable(head(dta))

Subject	Loudness	Length	sd	n
A	50	0.379	0.507	3
A	60	1.727	0.904	3
A	70	3.016	0.553	3
A	80	3.924	0.363	3
A	90	4.413	0.092	3
B	50	0.260	0.077	3

# completely overrides the current theme.
ot <- theme_set(theme_bw())

2.3 One model fits all

# 1 regression fits all
ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
 geom_point() +
 geom_line() +
 stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")

## Warning: Using alpha for a discrete variable is not advised.

## `geom_smooth()` using formula 'y ~ x'

# one model fits all and tidy summary model output
dta %>%
 do(mpre <-broom::tidy(lm(Length ~ Loudness, data = .)))

## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -3.20     0.344       -9.31 2.53e-12
## 2 Loudness      0.0790   0.00482     16.4  2.69e-21

2.4 Weighted least squares

dta$Subjectnum <- as.integer(as.factor(dta$Subject))
#weights by subject
dta %>%
  do(m_post <- broom::tidy(lm(Length ~ Loudness, weights = Subjectnum, data= .)))

## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -2.83     0.335       -8.46 4.49e-11
## 2 Loudness      0.0734   0.00469     15.7  1.74e-20

#weights by sd
dta %>%
  do(m_post <- broom::tidy(lm(Length ~ Loudness, weights = sd, data= .)))

## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -3.06     0.357       -8.56 3.15e-11
## 2 Loudness      0.0779   0.00513     15.2  5.71e-20

2.5 The end

3 Inclass exercise 3: “Quantile regression”

a sample of 4,011 US girls from the data set USgirl

3.1 Problem

Edit the R script to perform a quantile regression of weight onto age.

3.2 Load the packages and input data

library(Brq)
data(USgirl, package="Brq")
dta <- USgirl
str(dta)

## 'data.frame':    4011 obs. of  2 variables:
##  $ Age   : num  14.74 1.76 15.38 12.76 15.73 ...
##  $ Weight: num  85 12.2 59.4 38.6 47.9 ...

knitr::kable(head(dta))

Age	Weight
14.74	85.05
1.76	12.25
15.38	59.42
12.76	38.56
15.73	47.85
9.22	43.09

3.3 a quantile regression of weight onto age.

library(quantreg)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

mUS_qr <- rq(Age ~ Weight, data=dta, tau=1:9/10)
summary(mUS_qr)

## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.1
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.60905  0.07506   -8.11403  0.00000
## Weight       0.21493  0.00364   58.97602  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.2
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.64512  0.08558   -7.53782  0.00000
## Weight       0.24674  0.00382   64.50824  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.3
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.59693  0.06968   -8.56684  0.00000
## Weight       0.26760  0.00296   90.52868  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.4
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.53221  0.07182   -7.40995  0.00000
## Weight       0.28547  0.00311   91.83318  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.5
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.50678   0.05763   -8.79425   0.00000
## Weight        0.30109   0.00263  114.39482   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.6
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.44654   0.06834   -6.53459   0.00000
## Weight        0.31549   0.00276  114.29469   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.7
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.40546   0.06129   -6.61499   0.00000
## Weight        0.33230   0.00278  119.65397   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.8
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.30468   0.07207   -4.22732   0.00002
## Weight        0.34899   0.00318  109.85603   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.9
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.20892  0.07961   -2.62449  0.00871
## Weight       0.37489  0.00387   96.93329  0.00000

plot(summary(mUS_qr))

3.4 plot of quantile regression of weight onto age.

ggplot(dta, aes(Age, Weight)) + 
 geom_point(alpha=.5, 
            size=rel(.3),
            color="lightskyblue") + 
 geom_quantile(quantiles=1:9/10, 
               formula=y ~ x,
               color="steelblue")+
 labs(x="Age (in years)", 
      y="Body weight (in kg)") +
 theme_minimal()

## The end

4 Inclass exercise 4: “Australian Film Commission (AFC): box office”

4.1 Problem

Create a markdown file to replicate the analysis reported in the box office data example. ## Input data

fLbox <- "https://gattonweb.uky.edu/sheather/book/docs/datasets/boxoffice.txt"
dta <- read.table(fLbox, header=T)
str(dta)

## 'data.frame':    32 obs. of  2 variables:
##  $ GrossBoxOffice: num  95.3 86.4 119.4 124.4 154.2 ...
##  $ year          : int  1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...

dta4 <- dta %>% 
  mutate(Year_75 = 1:32 ) %>%
  rename(Box_Office = GrossBoxOffice)
knitr::kable(head(dta4))

Box_Office	year	Year_75
95.3	1976	1
86.4	1977	2
119.4	1978	3
124.4	1979	4
154.2	1980	5
174.3	1981	6

4.2 model 0 (predict yearly gross box office receipts: model for all years)

Model 0: Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt ~ N(0, σ).

# linear regression (Y=Box_Office, x= Year_75)
m0 <- lm(Box_Office ~ Year_75, data= dta4)
summary(m0)

## 
## Call:
## lm(formula = Box_Office ~ Year_75, data = dta4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -116.38  -79.20    6.08   62.26  121.70 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -55.93      28.03    -2.0    0.055
## Year_75        29.53       1.48    19.9   <2e-16
## 
## Residual standard error: 77.4 on 30 degrees of freedom
## Multiple R-squared:  0.93,   Adjusted R-squared:  0.927 
## F-statistic:  397 on 1 and 30 DF,  p-value: <2e-16

#
knitr::kable(summary(m0)$coefficients)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-55.931	28.0344	-1.9951	0.05519
Year_75	29.534	1.4827	19.9194	0.00000

# generalized least squares
library(nlme)

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

m0gls <- dta4 %>%
  nlme::gls(Box_Office ~ Year_75, data= .)
summary(m0gls)

## Generalized least squares fit by REML
##   Model: Box_Office ~ Year_75 
##   Data: . 
##      AIC    BIC  logLik
##   363.48 367.69 -178.74
## 
## Coefficients:
##               Value Std.Error t-value p-value
## (Intercept) -55.931   28.0344 -1.9951  0.0552
## Year_75      29.534    1.4827 19.9194  0.0000
## 
##  Correlation: 
##         (Intr)
## Year_75 -0.873
## 
## Standardized residuals:
##       Min        Q1       Med        Q3       Max 
## -1.502839 -1.022668  0.078555  0.803955  1.571461 
## 
## Residual standard error: 77.442 
## Degrees of freedom: 32 total; 30 residual

# residual correlations
dta4 <- dta4 %>%
  mutate(res0 = resid(m0gls, type = "normalized"))
summary(dta4)

##    Box_Office         year         Year_75           res0        
##  Min.   : 86.4   Min.   :1976   Min.   : 1.00   Min.   :-1.5028  
##  1st Qu.:180.2   1st Qu.:1984   1st Qu.: 8.75   1st Qu.:-1.0227  
##  Median :329.6   Median :1992   Median :16.50   Median : 0.0786  
##  Mean   :431.4   Mean   :1992   Mean   :16.50   Mean   : 0.0000  
##  3rd Qu.:693.1   3rd Qu.:1999   3rd Qu.:24.25   3rd Qu.: 0.8040  
##  Max.   :907.2   Max.   :2007   Max.   :32.00   Max.   : 1.5715

# Durbin-Watson test 
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

dta4 %>% 
  lmtest::dwtest(res0 ~ year, data= .)

## 
##  Durbin-Watson test
## 
## data:  res0 ~ year
## DW = 0.248, p-value = 2.3e-13
## alternative hypothesis: true autocorrelation is greater than 0

# Autocorrelation function Plot
acf(resid(m0gls, type = "normalized"), main = "LM", ylim = c(-1, 1))

4.3 model 1 (Generalized least squares)

Model 1: Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt = εt-1 + νt, and νt ~ N(0, σ).

m1 <- nlme::gls(Box_Office ~ Year_75, data = dta4, 
                method = "ML",
                correlation=corAR1(form = ~ Year_75))

summary(m1)

## Generalized least squares fit by maximum likelihood
##   Model: Box_Office ~ Year_75 
##   Data: dta4 
##      AIC    BIC  logLik
##   330.39 336.25 -161.19
## 
## Correlation Structure: AR(1)
##  Formula: ~Year_75 
##  Parameter estimate(s):
##     Phi 
## 0.87821 
## 
## Coefficients:
##               Value Std.Error t-value p-value
## (Intercept)  4.5141    72.744  0.0621  0.9509
## Year_75     27.0754     3.448  7.8533  0.0000
## 
##  Correlation: 
##         (Intr)
## Year_75 -0.782
## 
## Standardized residuals:
##       Min        Q1       Med        Q3       Max 
## -1.934208 -1.385920  0.018223  0.332026  1.542698 
## 
## Residual standard error: 76.165 
## Degrees of freedom: 32 total; 30 residual

#?? extracts fitted values from m1 returned by modeling functions??
dta4m1 <- dta4 %>%
       mutate(fitm1 = fitted(m1)) 

#plot
ggplot(dta4m1, aes(Year_75, Box_Office)) +
  geom_point(aes(Year_75, Box_Office), color = "grey") + #regression line (Year_75, Box_Office)
  geom_smooth(aes(Year_75, fitm1), color = "grey50") + #regression line(Year_75, fitm1)
  geom_smooth()+ #curve line
  stat_smooth(method = "lm", se = F, size = rel(1), lty = 2, color = "blue") +
  labs(x = "Year since 1975", y = "Gross Box Office (in million $)") +
  theme(legend.position = "NONE")+
  theme_bw()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## Plot of m0 and m1 Residual

dta4m1res <- dta4 %>% 
  mutate(res1=resid(m1, type="normalized"))

ggplot(dta4m1res, aes(Year_75, res1)) +
  geom_point(aes(Year_75, res0), color = "skyblue") +
  geom_point(aes(Year_75, res1), color = "gray50") +
  geom_hline(yintercept = 0, size = rel(1), color = "gray")+
  labs(x = "Year since 1975", y = "Standarized Residuals") +
  theme()+
  theme_bw()

## Residual correlations

#
lmtest::dwtest(res1 ~ year, data = dta4m1res)

## 
##  Durbin-Watson test
## 
## data:  res1 ~ year
## DW = 2.2, p-value = 0.65
## alternative hypothesis: true autocorrelation is greater than 0

#
acf(resid(m1, type = "normalized"), main = "GLS", ylim = c(-1, 1))

In-class Exercises:Regression

Chang-Chun Chen

2020-09-27