1 Inclass exercise 1: “Framingham example”

1.1 File location

fL<-"https://math.montana.edu/shancock/data/Framingham.txt"
dta <- read.table(fL, header=T)
knitr::kable(head(dta))
sex sbp dbp scl chdfate followup age bmi month id
1 120 80 267 1 18 55 25.0 8 2642
1 130 78 192 1 35 53 28.4 12 4627
1 144 90 207 1 109 61 25.1 8 2568
1 92 66 231 1 147 48 26.2 11 4192
1 162 98 271 1 169 39 28.4 11 3977
2 212 118 182 1 199 61 33.3 2 659

sex sbp dbp scl chdfate followup age bmi month id 1 120 80 267 1 18 55 25.0 8 2642 1 130 78 192 1 35 53 28.4 12 4627 1 144 90 207 1 109 61 25.1 8 2568 1 92 66 231 1 147 48 26.2 11 4192 1 162 98 271 1 169 39 28.4 11 3977 2 212 118 182 1 199 61 33.3 2 659

1.2 Recode gender variable

dta$sex <- factor(dta$sex, 
                  levels=c(1, 2), 
                  labels=c("Male", "Female")) 

1.3 Lattice plot

library(lattice)
xyplot(sbp ~ dbp | sex, 
       data=dta, cex=.5,
       type=c("p","g","r"),
       xlab="Diastolic pressure (mmHg)",
       ylab="Systolic pressure (mmHg)")

1.4 Gender difference in regression slopes

#install.packages("lsmeans")

library(lsmeans)
## Loading required package: emmeans
## The 'lsmeans' package is now basically a front end for 'emmeans'.
## Users are encouraged to switch the rest of the way.
## See help('transition') for more information, including how to
## convert old 'lsmeans' objects and scripts to work with 'emmeans'.
genderdiffer <- lm(sbp ~ dbp*sex, data = dta)
knitr::kable(anova(genderdiffer))
Df Sum Sq Mean Sq F value Pr(>F)
dbp 1 1500666.80 1500666.8003 7737.83283 0
sex 1 13924.79 13924.7877 71.79987 0
dbp:sex 1 17310.17 17310.1696 89.25579 0
Residuals 4695 910543.14 193.9389 NA NA
# Obtain slopes
genderdiffer$coefficients
##   (Intercept)           dbp     sexFemale dbp:sexFemale 
##    29.8539356     1.2251337   -22.1005883     0.3088544
m.lst <- lstrends(genderdiffer, "sex", var="dbp")
knitr::kable(m.lst)
sex dbp.trend SE df lower.CL upper.CL
Male 1.225134 0.0254189 4695 1.175301 1.274967
Female 1.533988 0.0205576 4695 1.493685 1.574291
# Compare slopes
knitr::kable(pairs(m.lst))
contrast estimate SE df t.ratio p.value
Male - Female -0.3088544 0.0326916 4695 -9.447528 0
#slope male<female

1.5 Origin

m0 <- lm(sbp ~ dbp , data = dta)
m1 <- lm(sbp ~ dbp + sex, data = dta)
m2 <- lm(sbp ~ dbp + sex + sex:dbp, data=dta)
anova(m0, m1, m2)
## Analysis of Variance Table
## 
## Model 1: sbp ~ dbp
## Model 2: sbp ~ dbp + sex
## Model 3: sbp ~ dbp + sex + sex:dbp
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   4697 941778                                  
## 2   4696 927853  1     13925 71.800 < 2.2e-16 ***
## 3   4695 910543  1     17310 89.256 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(rstandard(m2) ~ fitted(m2), 
     cex=.5, 
     xlab="Fitted values", 
     ylab="Standardized residuals")
grid()

## The end

2 Inclass exercise 2: “Loudness data example”

2.1 Problem

weighted regression and compare the results against an ordinary regression of “length” onto “loudness”

2.2 Load the packages and input data

options(digits=5, show.signif.stars=FALSE)
# load the packages
# install.packages("pacman")
pacman::p_load(alr4, tidyverse, magrittr)

# make sure data set is active
data(Stevens, package="alr4")

# save a copy
dta <- Stevens %>% 
 rename(Length=y, Loudness=loudness, Subject=subject)

#data structure
str(dta)
## 'data.frame':    50 obs. of  5 variables:
##  $ Subject : Factor w/ 10 levels "A","B","C","D",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ Loudness: num  50 60 70 80 90 50 60 70 80 90 ...
##  $ Length  : num  0.379 1.727 3.016 3.924 4.413 ...
##  $ sd      : num  0.507 0.904 0.553 0.363 0.092 0.077 0.287 0.396 0.124 0.068 ...
##  $ n       : num  3 3 3 3 3 3 3 3 3 3 ...
knitr::kable(head(dta))
Subject Loudness Length sd n
A 50 0.379 0.507 3
A 60 1.727 0.904 3
A 70 3.016 0.553 3
A 80 3.924 0.363 3
A 90 4.413 0.092 3
B 50 0.260 0.077 3
# completely overrides the current theme.
ot <- theme_set(theme_bw())

2.3 One model fits all

# 1 regression fits all
ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
 geom_point() +
 geom_line() +
 stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
 coord_cartesian(ylim = c(-1, 5)) +
 labs(x = "Loudness (db)", y = "Average log(Length)") +
 theme(legend.position = "NONE")
## Warning: Using alpha for a discrete variable is not advised.
## `geom_smooth()` using formula 'y ~ x'

# one model fits all and tidy summary model output
dta %>%
 do(mpre <-broom::tidy(lm(Length ~ Loudness, data = .)))
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -3.20     0.344       -9.31 2.53e-12
## 2 Loudness      0.0790   0.00482     16.4  2.69e-21

2.4 Weighted least squares

dta$Subjectnum <- as.integer(as.factor(dta$Subject))
#weights by subject
dta %>%
  do(m_post <- broom::tidy(lm(Length ~ Loudness, weights = Subjectnum, data= .)))
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -2.83     0.335       -8.46 4.49e-11
## 2 Loudness      0.0734   0.00469     15.7  1.74e-20
#weights by sd
dta %>%
  do(m_post <- broom::tidy(lm(Length ~ Loudness, weights = sd, data= .)))
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -3.06     0.357       -8.56 3.15e-11
## 2 Loudness      0.0779   0.00513     15.2  5.71e-20

2.5 The end

3 Inclass exercise 3: “Quantile regression”

a sample of 4,011 US girls from the data set USgirl

3.1 Problem

Edit the R script to perform a quantile regression of weight onto age.

3.2 Load the packages and input data

library(Brq)
data(USgirl, package="Brq")
dta <- USgirl
str(dta)
## 'data.frame':    4011 obs. of  2 variables:
##  $ Age   : num  14.74 1.76 15.38 12.76 15.73 ...
##  $ Weight: num  85 12.2 59.4 38.6 47.9 ...
knitr::kable(head(dta))
Age Weight
14.74 85.05
1.76 12.25
15.38 59.42
12.76 38.56
15.73 47.85
9.22 43.09

3.3 a quantile regression of weight onto age.

library(quantreg)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
mUS_qr <- rq(Age ~ Weight, data=dta, tau=1:9/10)
summary(mUS_qr)
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.1
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.60905  0.07506   -8.11403  0.00000
## Weight       0.21493  0.00364   58.97602  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.2
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.64512  0.08558   -7.53782  0.00000
## Weight       0.24674  0.00382   64.50824  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.3
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.59693  0.06968   -8.56684  0.00000
## Weight       0.26760  0.00296   90.52868  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.4
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.53221  0.07182   -7.40995  0.00000
## Weight       0.28547  0.00311   91.83318  0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.5
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.50678   0.05763   -8.79425   0.00000
## Weight        0.30109   0.00263  114.39482   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.6
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.44654   0.06834   -6.53459   0.00000
## Weight        0.31549   0.00276  114.29469   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.7
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.40546   0.06129   -6.61499   0.00000
## Weight        0.33230   0.00278  119.65397   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.8
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.30468   0.07207   -4.22732   0.00002
## Weight        0.34899   0.00318  109.85603   0.00000
## 
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
## 
## tau: [1] 0.9
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept) -0.20892  0.07961   -2.62449  0.00871
## Weight       0.37489  0.00387   96.93329  0.00000
plot(summary(mUS_qr))

3.4 plot of quantile regression of weight onto age.

ggplot(dta, aes(Age, Weight)) + 
 geom_point(alpha=.5, 
            size=rel(.3),
            color="lightskyblue") + 
 geom_quantile(quantiles=1:9/10, 
               formula=y ~ x,
               color="steelblue")+
 labs(x="Age (in years)", 
      y="Body weight (in kg)") +
 theme_minimal()

## The end

4 Inclass exercise 4: “Australian Film Commission (AFC): box office”

4.1 Problem

Create a markdown file to replicate the analysis reported in the box office data example. ## Input data

fLbox <- "https://gattonweb.uky.edu/sheather/book/docs/datasets/boxoffice.txt"
dta <- read.table(fLbox, header=T)
str(dta)
## 'data.frame':    32 obs. of  2 variables:
##  $ GrossBoxOffice: num  95.3 86.4 119.4 124.4 154.2 ...
##  $ year          : int  1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...
dta4 <- dta %>% 
  mutate(Year_75 = 1:32 ) %>%
  rename(Box_Office = GrossBoxOffice)
knitr::kable(head(dta4))
Box_Office year Year_75
95.3 1976 1
86.4 1977 2
119.4 1978 3
124.4 1979 4
154.2 1980 5
174.3 1981 6

4.2 model 0 (predict yearly gross box office receipts: model for all years)

Model 0: Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt ~ N(0, σ).

# linear regression (Y=Box_Office, x= Year_75)
m0 <- lm(Box_Office ~ Year_75, data= dta4)
summary(m0)
## 
## Call:
## lm(formula = Box_Office ~ Year_75, data = dta4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -116.38  -79.20    6.08   62.26  121.70 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -55.93      28.03    -2.0    0.055
## Year_75        29.53       1.48    19.9   <2e-16
## 
## Residual standard error: 77.4 on 30 degrees of freedom
## Multiple R-squared:  0.93,   Adjusted R-squared:  0.927 
## F-statistic:  397 on 1 and 30 DF,  p-value: <2e-16
#
knitr::kable(summary(m0)$coefficients)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -55.931 28.0344 -1.9951 0.05519
Year_75 29.534 1.4827 19.9194 0.00000
# generalized least squares
library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
m0gls <- dta4 %>%
  nlme::gls(Box_Office ~ Year_75, data= .)
summary(m0gls)
## Generalized least squares fit by REML
##   Model: Box_Office ~ Year_75 
##   Data: . 
##      AIC    BIC  logLik
##   363.48 367.69 -178.74
## 
## Coefficients:
##               Value Std.Error t-value p-value
## (Intercept) -55.931   28.0344 -1.9951  0.0552
## Year_75      29.534    1.4827 19.9194  0.0000
## 
##  Correlation: 
##         (Intr)
## Year_75 -0.873
## 
## Standardized residuals:
##       Min        Q1       Med        Q3       Max 
## -1.502839 -1.022668  0.078555  0.803955  1.571461 
## 
## Residual standard error: 77.442 
## Degrees of freedom: 32 total; 30 residual
# residual correlations
dta4 <- dta4 %>%
  mutate(res0 = resid(m0gls, type = "normalized"))
summary(dta4)
##    Box_Office         year         Year_75           res0        
##  Min.   : 86.4   Min.   :1976   Min.   : 1.00   Min.   :-1.5028  
##  1st Qu.:180.2   1st Qu.:1984   1st Qu.: 8.75   1st Qu.:-1.0227  
##  Median :329.6   Median :1992   Median :16.50   Median : 0.0786  
##  Mean   :431.4   Mean   :1992   Mean   :16.50   Mean   : 0.0000  
##  3rd Qu.:693.1   3rd Qu.:1999   3rd Qu.:24.25   3rd Qu.: 0.8040  
##  Max.   :907.2   Max.   :2007   Max.   :32.00   Max.   : 1.5715
# Durbin-Watson test 
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
dta4 %>% 
  lmtest::dwtest(res0 ~ year, data= .)
## 
##  Durbin-Watson test
## 
## data:  res0 ~ year
## DW = 0.248, p-value = 2.3e-13
## alternative hypothesis: true autocorrelation is greater than 0
# Autocorrelation function Plot
acf(resid(m0gls, type = "normalized"), main = "LM", ylim = c(-1, 1))

4.3 model 1 (Generalized least squares)

Model 1: Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt = εt-1 + νt, and νt ~ N(0, σ).

m1 <- nlme::gls(Box_Office ~ Year_75, data = dta4, 
                method = "ML",
                correlation=corAR1(form = ~ Year_75))

summary(m1)
## Generalized least squares fit by maximum likelihood
##   Model: Box_Office ~ Year_75 
##   Data: dta4 
##      AIC    BIC  logLik
##   330.39 336.25 -161.19
## 
## Correlation Structure: AR(1)
##  Formula: ~Year_75 
##  Parameter estimate(s):
##     Phi 
## 0.87821 
## 
## Coefficients:
##               Value Std.Error t-value p-value
## (Intercept)  4.5141    72.744  0.0621  0.9509
## Year_75     27.0754     3.448  7.8533  0.0000
## 
##  Correlation: 
##         (Intr)
## Year_75 -0.782
## 
## Standardized residuals:
##       Min        Q1       Med        Q3       Max 
## -1.934208 -1.385920  0.018223  0.332026  1.542698 
## 
## Residual standard error: 76.165 
## Degrees of freedom: 32 total; 30 residual
#?? extracts fitted values from m1 returned by modeling functions??
dta4m1 <- dta4 %>%
       mutate(fitm1 = fitted(m1)) 

#plot
ggplot(dta4m1, aes(Year_75, Box_Office)) +
  geom_point(aes(Year_75, Box_Office), color = "grey") + #regression line (Year_75, Box_Office)
  geom_smooth(aes(Year_75, fitm1), color = "grey50") + #regression line(Year_75, fitm1)
  geom_smooth()+ #curve line
  stat_smooth(method = "lm", se = F, size = rel(1), lty = 2, color = "blue") +
  labs(x = "Year since 1975", y = "Gross Box Office (in million $)") +
  theme(legend.position = "NONE")+
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## Plot of m0 and m1 Residual

dta4m1res <- dta4 %>% 
  mutate(res1=resid(m1, type="normalized"))

ggplot(dta4m1res, aes(Year_75, res1)) +
  geom_point(aes(Year_75, res0), color = "skyblue") +
  geom_point(aes(Year_75, res1), color = "gray50") +
  geom_hline(yintercept = 0, size = rel(1), color = "gray")+
  labs(x = "Year since 1975", y = "Standarized Residuals") +
  theme()+
  theme_bw()

## Residual correlations

#
lmtest::dwtest(res1 ~ year, data = dta4m1res)
## 
##  Durbin-Watson test
## 
## data:  res1 ~ year
## DW = 2.2, p-value = 0.65
## alternative hypothesis: true autocorrelation is greater than 0
#
acf(resid(m1, type = "normalized"), main = "GLS", ylim = c(-1, 1))