fL<-"https://math.montana.edu/shancock/data/Framingham.txt"
dta <- read.table(fL, header=T)
knitr::kable(head(dta))
sex | sbp | dbp | scl | chdfate | followup | age | bmi | month | id |
---|---|---|---|---|---|---|---|---|---|
1 | 120 | 80 | 267 | 1 | 18 | 55 | 25.0 | 8 | 2642 |
1 | 130 | 78 | 192 | 1 | 35 | 53 | 28.4 | 12 | 4627 |
1 | 144 | 90 | 207 | 1 | 109 | 61 | 25.1 | 8 | 2568 |
1 | 92 | 66 | 231 | 1 | 147 | 48 | 26.2 | 11 | 4192 |
1 | 162 | 98 | 271 | 1 | 169 | 39 | 28.4 | 11 | 3977 |
2 | 212 | 118 | 182 | 1 | 199 | 61 | 33.3 | 2 | 659 |
sex sbp dbp scl chdfate followup age bmi month id 1 120 80 267 1 18 55 25.0 8 2642 1 130 78 192 1 35 53 28.4 12 4627 1 144 90 207 1 109 61 25.1 8 2568 1 92 66 231 1 147 48 26.2 11 4192 1 162 98 271 1 169 39 28.4 11 3977 2 212 118 182 1 199 61 33.3 2 659
dta$sex <- factor(dta$sex,
levels=c(1, 2),
labels=c("Male", "Female"))
library(lattice)
xyplot(sbp ~ dbp | sex,
data=dta, cex=.5,
type=c("p","g","r"),
xlab="Diastolic pressure (mmHg)",
ylab="Systolic pressure (mmHg)")
#install.packages("lsmeans")
library(lsmeans)
## Loading required package: emmeans
## The 'lsmeans' package is now basically a front end for 'emmeans'.
## Users are encouraged to switch the rest of the way.
## See help('transition') for more information, including how to
## convert old 'lsmeans' objects and scripts to work with 'emmeans'.
genderdiffer <- lm(sbp ~ dbp*sex, data = dta)
knitr::kable(anova(genderdiffer))
Df | Sum Sq | Mean Sq | F value | Pr(>F) | |
---|---|---|---|---|---|
dbp | 1 | 1500666.80 | 1500666.8003 | 7737.83283 | 0 |
sex | 1 | 13924.79 | 13924.7877 | 71.79987 | 0 |
dbp:sex | 1 | 17310.17 | 17310.1696 | 89.25579 | 0 |
Residuals | 4695 | 910543.14 | 193.9389 | NA | NA |
# Obtain slopes
genderdiffer$coefficients
## (Intercept) dbp sexFemale dbp:sexFemale
## 29.8539356 1.2251337 -22.1005883 0.3088544
m.lst <- lstrends(genderdiffer, "sex", var="dbp")
knitr::kable(m.lst)
sex | dbp.trend | SE | df | lower.CL | upper.CL |
---|---|---|---|---|---|
Male | 1.225134 | 0.0254189 | 4695 | 1.175301 | 1.274967 |
Female | 1.533988 | 0.0205576 | 4695 | 1.493685 | 1.574291 |
# Compare slopes
knitr::kable(pairs(m.lst))
contrast | estimate | SE | df | t.ratio | p.value |
---|---|---|---|---|---|
Male - Female | -0.3088544 | 0.0326916 | 4695 | -9.447528 | 0 |
#slope male<female
m0 <- lm(sbp ~ dbp , data = dta)
m1 <- lm(sbp ~ dbp + sex, data = dta)
m2 <- lm(sbp ~ dbp + sex + sex:dbp, data=dta)
anova(m0, m1, m2)
## Analysis of Variance Table
##
## Model 1: sbp ~ dbp
## Model 2: sbp ~ dbp + sex
## Model 3: sbp ~ dbp + sex + sex:dbp
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4697 941778
## 2 4696 927853 1 13925 71.800 < 2.2e-16 ***
## 3 4695 910543 1 17310 89.256 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(rstandard(m2) ~ fitted(m2),
cex=.5,
xlab="Fitted values",
ylab="Standardized residuals")
grid()
## The end
weighted regression and compare the results against an ordinary regression of “length” onto “loudness”
options(digits=5, show.signif.stars=FALSE)
# load the packages
# install.packages("pacman")
pacman::p_load(alr4, tidyverse, magrittr)
# make sure data set is active
data(Stevens, package="alr4")
# save a copy
dta <- Stevens %>%
rename(Length=y, Loudness=loudness, Subject=subject)
#data structure
str(dta)
## 'data.frame': 50 obs. of 5 variables:
## $ Subject : Factor w/ 10 levels "A","B","C","D",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ Loudness: num 50 60 70 80 90 50 60 70 80 90 ...
## $ Length : num 0.379 1.727 3.016 3.924 4.413 ...
## $ sd : num 0.507 0.904 0.553 0.363 0.092 0.077 0.287 0.396 0.124 0.068 ...
## $ n : num 3 3 3 3 3 3 3 3 3 3 ...
knitr::kable(head(dta))
Subject | Loudness | Length | sd | n |
---|---|---|---|---|
A | 50 | 0.379 | 0.507 | 3 |
A | 60 | 1.727 | 0.904 | 3 |
A | 70 | 3.016 | 0.553 | 3 |
A | 80 | 3.924 | 0.363 | 3 |
A | 90 | 4.413 | 0.092 | 3 |
B | 50 | 0.260 | 0.077 | 3 |
# completely overrides the current theme.
ot <- theme_set(theme_bw())
# 1 regression fits all
ggplot(dta, aes(Loudness, Length, group = Subject, alpha = Subject))+
geom_point() +
geom_line() +
stat_smooth(aes(group = 1), method = "lm", size = rel(.8)) +
coord_cartesian(ylim = c(-1, 5)) +
labs(x = "Loudness (db)", y = "Average log(Length)") +
theme(legend.position = "NONE")
## Warning: Using alpha for a discrete variable is not advised.
## `geom_smooth()` using formula 'y ~ x'
# one model fits all and tidy summary model output
dta %>%
do(mpre <-broom::tidy(lm(Length ~ Loudness, data = .)))
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.20 0.344 -9.31 2.53e-12
## 2 Loudness 0.0790 0.00482 16.4 2.69e-21
dta$Subjectnum <- as.integer(as.factor(dta$Subject))
#weights by subject
dta %>%
do(m_post <- broom::tidy(lm(Length ~ Loudness, weights = Subjectnum, data= .)))
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.83 0.335 -8.46 4.49e-11
## 2 Loudness 0.0734 0.00469 15.7 1.74e-20
#weights by sd
dta %>%
do(m_post <- broom::tidy(lm(Length ~ Loudness, weights = sd, data= .)))
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.06 0.357 -8.56 3.15e-11
## 2 Loudness 0.0779 0.00513 15.2 5.71e-20
a sample of 4,011 US girls from the data set USgirl
Edit the R script to perform a quantile regression of weight onto age.
library(Brq)
data(USgirl, package="Brq")
dta <- USgirl
str(dta)
## 'data.frame': 4011 obs. of 2 variables:
## $ Age : num 14.74 1.76 15.38 12.76 15.73 ...
## $ Weight: num 85 12.2 59.4 38.6 47.9 ...
knitr::kable(head(dta))
Age | Weight |
---|---|
14.74 | 85.05 |
1.76 | 12.25 |
15.38 | 59.42 |
12.76 | 38.56 |
15.73 | 47.85 |
9.22 | 43.09 |
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
mUS_qr <- rq(Age ~ Weight, data=dta, tau=1:9/10)
summary(mUS_qr)
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.1
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.60905 0.07506 -8.11403 0.00000
## Weight 0.21493 0.00364 58.97602 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.2
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.64512 0.08558 -7.53782 0.00000
## Weight 0.24674 0.00382 64.50824 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.3
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.59693 0.06968 -8.56684 0.00000
## Weight 0.26760 0.00296 90.52868 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.4
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.53221 0.07182 -7.40995 0.00000
## Weight 0.28547 0.00311 91.83318 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.5
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.50678 0.05763 -8.79425 0.00000
## Weight 0.30109 0.00263 114.39482 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.6
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.44654 0.06834 -6.53459 0.00000
## Weight 0.31549 0.00276 114.29469 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.7
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.40546 0.06129 -6.61499 0.00000
## Weight 0.33230 0.00278 119.65397 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.8
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.30468 0.07207 -4.22732 0.00002
## Weight 0.34899 0.00318 109.85603 0.00000
##
## Call: rq(formula = Age ~ Weight, tau = 1:9/10, data = dta)
##
## tau: [1] 0.9
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.20892 0.07961 -2.62449 0.00871
## Weight 0.37489 0.00387 96.93329 0.00000
plot(summary(mUS_qr))
ggplot(dta, aes(Age, Weight)) +
geom_point(alpha=.5,
size=rel(.3),
color="lightskyblue") +
geom_quantile(quantiles=1:9/10,
formula=y ~ x,
color="steelblue")+
labs(x="Age (in years)",
y="Body weight (in kg)") +
theme_minimal()
## The end
Create a markdown file to replicate the analysis reported in the box office data example. ## Input data
fLbox <- "https://gattonweb.uky.edu/sheather/book/docs/datasets/boxoffice.txt"
dta <- read.table(fLbox, header=T)
str(dta)
## 'data.frame': 32 obs. of 2 variables:
## $ GrossBoxOffice: num 95.3 86.4 119.4 124.4 154.2 ...
## $ year : int 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...
dta4 <- dta %>%
mutate(Year_75 = 1:32 ) %>%
rename(Box_Office = GrossBoxOffice)
knitr::kable(head(dta4))
Box_Office | year | Year_75 |
---|---|---|
95.3 | 1976 | 1 |
86.4 | 1977 | 2 |
119.4 | 1978 | 3 |
124.4 | 1979 | 4 |
154.2 | 1980 | 5 |
174.3 | 1981 | 6 |
Model 0: Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt ~ N(0, σ).
# linear regression (Y=Box_Office, x= Year_75)
m0 <- lm(Box_Office ~ Year_75, data= dta4)
summary(m0)
##
## Call:
## lm(formula = Box_Office ~ Year_75, data = dta4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -116.38 -79.20 6.08 62.26 121.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -55.93 28.03 -2.0 0.055
## Year_75 29.53 1.48 19.9 <2e-16
##
## Residual standard error: 77.4 on 30 degrees of freedom
## Multiple R-squared: 0.93, Adjusted R-squared: 0.927
## F-statistic: 397 on 1 and 30 DF, p-value: <2e-16
#
knitr::kable(summary(m0)$coefficients)
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -55.931 | 28.0344 | -1.9951 | 0.05519 |
Year_75 | 29.534 | 1.4827 | 19.9194 | 0.00000 |
# generalized least squares
library(nlme)
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
m0gls <- dta4 %>%
nlme::gls(Box_Office ~ Year_75, data= .)
summary(m0gls)
## Generalized least squares fit by REML
## Model: Box_Office ~ Year_75
## Data: .
## AIC BIC logLik
## 363.48 367.69 -178.74
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -55.931 28.0344 -1.9951 0.0552
## Year_75 29.534 1.4827 19.9194 0.0000
##
## Correlation:
## (Intr)
## Year_75 -0.873
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.502839 -1.022668 0.078555 0.803955 1.571461
##
## Residual standard error: 77.442
## Degrees of freedom: 32 total; 30 residual
# residual correlations
dta4 <- dta4 %>%
mutate(res0 = resid(m0gls, type = "normalized"))
summary(dta4)
## Box_Office year Year_75 res0
## Min. : 86.4 Min. :1976 Min. : 1.00 Min. :-1.5028
## 1st Qu.:180.2 1st Qu.:1984 1st Qu.: 8.75 1st Qu.:-1.0227
## Median :329.6 Median :1992 Median :16.50 Median : 0.0786
## Mean :431.4 Mean :1992 Mean :16.50 Mean : 0.0000
## 3rd Qu.:693.1 3rd Qu.:1999 3rd Qu.:24.25 3rd Qu.: 0.8040
## Max. :907.2 Max. :2007 Max. :32.00 Max. : 1.5715
# Durbin-Watson test
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dta4 %>%
lmtest::dwtest(res0 ~ year, data= .)
##
## Durbin-Watson test
##
## data: res0 ~ year
## DW = 0.248, p-value = 2.3e-13
## alternative hypothesis: true autocorrelation is greater than 0
# Autocorrelation function Plot
acf(resid(m0gls, type = "normalized"), main = "LM", ylim = c(-1, 1))
Model 1: Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt = εt-1 + νt, and νt ~ N(0, σ).
m1 <- nlme::gls(Box_Office ~ Year_75, data = dta4,
method = "ML",
correlation=corAR1(form = ~ Year_75))
summary(m1)
## Generalized least squares fit by maximum likelihood
## Model: Box_Office ~ Year_75
## Data: dta4
## AIC BIC logLik
## 330.39 336.25 -161.19
##
## Correlation Structure: AR(1)
## Formula: ~Year_75
## Parameter estimate(s):
## Phi
## 0.87821
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 4.5141 72.744 0.0621 0.9509
## Year_75 27.0754 3.448 7.8533 0.0000
##
## Correlation:
## (Intr)
## Year_75 -0.782
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.934208 -1.385920 0.018223 0.332026 1.542698
##
## Residual standard error: 76.165
## Degrees of freedom: 32 total; 30 residual
#?? extracts fitted values from m1 returned by modeling functions??
dta4m1 <- dta4 %>%
mutate(fitm1 = fitted(m1))
#plot
ggplot(dta4m1, aes(Year_75, Box_Office)) +
geom_point(aes(Year_75, Box_Office), color = "grey") + #regression line (Year_75, Box_Office)
geom_smooth(aes(Year_75, fitm1), color = "grey50") + #regression line(Year_75, fitm1)
geom_smooth()+ #curve line
stat_smooth(method = "lm", se = F, size = rel(1), lty = 2, color = "blue") +
labs(x = "Year since 1975", y = "Gross Box Office (in million $)") +
theme(legend.position = "NONE")+
theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Plot of m0 and m1 Residual
dta4m1res <- dta4 %>%
mutate(res1=resid(m1, type="normalized"))
ggplot(dta4m1res, aes(Year_75, res1)) +
geom_point(aes(Year_75, res0), color = "skyblue") +
geom_point(aes(Year_75, res1), color = "gray50") +
geom_hline(yintercept = 0, size = rel(1), color = "gray")+
labs(x = "Year since 1975", y = "Standarized Residuals") +
theme()+
theme_bw()
## Residual correlations
#
lmtest::dwtest(res1 ~ year, data = dta4m1res)
##
## Durbin-Watson test
##
## data: res1 ~ year
## DW = 2.2, p-value = 0.65
## alternative hypothesis: true autocorrelation is greater than 0
#
acf(resid(m1, type = "normalized"), main = "GLS", ylim = c(-1, 1))