18.6 Exercises 1. In a previous section, we computed the correlation between mothers and daughters, mothers and sons, fathers and daughters, and fathers and sons, and noticed that the highest correlation is between fathers and sons and the lowest is between mothers and sons. We can compute these correlations using: Are these differences statistically significant? To answer this, we will compute the slopes of the regression line along with their standard errors. Start by using lm and the broom package to compute the slopes LSE and the standard errors.
library(HistData)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
library(ggplot2)
data("GaltonFamilies")
set.seed(1)
galton_heights<-GaltonFamilies|>group_by(family, gender)|>sample_n(1)|>ungroup()
cors<-galton_heights|>pivot_longer(father:mother, names_to="parent", values_to="parentHeight") |>
mutate(child=ifelse(gender=="female", "daughter", "son"))|> unite(pair, c("parent", "child")) |> group_by(pair)|>
summarize(cor=cor(parentHeight, childHeight))
fs<-filter(galton_heights, gender=="male") |> select(father, childHeight) |> rename(son=childHeight)
fd<-filter(galton_heights, gender=="female") |> select(father, childHeight) |> rename(daughter=childHeight)
md<-filter(galton_heights, gender=="female") |> select(mother, childHeight) |> rename(daughter=childHeight)
ms<-filter(galton_heights, gender=="male") |> select(mother, childHeight) |> rename(son=childHeight)
fitfs<-lm(father~son, data=fs)
fitms<-lm(mother~son, data=ms)
fitmd<-lm(mother~daughter, data=md)
fitfd<-lm(father~daughter, data=fd)
tidy(fitfs)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 37.0 4.87 7.61 1.54e-12
## 2 son 0.463 0.0702 6.59 4.74e-10
tidy(fitfd)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 41.7 5.02 8.30 2.71e-14
## 2 daughter 0.432 0.0783 5.52 1.21e- 7
tidy(fitmd)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 40.8 4.21 9.70 4.62e-18
## 2 daughter 0.364 0.0656 5.55 1.07e- 7
tidy(fitms)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 43.7 4.81 9.08 2.07e-16
## 2 son 0.293 0.0694 4.22 3.84e- 5
tidyfitmd<-tidy(fitmd, conf.int=TRUE)
tidyfit<-tidy(fitms, conf.int=TRUE)
tidyfitfd<-tidy(fitfd, conf.int=TRUE)
tidyfitfs<-tidy(fitfs, conf.int=TRUE)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
SSS<-galton_heights|>pivot_longer(father:mother, names_to="parent", values_to="parentHeight") |> mutate(child=ifelse(gender=="female", "daughter", "son"))|> unite(pair, c("parent", "child")) |> group_by(pair)
SSS |> group_by(pair) |> reframe(tidy(lm(parentHeight ~ childHeight), conf.int=TRUE)) |> filter(term=="childHeight") |> select(pair, estimate, conf.low, conf.high) |>
ggplot(aes(pair, y=estimate, ymin=conf.low, ymax=conf.high)) + geom_errorbar() + geom_point()
B<-100
N<-25
#Father-son correlation simulation.
R<-replicate(B, {sample_n(fs, N, replace=TRUE) |> summarize(r=cor(father, son)) |> pull(r) })
#Mother-son correlation simulation.
rms<-replicate(B, {sample_n(ms, N, replace=TRUE) |> summarize(r=cor(mother, son)) |> pull(r) })
#Father-daughter correlation simulation.
rfd<-replicate(B, {sample_n(fd, N, replace=TRUE) |> summarize(r=cor(father, daughter)) |> pull(r) })
#Mother-daughter correlation simulation.
rmd<-replicate(B, {sample_n(md, N, replace=TRUE) |> summarize(r=cor(mother, daughter)) |> pull(r) })
library(Lahman)
## Warning: package 'Lahman' was built under R version 4.3.3
fit<-Teams %>% filter(yearID %in% 1971) %>% lm(R~BB+HR, data=.)
tidy(fit, conf.int=TRUE)
## # A tibble: 3 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 257. 112. 2.31 0.0314 25.3 489.
## 2 BB 0.414 0.210 1.97 0.0625 -0.0237 0.852
## 3 HR 1.30 0.431 3.01 0.00673 0.399 2.19
res<-Teams %>% filter(yearID %in% 1962:2018) %>% group_by(yearID) %>% do(tidy(lm(R~BB+HR, data=.))) %>% ungroup()
res%>%filter(term=="BB") %>% ggplot(aes(yearID, estimate)) + geom_point() + geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'
res<-Teams %>% filter(yearID %in% 1962:2018) %>% group_by(yearID) %>% do(tidy(lm(R~BB, data=.))) %>% ungroup()
res%>%filter(term=="BB") %>% ggplot(aes(yearID, estimate)) + geom_point() + geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'