Chapter 18 (Part 2) Exercises

18.6 Exercises 1. In a previous section, we computed the correlation between mothers and daughters, mothers and sons, fathers and daughters, and fathers and sons, and noticed that the highest correlation is between fathers and sons and the lowest is between mothers and sons. We can compute these correlations using: Are these differences statistically significant? To answer this, we will compute the slopes of the regression line along with their standard errors. Start by using lm and the broom package to compute the slopes LSE and the standard errors.

library(HistData)
library(tidyr)

## Warning: package 'tidyr' was built under R version 4.3.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(broom)
library(ggplot2)
data("GaltonFamilies")
set.seed(1)
galton_heights<-GaltonFamilies|>group_by(family, gender)|>sample_n(1)|>ungroup()

cors<-galton_heights|>pivot_longer(father:mother, names_to="parent", values_to="parentHeight") |>
mutate(child=ifelse(gender=="female", "daughter", "son"))|> unite(pair, c("parent", "child")) |> group_by(pair)|>
summarize(cor=cor(parentHeight, childHeight))

fs<-filter(galton_heights, gender=="male") |> select(father, childHeight) |> rename(son=childHeight)
fd<-filter(galton_heights, gender=="female") |> select(father, childHeight) |> rename(daughter=childHeight)
md<-filter(galton_heights, gender=="female") |> select(mother, childHeight) |> rename(daughter=childHeight)
ms<-filter(galton_heights, gender=="male") |> select(mother, childHeight) |> rename(son=childHeight)
fitfs<-lm(father~son, data=fs)
fitms<-lm(mother~son, data=ms)
fitmd<-lm(mother~daughter, data=md)
fitfd<-lm(father~daughter, data=fd)
tidy(fitfs)

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   37.0      4.87        7.61 1.54e-12
## 2 son            0.463    0.0702      6.59 4.74e-10

tidy(fitfd)

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   41.7      5.02        8.30 2.71e-14
## 2 daughter       0.432    0.0783      5.52 1.21e- 7

tidy(fitmd)

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   40.8      4.21        9.70 4.62e-18
## 2 daughter       0.364    0.0656      5.55 1.07e- 7

tidy(fitms)

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   43.7      4.81        9.08 2.07e-16
## 2 son            0.293    0.0694      4.22 3.84e- 5

Repeat the exercise above, but compute a confidence interval as well.

tidyfitmd<-tidy(fitmd, conf.int=TRUE)
tidyfit<-tidy(fitms, conf.int=TRUE)
tidyfitfd<-tidy(fitfd, conf.int=TRUE)
tidyfitfs<-tidy(fitfs, conf.int=TRUE)

Plot the confidence intervals and notice that they overlap, which implies that the data is consistent with the inheritance of height being independent of sex.

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.3.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggplot2)

SSS<-galton_heights|>pivot_longer(father:mother, names_to="parent", values_to="parentHeight") |> mutate(child=ifelse(gender=="female", "daughter", "son"))|> unite(pair, c("parent", "child")) |> group_by(pair)
SSS |>  group_by(pair) |> reframe(tidy(lm(parentHeight ~ childHeight), conf.int=TRUE)) |> filter(term=="childHeight") |> select(pair, estimate, conf.low, conf.high) |>
ggplot(aes(pair, y=estimate, ymin=conf.low, ymax=conf.high)) +   geom_errorbar() + geom_point()

Because we are selecting children at random, we can actually do something like a permutation test here. Repeat the computation of correlations 100 times taking a different sample each time. Hint: use similar code to what we used with simulations.

B<-100
N<-25
#Father-son correlation simulation.
R<-replicate(B, {sample_n(fs, N, replace=TRUE) |>     summarize(r=cor(father, son)) |> pull(r) })
#Mother-son correlation simulation.
rms<-replicate(B, {sample_n(ms, N, replace=TRUE) |>     summarize(r=cor(mother, son)) |> pull(r) })
#Father-daughter correlation simulation.
rfd<-replicate(B, {sample_n(fd, N, replace=TRUE) |>     summarize(r=cor(father, daughter)) |> pull(r) })
#Mother-daughter correlation simulation.
rmd<-replicate(B, {sample_n(md, N, replace=TRUE) |>     summarize(r=cor(mother, daughter)) |> pull(r) })

Fit a linear regression model to obtain the effects of BB and HR on Runs (at the team level) in 1971. Use the tidy function in the broom package to obtain the results in a data frame.

library(Lahman)

## Warning: package 'Lahman' was built under R version 4.3.3

fit<-Teams %>% filter(yearID %in% 1971) %>%  lm(R~BB+HR, data=.)
tidy(fit, conf.int=TRUE)

## # A tibble: 3 × 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)  257.      112.         2.31 0.0314   25.3      489.   
## 2 BB             0.414     0.210      1.97 0.0625   -0.0237     0.852
## 3 HR             1.30      0.431      3.01 0.00673   0.399      2.19

Now let’s repeat the above for each year since 1962 and make a plot. Use summarize and the broom package to fit this model for every year since 1962.

res<-Teams %>% filter(yearID %in% 1962:2018) %>%     group_by(yearID) %>% do(tidy(lm(R~BB+HR, data=.))) %>% ungroup()
res%>%filter(term=="BB") %>% ggplot(aes(yearID, estimate)) +     geom_point() + geom_smooth(method="lm")

## `geom_smooth()` using formula = 'y ~ x'

Use the results of the previous exercise to plot the estimated effects of BB on runs.

res<-Teams %>% filter(yearID %in% 1962:2018) %>%     group_by(yearID) %>% do(tidy(lm(R~BB, data=.))) %>% ungroup()
res%>%filter(term=="BB") %>% ggplot(aes(yearID, estimate)) +     geom_point() + geom_smooth(method="lm")

## `geom_smooth()` using formula = 'y ~ x'

Chapter 18 (Part 2) Exercises

Dimple K. Patel

2024-04-28