Greetings!. These are my course notes from Chapter 2 of the course - Inference for Linear Regression. Instructor for this Course is Jo Hardin and this course is available in Datacamp
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(broom)
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
twins <- read.csv("https://assets.datacamp.com/production/course_3623/datasets/twins.csv")
head(twins)
## Foster Biological Social
## 1 82 82 high
## 2 80 90 high
## 3 88 91 high
## 4 108 115 high
## 5 116 115 high
## 6 117 129 high
str(twins)
## 'data.frame': 27 obs. of 3 variables:
## $ Foster : int 82 80 88 108 116 117 132 71 75 93 ...
## $ Biological: int 82 90 91 115 115 129 131 78 79 82 ...
## $ Social : Factor w/ 3 levels "high","low","middle": 1 1 1 1 1 1 1 3 3 3 ...
summary(twins)
## Foster Biological Social
## Min. : 63.00 Min. : 68.0 high : 7
## 1st Qu.: 84.50 1st Qu.: 83.5 low :14
## Median : 94.00 Median : 94.0 middle: 6
## Mean : 95.11 Mean : 95.3
## 3rd Qu.:107.50 3rd Qu.:104.5
## Max. :132.00 Max. :131.0
Instructions:
# Load the infer package
library(infer)
# Calculate the observed slope
obs_slope <- lm(Foster ~ Biological, data = twins) %>%
tidy() %>%
filter(term == "Biological") %>%
pull(estimate)
# Simulate 10 slopes with a permuted dataset
set.seed(4747)
perm_slope <- twins %>%
specify(Foster ~ Biological) %>%
hypothesize(null = "independence") %>%
generate(reps = 10, type = "permute") %>%
calculate(stat = "slope")
# Print the observed slope and the 10 permuted slopes
obs_slope
## [1] 0.901436
perm_slope
## # A tibble: 10 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.143
## 2 2 0.0710
## 3 3 -0.456
## 4 4 0.0749
## 5 5 0.297
## 6 6 0.0673
## 7 7 0.140
## 8 8 0.164
## 9 9 0.0971
## 10 10 0.184
Instructions; - Using the infer steps, generate 500 permuted slope statistics - Plot the null slopes using geom_density - Find the mean and the standard deviation of the null slopes
# Make a dataframe with replicates and plot them!
set.seed(4747)
perm_slope <- twins %>%
specify(Foster ~ Biological) %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "slope")
ggplot(perm_slope, aes(x = stat)) +
geom_density()
# Calculate the mean and the standard deviation of the slopes
perm_slope %>%
ungroup() %>%
summarize(mean(stat), sd(stat))
## # A tibble: 1 x 2
## `mean(stat)` `sd(stat)`
## <dbl> <dbl>
## 1 0.00629 0.196
Instructions: - Calculate the absolute value of the observed test statistic. Use pull() to create a numeric value with which to work - Compare the observed statistic to the perm_slopes calculated in the previous exercise - The p-value will be the proportion of times (out of 1000) that the absolute value of the permuted statistics is greater than the absolute value of the observed statistic
# Calculate the absolute value of the slope
abs_obs_slope <- lm(Foster ~ Biological, data = twins) %>%
tidy() %>%
filter(term == "Biological") %>%
pull(estimate) %>%
abs()
# Compute the p-value
perm_slope %>%
mutate(abs_perm_slope = abs(stat)) %>%
summarize(p_value = mean(abs_perm_slope > abs_obs_slope))
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
# Calculate 1000 bootstrapped slopes
set.seed(4747)
BS_slope <- twins %>%
specify(Foster ~ Biological) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
# Look at the head of BS_slope
head(BS_slope)
## # A tibble: 6 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.946
## 2 2 0.966
## 3 3 0.870
## 4 4 0.930
## 5 5 0.807
## 6 6 0.900
Instructions: - Again, use the infer steps to calculate 1000 bootstrapped BS_slope values - Find the mean and standard deviation of the slopes, and calculate the confidence interval as the mean plus or minus two standard deviations.
# Bootstrap the slopes
set.seed(4747)
BS_slope <- twins %>%
specify(Foster ~ Biological) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
# Create a confidence interval
BS_slope %>%
summarize(lower = mean(stat) - 2*sd(stat),
upper = mean(stat) + 2*sd(stat))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.719 1.08
print(BS_slope)
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.946
## 2 2 0.966
## 3 3 0.870
## 4 4 0.930
## 5 5 0.807
## 6 6 0.900
## 7 7 0.862
## 8 8 0.930
## 9 9 0.768
## 10 10 0.835
## # ... with 990 more rows
# Set alpha
alpha <- .05
# Bootstrap the slopes
set.seed(4747)
BS_slope <- twins %>%
specify(Foster ~ Biological) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
# Create a confidence interval
BS_slope %>%
summarize(low = quantile(stat, alpha / 2),
high = quantile(stat, 1 - alpha / 2))
## # A tibble: 1 x 2
## low high
## <dbl> <dbl>
## 1 0.724 1.08