library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
# Load the `yrbss` data set shipped with the openintro package and preview
# its columns.
data("yrbss", package = "openintro")
yrbss %>%
  glimpse()
## Rows: 13,583
## Columns: 13
## $ age <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, 1…
## $ gender <chr> "female", "female", "female", "female", "fema…
## $ grade <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9", …
## $ hispanic <chr> "not", "not", "hispanic", "not", "not", "not"…
## $ race <chr> "Black or African American", "Black or Africa…
## $ height <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, 1…
## $ weight <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, 7…
## $ helmet_12m <chr> "never", "never", "never", "never", "did not …
## $ text_while_driving_30d <chr> "0", NA, "30", "0", "did not drive", "did not…
## $ physically_active_7d <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7, …
## $ hours_tv_per_school_day <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+",…
## $ strength_training_7d <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7, …
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5"…
Each case in this data set is one surveyed person. There are 13,583 cases in the sample.
# Quartiles, mean, and NA count of the weight column.
yrbss %>%
  pull(weight) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 29.94 56.25 64.41 67.91 76.20 180.99 1004
We are missing weights from 1004 people.
# Flag students whose `physically_active_7d` value is above 2 ("yes") versus
# not ("no"); rows with a missing activity value stay NA.
# Inside mutate() the column is referenced directly rather than through
# `yrbss$`, so the expression follows the data in the pipeline (and would
# still be correct if the data were filtered or grouped upstream).
yrbss <- yrbss %>%
  mutate(physical_3plus = ifelse(physically_active_7d > 2, "yes", "no"))

# Compare the weight distributions of the two activity groups.
ggplot(yrbss, aes(x = weight, y = physical_3plus)) +
  geom_boxplot()
## Warning: Removed 1004 rows containing non-finite values (stat_boxplot).
There are two conditions for inference: independence and a large enough sample size.
Each group has far more than 30 observations, and we can assume the students were sampled independently.
# Mean weight (missing weights ignored) and number of rows per activity group.
summarise(
  group_by(yrbss, physical_3plus),
  mean_weight = mean(weight, na.rm = TRUE),
  count = n()
)
Null hypothesis: those who exercise and those who don’t have the same average weight. Alternative hypothesis: their average weights differ.
There are none: no permuted difference in means is as extreme as the observed difference.
# Permutation null distribution for the difference in mean weight
# ("yes" minus "no") under independence of weight and activity group.
# The permutations are random, so fix the seed to make the histogram and
# p-value reproducible from one knit to the next.
set.seed(1234)
null_dist <- yrbss %>%
  # Keep only rows where both the group label and the weight are present.
  filter(!is.na(physical_3plus), !is.na(weight)) %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
# Observed difference in mean weight ("yes" minus "no"), computed on rows
# that have both a group label and a weight.
obs_diff <- yrbss %>%
  filter(!is.na(physical_3plus), !is.na(weight)) %>%
  specify(response = weight, explanatory = physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
# Histogram of the permuted differences in means.
null_dist %>%
  ggplot(aes(x = stat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Two-sided p-value of the observed difference against the null distribution.
get_p_value(null_dist, obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
# Build a grouped table of complete cases: label each student by whether
# `physically_active_7d` is above 2, drop rows missing the label or the
# weight, and group by the label.
test <- yrbss %>%
  mutate(has_exercise = ifelse(physically_active_7d > 2, "yes", "no")) %>%
  # Each NA check is needed only once per column.
  filter(!is.na(has_exercise), !is.na(weight)) %>%
  group_by(has_exercise)
test

# Per-group sample size, mean, and standard deviation of weight.
# NOTE: despite its name, `height_data` holds *weight* statistics; the name
# is kept so any later reference to it still resolves.
height_data <- summarise(test, n = n(), mean = mean(weight), sd = sd(weight))
height_data
Practically, those that exercise have a higher average weight.
# 95% confidence interval for the mean height.
# mean() and sd() below drop missing heights, so the standard error and the
# t degrees of freedom must be based on the number of non-missing heights
# (not on the 13583 rows of the data set), with df = n - 1.
mean_height <- mean(yrbss$height, na.rm = TRUE)
mean_height
## [1] 1.691241
sd_height <- sd(yrbss$height, na.rm = TRUE)
sd_height
## [1] 0.1046973
n_height <- sum(!is.na(yrbss$height))
sde_height <- sd_height / sqrt(n_height)
sde_height
tval_height <- qt(.05 / 2, df = n_height - 1, lower.tail = FALSE)
leftintheight <- mean_height - tval_height * sde_height
rightintheight <- mean_height + tval_height * sde_height
# `ci_up` duplicated the upper bound in the original; kept as an alias.
ci_up <- rightintheight
leftintheight
rightintheight
The 95% ci, based on the 12,579 non-missing heights, is approximately (1.6894, 1.6931)
# 90% confidence interval for the mean height.
# mean() and sd() below drop missing heights, so the standard error and the
# t degrees of freedom must be based on the number of non-missing heights
# (not on the 13583 rows of the data set), with df = n - 1.
mean_height <- mean(yrbss$height, na.rm = TRUE)
mean_height
## [1] 1.691241
sd_height <- sd(yrbss$height, na.rm = TRUE)
sd_height
## [1] 0.1046973
n_height <- sum(!is.na(yrbss$height))
sde_height <- sd_height / sqrt(n_height)
sde_height
tval_height <- qt(.1 / 2, df = n_height - 1, lower.tail = FALSE)
leftintheight <- mean_height - tval_height * sde_height
rightintheight <- mean_height + tval_height * sde_height
# `ci_up` duplicated the upper bound in the original; kept as an alias.
ci_up <- rightintheight
leftintheight
rightintheight
The 90% ci, based on the 12,579 non-missing heights, is approximately (1.6897, 1.6928)
Question: Does exercise encourage more growth? Null hypothesis: There is no relationship between being physically active at least 3 days a week and height. Confidence level: 95%. α level: 0.05
# Welch two-sample t-test of height between students whose
# `physically_active_7d` is above 2 ("yes") and the rest ("no").
# The indicator is stored under a name that describes it (`physical_3plus`)
# instead of `night_sleep_hours`, so the test output is labelled correctly.
yrbss$physical_3plus <- ifelse(yrbss$physically_active_7d > 2, "yes", "no")
t.test(height ~ physical_3plus, data = yrbss)
##
## Welch Two Sample t-test
##
## data: height by physical_3plus
## t = -19.029, df = 7973.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -0.04150183 -0.03374994
## sample estimates:
## mean in group no mean in group yes
## 1.665587 1.703213
The fact that p < 0.05 means we can reject the null hypothesis.
# Number of rows for each reported TV-hours category (NA included).
summarise(group_by(yrbss, hours_tv_per_school_day), n())
There are 8 different options, including NA.
Question: Does a full night's sleep encourage more growth? Null hypothesis: There is no relationship between hours of sleep and height. Confidence level: 95%. α level: 0.05
# Number of rows for each reported school-night sleep category (NA included).
yrbss %>%
  group_by(school_night_hours_sleep) %>%
  summarise(n())
# Flag students who sleep more than 7 hours on school nights, as "yes"/"no"
# and as "1"/"0".
# `school_night_hours_sleep` is a character column (values such as "<5" and
# "10+"), so `> 7` on the raw column compares text, not numbers: "10+" sorts
# before "7" and lands in the "no" group, and "<5" depends on the locale.
# parse_number() extracts the numeric part first; NA stays NA.
yrbss <- yrbss %>%
  mutate(
    full_sleep = ifelse(
      parse_number(school_night_hours_sleep) > 7, "yes", "no"
    ),
    full_sleep_binary = ifelse(
      parse_number(school_night_hours_sleep) > 7, "1", "0"
    )
  )
# Flag students who sleep more than 7 hours on school nights and compare the
# height distributions of the two groups.
# `school_night_hours_sleep` is a character column (values such as "<5" and
# "10+"), so `> 7` on the raw column compares text, not numbers: "10+" sorts
# before "7" and lands in the "no" group, and "<5" depends on the locale.
# parse_number() extracts the numeric part first; NA stays NA.
yrbss <- yrbss %>%
  mutate(
    sleep = ifelse(parse_number(school_night_hours_sleep) > 7, "yes", "no")
  )
ggplot(yrbss, aes(x = height, y = sleep)) +
  geom_boxplot()
## Warning: Removed 1004 rows containing non-finite values (stat_boxplot).
Looking at the plot, there appears to be at most a loose relationship, but let's dig in and test it formally.
# Welch two-sample t-test of height between students who sleep more than
# 7 hours on school nights ("yes") and those who do not ("no").
# `school_night_hours_sleep` is a character column (values such as "<5" and
# "10+"), so `> 7` on the raw column compares text, not numbers: "10+" sorts
# before "7" and lands in the "no" group, and "<5" depends on the locale.
# parse_number() extracts the numeric part first; NA stays NA.
# NOTE: this changes group membership, so the printed test output that
# follows must be regenerated by re-knitting.
yrbss$night_sleep_hours <- ifelse(
  parse_number(yrbss$school_night_hours_sleep) > 7, "yes", "no"
)
t.test(height ~ night_sleep_hours, data = yrbss)
##
## Welch Two Sample t-test
##
## data: height by night_sleep_hours
## t = -1.5598, df = 5814.9, p-value = 0.1189
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -0.0076754326 0.0008733169
## sample estimates:
## mean in group no mean in group yes
## 1.689770 1.693171
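The p-value reported above (0.1189) is greater than 0.05, so we fail to reject the null hypothesis: the data do not show a significant difference in mean height between the two sleep groups.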