library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
data("yrbss")
?yrbss
glimpse(yrbss)
## Rows: 13,583
## Columns: 13
## $ age <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, …
## $ gender <chr> "female", "female", "female", "female", "fem…
## $ grade <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9",…
## $ hispanic <chr> "not", "not", "hispanic", "not", "not", "not…
## $ race <chr> "Black or African American", "Black or Afric…
## $ height <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, …
## $ weight <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, …
## $ helmet_12m <chr> "never", "never", "never", "never", "did not…
## $ text_while_driving_30d <chr> "0", NA, "30", "0", "did not drive", "did no…
## $ physically_active_7d <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7,…
## $ hours_tv_per_school_day <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+"…
## $ strength_training_7d <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7,…
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5…
ggplot(data = yrbss, aes(x = weight)) +
geom_histogram(binwidth = 2) +
labs(
x = "Weight (kgs)", y = "frequency",
title = "Weight distribution of HS students")
## Warning: Removed 1004 rows containing non-finite values (stat_bin).
yrbss %>%
filter(!is.na(weight)) %>%
ggplot() +
geom_histogram(aes(x = weight), binwidth = 2, colour = "black", fill = "white") +
facet_wrap(~ gender)
ggplot(data = yrbss, aes(sample = weight)) +
geom_line(stat = "qq") +
labs(title = "Probability (Q-Q) plot")
## Warning: Removed 1004 rows containing non-finite values (stat_qq).
summary(yrbss$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 29.94 56.25 64.41 67.91 76.20 180.99 1004
yrbss %>%
filter(!is.na(weight), gender =="male") %>%
summary(weight)
## age gender grade hispanic
## Min. :12.00 Length:6414 Length:6414 Length:6414
## 1st Qu.:15.00 Class :character Class :character Class :character
## Median :16.00 Mode :character Mode :character Mode :character
## Mean :16.22
## 3rd Qu.:17.00
## Max. :18.00
##
## race height weight helmet_12m
## Length:6414 Min. :1.270 Min. : 31.75 Length:6414
## Class :character 1st Qu.:1.700 1st Qu.: 61.69 Class :character
## Mode :character Median :1.750 Median : 70.31 Mode :character
## Mean :1.757 Mean : 73.64
## 3rd Qu.:1.800 3rd Qu.: 81.65
## Max. :2.110 Max. :180.99
##
## text_while_driving_30d physically_active_7d hours_tv_per_school_day
## Length:6414 Min. :0.000 Length:6414
## Class :character 1st Qu.:3.000 Class :character
## Mode :character Median :5.000 Mode :character
## Mean :4.537
## 3rd Qu.:7.000
## Max. :7.000
## NA's :142
## strength_training_7d school_night_hours_sleep
## Min. :0.000 Length:6414
## 1st Qu.:1.000 Class :character
## Median :4.000 Mode :character
## Mean :3.596
## 3rd Qu.:6.000
## Max. :7.000
## NA's :554
yrbss %>%
filter(!is.na(weight), gender =="female") %>%
summary(weight)
## age gender grade hispanic
## Min. :12.00 Length:6165 Length:6165 Length:6165
## 1st Qu.:15.00 Class :character Class :character Class :character
## Median :16.00 Mode :character Mode :character Mode :character
## Mean :16.12
## 3rd Qu.:17.00
## Max. :18.00
##
## race height weight helmet_12m
## Length:6165 Min. :1.270 Min. : 29.94 Length:6165
## Class :character 1st Qu.:1.570 1st Qu.: 52.16 Class :character
## Mode :character Median :1.630 Median : 58.97 Mode :character
## Mean :1.623 Mean : 61.94
## 3rd Qu.:1.680 3rd Qu.: 68.04
## Max. :1.980 Max. :148.33
##
## text_while_driving_30d physically_active_7d hours_tv_per_school_day
## Length:6165 Min. :0.000 Length:6165
## Class :character 1st Qu.:1.000 Class :character
## Mode :character Median :3.000 Mode :character
## Mean :3.293
## 3rd Qu.:5.000
## Max. :7.000
## NA's :73
## strength_training_7d school_night_hours_sleep
## Min. :0.000 Length:6165
## 1st Qu.:0.000 Class :character
## Median :2.000 Mode :character
## Mean :2.321
## 3rd Qu.:4.000
## Max. :7.000
## NA's :490
yrbss <- yrbss %>%
mutate(physical_3plus = ifelse(yrbss$physically_active_7d > 2,"yes","no"))
yrbss %>%
filter(!is.na(physical_3plus)) %>%
ggplot()+
geom_boxplot(mapping = aes(x = physical_3plus, y = weight))
## Warning: Removed 946 rows containing non-finite values (stat_boxplot).
yrbss %>%
group_by(physical_3plus) %>%
summarise(mean_weight = mean(weight, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## physical_3plus mean_weight
## <chr> <dbl>
## 1 no 66.7
## 2 yes 68.4
## 3 <NA> 69.9
yrbss%>%
count(physical_3plus) %>%
mutate(sizes = n /sum(n))
## # A tibble: 3 x 3
## physical_3plus n sizes
## <chr> <int> <dbl>
## 1 no 4404 0.324
## 2 yes 8906 0.656
## 3 <NA> 273 0.0201
Notice how you can use the functions specify and calculate again like you did for calculating confidence intervals. Here, though, the statistic you are searching for is the difference in means, with the order being yes - no != 0.
After you have initialized the test, you need to simulate the test on the null distribution, which we will save as null.
obs_diff <- yrbss %>%
specify(weight ~ physical_3plus) %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
Here, hypothesize is used to set the null hypothesis as a test for independence. In one sample cases, the null argument can be set to “point” to test a hypothesis relative to a point estimate.
Also, note that the type argument within generate is set to permute, which is the argument when generating a null distribution for a hypothesis test.
null_dist <- yrbss %>%
specify(weight ~ physical_3plus) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
ggplot(data = null_dist, aes(x = stat)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
null_dist %>%
get_p_value(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
#library(BSDA)
#tsum.test(68.45,xx,8906,66.67,xx,4404, conf.level = .95)
Confidence interval example…
#samp %>%
# specify(response = climate_change_affects, success = "Yes") %>%
#generate(reps = 1000, type = "bootstrap") %>%
#calculate(stat = "prop") %>%
#get_ci(level = 0.95)
yrbss %>%
filter(physical_3plus =="yes") %>%
summary(physical_3plus)
## age gender grade hispanic
## Min. :12.0 Length:8906 Length:8906 Length:8906
## 1st Qu.:15.0 Class :character Class :character Class :character
## Median :16.0 Mode :character Mode :character Mode :character
## Mean :16.1
## 3rd Qu.:17.0
## Max. :18.0
## NA's :47
## race height weight helmet_12m
## Length:8906 Min. :1.270 Min. : 33.11 Length:8906
## Class :character 1st Qu.:1.630 1st Qu.: 56.70 Class :character
## Mode :character Median :1.700 Median : 65.77 Mode :character
## Mean :1.703 Mean : 68.45
## 3rd Qu.:1.780 3rd Qu.: 77.11
## Max. :2.110 Max. :160.12
## NA's :564 NA's :564
## text_while_driving_30d physically_active_7d hours_tv_per_school_day
## Length:8906 Min. :3.00 Length:8906
## Class :character 1st Qu.:4.00 Class :character
## Mode :character Median :6.00 Mode :character
## Mean :5.44
## 3rd Qu.:7.00
## Max. :7.00
##
## strength_training_7d school_night_hours_sleep physical_3plus
## Min. :0.000 Length:8906 Length:8906
## 1st Qu.:2.000 Class :character Class :character
## Median :4.000 Mode :character Mode :character
## Mean :3.865
## 3rd Qu.:6.000
## Max. :7.000
## NA's :625
yrbss %>%
filter(physical_3plus =="no") %>%
summary(physical_3plus)
## age gender grade hispanic
## Min. :12.00 Length:4404 Length:4404 Length:4404
## 1st Qu.:15.00 Class :character Class :character Class :character
## Median :16.00 Mode :character Mode :character Mode :character
## Mean :16.27
## 3rd Qu.:17.00
## Max. :18.00
## NA's :24
## race height weight helmet_12m
## Length:4404 Min. :1.270 Min. : 29.94 Length:4404
## Class :character 1st Qu.:1.600 1st Qu.: 54.43 Class :character
## Mode :character Median :1.650 Median : 62.60 Mode :character
## Mean :1.666 Mean : 66.67
## 3rd Qu.:1.730 3rd Qu.: 74.84
## Max. :2.110 Max. :180.99
## NA's :382 NA's :382
## text_while_driving_30d physically_active_7d hours_tv_per_school_day
## Length:4404 Min. :0.0000 Length:4404
## Class :character 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Mode :character
## Mean :0.7952
## 3rd Qu.:2.0000
## Max. :2.0000
##
## strength_training_7d school_night_hours_sleep physical_3plus
## Min. :0.000 Length:4404 Length:4404
## 1st Qu.:0.000 Class :character Class :character
## Median :0.000 Mode :character Mode :character
## Mean :1.101
## 3rd Qu.:2.000
## Max. :7.000
## NA's :298
obs_diff <- yrbss %>%
specify(weight ~ physical_3plus) %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
set.seed(061582)
null_dist2 <- yrbss %>%
specify(weight ~ physical_3plus) %>%
hypothesize(null = "independence") %>%
generate(reps = 4000, type = "permute") %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
ggplot(data = null_dist2, aes(x = stat)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
null_dist %>%
get_p_value(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0