Youth Risk Behaviour Surveillance System (YRBSS) survey
Selected group of variables from a random sample of observations collected from highschoolers (9th-12th grade) on a given year the YRBSS was conducted.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
data("yrbss")
?yrbss
Exercise 1: There are 13,583 cases in this data sample. Each case is represented by a HS student and the respective answers.
glimpse(yrbss)
## Rows: 13,583
## Columns: 13
## $ age                      <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, …
## $ gender                   <chr> "female", "female", "female", "female", "fem…
## $ grade                    <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9",…
## $ hispanic                 <chr> "not", "not", "hispanic", "not", "not", "not…
## $ race                     <chr> "Black or African American", "Black or Afric…
## $ height                   <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, …
## $ weight                   <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, …
## $ helmet_12m               <chr> "never", "never", "never", "never", "did not…
## $ text_while_driving_30d   <chr> "0", NA, "30", "0", "did not drive", "did no…
## $ physically_active_7d     <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7,…
## $ hours_tv_per_school_day  <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+"…
## $ strength_training_7d     <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7,…
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5…
Exploratory data analysis
ggplot(data = yrbss, aes(x = weight)) +
  geom_histogram(binwidth = 2) +
  labs(
    x = "Weight (kgs)", y = "frequency",
    title = "Weight distribution of HS students")
## Warning: Removed 1004 rows containing non-finite values (stat_bin).

yrbss %>%
  filter(!is.na(weight)) %>%
ggplot() +
    geom_histogram(aes(x = weight), binwidth = 2, colour = "black", fill = "white") +
    facet_wrap(~ gender) 

  ggplot(data = yrbss, aes(sample = weight)) + 
  geom_line(stat = "qq") +
  labs(title = "Probability (Q-Q) plot")
## Warning: Removed 1004 rows containing non-finite values (stat_qq).

summary(yrbss$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   29.94   56.25   64.41   67.91   76.20  180.99    1004
yrbss %>%
  filter(!is.na(weight), gender =="male") %>%
  summary(weight)
##       age           gender             grade             hispanic        
##  Min.   :12.00   Length:6414        Length:6414        Length:6414       
##  1st Qu.:15.00   Class :character   Class :character   Class :character  
##  Median :16.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :16.22                                                           
##  3rd Qu.:17.00                                                           
##  Max.   :18.00                                                           
##                                                                          
##      race               height          weight        helmet_12m       
##  Length:6414        Min.   :1.270   Min.   : 31.75   Length:6414       
##  Class :character   1st Qu.:1.700   1st Qu.: 61.69   Class :character  
##  Mode  :character   Median :1.750   Median : 70.31   Mode  :character  
##                     Mean   :1.757   Mean   : 73.64                     
##                     3rd Qu.:1.800   3rd Qu.: 81.65                     
##                     Max.   :2.110   Max.   :180.99                     
##                                                                        
##  text_while_driving_30d physically_active_7d hours_tv_per_school_day
##  Length:6414            Min.   :0.000        Length:6414            
##  Class :character       1st Qu.:3.000        Class :character       
##  Mode  :character       Median :5.000        Mode  :character       
##                         Mean   :4.537                               
##                         3rd Qu.:7.000                               
##                         Max.   :7.000                               
##                         NA's   :142                                 
##  strength_training_7d school_night_hours_sleep
##  Min.   :0.000        Length:6414             
##  1st Qu.:1.000        Class :character        
##  Median :4.000        Mode  :character        
##  Mean   :3.596                                
##  3rd Qu.:6.000                                
##  Max.   :7.000                                
##  NA's   :554
yrbss %>%
  filter(!is.na(weight), gender =="female") %>%
  summary(weight)
##       age           gender             grade             hispanic        
##  Min.   :12.00   Length:6165        Length:6165        Length:6165       
##  1st Qu.:15.00   Class :character   Class :character   Class :character  
##  Median :16.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :16.12                                                           
##  3rd Qu.:17.00                                                           
##  Max.   :18.00                                                           
##                                                                          
##      race               height          weight        helmet_12m       
##  Length:6165        Min.   :1.270   Min.   : 29.94   Length:6165       
##  Class :character   1st Qu.:1.570   1st Qu.: 52.16   Class :character  
##  Mode  :character   Median :1.630   Median : 58.97   Mode  :character  
##                     Mean   :1.623   Mean   : 61.94                     
##                     3rd Qu.:1.680   3rd Qu.: 68.04                     
##                     Max.   :1.980   Max.   :148.33                     
##                                                                        
##  text_while_driving_30d physically_active_7d hours_tv_per_school_day
##  Length:6165            Min.   :0.000        Length:6165            
##  Class :character       1st Qu.:1.000        Class :character       
##  Mode  :character       Median :3.000        Mode  :character       
##                         Mean   :3.293                               
##                         3rd Qu.:5.000                               
##                         Max.   :7.000                               
##                         NA's   :73                                  
##  strength_training_7d school_night_hours_sleep
##  Min.   :0.000        Length:6165             
##  1st Qu.:0.000        Class :character        
##  Median :2.000        Mode  :character        
##  Mean   :2.321                                
##  3rd Qu.:4.000                                
##  Max.   :7.000                                
##  NA's   :490
Exercise 2: Variable - Weight-Physical Activity relationship
(missing 1004 observations from student weights)
yrbss <- yrbss %>%
  mutate(physical_3plus = ifelse(yrbss$physically_active_7d > 2,"yes","no"))
Exercise 3: Boxplot - Weight-Physical Activity relationship
The boxplots below illustrate that there is a relationship between physical activity and weight, in particular with outliers. Students who engage in physical activity at least 3 days a week, tend to have lower weight than those who do not. Weights of HS students who do not engage in physical activity at least 3 days a week are also more dispersed and contain higher-weight outliers than those who do practice a physical activity at least 3 days a week. The middle 50% of the distribution is surprising to me, as I did not expect students who engage in physical activity at least 3 times a week to have slighty higher weights than those who do not.
yrbss %>%
  filter(!is.na(physical_3plus)) %>%
  ggplot()+
  geom_boxplot(mapping = aes(x = physical_3plus, y = weight))
## Warning: Removed 946 rows containing non-finite values (stat_boxplot).

yrbss %>%
  group_by(physical_3plus) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
##   physical_3plus mean_weight
##   <chr>                <dbl>
## 1 no                    66.7
## 2 yes                   68.4
## 3 <NA>                  69.9
Inference
Exercise 4: In general, it does appear that the basic conditions for statistical inference may be satisfied. although The sample has been randomly selected, it is not clear that the sampling distribution is not approximately normal by the histogram, though based on the qq plot, and the size of the sample one could say that it is. Lastly, 3) Each observation is also independent from one another. Therefore, this may satisfy the basic conditions for statistical inference.
yrbss%>%
  count(physical_3plus) %>%
  mutate(sizes = n /sum(n))
## # A tibble: 3 x 3
##   physical_3plus     n  sizes
##   <chr>          <int>  <dbl>
## 1 no              4404 0.324 
## 2 yes             8906 0.656 
## 3 <NA>             273 0.0201
Exercise 5: Write the hypotheses for testing if the average weights are different for those who exercise at least times a week and those who don’t
  1. The mean weight of the population of HS students who exercise at least 3 times a week is at least 69.936 kgs
  2. The mean weight of the population of HS students who do not exercise at least 3 times a week is at least 68.448 kgs
initializing the hypothesis test:

Notice how you can use the functions specify and calculate again like you did for calculating confidence intervals. Here, though, the statistic you are searching for is the difference in means, with the order being yes - no != 0.

After you have initialized the test, you need to simulate the test on the null distribution, which we will save as null.

obs_diff <- yrbss %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
Simulating the hypothesis test:

Here, hypothesize is used to set the null hypothesis as a test for independence. In one sample cases, the null argument can be set to “point” to test a hypothesis relative to a point estimate.

Also, note that the type argument within generate is set to permute, which is the argument when generating a null distribution for a hypothesis test.

null_dist <- yrbss %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
Visualizing this null distribution with the following code:
ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 6: Calculating the p-value for the hypothesis test using the function “get_p_value”:
null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0
Exercise 7: Construct and record a confidence interval for the difference between the weights of those who exercise at least three times a week and those who don’t, and interpret this interval in context of the data.
#library(BSDA)
#tsum.test(68.45,xx,8906,66.67,xx,4404, conf.level = .95)

Confidence interval example…

#samp %>%
 # specify(response = climate_change_affects, success = "Yes") %>%
  #generate(reps = 1000, type = "bootstrap") %>%
  #calculate(stat = "prop") %>%
  #get_ci(level = 0.95)
yrbss %>%
filter(physical_3plus =="yes") %>%
summary(physical_3plus)
##       age          gender             grade             hispanic        
##  Min.   :12.0   Length:8906        Length:8906        Length:8906       
##  1st Qu.:15.0   Class :character   Class :character   Class :character  
##  Median :16.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :16.1                                                           
##  3rd Qu.:17.0                                                           
##  Max.   :18.0                                                           
##  NA's   :47                                                             
##      race               height          weight        helmet_12m       
##  Length:8906        Min.   :1.270   Min.   : 33.11   Length:8906       
##  Class :character   1st Qu.:1.630   1st Qu.: 56.70   Class :character  
##  Mode  :character   Median :1.700   Median : 65.77   Mode  :character  
##                     Mean   :1.703   Mean   : 68.45                     
##                     3rd Qu.:1.780   3rd Qu.: 77.11                     
##                     Max.   :2.110   Max.   :160.12                     
##                     NA's   :564     NA's   :564                        
##  text_while_driving_30d physically_active_7d hours_tv_per_school_day
##  Length:8906            Min.   :3.00         Length:8906            
##  Class :character       1st Qu.:4.00         Class :character       
##  Mode  :character       Median :6.00         Mode  :character       
##                         Mean   :5.44                                
##                         3rd Qu.:7.00                                
##                         Max.   :7.00                                
##                                                                     
##  strength_training_7d school_night_hours_sleep physical_3plus    
##  Min.   :0.000        Length:8906              Length:8906       
##  1st Qu.:2.000        Class :character         Class :character  
##  Median :4.000        Mode  :character         Mode  :character  
##  Mean   :3.865                                                   
##  3rd Qu.:6.000                                                   
##  Max.   :7.000                                                   
##  NA's   :625
yrbss %>%
filter(physical_3plus =="no") %>%
summary(physical_3plus)
##       age           gender             grade             hispanic        
##  Min.   :12.00   Length:4404        Length:4404        Length:4404       
##  1st Qu.:15.00   Class :character   Class :character   Class :character  
##  Median :16.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :16.27                                                           
##  3rd Qu.:17.00                                                           
##  Max.   :18.00                                                           
##  NA's   :24                                                              
##      race               height          weight        helmet_12m       
##  Length:4404        Min.   :1.270   Min.   : 29.94   Length:4404       
##  Class :character   1st Qu.:1.600   1st Qu.: 54.43   Class :character  
##  Mode  :character   Median :1.650   Median : 62.60   Mode  :character  
##                     Mean   :1.666   Mean   : 66.67                     
##                     3rd Qu.:1.730   3rd Qu.: 74.84                     
##                     Max.   :2.110   Max.   :180.99                     
##                     NA's   :382     NA's   :382                        
##  text_while_driving_30d physically_active_7d hours_tv_per_school_day
##  Length:4404            Min.   :0.0000       Length:4404            
##  Class :character       1st Qu.:0.0000       Class :character       
##  Mode  :character       Median :1.0000       Mode  :character       
##                         Mean   :0.7952                              
##                         3rd Qu.:2.0000                              
##                         Max.   :2.0000                              
##                                                                     
##  strength_training_7d school_night_hours_sleep physical_3plus    
##  Min.   :0.000        Length:4404              Length:4404       
##  1st Qu.:0.000        Class :character         Class :character  
##  Median :0.000        Mode  :character         Mode  :character  
##  Mean   :1.101                                                   
##  3rd Qu.:2.000                                                   
##  Max.   :7.000                                                   
##  NA's   :298
REMEMBER THE STANDARD WORKFLOW FOR PERFORMING HYPOTHESIS TESTS:
  • Initialize the hypothesis test:
obs_diff <- yrbss %>%
  specify(weight ~ physical_3plus) %>%
 calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
  • Simulate the hypothesis test:
set.seed(061582)
null_dist2 <- yrbss %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 4000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
  • Visualize the null distribution:
ggplot(data = null_dist2, aes(x = stat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  • Calculate the p-value for the hypothesis test:
null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0