# Load the yrbss data set into the workspace, then open its help page.
data("yrbss")
help("yrbss")
## starting httpd help server ... done
EX1 What are the cases in this data set? How many cases are there in our sample?
Remember that you can answer this question by viewing the data in the data viewer or by using the following command: We see that there are 13 columns (variables) and 13,583 rows (cases). The data appear to describe youth activity (behaviors).
# Compact overview of the data: dimensions, column types and first values.
yrbss %>% glimpse()
## Rows: 13,583
## Columns: 13
## $ age <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15...
## $ gender <chr> "female", "female", "female", "female", "f...
## $ grade <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9...
## $ hispanic <chr> "not", "not", "hispanic", "not", "not", "n...
## $ race <chr> "Black or African American", "Black or Afr...
## $ height <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88...
## $ weight <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54...
## $ helmet_12m <chr> "never", "never", "never", "never", "did n...
## $ text_while_driving_30d <chr> "0", NA, "30", "0", "did not drive", "did ...
## $ physically_active_7d <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, ...
## $ hours_tv_per_school_day <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5...
## $ strength_training_7d <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, ...
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "...
# Re-open the data set documentation, then print the numeric summary of the
# weight column (quartiles, mean and the number of NA values).
help("yrbss")
summary(yrbss[["weight"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 29.94 56.25 64.41 67.91 76.20 180.99 1004
EX2 How many observations are we missing weights from? Weights are missing for 1,004 observations (the NA's count in the summary above).
# Add physical_3plus: "yes" when physically_active_7d is greater than 2,
# "no" otherwise; rows with a missing activity count stay NA.
# The column is referenced by its bare name so that mutate() evaluates it
# inside the piped data frame instead of reaching back to the global `yrbss`
# object (which would silently misalign if the pipe ever filtered or grouped).
yrbss <- yrbss %>%
  mutate(physical_3plus = ifelse(physically_active_7d > 2, "yes", "no"))
EX3 Make a side-by-side boxplot of physical_3plus and weight. Is there a relationship between these two variables? What did you expect and why? The medians of the two groups are close to each other; the mean weight of the “yes” group is just above that of the “no” group. Most of the outliers appear in the “no” group. This was expected, since both groups come from the same general population, i.e. “teens”.
# Average weight within each physical_3plus group, ignoring missing weights.
summarise(
  group_by(yrbss, physical_3plus),
  mean_weight = mean(weight, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## physical_3plus mean_weight
## <chr> <dbl>
## 1 no 66.7
## 2 yes 68.4
## 3 <NA> 69.9
# Side-by-side boxplots of weight, one box per physical_3plus value.
boxplot(
  weight ~ physical_3plus,
  data = yrbss,
  col = "orange",
  main = "Distribution of Physical Activity and Weight",
  xlab = "Physical Activity",
  ylab = "Weight"
)
# Mean weight per physical_3plus group (missing weights are dropped).
group_by(yrbss, physical_3plus) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## physical_3plus mean_weight
## <chr> <dbl>
## 1 no 66.7
## 2 yes 68.4
## 3 <NA> 69.9
EX4 Are all conditions necessary for inference satisfied? Comment on each. You can compute the group sizes with the summarize command above by defining a new variable with the definition n(). Independence: the youths in the sample are independent of one another, and the data are already described as a random sample of observations. Sample size: both groups computed below contain far more than 30 observations, so the sampling distribution of each group mean can be treated as nearly normal.
# Number of rows in each physical_3plus group (the NA group is included).
summarise(group_by(yrbss, physical_3plus), n_weight = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## physical_3plus n_weight
## <chr> <int>
## 1 no 4404
## 2 yes 8906
## 3 <NA> 273
EX5 Write the hypotheses for testing if the average weights are different for those who exercise at least three times a week and those who don’t.
H_null: there is no difference in the average weight between those who exercise at least 3 times a week and those who don’t.
H_alt: there is a difference in the average weight between those who exercise at least 3 times a week and those who don’t.
# Observed difference in mean weight between the physical_3plus groups,
# taken as "yes" minus "no".
obs_diff <- calculate(
  specify(yrbss, weight ~ physical_3plus),
  stat = "diff in means",
  order = c("yes", "no")
)
## Warning: Removed 1219 rows containing missing values.
# Null distribution of the difference in means ("yes" minus "no"): 1000
# permutation replicates generated under the independence hypothesis.
null_dist <- specify(yrbss, formula = weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(type = "permute", reps = 1000) %>%
  calculate(order = c("yes", "no"), stat = "diff in means")
## Warning: Removed 1219 rows containing missing values.
# Histogram of the permuted differences in means, using the default binning.
ggplot(null_dist, mapping = aes(x = stat)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EX6 How many of these null permutations have a difference of at least obs_stat? None of the null permutations has a difference at least as large as obs_stat.
# Two-sided p-value of the observed difference against the null distribution.
get_p_value(null_dist, obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
EX7 Construct and record a confidence interval for the difference between the weights of those who exercise at least three times a week and those who don’t, and interpret this interval in context of the data. The 95% confidence interval of the difference between those who exercise regularly and those who do not is between 1.76 and 1.79kg. This means that there is a difference between those who exercise and those who do not and we should reject the null hypothesis.
# 95% confidence interval for the difference in mean weight between the
# physical_3plus "yes" group and the "no" group.
#
# The interval is point estimate +/- z * SE. For a difference of two
# independent means the standard error is sqrt(s1^2 / n1 + s2^2 / n2); the
# earlier version used the formula for a difference in proportions, which
# produced a far too narrow interval. Rows with a missing weight are removed
# before summarising so that `total` is the number of observations that
# actually enter each mean.
p1 <- yrbss %>%
  filter(physical_3plus == "yes", !is.na(weight)) %>%
  summarise(mean_weight = mean(weight),
            sd_weight = sd(weight),
            total = n())
p2 <- yrbss %>%
  filter(physical_3plus == "no", !is.na(weight)) %>%
  summarise(mean_weight = mean(weight),
            sd_weight = sd(weight),
            total = n())
# Number of rows with a recorded weight, derived from the data instead of the
# previously hard-coded 13583 - 1004.
n <- sum(!is.na(yrbss$weight))
mean_diff <- p1$mean_weight - p2$mean_weight
se <- sqrt(p1$sd_weight^2 / p1$total + p2$sd_weight^2 / p2$total)
z <- 1.96  # standard normal critical value for 95% confidence
me <- z * se
ci_95 <- c(mean_diff - me, mean_diff + me)
ci_95
## [1] 1.763068 1.786101
EX8 Calculate a 95% confidence interval for the average height in meters (height) and interpret it in context.
# 95% t-based confidence interval for the mean of the height column.
#
# height contains missing values, so mean() and sd() must be called with
# na.rm = TRUE (without it both return NA and so do the interval bounds), and
# the sample size must be the number of recorded heights rather than a
# hard-coded constant.
x <- sum(!is.na(yrbss$height))  # number of non-missing heights
df <- x - 1                     # degrees of freedom for the t distribution
mean_h <- mean(yrbss$height, na.rm = TRUE)
sd_h <- sd(yrbss$height, na.rm = TRUE)
SE <- sd_h / sqrt(x)
# Upper-tail critical value leaving 2.5% in each tail.
t_value <- qt(0.05 / 2, df, lower.tail = FALSE)
point_estimate <- mean_h
lower_CI <- point_estimate - t_value * SE
upper_CI <- point_estimate + t_value * SE
lower_CI
upper_CI
## [1] NA
1.69481
## [1] 1.69481
1.699298
## [1] 1.699298
EX9 Calculate a new confidence interval for the same parameter at the 90% confidence level. Comment on the width of this interval versus the one obtained in the previous exercise.
# 90% confidence interval for the mean height, from 1000 bootstrap
# resamples of the height column.
height_distx <- specify(yrbss, response = height) %>%
  generate(type = "bootstrap", reps = 1000) %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.9)
## Warning: Removed 1004 rows containing missing values.
height_distx
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 1.69 1.69
EX10 Conduct a hypothesis test evaluating whether the average height is different for those who exercise at least three times a week and those who don’t. H0: The average height of people who exercise at least three times a week is the same as that of those who don’t.
H1: The average height of people who exercise at least three times a week differs from that of those who don’t.
# Heights for each physical_3plus group, with rows lacking a height removed,
# followed by side-by-side boxplots of the two groups.
height_exercise9 <- na.omit(
  select(filter(yrbss, physical_3plus == "yes"), height)
)
height_noexercise9 <- na.omit(
  select(filter(yrbss, physical_3plus == "no"), height)
)
boxplot(
  height_exercise9$height,
  height_noexercise9$height,
  names = c("exercise", "no_exercise")
)
EX11 Now, a non-inference task: Determine the number of different options there are in the dataset for hours_tv_per_school_day. Viewing the output below, there are 7 options (the eighth row is the group of missing values, NA).
# Row count for every distinct value of hours_tv_per_school_day (NA included).
summarise(group_by(yrbss, hours_tv_per_school_day), n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 2
## hours_tv_per_school_day `n()`
## <chr> <int>
## 1 <1 2168
## 2 1 1750
## 3 2 2705
## 4 3 2139
## 5 4 1048
## 6 5+ 1595
## 7 do not watch 1840
## 8 <NA> 338
EX12 Come up with a research question evaluating the relationship between height or weight and sleep. Formulate the question in a way that it can be answered using a hypothesis test and/or a confidence interval. Report the statistical results, and also provide an explanation in plain language. Be sure to check all assumptions, state your α level, and conclude in context. H0: there is no relationship between getting a good night's sleep and weight (the average weight is the same in both sleep groups).
H1: there is a relationship between getting a good night's sleep and weight (the average weight differs between the sleep groups).
# Add sleep_7plus: "yes" for students reporting 7 or more hours of sleep on
# school nights, "no" for fewer, NA when the answer is missing.
#
# school_night_hours_sleep is a character column (values such as "<5", "6",
# "8"), so the earlier `> 6` test was a lexicographic string comparison: it
# classed "10+" as "no" and its treatment of "<5" depended on the locale's
# collation order. Membership in an explicit set of categories avoids both.
# NOTE(review): assumes the categories at or above 7 hours are exactly
# "7", "8", "9" and "10+" -- confirm with
# unique(yrbss$school_night_hours_sleep).
yrbss <- yrbss %>%
  mutate(sleep_7plus = ifelse(
    is.na(school_night_hours_sleep),
    NA_character_,
    ifelse(school_night_hours_sleep %in% c("7", "8", "9", "10+"), "yes", "no")
  ))
# Side-by-side boxplots of weight for the two sleep groups.
boxplot(yrbss$weight ~ yrbss$sleep_7plus, col = "red",
        main = "Distribution of Good night sleep and Weight",
        ylab = "Weight", xlab = "Good night sleep")