This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Warning: package 'openintro' was built under R version 4.1.2
## Loading required package: airports
## Warning: package 'airports' was built under R version 4.1.2
## Loading required package: cherryblossom
## Warning: package 'cherryblossom' was built under R version 4.1.2
## Loading required package: usdata
## Warning: package 'usdata' was built under R version 4.1.2
library(infer)
## Warning: package 'infer' was built under R version 4.1.2
library(ggplot2)
data('yrbss', package='openintro')
glimpse(yrbss)
## Rows: 13,583
## Columns: 13
## $ age <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, 1~
## $ gender <chr> "female", "female", "female", "female", "fema~
## $ grade <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9", ~
## $ hispanic <chr> "not", "not", "hispanic", "not", "not", "not"~
## $ race <chr> "Black or African American", "Black or Africa~
## $ height <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, 1~
## $ weight <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, 7~
## $ helmet_12m <chr> "never", "never", "never", "never", "did not ~
## $ text_while_driving_30d <chr> "0", NA, "30", "0", "did not drive", "did not~
## $ physically_active_7d <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7, ~
## $ hours_tv_per_school_day <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+",~
## $ strength_training_7d <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7, ~
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5"~
summary(yrbss$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 29.94 56.25 64.41 67.91 76.20 180.99 1004
colSums(is.na(yrbss))
## age gender grade
## 77 12 79
## hispanic race height
## 231 2805 1004
## weight helmet_12m text_while_driving_30d
## 1004 311 918
## physically_active_7d hours_tv_per_school_day strength_training_7d
## 273 338 1176
## school_night_hours_sleep
## 1248
na_weight <- yrbss[is.na(yrbss$weight), ]
nrow(na_weight)
## [1] 1004
yrbss <- yrbss %>%
mutate(physical_3plus = ifelse(yrbss$physically_active_7d > 2, "yes", "no"))
yrbss
## # A tibble: 13,583 x 14
## age gender grade hispanic race height weight helmet_12m text_while_driv~
## <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 14 female 9 not Black ~ NA NA never 0
## 2 14 female 9 not Black ~ NA NA never <NA>
## 3 15 female 9 hispanic Native~ 1.73 84.4 never 30
## 4 15 female 9 not Black ~ 1.6 55.8 never 0
## 5 15 female 9 not Black ~ 1.5 46.7 did not r~ did not drive
## 6 15 female 9 not Black ~ 1.57 67.1 did not r~ did not drive
## 7 15 female 9 not Black ~ 1.65 132. did not r~ <NA>
## 8 14 male 9 not Black ~ 1.88 71.2 never <NA>
## 9 15 male 9 not Black ~ 1.75 63.5 never <NA>
## 10 15 male 10 not Black ~ 1.37 97.1 did not r~ <NA>
## # ... with 13,573 more rows, and 5 more variables: physically_active_7d <int>,
## # hours_tv_per_school_day <chr>, strength_training_7d <int>,
## # school_night_hours_sleep <chr>, physical_3plus <chr>
ggplot(data=yrbss, mapping = aes(x = physically_active_7d, y = weight)) +
geom_point(aes(color = weight)) +
theme_bw()
## Warning: Removed 1219 rows containing missing values (geom_point).
# Median looks similar as well as IQR
yrbss$physical_3plus<-as.factor(yrbss$physical_3plus)
yrb_plot<-yrbss%>%
filter(!is.na(weight),!is.na(physical_3plus))
p<-ggplot(yrb_plot, aes(x=physical_3plus,y=weight))+
geom_boxplot()
p
yrbss %>%
group_by(physical_3plus) %>%
summarise(mean_weight = mean(weight, na.rm = TRUE))
## # A tibble: 3 x 2
## physical_3plus mean_weight
## <fct> <dbl>
## 1 no 66.7
## 2 yes 68.4
## 3 <NA> 69.9
#Exercise 4: # Are all conditions necessary for inference satisfied? # Comment on each. You can compute the group sizes with the # summarize command above by defining a new variable with the definition n().
# the two gropus are independent and groups are less than 10% of population
#create new variable n
yrbss<- yrbss %>%
mutate(n = ifelse(yrbss$physically_active_7d > 2, "yes", "no"))
yrbss$n<-as.factor(yrbss$n)
view(yrbss)
yrbss%>%
count(n)
## Storing counts in `nn`, as `n` already present in input
## i Use `name = "new_name"` to pick a new name.
## # A tibble: 3 x 2
## n nn
## <fct> <int>
## 1 no 4404
## 2 yes 8906
## 3 <NA> 273
obs_diff <- yrbss %>%
specify(weight ~ physical_3plus) %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
null_dist <- yrbss %>%
specify(weight ~ physical_3plus) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
ggplot(data = null_dist, aes(x = stat)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = null_dist, aes(x = stat)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Exercise 6: # How many of these null permutations have a difference of at least obs_stat?
obs_diff_val<-obs_diff$stat[1]
null_list<-as.list(null_dist$stat)
null_abs<-lapply(null_list, FUN=function(x){abs(x)})
null_dist%>%
summarise(mean= mean(stat, na.rm=TRUE))
## # A tibble: 1 x 1
## mean
## <dbl>
## 1 0.00726
null_dist%>%
filter(stat>obs_diff_val)
## Response: weight (numeric)
## Explanatory: physical_3plus (factor)
## Null Hypothesis: independence
## # A tibble: 0 x 2
## # ... with 2 variables: replicate <int>, stat <dbl>
null_dist
## Response: weight (numeric)
## Explanatory: physical_3plus (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 -0.327
## 2 2 -0.106
## 3 3 -0.127
## 4 4 -0.322
## 5 5 0.579
## 6 6 -0.168
## 7 7 -0.0672
## 8 8 -0.0106
## 9 9 -0.115
## 10 10 0.0802
## # ... with 990 more rows
# none of the values are greater than the obs_diff_val
null_dist %>%
get_p_value(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
null_dist
## Response: weight (numeric)
## Explanatory: physical_3plus (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 -0.327
## 2 2 -0.106
## 3 3 -0.127
## 4 4 -0.322
## 5 5 0.579
## 6 6 -0.168
## 7 7 -0.0672
## 8 8 -0.0106
## 9 9 -0.115
## 10 10 0.0802
## # ... with 990 more rows
# number of groups
n_1<-8406
n_2<-4408
x_bar_diff<-1.78
T_score<-pt(.025,4407,lower.tail = FALSE)*2
#get sigmas of samples
sigma_1<-yrbss %>%
group_by(physical_3plus) %>%
summarise(sd = sd(weight, na.rm = TRUE))%>%
filter(physical_3plus=="yes")%>%
select(sd)%>%
as.double()
sigma_1
## [1] 16.47832
sigma_2<-yrbss %>%
group_by(physical_3plus) %>%
summarise(sd = sd(weight, na.rm = TRUE))%>%
filter(physical_3plus=="no")%>%
select(sd)%>%
as.double()
sigma_2
## [1] 17.63805
SE<-sqrt((sigma_1^2 /n_1)+(sigma_2^2/n_2))
bottom<-x_bar_diff-T_score*SE
top<-x_bar_diff+T_score*SE
bottom
## [1] 1.465649
top
## [1] 2.094351
# there is NO difference bewteen the two groups falls OUTSIDE our 95% confidence interval, we can REJECT the null hypothesis.
summary(yrbss$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.270 1.600 1.680 1.691 1.780 2.110 1004
height<-yrbss%>%
filter(!is.na(height))%>%
select(height)
n<-nrow(height)
df<-n-1
height<-height$height
x_bar<-mean(height)
sigma<-sd(height)
SE<-sigma/sqrt(n)
t_star<-qt(.025,df=df)
#confidence interval
bottom<-x_bar-abs(t_star*SE)
top<-x_bar+abs(t_star*SE)
bottom
## [1] 1.689411
top
## [1] 1.693071
t_star<-abs(qt(.05,df=df))
# the rest is the same
x_bar<-mean(height)
sigma<-sd(height)
SE<-sigma/sqrt(n)
bottom_1<-x_bar-abs(t_star*SE)
top_1<-x_bar+abs(t_star*SE)
cat("the 90% confidence interval is ",bottom_1," to ",top_1)
## the 90% confidence interval is 1.689705 to 1.692777
# the difference is minor
yrbss<-as.data.frame(yrbss)
ggplot(yrbss, aes(x=height))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1004 rows containing non-finite values (stat_bin).
# Exercise 10: # Conduct a hypothesis test evaluating whether the average height is different for those who exercise # at least three times a week and those who don’t.
obs_diff <- yrbss %>%
specify(height ~ physical_3plus) %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
# hypothesis test:
# H_0: the difference in the height for the two groups is zero
# (mean_diff=0)
# H_A: the difference in heights is NOT zero
# (mean_diff!=0)
#physically active count
n_1<-yrbss%>%
filter(physical_3plus=="yes")%>%
nrow()
n_2<-yrbss%>%
filter(physical_3plus=="no")%>%
nrow()
#difference in mean height
obs_diff <- yrbss %>%
specify(height ~ physical_3plus) %>%
calculate(stat = "diff in means", order = c("yes", "no"))#physically active count
## Warning: Removed 1219 rows containing missing values.
n_1<-yrbss%>%
filter(physical_3plus=="yes")%>%
nrow()
n_2<-yrbss%>%
filter(physical_3plus=="no")%>%
nrow()
#difference in mean height
obs_diff <- yrbss %>%
specify(height ~ physical_3plus) %>%
calculate(stat = "diff in means", order = c("yes", "no"))
## Warning: Removed 1219 rows containing missing values.
#Sandard deviation for physically active group:
null<-0
sigma_1<-yrbss %>%
group_by(physical_3plus) %>%
summarise(sd = sd(height, na.rm = TRUE))%>%
filter(physical_3plus=="yes")%>%
select(sd)%>%
as.double()
#Sandard deviation for not physically active group:
sigma_2<-yrbss %>%
group_by(physical_3plus) %>%
summarise(sd = sd(height, na.rm = TRUE))%>%
filter(physical_3plus=="no")%>%
select(sd)%>%
as.double()
SE<-sqrt((sigma_1^2 /n_1)+(sigma_2^2/n_2))
T_score<-abs(qt(.025,4407))
bottom<-x_bar_diff-T_score*SE
top<-x_bar_diff+T_score*SE
bottom
## [1] 1.77628
top
## [1] 1.78372
# The null value falls outside this range so we can REJECT the null hypothesis
yrbss%>%
filter(!is.na(hours_tv_per_school_day))%>%
select(hours_tv_per_school_day)%>%
unique()
## hours_tv_per_school_day
## 1 5+
## 4 2
## 5 3
## 10 do not watch
## 12 <1
## 14 4
## 19 1
yrbss$hours_tv_per_school_day<-as.factor(yrbss$hours_tv_per_school_day)
yrb_plot<-yrbss%>%
filter(!is.na(weight),!is.na(hours_tv_per_school_day))
p<-ggplot(yrb_plot, aes(x=hours_tv_per_school_day,y=weight))+
geom_boxplot()
p
# looking at the box plots, it looks like there is no major difference between weight and how much TV one watches.