R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Load Packages:

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.2

## Warning: package 'stringr' was built under R version 4.1.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(openintro)

## Warning: package 'openintro' was built under R version 4.1.2

## Loading required package: airports

## Warning: package 'airports' was built under R version 4.1.2

## Loading required package: cherryblossom

## Warning: package 'cherryblossom' was built under R version 4.1.2

## Loading required package: usdata

## Warning: package 'usdata' was built under R version 4.1.2

library(infer)

## Warning: package 'infer' was built under R version 4.1.2

library(ggplot2)

The Data:

Load the yrbss data set

data('yrbss', package='openintro')

Exercise 1:

What are the cases in this data set? How many cases are there in our sample?

glimpse(yrbss)

## Rows: 13,583
## Columns: 13
## $ age                      <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, 1~
## $ gender                   <chr> "female", "female", "female", "female", "fema~
## $ grade                    <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9", ~
## $ hispanic                 <chr> "not", "not", "hispanic", "not", "not", "not"~
## $ race                     <chr> "Black or African American", "Black or Africa~
## $ height                   <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, 1~
## $ weight                   <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, 7~
## $ helmet_12m               <chr> "never", "never", "never", "never", "did not ~
## $ text_while_driving_30d   <chr> "0", NA, "30", "0", "did not drive", "did not~
## $ physically_active_7d     <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7, ~
## $ hours_tv_per_school_day  <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+",~
## $ strength_training_7d     <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7, ~
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5"~

Exploratory data analysis:

summary(yrbss$weight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   29.94   56.25   64.41   67.91   76.20  180.99    1004

Exercise 2:

How many observations are we missing weights from?

colSums(is.na(yrbss))

##                      age                   gender                    grade 
##                       77                       12                       79 
##                 hispanic                     race                   height 
##                      231                     2805                     1004 
##                   weight               helmet_12m   text_while_driving_30d 
##                     1004                      311                      918 
##     physically_active_7d  hours_tv_per_school_day     strength_training_7d 
##                      273                      338                     1176 
## school_night_hours_sleep 
##                     1248

Extract Rows with NA in Specific Column

na_weight <- yrbss[is.na(yrbss$weight), ]  
nrow(na_weight)

## [1] 1004

create a new variable physical_3plus, which will be coded as either “yes” if they are

physically active for at least 3 days a week, and “no” if not.

yrbss <- yrbss %>% 
  mutate(physical_3plus = ifelse(yrbss$physically_active_7d > 2, "yes", "no"))

yrbss

## # A tibble: 13,583 x 14
##      age gender grade hispanic race    height weight helmet_12m text_while_driv~
##    <int> <chr>  <chr> <chr>    <chr>    <dbl>  <dbl> <chr>      <chr>           
##  1    14 female 9     not      Black ~  NA      NA   never      0               
##  2    14 female 9     not      Black ~  NA      NA   never      <NA>            
##  3    15 female 9     hispanic Native~   1.73   84.4 never      30              
##  4    15 female 9     not      Black ~   1.6    55.8 never      0               
##  5    15 female 9     not      Black ~   1.5    46.7 did not r~ did not drive   
##  6    15 female 9     not      Black ~   1.57   67.1 did not r~ did not drive   
##  7    15 female 9     not      Black ~   1.65  132.  did not r~ <NA>            
##  8    14 male   9     not      Black ~   1.88   71.2 never      <NA>            
##  9    15 male   9     not      Black ~   1.75   63.5 never      <NA>            
## 10    15 male   10    not      Black ~   1.37   97.1 did not r~ <NA>            
## # ... with 13,573 more rows, and 5 more variables: physically_active_7d <int>,
## #   hours_tv_per_school_day <chr>, strength_training_7d <int>,
## #   school_night_hours_sleep <chr>, physical_3plus <chr>

Relationship between a high schooler’s weight and their physical activity

ggplot(data=yrbss, mapping = aes(x = physically_active_7d, y = weight)) + 
  geom_point(aes(color = weight)) +
  theme_bw()

## Warning: Removed 1219 rows containing missing values (geom_point).

Exercise 3:

Make a side-by-side boxplot of physical_3plus and weight.

Is there a relationship between these two variables? What did you expect and why?

# Median looks similar as well as IQR
yrbss$physical_3plus<-as.factor(yrbss$physical_3plus)
yrb_plot<-yrbss%>%
  filter(!is.na(weight),!is.na(physical_3plus))
p<-ggplot(yrb_plot, aes(x=physical_3plus,y=weight))+
  geom_boxplot()
p

Comparing the mean of two groups of physical exercise

yrbss %>%
  group_by(physical_3plus) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

## # A tibble: 3 x 2
##   physical_3plus mean_weight
##   <fct>                <dbl>
## 1 no                    66.7
## 2 yes                   68.4
## 3 <NA>                  69.9

Inference:

#Exercise 4: # Are all conditions necessary for inference satisfied? # Comment on each. You can compute the group sizes with the # summarize command above by defining a new variable with the definition n().

# the two gropus are independent and groups are less than 10% of population

#create new variable n

yrbss<- yrbss %>% 
  mutate(n = ifelse(yrbss$physically_active_7d > 2, "yes", "no"))

yrbss$n<-as.factor(yrbss$n)

view(yrbss)
yrbss%>%
 count(n)

## Storing counts in `nn`, as `n` already present in input
## i Use `name = "new_name"` to pick a new name.

## # A tibble: 3 x 2
##   n        nn
##   <fct> <int>
## 1 no     4404
## 2 yes    8906
## 3 <NA>    273

Exercise 5:

Hypothesis test:

H_0: the difference in the means between physical 3 plus and less than 3 are equal (mean1-mean2)=0

H_A: the weights are diffferent !=0

obs_diff <- yrbss %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

## Warning: Removed 1219 rows containing missing values.

null_dist <- yrbss %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

## Warning: Removed 1219 rows containing missing values.

ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Exercise 6: # How many of these null permutations have a difference of at least obs_stat?

obs_diff_val<-obs_diff$stat[1]

null_list<-as.list(null_dist$stat)
null_abs<-lapply(null_list, FUN=function(x){abs(x)})

null_dist%>%
  summarise(mean= mean(stat, na.rm=TRUE))

## # A tibble: 1 x 1
##      mean
##     <dbl>
## 1 0.00726

null_dist%>%
  filter(stat>obs_diff_val)

## Response: weight (numeric)
## Explanatory: physical_3plus (factor)
## Null Hypothesis: independence
## # A tibble: 0 x 2
## # ... with 2 variables: replicate <int>, stat <dbl>

null_dist

## Response: weight (numeric)
## Explanatory: physical_3plus (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.327 
##  2         2 -0.106 
##  3         3 -0.127 
##  4         4 -0.322 
##  5         5  0.579 
##  6         6 -0.168 
##  7         7 -0.0672
##  8         8 -0.0106
##  9         9 -0.115 
## 10        10  0.0802
## # ... with 990 more rows

# none of the values are greater than the obs_diff_val

null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

null_dist

## Response: weight (numeric)
## Explanatory: physical_3plus (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.327 
##  2         2 -0.106 
##  3         3 -0.127 
##  4         4 -0.322 
##  5         5  0.579 
##  6         6 -0.168 
##  7         7 -0.0672
##  8         8 -0.0106
##  9         9 -0.115 
## 10        10  0.0802
## # ... with 990 more rows

Exercise 7:

Construct and record a confidence interval for the difference between the weights of those

who exercise at least three times a week and those who don’t, and interpret this interval in context of the data.

# number of groups
n_1<-8406
n_2<-4408

x_bar_diff<-1.78
T_score<-pt(.025,4407,lower.tail = FALSE)*2

#get sigmas of samples
sigma_1<-yrbss %>%
  group_by(physical_3plus) %>%
  summarise(sd = sd(weight, na.rm = TRUE))%>%
  filter(physical_3plus=="yes")%>%
  select(sd)%>%
  as.double()
sigma_1

## [1] 16.47832

sigma_2<-yrbss %>%
  group_by(physical_3plus) %>%
  summarise(sd = sd(weight, na.rm = TRUE))%>%
  filter(physical_3plus=="no")%>%
  select(sd)%>%
  as.double()

sigma_2

## [1] 17.63805

SE<-sqrt((sigma_1^2 /n_1)+(sigma_2^2/n_2))

bottom<-x_bar_diff-T_score*SE
top<-x_bar_diff+T_score*SE

bottom

## [1] 1.465649

top

## [1] 2.094351

# there is NO difference bewteen the two groups falls OUTSIDE our 95% confidence interval, we can REJECT the null hypothesis.

Exercise 8:

Calculate a 95% confidence interval for the average height in meters (height) and interpret it in context.

summary(yrbss$height)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.270   1.600   1.680   1.691   1.780   2.110    1004

height<-yrbss%>%
  filter(!is.na(height))%>%
  select(height)

n<-nrow(height)
df<-n-1
height<-height$height


x_bar<-mean(height)
sigma<-sd(height)
SE<-sigma/sqrt(n)
t_star<-qt(.025,df=df)

#confidence interval

bottom<-x_bar-abs(t_star*SE)
top<-x_bar+abs(t_star*SE)

bottom

## [1] 1.689411

top

## [1] 1.693071

t_star<-abs(qt(.05,df=df))
# the rest is the same
x_bar<-mean(height)
sigma<-sd(height)
SE<-sigma/sqrt(n)
bottom_1<-x_bar-abs(t_star*SE)
top_1<-x_bar+abs(t_star*SE)

cat("the 90% confidence interval is ",bottom_1," to ",top_1)

## the 90% confidence interval is  1.689705  to  1.692777

# the difference is minor

yrbss<-as.data.frame(yrbss)
ggplot(yrbss, aes(x=height))+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1004 rows containing non-finite values (stat_bin).

# Exercise 10: # Conduct a hypothesis test evaluating whether the average height is different for those who exercise # at least three times a week and those who don’t.

obs_diff <- yrbss %>%
  specify(height ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

## Warning: Removed 1219 rows containing missing values.

# hypothesis test:

# H_0: the difference in the height for the two groups is zero

# (mean_diff=0)

# H_A: the difference in heights is NOT zero

# (mean_diff!=0)

#physically active count
n_1<-yrbss%>%
  filter(physical_3plus=="yes")%>%
  nrow()
n_2<-yrbss%>%
  filter(physical_3plus=="no")%>%
  nrow()
#difference in mean height
obs_diff <- yrbss %>%
  specify(height ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))#physically active count

## Warning: Removed 1219 rows containing missing values.

n_1<-yrbss%>%
  filter(physical_3plus=="yes")%>%
  nrow()
n_2<-yrbss%>%
  filter(physical_3plus=="no")%>%
  nrow()
#difference in mean height
obs_diff <- yrbss %>%
  specify(height ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

## Warning: Removed 1219 rows containing missing values.

#Sandard deviation for physically active group:

null<-0

sigma_1<-yrbss %>%
  group_by(physical_3plus) %>%
  summarise(sd = sd(height, na.rm = TRUE))%>%
  filter(physical_3plus=="yes")%>%
  select(sd)%>%
  as.double()

```

#Sandard deviation for not physically active group:

sigma_2<-yrbss %>%
  group_by(physical_3plus) %>%
  summarise(sd = sd(height, na.rm = TRUE))%>%
  filter(physical_3plus=="no")%>%
  select(sd)%>%
  as.double()

Caluculation fro Stanadrd Error

SE<-sqrt((sigma_1^2 /n_1)+(sigma_2^2/n_2))
T_score<-abs(qt(.025,4407))

bottom<-x_bar_diff-T_score*SE
top<-x_bar_diff+T_score*SE

bottom

## [1] 1.77628

top

## [1] 1.78372

# The null value falls outside this range so we can REJECT the null hypothesis

Exercise 11:

Now, a non-inference task: Determine the number of different options there are

in the dataset for the hours_tv_per_school_day there are.

yrbss%>%
  filter(!is.na(hours_tv_per_school_day))%>%
  select(hours_tv_per_school_day)%>%
  unique()

##    hours_tv_per_school_day
## 1                       5+
## 4                        2
## 5                        3
## 10            do not watch
## 12                      <1
## 14                       4
## 19                       1

Exercise 12:

Come up with a research question evaluating the relationship between height or weight and sleep.

Exploring data:

yrbss$hours_tv_per_school_day<-as.factor(yrbss$hours_tv_per_school_day)
yrb_plot<-yrbss%>%
  filter(!is.na(weight),!is.na(hours_tv_per_school_day))
p<-ggplot(yrb_plot, aes(x=hours_tv_per_school_day,y=weight))+
  geom_boxplot()
p

# looking at the box plots, it looks like there is no major difference between weight and how much TV one watches.

Week 7 - Inference for Numerical Data