suppressMessages(library(tidymodels))

data(ad_data, package = 'modeldata')

ad_data %>%
               select(Genotype, Class)

## # A tibble: 333 x 2
##    Genotype Class   
##    <fct>    <fct>   
##  1 E3E3     Control 
##  2 E3E4     Control 
##  3 E3E4     Control 
##  4 E3E4     Control 
##  5 E3E3     Control 
##  6 E4E4     Impaired
##  7 E2E3     Control 
##  8 E2E3     Control 
##  9 E3E3     Control 
## 10 E2E3     Impaired
## # ... with 323 more rows

Test of indepdence

To carry out a chi-squared test of independence, we’ll examine the association between their cognitive ability (impaired and healthy) and the genetic makeup. This is what the relationship looks like in the sample data:

If there is no relationship, we would expect to see the purple bars reaching to the same length, regardless of cognitive ability. Are the differences we see here, though, just due to random noise?

First, to calculate the observed statistics, we can use `specify()andcalculate()``

#calculate the observed statistic

observed_indep_statistic <- ad_data %>%
               specify(Genotype ~ Class) %>%
               calculate(stat = 'Chisq')

observed_indep_statistic

## Response: Genotype (factor)
## Explanatory: Class (factor)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  21.6

The observed X^2 statistic is 21.577. Now, we want to compare this statistic to a null distribution generated under the assumption that these variables are not actually related, to get a sense of how likely it would be for us to see this observed statistic if there were actually no association between cognitive ability and genetics.

We can generate the null distribution in one of 2 ways: using randomization or theory-based methods. The randomization approach permutes the response and explanatory variables, so that each person;s genetic is matched up with a random cognitive rating from the sample in oroder to break up any association between the two

#generate the null distribution using randomization 

null_distribution_simulated <- ad_data %>%
  specify(Genotype ~ Class) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 5000, type = "permute") %>%
  calculate(stat = "Chisq")

#generate the null distribution by theoretical approximation

null_distribution_theoretical <- ad_data %>%
               specify(Genotype ~ Class) %>%
               hypothesize(null = 'independence') %>%
               calculate(stat = 'Chisq')

To get a sense for what these distributions look like, and where our observed statistic falls, we can use visualize():

#visualize the null distribution and test statistic 

theme_set(theme_light())

null_distribution_simulated %>%
               visualize() +
               shade_p_value(observed_indep_statistic,
                             direction = 'greater')

# visualize the theoretical null distribution and test statistic!
ad_data %>%
  specify(Genotype ~ Class) %>%
  hypothesize(null = "independence") %>%
  visualize(method = "theoretical") + 
  shade_p_value(observed_indep_statistic,
                direction = "greater")

## Rather than setting `method = "theoretical"` with a simulation-based null distribution, the preferred method for visualizing theory-based distributions with infer is now to pass the output of `assume()` as the first argument to `visualize()`.

## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

#visualize the null distribution and test statistic 

theme_set(theme_light())

null_distribution_simulated %>%
               visualize(method = 'both') +
               shade_p_value(observed_indep_statistic,
                             direction = 'greater')

## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

# calculate the p value from the observed statistic and null distribution
p_value_independence <- null_distribution_simulated %>%
  get_p_value(obs_stat = observed_indep_statistic,
              direction = "greater")

p_value_independence

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1  0.0004

chisq_test(ad_data, Genotype ~ Class)

## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
## incorrect

## # A tibble: 1 x 3
##   statistic chisq_df  p_value
##       <dbl>    <int>    <dbl>
## 1      21.6        5 0.000630

Goodness of Fit

# Song, Y., Stampfer, M. J., & Liu, S. (2004). Meta-Analysis: Apolipoprotein E 
# Genotypes and Risk for Coronary Heart Disease. Annals of Internal Medicine, 
# 141(2), 137.
meta_rates <- c("E2E2" = 0.71, "E2E3" = 11.4, "E2E4" = 2.32,
                "E3E3" = 61.0, "E3E4" = 22.6, "E4E4" = 2.22)
meta_rates <- meta_rates/sum(meta_rates) # these add up to slightly > 100%

obs_rates <- table(ad_data$Genotype)/nrow(ad_data)
round(cbind(obs_rates, meta_rates) * 100, 2)

##      obs_rates meta_rates
## E2E2      0.60       0.71
## E2E3     11.11      11.37
## E2E4      2.40       2.31
## E3E3     50.15      60.85
## E3E4     31.83      22.54
## E4E4      3.90       2.21

null_distribution_gof <- ad_data %>%
  specify(response = Genotype) %>%
  hypothesize(null = "point", p = meta_rates) %>%
  generate(reps = 5000, type = "simulate") %>%
  calculate(stat = "Chisq")

## The `"simulate"` generation type has been renamed to `"draw"`. Use `type = "draw"` instead to quiet this message.

null_distribution_gof

## Response: Genotype (factor)
## Null Hypothesis: point
## # A tibble: 5,000 x 2
##    replicate  stat
##    <fct>     <dbl>
##  1 1          6.37
##  2 2          3.37
##  3 3          5.56
##  4 4          3.02
##  5 5          6.98
##  6 6          4.23
##  7 7          6.19
##  8 8          5.18
##  9 9          5.57
## 10 10         1.17
## # ... with 4,990 more rows

observed_gof_statistic <- ad_data %>%
  specify(response = Genotype) %>%
  hypothesize(null = "point", p = meta_rates) %>%
  calculate(stat = "Chisq")

null_distribution_gof %>%
  visualize() + 
  shade_p_value(observed_gof_statistic,
                direction = "greater")

Untitled

Nguyen_LSCM

1/8/2022

Test of indepdence

Goodness of Fit