Samples
library(tidyverse)
library(openintro)
library(palmerpenguins)
library(janitor)
library(interpretCI)
library(BSDA)
library(pwr)

# Fix the random seed so every sample() call in this document is reproducible
set.seed(1)

# Raise the scientific-notation penalty so numbers print in fixed notation
options(scipen = 999) 
Penguin Samples
# Keep only the penguins rows with no missing values, then display the result
penguins_tidy <- drop_na(penguins)
penguins_tidy
## # A tibble: 333 x 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           36.7          19.3               193        3450
##  5 Adelie  Torgersen           39.3          20.6               190        3650
##  6 Adelie  Torgersen           38.9          17.8               181        3625
##  7 Adelie  Torgersen           39.2          19.6               195        4675
##  8 Adelie  Torgersen           41.1          17.6               182        3200
##  9 Adelie  Torgersen           38.6          21.2               191        3800
## 10 Adelie  Torgersen           34.6          21.1               198        4400
## # i 323 more rows
## # i 2 more variables: sex <fct>, year <int>
# Mean bill length per island, one row per island
summarise(
  group_by(penguins_tidy, island),
  mean_bill_length = mean(bill_length_mm)
)
## # A tibble: 3 x 2
##   island    mean_bill_length
##   <fct>                <dbl>
## 1 Biscoe                45.2
## 2 Dream                 44.2
## 3 Torgersen             39.0
z Test vs t test, 1 sample vs 2 sample
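When to use a z test over a t test: use a z test when the population standard deviation is known (usually with a larger sample); use a t test when it is unknown and has to be estimated from the sample.
1 sample vs 2 sample: a one-sample test compares one sample mean with a hypothesized value; a two-sample test compares the means of two groups.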

https://www.analyticsvidhya.com/blog/2020/06/statistics-analytics-hypothesis-testing-z-test-t-test/

Test for proportions
  • Use a test for proportions when you want to test the proportion of some outcome in a sample
  • Test 1 sample to see whether its proportion differs from a hypothesized proportion (the null value, H0)
  • Test 2 samples to see whether the proportions in the two samples differ from each other
  • prop.test(x = c(490, 400), n = c(500, 500))
#hist(penguins_tidy$bill_length_mm)

#ggplot(data.frame(penguins_tidy$bill_length_mm), aes(sample = penguins_tidy$bill_length_mm)) +
#stat_qq() +
#stat_qq_line()
1 sample t and z test for means
# One-sample t test: only 15 observations are drawn, so the t distribution is
# used to test whether the mean bill length differs from 44.2
t.test(
  x = sample(penguins_tidy$bill_length_mm, size = 15),
  mu = 44.2,
  alternative = "two.sided"
)
## 
##  One Sample t-test
## 
## data:  sample(penguins_tidy$bill_length_mm, size = 15)
## t = 0.88371, df = 14, p-value = 0.3918
## alternative hypothesis: true mean is not equal to 44.2
## 95 percent confidence interval:
##  42.2307 48.9293
## sample estimates:
## mean of x 
##     45.58
# One-sample z test: 50 observations are drawn and a population SD is
# supplied, so a z test is used here instead of a t test
z.test(x = sample(penguins_tidy$bill_length_mm, size = 50),
       alternative = "two.sided",
       sigma.x = sd(penguins_tidy$bill_length_mm), # SD of all rows of penguins_tidy, treated as the population SD
       mu = 44.2)
## 
##  One-sample z-Test
## 
## data:  sample(penguins_tidy$bill_length_mm, size = 50)
## z = -0.60772, p-value = 0.5434
## alternative hypothesis: true mean is not equal to 44.2
## 95 percent confidence interval:
##  42.21419 45.24581
## sample estimates:
## mean of x 
##     43.73
2 sample t and z test for means
# Draw 30 bill lengths at random from each island. Biscoe is sampled before
# Dream so the seeded random-number stream is consumed in the same order.
biscoe_sample <- sample(
  pull(filter(penguins_tidy, island == "Biscoe"), bill_length_mm),
  size = 30
)

dream_sample <- sample(
  pull(filter(penguins_tidy, island == "Dream"), bill_length_mm),
  size = 30
)

# Two-sample t test on the two samples of 30: the SDs are estimated from the
# samples, and mu = 0 tests for no difference between the island means
t.test(x = biscoe_sample,
       y = dream_sample, 
       alternative = "two.sided",
       mu = 0)
## 
##  Welch Two Sample t-test
## 
## data:  biscoe_sample and dream_sample
## t = 0.3231, df = 56.794, p-value = 0.7478
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.252496  3.119163
## sample estimates:
## mean of x mean of y 
##  45.07667  44.64333
# The same comparison run as a two-sample z test, which needs an SD for the
# population behind each sample
# NOTE(review): both sigmas below are the SD of every row in penguins_tidy,
# not the SD of the Biscoe and Dream penguins separately - confirm that this
# shared value is intended before relying on the result
z.test(x = biscoe_sample,
       y = dream_sample, 
       sigma.x = sd(penguins_tidy$bill_length_mm), # SD of all penguins_tidy rows, used for the Biscoe sample
       sigma.y = sd(penguins_tidy$bill_length_mm), # SD of all penguins_tidy rows, used for the Dream sample
      mu = 0)
## 
##  Two-sample z-Test
## 
## data:  biscoe_sample and dream_sample
## z = 0.30689, p-value = 0.7589
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.334144  3.200811
## sample estimates:
## mean of x mean of y 
##  45.07667  44.64333
Power
# Raw difference between the two sample means, in the units of the data (mm)
difference <- mean(biscoe_sample) - mean(dream_sample)

# pwr.t.test() expects `d` to be a standardized effect size (Cohen's d), not
# a raw difference, so divide the difference by the pooled standard deviation
# of the two samples.
n_biscoe <- length(biscoe_sample)
n_dream <- length(dream_sample)
pooled_sd <- sqrt(
  ((n_biscoe - 1) * var(biscoe_sample) + (n_dream - 1) * var(dream_sample)) /
    (n_biscoe + n_dream - 2)
)
cohens_d <- difference / pooled_sd

# Solve for the per-group sample size (n = NULL) needed to detect this effect
# with 80% power at the 5% significance level.
# NOTE: any output rendered before this fix was computed with the raw
# difference as `d`; re-run the document to refresh it.
pwr.t.test(
  n = NULL,
  d = cohens_d,
  sig.level = 0.05,
  power = 0.8,
  type = "two.sample",
  alternative = "two.sided"
)
## 
##      Two-sample t test power calculation 
## 
##               n = 84.56849
##               d = 0.4333333
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
Two-sample test for proportions
# Compare two proportions, 490/500 against 400/500, with a two-sided test
prop.test(c(490, 400), c(500, 500), alternative = "two.sided")
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(490, 400) out of c(500, 500)
## X-squared = 80.909, df = 1, p-value < 0.00000000000000022
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1408536 0.2191464
## sample estimates:
## prop 1 prop 2 
##   0.98   0.80
Non Parametric: Wilcoxon rank sum test
# Draw 10 bill depths at random from each species. Adelie is sampled before
# Gentoo so the seeded random-number stream is consumed in the same order.
non_para_sample_1 <- sample(
  pull(filter(penguins_tidy, species == "Adelie"), bill_depth_mm),
  size = 10
)

non_para_sample_2 <- sample(
  pull(filter(penguins_tidy, species == "Gentoo"), bill_depth_mm),
  size = 10
)

# Mean bill depth of each sample (sample 1 = Adelie, sample 2 = Gentoo),
# shown for reference before running the test
mean(non_para_sample_1)
## [1] 18.72
mean(non_para_sample_2)
## [1] 15.25

Wilcoxon rank sum test (unpaired samples)

# The two samples are independent, so paired = FALSE is passed; the output
# below reports this as the Wilcoxon rank sum test rather than the signed
# rank test
wilcox.test(non_para_sample_1, non_para_sample_2, 
            paired = FALSE, 
            alternative = "two.sided")
## Warning in wilcox.test.default(non_para_sample_1, non_para_sample_2, paired =
## FALSE, : cannot compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  non_para_sample_1 and non_para_sample_2
## W = 98, p-value = 0.0003247
## alternative hypothesis: true location shift is not equal to 0
Chi Squared test of independence

http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r

# Location of the housetasks data set
file_path <- "http://www.sthda.com/sthda/RDoc/data/housetasks.txt"

# Read it with the first column as row names, then display the table
housetasks <- read.delim(file_path, row.names = 1)
housetasks
##            Wife Alternating Husband Jointly
## Laundry     156          14       2       4
## Main_meal   124          20       5       4
## Dinner       77          11       7      13
## Breakfeast   82          36      15       7
## Tidying      53          11       1      57
## Dishes       32          24       4      53
## Shopping     33          23       9      55
## Official     12          46      23      15
## Driving      10          51      75       3
## Finances     13          13      21      66
## Insurance     8           1      53      77
## Repairs       0           3     160       2
## Holidays      0           1       6     153
# Pearson chi-squared test on the housetasks table; the assignment is wrapped
# in parentheses so the stored result is also printed
(chisq <- chisq.test(housetasks))
## 
##  Pearson's Chi-squared test
## 
## data:  housetasks
## X-squared = 1944.5, df = 36, p-value < 0.00000000000000022