library(tidyverse)
library(openintro)
library(palmerpenguins)
library(janitor)
library(interpretCI)
library(BSDA)
library(pwr)
# Print numbers in fixed notation rather than scientific notation
options(scipen = 999)
# Fix the RNG seed so the random samples drawn below are reproducible
set.seed(1)
# Drop every row that has a missing value in any column, then display the result
penguins_tidy <- drop_na(penguins)
penguins_tidy
## # A tibble: 333 x 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # i 323 more rows
## # i 2 more variables: sex <fct>, year <int>
# Mean bill length for each island
summarise(
  group_by(penguins_tidy, island),
  mean_bill_length = mean(bill_length_mm)
)
## # A tibble: 3 x 2
## island mean_bill_length
## <fct> <dbl>
## 1 Biscoe 45.2
## 2 Dream 44.2
## 3 Torgersen 39.0
https://www.analyticsvidhya.com/blog/2020/06/statistics-analytics-hypothesis-testing-z-test-t-test/
#hist(penguins_tidy$bill_length_mm)
#ggplot(data.frame(penguins_tidy$bill_length_mm), aes(sample = penguins_tidy$bill_length_mm)) +
#stat_qq() +
#stat_qq_line()
# Small random sample (n = 15) of bill lengths, so use a one-sample t-test
# against the hypothesised mean of 44.2
t.test(
  sample(penguins_tidy$bill_length_mm, size = 15),
  mu = 44.2,
  alternative = "two.sided"
)
##
## One Sample t-test
##
## data: sample(penguins_tidy$bill_length_mm, size = 15)
## t = 0.88371, df = 14, p-value = 0.3918
## alternative hypothesis: true mean is not equal to 44.2
## 95 percent confidence interval:
## 42.2307 48.9293
## sample estimates:
## mean of x
## 45.58
# Larger random sample (n = 50) of bill lengths, so run a one-sample z-test,
# using the SD of all bill lengths in penguins_tidy as the population SD
z.test(
  sample(penguins_tidy$bill_length_mm, size = 50),
  mu = 44.2,
  sigma.x = sd(penguins_tidy$bill_length_mm),
  alternative = "two.sided"
)
##
## One-sample z-Test
##
## data: sample(penguins_tidy$bill_length_mm, size = 50)
## z = -0.60772, p-value = 0.5434
## alternative hypothesis: true mean is not equal to 44.2
## 95 percent confidence interval:
## 42.21419 45.24581
## sample estimates:
## mean of x
## 43.73
# Draw 30 bill lengths at random from each of the two islands being compared
biscoe_sample <- sample(
  pull(filter(penguins_tidy, island == "Biscoe"), bill_length_mm),
  size = 30
)
dream_sample <- sample(
  pull(filter(penguins_tidy, island == "Dream"), bill_length_mm),
  size = 30
)
# Two-sample t-test: is the difference between the island means non-zero?
t.test(
  biscoe_sample,
  dream_sample,
  mu = 0,
  alternative = "two.sided"
)
##
## Welch Two Sample t-test
##
## data: biscoe_sample and dream_sample
## t = 0.3231, df = 56.794, p-value = 0.7478
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.252496 3.119163
## sample estimates:
## mean of x mean of y
## 45.07667 44.64333
# The same island comparison run as a two-sample z-test.
# NOTE(review): both sigmas are the SD of all bill lengths in penguins_tidy,
# not each island's own SD -- confirm this is intended.
z.test(
  biscoe_sample,
  dream_sample,
  mu = 0,
  sigma.x = sd(penguins_tidy$bill_length_mm),
  sigma.y = sd(penguins_tidy$bill_length_mm)
)
##
## Two-sample z-Test
##
## data: biscoe_sample and dream_sample
## z = 0.30689, p-value = 0.7589
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.334144 3.200811
## sample estimates:
## mean of x mean of y
## 45.07667 44.64333
# Raw difference between the two sample means
difference <- mean(biscoe_sample) - mean(dream_sample)
# pwr.t.test() expects d to be Cohen's d (a standardised effect size), not a
# raw difference, so divide by the pooled standard deviation of the samples.
n_biscoe <- length(biscoe_sample)
n_dream <- length(dream_sample)
pooled_sd <- sqrt(
  ((n_biscoe - 1) * var(biscoe_sample) + (n_dream - 1) * var(dream_sample)) /
    (n_biscoe + n_dream - 2)
)
cohens_d <- difference / pooled_sd
# Solve for the per-group sample size (n = NULL) giving 80% power at the 5%
# significance level. Any recorded output that follows this call was produced
# with the raw difference as d and must be regenerated.
pwr.t.test(
  n = NULL,
  d = cohens_d,
  sig.level = 0.05,
  power = 0.8,
  type = "two.sample",
  alternative = "two.sided"
)
##
## Two-sample t test power calculation
##
## n = 84.56849
## d = 0.4333333
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Two-sided test of equal proportions: 490 of 500 versus 400 of 500 successes
prop.test(
  n = c(500, 500),
  x = c(490, 400),
  alternative = "two.sided"
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(490, 400) out of c(500, 500)
## X-squared = 80.909, df = 1, p-value < 0.00000000000000022
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.1408536 0.2191464
## sample estimates:
## prop 1 prop 2
## 0.98 0.80
# Small random samples (n = 10) of bill depth from two species
non_para_sample_1 <- penguins_tidy %>%
  filter(species == "Adelie") %>%
  pull(bill_depth_mm) %>%
  sample(size = 10)
non_para_sample_2 <- penguins_tidy %>%
  filter(species == "Gentoo") %>%
  pull(bill_depth_mm) %>%
  sample(size = 10)
# Mean of each sample
mean(non_para_sample_1)
## [1] 18.72
mean(non_para_sample_2)
## [1] 15.25
Wilcoxon rank sum test
# Two-sided Wilcoxon test on the two samples, treated as unpaired
# (paired = FALSE)
wilcox.test(non_para_sample_1, non_para_sample_2,
paired = FALSE,
alternative = "two.sided")
## Warning in wilcox.test.default(non_para_sample_1, non_para_sample_2, paired =
## FALSE, : cannot compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: non_para_sample_1 and non_para_sample_2
## W = 98, p-value = 0.0003247
## alternative hypothesis: true location shift is not equal to 0
The chi-square test of independence is used to analyze the frequency table (i.e. contingency table) formed by two categorical variables.
The chi-square test evaluates whether there is a significant association between the categories of the two variables.
Specifically, in the course the question was whether or not the expected values came from a particular model, in particular discrete distributions such as the binomial and the Poisson.
http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r
# Read the tab-delimited housetasks table from the web, taking row names from
# the first column, then display it
file_path <- "http://www.sthda.com/sthda/RDoc/data/housetasks.txt"
housetasks <- read.delim(file = file_path, row.names = 1)
housetasks
## Wife Alternating Husband Jointly
## Laundry 156 14 2 4
## Main_meal 124 20 5 4
## Dinner 77 11 7 13
## Breakfeast 82 36 15 7
## Tidying 53 11 1 57
## Dishes 32 24 4 53
## Shopping 33 23 9 55
## Official 12 46 23 15
## Driving 10 51 75 3
## Finances 13 13 21 66
## Insurance 8 1 53 77
## Repairs 0 3 160 2
## Holidays 0 1 6 153
# Chi-squared test on the housetasks table; store the result and print it
(chisq <- chisq.test(x = housetasks))
##
## Pearson's Chi-squared test
##
## data: housetasks
## X-squared = 1944.5, df = 36, p-value < 0.00000000000000022