# ChiSq.r

# The analysis below only uses the built-in iris data, so it does not depend
# on the working directory. Switch to the author's practice folder only when
# it exists, so the script no longer aborts on machines without this path.
project_dir <- file.path(
  "D:",
  "Class Materials & Work",
  "Summer 2020 practice",
  "Chi Square Test of Independence"
)
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()

## [1] "D:/Class Materials & Work/Summer 2020 practice/Chi Square Test of Independence"

# Hypotheses
# Null: the variables are independent; there is no relationship between the two categorical variables. Knowing the value of one variable does not help to predict the value of the other variable.
# Alt: the variables are dependent; there is a relationship between the two categorical variables. Knowing the value of one variable helps to predict the value of the other variable.

# Start from the built-in iris measurements.
dat <- iris

# The Chi-square test of independence needs two categorical variables, but
# iris has only one (Species). Derive a second one, `size`: "small" when the
# sepal length is below the median sepal length of all flowers, "big"
# otherwise.
dat[["size"]] <- with(
  dat,
  ifelse(Sepal.Length < median(Sepal.Length), "small", "big")
)

# Contingency table of Species (rows) by size (columns). The result is stored
# under the name `table` because later steps of the script refer to it by
# that name; calls such as table(...) still reach the base function.
table <- table(dat[["Species"]], dat[["size"]])

# Attach ggplot2 with library(): if the package is missing the script stops
# right here with a clear error, whereas require() only returns FALSE and
# lets the failure surface later as "could not find function".
library(ggplot2)

# Bar chart of the number of flowers per species, with each bar filled
# according to the size class.
ggplot(dat) +
  aes(x = Species, fill = size) +
  geom_bar() +
  scale_fill_hue() +
  theme_minimal()

# Test whether there is a relationship between Species and size. Wrapping
# the assignment in parentheses stores the result in `test` and displays it
# in one step.
(test <- chisq.test(table))

## 
##  Pearson's Chi-squared test
## 
## data:  table
## X-squared = 86.035, df = 2, p-value < 2.2e-16

# The individual components of the result can also be extracted by name,
# e.g. the chi-squared test statistic and the p-value:
test[["statistic"]] # test statistic

## X-squared 
##  86.03451

test[["p.value"]] # p-value

## [1] 2.078944e-19

test[["method"]] # name of the test performed

## [1] "Pearson's Chi-squared test"

test[["parameter"]] # degrees of freedom (df)

## df 
##  2

test[["expected"]] # expected frequencies

##             
##                   big    small
##   setosa     25.66667 24.33333
##   versicolor 25.66667 24.33333
##   virginica  25.66667 24.33333

# A warning such as "Chi-squared approximation may be incorrect" means that
# the smallest expected frequencies are lower than 5. In that case, either:
#   - merge some levels (especially those with few observations) to increase
#     the number of observations in the subgroups, or
#   - use Fisher's exact test, which does not require a minimum of 5 expected
#     counts and is available in R as fisher.test().

fisher.test(x = table)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  table
## p-value < 2.2e-16
## alternative hypothesis: two.sided

# Source: https://towardsdatascience.com/chi-square-test-of-independence-in-r-c109947ca73a

# ChiSq.r

# tarid

# 2020-09-29