library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.5 v purrr 0.3.4
v tibble 3.1.6 v dplyr 1.0.8
v tidyr 1.2.0 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.1
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
# Raw observations for the ten participants, one vector per variable.
# Each vector is declared exactly once so the data cannot drift out of sync
# between the free-standing vectors and the tibble built from them.
name <- c(
  "Chris", "Knut", "Edgar", "Tunde", "Tayo",
  "Sade", "Gbemi", "Ify", "Kunmi", "John"
)
sex <- c("M", "M", "M", "M", "F", "F", "F", "F", "F", "M")
age <- c(23, 24, 21, 25, 33, 31, 34, 20, 31, 22)
height <- c(180, 170, 188, 185, 160, 168, 172, 162, 165, 200)
weight <- c(80, 60, 82, 72, 82, 82, 81, 84, 83, 75)
education <- c(3, 2, 2, 3, 3, 3, 3, 2, 1, 1)

# Assemble the vectors into a tibble; `education` is stored as column `educ`.
dt <- tibble(
  name = name,
  sex = sex,
  age = age,
  height = height,
  weight = weight,
  educ = education
)
# Print the tibble for inspection.
dt
# A tibble: 10 x 6
name sex age height weight educ
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Chris M 23 180 80 3
2 Knut M 24 170 60 2
3 Edgar M 21 188 82 2
4 Tunde M 25 185 72 3
5 Tayo F 33 160 82 3
6 Sade F 31 168 82 3
7 Gbemi F 34 172 81 3
8 Ify F 20 162 84 2
9 Kunmi F 31 165 83 1
10 John M 22 200 75 1
# Save the tibble as built so far to "data.csv" in the working directory.
write_csv(x = dt, file = "data.csv")
# Bin ages into five-year bands and store them as a factor column.
dt <- mutate(
  dt,
  # cut() returns a factor whose levels are the supplied labels, in order.
  # With right-closed intervals and the lowest break included, the bands are
  # [20, 24], (24, 29] and (29, 34]; any age outside them becomes NA.
  age_group = cut(
    age,
    breaks = c(20, 24, 29, 34),
    labels = c("20-24", "25-29", "30-34"),
    right = TRUE,
    include.lowest = TRUE
  )
)
# Add body-mass index: weight divided by the square of (height / 100).
# Assumes weight is in kilograms and height in centimetres -- TODO confirm.
dt <- mutate(dt, bmi = weight / (height / 100)^2)
# Save the augmented data (now including the bmi column) to a second file.
write_csv(dt, "data2.csv")
The p-value is greater than 0.05, so we fail to reject the null hypothesis that sex and education are independent. However, the sample size is just 10, and it was written in some text that “Chi-squared test is invalid if we have fewer than 5 observations in a cell”. The data used does not meet this criterion. The contingency table below is from the data used:
Performing the chi-squared test in R gives the same result:
# Chi-squared test of independence between sex and education level.
# Both factors take their levels in order of first appearance in the data,
# which fixes the row/column order of the contingency table.

# Education levels (reuses the `education` vector instead of re-declaring it)
ed <- factor(education, levels = unique(education))
# Sex (reuses the `sex` vector instead of re-declaring it)
sx <- factor(sex, levels = unique(sex))
# Paired observations; named `sex_educ` so it does not mask base::table()
sex_educ <- tibble(sx, ed)
# Contingency table: rows are sex, columns are education level
contingency_table <- table(sex_educ$sx, sex_educ$ed)
# Analysis. `correct = TRUE` requests the continuity correction, which
# chisq.test() applies only to 2 x 2 tables; with two sexes and three
# education levels it has no effect on the statistic here.
chisq.test(contingency_table, correct = TRUE)
Warning in chisq.test(contingency_table, correct = TRUE): Chi-squared
approximation may be incorrect
Pearson's Chi-squared test
data: contingency_table
X-squared = 0.53333, df = 2, p-value = 0.7659