Be sure to put your code in chunks if you are using RMarkdown

Example chunk:

```{r example chunk, message=FALSE, warning=FALSE, echo=TRUE}
# YOUR AWESOME CODE GOES IN THIS SPACE
```

Load your libraries. Be sure to install your packages first!

You will get a lot of output; I've suppressed it here to prevent overcrowding

packages <- c(
  'psych', 'summarytools', 'Hmisc', 'dplyr', 'stats', 'rstatix', 
  'ggplot2', 'apaTables', 'effectsize', 'car', 'rcompanion',  
  'vcd', 'tukeytrend', 'lme4', 'lmtest', 'broom', 'modelr', 
  'tidyverse', 'lubridate', 'janitor', 'data.table', 'readr', 'knitr', 'mosaicData')

lapply(packages, library, character.only = TRUE)
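If some of these packages are not installed yet, here is a minimal sketch for installing only the missing ones before loading the libraries (the `missing` name is just illustrative; it reuses the `packages` vector defined above):

# Install any packages from the list above that are not already present
# (run once, before the lapply() call that loads the libraries)
missing <- packages[!packages %in% rownames(installed.packages())]
if (length(missing) > 0) install.packages(missing)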

It is time to calculate descriptive statistics. We will use the CPS85 dataset (from the mosaicData package) for this example

descriptives <- CPS85 %>%
  dplyr::select(wage, educ, exper, age)%>%
  psych::describe()


descriptives
##       vars   n  mean    sd median trimmed   mad min  max range  skew kurtosis
## wage     1 534  9.02  5.14   7.78    8.28  4.12   1 44.5  43.5  1.69     4.90
## educ     2 534 13.02  2.62  12.00   13.05  1.48   2 18.0  16.0 -0.20     0.81
## exper    3 534 17.82 12.38  15.00   16.75 11.86   0 55.0  55.0  0.68    -0.40
## age      4 534 36.83 11.73  35.00   36.07 11.86  18 64.0  46.0  0.55    -0.60
##         se
## wage  0.22
## educ  0.11
## exper 0.54
## age   0.51
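If you want to save these overall descriptives, the same write.csv() pattern used later in this guide works here too; a minimal sketch (the file name is just an example):

# Export the overall descriptives to a CSV file
write.csv(as.data.frame(descriptives), "descriptives_overall.csv")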

Now it is time to run descriptives by a categorical grouping variable

# Get descriptive statistics by sector
descriptives_sector <- psych::describeBy(CPS85$wage, group = CPS85$sector, mat = TRUE)

# Convert the matrix into a data frame
descriptives_sector<- as.data.frame(descriptives_sector)

write.csv(descriptives_sector, "descriptives_sector.csv")

descriptives_sector
##     item   group1 vars   n      mean       sd median   trimmed      mad  min
## X11    1 clerical    1  97  7.422577 2.699018  7.500  7.299873 3.335850 3.00
## X12    2    const    1  20  9.502000 3.343877  9.750  9.499375 3.647196 3.75
## X13    3    manag    1  55 12.704000 7.572513 10.620 11.966222 6.493788 1.00
## X14    4    manuf    1  68  8.036029 4.117607  6.750  7.511786 3.335850 3.00
## X15    5    other    1  68  8.500588 4.601049  6.940  7.869464 3.261720 2.85
## X16    6     prof    1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35
## X17    7    sales    1  38  7.592632 4.232272  5.725  7.157813 2.891070 3.35
## X18    8  service    1  83  6.537470 3.673278  5.500  5.937612 2.816940 1.75
##       max range        skew    kurtosis        se
## X11 15.03 12.03  0.39715109 -0.71992219 0.2740438
## X12 15.00 11.25 -0.02255889 -1.05839301 0.7477136
## X13 44.50 43.50  1.47308721  3.67335914 1.0210774
## X14 22.20 19.20  1.26924002  1.53036996 0.4993332
## X15 26.00 23.15  1.46528476  2.30867798 0.5579592
## X16 24.98 20.63  0.77278554 -0.36441619 0.5390709
## X17 19.98 16.63  0.97785906  0.01196841 0.6865651
## X18 25.00 23.25  2.04386695  6.36667089 0.4031946

We will generate histograms overall and then by category

# Overall wage distribution
ggplot(CPS85, aes(x = wage)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue", 
                 color = "black", alpha = 0.7) +  
  geom_density(color = "red", fill = "red", alpha = 0.3) +
  theme_minimal() +
  labs(title = "Overall Wage Distribution", x = "Wage", y = "Density")

# Plot wage by sector category
ggplot(CPS85, aes(x = wage)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue",
                 color = "black", alpha = 0.7) +  
  geom_density(color = "red", fill = "red", alpha = 0.3) +  # Add density curve
  facet_wrap(~ sector) +  # Facet by 'sector' column
  theme_minimal() +
  labs(title = "Wage Distribution by Sector", x = "Wage", y = "Density")

Calculating frequencies in R

library(dplyr)
library(tidyr)

# Single and two-way frequency tables
married_table<- table(CPS85$married)

count_table <- table(CPS85$married, CPS85$union)

married_prop<- prop.table(table(CPS85$married))

married_union_prop<- prop.table((table(CPS85$married, CPS85$union)))

# Combine counts and proportions into one table

df_count <- as.data.frame(count_table)

df_count <- df_count %>%
  rename(Married = Var1, Union = Var2, Count = Freq)

# Proportion of married vs. single within each union category
df_count <- df_count %>%
  group_by(Union) %>%
  mutate(Proportion = Count / sum(Count))

# Totals by marital status (note: Proportion in this row is the sum of the
# within-union proportions above, not a share of the whole sample)
df_total <- df_count %>%
  group_by(Married) %>%
  summarise(Count = sum(Count), Proportion = sum(Proportion)) %>%
  mutate(Union = "Total")

final_table <- bind_rows(df_count, df_total)

married_table
## 
## Married  Single 
##     350     184
count_table
##          
##           Not Union
##   Married 278    72
##   Single  160    24
married_prop
## 
##   Married    Single 
## 0.6554307 0.3445693
married_union_prop
##          
##                  Not      Union
##   Married 0.52059925 0.13483146
##   Single  0.29962547 0.04494382
df_count
## # A tibble: 4 × 4
## # Groups:   Union [2]
##   Married Union Count Proportion
##   <fct>   <fct> <int>      <dbl>
## 1 Married Not     278      0.635
## 2 Single  Not     160      0.365
## 3 Married Union    72      0.75 
## 4 Single  Union    24      0.25
df_total
## # A tibble: 2 × 4
##   Married Count Proportion Union
##   <fct>   <int>      <dbl> <chr>
## 1 Married   350      1.38  Total
## 2 Single    184      0.615 Total
final_table
## # A tibble: 6 × 4
## # Groups:   Union [3]
##   Married Union Count Proportion
##   <fct>   <chr> <int>      <dbl>
## 1 Married Not     278      0.635
## 2 Single  Not     160      0.365
## 3 Married Union    72      0.75 
## 4 Single  Union    24      0.25 
## 5 Married Total   350      1.38 
## 6 Single  Total   184      0.615
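As a cross-check, janitor (already loaded at the top) can produce the same counts with column percentages and a totals row in one pipeline; a minimal sketch:

# Alternative: counts, column percentages, and a totals row via janitor
CPS85 %>%
  janitor::tabyl(married, union) %>%
  janitor::adorn_totals("row") %>%
  janitor::adorn_percentages("col") %>%
  janitor::adorn_pct_formatting(digits = 1) %>%
  janitor::adorn_ns()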

Next is a plot of frequencies and proportions for each categorical variable

# Calculate frequency and proportion
frequency_table <- CPS85 %>%
  dplyr::select(race, sex, hispanic, south, married, union, sector) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
  group_by(Variable, Category) %>%
  summarise(Frequency = n(), .groups = "drop") %>%
  group_by(Variable) %>%
  mutate(Proportion = Frequency / sum(Frequency)) %>%
  arrange(Variable, desc(Frequency))

# Create the plot
ggplot(frequency_table, aes(x = Category, y = Frequency, fill = Category)) +
  geom_bar(stat = "identity", show.legend = FALSE) + 
  geom_text(
    aes(label = scales::percent(Proportion, accuracy = 0.1)),
    position = position_stack(vjust = 0.5),
    size = 3, color = "black"
  ) +
  facet_wrap(~Variable, scales = "free", ncol = 2) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 10, face = "bold")
  ) +
  labs(
    title = "Frequency and Proportion of Categories by Variable",
    x = "Category",
    y = "Frequency"
  )

write.csv(frequency_table, "frequencytable.csv")

Chi-square test

Be sure to install the vcd package first by running install.packages('vcd') in your console

library(summarytools)
library(dplyr)
library(ggplot2)
library(vcd)

# Select relevant columns from the dataset
chidata <- CPS85 %>%
  dplyr::select(race, union)

# Generate contingency table
chitable <- table(chidata$race, chidata$union)

# Run chi-square test
chi_test <- chisq.test(chitable)

# Convert the expected counts matrix to a data frame
expected <- as.data.frame(chi_test$expected)

# Extract row names and make them a column
expected$race <- rownames(expected)

# Move 'race' column to the front (if desired)
expected <- expected %>%
  dplyr::select(race, everything())

# Add 'type' column to indicate that these are expected counts
expected <- expected %>%
  mutate(type = "Expected")

# Convert the contingency table to a data frame
chitable_df <- as.data.frame(chitable)%>%
  rename(race = Var1, union = Var2)

observed <- chitable_df %>%
  dplyr::select(race, union, Freq) %>%
  pivot_wider(names_from = union, values_from = Freq)%>%
  mutate(type = "Observed")

overalltable<- rbind(observed, expected)



print("contengency table")
## [1] "contengency table"
chitable
##     
##      Not Union
##   NW  49    18
##   W  389    78
print("Expected counts")
## [1] "Expected counts"
chi_test$expected
##     
##            Not    Union
##   NW  54.95506 12.04494
##   W  383.04494 83.95506
overalltable
## # A tibble: 4 × 4
##   race    Not Union type    
## * <fct> <dbl> <dbl> <chr>   
## 1 NW     49    18   Observed
## 2 W     389    78   Observed
## 3 NW     55.0  12.0 Expected
## 4 W     383.   84.0 Expected
print("Chi-square test results:")
## [1] "Chi-square test results:"
print(chi_test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chitable
## X-squared = 3.4442, df = 1, p-value = 0.06348
# Calculate Cramér's V or Phi (for 2x2)
print("Effect size metrics, this uses phi because it's 2x2")
## [1] "Effect size metrics, this uses phi because it's 2x2"
assocstats(chitable)
##                     X^2 df P(> X^2)
## Likelihood Ratio 3.7469  1 0.052906
## Pearson          4.1045  1 0.042770
## 
## Phi-Coefficient   : 0.088 
## Contingency Coeff.: 0.087 
## Cramer's V        : 0.088
write.csv(overalltable, "chi.csv")

Generate a plot of the counts and proportions by category

# Compute the dataset for the figure
chitable_plot <- as.data.frame(chitable) %>%
  rename(Race = Var1, Union = Var2)%>%
  group_by(Race) %>%
  mutate(Proportion = Freq / sum(Freq)) 

# Plot with counts below the bars and proportions above
ggplot(chitable_plot, aes(x = Race, y = Freq, fill = Union)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = scales::percent(Proportion)), 
            position = position_dodge(width = 0.8), 
            vjust = -0.5, size = 3) +
  geom_text(aes(label = Freq), 
            position = position_dodge(width = 0.8), 
            vjust = 1.5, size = 3, color = "white") +
  labs(title = "Contingency Table: Race vs Union",
       x = "Race",
       y = "Count") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()

Independent samples t-test

library(tidyverse)
library(psych)
library(rstatix)
library(car)  # For Levene’s Test

# Make sure marital status is a factor
CPS85$married <- as.factor(CPS85$married)

# Descriptive statistics by marital status
descriptives_married <- psych::describeBy(CPS85$age, group = CPS85$married, mat = TRUE) %>%
  as.data.frame()

write.csv(descriptives_married, "descriptives_married.csv")

# Normality tests (Shapiro-Wilk)
shapiro.test(CPS85$age[CPS85$married == "Married"])
## 
##  Shapiro-Wilk normality test
## 
## data:  CPS85$age[CPS85$married == "Married"]
## W = 0.96039, p-value = 3.999e-08
shapiro.test(CPS85$age[CPS85$married == "Single"])
## 
##  Shapiro-Wilk normality test
## 
## data:  CPS85$age[CPS85$married == "Single"]
## W = 0.89074, p-value = 2.348e-10
# Histogram with density plot
ggplot(CPS85, aes(x = age, fill = married)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5, position = "identity") +
  geom_density(alpha = 0.7) +
  facet_wrap(~married) +
  theme_minimal() +
  labs(title = "Age Distribution by Marital Status", x = "Age", y = "Density")

# Levene's test for equality of variances
leveneTest(age ~ married, data = CPS85)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  0.1425  0.706
##       532
# Independent samples t-test (Equal variances assumed)
t_test_result <- t.test(age ~ married, data = CPS85, var.equal = TRUE)
print(t_test_result)
## 
##  Two Sample t-test
## 
## data:  age by married
## t = 6.6999, df = 532, p-value = 5.323e-11
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
##  4.860477 8.893063
## sample estimates:
## mean in group Married  mean in group Single 
##              39.20286              32.32609
# Welch's t-test (Unequal variances)
t_test_result_var <- t.test(age ~ married, data = CPS85, var.equal = FALSE)
print(t_test_result_var)
## 
##  Welch Two Sample t-test
## 
## data:  age by married
## t = 6.5934, df = 355.79, p-value = 1.555e-10
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
##  4.825607 8.927933
## sample estimates:
## mean in group Married  mean in group Single 
##              39.20286              32.32609
# Cohen's d for effect size (parametric)
cohen_d_result <- rstatix::cohens_d(CPS85, age ~ married)
print(cohen_d_result)
## # A tibble: 1 × 7
##   .y.   group1  group2 effsize    n1    n2 magnitude
## * <chr> <chr>   <chr>    <dbl> <int> <int> <ord>    
## 1 age   Married Single   0.605   350   184 moderate
# Mann-Whitney U test (Wilcoxon Rank-Sum test for independent samples)
mann_whitney_result <- wilcox.test(age ~ married, data = CPS85, exact = FALSE)
print(mann_whitney_result)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by married
## W = 44398, p-value = 5.971e-13
## alternative hypothesis: true location shift is not equal to 0
# Compute Mann-Whitney U statistic manually (SPSS-style)
compute_U <- function(group1, group2) {
  combined_data <- c(group1, group2)
  ranks <- rank(combined_data)
  
  rank_sum1 <- sum(ranks[1:length(group1)])
  rank_sum2 <- sum(ranks[(length(group1)+1):length(combined_data)])
  
  U1 <- rank_sum1 - length(group1)*(length(group1) + 1)/2
  U2 <- rank_sum2 - length(group2)*(length(group2) + 1)/2
  
  return(min(U1, U2))  # Smaller U value matches SPSS
}

# Define groups correctly for Mann-Whitney U
group1 <- CPS85$age[CPS85$married == "Married"]
group2 <- CPS85$age[CPS85$married == "Single"]

# Compute U statistic manually
U_value <- compute_U(group1, group2)
print(paste("Mann-Whitney U (Custom Calculation):", U_value))
## [1] "Mann-Whitney U (Custom Calculation): 20002.5"
# Compute effect size (r = Z / sqrt(n))
# Get sample sizes
n1 <- length(group1)
n2 <- length(group2)
n <- n1 + n2

# Approximate Z from the Mann-Whitney U test
Z <- qnorm(1 - mann_whitney_result$p.value / 2)  # Two-tailed assumption

# Compute effect size
effect_size_r <- abs(Z) / sqrt(n)
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.312"

Dependent Samples t-test in R

Here we move to the 'datasets' package, so we load it first

The datasets package ships with base R, so there is nothing extra to install

library(datasets)
library(psych)
library(effectsize)
library(tidyverse)

# This is the built-in data set.
# It is not necessary to rename it, but I do it to emphasize its original form
chick<- ChickWeight

# Here we filter to days 4 and 21 and pivot the data to wide format
chickdata <- chick%>%
  filter(Time %in% c(4, 21)) %>%  # Select only two time points for comparison
  pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day_") %>%
  drop_na()%>%
  mutate(Difference = Day_21 - Day_4)

# Compute descriptive statistics
desc_day_4 <- psych::describe(chickdata$Day_4)
desc_day_21 <- psych::describe(chickdata$Day_21)
desc_difference <- psych::describe(chickdata$Difference)

as.data.frame(desc_day_4)
##    vars  n     mean       sd median  trimmed    mad min max range       skew
## X1    1 45 60.15556 4.279738     61 60.32432 4.4478  48  69    21 -0.4278585
##      kurtosis        se
## X1 0.06485024 0.6379857
as.data.frame(desc_day_21)
##    vars  n     mean       sd median  trimmed     mad min max range       skew
## X1    1 45 218.6889 71.51027    205 218.6216 75.6126  74 373   299 0.08910638
##      kurtosis       se
## X1 -0.7680967 10.66012
as.data.frame(desc_difference)
##    vars  n     mean       sd median  trimmed     mad min max range    skew
## X1    1 45 158.5333 69.83865    149 158.2973 71.1648  16 309   293 0.10106
##      kurtosis       se
## X1 -0.7835153 10.41093
desc_day_4$Variable <- "Day_4"
desc_day_21$Variable <- "Day_21"
desc_difference$Variable <- "Difference"

# Combine all into one dataset
desc_combined <- rbind(desc_day_4, desc_day_21, desc_difference)%>%
  dplyr::select(Variable, mean, median, sd, skew, kurtosis)



write.csv(desc_combined, "dependent_desc.csv")

# We now want to check the assumption of normality
# Plot a histogram of the Difference variable
hist(chickdata$Difference, main = "Histogram of Difference", 
     xlab = "Difference (Day_21 - Day_4)", col = "lightblue", 
     border = "black")

# Q-Q plot to check for normality
qqnorm(chickdata$Difference)
qqline(chickdata$Difference, col = "red")

# Perform Shapiro-Wilk test for normality on the Difference variable
shapiro_test <- shapiro.test(chickdata$Difference)
shapiro_test
## 
##  Shapiro-Wilk normality test
## 
## data:  chickdata$Difference
## W = 0.98718, p-value = 0.8939
# Run the t-test
paired_result <- t.test(chickdata$Day_4, chickdata$Day_21, paired = TRUE)
paired_result
## 
##  Paired t-test
## 
## data:  chickdata$Day_4 and chickdata$Day_21
## t = -15.228, df = 44, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -179.5152 -137.5515
## sample estimates:
## mean difference 
##       -158.5333
# Calculate Cohen's d
mean_dif<- mean(chickdata$Difference)
sd_dif<- sd(chickdata$Difference)
cohen_d<- mean_dif/sd_dif
cohen_d
## [1] 2.269994
# Use the effsize package
chickdata_long <- chickdata %>%
  dplyr::select(Day_4, Day_21)%>%
  pivot_longer(cols = c(Day_4, Day_21), 
               names_to = "Day", 
               values_to = "Weight")

chickdata_long$Day<- as.factor(chickdata_long$Day)

# Calculate Cohen's d for paired samples
cohen_d_result <- cohens_d(chickdata_long$Weight, chickdata_long$Day, paired = TRUE)

# Print the result
cohen_d_result
## Cohen's d |       95% CI
## ------------------------
## 2.27      | [1.71, 2.82]

Non-parametric alternative: Wilcoxon Signed-Rank test

# Load necessary libraries
library(tidyr)
library(rstatix)
library(dplyr)

# Filter for Day 4 and Day 21
chick_subset <- ChickWeight[ChickWeight$Time %in% c(4, 21), ]

# Reshape data to wide format
chick_wide <- chick_subset %>%
  pivot_wider(names_from = Time, values_from = weight) %>%
  na.omit() %>%
  dplyr::select('4', '21')

# Rename columns for clarity
colnames(chick_wide) <- c("Four", "TwentyOne")

# Perform Wilcoxon Signed-Rank Test
test_result <- wilcox.test(chick_wide$Four, chick_wide$TwentyOne, paired = TRUE)

# Print the Wilcoxon Signed-Rank Test result
print(test_result)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  chick_wide$Four and chick_wide$TwentyOne
## V = 0, p-value = 5.355e-09
## alternative hypothesis: true location shift is not equal to 0
# Extract necessary values
Z <- qnorm(test_result$p.value / 2)  # Get Z-score (divide p-value by 2 for two-tailed test)
n <- nrow(chick_wide)  # Number of pairs

# Calculate effect size r
effect_size_r <- abs(Z) / sqrt(n)

# Print effect size
print(paste("Z:", round(Z, 3)))
## [1] "Z: -5.836"
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.87"

Descriptives for the one-way ANOVA

# Prepare the dataset
anovadata<- CPS85%>%
  filter(sector %in% c("manag", "prof", "sales"))%>%
  dplyr::select(wage, sector)

anovadata$sector<- as.character(anovadata$sector)

# Get descriptive statistics by profession
descriptives_profession <- psych::describeBy(anovadata$wage, group = anovadata$sector, mat = TRUE)%>%
  dplyr::select(-item)
overall_desc_profession<- psych::describe(anovadata$wage)

# Convert the matrix into a data frame
descriptives_profession<- as.data.frame(descriptives_profession)
overall_desc_profession<- as.data.frame(overall_desc_profession)%>%
  mutate(group1 = "Total")%>%
  relocate(group1, .before = everything())
  
descriptives_profession<- rbind(descriptives_profession, overall_desc_profession)

write.csv(descriptives_profession, "descriptives_profession.csv")

descriptives_profession
##     group1 vars   n      mean       sd median   trimmed      mad  min   max
## X11  manag    1  55 12.704000 7.572513 10.620 11.966222 6.493788 1.00 44.50
## X12   prof    1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35 24.98
## X13  sales    1  38  7.592632 4.232272  5.725  7.157813 2.891070 3.35 19.98
## X1   Total    1 198 11.321818 6.214124 10.000 10.656500 5.774727 1.00 44.50
##     range      skew    kurtosis        se
## X11 43.50 1.4730872  3.67335914 1.0210774
## X12 20.63 0.7727855 -0.36441619 0.5390709
## X13 16.63 0.9778591  0.01196841 0.6865651
## X1  43.50 1.2892669  3.07554682 0.4416185

# Check statistical assumptions for ANOVA

# Shapiro-Wilk test for normality
aov_model <- aov(wage ~ sector, data = anovadata)
shapiro.test(residuals(aov_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(aov_model)
## W = 0.91874, p-value = 5.393e-09
# QQ Plot for normality
qqnorm(residuals(aov_model))
qqline(residuals(aov_model), col = "red")

# Levene's test for homogeneity of variance
library(car)
leveneTest(wage ~ sector, data = anovadata)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)  
## group   2  3.2421 0.0412 *
##       195                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bartlett's test of homogeneity of variance
bartlett.test(wage ~ sector, data = anovadata)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  wage by sector
## Bartlett's K-squared = 15.317, df = 2, p-value = 0.000472

Run one-way ANOVA

# Be sure to load the library
library(apaTables)
library(dplyr)
library(tidyverse)
library(rstatix)

# Run the ANOVA
anova_results <- aov(wage ~ sector, data = anovadata)

apa.aov.table(anova_results, filename = "anova_results.doc")
## 
## 
## ANOVA results using wage as the dependent variable
##  
## 
##    Predictor      SS  df      MS      F    p partial_eta2 CI_90_partial_eta2
##  (Intercept) 8876.54   1 8876.54 249.68 .000                                
##       sector  674.63   2  337.31   9.49 .000          .09         [.03, .15]
##        Error 6932.59 195   35.55                                            
## 
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
summary(anova_results)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## sector        2    675   337.3   9.488 0.000117 ***
## Residuals   195   6933    35.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Perform Tukey's HSD test if variances are equal
TukeyHSD(anova_results)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = wage ~ sector, data = anovadata)
## 
## $sector
##                   diff       lwr       upr     p adj
## prof-manag  -0.7565714 -3.100521  1.587379 0.7265306
## sales-manag -5.1113684 -8.081890 -2.140847 0.0002059
## sales-prof  -4.3547970 -7.020710 -1.688884 0.0004545
# Perform Games-Howell if variances are not equal
anovadata%>%
games_howell_test(wage ~ sector)
## # A tibble: 3 × 8
##   .y.   group1 group2 estimate conf.low conf.high      p.adj p.adj.signif
## * <chr> <chr>  <chr>     <dbl>    <dbl>     <dbl>      <dbl> <chr>       
## 1 wage  manag  prof     -0.757    -3.51      2.00 0.79       ns          
## 2 wage  manag  sales    -5.11     -8.04     -2.18 0.000222   ***         
## 3 wage  prof   sales    -4.35     -6.44     -2.27 0.00000942 ****
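Since Levene's and Bartlett's tests above flagged unequal variances, a Welch one-way ANOVA is the matching omnibus test for the Games-Howell post-hoc; a minimal sketch using base R's oneway.test():

# Welch's one-way ANOVA (does not assume equal variances)
oneway.test(wage ~ sector, data = anovadata, var.equal = FALSE)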
# Subset your data for Cohen's d
cohen_data_sales_manag<- anovadata%>%
  filter(sector %in% c("sales", "manag"))

cohen_data_sales_prof<- anovadata%>%
  filter(sector %in% c("sales", "prof"))

# Run Cohen's d on the subsetted data
rstatix::cohens_d(cohen_data_sales_manag, wage ~ sector)
## # A tibble: 1 × 7
##   .y.   group1 group2 effsize    n1    n2 magnitude
## * <chr> <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 wage  manag  sales    0.833    55    38 large
rstatix::cohens_d(cohen_data_sales_prof, wage ~ sector)
## # A tibble: 1 × 7
##   .y.   group1 group2 effsize    n1    n2 magnitude
## * <chr> <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 wage  prof   sales    0.885   105    38 large

Kruskal-Wallis non-parametric test, for when ANOVA assumptions are violated

For the post-hoc test, run install.packages('dunn.test') in your console

# Load the library
library(dunn.test)

# Run the omnibus KW test
kwtest<-kruskal.test(wage ~ sector, data = anovadata)

print(kwtest)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  wage by sector
## Kruskal-Wallis chi-squared = 22.533, df = 2, p-value = 1.28e-05
# Create ranks and summarize by sector
sector_ranks <- anovadata %>%
  mutate(Rank = rank(wage)) %>%
  group_by(sector) %>%
  summarise(N = n(), Mean_Rank = mean(Rank), Sum_Rank = sum(Rank))

# Print the ranks
print(sector_ranks)
## # A tibble: 3 × 4
##   sector     N Mean_Rank Sum_Rank
##   <chr>  <int>     <dbl>    <dbl>
## 1 manag     55     110.     6032.
## 2 prof     105     109.    11395 
## 3 sales     38      59.9    2274.
# Run post-hoc dunn test
dt<- dunn.test(anovadata$wage, anovadata$sector, kw = TRUE, label = TRUE)
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 22.5328, df = 2, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                 (No adjustment)                                
## Col Mean-|
## Row Mean |      manag       prof
## ---------+----------------------
##     prof |   0.119529
##          |     0.4524
##          |
##    sales |   4.121490   4.487320
##          |    0.0000*    0.0000*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2
dt
## $chi2
## [1] 22.53284
## 
## $Z
## [1] 0.1195295 4.1214907 4.4873207
## 
## $P
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
## 
## $P.adjusted
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
## 
## $comparisons
## [1] "manag - prof"  "manag - sales" "prof - sales"
# Run eta-squared for omnibus effect size
H <- kwtest$statistic
k <- length(unique(anovadata$sector))
N <- nrow(anovadata)

eta_squared <- (H - k + 1) / (N - k)

# Rename the output explicitly
names(eta_squared) <- "eta_squared"

# Print the result
eta_squared
## eta_squared 
##   0.1052966
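rcompanion (loaded at the top but not otherwise used here) offers a packaged rank-based effect size for the Kruskal-Wallis test if you prefer not to compute it by hand; a minimal sketch (epsilon-squared is a closely related statistic, so expect a value in the same neighborhood as the eta-squared above):

# Packaged alternative: epsilon-squared for the Kruskal-Wallis test
rcompanion::epsilonSquared(x = anovadata$wage, g = as.factor(anovadata$sector))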
# Effect size r (r = Z / sqrt(N)) for the pairwise comparisons
# Z-values from Dunn's test results
z_manag_sales <- 4.1215
z_prof_sales <- 4.4873

# Calculate observations
n <- nrow(anovadata)  # Total number of observations

# Effect size r formula
r_manag_sales <- z_manag_sales / sqrt(n)
r_prof_sales <- z_prof_sales / sqrt(n)

# Print effect sizes
r_manag_sales
## [1] 0.2929023
r_prof_sales
## [1] 0.3188985