Be sure to put your code in chunks if you are using RMarkdown

Example chunk:

```{r example chunk, message=FALSE, warning=FALSE, echo=TRUE}
# YOUR AWESOME CODE GOES IN THIS SPACE
```

Load your libraries. Be sure to install your packages first!

You will get a lot of output; I've suppressed it here to prevent overcrowding

packages <- c(
  'psych', 'summarytools', 'Hmisc', 'dplyr', 'stats', 'rstatix', 
  'ggplot2', 'apaTables', 'effectsize', 'car', 'rcompanion',  
  'vcd', 'tukeytrend', 'lme4', 'lmtest', 'broom', 'modelr', 
  'tidyverse', 'lubridate', 'janitor', 'data.table', 'readr', 'knitr', 'mosaicData')

lapply(packages, library, character.only = TRUE)
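If some of these packages are not installed yet, here is a minimal sketch for installing only the missing ones before loading the libraries (the `missing` name is just illustrative; it reuses the `packages` vector defined above):

# Install any packages from the list above that are not already present
# (run once, before the lapply() call that loads the libraries)
missing <- packages[!packages %in% rownames(installed.packages())]
if (length(missing) > 0) install.packages(missing)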

It is time to calculate descriptive statistics. We will use the CPS85 dataset (from the mosaicData package) for this example

descriptives <- CPS85 %>%
  dplyr::select(wage, educ, exper, age)%>%
  psych::describe()


descriptives
##       vars   n  mean    sd median trimmed   mad min  max range  skew kurtosis
## wage     1 534  9.02  5.14   7.78    8.28  4.12   1 44.5  43.5  1.69     4.90
## educ     2 534 13.02  2.62  12.00   13.05  1.48   2 18.0  16.0 -0.20     0.81
## exper    3 534 17.82 12.38  15.00   16.75 11.86   0 55.0  55.0  0.68    -0.40
## age      4 534 36.83 11.73  35.00   36.07 11.86  18 64.0  46.0  0.55    -0.60
##         se
## wage  0.22
## educ  0.11
## exper 0.54
## age   0.51
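If you want to save these overall descriptives, the same write.csv() pattern used later in this guide works here too; a minimal sketch (the file name is just an example):

# Export the overall descriptives to a CSV file
write.csv(as.data.frame(descriptives), "descriptives_overall.csv")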

Now it is time to run descriptives by a categorical grouping variable

# Get descriptive statistics by sector
descriptives_sector <- psych::describeBy(CPS85$wage, group = CPS85$sector, mat = TRUE)

# Convert the matrix into a data frame
descriptives_sector<- as.data.frame(descriptives_sector)

write.csv(descriptives_sector, "descriptives_sector.csv")

descriptives_sector
##     item   group1 vars   n      mean       sd median   trimmed      mad  min
## X11    1 clerical    1  97  7.422577 2.699018  7.500  7.299873 3.335850 3.00
## X12    2    const    1  20  9.502000 3.343877  9.750  9.499375 3.647196 3.75
## X13    3    manag    1  55 12.704000 7.572513 10.620 11.966222 6.493788 1.00
## X14    4    manuf    1  68  8.036029 4.117607  6.750  7.511786 3.335850 3.00
## X15    5    other    1  68  8.500588 4.601049  6.940  7.869464 3.261720 2.85
## X16    6     prof    1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35
## X17    7    sales    1  38  7.592632 4.232272  5.725  7.157813 2.891070 3.35
## X18    8  service    1  83  6.537470 3.673278  5.500  5.937612 2.816940 1.75
##       max range        skew    kurtosis        se
## X11 15.03 12.03  0.39715109 -0.71992219 0.2740438
## X12 15.00 11.25 -0.02255889 -1.05839301 0.7477136
## X13 44.50 43.50  1.47308721  3.67335914 1.0210774
## X14 22.20 19.20  1.26924002  1.53036996 0.4993332
## X15 26.00 23.15  1.46528476  2.30867798 0.5579592
## X16 24.98 20.63  0.77278554 -0.36441619 0.5390709
## X17 19.98 16.63  0.97785906  0.01196841 0.6865651
## X18 25.00 23.25  2.04386695  6.36667089 0.4031946

We will generate histograms overall and then by category

# Overall wage distribution
ggplot(CPS85, aes(x = wage)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue", 
                 color = "black", alpha = 0.7) +  
  geom_density(color = "red", fill = "red", alpha = 0.3) +
  theme_minimal() +
  labs(title = "Overall Wage Distribution", x = "Wage", y = "Density")

# Plot wage by sector category
ggplot(CPS85, aes(x = wage)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue",
                 color = "black", alpha = 0.7) +  
  geom_density(color = "red", fill = "red", alpha = 0.3) +  # Add density curve
  facet_wrap(~ sector) +  # Facet by 'sector' column
  theme_minimal() +
  labs(title = "Wage Distribution by Sector", x = "Wage", y = "Density")

Calculating frequencies in R

library(dplyr)
library(tidyr)

# Single and two-way frequency tables
married_table<- table(CPS85$married)

count_table <- table(CPS85$married, CPS85$union)

married_prop<- prop.table(table(CPS85$married))

married_union_prop<- prop.table((table(CPS85$married, CPS85$union)))

# Combine counts and proportions into one table

df_count <- as.data.frame(count_table)

df_count <- df_count %>%
  rename(Married = Var1, Union = Var2, Count = Freq)

# Proportion of married vs. single within each union category
df_count <- df_count %>%
  group_by(Union) %>%
  mutate(Proportion = Count / sum(Count))

# Totals by marital status (note: Proportion in this row is the sum of the
# within-union proportions above, not a share of the whole sample)
df_total <- df_count %>%
  group_by(Married) %>%
  summarise(Count = sum(Count), Proportion = sum(Proportion)) %>%
  mutate(Union = "Total")

final_table <- bind_rows(df_count, df_total)

married_table
## 
## Married  Single 
##     350     184
count_table
##          
##           Not Union
##   Married 278    72
##   Single  160    24
married_prop
## 
##   Married    Single 
## 0.6554307 0.3445693
married_union_prop
##          
##                  Not      Union
##   Married 0.52059925 0.13483146
##   Single  0.29962547 0.04494382
df_count
## # A tibble: 4 × 4
## # Groups:   Union [2]
##   Married Union Count Proportion
##   <fct>   <fct> <int>      <dbl>
## 1 Married Not     278      0.635
## 2 Single  Not     160      0.365
## 3 Married Union    72      0.75 
## 4 Single  Union    24      0.25
df_total
## # A tibble: 2 × 4
##   Married Count Proportion Union
##   <fct>   <int>      <dbl> <chr>
## 1 Married   350      1.38  Total
## 2 Single    184      0.615 Total
final_table
## # A tibble: 6 × 4
## # Groups:   Union [3]
##   Married Union Count Proportion
##   <fct>   <chr> <int>      <dbl>
## 1 Married Not     278      0.635
## 2 Single  Not     160      0.365
## 3 Married Union    72      0.75 
## 4 Single  Union    24      0.25 
## 5 Married Total   350      1.38 
## 6 Single  Total   184      0.615
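As a cross-check, janitor (already loaded at the top) can produce the same counts with column percentages and a totals row in one pipeline; a minimal sketch:

# Alternative: counts, column percentages, and a totals row via janitor
CPS85 %>%
  janitor::tabyl(married, union) %>%
  janitor::adorn_totals("row") %>%
  janitor::adorn_percentages("col") %>%
  janitor::adorn_pct_formatting(digits = 1) %>%
  janitor::adorn_ns()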

Next is a plot of frequencies and proportions for each categorical variable

# Calculate frequency and proportion
frequency_table <- CPS85 %>%
  dplyr::select(race, sex, hispanic, south, married, union, sector) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
  group_by(Variable, Category) %>%
  summarise(Frequency = n(), .groups = "drop") %>%
  group_by(Variable) %>%
  mutate(Proportion = Frequency / sum(Frequency)) %>%
  arrange(Variable, desc(Frequency))

# Create the plot
ggplot(frequency_table, aes(x = Category, y = Frequency, fill = Category)) +
  geom_bar(stat = "identity", show.legend = FALSE) + 
  geom_text(
    aes(label = scales::percent(Proportion, accuracy = 0.1)),
    position = position_stack(vjust = 0.5),
    size = 3, color = "black"
  ) +
  facet_wrap(~Variable, scales = "free", ncol = 2) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 10, face = "bold")
  ) +
  labs(
    title = "Frequency and Proportion of Categories by Variable",
    x = "Category",
    y = "Frequency"
  )

write.csv(frequency_table, "frequencytable.csv")

Chi-square test

Be sure to install the vcd package first by running install.packages('vcd') in your console

library(summarytools)
library(dplyr)
library(ggplot2)
library(vcd)

# Select relevant columns from the dataset
chidata <- CPS85 %>%
  dplyr::select(race, union)

# Generate contingency table
chitable <- table(chidata$race, chidata$union)

# Run chi-square test
chi_test <- chisq.test(chitable)

# Convert the expected counts matrix to a data frame
expected <- as.data.frame(chi_test$expected)

# Extract row names and make them a column
expected$race <- rownames(expected)

# Move 'race' column to the front (if desired)
expected <- expected %>%
  dplyr::select(race, everything())

# Add 'type' column to indicate that these are expected counts
expected <- expected %>%
  mutate(type = "Expected")

# Convert the contingency table to a data frame
chitable_df <- as.data.frame(chitable)%>%
  rename(race = Var1, union = Var2)

observed <- chitable_df %>%
  dplyr::select(race, union, Freq) %>%
  pivot_wider(names_from = union, values_from = Freq)%>%
  mutate(type = "Observed")

overalltable<- rbind(observed, expected)



print("contengency table")
## [1] "contengency table"
chitable
##     
##      Not Union
##   NW  49    18
##   W  389    78
print("Expected counts")
## [1] "Expected counts"
chi_test$expected
##     
##            Not    Union
##   NW  54.95506 12.04494
##   W  383.04494 83.95506
overalltable
## # A tibble: 4 × 4
##   race    Not Union type    
## * <fct> <dbl> <dbl> <chr>   
## 1 NW     49    18   Observed
## 2 W     389    78   Observed
## 3 NW     55.0  12.0 Expected
## 4 W     383.   84.0 Expected
print("Chi-square test results:")
## [1] "Chi-square test results:"
print(chi_test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chitable
## X-squared = 3.4442, df = 1, p-value = 0.06348
# Calculate Cramér's V or Phi (for 2x2)
print("Effect size metrics, this uses phi because it's 2x2")
## [1] "Effect size metrics, this uses phi because it's 2x2"
assocstats(chitable)
##                     X^2 df P(> X^2)
## Likelihood Ratio 3.7469  1 0.052906
## Pearson          4.1045  1 0.042770
## 
## Phi-Coefficient   : 0.088 
## Contingency Coeff.: 0.087 
## Cramer's V        : 0.088
write.csv(overalltable, "chi.csv")

Generate a plot of the counts and proportions by category

# Compute the dataset for the figure
chitable_plot <- as.data.frame(chitable) %>%
  rename(Race = Var1, Union = Var2)%>%
  group_by(Race) %>%
  mutate(Proportion = Freq / sum(Freq)) 

# Plot with counts below the bars and proportions above
ggplot(chitable_plot, aes(x = Race, y = Freq, fill = Union)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = scales::percent(Proportion)), 
            position = position_dodge(width = 0.8), 
            vjust = -0.5, size = 3) +
  geom_text(aes(label = Freq), 
            position = position_dodge(width = 0.8), 
            vjust = 1.5, size = 3, color = "white") +
  labs(title = "Contingency Table: Race vs Union",
       x = "Race",
       y = "Count") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()

Independent samples t-test

library(tidyverse)
library(psych)
library(rstatix)
library(car)  # For Levene’s Test

# Make sure marital status is a factor
CPS85$married <- as.factor(CPS85$married)

# Descriptive statistics by marital status
descriptives_married <- psych::describeBy(CPS85$age, group = CPS85$married, mat = TRUE) %>%
  as.data.frame()

write.csv(descriptives_married, "descriptives_married.csv")

# Normality tests (Shapiro-Wilk)
shapiro.test(CPS85$age[CPS85$married == "Married"])
## 
##  Shapiro-Wilk normality test
## 
## data:  CPS85$age[CPS85$married == "Married"]
## W = 0.96039, p-value = 3.999e-08
shapiro.test(CPS85$age[CPS85$married == "Single"])
## 
##  Shapiro-Wilk normality test
## 
## data:  CPS85$age[CPS85$married == "Single"]
## W = 0.89074, p-value = 2.348e-10
# Histogram with density plot
ggplot(CPS85, aes(x = age, fill = married)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5, position = "identity") +
  geom_density(alpha = 0.7) +
  facet_wrap(~married) +
  theme_minimal() +
  labs(title = "Age Distribution by Marital Status", x = "Age", y = "Density")

# Levene's test for equality of variances
leveneTest(age ~ married, data = CPS85)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  0.1425  0.706
##       532
# Independent samples t-test (Equal variances assumed)
t_test_result <- t.test(age ~ married, data = CPS85, var.equal = TRUE)
print(t_test_result)
## 
##  Two Sample t-test
## 
## data:  age by married
## t = 6.6999, df = 532, p-value = 5.323e-11
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
##  4.860477 8.893063
## sample estimates:
## mean in group Married  mean in group Single 
##              39.20286              32.32609
# Welch's t-test (Unequal variances)
t_test_result_var <- t.test(age ~ married, data = CPS85, var.equal = FALSE)
print(t_test_result_var)
## 
##  Welch Two Sample t-test
## 
## data:  age by married
## t = 6.5934, df = 355.79, p-value = 1.555e-10
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
##  4.825607 8.927933
## sample estimates:
## mean in group Married  mean in group Single 
##              39.20286              32.32609
# Cohen's d for effect size (parametric)
cohen_d_result <- rstatix::cohens_d(CPS85, age ~ married)
print(cohen_d_result)
## # A tibble: 1 × 7
##   .y.   group1  group2 effsize    n1    n2 magnitude
## * <chr> <chr>   <chr>    <dbl> <int> <int> <ord>    
## 1 age   Married Single   0.605   350   184 moderate
# Mann-Whitney U test (Wilcoxon Rank-Sum test for independent samples)
mann_whitney_result <- wilcox.test(age ~ married, data = CPS85, exact = FALSE)
print(mann_whitney_result)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by married
## W = 44398, p-value = 5.971e-13
## alternative hypothesis: true location shift is not equal to 0
# Compute Mann-Whitney U statistic manually (SPSS-style)
compute_U <- function(group1, group2) {
  combined_data <- c(group1, group2)
  ranks <- rank(combined_data)
  
  rank_sum1 <- sum(ranks[1:length(group1)])
  rank_sum2 <- sum(ranks[(length(group1)+1):length(combined_data)])
  
  U1 <- rank_sum1 - length(group1)*(length(group1) + 1)/2
  U2 <- rank_sum2 - length(group2)*(length(group2) + 1)/2
  
  return(min(U1, U2))  # Smaller U value matches SPSS
}

# Define groups correctly for Mann-Whitney U
group1 <- CPS85$age[CPS85$married == "Married"]
group2 <- CPS85$age[CPS85$married == "Single"]

# Compute U statistic manually
U_value <- compute_U(group1, group2)
print(paste("Mann-Whitney U (Custom Calculation):", U_value))
## [1] "Mann-Whitney U (Custom Calculation): 20002.5"
# Compute effect size (r = Z / sqrt(n))
# Get sample sizes
n1 <- length(group1)
n2 <- length(group2)
n <- n1 + n2

# Approximate Z from the Mann-Whitney U test
Z <- qnorm(1 - mann_whitney_result$p.value / 2)  # Two-tailed assumption

# Compute effect size
effect_size_r <- abs(Z) / sqrt(n)
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.312"

Dependent Samples t-test in R

Here we move to the 'datasets' package, so we load it first

The datasets package ships with base R, so there is nothing extra to install

library(datasets)
library(psych)
library(effectsize)
library(tidyverse)

# This is the built-in data set.
# It is not necessary to rename it, but I do it to emphasize its original form
chick<- ChickWeight

# Here we filter to days 4 and 21 and pivot the data to wide format
chickdata <- chick%>%
  filter(Time %in% c(4, 21)) %>%  # Select only two time points for comparison
  pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day_") %>%
  drop_na()%>%
  mutate(Difference = Day_21 - Day_4)

# Compute descriptive statistics
desc_day_4 <- psych::describe(chickdata$Day_4)
desc_day_21 <- psych::describe(chickdata$Day_21)
desc_difference <- psych::describe(chickdata$Difference)

as.data.frame(desc_day_4)
##    vars  n     mean       sd median  trimmed    mad min max range       skew
## X1    1 45 60.15556 4.279738     61 60.32432 4.4478  48  69    21 -0.4278585
##      kurtosis        se
## X1 0.06485024 0.6379857
as.data.frame(desc_day_21)
##    vars  n     mean       sd median  trimmed     mad min max range       skew
## X1    1 45 218.6889 71.51027    205 218.6216 75.6126  74 373   299 0.08910638
##      kurtosis       se
## X1 -0.7680967 10.66012
as.data.frame(desc_difference)
##    vars  n     mean       sd median  trimmed     mad min max range    skew
## X1    1 45 158.5333 69.83865    149 158.2973 71.1648  16 309   293 0.10106
##      kurtosis       se
## X1 -0.7835153 10.41093
desc_day_4$Variable <- "Day_4"
desc_day_21$Variable <- "Day_21"
desc_difference$Variable <- "Difference"

# Combine all into one dataset
desc_combined <- rbind(desc_day_4, desc_day_21, desc_difference)%>%
  dplyr::select(Variable, mean, median, sd, skew, kurtosis)



write.csv(desc_combined, "dependent_desc.csv")

# We now want to check the assumption of normality
# Plot a histogram of the Difference variable
hist(chickdata$Difference, main = "Histogram of Difference", 
     xlab = "Difference (Day_21 - Day_4)", col = "lightblue", 
     border = "black")

# Q-Q plot to check for normality
qqnorm(chickdata$Difference)
qqline(chickdata$Difference, col = "red")

# Perform Shapiro-Wilk test for normality on the Difference variable
shapiro_test <- shapiro.test(chickdata$Difference)
shapiro_test
## 
##  Shapiro-Wilk normality test
## 
## data:  chickdata$Difference
## W = 0.98718, p-value = 0.8939
# Run the t-test
paired_result <- t.test(chickdata$Day_4, chickdata$Day_21, paired = TRUE)
paired_result
## 
##  Paired t-test
## 
## data:  chickdata$Day_4 and chickdata$Day_21
## t = -15.228, df = 44, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -179.5152 -137.5515
## sample estimates:
## mean difference 
##       -158.5333
# Calculate Cohen's d
mean_dif<- mean(chickdata$Difference)
sd_dif<- sd(chickdata$Difference)
cohen_d<- mean_dif/sd_dif
cohen_d
## [1] 2.269994
# Use the effsize package
chickdata_long <- chickdata %>%
  dplyr::select(Day_4, Day_21)%>%
  pivot_longer(cols = c(Day_4, Day_21), 
               names_to = "Day", 
               values_to = "Weight")

chickdata_long$Day<- as.factor(chickdata_long$Day)

# Calculate Cohen's d for paired samples
cohen_d_result <- cohens_d(chickdata_long$Weight, chickdata_long$Day, paired = TRUE)

# Print the result
cohen_d_result
## Cohen's d |       95% CI
## ------------------------
## 2.27      | [1.71, 2.82]

Non-parametric alternative: Wilcoxon Signed-Rank test

# Load necessary libraries
library(tidyr)
library(rstatix)
library(dplyr)

# Filter for Day 4 and Day 21
chick_subset <- ChickWeight[ChickWeight$Time %in% c(4, 21), ]

# Reshape data to wide format
chick_wide <- chick_subset %>%
  pivot_wider(names_from = Time, values_from = weight) %>%
  na.omit() %>%
  dplyr::select('4', '21')

# Rename columns for clarity
colnames(chick_wide) <- c("Four", "TwentyOne")

# Perform Wilcoxon Signed-Rank Test
test_result <- wilcox.test(chick_wide$Four, chick_wide$TwentyOne, paired = TRUE)

# Print the Wilcoxon Signed-Rank Test result
print(test_result)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  chick_wide$Four and chick_wide$TwentyOne
## V = 0, p-value = 5.355e-09
## alternative hypothesis: true location shift is not equal to 0
# Extract necessary values
Z <- qnorm(test_result$p.value / 2)  # Get Z-score (divide p-value by 2 for two-tailed test)
n <- nrow(chick_wide)  # Number of pairs

# Calculate effect size r
effect_size_r <- abs(Z) / sqrt(n)

# Print effect size
print(paste("Z:", round(Z, 3)))
## [1] "Z: -5.836"
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.87"

Descriptives for the one-way ANOVA

# Prepare the dataset
anovadata<- CPS85%>%
  filter(sector %in% c("manag", "prof", "sales"))%>%
  dplyr::select(wage, sector)

anovadata$sector<- as.character(anovadata$sector)

# Get descriptive statistics by profession
descriptives_profession <- psych::describeBy(anovadata$wage, group = anovadata$sector, mat = TRUE)%>%
  dplyr::select(-item)
overall_desc_profession<- psych::describe(anovadata$wage)

# Convert the matrix into a data frame
descriptives_profession<- as.data.frame(descriptives_profession)
overall_desc_profession<- as.data.frame(overall_desc_profession)%>%
  mutate(group1 = "Total")%>%
  relocate(group1, .before = everything())
  
descriptives_profession<- rbind(descriptives_profession, overall_desc_profession)

write.csv(descriptives_profession, "descriptives_profession.csv")

descriptives_profession
##     group1 vars   n      mean       sd median   trimmed      mad  min   max
## X11  manag    1  55 12.704000 7.572513 10.620 11.966222 6.493788 1.00 44.50
## X12   prof    1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35 24.98
## X13  sales    1  38  7.592632 4.232272  5.725  7.157813 2.891070 3.35 19.98
## X1   Total    1 198 11.321818 6.214124 10.000 10.656500 5.774727 1.00 44.50
##     range      skew    kurtosis        se
## X11 43.50 1.4730872  3.67335914 1.0210774
## X12 20.63 0.7727855 -0.36441619 0.5390709
## X13 16.63 0.9778591  0.01196841 0.6865651
## X1  43.50 1.2892669  3.07554682 0.4416185

# Check statistical assumptions for ANOVA

# Shapiro-Wilk test for normality
aov_model <- aov(wage ~ sector, data = anovadata)
shapiro.test(residuals(aov_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(aov_model)
## W = 0.91874, p-value = 5.393e-09
# QQ Plot for normality
qqnorm(residuals(aov_model))
qqline(residuals(aov_model), col = "red")

# Levene's test for homogeneity of variance
library(car)
leveneTest(wage ~ sector, data = anovadata)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)  
## group   2  3.2421 0.0412 *
##       195                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bartlett's test of homogeneity of variance
bartlett.test(wage ~ sector, data = anovadata)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  wage by sector
## Bartlett's K-squared = 15.317, df = 2, p-value = 0.000472

Run one-way ANOVA

# Be sure to load the library
library(apaTables)
library(dplyr)
library(tidyverse)
library(rstatix)

# Run the ANOVA
anova_results <- aov(wage ~ sector, data = anovadata)

apa.aov.table(anova_results, filename = "anova_results.doc")
## 
## 
## ANOVA results using wage as the dependent variable
##  
## 
##    Predictor      SS  df      MS      F    p partial_eta2 CI_90_partial_eta2
##  (Intercept) 8876.54   1 8876.54 249.68 .000                                
##       sector  674.63   2  337.31   9.49 .000          .09         [.03, .15]
##        Error 6932.59 195   35.55                                            
## 
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
summary(anova_results)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## sector        2    675   337.3   9.488 0.000117 ***
## Residuals   195   6933    35.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Perform Tukey's HSD test if variances are equal
TukeyHSD(anova_results)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = wage ~ sector, data = anovadata)
## 
## $sector
##                   diff       lwr       upr     p adj
## prof-manag  -0.7565714 -3.100521  1.587379 0.7265306
## sales-manag -5.1113684 -8.081890 -2.140847 0.0002059
## sales-prof  -4.3547970 -7.020710 -1.688884 0.0004545
# Perform Games-Howell if variances are not equal
anovadata%>%
games_howell_test(wage ~ sector)
## # A tibble: 3 × 8
##   .y.   group1 group2 estimate conf.low conf.high      p.adj p.adj.signif
## * <chr> <chr>  <chr>     <dbl>    <dbl>     <dbl>      <dbl> <chr>       
## 1 wage  manag  prof     -0.757    -3.51      2.00 0.79       ns          
## 2 wage  manag  sales    -5.11     -8.04     -2.18 0.000222   ***         
## 3 wage  prof   sales    -4.35     -6.44     -2.27 0.00000942 ****
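Since Levene's and Bartlett's tests above flagged unequal variances, a Welch one-way ANOVA is the matching omnibus test for the Games-Howell post-hoc; a minimal sketch using base R's oneway.test():

# Welch's one-way ANOVA (does not assume equal variances)
oneway.test(wage ~ sector, data = anovadata, var.equal = FALSE)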
# Subset your data for Cohen's d
cohen_data_sales_manag<- anovadata%>%
  filter(sector %in% c("sales", "manag"))

cohen_data_sales_prof<- anovadata%>%
  filter(sector %in% c("sales", "prof"))

# Run Cohen's d on the subsetted data
rstatix::cohens_d(cohen_data_sales_manag, wage ~ sector)
## # A tibble: 1 × 7
##   .y.   group1 group2 effsize    n1    n2 magnitude
## * <chr> <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 wage  manag  sales    0.833    55    38 large
rstatix::cohens_d(cohen_data_sales_prof, wage ~ sector)
## # A tibble: 1 × 7
##   .y.   group1 group2 effsize    n1    n2 magnitude
## * <chr> <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 wage  prof   sales    0.885   105    38 large

Kruskal-Wallis non-parametric test, for when ANOVA assumptions are violated

For the post-hoc test, run install.packages('dunn.test') in your console

# Load the library
library(dunn.test)

# Run the omnibus KW test
kwtest<-kruskal.test(wage ~ sector, data = anovadata)

print(kwtest)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  wage by sector
## Kruskal-Wallis chi-squared = 22.533, df = 2, p-value = 1.28e-05
# Create ranks and summarize by sector
sector_ranks <- anovadata %>%
  mutate(Rank = rank(wage)) %>%
  group_by(sector) %>%
  summarise(N = n(), Mean_Rank = mean(Rank), Sum_Rank = sum(Rank))

# Print the ranks
print(sector_ranks)
## # A tibble: 3 × 4
##   sector     N Mean_Rank Sum_Rank
##   <chr>  <int>     <dbl>    <dbl>
## 1 manag     55     110.     6032.
## 2 prof     105     109.    11395 
## 3 sales     38      59.9    2274.
# Run post-hoc dunn test
dt<- dunn.test(anovadata$wage, anovadata$sector, kw = TRUE, label = TRUE)
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 22.5328, df = 2, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                 (No adjustment)                                
## Col Mean-|
## Row Mean |      manag       prof
## ---------+----------------------
##     prof |   0.119529
##          |     0.4524
##          |
##    sales |   4.121490   4.487320
##          |    0.0000*    0.0000*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2
dt
## $chi2
## [1] 22.53284
## 
## $Z
## [1] 0.1195295 4.1214907 4.4873207
## 
## $P
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
## 
## $P.adjusted
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
## 
## $comparisons
## [1] "manag - prof"  "manag - sales" "prof - sales"
# Run eta-squared for omnibus effect size
H <- kwtest$statistic
k <- length(unique(anovadata$sector))
N <- nrow(anovadata)

eta_squared <- (H - k + 1) / (N - k)

# Rename the output explicitly
names(eta_squared) <- "eta_squared"

# Print the result
eta_squared
## eta_squared 
##   0.1052966
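rcompanion (loaded at the top but not otherwise used here) offers a packaged rank-based effect size for the Kruskal-Wallis test if you prefer not to compute it by hand; a minimal sketch (epsilon-squared is a closely related statistic, so expect a value in the same neighborhood as the eta-squared above):

# Packaged alternative: epsilon-squared for the Kruskal-Wallis test
rcompanion::epsilonSquared(x = anovadata$wage, g = as.factor(anovadata$sector))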
# Effect size r (r = Z / sqrt(N)) for the pairwise comparisons
# Z-values from Dunn's test results
z_manag_sales <- 4.1215
z_prof_sales <- 4.4873

# Calculate observations
n <- nrow(anovadata)  # Total number of observations

# Effect size r formula
r_manag_sales <- z_manag_sales / sqrt(n)
r_prof_sales <- z_prof_sales / sqrt(n)

# Print effect sizes
r_manag_sales
## [1] 0.2929023
r_prof_sales
## [1] 0.3188985