```{r example-chunk, message=FALSE, warning=FALSE, echo=TRUE}
# YOUR AWESOME CODE GOES IN THIS SPACE
```
packages <- c(
'psych', 'summarytools', 'Hmisc', 'dplyr', 'stats', 'rstatix',
'ggplot2', 'apaTables', 'effectsize', 'car', 'rcompanion',
'vcd', 'tukeytrend', 'lme4', 'lmtest', 'broom', 'modelr',
'tidyverse', 'lubridate', 'janitor', 'data.table', 'readr', 'knitr', 'mosaicData')
lapply(packages, library, character.only = TRUE)
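# Note: library() stops with an error if a package is missing. A minimal sketch for
# installing any packages not already present (assumes a CRAN mirror is configured):
missing_pkgs <- setdiff(packages, rownames(installed.packages()))
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)
# The CPS85 data used throughout comes from the mosaicData package loaded above.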
descriptives <- CPS85 %>%
dplyr::select(wage, educ, exper, age)%>%
psych::describe()
descriptives
## vars n mean sd median trimmed mad min max range skew kurtosis
## wage 1 534 9.02 5.14 7.78 8.28 4.12 1 44.5 43.5 1.69 4.90
## educ 2 534 13.02 2.62 12.00 13.05 1.48 2 18.0 16.0 -0.20 0.81
## exper 3 534 17.82 12.38 15.00 16.75 11.86 0 55.0 55.0 0.68 -0.40
## age 4 534 36.83 11.73 35.00 36.07 11.86 18 64.0 46.0 0.55 -0.60
## se
## wage 0.22
## educ 0.11
## exper 0.54
## age 0.51
# Get descriptive statistics by sector
descriptives_sector <- psych::describeBy(CPS85$wage, group = CPS85$sector, mat = TRUE)
# Convert the matrix into a data frame
descriptives_sector<- as.data.frame(descriptives_sector)
write.csv(descriptives_sector, "descriptives_sector.csv")
descriptives_sector
## item group1 vars n mean sd median trimmed mad min
## X11 1 clerical 1 97 7.422577 2.699018 7.500 7.299873 3.335850 3.00
## X12 2 const 1 20 9.502000 3.343877 9.750 9.499375 3.647196 3.75
## X13 3 manag 1 55 12.704000 7.572513 10.620 11.966222 6.493788 1.00
## X14 4 manuf 1 68 8.036029 4.117607 6.750 7.511786 3.335850 3.00
## X15 5 other 1 68 8.500588 4.601049 6.940 7.869464 3.261720 2.85
## X16 6 prof 1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35
## X17 7 sales 1 38 7.592632 4.232272 5.725 7.157813 2.891070 3.35
## X18 8 service 1 83 6.537470 3.673278 5.500 5.937612 2.816940 1.75
## max range skew kurtosis se
## X11 15.03 12.03 0.39715109 -0.71992219 0.2740438
## X12 15.00 11.25 -0.02255889 -1.05839301 0.7477136
## X13 44.50 43.50 1.47308721 3.67335914 1.0210774
## X14 22.20 19.20 1.26924002 1.53036996 0.4993332
## X15 26.00 23.15 1.46528476 2.30867798 0.5579592
## X16 24.98 20.63 0.77278554 -0.36441619 0.5390709
## X17 19.98 16.63 0.97785906 0.01196841 0.6865651
## X18 25.00 23.25 2.04386695 6.36667089 0.4031946
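# As a quick cross-check, the same group sizes, means, and SDs can be reproduced with
# a plain dplyr summary (a sketch; the column names here are arbitrary):
CPS85 %>%
  group_by(sector) %>%
  summarise(n = n(), mean_wage = mean(wage), sd_wage = sd(wage), median_wage = median(wage))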
#Overall Wage
ggplot(CPS85, aes(x = wage)) +
geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue",
color = "black", alpha = 0.7) +
geom_density(color = "red", fill = "red", alpha = 0.3) +
theme_minimal() +
labs(title = "Overall Wage Distribution", x = "Wage", y = "Density")
### Plot wage by sector category
ggplot(CPS85, aes(x = wage)) +
geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue",
color = "black", alpha = 0.7) +
geom_density(color = "red", fill = "red", alpha = 0.3) + # Add density curve
facet_wrap(~ sector) + # Facet by 'sector' column
theme_minimal() +
labs(title = "Wage Distribution by Sector", x = "Wage", y = "Density")
library(dplyr)
library(tidyr)
## One-way and two-way frequency tables
married_table<- table(CPS85$married)
count_table <- table(CPS85$married, CPS85$union)
married_prop<- prop.table(table(CPS85$married))
married_union_prop<- prop.table((table(CPS85$married, CPS85$union)))
# Combine counts and proportions into one table
df_count <- as.data.frame(count_table)
df_count <- df_count %>%
rename(Married = Var1, Union = Var2, Count = Freq)
df_count <- df_count %>%
group_by(Union) %>%
mutate(Proportion = Count / sum(Count))
# Marginal totals; Proportion here is each group's share of the full sample
df_total <- df_count %>%
group_by(Married) %>%
summarise(Count = sum(Count)) %>%
mutate(Proportion = Count / sum(Count), Union = "Total")
final_table <- bind_rows(df_count, df_total)
married_table
##
## Married Single
## 350 184
count_table
##
## Not Union
## Married 278 72
## Single 160 24
married_prop
##
## Married Single
## 0.6554307 0.3445693
married_union_prop
##
## Not Union
## Married 0.52059925 0.13483146
## Single 0.29962547 0.04494382
df_count
## # A tibble: 4 × 4
## # Groups: Union [2]
## Married Union Count Proportion
## <fct> <fct> <int> <dbl>
## 1 Married Not 278 0.635
## 2 Single Not 160 0.365
## 3 Married Union 72 0.75
## 4 Single Union 24 0.25
df_total
## # A tibble: 2 × 4
## Married Count Proportion Union
## <fct> <int> <dbl> <chr>
## 1 Married 350 0.655 Total
## 2 Single 184 0.345 Total
final_table
## # A tibble: 6 × 4
## # Groups: Union [3]
## Married Union Count Proportion
## <fct> <chr> <int> <dbl>
## 1 Married Not 278 0.635
## 2 Single Not 160 0.365
## 3 Married Union 72 0.75
## 4 Single Union 24 0.25
## 5 Married Total 350 0.655
## 6 Single Total 184 0.345
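# A more compact route to the same counts and within-union proportions is
# janitor::tabyl() (janitor is loaded above); a minimal sketch:
CPS85 %>%
  janitor::tabyl(married, union) %>%
  janitor::adorn_percentages("col") %>%
  janitor::adorn_pct_formatting() %>%
  janitor::adorn_ns()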
# Calculate frequency and proportion
frequency_table <- CPS85 %>%
dplyr::select(race, sex, hispanic, south, married, union, sector) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
group_by(Variable, Category) %>%
summarise(Frequency = n(), .groups = "drop") %>%
group_by(Variable) %>%
mutate(Proportion = Frequency / sum(Frequency)) %>%
arrange(Variable, desc(Frequency))
# Create the plot
ggplot(frequency_table, aes(x = Category, y = Frequency, fill = Category)) +
geom_bar(stat = "identity", show.legend = FALSE) +
geom_text(
aes(label = scales::percent(Proportion, accuracy = 0.1)),
position = position_stack(vjust = 0.5),
size = 3, color = "black"
) +
facet_wrap(~Variable, scales = "free", ncol = 2) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 10, face = "bold")
) +
labs(
title = "Frequency and Proportion of Categories by Variable",
x = "Category",
y = "Frequency"
)
write.csv(frequency_table, "frequencytable.csv")
library(summarytools)
library(dplyr)
library(ggplot2)
library(vcd)
# Select relevant columns from the dataset
chidata <- CPS85 %>%
dplyr::select(race, union)
# Generate contingency table
chitable <- table(chidata$race, chidata$union)
# Run chi-square test
chi_test <- chisq.test(chitable)
# Convert the expected counts matrix to a data frame
expected <- as.data.frame(chi_test$expected)
# Extract row names and make them a column
expected$race <- rownames(expected)
# Move 'race' column to the front (if desired)
expected <- expected %>%
dplyr::select(race, everything())
# Add 'type' column to indicate that these are expected counts
expected <- expected %>%
mutate(type = "Expected")
# Convert the contingency table to a data frame
chitable_df <- as.data.frame(chitable)%>%
rename(race = Var1, union = Var2)
observed <- chitable_df %>%
dplyr::select(race, union, Freq) %>%
pivot_wider(names_from = union, values_from = Freq)%>%
mutate(type = "Observed")
overalltable<- rbind(observed, expected)
print("contengency table")
## [1] "contengency table"
chitable
##
## Not Union
## NW 49 18
## W 389 78
print("Expected counts")
## [1] "Expected counts"
chi_test$expected
##
## Not Union
## NW 54.95506 12.04494
## W 383.04494 83.95506
overalltable
## # A tibble: 4 × 4
## race Not Union type
## * <fct> <dbl> <dbl> <chr>
## 1 NW 49 18 Observed
## 2 W 389 78 Observed
## 3 NW 55.0 12.0 Expected
## 4 W 383. 84.0 Expected
print("Chi-square test results:")
## [1] "Chi-square test results:"
print(chi_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chitable
## X-squared = 3.4442, df = 1, p-value = 0.06348
# Calculate Cramér's V or Phi (for 2x2)
print("Effect size metrics, this uses phi because it's 2x2")
## [1] "Effect size metrics, this uses phi because it's 2x2"
assocstats(chitable)
## X^2 df P(> X^2)
## Likelihood Ratio 3.7469 1 0.052906
## Pearson 4.1045 1 0.042770
##
## Phi-Coefficient : 0.088
## Contingency Coeff.: 0.087
## Cramer's V : 0.088
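# Hand check of the phi coefficient reported above: phi = sqrt(chi-square / N).
# Using the uncorrected Pearson chi-square from assocstats() (4.1045) and N = 534,
# sqrt(4.1045 / 534) is approximately 0.088, matching the table above.
as.numeric(sqrt(chisq.test(chitable, correct = FALSE)$statistic / sum(chitable)))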
write.csv(overalltable, "chi.csv")
##Compute dataset for a figure
chitable_plot <- as.data.frame(chitable) %>%
rename(Race = Var1, Union = Var2)%>%
group_by(Race) %>%
mutate(Proportion = Freq / sum(Freq))
# Plot with counts below the bars and proportions above
ggplot(chitable_plot, aes(x = Race, y = Freq, fill = Union)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = scales::percent(Proportion)),
position = position_dodge(width = 0.8),
vjust = -0.5, size = 3) +
geom_text(aes(label = Freq),
position = position_dodge(width = 0.8),
vjust = 1.5, size = 3, color = "white") +
labs(title = "Contingency Table: Race vs Union",
x = "Race",
y = "Count") +
scale_fill_brewer(palette = "Set1") +
theme_minimal()
library(tidyverse)
library(psych)
library(rstatix)
library(car) # For Levene’s Test
# Ensure marital status is stored as a factor
CPS85$married <- as.factor(CPS85$married)
# Descriptive statistics by marital status
descriptives_married <- psych::describeBy(CPS85$age, group = CPS85$married, mat = TRUE) %>%
as.data.frame()
write.csv(descriptives_married, "descriptives_married.csv")
# Normality tests (Shapiro-Wilk)
shapiro.test(CPS85$age[CPS85$married == "Married"])
##
## Shapiro-Wilk normality test
##
## data: CPS85$age[CPS85$married == "Married"]
## W = 0.96039, p-value = 3.999e-08
shapiro.test(CPS85$age[CPS85$married == "Single"])
##
## Shapiro-Wilk normality test
##
## data: CPS85$age[CPS85$married == "Single"]
## W = 0.89074, p-value = 2.348e-10
# Histogram with density plot
ggplot(CPS85, aes(x = age, fill = married)) +
geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5, position = "identity") +
geom_density(alpha = 0.7) +
facet_wrap(~married) +
theme_minimal() +
labs(title = "Age Distribution by Marital Status", x = "Age", y = "Density")
# Levene's test for equality of variances
leveneTest(age ~ married, data = CPS85)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.1425 0.706
## 532
# Independent samples t-test (Equal variances assumed)
t_test_result <- t.test(age ~ married, data = CPS85, var.equal = TRUE)
print(t_test_result)
##
## Two Sample t-test
##
## data: age by married
## t = 6.6999, df = 532, p-value = 5.323e-11
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
## 4.860477 8.893063
## sample estimates:
## mean in group Married mean in group Single
## 39.20286 32.32609
# Welch's t-test (Unequal variances)
t_test_result_var <- t.test(age ~ married, data = CPS85, var.equal = FALSE)
print(t_test_result_var)
##
## Welch Two Sample t-test
##
## data: age by married
## t = 6.5934, df = 355.79, p-value = 1.555e-10
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
## 4.825607 8.927933
## sample estimates:
## mean in group Married mean in group Single
## 39.20286 32.32609
# Cohen's d for effect size (parametric)
cohen_d_result <- rstatix::cohens_d(CPS85, age ~ married)
print(cohen_d_result)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 age Married Single 0.605 350 184 moderate
# Mann-Whitney U test (Wilcoxon Rank-Sum test for independent samples)
mann_whitney_result <- wilcox.test(age ~ married, data = CPS85, exact = FALSE)
print(mann_whitney_result)
##
## Wilcoxon rank sum test with continuity correction
##
## data: age by married
## W = 44398, p-value = 5.971e-13
## alternative hypothesis: true location shift is not equal to 0
# Compute Mann-Whitney U statistic manually (SPSS-style)
compute_U <- function(group1, group2) {
combined_data <- c(group1, group2)
ranks <- rank(combined_data)
rank_sum1 <- sum(ranks[1:length(group1)])
rank_sum2 <- sum(ranks[(length(group1)+1):length(combined_data)])
U1 <- rank_sum1 - length(group1)*(length(group1) + 1)/2
U2 <- rank_sum2 - length(group2)*(length(group2) + 1)/2
return(min(U1, U2)) # Smaller U value matches SPSS
}
# Define groups correctly for Mann-Whitney U
group1 <- CPS85$age[CPS85$married == "Married"]
group2 <- CPS85$age[CPS85$married == "Single"]
# Compute U statistic manually
U_value <- compute_U(group1, group2)
print(paste("Mann-Whitney U (Custom Calculation):", U_value))
## [1] "Mann-Whitney U (Custom Calculation): 20002.5"
# Compute effect size (r = Z / sqrt(n))
# Get sample sizes
n1 <- length(group1)
n2 <- length(group2)
n <- n1 + n2
# Approximate Z from the Mann-Whitney U test
Z <- qnorm(1 - mann_whitney_result$p.value / 2) # Two-tailed assumption
# Compute effect size
effect_size_r <- abs(Z) / sqrt(n)
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.312"
library(datasets)
library(psych)
library(effectsize)
library(tidyverse)
# ChickWeight is a built-in data set from the datasets package.
# Renaming it is not necessary, but it makes clear that we work on a copy and keep the original intact.
chick<- ChickWeight
# Filter to Days 4 and 21, pivot to wide format, and compute the Day 21 - Day 4 difference
chickdata <- chick%>%
filter(Time %in% c(4, 21)) %>% # Select only two time points for comparison
pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day_") %>%
drop_na()%>%
mutate(Difference = Day_21 - Day_4)
# Compute descriptive statistics
desc_day_4 <- psych::describe(chickdata$Day_4)
desc_day_21 <- psych::describe(chickdata$Day_21)
desc_difference <- psych::describe(chickdata$Difference)
as.data.frame(desc_day_4)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 60.15556 4.279738 61 60.32432 4.4478 48 69 21 -0.4278585
## kurtosis se
## X1 0.06485024 0.6379857
as.data.frame(desc_day_21)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 218.6889 71.51027 205 218.6216 75.6126 74 373 299 0.08910638
## kurtosis se
## X1 -0.7680967 10.66012
as.data.frame(desc_difference)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 158.5333 69.83865 149 158.2973 71.1648 16 309 293 0.10106
## kurtosis se
## X1 -0.7835153 10.41093
desc_day_4$Variable <- "Day_4"
desc_day_21$Variable <- "Day_21"
desc_difference$Variable <- "Difference"
# Combine all into one dataset
desc_combined <- rbind(desc_day_4, desc_day_21, desc_difference)%>%
dplyr::select(Variable, mean, median, sd, skew, kurtosis)
write.csv(desc_combined, "dependent_desc.csv")
# We now want to check the assumption of normality
# Plot a histogram of the Difference variable
hist(chickdata$Difference, main = "Histogram of Difference",
xlab = "Difference (Day_21 - Day_4)", col = "lightblue",
border = "black")
# Q-Q plot to check for normality
qqnorm(chickdata$Difference)
qqline(chickdata$Difference, col = "red")
# Perform Shapiro-Wilk test for normality on the Difference variable
shapiro_test <- shapiro.test(chickdata$Difference)
shapiro_test
##
## Shapiro-Wilk normality test
##
## data: chickdata$Difference
## W = 0.98718, p-value = 0.8939
# Run the t-test
paired_result <- t.test(chickdata$Day_4, chickdata$Day_21, paired = TRUE)
paired_result
##
## Paired t-test
##
## data: chickdata$Day_4 and chickdata$Day_21
## t = -15.228, df = 44, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -179.5152 -137.5515
## sample estimates:
## mean difference
## -158.5333
# Calculate Cohen's d
mean_dif<- mean(chickdata$Difference)
sd_dif<- sd(chickdata$Difference)
cohen_d<- mean_dif/sd_dif
cohen_d
## [1] 2.269994
# Use the effectsize package
chickdata_long <- chickdata %>%
dplyr::select(Day_4, Day_21)%>%
pivot_longer(cols = c(Day_4, Day_21),
names_to = "Day",
values_to = "Weight")
chickdata_long$Day<- as.factor(chickdata_long$Day)
# Calculate Cohen's d for paired samples
cohen_d_result <- effectsize::cohens_d(chickdata_long$Weight, chickdata_long$Day, paired = TRUE)
# Print the result
cohen_d_result
## Cohen's d | 95% CI
## ------------------------
## 2.27 | [1.71, 2.82]
# Load necessary libraries
library(tidyr)
library(rstatix)
library(dplyr)
# Filter for Day 4 and Day 21
chick_subset <- ChickWeight[ChickWeight$Time %in% c(4, 21), ]
# Reshape data to wide format
chick_wide <- chick_subset %>%
pivot_wider(names_from = Time, values_from = weight) %>%
na.omit() %>%
dplyr::select('4', '21')
# Rename columns for clarity
colnames(chick_wide) <- c("Four", "TwentyOne")
# Perform Wilcoxon Signed-Rank Test
test_result <- wilcox.test(chick_wide$Four, chick_wide$TwentyOne, paired = TRUE)
# Print the Wilcoxon Signed-Rank Test result
print(test_result)
##
## Wilcoxon signed rank test with continuity correction
##
## data: chick_wide$Four and chick_wide$TwentyOne
## V = 0, p-value = 5.355e-09
## alternative hypothesis: true location shift is not equal to 0
# Extract necessary values
Z <- qnorm(test_result$p.value / 2) # Get Z-score (divide p-value by 2 for two-tailed test)
n <- nrow(chick_wide) # Number of pairs
# Calculate effect size r
effect_size_r <- abs(Z) / sqrt(n)
# Print effect size
print(paste("Z:", round(Z, 3)))
## [1] "Z: -5.836"
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.87"
# Prepare the dataset
anovadata<- CPS85%>%
filter(sector %in% c("manag", "prof", "sales"))%>%
dplyr::select(wage, sector)
anovadata$sector<- as.character(anovadata$sector)
# Get descriptive statistics by profession
descriptives_profession <- psych::describeBy(anovadata$wage, group = anovadata$sector, mat = TRUE)%>%
dplyr::select(-item)
overall_desc_profession<- psych::describe(anovadata$wage)
# Convert the matrix into a data frame
descriptives_profession<- as.data.frame(descriptives_profession)
overall_desc_profession<- as.data.frame(overall_desc_profession)%>%
mutate(group1 = "Total")%>%
relocate(group1, .before = everything())
descriptives_profession<- rbind(descriptives_profession, overall_desc_profession)
write.csv(descriptives_profession, "descriptives_profession.csv")
descriptives_profession
## group1 vars n mean sd median trimmed mad min max
## X11 manag 1 55 12.704000 7.572513 10.620 11.966222 6.493788 1.00 44.50
## X12 prof 1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35 24.98
## X13 sales 1 38 7.592632 4.232272 5.725 7.157813 2.891070 3.35 19.98
## X1 Total 1 198 11.321818 6.214124 10.000 10.656500 5.774727 1.00 44.50
## range skew kurtosis se
## X11 43.50 1.4730872 3.67335914 1.0210774
## X12 20.63 0.7727855 -0.36441619 0.5390709
## X13 16.63 0.9778591 0.01196841 0.6865651
## X1 43.50 1.2892669 3.07554682 0.4416185
# Check statistical assumptions for ANOVA
# Shapiro-Wilk test for normality of the residuals
aov_model <- aov(wage ~ sector, data = anovadata)
shapiro.test(residuals(aov_model))
##
## Shapiro-Wilk normality test
##
## data: residuals(aov_model)
## W = 0.91874, p-value = 5.393e-09
# QQ Plot for normality
qqnorm(residuals(aov_model))
qqline(residuals(aov_model), col = "red")
# Levene's test for homogeneity of variance
library(car)
leveneTest(wage ~ sector, data = anovadata)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 3.2421 0.0412 *
## 195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bartlett's test of homogeneity of variance
bartlett.test(wage ~ sector, data = anovadata)
##
## Bartlett test of homogeneity of variances
##
## data: wage by sector
## Bartlett's K-squared = 15.317, df = 2, p-value = 0.000472
# Be sure to load the library
library(apaTables)
library(dplyr)
library(tidyverse)
library(rstatix)
# Run the ANOVA
anova_results <- aov(wage ~ sector, data = anovadata)
apa.aov.table(anova_results, filename = "anova_results.doc")
##
##
## ANOVA results using wage as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2 CI_90_partial_eta2
## (Intercept) 8876.54 1 8876.54 249.68 .000
## sector 674.63 2 337.31 9.49 .000 .09 [.03, .15]
## Error 6932.59 195 35.55
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
summary(anova_results)
## Df Sum Sq Mean Sq F value Pr(>F)
## sector 2 675 337.3 9.488 0.000117 ***
## Residuals 195 6933 35.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
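# Because Levene's and Bartlett's tests above suggested unequal variances, a Welch
# one-way ANOVA (which does not assume equal variances) is a reasonable companion to
# the classical ANOVA; a minimal sketch using base R:
oneway.test(wage ~ sector, data = anovadata, var.equal = FALSE)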
# Perform Tukey's HSD test if variances are equal
TukeyHSD(anova_results)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = wage ~ sector, data = anovadata)
##
## $sector
## diff lwr upr p adj
## prof-manag -0.7565714 -3.100521 1.587379 0.7265306
## sales-manag -5.1113684 -8.081890 -2.140847 0.0002059
## sales-prof -4.3547970 -7.020710 -1.688884 0.0004545
# Perform Games-Howell if variances are not equal
anovadata%>%
games_howell_test(wage ~ sector)
## # A tibble: 3 × 8
## .y. group1 group2 estimate conf.low conf.high p.adj p.adj.signif
## * <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 wage manag prof -0.757 -3.51 2.00 0.79 ns
## 2 wage manag sales -5.11 -8.04 -2.18 0.000222 ***
## 3 wage prof sales -4.35 -6.44 -2.27 0.00000942 ****
# Subset your data for Cohen's d
cohen_data_sales_manag<- anovadata%>%
filter(sector %in% c("sales", "manag"))
cohen_data_sales_prof<- anovadata%>%
filter(sector %in% c("sales", "prof"))
# Run cohen's d on subset data
rstatix::cohens_d(cohen_data_sales_manag, wage ~ sector)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 wage manag sales 0.833 55 38 large
rstatix::cohens_d(cohen_data_sales_prof, wage ~ sector)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 wage prof sales 0.885 105 38 large
#Load the library
library(dunn.test)
# Run the omnibus KW test
kwtest<-kruskal.test(wage ~ sector, data = anovadata)
print(kwtest)
##
## Kruskal-Wallis rank sum test
##
## data: wage by sector
## Kruskal-Wallis chi-squared = 22.533, df = 2, p-value = 1.28e-05
# Create ranks and summarize by group
sector_ranks <- anovadata %>%
mutate(Rank = rank(wage)) %>%
group_by(sector) %>%
summarise(N = n(), Mean_Rank = mean(Rank), Sum_Rank = sum(Rank))
# Print the ranks
print(sector_ranks)
## # A tibble: 3 × 4
## sector N Mean_Rank Sum_Rank
## <chr> <int> <dbl> <dbl>
## 1 manag 55 110. 6032.
## 2 prof 105 109. 11395
## 3 sales 38 59.9 2274.
# Run the post-hoc Dunn test
dt<- dunn.test(anovadata$wage, anovadata$sector, kw = TRUE, label = TRUE)
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 22.5328, df = 2, p-value = 0
##
##
## Comparison of x by group
## (No adjustment)
## Col Mean-|
## Row Mean | manag prof
## ---------+----------------------
## prof | 0.119529
## | 0.4524
## |
## sales | 4.121490 4.487320
## | 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
dt
## $chi2
## [1] 22.53284
##
## $Z
## [1] 0.1195295 4.1214907 4.4873207
##
## $P
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
##
## $P.adjusted
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
##
## $comparisons
## [1] "manag - prof" "manag - sales" "prof - sales"
# Run eta-squared for omnibus effect size
H <- kwtest$statistic
k <- length(unique(anovadata$sector))
N <- nrow(anovadata)
eta_squared <- (H - k + 1) / (N - k)
# Rename the output explicitly
names(eta_squared) <- "eta_squared"
# Print the result
eta_squared
## eta_squared
## 0.1052966
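# Worked check with the values reported above: (22.533 - 3 + 1) / (198 - 3) = 20.533 / 195,
# roughly 0.105. rstatix provides the same statistic directly via kruskal_effsize():
rstatix::kruskal_effsize(anovadata, wage ~ sector)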
# Rank-based effect size r for the pairwise comparisons
# Z-values from Dunn's test results
z_manag_sales <- 4.1215
z_prof_sales <- 4.4873
# Calculate observations
n <- nrow(anovadata) # Total number of observations
# Effect size formula: r = z / sqrt(n)
r_manag_sales <- z_manag_sales / sqrt(n)
r_prof_sales <- z_prof_sales / sqrt(n)
# Print effect sizes
r_manag_sales
## [1] 0.2929023
r_prof_sales
## [1] 0.3188985