Project History and Acknowledgments
This project originated from graduate coursework in Educational Data Analysis at Ohio University between 2009 and 2011, initially implemented in both SPSS and R. The work has since been fully translated to R and modernized through AI-assisted refinements that improve code efficiency, methodological clarity, and analytical accuracy. The project demonstrates advanced techniques for educational data analysis and predictive analytics in institutional research contexts.
The project maintains its educational value through comprehensive documentation that preserves both the final solutions and the developmental journey. These records capture not only successful methodologies but also the analytical reasoning and challenges encountered throughout the process. By bridging rigorous statistical theory with practical applications, the work serves as a valuable resource that connects academic foundations to the real-world needs of education researchers.
Data Overview
A typical school district collects annual standardized test scores along with student background surveys, which include information about test preparation course participation. District administrators use this aggregated data to identify performance trends and allocate resources like free lunch programs or targeted tutoring services for underperforming student groups. Following analysis, the dataset is anonymized and made available for educational research.
Simulated Data Function
The R function simulated_data() generates a synthetic dataset replicating this real-world scenario while protecting student privacy. It produces artificial records containing demographic characteristics, academic performance metrics, and Likert-scale survey responses.
Key Features:
- Realistic demographic distributions
- Correlated academic performance metrics
- Validated Likert-scale survey items
- Controlled missing data mechanisms
- Proper factor variable handling
The simulation carefully models realistic distributions and correlations between variables, creating a practical alternative to confidential student data for research and instructional purposes.
# =================================
# Student Data Simulation Script
# =================================
#
# Description: This script generates synthetic student data
# including demographic characteristics, academic performance
# metrics, and Likert-scale survey responses.
#
# Features:
# - Realistic demographic distributions
# - Correlated academic performance metrics
# - Validated Likert-scale survey items
# - Controlled missing data mechanisms
# - Proper factor variable handling
#
# Dependencies:
# - MASS package for multivariate normal generation
# =========================================
simulated_data <- function(n = 1000,
gender_prob = c(0.55, 0.45),
race_prob = c(0.2, 0.3, 0.4, 0.1),
lunch_prob = c(0.35, 0.35, 0.30),
test_prep_prob = c(0.7, 0.3),
missing_prob = 0.05,
seed = 1234,
save_csv = TRUE,
file_path = "student_data.csv") {
# Load required packages
if (!requireNamespace("MASS", quietly = TRUE)) {
install.packages("MASS")
}
library(MASS)
# Set random seed for reproducibility
set.seed(seed)
# Input validation
stopifnot(
is.numeric(n), length(n) == 1, n > 0,
# probability vectors must sum to 1 (checked with a tolerance to avoid
# floating-point equality failures)
length(gender_prob) == 2, abs(sum(gender_prob) - 1) < 1e-8,
length(race_prob) == 4, abs(sum(race_prob) - 1) < 1e-8,
length(lunch_prob) == 3, abs(sum(lunch_prob) - 1) < 1e-8,
length(test_prep_prob) == 2, abs(sum(test_prep_prob) - 1) < 1e-8,
is.numeric(missing_prob), missing_prob >= 0, missing_prob <= 0.2
)
# Create base data frame with demographic variables
student_data <- data.frame(
id = 1:n,
gender = factor(sample(c("female", "male"), n, replace = TRUE,
prob = gender_prob)),
race = factor(sample(c("group A", "group B", "group C", "group D"),
n, replace = TRUE, prob = race_prob)),
parental_education = factor(sample(c("some high school", "high school", "some college", "associate's degree", "bachelor's degree", "master's degree"),
n, replace = TRUE, prob = c(0.15, 0.2, 0.25, 0.15, 0.15, 0.1)),
levels = c("some high school", "high school", "some college",
"associate's degree", "bachelor's degree",
"master's degree"), ordered = TRUE),
lunch = factor(sample(c("standard", "free", "reduced"),
n, replace = TRUE, prob = lunch_prob)),
test_prep = factor(sample(c("none", "completed"),
n, replace = TRUE, prob = test_prep_prob))
)
# Generate correlated academic scores
sigma_matrix <- matrix(
c(225, 100, 110, 7.5, # Math (SD = 15)
100, 144, 90, 6.0, # Reading (SD = 12)
110, 90, 196, 7.0, # Writing (SD = 14)
7.5, 6.0, 7.0, 0.25), # GPA (SD = 0.5)
nrow = 4)
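# Note: the GPA covariances above (7.5, 6.0, 7.0) imply correlations of
# exactly 1.0 with each test score (e.g., 7.5 / (15 * 0.5) = 1), so this
# matrix is not positive definite as written; the nearPD() adjustment
# below shrinks these to admissible values.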
# Ensure positive definite covariance matrix
if(!all(eigen(sigma_matrix)$values > 0)) {
if(requireNamespace("Matrix", quietly = TRUE)) {
sigma_matrix <- as.matrix(Matrix::nearPD(sigma_matrix)$mat)
message("Adjusted covariance matrix to be positive definite")
} else {
stop("Covariance matrix is not positive definite. Install package 'Matrix' to fix automatically.")
}
}
# Generate academic scores
scores <- MASS::mvrnorm(
n,
mu = c(65, 70, 68, 2.5),
Sigma = sigma_matrix
)
# Add scores with realistic bounds
student_data <- cbind(student_data, data.frame(
math_score = round(pmax(0, pmin(100, scores[, 1]))),
reading_score = round(pmax(0, pmin(100, scores[, 2]))),
writing_score = round(pmax(0, pmin(100, scores[, 3]))),
gpa = round(pmax(0, pmin(4, scores[, 4])), 2)
))
# Survey items configuration
questions <- c("C1", "C2", "C3",
"A1", "A2", "A3",
"P1", "P2", "P3",
"B1", "B2", "B3",
"V1", "V2", "V3",
"E1", "E2", "E3",
"S1", "S2", "S3")
constructs <- c("Cognitive", "Affective", "Physiological",
"Behavioral", "Value", "Self_Efficacy", "Social")
# Polarity vector for reverse coding
polarity_vector <- c(
C1 = 1, C2 = -1, C3 = -1, # Cognitive
A1 = -1, A2 = 1, A3 = -1, # Affective
P1 = 1, P2 = -1, P3 = 1, # Physiological
B1 = 1, B2 = -1, B3 = 1, # Behavioral
V1 = -1, V2 = 1, V3 = 1, # Value
E1 = -1, E2 = -1, E3 = 1, # Self-Efficacy
S1 = -1, S2 = -1, S3 = 1 # Social
)
# Generate base scores for constructs
base_scores <- MASS::mvrnorm(
n,
mu = rep(0, 7), # Neutral base scores
Sigma = matrix(c(
1.0, 0.6, 0.5, 0.4, 0.3, 0.4, 0.2,
0.6, 1.0, 0.4, 0.5, 0.2, 0.3, 0.3,
0.5, 0.4, 1.0, 0.3, 0.2, 0.2, 0.1,
0.4, 0.5, 0.3, 1.0, 0.4, 0.3, 0.2,
0.3, 0.2, 0.2, 0.4, 1.0, 0.5, 0.3,
0.4, 0.3, 0.2, 0.3, 0.5, 1.0, 0.4,
0.2, 0.3, 0.1, 0.2, 0.3, 0.4, 1.0),
nrow = 7)
)
# Generate Likert-scale items
likert_items <- matrix(nrow = n, ncol = length(questions))
for(i in seq_along(questions)) {
construct_idx <- ceiling(i/3)
raw_score <- base_scores[, construct_idx] + rnorm(n, 0, 0.7)
# Use question name to check polarity
if(polarity_vector[questions[i]] == -1) raw_score <- -raw_score
likert_items[, i] <- pmin(pmax(round(3 + raw_score), 1), 5)
}
colnames(likert_items) <- questions
likert_items <- as.data.frame(likert_items)
# Function to introduce missing data
introduce_missing <- function(x, prob) {
if(prob <= 0 || all(is.na(x))) return(x)
na_indices <- sample(length(x), size = min(length(x), round(length(x) * prob)))
x[na_indices] <- NA
return(x)
}
# Apply missing data patterns
numeric_cols <- c("math_score", "reading_score", "writing_score", "gpa")
student_data[numeric_cols] <- lapply(student_data[numeric_cols],
function(x) introduce_missing(x, missing_prob * 0.4))
likert_items <- as.data.frame(lapply(likert_items,
function(x) introduce_missing(x, missing_prob * 0.3)))
# Combine all data
student_data <- cbind(student_data, likert_items)
# Add missingness to categorical variables
cat_cols <- c("parental_education", "lunch", "test_prep")
student_data[cat_cols] <- lapply(student_data[cat_cols],
function(x) introduce_missing(x, missing_prob * 0.5))
# Add MAR pattern (Missing At Random)
low_gpa <- student_data$gpa < quantile(student_data$gpa, 0.25, na.rm = TRUE)
low_gpa[is.na(low_gpa)] <- FALSE
valid_math <- !is.na(student_data$math_score)
to_missing <- low_gpa & valid_math & (runif(n) < missing_prob * 2)
student_data$math_score[to_missing] <- NA
# Final cleanup
student_data <- student_data[, colSums(!is.na(student_data)) > 0]
# Save to CSV if requested
if(save_csv) {
tryCatch({
write.csv(student_data, file_path, row.names = FALSE)
message("Data saved to ", normalizePath(file_path))
}, error = function(e) {
warning("Failed to save data: ", e$message)
})
}
return(student_data)
}
# Save function to file
dump("simulated_data", file = "data_simulation.R")
Structure of the simulated data:
# Generate and examine the data
student_data <- simulated_data()
str(student_data)
## 'data.frame': 1000 obs. of 31 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : Factor w/ 2 levels "female","male": 1 2 2 2 2 2 1 1 2 1 ...
## $ race : Factor w/ 4 levels "group A","group B",..: 1 2 3 3 1 3 2 3 4 4 ...
## $ parental_education: Ord.factor w/ 6 levels "some high school"<..: 3 3 5 1 3 5 5 3 4 2 ...
## $ lunch : Factor w/ 3 levels "free","reduced",..: NA 1 1 1 1 1 1 2 3 2 ...
## $ test_prep : Factor w/ 2 levels "completed","none": 2 1 2 2 2 1 1 1 2 2 ...
## $ math_score : num 38 69 44 64 45 38 48 68 53 82 ...
## $ reading_score : num 48 69 69 54 44 64 63 60 71 82 ...
## $ writing_score : num 70 83 48 73 41 48 68 83 89 71 ...
## $ gpa : num 1.68 2.81 1.78 2.25 1.18 1.6 2.09 2.62 2.69 3.05 ...
## $ C1 : num 2 4 2 3 3 5 2 4 4 4 ...
## $ C2 : num 5 3 3 3 4 2 4 2 2 1 ...
## $ C3 : num 3 2 3 4 3 2 4 3 2 3 ...
## $ A1 : num 3 1 4 4 4 1 4 4 3 3 ...
## $ A2 : num 3 5 2 3 3 3 2 2 2 4 ...
## $ A3 : num 4 1 5 3 4 1 3 3 3 NA ...
## $ P1 : num 4 4 2 3 3 5 4 2 1 3 ...
## $ P2 : num 3 1 5 2 4 1 2 5 4 3 ...
## $ P3 : num 3 3 3 4 1 4 4 2 3 2 ...
## $ B1 : num 4 5 3 4 2 3 2 2 2 5 ...
## $ B2 : num 4 1 3 2 4 1 4 4 4 1 ...
## $ B3 : num 3 5 4 4 2 4 3 4 2 5 ...
## $ V1 : num 3 3 NA 3 4 1 3 3 1 3 ...
## $ V2 : num 2 4 4 3 3 4 4 2 5 4 ...
## $ V3 : num 3 5 4 2 4 5 4 2 4 3 ...
## $ E1 : num 4 3 4 5 4 1 4 4 2 2 ...
## $ E2 : num 4 3 4 4 4 1 3 4 2 2 ...
## $ E3 : num 3 3 3 2 1 5 2 2 4 3 ...
## $ S1 : num 4 4 3 2 5 2 2 5 3 3 ...
## $ S2 : num 4 5 5 2 4 4 4 3 2 3 ...
## $ S3 : num 2 1 1 3 2 4 4 2 4 3 ...
Cleaning and Screening
In applied research, handling missing data and detecting outliers are both essential for valid results, but they should not occur at the same stage. Missing data mechanisms—whether completely at random (MCAR), at random (MAR), or not at random (MNAR)—affect estimates of means, variances, correlations, and covariance structures such as Mahalanobis distance (Enders, 2010; Little & Rubin, 2019a). Outlier detection, in turn, depends on these very estimates to identify cases that deviate substantially from the data distribution.
Best practice is to address missingness first, ensuring that the dataset structure is preserved, and only then conduct outlier analysis (Osborne, 2013). This sequence reduces the risk of misclassifying cases as outliers due to unstable or biased estimates from unhandled missing values (Tabachnick & Fidell, 2019a).
Key Points
Recommended Workflow:
1. Quantify the extent and pattern of missingness.
2. Diagnose the likely missing data mechanism (MCAR, MAR, MNAR).
3. Handle missing values (imputation, maximum likelihood, or deletion).
4. Only then screen for univariate and multivariate outliers.
Proper handling of missing data depends first on the overall extent of missingness:
- < 5%: Generally acceptable for most imputation methods with minimal bias (Schafer & Graham, 2002).
- 5-15%: Potentially problematic; requires careful imputation (Bennett, 2001).
- > 15%: May need specialized techniques or acknowledgment of limitations (Enders, 2010; Graham, 2009; Peugh & Enders, 2004).
Missing data mechanisms describe why data are missing and are crucial for determining the appropriate handling approach. There are three primary mechanisms:
| Type | Definition | Example | Analysis Impact |
|---|---|---|---|
| MCAR | Missingness is completely random | Random data loss | Unbiased estimates (if using complete cases) |
| MAR | Missingness depends on observed data | Men less likely to report income | Requires methods like multiple imputation |
| MNAR | Missingness depends on unobserved data | People with high incomes do not report income | More complex; requires modeling the missingness mechanism |
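To make these mechanisms concrete, the sketch below imposes each one on a small synthetic dataset. This is a minimal illustration, not part of the project pipeline; the income and age variables and the missingness rates are assumptions chosen for clarity.
# Illustrating MCAR, MAR, and MNAR on a toy dataset
set.seed(42)
toy <- data.frame(income = rnorm(500, 50, 10),
                  age = rnorm(500, 40, 12))
# MCAR: every value has the same 10% chance of being missing
mcar <- toy
mcar$income[runif(500) < 0.10] <- NA
# MAR: missingness in income depends only on the observed age
mar <- toy
p_mar <- plogis(as.numeric(scale(mar$age)))  # older cases more likely missing
mar$income[runif(500) < 0.2 * p_mar] <- NA
# MNAR: missingness in income depends on income itself (unobserved)
mnar <- toy
p_mnar <- plogis(as.numeric(scale(mnar$income)))
mnar$income[runif(500) < 0.2 * p_mnar] <- NA
# Proportion missing under each mechanism
sapply(list(MCAR = mcar, MAR = mar, MNAR = mnar),
       function(d) mean(is.na(d$income)))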
We begin by examining the fundamental structure of our dataset:
# Install (if needed) and load required packages
required_packages <- c("naniar", "visdat", "dplyr", "tidyr", "ggplot2", "tibble",
"mice")
for (package in required_packages) {
if (!require(package, character.only = TRUE)) {
install.packages(package)
library(package, character.only = TRUE)
}
}
# Optional: Set a clean default plot theme
theme_set(theme_minimal())
# A. Function to summarize missing values per column
get_missing_summary <- function(data) {
if (!is.data.frame(data)) {
stop("Input must be a data frame.")
}
tryCatch({
data %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
tidyr::pivot_longer(cols = everything(), names_to = "variable", values_to = "missing_count") %>%
mutate(missing_pct = round(missing_count/nrow(data) * 100, 1), variable = factor(variable,
levels = variable[order(-missing_count)])) %>%
arrange(desc(missing_count))
}, error = function(e) {
message("Error in missing data summary: ", e$message)
return(NULL)
})
}
# B. Function to extract rows with missing values
get_na_rows <- function(data) {
if (!is.data.frame(data)) {
stop("Input must be a data frame.")
}
tryCatch({
data_with_ids <- data %>%
tibble::rowid_to_column("row_id")
na_rows <- data_with_ids[!complete.cases(data_with_ids), ]
if (nrow(na_rows) == 0) {
message("No missing values found in the data.")
return(NULL)
}
return(na_rows)
}, error = function(e) {
message("Error finding NA rows: ", e$message)
return(NULL)
})
}
# C. MAIN ANALYSIS BLOCK
if (exists("student_data")) {
# 1. Summary table
missing_summary <- get_missing_summary(student_data)
if (!is.null(missing_summary)) {
cat("\nMissing Data Summary:\n")
print(missing_summary)
# 2. Visualize missing data pattern
print(vis_miss(student_data, cluster = TRUE) + theme(axis.text.x = element_text(angle = 45,
hjust = 1, size = 8), plot.title = element_text(face = "bold")) + labs(title = "Missing Data Patterns in Simulated Student Data",
subtitle = "Clustered by similarity of missingness patterns") + scale_fill_manual(values = c("grey90",
"#E41A1C"), labels = c("Present", "Missing")))
# 3. Optional: Show raw NA counts per variable
cat("\nRaw NA Count per Variable:\n")
print(student_data %>%
summarise(across(everything(), ~sum(is.na(.)))))
# 4. Extract and preview NA rows
na_rows <- get_na_rows(student_data)
if (!is.null(na_rows)) {
cat(paste0("\nRows with missing values: ", nrow(na_rows), " (", round(nrow(na_rows)/nrow(student_data) *
100, 1), "%)\n"))
cat("\nFirst 10 rows with missing values:\n")
print(head(na_rows, 10))
# 5. Visualize missing combinations
if ("gg_miss_upset" %in% ls("package:naniar") && nrow(na_rows) > 1) {
print(gg_miss_upset(student_data))
}
}
}
} else {
message("Error: 'student_data' object not found. Please load your data first.")
}
##
## Missing Data Summary:
## # A tibble: 31 × 3
## variable missing_count missing_pct
## <fct> <int> <dbl>
## 1 math_score 34 3.4
## 2 parental_education 25 2.5
## 3 lunch 25 2.5
## 4 test_prep 25 2.5
## 5 reading_score 20 2
## 6 writing_score 20 2
## 7 gpa 20 2
## 8 C1 15 1.5
## 9 C2 15 1.5
## 10 C3 15 1.5
## # ℹ 21 more rows
##
## Raw NA Count per Variable:
## id gender race parental_education lunch test_prep math_score reading_score
## 1 0 0 0 25 25 25 34 20
## writing_score gpa C1 C2 C3 A1 A2 A3 P1 P2 P3 B1 B2 B3 V1 V2 V3 E1 E2 E3 S1 S2
## 1 20 20 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15
## S3
## 1 15
##
## Rows with missing values: 389 (38.9%)
##
## First 10 rows with missing values:
## row_id id gender race parental_education lunch test_prep math_score
## 1 1 1 female group A some college <NA> none 38
## 3 3 3 male group C bachelor's degree free none 44
## 10 10 10 female group D high school reduced none 82
## 11 11 11 male group B some college standard none 71
## 13 13 13 female group C associate's degree standard none 48
## 16 16 16 male group C high school standard completed 34
## 18 18 18 female group B bachelor's degree reduced none 55
## 20 20 20 female group A high school <NA> completed 74
## 21 21 21 female group C <NA> standard none 40
## 22 22 22 female group B associate's degree reduced completed NA
## reading_score writing_score gpa C1 C2 C3 A1 A2 A3 P1 P2 P3 B1 B2 B3 V1 V2
## 1 48 70 1.68 2 5 3 3 3 4 4 3 3 4 4 3 3 2
## 3 69 48 1.78 2 3 3 4 2 5 2 5 3 3 3 4 NA 4
## 10 82 71 3.05 4 1 3 3 4 NA 3 3 2 5 1 5 3 4
## 11 82 66 2.80 2 3 3 5 1 3 3 3 4 NA 3 4 3 2
## 13 75 50 2.01 3 3 2 3 3 2 NA 3 4 3 2 2 3 4
## 16 49 59 1.45 NA 3 2 4 3 NA 3 1 3 3 5 2 2 4
## 18 78 65 2.44 4 4 4 2 4 3 4 3 3 1 NA 2 4 3
## 20 66 67 2.55 1 4 5 4 2 4 2 5 2 1 5 1 4 3
## 21 51 62 1.64 3 2 1 2 3 1 3 3 2 3 1 4 3 3
## 22 65 53 1.49 1 5 5 5 1 4 3 3 2 3 NA 2 5 3
## V3 E1 E2 E3 S1 S2 S3
## 1 3 4 4 3 4 4 2
## 3 4 4 4 3 3 5 1
## 10 3 2 2 3 3 3 3
## 11 2 3 4 3 5 4 2
## 13 4 2 3 2 3 3 1
## 16 3 3 1 4 1 1 4
## 18 3 2 3 3 2 2 4
## 20 1 4 3 2 5 4 1
## 21 3 2 4 3 3 3 5
## 22 2 3 3 3 5 4 2
Interpretation Guide: Missing Data Visualizations
This section provides guidance on interpreting the two main plots used to diagnose missingness: vis_miss() and gg_miss_upset().
1. vis_miss() — Missingness Heatmap
What it shows: a cell-level map of the data frame in which rows are observations and columns are variables, with each cell shaded by whether the value is present or missing; with cluster = TRUE (as used above), rows are reordered so similar missingness patterns sit together.
How to interpret:
| Visual Pattern | Interpretation |
|---|---|
| Vertical red bars | Entire variables are mostly or fully missing |
| Horizontal red streaks | Specific rows have systematic missingness |
| Scattered red cells | Likely missing completely at random (MCAR) |
| Clustered red areas (blocks) | Suggests structured or related missingness |
Use cases: a first-pass scan of overall missingness, spotting variables or row blocks with concentrated missingness, and judging whether patterns look random or structured.
2. gg_miss_upset() — Missingness Pattern Combinations
What it shows: an UpSet plot in which each bar counts the rows sharing a particular combination of jointly missing variables, ordered by frequency.
How to interpret:
| Visual Feature | Interpretation |
|---|---|
| High bar for a single variable | That variable is often missing alone |
| High bar for multiple variables | Co-missing variables—possibly related |
| Many low bars with random combos | Missingness may be unstructured or random |
| Few dominant bars with big counts | Missingness is patterned and non-random |
Use cases: identifying which variables tend to be missing together, prioritizing variables for the imputation model, and checking whether a few dominant patterns drive most of the missingness.
Example
If vis_miss() shows missingness concentrated in GPA, Math_Score, and Reading_Score within particular blocks of rows (e.g., around row 250 or the last 100 rows), and gg_miss_upset() shows these three variables frequently missing together, then you might conclude:
“Missingness is not random, likely related to a subgroup or skipped section. Consider conditional imputation or further investigation.”
Little’s MCAR Test
Many statistical analyses require the missing completely at random (MCAR) assumption, where missingness is independent of both observed and unobserved data (Little, 1988). While maximum likelihood estimation (MLE) only requires the weaker missing at random (MAR) assumption under multivariate normality (Orchard & Woodbury, 1972), verifying MCAR remains crucial for complete-case analyses and certain modeling approaches.
Test Statistic
Little’s test evaluates MCAR by comparing subgroup means across missing data patterns. The test statistic is:
\[ d^2 = \sum_{j=1}^J n_j \left( \hat{\mu}_j - \hat{\mu}_j^{(ML)} \right)^T \hat{\Sigma}_j^{-1} \left( \hat{\mu}_j - \hat{\mu}_j^{(ML)} \right) \] where:
\(J\) = number of distinct missing data patterns,
\(n_j\) = sample size for pattern \(j\),
\(\hat{\mu}_j\) = observed mean vector for pattern \(j\),
\(\hat{\mu}_j^{(ML)}\) = ML estimate of grand mean vector,
\(\hat{\Sigma}_j\) = ML estimate of covariance matrix for pattern \(j\)
Distributional Properties
Under the null hypothesis (MCAR holds), \(d^2\) is asymptotically chi-square distributed with \(\sum_j k_j - p\) degrees of freedom, where \(k_j\) is the number of observed variables in pattern \(j\) and \(p\) is the total number of variables.
Interpretation
A small p-value (e.g., p < .05) indicates that pattern means differ more than chance allows, so MCAR is rejected; a non-significant result fails to reject MCAR but does not prove it.
Practical Considerations
The test is sensitive to sample size (trivial deviations become significant in large samples), assumes multivariate normality, and does not distinguish whether a rejected MCAR mechanism is MAR or MNAR.
Example Application: When testing MCAR in a longitudinal study with dropout, significant results would suggest dropout relates to measured variables, violating MCAR but potentially satisfying MAR.
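To make the statistic concrete before turning to the full implementation, the short sketch below computes \(d^2\) by hand for a toy two-variable dataset with one extra missingness pattern. It uses pairwise-complete estimates in place of the ML grand parameters, the same pragmatic default the MCAR_Test() function below falls back on; the toy data are an assumption for illustration.
# Hand computation of Little's d^2 on a toy dataset (two patterns)
set.seed(1)
toy <- data.frame(x = rnorm(200), y = rnorm(200))
toy$y[sample(200, 40)] <- NA  # second pattern: y missing
mu <- colMeans(toy, na.rm = TRUE)                 # grand mean vector
Sigma <- cov(toy, use = "pairwise.complete.obs")  # grand covariance
d2 <- 0; sum_k <- 0
for (idx in split(seq_len(nrow(toy)), is.na(toy$y))) {
  obs <- which(colSums(is.na(toy[idx, , drop = FALSE])) == 0)  # observed vars
  mdiff <- colMeans(toy[idx, obs, drop = FALSE]) - mu[obs]
  d2 <- d2 + length(idx) * drop(t(mdiff) %*% solve(Sigma[obs, obs]) %*% mdiff)
  sum_k <- sum_k + length(obs)
}
df <- sum_k - ncol(toy)  # here (2 + 1) - 2 = 1
c(d2 = d2, p = pchisq(d2, df, lower.tail = FALSE))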
R Implementation of MCAR Test: The function below implements Little’s MCAR test on a numeric data matrix or data frame with missing values, checking whether the missingness can reasonably be assumed Missing Completely at Random. It computes the test statistic from the differences between observed pattern means and the overall means, weighted by the inverse of the corresponding covariance submatrix, and returns a numerical summary plus an optional plot against the chi-square reference distribution.
# ======================================================================
# Little's MCAR Test (with EM ML, Robust MCD, and diagnostics)
# ----------------------------------------------------------------------
# Provides:
#   - MCAR_Test(): main function
#   - print_mcar_test(): pretty printer
#   - print.mcar_test(): S3 print method (uses pretty printer)
#   - .plot_mcar_test(): chi-square visual (with domain & zoom)
#   - Internal helpers: input validation, pattern handling, safe inverse
# ======================================================================
#' Missingness Pattern Indexer
#' @keywords internal
.missingness_index <- function(df, max_patterns = 10) {
na_mat <- is.na(df)
patt_str <- apply(na_mat, 1, function(r) paste(which(r), collapse = ","))
patt_counts <- sort(table(patt_str), decreasing = TRUE)
top_patterns <- head(patt_counts, max_patterns)
unique_patterns <- unique(na_mat, MARGIN = 1)
row_index_by_pattern <- lapply(seq_len(nrow(unique_patterns)), function(i) {
patt <- unique_patterns[i, ]
which(apply(na_mat, 1, function(r) all(r == patt)))
})
list(na_mat = na_mat, unique_patterns = unique_patterns, row_index_by_pattern = row_index_by_pattern,
pattern_strings = patt_str, pattern_counts = patt_counts, top_patterns = top_patterns)
}
#' Safe Matrix Inverse with Fallbacks
#' @keywords internal
.safe_inverse <- function(S) {
if (!is.matrix(S) || nrow(S) != ncol(S))
stop("S must be a square matrix.", call. = FALSE)
if (any(!is.finite(S)))
stop("Non-finite values in covariance submatrix.", call. = FALSE)
# Enforce symmetry to stabilize numerics
S <- (S + t(S))/2
out <- tryCatch(solve(S), error = function(e) NULL)
if (!is.null(out))
return(out)
# Try nearPD to fix indefiniteness if available
if (requireNamespace("Matrix", quietly = TRUE)) {
S_pd <- as.matrix(Matrix::nearPD(S)$mat)
out2 <- tryCatch(solve(S_pd), error = function(e) NULL)
if (!is.null(out2))
return(out2)
}
# Last resort: pseudoinverse
if (!requireNamespace("MASS", quietly = TRUE)) {
stop("Matrix not invertible; install 'MASS' (for ginv) or 'Matrix' (for nearPD).",
call. = FALSE)
}
MASS::ginv(S)
}
#' Grand Mean and Covariance under Different Estimation Schemes
#' @keywords internal
.grand_params <- function(df, method = c("em", "pairwise", "robust")) {
method <- match.arg(method)
if (method == "em") {
if (!requireNamespace("norm", quietly = TRUE)) {
stop("EM requested but package 'norm' is not available.", call. = FALSE)
}
s <- norm::prelim.norm(as.matrix(df))
ml <- norm::em.norm(s, showits = FALSE)
par <- norm::getparam.norm(s, ml)
return(list(mu = as.numeric(par$mu), Sigma = par$sigma, label = "EM-based ML"))
}
if (method == "robust") {
if (!requireNamespace("robustbase", quietly = TRUE)) {
stop("Robust requested but package 'robustbase' is not available.", call. = FALSE)
}
# Rough single imputation (means) only to feed covMcd
mu_imp <- colMeans(df, na.rm = TRUE)
df_imp <- df
for (j in seq_along(mu_imp)) {
nas <- is.na(df_imp[, j])
if (any(nas))
df_imp[nas, j] <- mu_imp[j]
}
rc <- robustbase::covMcd(df_imp, alpha = 0.75)
return(list(mu = as.numeric(rc$center), Sigma = rc$cov, label = "Robust (MCD)"))
}
# Pairwise-complete (pragmatic; may be indefinite in rare cases)
mu <- colMeans(df, na.rm = TRUE)
Sigma <- stats::cov(df, use = "pairwise.complete.obs")
Sigma <- (Sigma + t(Sigma))/2
list(mu = as.numeric(mu), Sigma = Sigma, label = "Pairwise-complete (not ML)")
}
#' Pretty Printer for MCAR Test Results
#' @export
print_mcar_test <- function(x, ...) {
effect_size <- ifelse(x$df > 0, x$statistic/x$df, NA_real_)
base_decision <- ifelse(grepl("Reject", x$decision), "Reject MCAR", "Fail to reject MCAR")
if (!is.na(effect_size)) {
if (base_decision == "Reject MCAR" && effect_size >= 2) {
decision_text <- "Strong evidence against MCAR (large effect)"
} else if (base_decision == "Reject MCAR") {
decision_text <- "Statistically significant deviation from MCAR"
} else if (effect_size >= 2) {
decision_text <- "Substantial effect despite non-significant p-value"
} else if (effect_size >= 1.5) {
decision_text <- "Moderate effect size - MCAR may not hold"
} else {
decision_text <- "No substantial evidence against MCAR"
}
} else {
decision_text <- base_decision
}
cat("\nLittle's MCAR Test with Missing Pattern Analysis\n")
cat(strrep("=", 69), "\n")
cat(sprintf("%-25s %s\n", "Estimation method:", x$estimation_method))
cat(sprintf("%-25s %s\n", "Small-sample correction:", x$correction))
cat(strrep("-", 69), "\n")
cat("Missing Data Diagnostics:\n")
cat(sprintf(" - Total missing values: %d (%.1f%%)\n", x$diagnostics$total_missing,
x$diagnostics$pct_missing))
worst_var <- x$diagnostics$worst_variable
if (is.na(worst_var)) {
cat(" - Variable with most NAs: none (no missing values)\n")
} else {
cat(sprintf(" - Variable with most NAs: %s (%d missing)\n", worst_var, sum(is.na(x$data[,
worst_var]))))
}
cat("\nTop Missing Patterns:\n")
tp <- x$diagnostics$top_patterns
if (length(tp) == 0) {
cat(" (no missingness patterns)\n")
} else {
for (i in seq_along(tp)) {
patt <- names(tp)[i]
vars <- if (patt == "")
character(0) else colnames(x$data)[as.numeric(strsplit(patt, ",")[[1]])]
patt_desc <- if (length(vars) > 0)
paste(vars, collapse = ", ") else "Complete cases"
cat(sprintf("%2d. %-40s (n = %d)\n", i, patt_desc, tp[i]))
}
}
cat(strrep("-", 69), "\n")
if (!is.na(effect_size)) {
cat(sprintf("%-25s %.3f\n", "Effect size (χ²/df):", effect_size))
} else {
cat("- Effect size undefined (df = 0)\n")
}
cat(strrep("-", 69), "\n")
cat(sprintf("%-25s %.3f (df = %d)\n", "Test statistic:", x$statistic, x$df))
cat(sprintf("%-25s %.4f\n", "P-value:", x$p.value))
cat(sprintf("%-25s %s\n", "Decision:", decision_text))
cat(strrep("=", 69), "\n")
invisible(x)
}
#' S3 print method for class 'mcar_test' (delegates to pretty printer)
#' @export
print.mcar_test <- function(x, ...) {
print_mcar_test(x, ...)
}
#' Chi-square Reference Plot for Little's MCAR Test
#' @keywords internal
.plot_mcar_test <- function(obj, xlim_min = 0, xlim_max = NULL, zoom = FALSE, zoom_factor = 2,
force_show_refs = FALSE) {
if (!is.list(obj) || is.null(obj$df) || obj$df <= 0 || !is.finite(obj$statistic))
return(invisible(NULL))
`%||%` <- function(x, y) if (is.null(x))
y else x
df <- obj$df
alpha <- obj$alpha %||% 0.05
chi2_obs <- as.numeric(obj$statistic)
pval <- obj$p.value
decision <- obj$decision %||% if (!is.na(pval) && pval < alpha)
"Reject MCAR" else "Fail to reject MCAR"
p_txt <- if (is.na(pval))
"NA" else if (pval < 1e-04)
"< 1e-4" else sprintf("= %.4f", pval)
N_val <- obj$N %||% obj$n %||% if (!is.null(obj$data))
nrow(obj$data) else NA_real_
critical <- qchisq(1 - alpha, df)
full_max <- max(critical, chi2_obs, qchisq(0.999, df)) * 1.05
manual_bounds <- (!is.null(xlim_max)) || (xlim_min > 0)
if (isTRUE(zoom)) {
buf <- zoom_factor * sqrt(2 * df)
x_min <- max(0, chi2_obs - buf)
x_max <- chi2_obs + buf
} else if (manual_bounds) {
x_min <- xlim_min
x_max <- if (is.null(xlim_max))
full_max else xlim_max
} else {
x_min <- 0
x_max <- full_max
}
if (isTRUE(force_show_refs) && !manual_bounds && !isTRUE(zoom)) {
x_min <- min(x_min, chi2_obs, critical)
x_max <- max(x_max, chi2_obs, critical)
}
if (x_max <= x_min)
x_max <- x_min + 1
x_vals <- seq(x_min, x_max, length.out = 800)
y_vals <- dchisq(x_vals, df)
col_accept <- adjustcolor("#4CAF50", 0.25)
col_reject <- adjustcolor("#F44336", 0.25)
col_density <- "#303F9F"
col_stat <- "#1976D2"
col_crit <- "#D32F2F"
col_decision <- if (grepl("Reject", decision))
"#8B0000" else "#006400"
.fmt_num <- function(z, d = 3) formatC(z, digits = d, format = "f", big.mark = ",")
subtitle <- sprintf("N = %s | α = %s | view: %s | x ∈ [%s, %s]",
.fmt_num(N_val, 0), .fmt_num(alpha, 3), if (isTRUE(zoom))
sprintf("zoomed (±%g·SD)", zoom_factor) else if (manual_bounds)
"manual" else "full", .fmt_num(x_min, 0), .fmt_num(x_max, 0))
old_par <- par(no.readonly = TRUE)
on.exit(par(old_par), add = TRUE)
par(mar = c(5, 6, 6, 2.5))
plot(NA, NA, xlim = c(x_min, x_max), ylim = c(0, max(y_vals) * 1.28), xlab = expression(chi^2),
ylab = "Density", xaxt = "n", yaxt = "n", xaxs = "i", yaxs = "i", main = "Little's MCAR Test — Chi-square Reference")
mtext(subtitle, side = 3, line = 1, cex = 0.95, col = "#1976D2")
x_at <- pretty(c(x_min, x_max), n = 6)
axis(1, at = x_at, labels = .fmt_num(x_at, 0))
y_at <- pretty(c(0, max(y_vals)), n = 5)
axis(2, at = y_at, las = 1, labels = .fmt_num(y_at, 3))
abline(v = x_at, col = adjustcolor("#000000", 0.06), lwd = 1)
abline(h = y_at, col = adjustcolor("#000000", 0.06), lwd = 1)
xa <- x_vals[x_vals <= critical]
if (length(xa))
polygon(c(min(xa), xa, max(xa)), c(0, dchisq(xa, df), 0), col = col_accept,
border = NA)
xr <- x_vals[x_vals >= critical]
if (length(xr))
polygon(c(min(xr), xr, max(xr)), c(0, dchisq(xr, df), 0), col = col_reject,
border = NA)
lines(x_vals, y_vals, lwd = 2.2, col = col_density)
if (critical >= x_min && critical <= x_max)
abline(v = critical, col = col_crit, lty = 2, lwd = 2)
if (chi2_obs >= x_min && chi2_obs <= x_max)
abline(v = chi2_obs, col = col_stat, lwd = 2.4)
if (chi2_obs >= x_min && chi2_obs <= x_max) {
mid_y <- max(y_vals) * 0.5
x_off <- 0.02 * (x_max - x_min)
text(chi2_obs - x_off, mid_y, labels = bquote(chi^2 * "(" * .(df) * ")" ==
.(round(chi2_obs, 2)) ~ "," ~ p ~ .(p_txt)), col = col_decision, font = 2,
cex = 1.2, srt = 90, adj = 0.5, xpd = NA)
text(chi2_obs + x_off, mid_y, labels = paste("Decision:", decision), col = col_decision,
font = 2, cex = 1.2, srt = 90, adj = 0.5, xpd = NA)
}
if (chi2_obs >= x_min && chi2_obs <= x_max)
axis(1, at = chi2_obs, labels = round(chi2_obs, 2), col.axis = col_decision,
tck = -0.02, lwd = 1.2)
if (critical >= x_min && critical <= x_max)
axis(1, at = critical, labels = round(critical, 2), col.axis = "#D32F2F",
col.ticks = "#D32F2F", tck = -0.02, lwd = 1.2)
legend("topright", inset = c(0.01, 0.02), bty = "n", cex = 0.9, legend = c("Accept region",
"Reject region", "Chi-square density", "Critical value", "Observed χ² (line)"),
fill = c(adjustcolor("#4CAF50", 0.25), adjustcolor("#F44336", 0.25), NA,
NA, NA), border = c(NA, NA, NA, NA, NA), lty = c(NA, NA, 1, 2, 1), col = c(NA,
NA, "#303F9F", "#D32F2F", "#1976D2"), lwd = c(NA, NA, 2.2, 2, 2.4), pch = c(NA,
NA, NA, NA, NA))
invisible(NULL)
}
# ======================================================================
# Helper: prepare numeric columns (coerce / drop / keep) with warnings
# ======================================================================
#' Prepare numeric data for MCAR test
#'
#' @param x data.frame or matrix
#' @param mode one of c('coerce','drop','keep')
#' - 'coerce': factor/character -> numeric codes (warn)
#' - 'drop': keep only numeric columns (warn about drops)
#' - 'keep': return as-is (errors later if non-numeric present)
#' @return list(df=prepared_df, info=list(coerced=chr, dropped=chr, kept=chr))
#' @keywords internal
.prepare_numeric <- function(x, mode = c("coerce", "drop", "keep")) {
mode <- match.arg(mode)
df <- as.data.frame(x)
is_num <- vapply(df, is.numeric, logical(1))
is_fac <- vapply(df, is.factor, logical(1))
is_chr <- vapply(df, is.character, logical(1))
coerced <- dropped <- character(0)
if (mode == "coerce") {
targets <- names(df)[is_fac | is_chr]
if (length(targets)) {
warning(sprintf("Coercing non-numeric columns to numeric via factor codes: %s",
paste(targets, collapse = ", ")), call. = FALSE)
df[targets] <- lapply(df[targets], function(col) as.numeric(as.factor(col)))
coerced <- targets
}
} else if (mode == "drop") {
keep <- names(df)[is_num]
drop <- setdiff(names(df), keep)
if (length(drop)) {
warning(sprintf("Dropping non-numeric columns: %s", paste(drop, collapse = ", ")),
call. = FALSE)
df <- df[keep]
dropped <- drop
}
} else {
# 'keep': leave columns unchanged; numeric validation occurs downstream
}
kept <- names(df)
list(df = df, info = list(coerced = coerced, dropped = dropped, kept = kept))
}
#' Coerce Non-Numeric Columns with Warning
#'
#' Converts non-numeric columns to numeric via factor coding (levels → 1..K).
#' This preserves shape but changes meaning, so a clear warning is issued.
#'
#' @param df A data.frame or matrix with potential non-numeric columns.
#' @return A data.frame with all columns numeric.
#' @keywords internal
.coerce_numeric_warn <- function(df) {
out <- .prepare_numeric(df, mode = "coerce")
out$df
}
# ======================================================================
# MCAR Test (updated signature + preprocessing)
# ======================================================================
#' Little's MCAR Test with Pattern Diagnostics
#' @export
MCAR_Test <- function(x, alpha = 0.05, plot = TRUE, verbose = TRUE, use_EM = FALSE,
robust = FALSE, bartlett = TRUE, max_patterns = 10, xlim_min = 0, xlim_max = NULL,
zoom = FALSE, zoom_factor = 2, numeric_mode = c("coerce", "drop", "keep")) {
numeric_mode <- match.arg(numeric_mode)
if (!is.matrix(x) && !is.data.frame(x))
stop("Input must be a matrix or data frame.")
prep <- .prepare_numeric(x, mode = numeric_mode)
x_df <- prep$df
# Validate post-prep
if (!ncol(x_df)) {
stop("No numeric columns remain after preprocessing. Use numeric_mode='coerce' or pass numeric data.",
call. = FALSE)
}
if (!all(vapply(x_df, is.numeric, logical(1)))) {
stop("Non-numeric columns remain; set numeric_mode = 'coerce' or 'drop'.",
call. = FALSE)
}
# Drop rows that are entirely missing
x_df <- x_df[rowSums(!is.na(x_df)) > 0, , drop = FALSE]
if (nrow(x_df) == 0)
stop("No non-missing rows found.")
p <- ncol(x_df)
miss <- .missingness_index(x_df, max_patterns = max_patterns)
unique_patterns <- miss$unique_patterns
row_index_by_pattern <- miss$row_index_by_pattern
n_patterns <- nrow(unique_patterns)
# Choose estimator
if (robust) {
gp <- .grand_params(x_df, method = "robust")
} else if (use_EM) {
gp <- .grand_params(x_df, method = "em")
} else {
gp <- .grand_params(x_df, method = "pairwise")
}
grand_means <- gp$mu
grand_cov <- gp$Sigma
estimation_label <- gp$label
# Little's d^2
d2_total <- 0
sum_k <- 0
for (i in seq_len(n_patterns)) {
patt <- unique_patterns[i, ] # TRUE where NA
idx <- row_index_by_pattern[[i]]
obs_vars <- which(!patt)
n_j <- length(idx)
if (length(obs_vars) == 0 || n_j == 0)
next
patt_means <- colMeans(x_df[idx, obs_vars, drop = FALSE], na.rm = TRUE)
Sigma_sub <- grand_cov[obs_vars, obs_vars, drop = FALSE]
if (bartlett && estimation_label == "Pairwise-complete (not ML)" && n_j >
1) {
Sigma_sub <- ((n_j - 1)/n_j) * Sigma_sub
}
inv_Sigma_sub <- .safe_inverse(Sigma_sub)
mean_diff <- patt_means - grand_means[obs_vars]
quad <- as.numeric(t(mean_diff) %*% inv_Sigma_sub %*% mean_diff)
d2_total <- d2_total + n_j * quad
sum_k <- sum_k + length(obs_vars)
}
df <- sum_k - p
chi2 <- as.numeric(d2_total)
p_value <- if (df > 0)
pchisq(chi2, df, lower.tail = FALSE) else NA_real_
decision <- if (!is.na(p_value) && p_value < alpha)
"Reject MCAR" else "Fail to reject MCAR"
na_counts <- colSums(is.na(x_df))
worst_variable <- if (sum(na_counts) == 0)
NA_character_ else names(which.max(na_counts))
diagnostics <- list(total_missing = sum(is.na(x_df)), pct_missing = mean(is.na(x_df)) *
100, worst_variable = worst_variable, top_patterns = miss$top_patterns, pattern_matrix = unique_patterns)
res <- structure(list(statistic = chi2, df = df, p.value = p_value, alpha = alpha,
decision = decision, missing_patterns = n_patterns, estimation_method = estimation_label,
correction = ifelse(bartlett && estimation_label == "Pairwise-complete (not ML)",
"Bartlett", "None"), diagnostics = diagnostics, data = x_df, N = nrow(x_df),
preprocess = list(mode = numeric_mode, coerced = prep$info$coerced, dropped = prep$info$dropped,
kept = prep$info$kept)), class = "mcar_test")
if (plot && df > 0 && is.finite(chi2)) {
.plot_mcar_test(res, xlim_min = xlim_min, xlim_max = xlim_max, zoom = zoom,
zoom_factor = zoom_factor)
}
if (verbose)
print(res)
invisible(res)
}
#' Dump All MCAR-Related Functions to a File
#' @keywords internal
dump(c("MCAR_Test", "print_mcar_test", "print.mcar_test", ".plot_mcar_test", ".coerce_numeric_warn",
".prepare_numeric", ".missingness_index", ".safe_inverse", ".grand_params"),
file = "MCAR_Test.R")
Little’s MCAR Test Interpretation Guide
Step 1. Statistical significance (p-value): if p < α (default .05), reject MCAR; the missingness pattern is associated with the data. If p ≥ α, MCAR is not rejected (though not proven).
Step 2. Effect size check (χ² / df): because the test is sensitive to sample size, also inspect the χ²/df ratio. Values near 1 are consistent with MCAR; values of roughly 1.5 or more suggest a meaningful deviation, and 2 or more a substantial one.
Decision Guidelines
Combine the two: a significant p-value with χ²/df ≥ 2 is strong evidence against MCAR; a significant p-value with a ratio near 1 indicates a statistically detectable but small deviation; a non-significant result with a small ratio gives no substantial evidence against MCAR. This is the logic print_mcar_test() uses for its decision text.
Interpreting Beyond MCAR
Rejecting MCAR does not imply that the data are necessarily Missing Not At Random (MNAR). Instead, the next most common and practical assumption is Missing At Random (MAR) — that is, missingness may depend on observed variables but not directly on the unobserved values themselves. Under MAR, modern methods such as full information maximum likelihood (FIML) or multiple imputation typically yield unbiased estimates.
By contrast, data are Missing Not At Random (MNAR) if the probability of missingness depends on the unobserved value itself (e.g., individuals with higher depression scores are less likely to report their score). MNAR is difficult to verify from data alone and requires strong modeling assumptions, such as selection models, pattern-mixture models, or shared-parameter models.
For this reason, applied analyses generally assume MAR while conducting sensitivity analyses to examine how robust results are if the missingness mechanism were closer to MNAR.
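One practical sensitivity check is delta-adjustment multiple imputation: a fixed shift is added to the imputed values of a key variable to mimic progressively MNAR scenarios, and the substantive estimates are compared across shifts. The sketch below uses the post-processing hook in mice; the focus on math_score, the delta values, and the example model are illustrative assumptions, not results from this project.
# Delta-adjustment sensitivity analysis (sketch)
library(mice)
deltas <- c(0, -2, -5)  # 0 reproduces the MAR analysis
sens_fits <- lapply(deltas, function(delta) {
  ini <- mice(student_data, maxit = 0, printFlag = FALSE)  # dry run for defaults
  post <- ini$post
  # Shift each imputed math_score by delta after imputation
  post["math_score"] <- sprintf("imp[[j]][, i] <- imp[[j]][, i] + (%f)", delta)
  imp <- mice(student_data, m = 5, method = "pmm", post = post,
              printFlag = FALSE, seed = 123)
  pool(with(imp, lm(math_score ~ test_prep + gender)))
})
# Stable coefficients across deltas suggest robustness to mild MNAR
lapply(sens_fits, summary)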
Why a Large Effect Size Counts Against MCAR
Little’s statistic asks whether groups defined by different missing-data patterns show systematic differences in their distributions. If the data are truly MCAR, missingness is completely random, meaning that each group should resemble a random subsample and their means should differ only by chance. When the χ²/df ratio (effect size) is large, however, the group means diverge more than random variation would predict, which constitutes strong evidence against MCAR.
Interpreting the χ²/df Ratio
The χ²/df ratio (sometimes called the normed chi-square in SEM and occasionally borrowed as an effect-size proxy for Little’s test) provides a heuristic gauge of how strongly the data deviate from MCAR:
| χ²/df Ratio | Interpretation |
|---|---|
| ≈ 1.0 | Data fit MCAR closely (differences no bigger than random sampling variation). |
| 1.0 – 2.0 | Small to moderate deviation; often considered acceptable. |
| 2.0 – 3.0 | Noticeable misfit; potential concern. |
| > 3.0 | Strong evidence that group means differ beyond chance → violation of MCAR. |
Note. These thresholds aren’t absolute rules; they come mainly from SEM practice (Carmines & McIver, 1981; Kline, 2016; Wheaton et al., 1977), and should be treated as heuristic cutoffs given that the \(\chi^2\) test is highly sensitive to sample size (Marsh & Hocevar, 1985).
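In practice the ratio can be read directly off the object returned by MCAR_Test(); a one-line sketch with plotting and printing suppressed:
res <- MCAR_Test(student_data, plot = FALSE, verbose = FALSE)
res$statistic / res$df  # normed chi-square; values near 1 favor MCAR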
Critical Lines Legend:
| Line Style | Meaning |
|---|---|
| Red dashed vertical line | Critical value (default α = 0.05 threshold) |
| Blue solid vertical line | Observed test statistic, χ² |
Color-Coded Regions:
| Region | Interpretation |
|---|---|
| Fail-to-reject (green shading) | Data may be MCAR if the observed test statistic (χ²) falls here |
| Rejection (red shading) | Data likely not MCAR if the observed test statistic (χ²) falls here |
# Check missing data mechanism (MCAR test)
MCAR_Test(student_data, xlim_min = 2700, xlim_max = 3300) # p > 0.05 would be consistent with MCAR
##
## Little's MCAR Test with Missing Pattern Analysis
## =====================================================================
## Estimation method: Pairwise-complete (not ML)
## Small-sample correction: Bartlett
## ---------------------------------------------------------------------
## Missing Data Diagnostics:
## - Total missing values: 484 (1.6%)
## - Variable with most NAs: math_score (34 missing)
##
## Top Missing Patterns:
## 1. Complete cases (n = 611)
## 2. math_score (n = 24)
## 3. lunch (n = 22)
## 4. test_prep (n = 20)
## 5. parental_education (n = 16)
## 6. A2 (n = 13)
## 7. C2 (n = 12)
## 8. A3 (n = 12)
## 9. V2 (n = 12)
## 10. S3 (n = 12)
## ---------------------------------------------------------------------
## Effect size (χ²/df): 1.068
## ---------------------------------------------------------------------
## Test statistic: 3203.848 (df = 3000)
## P-value: 0.0049
## Decision: Statistically significant deviation from MCAR
## =====================================================================
Little’s MCAR test indicated significant deviation from missing completely at random, χ²(3000) = 3203.85, p = .005, with a small effect size (χ²/df = 1.07). The analysis revealed 104 missing data patterns, with total missingness comprising 1.6% of values (484 missing values in all). Math scores showed the highest variable-level missingness (n = 34), followed by parental education, lunch status, and test preparation (n = 25 each).
Multiple imputation was performed using predictive mean matching
(m = 5 imputations) via the mice package. Post-imputation
diagnostics confirmed successful convergence and elimination of missing
values, with complete case analyses (n = 611) yielding
substantively similar results to the imputed data.
# Multiple imputation for missing data
library(mice)
# Run MICE without printing iterations
imputed_data <- mice(student_data, m = 5, method = "pmm", printFlag = FALSE) # Suppress iteration output
# Create completed dataset
complete_data <- complete(imputed_data)
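# Note: complete(imputed_data) returns a single filled-in dataset (the first
# imputation). Pooled inference that reflects between-imputation uncertainty
# would instead fit the model within each imputation and combine the results,
# e.g. (illustrative model, not run here):
# fit <- with(imputed_data, lm(math_score ~ gender + test_prep))
# summary(pool(fit))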
# Verify structure and missingness
cat("\nStructure of completed data:\n")
##
## Structure of completed data:
str(complete_data)
## 'data.frame': 1000 obs. of 31 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : Factor w/ 2 levels "female","male": 1 2 2 2 2 2 1 1 2 1 ...
## $ race : Factor w/ 4 levels "group A","group B",..: 1 2 3 3 1 3 2 3 4 4 ...
## $ parental_education: Ord.factor w/ 6 levels "some high school"<..: 3 3 5 1 3 5 5 3 4 2 ...
## $ lunch : Factor w/ 3 levels "free","reduced",..: 3 1 1 1 1 1 1 2 3 2 ...
## $ test_prep : Factor w/ 2 levels "completed","none": 2 1 2 2 2 1 1 1 2 2 ...
## $ math_score : num 38 69 44 64 45 38 48 68 53 82 ...
## $ reading_score : num 48 69 69 54 44 64 63 60 71 82 ...
## $ writing_score : num 70 83 48 73 41 48 68 83 89 71 ...
## $ gpa : num 1.68 2.81 1.78 2.25 1.18 1.6 2.09 2.62 2.69 3.05 ...
## $ C1 : num 2 4 2 3 3 5 2 4 4 4 ...
## $ C2 : num 5 3 3 3 4 2 4 2 2 1 ...
## $ C3 : num 3 2 3 4 3 2 4 3 2 3 ...
## $ A1 : num 3 1 4 4 4 1 4 4 3 3 ...
## $ A2 : num 3 5 2 3 3 3 2 2 2 4 ...
## $ A3 : num 4 1 5 3 4 1 3 3 3 4 ...
## $ P1 : num 4 4 2 3 3 5 4 2 1 3 ...
## $ P2 : num 3 1 5 2 4 1 2 5 4 3 ...
## $ P3 : num 3 3 3 4 1 4 4 2 3 2 ...
## $ B1 : num 4 5 3 4 2 3 2 2 2 5 ...
## $ B2 : num 4 1 3 2 4 1 4 4 4 1 ...
## $ B3 : num 3 5 4 4 2 4 3 4 2 5 ...
## $ V1 : num 3 3 2 3 4 1 3 3 1 3 ...
## $ V2 : num 2 4 4 3 3 4 4 2 5 4 ...
## $ V3 : num 3 5 4 2 4 5 4 2 4 3 ...
## $ E1 : num 4 3 4 5 4 1 4 4 2 2 ...
## $ E2 : num 4 3 4 4 4 1 3 4 2 2 ...
## $ E3 : num 3 3 3 2 1 5 2 2 4 3 ...
## $ S1 : num 4 4 3 2 5 2 2 5 3 3 ...
## $ S2 : num 4 5 5 2 4 4 4 3 2 3 ...
## $ S3 : num 2 1 1 3 2 4 4 2 4 3 ...
cat("\nMissing values check:\n")
##
## Missing values check:
missing_counts <- sapply(complete_data, function(x) sum(is.na(x)))
print(missing_counts)
## id gender race parental_education
## 0 0 0 0
## lunch test_prep math_score reading_score
## 0 0 0 0
## writing_score gpa C1 C2
## 0 0 0 0
## C3 A1 A2 A3
## 0 0 0 0
## P1 P2 P3 B1
## 0 0 0 0
## B2 B3 V1 V2
## 0 0 0 0
## V3 E1 E2 E3
## 0 0 0 0
## S1 S2 S3
## 0 0 0
Alternative Approach: Listwise Deletion
Instead of imputation, we could employ listwise deletion (complete-case analysis), removing all rows with missing values. For this dataset, that removes 389 observations, or 38.9% of the sample (389/1,000).
Key Considerations for Listwise Deletion:
- Data loss: 38.9% of cases are discarded, leaving n = 611.
- Bias risk: Little’s MCAR test rejected MCAR (χ² = 3203.8, p = .0049), so retained cases may differ systematically from dropped ones.
- Power: the reduced sample lowers statistical power for subsequent analyses.
Recommended Reporting:
“Listwise deletion removed 389 cases (38.9% of total sample). Little’s MCAR test indicated the missingness pattern was not random (χ²(3000) = 3203.8, p = .0049), suggesting potential bias in remaining cases. Results should be interpreted with this limitation in mind.”
# Remove rows with any NAs (listwise deletion)
deleted_data <- na.omit(student_data)
str(deleted_data)
## 'data.frame': 611 obs. of 31 variables:
## $ id : int 2 4 5 6 7 8 9 12 14 15 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 2 1 2 1 ...
## $ race : Factor w/ 4 levels "group A","group B",..: 2 3 1 3 2 3 4 1 2 2 ...
## $ parental_education: Ord.factor w/ 6 levels "some high school"<..: 3 1 3 5 5 3 4 6 3 2 ...
## $ lunch : Factor w/ 3 levels "free","reduced",..: 1 1 1 1 1 2 3 2 3 1 ...
## $ test_prep : Factor w/ 2 levels "completed","none": 1 2 2 1 1 1 2 2 2 2 ...
## $ math_score : num 69 64 45 38 48 68 53 75 53 51 ...
## $ reading_score : num 69 54 44 64 63 60 71 67 59 69 ...
## $ writing_score : num 83 73 41 48 68 83 89 74 52 64 ...
## $ gpa : num 2.81 2.25 1.18 1.6 2.09 2.62 2.69 2.72 1.81 2.19 ...
## $ C1 : num 4 3 3 5 2 4 4 5 2 4 ...
## $ C2 : num 3 3 4 2 4 2 2 1 4 1 ...
## $ C3 : num 2 4 3 2 4 3 2 1 3 1 ...
## $ A1 : num 1 4 4 1 4 4 3 3 4 1 ...
## $ A2 : num 5 3 3 3 2 2 2 3 2 5 ...
## $ A3 : num 1 3 4 1 3 3 3 3 5 3 ...
## $ P1 : num 4 3 3 5 4 2 1 4 1 3 ...
## $ P2 : num 1 2 4 1 2 5 4 1 5 4 ...
## $ P3 : num 3 4 1 4 4 2 3 4 1 3 ...
## $ B1 : num 5 4 2 3 2 2 2 3 3 4 ...
## $ B2 : num 1 2 4 1 4 4 4 3 4 1 ...
## $ B3 : num 5 4 2 4 3 4 2 4 2 4 ...
## $ V1 : num 3 3 4 1 3 3 1 3 5 2 ...
## $ V2 : num 4 3 3 4 4 2 5 3 1 3 ...
## $ V3 : num 5 2 4 5 4 2 4 3 2 3 ...
## $ E1 : num 3 5 4 1 4 4 2 5 5 1 ...
## $ E2 : num 3 4 4 1 3 4 2 5 5 1 ...
## $ E3 : num 3 2 1 5 2 2 4 2 3 5 ...
## $ S1 : num 4 2 5 2 2 5 3 2 4 1 ...
## $ S2 : num 5 2 4 4 4 3 2 2 4 1 ...
## $ S3 : num 1 3 2 4 4 2 4 4 4 5 ...
## - attr(*, "na.action")= 'omit' Named int [1:389] 1 3 10 11 13 16 18 20 21 22 ...
## ..- attr(*, "names")= chr [1:389] "1" "3" "10" "11" ...
# Check missingness pattern
sapply(deleted_data, function(x) sum(is.na(x)))
## id gender race parental_education
## 0 0 0 0
## lunch test_prep math_score reading_score
## 0 0 0 0
## writing_score gpa C1 C2
## 0 0 0 0
## C3 A1 A2 A3
## 0 0 0 0
## P1 P2 P3 B1
## 0 0 0 0
## B2 B3 V1 V2
## 0 0 0 0
## V3 E1 E2 E3
## 0 0 0 0
## S1 S2 S3
## 0 0 0
# ---------------------------------------------------------------------------
# LOAD & INSTALL REQUIRED PACKAGES
# ---------------------------------------------------------------------------
required_packages <- c("mice", "effsize", "psych", "vcd", "broom", "gt", "dplyr",
"plotly", "htmltools")
install_if_missing <- function(pkg) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg, dependencies = TRUE)
}
}
invisible(sapply(required_packages, install_if_missing))
# Load required packages
library(mice)
library(effsize)
library(psych)
library(vcd)
library(broom)
library(gt)
library(dplyr)
library(plotly)
library(htmltools)
# ---------------------------------------------------------------------------
# COMPUTE SUMMARY STATISTICS
# ---------------------------------------------------------------------------
continuous_vars <- c("math_score", "reading_score", "writing_score", "gpa")
# Descriptive stats for complete cases and imputed data
complete_stats <- psych::describe(deleted_data[continuous_vars])[, c("mean", "sd",
"n")]
imputed_stats <- psych::describe(complete_data[continuous_vars])[, c("mean", "sd",
"n")]
# Calculate Cohen's d comparing imputed vs complete cases for each continuous
# variable
cohen_d_results <- sapply(continuous_vars, function(var) {
combined_values <- c(complete_data[[var]], deleted_data[[var]])
group_factor <- factor(c(rep("Imputed", nrow(complete_data)), rep("Complete",
nrow(deleted_data))))
effsize::cohen.d(combined_values, group_factor)$estimate
})
# ---------------------------------------------------------------------------
# GENDER DIFFERENCES IN MATH SCORE
# ---------------------------------------------------------------------------
# Summary stats by gender in complete and imputed datasets
gender_diff_cc <- psych::describeBy(deleted_data$math_score, group = deleted_data$gender,
mat = TRUE)[, c("group1", "mean", "sd", "n")]
gender_diff_imp <- psych::describeBy(complete_data$math_score, group = complete_data$gender,
mat = TRUE)[, c("group1", "mean", "sd", "n")]
# Cohen's d for gender difference in math score within complete and imputed
# datasets
gender_d_cc <- effsize::cohen.d(math_score ~ gender, data = deleted_data)$estimate
gender_d_imp <- effsize::cohen.d(math_score ~ gender, data = complete_data)$estimate
# Calculate Cohen's d comparing math scores by gender between imputed and
# complete samples (female and male)
female_d <- effsize::cohen.d(complete_data$math_score[complete_data$gender == "female"],
deleted_data$math_score[deleted_data$gender == "female"])$estimate
male_d <- effsize::cohen.d(complete_data$math_score[complete_data$gender == "male"],
deleted_data$math_score[deleted_data$gender == "male"])$estimate
# ---------------------------------------------------------------------------
# CORRELATION MATRICES AND COMPARISONS
# ---------------------------------------------------------------------------
# Convert test_prep to numeric 0/1 if it exists
deleted_data_numeric <- deleted_data
complete_data_numeric <- complete_data
if ("test_prep" %in% names(deleted_data)) {
deleted_data_numeric$test_prep <- as.numeric(deleted_data$test_prep) - 1
complete_data_numeric$test_prep <- as.numeric(complete_data$test_prep) - 1
}
correlation_vars <- c("math_score", "reading_score", "writing_score", "gpa", "test_prep")
numeric_vars <- intersect(correlation_vars, names(select_if(deleted_data_numeric,
is.numeric)))
# Define variable pairs for correlation reporting
cor_pairs <- list(c("math_score", "reading_score"), c("math_score", "writing_score"),
c("math_score", "gpa"), c("math_score", "test_prep"), c("reading_score", "writing_score"),
c("reading_score", "gpa"), c("reading_score", "test_prep"), c("writing_score",
"gpa"), c("writing_score", "test_prep"), c("gpa", "test_prep"))
relationship_names <- c("Math ~ Reading", "Math ~ Writing", "Math ~ GPA", "Math ~ Test Prep",
"Reading ~ Writing", "Reading ~ GPA", "Reading ~ Test Prep", "Writing ~ GPA",
"Writing ~ Test Prep", "GPA ~ Test Prep")
valid_pairs <- sapply(cor_pairs, function(pair) all(pair %in% numeric_vars))
# Correlations for complete case and imputed datasets
cor_cc <- cor(deleted_data_numeric[numeric_vars], use = "complete.obs")
cor_imp <- cor(complete_data_numeric[numeric_vars])
# Build correlation comparison table
correlation_table <- data.frame(Relationship = relationship_names[valid_pairs], Complete_r = sapply(cor_pairs[valid_pairs],
function(pair) cor_cc[pair[1], pair[2]]), Imputed_r = sapply(cor_pairs[valid_pairs],
function(pair) cor_imp[pair[1], pair[2]])) %>%
mutate(Delta_r = Imputed_r - Complete_r) %>%
gt() %>%
tab_header(title = md("**Correlation Comparisons**"), subtitle = "Pearson's r coefficients between key variables") %>%
cols_label(Relationship = md("**Relationship**"), Complete_r = md("**Complete Cases**<br>r"),
Imputed_r = md("**Imputed Data**<br>r"), Delta_r = md("**Δr**")) %>%
fmt_number(columns = c(Complete_r, Imputed_r, Delta_r), decimals = 2) %>%
tab_footnote(footnote = md("**Δr** = Imputed r − Complete r"), locations = cells_column_labels(columns = "Delta_r")) %>%
tab_source_note(source_note = md("**Sample sizes:** Complete cases = 611; Imputed data = 1000")) %>%
tab_source_note(source_note = md("**Correlation strength:** |r| < 0.3 weak, 0.3 ≤ |r| < 0.5 moderate, |r| ≥ 0.5 strong")) %>%
tab_style(style = cell_text(weight = "bold"), locations = cells_column_labels())
# ---------------------------------------------------------------------------
# CREATE TABLES WITH GT
# ---------------------------------------------------------------------------
# Table 1: Continuous Variables Summary & Cohen's d
continuous_table <- data.frame(Variable = c("Math Score", "Reading Score", "Writing Score",
"GPA"), Complete_Cases = sprintf("%.2f ± %.2f", complete_stats$mean, complete_stats$sd),
Imputed_Data = sprintf("%.2f ± %.2f", imputed_stats$mean, imputed_stats$sd),
Cohens_d = sprintf("%.3f", cohen_d_results)) %>%
gt() %>%
tab_header(title = "Effect Size Comparisons: Continuous Variables", subtitle = "Mean ± SD with Cohen's d effect sizes (Imputed vs. Complete Cases)") %>%
cols_label(Variable = "Variable", Complete_Cases = "Complete Cases", Imputed_Data = "Imputed Data",
Cohens_d = "Cohen's d") %>%
tab_footnote(footnote = paste("Complete cases sample size:", complete_stats$n[1]),
locations = cells_column_labels(columns = "Complete_Cases")) %>%
tab_footnote(footnote = paste("Imputed data sample size:", imputed_stats$n[1]),
locations = cells_column_labels(columns = "Imputed_Data")) %>%
tab_style(style = cell_text(weight = "bold"), locations = cells_column_labels())
# Table 2: Gender Differences in Math Scores
gender_table <- data.frame(Comparison = c("Female", "Male", "Overall Gender Gap"),
Complete = sprintf("%.2f", c(gender_diff_cc$mean[1], gender_diff_cc$mean[2],
gender_d_cc)), Imputed = sprintf("%.2f", c(gender_diff_imp$mean[1], gender_diff_imp$mean[2],
gender_d_imp)), Cohens_d = sprintf("%.2f", c(female_d, male_d, gender_d_imp))) %>%
gt() %>%
tab_header(title = "Gender Differences in Math Scores", subtitle = "Mean scores and effect sizes") %>%
cols_label(Comparison = "Group", Complete = "Complete Cases", Imputed = "Imputed Data",
Cohens_d = "Cohen's d") %>%
tab_footnote(footnote = paste("Female sample size:", gender_diff_cc$n[1], "(Complete),",
gender_diff_imp$n[1], "(Imputed)"), locations = cells_body(columns = "Comparison",
rows = 1)) %>%
tab_footnote(footnote = paste("Male sample size:", gender_diff_cc$n[2], "(Complete),",
gender_diff_imp$n[2], "(Imputed)"), locations = cells_body(columns = "Comparison",
rows = 2)) %>%
tab_style(style = cell_text(weight = "bold"), locations = cells_column_labels())
# ---------------------------------------------------------------------------
# PRINT TABLES
# ---------------------------------------------------------------------------
continuous_table
Effect Size Comparisons: Continuous Variables
Mean ± SD with Cohen's d effect sizes (Imputed vs. Complete Cases)
| Variable | Complete Cases¹ | Imputed Data² | Cohen's d |
|---|---|---|---|
| Math Score | 65.39 ± 14.59 | 65.26 ± 14.65 | 0.009 |
| Reading Score | 70.08 ± 11.88 | 70.07 ± 11.93 | 0.001 |
| Writing Score | 68.16 ± 13.56 | 68.47 ± 13.64 | -0.023 |
| GPA | 2.51 ± 0.59 | 2.52 ± 0.59 | -0.007 |
¹ Complete cases sample size: 611
² Imputed data sample size: 1000
gender_table
Gender Differences in Math Scores
Mean scores and effect sizes
| Group | Complete Cases | Imputed Data | Cohen's d |
|---|---|---|---|
| Female¹ | 65.51 | 65.00 | -0.03 |
| Male² | 65.24 | 65.56 | 0.02 |
| Overall Gender Gap | 0.02 | -0.04 | -0.04 |
¹ Female sample size: 336 (Complete), 535 (Imputed)
² Male sample size: 275 (Complete), 465 (Imputed)
correlation_table
Correlation Comparisons
Pearson's r coefficients between key variables
| Relationship | Complete Cases r | Imputed Data r | Δr¹ |
|---|---|---|---|
| Math ~ Reading | 0.57 | 0.58 | 0.01 |
| Math ~ Writing | 0.51 | 0.52 | 0.01 |
| Math ~ GPA | 0.84 | 0.84 | 0.00 |
| Math ~ Test Prep | 0.06 | 0.09 | 0.02 |
| Reading ~ Writing | 0.51 | 0.54 | 0.03 |
| Reading ~ GPA | 0.83 | 0.84 | 0.01 |
| Reading ~ Test Prep | 0.06 | 0.10 | 0.04 |
| Writing ~ GPA | 0.82 | 0.83 | 0.01 |
| Writing ~ Test Prep | 0.01 | 0.04 | 0.03 |
| GPA ~ Test Prep | 0.05 | 0.09 | 0.04 |
Sample sizes: Complete cases = 611; Imputed data = 1000
Correlation strength: |r| < 0.3 weak, 0.3 ≤ |r| < 0.5 moderate, |r| ≥ 0.5 strong
¹ Δr = Imputed r − Complete r
Missing Data Sensitivity Analysis Summary
The sensitivity analysis demonstrates remarkable consistency between analytical approaches, suggesting that our findings are robust to different missing data handling methods.
Across all continuous variables analyzed, effect size differences (Cohen’s d) between complete case and imputed analyses were well below conventional thresholds for small effects (d < 0.2). For example:
- Math Score: d = 0.009
- Reading Score: d = 0.001
- Writing Score: d = -0.023
- GPA: d = -0.007
These values indicate negligible practical significance, affirming that imputation did not meaningfully alter the distribution of continuous outcomes.
In subgroup analyses of gender differences in math scores, differences were also small and consistent:
- Female means: 65.51 (complete) vs. 65.00 (imputed), d = -0.03
- Male means: 65.24 (complete) vs. 65.56 (imputed), d = 0.02
These results indicate that relative group standings remained unchanged, supporting the inference that missing data handling did not distort group comparisons.
The correlation comparisons reinforce this conclusion. Across 10 key relationships, Pearson’s r values from complete cases and imputed data were nearly identical: the largest absolute difference was Δr = 0.04 (Reading ~ Test Prep and GPA ~ Test Prep), with all other differences 0.03 or smaller.
These correlations show that imputation preserved the structure and strength of relationships among key academic variables.
Conclusion
This sensitivity analysis provides strong empirical support that our conclusions regarding academic performance relationships and group differences are not substantially influenced by the choice of missing data handling method. The alignment across analyses, even for the most theoretically important findings, increases confidence in the validity and generalizability of our results. Consistent with modern missing data guidelines (Buuren, 2018; Graham, 2009), we employed multiple imputation for primary analyses to maximize statistical power and minimize bias, while complete-case results are reported in supplemental materials for transparency.
Caveats and Considerations
While the results across methods are consistent and support the reliability of our findings, we cannot fully rule out the following:
- MNAR mechanisms: if missingness depends on the unobserved values themselves, both complete-case and MAR-based imputation estimates could be biased in the same direction.
- Imputation model misspecification: omitted interactions or nonlinear relationships would carry through to the imputed values.
- Reduced power in the complete-case comparisons (n = 611), which limits the precision of the sensitivity contrasts.
Outlier Screening and Handling
Outliers are data points that deviate markedly from the rest of the dataset. How they should be handled depends on the research objectives, the data distribution, and the context of the analysis. Below are key strategies for detecting and managing outliers, starting with a short detection sketch.
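Most handling strategies presuppose that candidate points have already been flagged. A minimal detection sketch using the 1.5×IQR (boxplot) rule follows; it produces the outlier_flags vector referenced in the “Flag & Analyze” row of the table below.
# Flag values outside the 1.5*IQR fences (boxplot rule)
iqr_flags <- function(x) {
  q <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
  fence <- 1.5 * (q[2] - q[1])
  x < q[1] - fence | x > q[2] + fence
}
outlier_flags <- iqr_flags(complete_data$math_score)
sum(outlier_flags, na.rm = TRUE)  # count of flagged observations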
Outlier Handling Strategies
| Strategy | Description | When to Use | R Code Example (if applicable) |
|---|---|---|---|
| Removal | Delete rows containing outliers | Outliers are data errors; small % of data affected | clean_data <- data[-outlier_indices, ] |
| Winsorizing | Cap extreme values at a specified percentile (e.g., 95th) | Preserve sample size; maintain data distribution shape | data$var <- pmin(data$var, quantile(data$var, 0.95)) |
| Transformation | Apply mathematical functions (log, sqrt, etc.) to reduce skewness | Right/left-skewed data; non-normal distributions | data$var <- log(data$var + 1) |
| Imputation | Replace outliers with median/mean | Outliers are plausible but extreme; small dataset | data$var[outliers] <- median(data$var, na.rm = TRUE) |
| Binning | Convert continuous variables to categorical ranges | Non-linear relationships; reduce sensitivity to extremes | data$var_bin <- cut(data$var, breaks = 5) |
| Robust Models | Use statistical methods less sensitive to outliers (e.g., quantile regression) | High outlier prevalence; parametric assumptions violated | rq(y ~ x, data = data, tau = 0.5) (from quantreg) |
| Flag & Analyze | Keep outliers but analyze their impact separately | Outliers are valid observations; need to assess influence | data$is_outlier <- ifelse(outlier_flags, 1, 0) |
Key notes: inspect each variable’s distribution visually before choosing a strategy (the boxplots below illustrate this), and document whichever adjustments you apply.
# Check if required packages are installed, and install them if not
required_packages <- c("plotly", "htmltools")
# Loop through the list of packages
for (package in required_packages) {
if (!require(package, character.only = TRUE)) {
install.packages(package)
library(package, character.only = TRUE)
}
}
# Create individual plotly boxplots
p1 <- plot_ly(complete_data, y = ~gpa, type = "box", name = "GPA") %>%
layout(title = "GPA Distribution")
p2 <- plot_ly(complete_data, y = ~math_score, type = "box", name = "Math") %>%
layout(title = "Math Score Distribution")
p3 <- plot_ly(complete_data, y = ~writing_score, type = "box", name = "Writing") %>%
layout(title = "Writing Score Distribution")
p4 <- plot_ly(complete_data, y = ~reading_score, type = "box", name = "Reading") %>%
layout(title = "Reading Score Distribution")
# plot grid
grid <- htmltools::tagList(div(style = "display: flex; flex-wrap: wrap; justify-content: center;",
div(style = "width: 50%;", p1), div(style = "width: 50%;", p2), div(style = "width: 50%;",
p3), div(style = "width: 50%;", p4)))
htmltools::browsable(grid)
For Numeric Variables Only:
# Check variable ranges (test scores should be 0-100; GPA should be 0-4)
data.frame(subject = c("math", "reading", "writing", "gpa"), min = round(c(min(complete_data$math_score,
na.rm = TRUE), min(complete_data$reading_score, na.rm = TRUE), min(complete_data$writing_score,
na.rm = TRUE), min(complete_data$gpa, na.rm = TRUE)), 2), max = round(c(max(complete_data$math_score,
na.rm = TRUE), max(complete_data$reading_score, na.rm = TRUE), max(complete_data$writing_score,
na.rm = TRUE), max(complete_data$gpa, na.rm = TRUE)), 2))
## subject min max
## 1 math 26.00 100
## 2 reading 30.00 100
## 3 writing 29.00 100
## 4 gpa 0.72 4
# List of required packages
required_packages <- c("ggplot2", "dplyr", "patchwork", "stringr")
# Function to check and install packages
install_if_missing <- function(pkg) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg)
}
}
# Install missing packages
invisible(sapply(required_packages, install_if_missing))
# Load all packages
library(ggplot2)
library(dplyr)
library(patchwork)
library(stringr)
# Set a consistent color palette
palette <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728") # Colorblind-friendly
# Custom theme for consistency
my_theme <- function() {
theme_minimal() + theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
axis.title = element_text(size = 10), panel.grid.minor = element_blank(),
plot.margin = margin(10, 10, 10, 10))
}
# Function to identify both outliers and invalid cases
get_problem_cases <- function(data, y_var, valid_range = NULL) {
# Get statistical outliers
bp_stats <- boxplot.stats(data[[y_var]])
outliers <- data %>%
filter(.data[[y_var]] %in% bp_stats$out)
# Get invalid cases if range is specified
invalid <- NULL
if (!is.null(valid_range)) {
invalid <- data %>%
filter(.data[[y_var]] < valid_range[1] | .data[[y_var]] > valid_range[2])
}
list(outliers = outliers, invalid = invalid)
}
# Modified plotting function
create_boxplot <- function(data, y_var, title, fill_color, y_limits, y_breaks, valid_range = NULL) {
problems <- get_problem_cases(data, y_var, valid_range)
p <- ggplot(data, aes(x = "", y = .data[[y_var]])) + geom_boxplot(fill = fill_color,
color = "gray30", outlier.shape = NA, alpha = 0.3) + geom_jitter(width = 0.2,
alpha = 0.4, color = fill_color, size = 1.5)
# Add outliers in red
if (nrow(problems$outliers) > 0) {
p <- p + geom_point(data = problems$outliers, aes(y = .data[[y_var]]), color = "red",
size = 3, shape = 19, position = position_jitter(width = 0.2))
}
# Add invalid cases in purple (if any and if range specified)
if (!is.null(valid_range) && nrow(problems$invalid) > 0) {
p <- p + geom_point(data = problems$invalid, aes(y = .data[[y_var]]), color = "purple",
size = 3, shape = 17, position = position_jitter(width = 0.2))
}
# Create subtitle
subtitle <- ""
if (!is.null(valid_range)) {
invalid_count <- ifelse(is.null(problems$invalid), 0, nrow(problems$invalid))
outlier_count <- nrow(problems$outliers)
subtitle <- paste0("Invalid: ", invalid_count, " | Outliers: ", outlier_count)
}
p + labs(title = title, subtitle = subtitle, y = str_to_title(gsub("_", " ",
y_var))) + scale_y_continuous(limits = y_limits, breaks = y_breaks) + my_theme()
}
# Define valid ranges for each variable
valid_ranges <- list(gpa = c(0, 4), math_score = c(0, 100), reading_score = c(0,
100), writing_score = c(0, 100))
# Create plots
p1 <- create_boxplot(complete_data, "gpa", "GPA Distribution\n(Valid range: 0.00-4.00)",
palette[1], c(-0.5, 4.5), seq(0, 4, 0.5), valid_range = valid_ranges$gpa)
p2 <- create_boxplot(complete_data, "math_score", "Math Score Distribution\n(Valid range: 0-100)",
palette[2], c(-10, 110), seq(0, 100, 20), valid_range = valid_ranges$math_score)
p3 <- create_boxplot(complete_data, "reading_score", "Reading Score Distribution\n(Valid range: 0-100)",
palette[3], c(-10, 110), seq(0, 100, 20), valid_range = valid_ranges$reading_score)
p4 <- create_boxplot(complete_data, "writing_score", "Writing Score Distribution\n(Valid range: 0-100)",
"#9467bd", c(-10, 110), seq(0, 100, 20), valid_range = valid_ranges$writing_score)
# Combine plots with patchwork
grid_plot <- (p1 + p2)/(p3 + p4) + plot_annotation(title = "Academic Score Distributions",
subtitle = "Red = statistical outliers | Purple = invalid values", theme = theme(plot.title = element_text(size = 14,
face = "bold", hjust = 0.5), plot.subtitle = element_text(size = 10, hjust = 0.5)))
# Display the plot
grid_plot
Winsorizing replaces extreme values with specified percentile cutoffs (e.g., the 5th and 95th percentiles) rather than removing them, preserving sample size while reducing outlier impact. This method is best suited to medium-sized datasets where complete removal would sacrifice too much data but extreme values still need to be controlled.
Advantages: preserves sample size, limits the influence of extreme values, and is simple to implement and explain.
Limitations: distorts the tails of the true distribution, depends on an arbitrary choice of percentile cutoff, and can understate true variability.
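A minimal two-sided sketch is shown below; the winsorize() helper is illustrative (not part of the analysis pipeline) and caps both tails at the 5th and 95th percentiles.
# Cap values below the 5th and above the 95th percentile
winsorize <- function(x, probs = c(0.05, 0.95)) {
  q <- quantile(x, probs = probs, na.rm = TRUE)
  pmax(pmin(x, q[2]), q[1])  # cap both tails
}
# Example: math_w <- winsorize(complete_data$math_score)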
Deciding whether to remove outliers depends on the data context, analysis goals, and domain knowledge. Below are key considerations to help you decide:
Valid Cases for Outlier Removal
| Scenario | Rationale | Example |
|---|---|---|
| Data entry errors | Mistakes in recording/collection | GPA recorded as 10.0 instead of 4.0 |
| Impossible values | Violates known constraints | Negative age or height > 3m |
| Measurement noise | Equipment malfunctions | Temperature sensor spike to 1000°C |
| Irrelevant extremes | Valid but skews analysis | Including billionaires in average income study |
| Small sample bias | Single outlier dominates results | 10 data points where 1 is 100× larger |
Best Practice:
- Always compare results with vs. without outliers before deciding
- Document removal rationale in methodology
Cases to Keep Outliers
| Scenario | Rationale | Example |
|---|---|---|
| True extreme values | Rare but genuine observations | Medical anomaly cases |
| Focus of research | Outliers are the target phenomenon | Studying exceptional performers |
| Natural heavy-tailed dist | Expected distribution shape | Income, wealth, or city sizes |
| Large dataset immunity | Minimal impact on aggregates | 1M+ samples with 0.1% outliers |
Alternatives to Removal: winsorizing, transformation, robust models, or flagging outliers for separate analysis (see the strategy table above).
Before analyzing Likert scale data, it’s crucial to screen for data quality issues. Here’s a comprehensive approach to screening your Likert scale responses:
Cronbach’s alpha (\(\alpha\)) is a measure of internal consistency reliability, reflecting how closely related a set of items are as a group in a psychometric test. Introduced by Lee Cronbach in 1951, it is an extension of the Kuder-Richardson Formula 20 (KR-20), which applies to dichotomous items. Alpha is calculated based on the number of items, the average covariance between item pairs, and the total score variance. However, it is sensitive to missing data and can be biased under certain conditions.
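For reference, the standard formula makes these ingredients explicit:
\[ \alpha = \frac{k}{k - 1}\left(1 - \frac{\sum_{i=1}^{k} \sigma^{2}_{Y_i}}{\sigma^{2}_{X}}\right) \]
where \(k\) is the number of items, \(\sigma^{2}_{Y_i}\) is the variance of item \(i\), and \(\sigma^{2}_{X}\) is the variance of the total score.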
Interpretation and Limitations
Cronbach’s alpha increases with higher inter-item correlations, suggesting greater consistency among test items measuring the same construct. A widely used guideline for interpretation is (George & Mallery, 2003; Kline, 2000; Nunnally, 1978):
However, alpha should be interpreted cautiously: it increases mechanically with the number of items, assumes roughly equal item loadings (tau-equivalence), and does not by itself establish that a scale is unidimensional.
Thus, while Cronbach’s alpha is a useful indicator of reliability, it should not be the sole metric for assessing scale quality.
| \(\textbf{Cronbach's }\) \(\alpha\) | \(\textbf{Internal Consistency (reliability)}\) |
|---|---|
| \(\alpha \geq 0.9\) | \(\textit{Excellent (High-Stakes testing)}\) |
| \(0.7 \leq \alpha < 0.9\) | \(\textit{Good (Low-Stakes testing)}\) |
| \(0.6 \leq \alpha < 0.7\) | \(\textit{Acceptable}\) |
| \(0.5 \leq \alpha < 0.6\) | \(\textit{Poor}\) |
| \(\alpha < 0.5\) | \(\textit{Unacceptable}\) |
Recode the mathematics anxiety items to ensure consistent scale direction across all measures:
Currently aligned items: general and cognitive (higher scores = higher anxiety)
Items needing recoding: affective, value, and difficulty (currently higher scores = lower anxiety)
Recoding Decision:
Before proceeding, determine your scoring convention. Here the convention is that higher scores indicate higher anxiety, so the affective, value, and difficulty items must be reverse-coded to match the general and cognitive items.
Cronbach’s coefficient alpha before recoding
# ========================================== Reliability Analysis with
# Diagnostic Checks ==========================================
# Setup ----
set.seed(42)
if (!requireNamespace("psych", quietly = TRUE)) install.packages("psych")
library(psych)
# Data Preparation ==========================================
likert_items <- complete_data[, c("C1", "C2", "C3", "A1", "A2", "A3", "P1", "P2",
"P3", "B1", "B2", "B3", "V1", "V2", "V3", "E1", "E2", "E3", "S1", "S2", "S3")]
# Safe conversion to numeric
likert_items <- as.data.frame(lapply(likert_items, function(x) {
if (!is.numeric(x))
as.numeric(as.character(x)) else x
}))
# Initial Reliability Check ==========================================
cat("\n=== INITIAL RELIABILITY ===\n")
##
## === INITIAL RELIABILITY ===
initial_alpha <- psych::alpha(likert_items)
## Some items ( C1 A2 P1 P3 B1 B3 V2 V3 E3 S3 ) were negatively correlated with the first principal component and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
print(initial_alpha)
##
## Reliability analysis
## Call: psych::alpha(x = likert_items)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## -0.89 -0.91 0.043 -0.023 -0.48 0.088 3 0.19 -0.082
##
## 95% confidence boundaries
## lower alpha upper
## Feldt -1.07 -0.89 -0.73
## Duhachek -1.07 -0.89 -0.72
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## C1 -0.61 -0.63 0.17071 -0.020 -0.39 0.073 0.082 -0.069
## C2 -0.97 -0.97 -0.00096 -0.025 -0.49 0.092 0.081 -0.082
## C3 -0.98 -0.98 -0.00767 -0.025 -0.50 0.093 0.081 -0.082
## A1 -1.11 -1.12 -0.07290 -0.027 -0.53 0.099 0.083 -0.099
## A2 -0.55 -0.57 0.20335 -0.019 -0.36 0.071 0.084 -0.069
## A3 -1.04 -1.05 -0.03355 -0.026 -0.51 0.096 0.083 -0.082
## P1 -0.83 -0.85 0.05890 -0.023 -0.46 0.083 0.085 -0.096
## P2 -0.71 -0.71 0.12563 -0.021 -0.42 0.080 0.085 -0.082
## P3 -0.77 -0.78 0.09144 -0.022 -0.44 0.081 0.085 -0.096
## B1 -0.77 -0.79 0.08328 -0.023 -0.44 0.080 0.084 -0.069
## B2 -0.71 -0.71 0.13061 -0.021 -0.41 0.080 0.084 -0.082
## B3 -0.78 -0.80 0.08453 -0.023 -0.44 0.081 0.084 -0.069
## V1 -0.78 -0.77 0.10050 -0.022 -0.44 0.083 0.085 -0.082
## V2 -0.80 -0.81 0.07150 -0.023 -0.45 0.082 0.084 -0.082
## V3 -0.80 -0.81 0.07843 -0.023 -0.45 0.082 0.085 -0.069
## E1 -1.01 -1.02 -0.02490 -0.026 -0.50 0.095 0.084 -0.082
## E2 -1.04 -1.04 -0.03953 -0.026 -0.51 0.096 0.083 -0.082
## E3 -0.56 -0.58 0.19687 -0.019 -0.37 0.071 0.084 -0.069
## S1 -1.07 -1.08 -0.05814 -0.027 -0.52 0.098 0.085 -0.099
## S2 -1.11 -1.11 -0.07528 -0.027 -0.53 0.100 0.086 -0.099
## S3 -0.54 -0.55 0.20449 -0.018 -0.36 0.070 0.086 -0.069
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## C1 1000 -0.048 -0.039 -0.9128 -0.3282 3.0 1.2
## C2 1000 0.288 0.278 0.6342 -0.0116 3.1 1.2
## C3 1000 0.303 0.289 0.6823 -0.0072 3.1 1.2
## A1 1000 0.385 0.382 1.0869 0.0913 3.0 1.2
## A2 1000 -0.101 -0.101 -1.3039 -0.3772 3.0 1.2
## A3 1000 0.333 0.334 0.8293 0.0479 3.0 1.1
## P1 1000 0.169 0.177 0.1262 -0.1288 2.9 1.2
## P2 1000 0.058 0.049 -0.4804 -0.2333 3.0 1.2
## P3 1000 0.107 0.119 -0.1696 -0.1783 2.9 1.1
## B1 1000 0.121 0.126 -0.0890 -0.1776 3.0 1.2
## B2 1000 0.050 0.047 -0.5243 -0.2361 3.0 1.1
## B3 1000 0.124 0.132 -0.0951 -0.1656 3.0 1.1
## V1 1000 0.118 0.112 -0.2378 -0.1714 3.0 1.1
## V2 1000 0.134 0.147 0.0086 -0.1535 3.0 1.1
## V3 1000 0.138 0.148 -0.0553 -0.1555 3.0 1.1
## E1 1000 0.322 0.314 0.7643 0.0243 3.0 1.2
## E2 1000 0.336 0.328 0.8868 0.0414 3.0 1.2
## E3 1000 -0.103 -0.095 -1.2194 -0.3729 3.0 1.2
## S1 1000 0.366 0.355 0.9979 0.0655 3.0 1.2
## S2 1000 0.391 0.381 1.1054 0.0955 3.0 1.2
## S3 1000 -0.131 -0.123 -1.3357 -0.3976 3.0 1.2
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## C1 0.11 0.24 0.31 0.22 0.12 0
## C2 0.09 0.23 0.31 0.24 0.13 0
## C3 0.11 0.20 0.30 0.24 0.15 0
## A1 0.12 0.21 0.32 0.24 0.12 0
## A2 0.12 0.24 0.31 0.21 0.12 0
## A3 0.10 0.22 0.35 0.23 0.10 0
## P1 0.13 0.23 0.32 0.22 0.09 0
## P2 0.10 0.23 0.31 0.25 0.12 0
## P3 0.11 0.24 0.34 0.21 0.09 0
## B1 0.12 0.23 0.31 0.23 0.11 0
## B2 0.11 0.20 0.35 0.23 0.11 0
## B3 0.10 0.24 0.32 0.25 0.10 0
## V1 0.10 0.22 0.35 0.22 0.11 0
## V2 0.10 0.23 0.34 0.22 0.10 0
## V3 0.11 0.22 0.35 0.21 0.11 0
## E1 0.11 0.24 0.31 0.23 0.11 0
## E2 0.12 0.22 0.32 0.23 0.10 0
## E3 0.11 0.22 0.33 0.22 0.12 0
## S1 0.12 0.21 0.31 0.24 0.12 0
## S2 0.11 0.23 0.32 0.21 0.13 0
## S3 0.11 0.24 0.31 0.23 0.11 0
# Reverse Coding ==========================================
reverse_items <- c("C1", "A2", "P1", "P3", "B1", "B3", "V2", "V3", "E3", "S3")
reverse_items <- reverse_items[reverse_items %in% names(likert_items)]
reverse_scale <- function(x) {
  if (all(x %in% 1:5, na.rm = TRUE)) {
    6 - x  # 1-5 scale: reversed = 6 - original
  } else if (all(x %in% 0:4, na.rm = TRUE)) {
    4 - x  # 0-4 scale: reversed = 4 - original
  } else {
    stop("Unexpected scale values")
  }
}
likert_recoded <- likert_items
likert_recoded[, reverse_items] <- lapply(likert_recoded[, reverse_items], reverse_scale)
# Diagnostic Checks ==========================================
cat("\n=== DATA DIAGNOSTICS ===\n")
##
## === DATA DIAGNOSTICS ===
# 1. Variability Check
check_variability <- function(data) {
zero_var <- names(data)[apply(data, 2, sd, na.rm = TRUE) == 0]
low_var <- names(data)[sapply(data, function(x) {
max(prop.table(table(x))) > 0.9
})]
if (length(zero_var) > 0) {
message("Zero-variance items: ", paste(zero_var, collapse = ", "))
}
if (length(low_var) > 0) {
message("Low-variability items (>90% same response): ", paste(low_var, collapse = ", "))
}
return(unique(c(zero_var, low_var)))
}
problematic_items <- check_variability(likert_recoded)
# 2. Correlation Analysis
cor_matrix <- cor(likert_recoded, use = "pairwise.complete.obs")
neg_cor_pairs <- which(cor_matrix < -0.3 & lower.tri(cor_matrix), arr.ind = TRUE)
if (nrow(neg_cor_pairs) > 0) {
cat("\nNegative Correlations:\n")
print(data.frame(Item1 = colnames(cor_matrix)[neg_cor_pairs[, 1]], Item2 = colnames(cor_matrix)[neg_cor_pairs[,
2]], Correlation = cor_matrix[neg_cor_pairs], stringsAsFactors = FALSE))
}
# 3. Item-Total Analysis
alpha_stats <- psych::alpha(likert_recoded)$item.stats
low_r <- alpha_stats[alpha_stats$r.drop < 0.2, ]
if (nrow(low_r) > 0) {
cat("\nItems with Low Item-Total Correlation (r < 0.2):\n")
print(low_r[, c("r.drop", "mean", "sd")])
problematic_items <- unique(c(problematic_items, rownames(low_r)))
}
# Final Clean Dataset ==========================================
if (length(problematic_items) > 0) {
cat("\nRemoving problematic items:", paste(problematic_items, collapse = ", "),
"\n")
likert_final <- likert_recoded[, !names(likert_recoded) %in% problematic_items]
} else {
likert_final <- likert_recoded
}
# Final Analysis ==========================================
cat("\n=== FINAL RELIABILITY ===\n")
##
## === FINAL RELIABILITY ===
(final_alpha <- psych::alpha(likert_final))
##
## Reliability analysis
## Call: psych::alpha(x = likert_final)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.88 0.88 0.92 0.25 7.2 0.0057 3 0.62 0.23
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.87 0.88 0.89
## Duhachek 0.87 0.88 0.89
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## C1 0.87 0.87 0.91 0.25 6.7 0.0061 0.019 0.22
## C2 0.87 0.87 0.91 0.25 6.6 0.0061 0.019 0.22
## C3 0.87 0.87 0.91 0.25 6.6 0.0061 0.019 0.22
## A1 0.87 0.87 0.91 0.25 6.8 0.0060 0.019 0.23
## A2 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.23
## A3 0.87 0.87 0.91 0.25 6.7 0.0060 0.020 0.22
## P1 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## P2 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## P3 0.87 0.87 0.91 0.26 7.0 0.0058 0.019 0.23
## B1 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.23
## B2 0.87 0.87 0.91 0.26 6.8 0.0059 0.019 0.23
## B3 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.23
## V1 0.87 0.87 0.91 0.25 6.8 0.0059 0.020 0.23
## V2 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## V3 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## E1 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.22
## E2 0.87 0.87 0.91 0.25 6.7 0.0060 0.019 0.22
## E3 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.22
## S1 0.87 0.87 0.91 0.26 7.0 0.0058 0.019 0.23
## S2 0.88 0.88 0.91 0.26 7.0 0.0058 0.019 0.23
## S3 0.88 0.88 0.91 0.26 7.0 0.0058 0.019 0.23
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## C1 1000 0.61 0.61 0.59 0.55 3.0 1.2
## C2 1000 0.63 0.63 0.62 0.58 3.1 1.2
## C3 1000 0.63 0.63 0.61 0.57 3.1 1.2
## A1 1000 0.55 0.55 0.53 0.49 3.0 1.2
## A2 1000 0.57 0.57 0.54 0.50 3.0 1.2
## A3 1000 0.59 0.59 0.57 0.53 3.0 1.1
## P1 1000 0.50 0.50 0.47 0.42 3.1 1.2
## P2 1000 0.50 0.50 0.48 0.43 3.0 1.2
## P3 1000 0.47 0.47 0.45 0.40 3.1 1.1
## B1 1000 0.56 0.56 0.54 0.49 3.0 1.2
## B2 1000 0.53 0.53 0.51 0.46 3.0 1.1
## B3 1000 0.55 0.56 0.53 0.49 3.0 1.1
## V1 1000 0.53 0.53 0.51 0.46 3.0 1.1
## V2 1000 0.52 0.53 0.51 0.46 3.0 1.1
## V3 1000 0.49 0.50 0.47 0.42 3.0 1.1
## E1 1000 0.53 0.53 0.50 0.46 3.0 1.2
## E2 1000 0.58 0.58 0.56 0.51 3.0 1.2
## E3 1000 0.56 0.56 0.53 0.49 3.0 1.2
## S1 1000 0.48 0.47 0.45 0.40 3.0 1.2
## S2 1000 0.45 0.45 0.42 0.38 3.0 1.2
## S3 1000 0.46 0.46 0.43 0.39 3.0 1.2
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## C1 0.12 0.22 0.31 0.24 0.11 0
## C2 0.09 0.23 0.31 0.24 0.13 0
## C3 0.11 0.20 0.30 0.24 0.15 0
## A1 0.12 0.21 0.32 0.24 0.12 0
## A2 0.12 0.21 0.31 0.24 0.12 0
## A3 0.10 0.22 0.35 0.23 0.10 0
## P1 0.09 0.22 0.32 0.23 0.13 0
## P2 0.10 0.23 0.31 0.25 0.12 0
## P3 0.09 0.21 0.34 0.24 0.11 0
## B1 0.11 0.23 0.31 0.23 0.12 0
## B2 0.11 0.20 0.35 0.23 0.11 0
## B3 0.10 0.25 0.32 0.24 0.10 0
## V1 0.10 0.22 0.35 0.22 0.11 0
## V2 0.10 0.22 0.34 0.23 0.10 0
## V3 0.11 0.21 0.35 0.22 0.11 0
## E1 0.11 0.24 0.31 0.23 0.11 0
## E2 0.12 0.22 0.32 0.23 0.10 0
## E3 0.12 0.22 0.33 0.22 0.11 0
## S1 0.12 0.21 0.31 0.24 0.12 0
## S2 0.11 0.23 0.32 0.21 0.13 0
## S3 0.11 0.23 0.31 0.24 0.11 0
library(corrplot)
# Visualization
# ==========================================
if (exists("likert_final") && ncol(likert_final) > 1) {
# compute correlation matrix
R <- cor(likert_final, use = "pairwise.complete.obs")
# color palette (optional example)
colors <- colorRampPalette(c("red", "white", "blue"))(200)
corrplot::corrplot(
R,
method = "color", # colored tiles
type = "upper", # show upper triangle only
diag = FALSE, # remove diagonal
tl.col = "blue", # text label color
tl.cex = 1, # text label size
addCoef.col = NA, # suppress numbers
col = colors, # custom color palette
is.corr = TRUE # tell corrplot this is a correlation matrix
)
} else {
warning("Not enough valid items remaining for correlation plot")
}
An initial analysis of internal consistency yielded a theoretically invalid Cronbach’s alpha of α = -.894, which falls outside the acceptable range of 0 to 1 and indicates a problem with item scoring. Diagnostic evaluation revealed negative item-total correlations (r.drop) for ten items (C1, A2, P1, P3, B1, B3, V2, V3, E3, and S3), all of which were negatively worded. This suggested the need for reverse-scoring to ensure alignment with the underlying construct.
Reverse-coding was applied uniformly using the transformation reversed value = (scale max + scale min) - original value. For the 5-point Likert scale used in this study, this equated to: reversed value = 6 - original value. Post-correction verification confirmed that all item-total correlations were positive, with the lowest r.drop value reaching .38 (see the item statistics above). No negative inter-item relationships remained.
Following recoding, reliability analysis indicated good internal consistency by the benchmarks above: Cronbach’s α = .878, 95% CI [.862, .893], with a mean inter-item correlation of \(r = .255\) and a signal-to-noise ratio of 7.17.
These results align with established psychometric standards for research instruments (Nunnally & Bernstein, 1994). The improvement from a negative to a positive alpha underscores the importance of proper scale preparation, particularly when instruments contain reverse-worded items.
Add a column (anxiety) to hold the total score:
# Define the item names in a vector
item_names <- c("C1", "C2", "C3", "A1", "A2", "A3", "P1", "P2", "P3", "B1", "B2",
"B3", "V1", "V2", "V3", "E1", "E2", "E3", "S1", "S2", "S3")
# Compute each student's total anxiety score (row-wise sum across the 21 items)
complete_data$anxiety <- apply(complete_data[, item_names], 1, function(x) sum(as.numeric(x),
    na.rm = TRUE))
# Check the structure of the data with anxiety scores added
str(complete_data)
## 'data.frame': 1000 obs. of 32 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : Factor w/ 2 levels "female","male": 1 2 2 2 2 2 1 1 2 1 ...
## $ race : Factor w/ 4 levels "group A","group B",..: 1 2 3 3 1 3 2 3 4 4 ...
## $ parental_education: Ord.factor w/ 6 levels "some high school"<..: 3 3 5 1 3 5 5 3 4 2 ...
## $ lunch : Factor w/ 3 levels "free","reduced",..: 3 1 1 1 1 1 1 2 3 2 ...
## $ test_prep : Factor w/ 2 levels "completed","none": 2 1 2 2 2 1 1 1 2 2 ...
## $ math_score : num 38 69 44 64 45 38 48 68 53 82 ...
## $ reading_score : num 48 69 69 54 44 64 63 60 71 82 ...
## $ writing_score : num 70 83 48 73 41 48 68 83 89 71 ...
## $ gpa : num 1.68 2.81 1.78 2.25 1.18 1.6 2.09 2.62 2.69 3.05 ...
## $ C1 : num 2 4 2 3 3 5 2 4 4 4 ...
## $ C2 : num 5 3 3 3 4 2 4 2 2 1 ...
## $ C3 : num 3 2 3 4 3 2 4 3 2 3 ...
## $ A1 : num 3 1 4 4 4 1 4 4 3 3 ...
## $ A2 : num 3 5 2 3 3 3 2 2 2 4 ...
## $ A3 : num 4 1 5 3 4 1 3 3 3 4 ...
## $ P1 : num 4 4 2 3 3 5 4 2 1 3 ...
## $ P2 : num 3 1 5 2 4 1 2 5 4 3 ...
## $ P3 : num 3 3 3 4 1 4 4 2 3 2 ...
## $ B1 : num 4 5 3 4 2 3 2 2 2 5 ...
## $ B2 : num 4 1 3 2 4 1 4 4 4 1 ...
## $ B3 : num 3 5 4 4 2 4 3 4 2 5 ...
## $ V1 : num 3 3 2 3 4 1 3 3 1 3 ...
## $ V2 : num 2 4 4 3 3 4 4 2 5 4 ...
## $ V3 : num 3 5 4 2 4 5 4 2 4 3 ...
## $ E1 : num 4 3 4 5 4 1 4 4 2 2 ...
## $ E2 : num 4 3 4 4 4 1 3 4 2 2 ...
## $ E3 : num 3 3 3 2 1 5 2 2 4 3 ...
## $ S1 : num 4 4 3 2 5 2 2 5 3 3 ...
## $ S2 : num 4 5 5 2 4 4 4 3 2 3 ...
## $ S3 : num 2 1 1 3 2 4 4 2 4 3 ...
## $ anxiety : num 70 66 69 65 68 59 68 64 59 64 ...
Add a column (anxiety_rank) to hold the rank (high, medium, and low ntiles). Create a new ordinal variable with three levels (low, medium, and high ntiles) for the rank of each student’s total anxiety score.
# Ensure required package 'dplyr' is installed and loaded
if (!require("dplyr", quietly = TRUE)) install.packages("dplyr")
library(dplyr)
# Create ranked anxiety groups
complete_data <- complete_data %>%
mutate(
anxiety_rank = case_when(
anxiety <= quantile(anxiety, probs = 0.33, na.rm = TRUE) ~ "Low", # Low anxiety group
anxiety <= quantile(anxiety, probs = 0.66, na.rm = TRUE) ~ "Medium", # Medium anxiety group
TRUE ~ "High" # High anxiety group
),
anxiety_rank = factor(
anxiety_rank,
levels = c("Low", "Medium", "High"),
ordered = TRUE # Ensuring the factor is ordered
)
)
# Verify the distribution of anxiety ranks
table(complete_data$anxiety_rank)
##
## Low Medium High
## 347 315 338
# Check structure of the updated dataset
str(complete_data)
## 'data.frame': 1000 obs. of 33 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : Factor w/ 2 levels "female","male": 1 2 2 2 2 2 1 1 2 1 ...
## $ race : Factor w/ 4 levels "group A","group B",..: 1 2 3 3 1 3 2 3 4 4 ...
## $ parental_education: Ord.factor w/ 6 levels "some high school"<..: 3 3 5 1 3 5 5 3 4 2 ...
## $ lunch : Factor w/ 3 levels "free","reduced",..: 3 1 1 1 1 1 1 2 3 2 ...
## $ test_prep : Factor w/ 2 levels "completed","none": 2 1 2 2 2 1 1 1 2 2 ...
## $ math_score : num 38 69 44 64 45 38 48 68 53 82 ...
## $ reading_score : num 48 69 69 54 44 64 63 60 71 82 ...
## $ writing_score : num 70 83 48 73 41 48 68 83 89 71 ...
## $ gpa : num 1.68 2.81 1.78 2.25 1.18 1.6 2.09 2.62 2.69 3.05 ...
## $ C1 : num 2 4 2 3 3 5 2 4 4 4 ...
## $ C2 : num 5 3 3 3 4 2 4 2 2 1 ...
## $ C3 : num 3 2 3 4 3 2 4 3 2 3 ...
## $ A1 : num 3 1 4 4 4 1 4 4 3 3 ...
## $ A2 : num 3 5 2 3 3 3 2 2 2 4 ...
## $ A3 : num 4 1 5 3 4 1 3 3 3 4 ...
## $ P1 : num 4 4 2 3 3 5 4 2 1 3 ...
## $ P2 : num 3 1 5 2 4 1 2 5 4 3 ...
## $ P3 : num 3 3 3 4 1 4 4 2 3 2 ...
## $ B1 : num 4 5 3 4 2 3 2 2 2 5 ...
## $ B2 : num 4 1 3 2 4 1 4 4 4 1 ...
## $ B3 : num 3 5 4 4 2 4 3 4 2 5 ...
## $ V1 : num 3 3 2 3 4 1 3 3 1 3 ...
## $ V2 : num 2 4 4 3 3 4 4 2 5 4 ...
## $ V3 : num 3 5 4 2 4 5 4 2 4 3 ...
## $ E1 : num 4 3 4 5 4 1 4 4 2 2 ...
## $ E2 : num 4 3 4 4 4 1 3 4 2 2 ...
## $ E3 : num 3 3 3 2 1 5 2 2 4 3 ...
## $ S1 : num 4 4 3 2 5 2 2 5 3 3 ...
## $ S2 : num 4 5 5 2 4 4 4 3 2 3 ...
## $ S3 : num 2 1 1 3 2 4 4 2 4 3 ...
## $ anxiety : num 70 66 69 65 68 59 68 64 59 64 ...
## $ anxiety_rank : Ord.factor w/ 3 levels "Low"<"Medium"<..: 3 3 3 3 3 1 3 2 1 2 ...
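An equivalent shortcut uses dplyr::ntile(). This is offered as a suggestion rather than a replacement: ntile() balances group sizes, so it can place tied scores differently than the quantile() cutoffs above.
# Tertile ranking via ntile(); stored in a separate column for comparison
complete_data$anxiety_rank2 <- factor(
    c("Low", "Medium", "High")[dplyr::ntile(complete_data$anxiety, 3)],
    levels = c("Low", "Medium", "High"), ordered = TRUE)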
# Load required packages
library(dplyr)
# 1. Convert gender to factor with proper levels
complete_data$gender <- factor(complete_data$gender, levels = c("female", "male",
"other"), exclude = NA)
# 2. Define Likert items (corrected to match actual numeric columns)
likert_items <- c("C1", "C2", "C3", "A1", "A2", "A3", "P1", "P2", "P3", "B1", "B2",
"B3", "V1", "V2", "V3", "E1", "E2", "E3", "S1", "S2", "S3")
# 3. Verify these columns exist and are numeric
str(complete_data[, likert_items])
## 'data.frame': 1000 obs. of 21 variables:
## $ C1: num 2 4 2 3 3 5 2 4 4 4 ...
## $ C2: num 5 3 3 3 4 2 4 2 2 1 ...
## $ C3: num 3 2 3 4 3 2 4 3 2 3 ...
## $ A1: num 3 1 4 4 4 1 4 4 3 3 ...
## $ A2: num 3 5 2 3 3 3 2 2 2 4 ...
## $ A3: num 4 1 5 3 4 1 3 3 3 4 ...
## $ P1: num 4 4 2 3 3 5 4 2 1 3 ...
## $ P2: num 3 1 5 2 4 1 2 5 4 3 ...
## $ P3: num 3 3 3 4 1 4 4 2 3 2 ...
## $ B1: num 4 5 3 4 2 3 2 2 2 5 ...
## $ B2: num 4 1 3 2 4 1 4 4 4 1 ...
## $ B3: num 3 5 4 4 2 4 3 4 2 5 ...
## $ V1: num 3 3 2 3 4 1 3 3 1 3 ...
## $ V2: num 2 4 4 3 3 4 4 2 5 4 ...
## $ V3: num 3 5 4 2 4 5 4 2 4 3 ...
## $ E1: num 4 3 4 5 4 1 4 4 2 2 ...
## $ E2: num 4 3 4 4 4 1 3 4 2 2 ...
## $ E3: num 3 3 3 2 1 5 2 2 4 3 ...
## $ S1: num 4 4 3 2 5 2 2 5 3 3 ...
## $ S2: num 4 5 5 2 4 4 4 3 2 3 ...
## $ S3: num 2 1 1 3 2 4 4 2 4 3 ...
# 4. Check current value ranges
lapply(complete_data[, likert_items], function(x) table(x, useNA = "always"))
## $C1
## x
## 1 2 3 4 5 <NA>
## 106 242 314 221 117 0
##
## $C2
## x
## 1 2 3 4 5 <NA>
## 94 228 307 241 130 0
##
## $C3
## x
## 1 2 3 4 5 <NA>
## 110 203 299 242 146 0
##
## $A1
## x
## 1 2 3 4 5 <NA>
## 116 209 322 238 115 0
##
## $A2
## x
## 1 2 3 4 5 <NA>
## 123 236 313 212 116 0
##
## $A3
## x
## 1 2 3 4 5 <NA>
## 100 219 351 228 102 0
##
## $P1
## x
## 1 2 3 4 5 <NA>
## 130 235 320 224 91 0
##
## $P2
## x
## 1 2 3 4 5 <NA>
## 101 231 307 246 115 0
##
## $P3
## x
## 1 2 3 4 5 <NA>
## 113 242 345 213 87 0
##
## $B1
## x
## 1 2 3 4 5 <NA>
## 116 235 313 226 110 0
##
## $B2
## x
## 1 2 3 4 5 <NA>
## 110 204 352 227 107 0
##
## $B3
## x
## 1 2 3 4 5 <NA>
## 100 236 319 246 99 0
##
## $V1
## x
## 1 2 3 4 5 <NA>
## 103 225 348 218 106 0
##
## $V2
## x
## 1 2 3 4 5 <NA>
## 100 233 343 224 100 0
##
## $V3
## x
## 1 2 3 4 5 <NA>
## 108 223 349 209 111 0
##
## $E1
## x
## 1 2 3 4 5 <NA>
## 109 238 312 229 112 0
##
## $E2
## x
## 1 2 3 4 5 <NA>
## 118 225 324 231 102 0
##
## $E3
## x
## 1 2 3 4 5 <NA>
## 107 223 333 222 115 0
##
## $S1
## x
## 1 2 3 4 5 <NA>
## 124 208 313 239 116 0
##
## $S2
## x
## 1 2 3 4 5 <NA>
## 107 228 324 209 132 0
##
## $S3
## x
## 1 2 3 4 5 <NA>
## 109 243 308 230 110 0
# 5. Reverse-code problematic items (based on your alpha analysis)
reverse_items <- c("C1", "A2", "P1", "P3", "B1", "B3", "V2", "V3", "E3", "S3")
# 6. Apply reverse-coding (for 1-5 scale)
complete_data <- complete_data %>%
mutate(across(all_of(reverse_items), ~6 - .))
# 7. Verification steps
cat("\n=== Value Ranges After Recoding ===\n")
##
## === Value Ranges After Recoding ===
lapply(complete_data[, reverse_items], function(x) table(x, useNA = "always"))
## $C1
## x
## 1 2 3 4 5 <NA>
## 117 221 314 242 106 0
##
## $A2
## x
## 1 2 3 4 5 <NA>
## 116 212 313 236 123 0
##
## $P1
## x
## 1 2 3 4 5 <NA>
## 91 224 320 235 130 0
##
## $P3
## x
## 1 2 3 4 5 <NA>
## 87 213 345 242 113 0
##
## $B1
## x
## 1 2 3 4 5 <NA>
## 110 226 313 235 116 0
##
## $B3
## x
## 1 2 3 4 5 <NA>
## 99 246 319 236 100 0
##
## $V2
## x
## 1 2 3 4 5 <NA>
## 100 224 343 233 100 0
##
## $V3
## x
## 1 2 3 4 5 <NA>
## 111 209 349 223 108 0
##
## $E3
## x
## 1 2 3 4 5 <NA>
## 115 222 333 223 107 0
##
## $S3
## x
## 1 2 3 4 5 <NA>
## 110 230 308 243 109 0
cat("\n=== Reliability Analysis ===\n")
##
## === Reliability Analysis ===
psych::alpha(complete_data[, likert_items])
##
## Reliability analysis
## Call: psych::alpha(x = complete_data[, likert_items])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.88 0.88 0.92 0.25 7.2 0.0057 3 0.62 0.23
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.87 0.88 0.89
## Duhachek 0.87 0.88 0.89
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## C1 0.87 0.87 0.91 0.25 6.7 0.0061 0.019 0.22
## C2 0.87 0.87 0.91 0.25 6.6 0.0061 0.019 0.22
## C3 0.87 0.87 0.91 0.25 6.6 0.0061 0.019 0.22
## A1 0.87 0.87 0.91 0.25 6.8 0.0060 0.019 0.23
## A2 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.23
## A3 0.87 0.87 0.91 0.25 6.7 0.0060 0.020 0.22
## P1 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## P2 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## P3 0.87 0.87 0.91 0.26 7.0 0.0058 0.019 0.23
## B1 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.23
## B2 0.87 0.87 0.91 0.26 6.8 0.0059 0.019 0.23
## B3 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.23
## V1 0.87 0.87 0.91 0.25 6.8 0.0059 0.020 0.23
## V2 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## V3 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.23
## E1 0.87 0.87 0.91 0.26 6.9 0.0059 0.019 0.22
## E2 0.87 0.87 0.91 0.25 6.7 0.0060 0.019 0.22
## E3 0.87 0.87 0.91 0.25 6.8 0.0060 0.020 0.22
## S1 0.87 0.87 0.91 0.26 7.0 0.0058 0.019 0.23
## S2 0.88 0.88 0.91 0.26 7.0 0.0058 0.019 0.23
## S3 0.88 0.88 0.91 0.26 7.0 0.0058 0.019 0.23
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## C1 1000 0.61 0.61 0.59 0.55 3.0 1.2
## C2 1000 0.63 0.63 0.62 0.58 3.1 1.2
## C3 1000 0.63 0.63 0.61 0.57 3.1 1.2
## A1 1000 0.55 0.55 0.53 0.49 3.0 1.2
## A2 1000 0.57 0.57 0.54 0.50 3.0 1.2
## A3 1000 0.59 0.59 0.57 0.53 3.0 1.1
## P1 1000 0.50 0.50 0.47 0.42 3.1 1.2
## P2 1000 0.50 0.50 0.48 0.43 3.0 1.2
## P3 1000 0.47 0.47 0.45 0.40 3.1 1.1
## B1 1000 0.56 0.56 0.54 0.49 3.0 1.2
## B2 1000 0.53 0.53 0.51 0.46 3.0 1.1
## B3 1000 0.55 0.56 0.53 0.49 3.0 1.1
## V1 1000 0.53 0.53 0.51 0.46 3.0 1.1
## V2 1000 0.52 0.53 0.51 0.46 3.0 1.1
## V3 1000 0.49 0.50 0.47 0.42 3.0 1.1
## E1 1000 0.53 0.53 0.50 0.46 3.0 1.2
## E2 1000 0.58 0.58 0.56 0.51 3.0 1.2
## E3 1000 0.56 0.56 0.53 0.49 3.0 1.2
## S1 1000 0.48 0.47 0.45 0.40 3.0 1.2
## S2 1000 0.45 0.45 0.42 0.38 3.0 1.2
## S3 1000 0.46 0.46 0.43 0.39 3.0 1.2
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## C1 0.12 0.22 0.31 0.24 0.11 0
## C2 0.09 0.23 0.31 0.24 0.13 0
## C3 0.11 0.20 0.30 0.24 0.15 0
## A1 0.12 0.21 0.32 0.24 0.12 0
## A2 0.12 0.21 0.31 0.24 0.12 0
## A3 0.10 0.22 0.35 0.23 0.10 0
## P1 0.09 0.22 0.32 0.23 0.13 0
## P2 0.10 0.23 0.31 0.25 0.12 0
## P3 0.09 0.21 0.34 0.24 0.11 0
## B1 0.11 0.23 0.31 0.23 0.12 0
## B2 0.11 0.20 0.35 0.23 0.11 0
## B3 0.10 0.25 0.32 0.24 0.10 0
## V1 0.10 0.22 0.35 0.22 0.11 0
## V2 0.10 0.22 0.34 0.23 0.10 0
## V3 0.11 0.21 0.35 0.22 0.11 0
## E1 0.11 0.24 0.31 0.23 0.11 0
## E2 0.12 0.22 0.32 0.23 0.10 0
## E3 0.12 0.22 0.33 0.22 0.11 0
## S1 0.12 0.21 0.31 0.24 0.12 0
## S2 0.11 0.23 0.32 0.21 0.13 0
## S3 0.11 0.23 0.31 0.24 0.11 0
# 8. Compute anxiety scores (sum of items)
complete_data$anxiety <- rowSums(complete_data[, likert_items], na.rm = TRUE)
# 9. Final checks
cat("\n=== Anxiety Score Summary ===\n")
##
## === Anxiety Score Summary ===
summary(complete_data$anxiety)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.0 54.0 64.0 63.5 73.0 102.0
hist(complete_data$anxiety, main = "Anxiety Score Distribution")
Descriptive statistics summarize and organize characteristics of a dataset, helping to understand patterns in individual variables.
Use appropriate R functions based on each variable’s measurement level:
- Continuous (interval/ratio) variables: summary(), mean(), and sd().
- Nominal variables: table() and prop.table(). These variables have no inherent order.
- Ordinal variables: table() and prop.table() to display frequency distributions. If the variable is coded numerically (e.g., Likert scales), you may also summarize using median() or quantile(), but interpret with caution, as intervals between categories aren’t guaranteed to be equal.
Note: Some functions (e.g., summary() or psych::describe()) may yield limited or misleading output for nominal and ordinal variables. Always choose analysis methods that match the variable type and scale.
# Install packages if they are not already installed
required_packages <- c("psych", "gmodels")
for (pkg in required_packages) {
if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
install.packages(pkg, dependencies = TRUE)
library(pkg, character.only = TRUE)
}
}
# 1. Variable Type Identification - More flexible approach
detect_variable_types <- function(data, exclude_likert = TRUE) {
var_types <- list(continuous = character(), nominal = character(), ordinal = character(),
likert = character())
for (col in names(data)) {
if (is.numeric(data[[col]])) {
# Check if numeric variable might be Likert (small discrete range)
unique_vals <- unique(na.omit(data[[col]]))
if (exclude_likert && length(unique_vals) <= 7 && all(unique_vals%%1 ==
0) && min(unique_vals) >= 1 && max(unique_vals) <= 10) {
var_types$likert <- c(var_types$likert, col)
} else {
var_types$continuous <- c(var_types$continuous, col)
}
} else if (is.factor(data[[col]])) {
if (is.ordered(data[[col]])) {
var_types$ordinal <- c(var_types$ordinal, col)
} else {
var_types$nominal <- c(var_types$nominal, col)
}
} else if (is.character(data[[col]])) {
var_types$nominal <- c(var_types$nominal, col)
}
}
# Return only non-Likert variables if exclude_likert is TRUE
if (exclude_likert) {
return(var_types[c("continuous", "nominal", "ordinal")])
} else {
return(var_types)
}
}
# 2. Function to describe continuous variables
describe_continuous <- function(data, vars) {
if (length(vars) == 0) {
cat("\nNo continuous variables found.\n")
return()
}
cat("\n=== CONTINUOUS VARIABLES ===\n")
for (var in vars) {
if (!var %in% names(data)) {
cat("\nWarning: Variable", var, "not found in dataset\n")
next
}
cat("\n--", var, "--\n")
# Basic statistics
print(summary(data[[var]]))
cat("\nStandard deviation:", sd(data[[var]], na.rm = TRUE), "\n")
cat("Variance:", var(data[[var]], na.rm = TRUE), "\n")
cat("Skewness:", psych::skew(data[[var]], na.rm = TRUE), "\n")
cat("Kurtosis:", psych::kurtosi(data[[var]], na.rm = TRUE), "\n")
# Base R visualization
par(mfrow = c(1, 2))
hist(data[[var]], breaks = 30, col = "lightblue", main = paste("Histogram of",
var), xlab = var, probability = TRUE)
lines(density(data[[var]], na.rm = TRUE), col = "darkblue", lwd = 2)
boxplot(data[[var]], col = "lightgreen", main = paste("Boxplot of", var))
par(mfrow = c(1, 1))
}
}
# 3. Function to describe nominal variables
describe_nominal <- function(data, vars) {
if (length(vars) == 0) {
cat("\nNo nominal variables found.\n")
return()
}
cat("\n=== NOMINAL/CATEGORICAL VARIABLES ===\n")
for (var in vars) {
if (!var %in% names(data)) {
cat("\nWarning: Variable", var, "not found in dataset\n")
next
}
cat("\n--", var, "--\n")
# Frequency table with proportions
freq_table <- table(data[[var]])
prop_table <- prop.table(freq_table) * 100
print(cbind(Count = freq_table, Percentage = round(prop_table, 1)))
# Check if variable appears to be Likert (integer values in small
# range)
if (is.numeric(data[[var]])) {
unique_vals <- unique(na.omit(data[[var]]))
is_likert <- length(unique_vals) <= 7 && all(unique_vals%%1 == 0) &&
min(unique_vals) >= 1
} else {
is_likert <- FALSE
}
}
}
# Main analysis
var_types <- detect_variable_types(complete_data)
# Run the analysis
describe_continuous(complete_data, var_types$continuous)
##
## === CONTINUOUS VARIABLES ===
##
## -- id --
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 251 500 500 750 1000
##
## Standard deviation: 288.82
## Variance: 83417
## Skewness: 0
## Kurtosis: -1.2036
##
## -- math_score --
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.0 55.0 66.0 65.3 75.0 100.0
##
## Standard deviation: 14.647
## Variance: 214.55
## Skewness: -0.048419
## Kurtosis: -0.31491
##
## -- reading_score --
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 62.0 70.0 70.1 78.0 100.0
##
## Standard deviation: 11.934
## Variance: 142.43
## Skewness: -0.035639
## Kurtosis: -0.18171
##
## -- writing_score --
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.0 59.0 68.0 68.5 78.0 100.0
##
## Standard deviation: 13.644
## Variance: 186.17
## Skewness: -0.016598
## Kurtosis: -0.2553
##
## -- gpa --
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.72 2.10 2.53 2.52 2.93 4.00
##
## Standard deviation: 0.59386
## Variance: 0.35267
## Skewness: -0.022617
## Kurtosis: -0.29618
##
## -- anxiety --
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.0 54.0 64.0 63.5 73.0 102.0
##
## Standard deviation: 13.085
## Variance: 171.22
## Skewness: -0.024826
## Kurtosis: -0.2428
describe_nominal(complete_data, var_types$nominal)
##
## === NOMINAL/CATEGORICAL VARIABLES ===
##
## -- gender --
## Count Percentage
## female 535 53.5
## male 465 46.5
## other 0 0.0
##
## -- race --
## Count Percentage
## group A 186 18.6
## group B 314 31.4
## group C 415 41.5
## group D 85 8.5
##
## -- lunch --
## Count Percentage
## free 344 34.4
## reduced 296 29.6
## standard 360 36.0
##
## -- test_prep --
## Count Percentage
## completed 305 30.5
## none 695 69.5
# Additional option: Using psych::describe() for quick overview
if (length(var_types$continuous) > 0) {
cat("\n=== QUICK CONTINUOUS SUMMARY (psych package) ===\n")
print(psych::describe(complete_data[, var_types$continuous, drop = FALSE]))
}
##
## === QUICK CONTINUOUS SUMMARY (psych package) ===
## vars n mean sd median trimmed mad min max range
## id 1 1000 500.50 288.82 500.50 500.50 370.65 1.00 1000 999.00
## math_score 2 1000 65.26 14.65 66.00 65.34 14.83 26.00 100 74.00
## reading_score 3 1000 70.07 11.93 70.00 70.08 11.86 30.00 100 70.00
## writing_score 4 1000 68.47 13.64 68.00 68.48 13.34 29.00 100 71.00
## gpa 5 1000 2.52 0.59 2.53 2.52 0.61 0.72 4 3.28
## anxiety 6 1000 63.55 13.09 64.00 63.58 13.34 26.00 102 76.00
## skew kurtosis se
## id 0.00 -1.20 9.13
## math_score -0.05 -0.31 0.46
## reading_score -0.04 -0.18 0.38
## writing_score -0.02 -0.26 0.43
## gpa -0.02 -0.30 0.02
## anxiety -0.02 -0.24 0.41
# For ordinal variables (if any)
if (length(var_types$ordinal) > 0) {
cat("\n=== ORDINAL VARIABLES ===\n")
# Use describe_nominal() for basic analysis
describe_nominal(complete_data, var_types$ordinal)
}
##
## === ORDINAL VARIABLES ===
##
## === NOMINAL/CATEGORICAL VARIABLES ===
##
## -- parental_education --
## Count Percentage
## some high school 174 17.4
## high school 191 19.1
## some college 229 22.9
## associate's degree 149 14.9
## bachelor's degree 148 14.8
## master's degree 109 10.9
##
## -- anxiety_rank --
## Count Percentage
## Low 347 34.7
## Medium 315 31.5
## High 338 33.8
Shapiro-Wilk Tests for Normality
Shapiro-Wilk tests were conducted to assess normality for key continuous variables and math anxiety items.
# Continuous variables to test
cont_vars <- c("math_score", "reading_score", "writing_score", "gpa", "anxiety")
# Math anxiety Likert items (example names, adjust if different)
math_anxiety_items <- c("C1", "C2", "C3", "A1", "A2", "A3", "P1", "P2", "P3", "B1",
"B2", "B3", "V1", "V2", "V3", "E1", "E2", "E3", "S1", "S2", "S3")
# Run Shapiro-Wilk tests for continuous variables
shapiro_results_cont <- sapply(cont_vars, function(var) {
x <- na.omit(complete_data[[var]])
test <- shapiro.test(x)
c(W = test$statistic, p_value = test$p.value)
})
# Run Shapiro-Wilk tests for math anxiety items
shapiro_results_items <- sapply(math_anxiety_items, function(var) {
x_raw <- complete_data[[var]]
x <- na.omit(if (is.factor(x_raw))
as.numeric(as.character(x_raw)) else as.numeric(x_raw))
if (length(x) >= 3 && length(x) <= 5000) {
test <- shapiro.test(x)
c(W = test$statistic, p_value = test$p.value)
} else {
c(W = NA, p_value = NA)
}
})
# Convert to data frame and round
shapiro_df_cont <- as.data.frame(t(shapiro_results_cont))
shapiro_df_cont$p_value <- round(shapiro_df_cont$p_value, 4)
shapiro_df_cont$W <- round(shapiro_df_cont$W, 4)
shapiro_df_items <- as.data.frame(t(shapiro_results_items))
shapiro_df_items$p_value <- round(shapiro_df_items$p_value, 4)
shapiro_df_items$W <- round(shapiro_df_items$W, 4)
print("Shapiro-Wilk test results: Continuous variables")
## [1] "Shapiro-Wilk test results: Continuous variables"
print(shapiro_df_cont)
## W.W p_value W
## math_score 0.99631 0.0183 0.9963
## reading_score 0.99741 0.1117 0.9974
## writing_score 0.99632 0.0186 0.9963
## gpa 0.99717 0.0754 0.9972
## anxiety 0.99762 0.1578 0.9976
print("Shapiro-Wilk test results: Math anxiety items")
## [1] "Shapiro-Wilk test results: Math anxiety items"
print(shapiro_df_items)
## W.W p_value W
## C1 0.91700 0 0.9170
## C2 0.91662 0 0.9166
## C3 0.91470 0 0.9147
## A1 0.91670 0 0.9167
## A2 0.91650 0 0.9165
## A3 0.91708 0 0.9171
## P1 0.91672 0 0.9167
## P2 0.91722 0 0.9172
## P3 0.91693 0 0.9169
## B1 0.91725 0 0.9172
## B2 0.91637 0 0.9164
## B3 0.91746 0 0.9175
## V1 0.91709 0 0.9171
## V2 0.91739 0 0.9174
## V3 0.91662 0 0.9166
## E1 0.91734 0 0.9173
## E2 0.91727 0 0.9173
## E3 0.91723 0 0.9172
## S1 0.91609 0 0.9161
## S2 0.91593 0 0.9159
## S3 0.91726 0 0.9173
Academic performance indicators demonstrated moderate variability.
All three distributions were approximately symmetric with near-zero skewness and slight platykurtosis. Shapiro-Wilk tests suggested that reading scores and GPA did not significantly deviate from normality (p > .05), while math scores (p = .018) and writing scores (p = .019) were marginally non-normal.
Items measuring dimensions of math anxiety (e.g., cognitive, affective, physiological, behavioral, value, difficulty, social) were rated on 5-point Likert scales. These items exhibited means close to 3 and SDs near 1.2, indicating moderate dispersion. While Shapiro-Wilk tests showed significant non-normality (p < .001) for these items, the large sample size (N = 1000) and approximate symmetry justify treating them as continuous in parametric analyses.
An ordinal ranking of anxiety levels revealed a nearly even
distribution:
- Low: 34.7%
- Medium: 31.5%
- High: 33.8%
The final sample included 1000 students: 53.5% female and 46.5% male; 34.4% receiving free lunch, 29.6% reduced-price, and 36.0% standard; and 30.5% having completed a test preparation course.
This section outlines the primary research questions guiding the study and describes the analytical strategies used to address them. Quantitative data were analyzed using appropriate statistical techniques to examine the relationships among key variables, with particular focus on measures of academic performance and mathematics anxiety.
Research Question 1: Is there a relationship between GPA and mathematics anxiety?
The research question seeks to examine the relationship between GPA and mathematics anxiety. While both variables are treated as continuous, preliminary Shapiro-Wilk tests showed that the individual anxiety items significantly deviated from normality (p < .001), and the composite anxiety score is built from ordinal Likert items. These conditions make the Pearson correlation coefficient a questionable choice for this analysis.
Instead, the Spearman rank-order correlation coefficient (\(\rho\) or rₛ) was used. Spearman’s correlation is a non-parametric alternative to Pearson’s and is suitable when one or both variables are not normally distributed, when the relationship is monotonic but not necessarily linear, or when the data include ordinal elements.
By ranking the values of each variable and assessing the correlation between the ranks, Spearman’s method provides a robust measure of the strength and direction of a monotonic relationship without requiring the assumption of normality. This makes it an appropriate and statistically sound choice for addressing the research question under the observed data conditions.
The Pearson correlation coefficient (\(r\)) measures the linear relationship between two continuous variables, expressing both the strength and direction of their association.
Mathematical Formulation
\[ r_{XY} = \frac{\sum_{i=1}^n (X_i - \bar{X})(Y_i - \bar{Y})}{\sqrt{\sum_{i=1}^n (X_i - \bar{X})^2} \sqrt{\sum_{i=1}^n (Y_i - \bar{Y})^2}} \]
Spearman Rank-order Correlation Coefficient (\(\rho\))
The Spearman rank-order correlation coefficient (\(\rho\)) (Spearman, 1904), also known as Spearman’s rho, is a non-parametric measure of the strength and direction of a monotonic relationship between two variables. It assesses how well the relationship between two variables can be described using a monotonic function (i.e., as one variable increases, the other tends to consistently increase or decrease).
Unlike Pearson’s correlation, which assumes linearity and interval-level data (Howell, 2013, Ch. 6), Spearman’s rho operates on the ranks of the data rather than the raw values. This makes it particularly suitable for ordinal data, non-normal distributions, and relationships that are monotonic but not strictly linear.
Mathematical Formulation of Spearman’s \(\rho\)
For a sample of size \(n\), Spearman’s \(\rho\) is calculated as:
\[ \begin{equation} \rho = 1 - \frac{6\sum d_i^2}{n(n^2 - 1)} \end{equation} \]
where \(d_i\) is the difference between the ranks of the \(i\)-th pair of observations and \(n\) is the sample size.
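Equivalently, Spearman’s \(\rho\) is Pearson’s \(r\) computed on the ranks, which is easy to verify with a toy example:
# Illustrative check: Spearman's rho equals Pearson's r on the ranks
x <- c(3, 1, 4, 1, 5)
y <- c(9, 2, 6, 5, 3)
cor(x, y, method = "spearman")  # built-in Spearman
cor(rank(x), rank(y))           # same value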
#' Nonparametric Correlation Analysis Between Anxiety and GPA
#'
#' Computes Spearman's rank correlation with significance testing
#' Appropriate for non-normal distributions (as identified via Shapiro-Wilk test)
library(Hmisc)
# Data validation
student_vars <- c("anxiety", "gpa")
if (!all(student_vars %in% names(complete_data))) {
stop("Required variables not found in dataset")
}
# Compute Spearman correlation matrix
cor_results <- rcorr(as.matrix(complete_data[, student_vars]),
    type = "spearman")  # Spearman rather than Pearson, per the normality assessment
# Formatted results output
cat("\nSPEARMAN RANK CORRELATION ANALYSIS\n", "--------------------------------\n",
"Variables: Anxiety × GPA\n", "Correlation (ρ):", round(cor_results$r["anxiety",
"gpa"], 3), "\n", "Sample size (n):", cor_results$n["anxiety", "gpa"], "\n",
"p-value:", format.pval(cor_results$P["anxiety", "gpa"], digits = 3), "\n\n")
##
## SPEARMAN RANK CORRELATION ANALYSIS
## --------------------------------
## Variables: Anxiety × GPA
## Correlation (ρ): 0.009
## Sample size (n): 1000
## p-value: 0.78
# Effect size interpretation
rho_value <- cor_results$r["anxiety", "gpa"]
effect_size <- ifelse(abs(rho_value) >= 0.5, "large", ifelse(abs(rho_value) >= 0.3,
"medium", ifelse(abs(rho_value) >= 0.1, "small", "negligible")))
cat("Effect size:", effect_size, "according to Cohen's guidelines\n")
## Effect size: negligible according to Cohen's guidelines
# Additional diagnostics
cat("\nNORMALITY ASSESSMENT (Prior to Spearman Selection)\n", "Shapiro-Wilk test for Anxiety: W =",
round(shapiro.test(complete_data$anxiety)$statistic, 3), ", p =", format.pval(shapiro.test(complete_data$anxiety)$p.value),
"\n")
##
## NORMALITY ASSESSMENT (Prior to Spearman Selection)
## Shapiro-Wilk test for Anxiety: W = 0.998 , p = 0.158
A Spearman rank-order correlation was conducted to assess the monotonic relationship between anxiety and GPA. The results indicated a negligible, non-significant association, \(\rho(998) = .009, p = .78\). There is no evidence in this sample that anxiety scores and GPA rise or fall together.
Because the test is non-parametric, this conclusion does not depend on whether the underlying distributions are normal.
Normality was assessed using the Shapiro-Wilk test:
W = 0.998,
p = 0.158Although anxiety scores did not significantly deviate from normality (p > .05), the Spearman method was used to account for potential non-linearity and the ordinal-like structure of the anxiety construct.
Research Question 2: Can Anxiety be used to predict GPA?
Bivariate linear regression (Fox & Weisberg, 2016, Ch. 4; Field, 2013, Section 7.3) is an
appropriate statistical method for this research question because it
allows us to examine the predictive relationship between a single
independent variable (Anxiety) and a continuous dependent variable
(GPA). This method estimates:
1. The strength and direction of the relationship (β
coefficient)
2. The proportion of variance explained (R²) (Tabachnick &
Fidell, 2019b, p. 128)
The analysis begins with:
- Descriptive statistics (means, SDs, and
Pearson/Spearman correlations)
- Visual diagnostics: Scatterplots with LOESS curves
(Cleveland,
1979) to assess linearity
Before model fitting, we evaluate:
| Assumption | Test/Diagnostic | Reference |
|---|---|---|
| Linearity | Scatterplot + Residual plot | (Fox & Weisberg, 2016, Figure 4.3) |
| Normality | Shapiro-Wilk test of residuals | (Field, 2013, Section 5.6) |
| Homoscedasticity | Breusch-Pagan test | (Breusch & Pagan, 1979) |
| Independence | Durbin-Watson test (DW ≈ 2) | (Durbin & Watson, 1951) |
| Outliers | Cook’s distance (D > 4/n) | (Cook, 1977) |
# Fit bivariate linear regression model
model <- lm(gpa ~ anxiety, data = complete_data)
summary(model)
##
## Call:
## lm(formula = gpa ~ anxiety, data = complete_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7989 -0.4141 0.0127 0.4139 1.4986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.44690 0.09318 26.26 <2e-16 ***
## anxiety 0.00109 0.00144 0.76 0.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.594 on 998 degrees of freedom
## Multiple R-squared: 0.000577, Adjusted R-squared: -0.000424
## F-statistic: 0.576 on 1 and 998 DF, p-value: 0.448
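The assumption checks named in the table above can be run directly on the fitted model. A minimal sketch follows; the lmtest package is an assumed dependency here, as it is not loaded elsewhere in this document.
# Assumption diagnostics for the fitted model
if (!requireNamespace("lmtest", quietly = TRUE)) install.packages("lmtest")
shapiro.test(residuals(model))                  # normality of residuals
lmtest::bptest(model)                           # Breusch-Pagan: homoscedasticity
lmtest::dwtest(model)                           # Durbin-Watson: independence
which(cooks.distance(model) > 4 / nobs(model))  # flag influential observations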
Before interpreting the regression results, the assumptions listed in the table above were evaluated. Given the large sample (n = 1000), the Central Limit Theorem mitigates concerns about minor normality violations in the residuals. Overall, assumptions were reasonably met, supporting the validity of the regression model.
A linear regression was conducted to examine whether anxiety predicts
GPA. The model was not statistically significant,
\(F(1, 998) = 0.58\), \(p = 0.448\), explaining virtually none of
the variance in GPA (\(R^2 =
0.0006\)).
Regression coefficients were:
\[ \hat{\text{GPA}} = 2.45 + 0.0011 \times \text{Anxiety} \]
Interpretation:
There is no evidence that anxiety predicts GPA in this sample. The
near-zero slope and non-significant p-value suggest that variations in
anxiety levels are unrelated to GPA outcomes.
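The practical irrelevance of the slope is easy to see from predicted values:
# Predicted GPA across a wide range of anxiety scores (model from above)
predict(model, newdata = data.frame(anxiety = c(40, 60, 80)))
# The predictions differ by only about 0.04 GPA points over a 40-point range.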
Research Question 3: Are the proportions of males and females the same among students?
The Chi-square goodness-of-fit test (Agresti, 2019, Ch. 1; Pearson, 1900) is appropriate for this research question as it tests whether observed categorical data frequencies (gender proportions) differ significantly from expected theoretical distributions.
Hypotheses:
- \(H_0\): The proportions of males and females are equal (\(p_{male} = p_{female} = 0.5\))
- \(H_1\): The proportions are not equal
Test Statistic:
\[ \begin{equation} \chi^2 = \sum_{i=1}^k \frac{(O_i - E_i)^2}{E_i} \end{equation} \]
where \(O_i\) is the observed frequency for category \(i\), \(E_i\) is the expected frequency under the null hypothesis, and \(k\) is the number of categories.
Degrees of Freedom
\[ df = k - 1 - c \]
where \(c\) = number of estimated
parameters (0 for simple proportions) (Cochran, 1952)
Decision Rule
Reject \(H_0\) if the computed \(\chi^2\) exceeds the critical value for the given \(df\) at the chosen significance level (equivalently, if \(p < \alpha\)).
Assumptions (Yates, 1934)
Observations must be independent, categories mutually exclusive, and expected frequencies at least 5 per cell.
## Chi-square GOF: Male vs Female = 50/50
# 1) Keep only male/female (drop NAs and any other categories)
mf <- droplevels(complete_data$gender[complete_data$gender %in% c("male", "female")])
tab_mf <- table(mf)
# 2) Sanity check: must have exactly two categories with >0 counts
if (length(tab_mf) != 2L) stop("Need exactly two non-empty categories: 'male' and 'female'.")
# 3) Run GOF test for equal proportions (50/50)
chisq_test <- chisq.test(tab_mf, p = c(0.5, 0.5))
# 4) Show results
tab_mf # observed counts
## mf
## female male
## 535 465
chisq_test # test output
##
## Chi-squared test for given probabilities
##
## data: tab_mf
## X-squared = 4.9, df = 1, p-value = 0.027
chisq_test$expected # expected counts under H0 (50/50)
## female male
## 500 500
To examine whether the proportions of male and female students are
equal in the sample, a chi-squared goodness-of-fit test was conducted.
The observed counts were 535 females and 465
males. Under the null hypothesis, it was expected that both genders
would be equally represented (50% each).
The chi-squared test results indicated a significant difference in proportions, χ²(1, N = 1000) = 4.90, p = .027.
Since the p-value (.027) is below the conventional
significance threshold of .05, we reject the null hypothesis. This
suggests that the gender distribution in the sample is not equal, with
females being slightly overrepresented compared to males.
Research Question 4: Is there a relationship between mathematics anxiety (as high, medium, and low n-tiles) and gender among students?
The Chi-Square Test of Independence is a statistical method used to determine if there is a significant association or relationship between two categorical variables. It is used when you have two nominal (or ordinal) variables and you want to test whether the distribution of one variable differs across the levels of the other variable.
Both variables must be categorical (i.e., they consist of distinct categories or groups), for example gender (female/male) and mathematics anxiety level (low/medium/high).
The test examines whether the distribution of one variable (e.g., gender) is independent of the distribution of another variable (e.g., mathematics anxiety level). In other words, it assesses whether there is a relationship between two categorical variables.
Chi-Square Critical Values for Various Degrees of Freedom and Significance Levels
| df | 0.10 | 0.05 | 0.01 | 0.001 |
|---|---|---|---|---|
| 1 | 2.71 | 3.84 | 6.64 | 10.83 |
| 2 | 4.60 | 5.99 | 9.21 | 13.82 |
| 3 | 6.25 | 7.82 | 11.34 | 16.27 |
| 4 | 7.78 | 9.49 | 13.28 | 18.47 |
| 5 | 9.24 | 11.07 | 15.09 | 20.52 |
| 6 | 10.64 | 12.59 | 16.81 | 22.46 |
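These critical values can be reproduced in R with qchisq(), using the quantile 1 minus the significance level:
qchisq(p = 0.95, df = 1)  # 3.84 (alpha = .05, df = 1)
qchisq(p = 0.99, df = 2)  # 9.21 (alpha = .01, df = 2)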
The research question asks whether there is a relationship between two categorical variables: mathematics anxiety (with three categories: low, medium, high) and gender (with categories such as male and female). Mathematics anxiety is divided into discrete categories, and gender is likewise categorical, which makes the Chi-square test of independence appropriate for testing the relationship between these two variables.
# Create contingency table
observed_counts <- table(complete_data$gender, complete_data$anxiety_rank)
# Remove any 'other' rows with zero observations
observed_counts <- observed_counts[rowSums(observed_counts) > 0, , drop = FALSE]
# Standardize row names: capitalize properly
rownames(observed_counts) <- gsub("^female$", "Female", gsub("^male$", "Male", rownames(observed_counts)))
# Standardize column names
colnames(observed_counts) <- c("Low Anxiety", "Medium Anxiety", "High Anxiety")
# View the contingency table
print(observed_counts)
##
## Low Anxiety Medium Anxiety High Anxiety
## Female 188 173 174
## Male 159 142 164
# Perform Chi-Square Test of Independence
chisq_test <- chisq.test(observed_counts)
# Output the results
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: observed_counts
## X-squared = 0.875, df = 2, p-value = 0.65
# -------- Cramér's V (effect size) --------
# Manual computation (a bias-corrected version follows if DescTools is available)
n_total <- sum(observed_counts)
r <- nrow(observed_counts)
c <- ncol(observed_counts)
df_min <- min(r - 1, c - 1)
cramers_v <- sqrt(as.numeric(chisq_test$statistic)/(n_total * df_min))
# Optional: bias-corrected version if DescTools is available
if (requireNamespace("DescTools", quietly = TRUE)) {
cramers_v_bc <- DescTools::CramerV(observed_counts, method = "bias.corrected")
} else {
cramers_v_bc <- NA_real_
}
# Simple interpretation helper (Cohen-style)
interpret_v <- function(v) {
  if (is.na(v)) return("NA")
  if (v < 0.1) "negligible"
  else if (v < 0.3) "small"
  else if (v < 0.5) "medium"
  else "large"
}
cat("\n--- Effect Size (Cramér's V) ---\n")
##
## --- Effect Size (Cramér's V) ---
cat(sprintf("Cramér's V: %.3f (%s)\n", cramers_v, interpret_v(cramers_v)))
## Cramér's V: 0.030 (negligible)
if (!is.na(cramers_v_bc)) {
cat(sprintf("Bias-corrected V: %.3f (%s) [DescTools]\n", cramers_v_bc, interpret_v(cramers_v_bc)))
} else {
cat("(Install 'DescTools' for bias-corrected Cramér's V)\n")
}
## Bias-corrected V: 0.030 (negligible) [DescTools]
# Create the basic table
mytable <- table(complete_data$anxiety_rank, complete_data$gender)
# Add margins (sums)
(addmargins(mytable))
##
## female male other Sum
## Low 188 159 0 347
## Medium 173 142 0 315
## High 174 164 0 338
## Sum 535 465 0 1000
A chi-square test of independence was conducted to examine the relationship between gender and anxiety level (low, medium, high). The distribution of anxiety levels did not differ significantly by gender, χ²(2, N = 1000) = 0.88, p = .65.
Effect size, measured by Cramér’s V, was very small (V = .03, bias-corrected V = .03), indicating a negligible association between gender and anxiety levels.
Research Question 5: Is average GPA different from the known population graduate student GPA of 3.42?
The one-sample t-test is a statistical test used to determine whether the mean of a single sample differs significantly from a known or hypothesized population mean.
The hypotheses are \(H_0: \mu = 3.42\) and \(H_1: \mu \neq 3.42\). The one-sample t-test is appropriate for this research question
because it directly compares the sample’s average GPA to the known
population mean \((\mu_0 = 3.42)\),
determining whether any observed difference is statistically
significant. Since GPA is continuous and typically approximately
normally distributed, the t-test is well suited even with smaller
samples (n < 30), as it accounts for sample variability.
If the sample is large (n ≥ 30), the Central Limit Theorem
ensures robustness against minor normality violations. The test yields a
p-value (e.g., p < 0.05 indicates significance) and a
confidence interval, clarifying whether the difference is meaningful or
due to random chance. However, if the data is highly skewed with a small
sample, the Wilcoxon signed-rank test would be a better alternative.
This test efficiently answers whether the sample GPA differs from
3.42 while controlling for sampling error.
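The core test itself is a single call; a minimal sketch using the same gpa column and population mean as the full example further below (the helper function that follows wraps this call with a plot and an interpretation):
# Minimal one-sample t-test against the hypothesized population mean (sketch)
t.test(complete_data$gpa, mu = 3.42, conf.level = 0.95)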
#' Analyze Data with Statistical Test and Visualization (Base R version)
#'
#' Performs a one-sample t-test comparing sample data to a hypothesized population mean,
#' creates a density plot with confidence interval visualization, and provides a statistical
#' interpretation of the results. This generalized version can handle any numeric variable.
#'
#' @param data A data frame containing the variable to analyze (default = NULL)
#' @param x The numeric vector or column name to analyze (default looks for column named 'x')
#' @param mu0 Hypothesized population mean (default = 0)
#' @param conf_level Confidence level for the test and interval (default = 0.95)
#' @param alternative Alternative hypothesis ("two.sided", "less", or "greater")
#' @param plot_title Title for the output plot (default = "Distribution Analysis")
#' @param plot_subtitle Subtitle for the output plot (default = "Sample estimate versus hypothesized population mean")
#'
#' @return A list containing two elements:
#' \itemize{
#' \item test_results - The complete t.test() output object
#' \item summary_stats - Named vector with sample mean and standard deviation
#' }
#'
#' @examples
#' # Create sample data
#' set.seed(123)
#' sample_data <- data.frame(scores = rnorm(100, mean = 75, sd = 10))
#'
#' # Basic usage with column name
#' plot_one_sample_distributions(data = sample_data, x = "scores", mu0 = 70)
plot_one_sample_distributions <- function(data = NULL,
x = NULL,
mu0 = 0,
conf_level = 0.95,
alternative = "two.sided",
plot_title = "Distribution Analysis",
plot_subtitle = "Sample estimate versus hypothesized population mean") {
# Handle different input formats
if (!is.null(data)) {
if (is.character(x)) {
x_vec <- data[[x]]
var_name <- x
} else {
x_vec <- data$x
var_name <- "x"
}
} else {
x_vec <- x
var_name <- deparse(substitute(x))
}
# Validate input and remove NAs
if (is.null(x_vec))
stop("No data provided for analysis")
if (!is.numeric(x_vec))
stop("Input data must be numeric")
# Remove missing values with warning
if (any(is.na(x_vec))) {
warning(sum(is.na(x_vec)), " missing values removed from analysis")
x_vec <- na.omit(x_vec)
}
# Check if we have enough data after NA removal
if (length(x_vec) < 2)
stop("Insufficient non-missing data for analysis (need at least 2 observations)")
# Perform t-test
ttest_result <- t.test(x_vec,
alternative = alternative,
mu = mu0,
conf.level = conf_level)
# Calculate standard deviation
x_sd <- sd(x_vec)
sample_mean <- mean(x_vec)
# Create base R plot
dens <- density(x_vec)
y_max <- max(dens$y) * 1.1
# Save original parameters and set new ones
old_par <- par(no.readonly = TRUE)
on.exit(par(old_par))
par(mgp = c(2.5, 0.5, 0)) # Moves y-label closer (second number)
# Set up plot area
plot(dens,
main = paste0(plot_title, "\n", plot_subtitle),
xlab = var_name,
ylab = "Density",
ylim = c(0, y_max),
col = "white", # Invisible main line (we'll add our own)
frame.plot = FALSE)
# Add confidence interval rectangle
rect(xleft = ttest_result$conf.int[1],
xright = ttest_result$conf.int[2],
ybottom = 0,
ytop = y_max,
col = adjustcolor("#FF7F00", alpha.f = 0.1),
border = NA)
# Add density polygon
polygon(dens, col = adjustcolor("#A6CEE3", alpha.f = 0.2),
border = "#1F78B4", lwd = 1.5)
# Add mean lines
abline(v = sample_mean, col = "red", lty = 1, lwd = 1.5)
abline(v = mu0, col = "darkgreen", lty = 1, lwd = 1.5)
# Add annotations
text(x = sample_mean, y = y_max * 0.9,
labels = bquote(hat(mu) == .(round(sample_mean, 2))),
pos = 4, col = "red", font = 2)
text(x = mu0, y = y_max * 0.8,
labels = bquote(mu[0] == .(mu0)),
pos = 4, col = "darkgreen", font = 2)
# Add caption
mtext(sprintf("Green line shows null hypothesis value\nOrange shaded region represents %.0f%% CI around the sample mean",
conf_level*100),
side = 1, line = 3.5, adj = 0, cex = 0.8, col = "#777777")
# Print interpretation
if (ttest_result$conf.int[1] > mu0 || ttest_result$conf.int[2] < mu0) {
cat("Statistical Interpretation:\n",
sprintf("• The %.0f%% confidence interval (%.2f to %.2f) DOES NOT include the population mean (μ₀ = %.2f).\n",
conf_level*100, ttest_result$conf.int[1], ttest_result$conf.int[2], mu0),
"• This suggests our sample mean is statistically significantly different from the population mean\n",
sprintf("• We reject the null hypothesis at the %.2f significance level (p = %.3f)",
1-conf_level, ttest_result$p.value))
} else {
cat("Statistical Interpretation:\n",
sprintf("• The %.0f%% confidence interval (%.2f to %.2f) INCLUDES the population mean (μ₀ = %.2f).\n",
conf_level*100, ttest_result$conf.int[1], ttest_result$conf.int[2], mu0),
"• This suggests our sample mean is not statistically significantly different from the population mean\n",
sprintf("• We fail to reject the null hypothesis at the %.2f significance level (p = %.3f)",
1-conf_level, ttest_result$p.value))
}
return(list(
test_results = ttest_result,
summary_stats = c(mean = sample_mean, sd = x_sd),
n_removed = ifelse(!is.null(data) && is.character(x),
sum(is.na(data[[x]])),
NA)
))
}
# To save this function to a file:
dump("plot_one_sample_distributions", file = "plot_one_sample_distributions.R")
# Example usage:
plot_one_sample_distributions(data = complete_data, x = "gpa", mu0 = 3.42)
## Statistical Interpretation:
## • The 95% confidence interval (2.48 to 2.55) DOES NOT include the population mean (μ₀ = 3.42).
## • This suggests our sample mean is statistically significantly different from the population mean
## • We reject the null hypothesis at the 0.05 significance level (p = 0.000)
## $test_results
##
## One Sample t-test
##
## data: x_vec
## t = -48.1, df = 999, p-value <2e-16
## alternative hypothesis: true mean is not equal to 3.42
## 95 percent confidence interval:
## 2.4793 2.5530
## sample estimates:
## mean of x
## 2.5162
##
##
## $summary_stats
## mean sd
## 2.51619 0.59386
##
## $n_removed
## [1] 0
A one-sample t-test was conducted to determine whether the mean GPA
of students in the sample (N = 1000) differed significantly
from the national average GPA of 3.42. The sample mean GPA
(M = 2.52, SD = 0.59) was significantly lower
than the population mean, \(t(999) = -48.1, \:
p < 2 × 10^{-16}\), with a 95% confidence
interval of [2.48, 2.55] that does not include the
population mean.
This indicates a highly statistically significant difference between
the sample and population means, leading to rejection of the null
hypothesis at the 0.05 significance level. These findings
suggest that the academic performance of students in this sample, as
measured by GPA, is meaningfully and reliably lower than the national
average.
Research Question 6: Is there a significant difference between students’ average standardized test scores and their GPA?
A paired samples t-test (also called a dependent t-test) is a statistical method used to compare the means of two related groups to determine whether there is a statistically significant difference between them.
The “paired” nature means that each observation in one group is linked to a corresponding observation in the other group — typically because they are taken from the same individual under two conditions, at two time points, or using two measures.
The paired t-test compares the mean differences between two related variables; however, when these variables are measured on vastly different scales—such as test scores ranging from 0 to 100 and GPA ranging from 0 to 4—the difference scores become dominated by the variable with the larger scale. This makes the comparison mathematically valid but practically misleading, since you’re essentially comparing units rather than true differences in performance. To address this, options include standardizing or normalizing the scores before analysis, converting both variables to a common scale (e.g., z-scores), or choosing alternative statistical methods like correlation or regression that account for scale differences without relying on direct difference scores.
#' Compare Paired Distributions with 95% CIs (Base R version)
#'
#' Creates a plot of two overlaid density distributions with shaded 95% confidence intervals
#' around the means, and performs a paired t-test.
#'
#' @param x Numeric vector for first variable
#' @param y Numeric vector for second variable (paired with x)
#' @param x_name Optional display name for variable x (default: 'Variable 1')
#' @param y_name Optional display name for variable y (default: 'Variable 2')
#' @param title Optional plot title (default: 'Comparison of Distributions')
#' @param standardize Logical indicating whether to standardize variables (default: FALSE)
#' @param width Plot width when saving to file (default: 8)
#' @param height Plot height when saving to file (default: 6)
#' @param dpi Plot resolution when saving to file (default: 300)
#'
#' @return Invisibly returns the t-test results. Side effects: creates plot and prints test results.
#'
#' @examples
#' # With vectors directly
#' plot_paired_distributions(group_a_scores, group_b_scores,
#' 'Group A', 'Group B')
plot_paired_distributions <- function(x, y, x_name = "Variable 1", y_name = "Variable 2",
title = "Comparison of Distributions", standardize = FALSE, width = 8, height = 6,
dpi = 300) {
if (length(x) != length(y)) {
stop("Variables must be of equal length for paired comparison")
}
# Remove NA values pairwise
complete_cases <- complete.cases(x, y)
x_complete <- x[complete_cases]
y_complete <- y[complete_cases]
if (sum(!complete_cases) > 0) {
warning(sum(!complete_cases), " pairs removed due to missing values")
}
# Standardize if needed
x_vals <- if (standardize)
scale(x_complete) else x_complete
y_vals <- if (standardize)
scale(y_complete) else y_complete
# Compute stats
ci1 <- t.test(x_vals)$conf.int
ci2 <- t.test(y_vals)$conf.int
mean1 <- mean(x_vals)
mean2 <- mean(y_vals)
# Paired t-test
t_test <- t.test(y_vals, x_vals, paired = TRUE)
# Print results
cat("\n--- Paired t-test Results ---\n")
print(t_test)
cat("\nInterpretation:\n")
if (t_test$p.value < 0.05) {
direction <- ifelse(mean2 > mean1, "higher", "lower")
cat(sprintf("• Significant difference (p = %.3f)\n", t_test$p.value))
cat(sprintf("• %s is significantly %s than %s (mean diff = %.2f)\n", y_name,
direction, x_name, t_test$estimate))
} else {
cat(sprintf("• No significant difference (p = %.3f)\n", t_test$p.value))
}
cat(sprintf("• 95%% CI for difference: [%.2f, %.2f]\n\n", t_test$conf.int[1],
t_test$conf.int[2]))
# Calculate densities
dens1 <- density(x_vals)
dens2 <- density(y_vals)
# Set up plot area
y_max <- max(dens1$y, dens2$y) * 1.1
x_range <- range(c(dens1$x, dens2$x))
# Create main plot
plot(NA, xlim = x_range, ylim = c(0, y_max), main = title, sub = paste("Paired t-test p-value =",
format.pval(t_test$p.value, digits = 3)), xlab = ifelse(standardize, "Standardized Value (z)",
"Value"), ylab = "Density", frame.plot = FALSE)
# Add CI rectangles
rect(xleft = ci1[1], xright = ci1[2], ybottom = 0, ytop = y_max, col = adjustcolor("#E41A1C",
alpha.f = 0.3), border = NA)
rect(xleft = ci2[1], xright = ci2[2], ybottom = 0, ytop = y_max, col = adjustcolor("#377EB8",
alpha.f = 0.3), border = NA)
# Add density polygons
polygon(dens1, col = adjustcolor("#E41A1C", alpha.f = 0.2), border = "#E41A1C",
lwd = 1.5)
polygon(dens2, col = adjustcolor("#377EB8", alpha.f = 0.2), border = "#377EB8",
lwd = 1.5)
# Add mean lines
abline(v = mean1, col = "#E41A1C", lty = 1, lwd = 1.5)
abline(v = mean2, col = "#377EB8", lty = 1, lwd = 1.5)
# Add legend
legend("topright", legend = c(x_name, y_name, "95% CI"), fill = c(adjustcolor("#E41A1C",
0.2), adjustcolor("#377EB8", 0.2), adjustcolor("gray", 0.1)), border = c("#E41A1C",
"#377EB8", NA), bty = "n")
# Add caption
mtext("Solid lines show means; shaded areas = 95% CI", side = 1, line = 3, adj = 0,
cex = 0.8, col = "gray40")
invisible(t_test)
}
# To save this function to a file:
dump("plot_paired_distributions", file = "paired_distribution_plot.R")
# Standardize to a comparable scale
complete_data$gpa_z <- scale(complete_data$gpa)
complete_data$test_z <- scale(complete_data$math_score)
ok <- complete.cases(complete_data$gpa_z, complete_data$test_z)
x <- as.numeric(complete_data$test_z[ok])
y <- as.numeric(complete_data$gpa_z[ok])
tt_paired <- t.test(y, x, paired = TRUE)
tt_paired
##
## Paired t-test
##
## data: y and x
## t = -1.81e-13, df = 999, p-value = 1
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.035601 0.035601
## sample estimates:
## mean difference
## -3.2788e-15
# Paired Cohen's d
diffs <- y - x
d_paired <- mean(diffs)/sd(diffs)
d_paired
## [1] -5.7151e-15
# Paired t-test on standardized scores
plot_paired_distributions(complete_data$test_z, complete_data$gpa_z, x_name = "Math Scores",
    y_name = "GPA")
##
## --- Paired t-test Results ---
##
## Paired t-test
##
## data: y_vals and x_vals
## t = -1.81e-13, df = 999, p-value = 1
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.035601 0.035601
## sample estimates:
## mean difference
## -3.2788e-15
##
##
## Interpretation:
## • No significant difference (p = 1.000)
## • 95% CI for difference: [-0.04, 0.04]
A sample of 1000 students was used to assess whether
there was a significant difference between their standardized test
scores and standardized GPA using a paired samples t-test. Initial
screening indicated minor deviations from normality but not severe
enough to warrant a nonparametric approach.
The paired samples t-test indicated no significant difference between
standardized test scores and standardized GPA, \(t(999) = −1.81×10^{−13}, \: p=1.00\). The
95% confidence interval for the mean difference ranged from
−0.036 to 0.036, with a negligible mean
difference of \(−3.28×10^{−15}\).
These findings suggest that, after standardization, students’ average test scores and GPA are statistically equivalent.
Effect size is a statistical measure that quantifies the magnitude of a difference between two groups or the strength of a relationship between variables. Unlike p-values, which indicate whether an effect is statistically significant (i.e., whether the result is likely due to chance), effect size tells us how large or meaningful the effect is. In other words, while p-values tell you whether an effect exists, effect size tells you how big the effect is. The correlation coefficient, regression coefficient, and mean difference are all examples of effect sizes. A larger absolute value of an effect size indicates a stronger effect. Effect sizes are important in statistical power analyses and are complementary to hypothesis testing.
To calculate the effect size (specifically Cohen’s d) for a two-sample comparison, we take the difference in means between the two groups and standardize it by the pooled standard deviation:
\[ d = \frac{Mean_1 − Mean_2}{\text{Pooled Standard Deviation}}, \]
where \(Mean_1\) and \(Mean_2\) are the two group means.
Pooled Standard Deviation \((s_p)\) is calculated as:
\[ s_p = \sqrt{\frac{(n_1 - 1) \cdot s_1^2 + (n_2 - 1) \cdot s_2^2}{n_1 + n_2 - 2}}, \]
where \(n_1\) and \(n_2\) are the group sample sizes and \(s_1\) and \(s_2\) are the group standard deviations.
In the social sciences, Cohen suggested the following guidelines for interpreting the magnitude of a Cohen’s d effect size:
| Effect size | Cohen’s d |
|---|---|
| Small | 0.20 ≤ d < 0.50 |
| Medium | 0.50 ≤ d < 0.80 |
| Large | d ≥ 0.80 |
Cohen, J. (1988). Statistical Power Analysis for the Behavioral Sciences (2nd ed.). Lawrence Erlbaum Associates.
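A minimal sketch of computing Cohen’s d by hand from the formula above, using two hypothetical numeric vectors g1 and g2 (the effsize call below performs the same computation for our data):
# Manual Cohen's d using the pooled standard deviation (sketch)
cohens_d_manual <- function(g1, g2) {
    n1 <- length(g1); n2 <- length(g2)
    s_p <- sqrt(((n1 - 1) * var(g1) + (n2 - 1) * var(g2)) / (n1 + n2 - 2))  # pooled SD
    (mean(g1) - mean(g2)) / s_p
}
set.seed(42)
cohens_d_manual(rnorm(200, mean = 0.3), rnorm(200, mean = 0))  # ~0.3 by construction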
## compute Cohen's d
effsize::cohen.d(complete_data$test_z, complete_data$gpa_z)
##
## Cohen's d
##
## d estimate: 3.3243e-15 (negligible)
## 95 percent confidence interval:
## lower upper
## -0.087705 0.087705
## compute Hedges' g
effsize::cohen.d(complete_data$test_z, complete_data$gpa_z, hedges.correction = TRUE)
##
## Hedges's g
##
## g estimate: 3.3231e-15 (negligible)
## 95 percent confidence interval:
## lower upper
## -0.087672 0.087672
We conducted an analysis of effect sizes to quantify the magnitude of
difference between groups using both Cohen’s \(d\) and Hedges’ \(g\). The results revealed a negligible
effect size for both measures (Cohen’s \(d =
3.32 \times 10^{-15}\); Hedges’ \(g =
3.32 \times 10^{-15}\)), with 95% confidence
intervals spanning from -0.088 to 0.088 in
both cases. These effect sizes are effectively zero, falling well below
conventional thresholds for even small effects (Cohen, 1988).
The symmetrical confidence intervals crossing zero indicate that any observed differences between groups were smaller than would be expected by random sampling variation alone. The identical magnitude of both effect size measures suggests no meaningful adjustment was needed for potential small sample size bias. These results provide strong evidence that the groups being compared are statistically equivalent on the measured variable, with any apparent differences being trivial in magnitude and lacking practical significance.
Note: Effect sizes of this magnitude (near zero) typically occur when comparing groups with nearly identical distributions. Researchers should consider whether the measurement instrument had sufficient sensitivity to detect meaningful differences or whether the groups were truly equivalent on the construct being measured (Lakens, 2013). The results suggest that for practical purposes, no meaningful difference exists between the groups in this analysis.
Wilcoxon Signed-Rank Test
The Wilcoxon signed-rank test is a nonparametric statistical procedure for analyzing paired data when the assumptions of the paired t-test cannot be met. As a rank-based alternative to the dependent samples t-test, it evaluates whether the median of paired differences differs significantly from zero.
Key Distinction from the Rank-Sum Test: while both are nonparametric rank-based tests, the signed-rank test applies to paired (dependent) samples, whereas the rank-sum (Mann-Whitney) test compares two independent groups.
Appropriate for: paired or repeated measurements whose difference scores are non-normal, ordinal, or contain outliers.
Hypotheses: \(H_0\): the median of the paired differences is zero; \(H_1\): the median difference is not zero.
Critical Assumptions: pairs are independent of one another, and the distribution of differences is roughly symmetric about its median.
Advantages Over Parametric Alternatives: robustness to outliers and skew, and validity with small samples (n < 30).
The test evaluates whether the rank-transformed differences are symmetrically distributed about zero; a significant result (p < α) indicates systematic differences exist, as in the sketch below.
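A brief sketch of the signed-rank test applied to the standardized scores compared earlier (assuming the gpa_z and test_z columns computed above):
# Wilcoxon signed-rank test on the paired standardized scores (sketch)
wilcox.test(as.numeric(complete_data$gpa_z),
            as.numeric(complete_data$test_z),
            paired = TRUE, conf.int = TRUE)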
Research Question 7: Is there a difference between male and female students on GPA in the student population?
The independent samples t-test (also called a two-sample t-test) is a parametric statistical test used to determine whether there is a statistically significant difference between the means of two independent groups.
This question seeks to compare the average GPA between two distinct groups, male and female students. The independent samples t-test is appropriate because the outcome (GPA) is continuous and the two groups are mutually exclusive and independent.
Assumptions of the Independent Samples t-test: independence of observations, approximate normality within each group (or a large sample), and homogeneity of variances, assessed next with Levene’s test.
library(car)
leveneTest(gpa ~ gender, data = complete_data)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 2.45 0.12
## 998
Levene’s Test for Equality of Variances was conducted to evaluate the
assumption of homogeneity of variance for GPA scores between male and
female students. The test was not statistically significant,
F(1,998) = 2.45, p = 0.12, indicating that the
variances in GPA did not differ significantly between groups. Therefore,
the assumption of equal variances was met, and the standard independent
samples t-test with equal variances assumed was deemed appropriate.
t.test(gpa ~ gender, data = complete_data, alternative = "two.sided", var.equal = TRUE) # Set to FALSE if variances are unequal
##
## Two Sample t-test
##
## data: gpa by gender
## t = -0.741, df = 998, p-value = 0.46
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -0.101805 0.045998
## sample estimates:
## mean in group female mean in group male
## 2.5032 2.5311
An independent samples t-test was conducted to compare GPA scores
between male and female high school students. There was no statistically
significant difference in GPA between females (M = 2.50)
and males (M = 2.53), t(998) = -0.74,
p = 0.46. The 95% confidence interval for the
mean difference ranged from -0.10 to 0.05,
suggesting that any observed difference is likely due to random
variation. These results indicate that gender was not associated with a
significant difference in GPA in this sample.
# Density Plot Comparison with Normal Curves (Base R version)
tryCatch({
# Check if required columns exist
if (!all(c("gpa", "gender") %in% colnames(complete_data))) {
stop("Required columns 'gpa' and/or 'gender' not found in the data")
}
# Filter and clean data
plot_data <- subset(complete_data, !is.na(gpa) & gender %in% c("male", "female"))
# Check if we have enough data
if (length(unique(plot_data$gender)) < 2) {
stop("Need both male and female groups to create comparison plot")
}
# Split data by gender
male_gpa <- plot_data$gpa[plot_data$gender == "male"]
female_gpa <- plot_data$gpa[plot_data$gender == "female"]
# Calculate summary statistics
male_mean <- mean(male_gpa)
male_sd <- sd(male_gpa)
female_mean <- mean(female_gpa)
female_sd <- sd(female_gpa)
# Create density plots
male_dens <- density(male_gpa)
female_dens <- density(female_gpa)
# Set up plot area
ymax <- max(male_dens$y, female_dens$y) * 1.1
xrange <- range(plot_data$gpa)
# Create main plot
plot(0, type = "n", xlim = xrange, ylim = c(0, ymax), main = "GPA Distribution Comparison by Gender",
xlab = "GPA", ylab = "Density")
# Add density polygons
polygon(male_dens, col = adjustcolor("dodgerblue", alpha.f = 0.2), border = "dodgerblue",
lwd = 2)
polygon(female_dens, col = adjustcolor("tomato", alpha.f = 0.2), border = "tomato",
lwd = 2)
# Add normal curves
curve(dnorm(x, male_mean, male_sd), add = TRUE, col = "dodgerblue", lty = 2,
lwd = 1.5)
curve(dnorm(x, female_mean, female_sd), add = TRUE, col = "tomato", lty = 2,
lwd = 1.5)
# Add mean lines
abline(v = male_mean, col = "dodgerblue", lty = 3, lwd = 1.5)
abline(v = female_mean, col = "tomato", lty = 3, lwd = 1.5)
# Add legend
legend("topright", legend = c("Male", "Female", "Normal Curve", "Mean"), col = c("dodgerblue",
"tomato", "black", "black"), lty = c(1, 1, 2, 3), lwd = c(2, 2, 1.5, 1.5),
fill = c(adjustcolor("dodgerblue", 0.2), adjustcolor("tomato", 0.2), NA,
NA), border = c("dodgerblue", "tomato", NA, NA), bty = "n")
# Show summary statistics
cat("\nSummary Statistics:\n")
cat("Male - Mean:", round(male_mean, 2), "SD:", round(male_sd, 2), "\n")
cat("Female - Mean:", round(female_mean, 2), "SD:", round(female_sd, 2), "\n")
}, error = function(e) {
message("Error creating plot: ", e$message)
# Fallback plot if something goes wrong
if (exists("plot_data") && nrow(plot_data) > 0) {
plot(density(plot_data$gpa), main = "Simple GPA Density Plot", xlab = "GPA")
} else {
message("No valid data available for plotting")
}
})
##
## Summary Statistics:
## Male - Mean: 2.53 SD: 0.62
## Female - Mean: 2.5 SD: 0.57
The Mann-Whitney U test, also known as Wilcoxon rank-sum test, is a nonparametric alternative to the independent samples t-test that compares two independent groups.
Hypotheses - Null hypothesis (\(H_0\)): The distributions of both groups are equal - Alternative hypothesis (\(H_1\)): One distribution tends to have larger values than the other (two-sided test)
Key Characteristics: the test is rank-based, robust to outliers, and valid for ordinal or non-normal data.
Effect Size Measures: common choices include \(r = Z/\sqrt{N}\) and the rank-biserial correlation (see the sketch after the test output below).
wilcox.test(gpa ~ gender,
data = complete_data,
alternative = "two.sided",
exact = NULL, # Let R decide whether to compute exact p-value
correct = TRUE, # Apply continuity correction
conf.int = TRUE, # Compute confidence interval
conf.level = 0.95)
##
## Wilcoxon rank sum test with continuity correction
##
## data: gpa by gender
## W = 120044, p-value = 0.34
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.110042 0.039974
## sample estimates:
## difference in location
## -0.039932
A Wilcoxon rank-sum test (Mann-Whitney U test) was conducted to
compare GPA distributions between male and female students. The test
indicated no significant difference in GPA between females and males,
W = 120044, p = 0.34. The 95%
confidence interval for the difference in location ranged from
-0.11 to 0.04, with an estimated median
difference of -0.04. These results suggest that GPA
distributions for males and females do not differ significantly in this
sample.
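As a follow-up, the rank-biserial correlation mentioned above can be derived from the reported W statistic; a sketch assuming the same gpa and two-level gender data:
# Rank-biserial correlation from the Mann-Whitney U statistic (sketch)
w  <- wilcox.test(gpa ~ gender, data = complete_data)$statistic
n1 <- sum(complete_data$gender == "female", na.rm = TRUE)
n2 <- sum(complete_data$gender == "male", na.rm = TRUE)
unname(1 - (2 * w) / (n1 * n2))  # about 0.03 here: a negligible effect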
The point-biserial correlation coefficient (\(r_{pb}\)) measures the strength and direction of association between:
Key Properties: \(r_{pb}\) ranges from −1 to +1 and is a special case of Pearson’s r.
Appropriate Use Cases: one continuous variable paired with a naturally (truly) dichotomous variable, such as gender differences in test scores.
Inappropriate Use Cases: artificially dichotomized continuous variables, for which the biserial correlation is preferred (see the comparison table below).
Mathematical Relationship \[ r_{pb} = \frac{M_1 - M_0}{s_X} \sqrt{\frac{n_1 n_0}{n(n-1)}}, \]
Formula Components
| Symbol | Definition | Notes |
|---|---|---|
| M1 | Mean of X for group coded as 1 | Group coding is arbitrary but must be consistent |
| M0 | Mean of X for group coded as 0 | Reference/baseline group |
| sX | Pooled standard deviation of X | Calculated across both groups |
| n1 | Sample size for group 1 | Larger group sizes increase statistical power |
| n0 | Sample size for group 0 | Unequal group sizes are acceptable but reduce power |
Key Points: as with other correlation coefficients, magnitudes of roughly 0.1, 0.3, and 0.5 mark small, medium, and large effects.
Example Calculation: For test scores (X) by gender (Y: 0 = male, 1 = female):
- \(M_1\) = 78.3 (female mean), \(M_0\) = 72.1 (male mean)
- \(s_X\) = 12.4, \(n_1\) = 45, \(n_0\) = 55
- \(r_{pb}\) = (78.3 − 72.1)/12.4 × √[(45 × 55)/(100 × 99)] ≈ 0.25
Interpretation Guide
| Value Range | Interpretation |
|---|---|
| -1.0 | Perfect negative association |
| 0.0 | No association |
| +1.0 | Perfect positive association |
Key Considerations
Variable Coding
The sign of \(r_{pb}\) depends on which group is coded as 1. For example, if treatment = 1 and control = 0, a positive \(r_{pb}\) indicates higher X in the treatment group.
Effect Size Conversion
Convert to Cohen’s d using:
\[
d = \frac{2r_{pb}}{\sqrt{1-r_{pb}^2}}
\]
T-test Equivalence
The point-biserial correlation is mathematically identical to Pearson’s r computed with the dichotomous variable coded 0/1, and it relates to the independent-samples t statistic (equal variances assumed) by
\[ r_{pb} = \frac{t}{\sqrt{t^2 + df}}, \]
where \(t\) is the two-sample t statistic and \(df = n_1 + n_0 - 2\).
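A short simulated sketch of these identities (vector names hypothetical; the group parameters echo the worked example above):
# Point-biserial r as Pearson's r with 0/1 coding, and its link to t (sketch)
set.seed(1)
score <- c(rnorm(55, 72.1, 12.4), rnorm(45, 78.3, 12.4))  # continuous X
group <- rep(c(0, 1), c(55, 45))                          # natural dichotomy
r_pb <- cor(score, group)
tt <- t.test(score ~ factor(group), var.equal = TRUE)
r_from_t <- -unname(tt$statistic) / sqrt(unname(tt$statistic)^2 + unname(tt$parameter))
d_from_r <- 2 * r_pb / sqrt(1 - r_pb^2)  # Cohen's d conversion from above
c(r_pb = r_pb, r_from_t = r_from_t, d = d_from_r)  # r_pb and r_from_t agree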
| Situation | Recommended Method | Data Requirements | Effect Size Range | Typical Use Cases |
|---|---|---|---|---|
| Ordinal/ranked data | Spearman’s \(\rho\) (rho) | Two ordinal variables | -1 to +1 | Non-normal data, rank correlations |
| Artificially dichotomized continuous | Biserial correlation | One continuous, one dichotomized | -1 to +1 | Test scores split into pass/fail |
| Both variables continuous | Pearson’s r | Two continuous, normally distributed | -1 to +1 | Height vs weight, linear relationships |
| Both variables dichotomous | Phi coefficient (φ) | Two binary variables | -1 to +1 | 2×2 contingency tables |
| One continuous, one true dichotomous | Point-biserial \(r_{pb}\) | One continuous, one natural binary | -1 to +1 | Gender differences in test scores |
| Non-linear relationships | Distance correlation | Any variable types | 0 to +1 | Non-linear associations |
Research Question 8: Do students with different levels of test anxiety (Low, Medium, High) differ significantly in their academic performance, as measured by GPA and average test scores (Math, Reading, Writing)?
To examine whether student academic performance varies by test anxiety level, we conducted both parametric and nonparametric analyses. Parametric tests included one-way ANOVA and Welch’s ANOVA, while the nonparametric alternative was the Kruskal–Wallis test. Homogeneity of variance and normality assumptions were assessed using Levene’s test, Bartlett’s test, and the Shapiro–Wilk test.
An average test score variable was created by taking the mean of each student’s math, reading, and writing scores.
One Way Anova (Completely Randomized Design): One-way analysis of variance (one-way ANOVA) is a statistical method used to compare the means of two or more independent groups. This parametric test assumes independent observations, approximately normal residuals, and equal variances across groups.
Key Principles: ANOVA partitions total variability into between-group and within-group components and compares them with an F ratio.
When the between-group variance is significantly larger than the within-group variance, we reject the null hypothesis, indicating at least one group mean differs from the others.
# Calculate average test score for each student
complete_data$avg_test_score <- rowMeans(complete_data[, c("math_score", "reading_score",
"writing_score")], na.rm = TRUE)
# Check factor levels (optional)
table(complete_data$anxiety_rank)
##
## Low Medium High
## 347 315 338
# One-way ANOVA
anova_result <- aov(avg_test_score ~ anxiety_rank, data = complete_data)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## anxiety_rank 2 178 88.8 0.71 0.49
## Residuals 997 125024 125.4
# Check ANOVA assumptions (normality of residuals, homogeneity of variance)
shapiro.test(residuals(anova_result))
##
## Shapiro-Wilk normality test
##
## data: residuals(anova_result)
## W = 0.997, p-value = 0.11
library(car)
leveneTest(avg_test_score ~ anxiety_rank, data = complete_data)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 3.63 0.027 *
## 997
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# If assumptions are violated, run non-parametric Kruskal-Wallis test
kruskal_result <- kruskal.test(avg_test_score ~ anxiety_rank, data = complete_data)
kruskal_result
##
## Kruskal-Wallis rank sum test
##
## data: avg_test_score by anxiety_rank
## Kruskal-Wallis chi-squared = 1.38, df = 2, p-value = 0.5
A one-way between-subjects ANOVA was conducted to compare the average standardized test scores (averaged across math, reading, and writing) among students grouped by their self-reported test anxiety level: Low, Medium, and High.
The Shapiro-Wilk test of normality on the ANOVA residuals indicated
no significant deviation from a normal distribution
(W = 0.997, p = 0.11). Since the p-value
(0.11) is greater than the conventional alpha level of
0.05, we fail to reject the null hypothesis that the
residuals are normally distributed. Additionally, visual inspection
(e.g., Q-Q plot or histogram) confirmed no extreme outliers or
substantial skewness.
However, Levene’s Test for Homogeneity of Variance revealed a
statistically significant violation of the assumption of equal variances
across groups, F(2, 997) = 3.63, p = 0.027.
Despite the heterogeneity of variances, ANOVA is generally robust to
minor violations, particularly with large and approximately equal sample
sizes (Field,
2018). The analysis revealed no statistically significant
differences among anxiety groups, F(2, 997) = 0.71,
p = 0.49.
To confirm this result without assuming equal variances, a Kruskal-Wallis rank-sum test was also conducted. This nonparametric test likewise found no significant differences in average test scores between anxiety levels, \(\chi^2(2) = 1.38,\: p = 0.50\). These results suggest that students’ average test performance was not meaningfully influenced by their level of test anxiety. No pairwise comparisons were conducted, as the overall group difference was not significant.
Both parametric and nonparametric analyses consistently indicated that test anxiety levels did not meaningfully influence average test performance. No post hoc comparisons were pursued due to the nonsignificant omnibus results.
Tests if variances are equal across groups (homoscedasticity):
- p < .05 → reject H₀ (variances unequal → use Welch ANOVA or Kruskal-Wallis)
- p ≥ .05 → fail to reject H₀ (proceed with standard ANOVA)
stats::bartlett.test(gpa ~ anxiety_rank, data = complete_data)
##
## Bartlett test of homogeneity of variances
##
## data: gpa by anxiety_rank
## Bartlett's K-squared = 4.68, df = 2, p-value = 0.096
Bartlett’s test for homogeneity of variances did not detect
statistically significant differences in variance across anxiety rank
groups, K²(2) = 4.68, p = 0.096. Since
p > 0.05, the assumption of equal variances holds,
supporting the use of standard ANOVA.
More robust alternative to Bartlett’s test
car::leveneTest(complete_data$gpa, complete_data$anxiety_rank)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 3.22 0.041 *
## 997
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Levene’s test (using the median due to robustness to non-normality)
indicated a statistically significant violation of homogeneity of
variance across anxiety rank groups, F(2, 997) = 3.22,
p = 0.041. This suggests the assumption of equal variances
required for standard ANOVA may not be fully met.
# Create boxplot using base R
boxplot(gpa ~ anxiety_rank, data = complete_data,
main = "GPA Distribution by Anxiety Rank (Unequal Variances)",
xlab = "Anxiety Rank",
ylab = "GPA",
col = c("#E41A1C", "#377EB8", "#4DAF4A"), # Using colorblind-friendly colors
border = "darkgray",
frame.plot = FALSE,
outpch = NA, # Remove outlier points (we'll add jittered points instead)
ylim = range(complete_data$gpa, na.rm = TRUE))
# Add jittered points to show individual observations
stripchart(gpa ~ anxiety_rank, data = complete_data,
method = "jitter",
jitter = 0.2,
pch = 16,
col = adjustcolor("black", alpha.f = 0.3),
vertical = TRUE,
add = TRUE)
# Add grid lines for better readability
grid(nx = NA, ny = NULL, col = "lightgray", lty = "dotted")
# Add mean points
means <- aggregate(gpa ~ anxiety_rank, data = complete_data, mean, na.rm = TRUE)
points(1:3, means$gpa, pch = 18, col = "black", cex = 1.5)
# Add legend
legend("topleft",
legend = c("Low", "Medium", "High"),
fill = c("#E41A1C", "#377EB8", "#4DAF4A"),
title = "Anxiety Rank",
bty = "n")
Homogeneity of Variance Check: GPA by Anxiety Rank (Low / Medium / High)
Prior to comparing GPA across anxiety rank groups, we evaluated the assumption of homogeneity of variances using two complementary tests: Bartlett’s test and Levene’s test.
Interpretation
The results are mixed. Levene’s test—more robust to deviations from normality—suggests a violation of the equal variances assumption, while Bartlett’s test does not. Given Levene’s significance and its robustness, we proceed under the assumption that the homogeneity of variances assumption is not fully met.
Key Considerations:
- Test Properties: Levene’s test is robust to non-normality (median-centered); Bartlett’s test is more powerful when normality holds.
- Contextual Factors: normally distributed residuals (Shapiro-Wilk p = 0.11), a moderate variance ratio (2.1:1 between extreme groups), and a large sample size (N = 1000) that increases test sensitivity.
ANOVA remains reasonably robust given the large, approximately equal group sizes, but we conclude the data show moderate evidence of variance heterogeneity.
Methodological Implications and Recommended Actions: report Welch’s ANOVA (which does not assume equal variances) alongside a nonparametric check, and include 95% confidence intervals.
# Welch's ANOVA (adjusts for unequal variances)
oneway.test(gpa ~ anxiety_rank, data = complete_data)
##
## One-way analysis of means (not assuming equal variances)
##
## data: gpa and anxiety_rank
## F = 0.731, num df = 2, denom df = 664, p-value = 0.48
# Kruskal-Wallis (nonparametric)
kruskal.test(gpa ~ anxiety_rank, data = complete_data)
##
## Kruskal-Wallis rank sum test
##
## data: gpa by anxiety_rank
## Kruskal-Wallis chi-squared = 1.41, df = 2, p-value = 0.49
We conducted both Welch’s ANOVA (accounting for unequal variances)
and Kruskal-Wallis tests to compare GPAs across anxiety rank groups.
Neither approach detected significant differences (Welch’s
F(2, 664.5) = 0.73, p = .482; Kruskal-Wallis
χ²(2) = 1.41, p = .495), with effect sizes
indicating negligible group differences (ω² and ε² both effectively
zero; see the computation below).
Comparison of GPA Across Anxiety Rank Groups (Low / Medium / High)
To evaluate whether GPA differs by test anxiety level, we conducted both parametric and nonparametric analyses, accounting for possible heterogeneity of variances.
Welch’s ANOVA is a robust alternative to one-way ANOVA when group variances are unequal. Unlike traditional ANOVA, it doesn’t assume equal variances and adjusts the degrees of freedom using the Welch-Satterthwaite equation, providing more accurate p-values with unequal variances or sample sizes.
The Kruskal-Wallis one-way analysis of variance by ranks is a non-parametric method used to test whether multiple independent samples originate from the same distribution. It is suitable for comparing two or more groups with potentially unequal sample sizes and does not assume a normal distribution. The Kruskal-Wallis test generalizes the Mann-Whitney U test to more than two groups and serves as the non-parametric alternative to the one-way ANOVA.
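For reference, the classic epsilon-squared effect size can be computed directly from the Kruskal-Wallis H statistic; a sketch (note that the rstatix helper used below reports a bias-adjusted eta-squared, which can come out slightly negative):
# Classic epsilon-squared from the Kruskal-Wallis H statistic (sketch)
kw <- kruskal.test(gpa ~ anxiety_rank, data = complete_data)
n <- sum(complete.cases(complete_data$gpa, complete_data$anxiety_rank))
unname(kw$statistic) / (n - 1)  # epsilon^2 = H / (n - 1)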
# Function to check and install packages
check_and_install <- function(pkg) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg, dependencies = TRUE)
}
}
# Required packages
required_pkgs <- c("tidyverse", "rstatix", "ggpubr", "effectsize", "report")
# Install if needed
invisible(sapply(required_pkgs, check_and_install))
# Load packages
library(tidyverse)
library(rstatix) # For welch_anova_test() and kruskal_test()
library(effectsize)
library(report) # For automated reporting
# WELCH'S ANOVA (USING BASE R FOR RELIABILITY)
welch_results <- oneway.test(gpa ~ anxiety_rank, data = complete_data, var.equal = FALSE)
# Extract components for reporting
welch_F <- welch_results$statistic
welch_df1 <- welch_results$parameter[1]
welch_df2 <- welch_results$parameter[2]
welch_p <- welch_results$p.value
# Calculate omega squared effect size
omega_sq <- (welch_F - 1)/(welch_F + (welch_df2 + 1)/welch_df1)
# KRUSKAL-WALLIS TEST (USING RSTATIX)
kruskal_results <- kruskal_test(complete_data, gpa ~ anxiety_rank)
epsilon_sq <- kruskal_effsize(complete_data, gpa ~ anxiety_rank)
# RESULTS REPORTING
cat("\n=== STATISTICAL ANALYSIS RESULTS ===\n")
##
## === STATISTICAL ANALYSIS RESULTS ===
cat("\n[1] Welch's ANOVA (Unequal Variances)\n")
##
## [1] Welch's ANOVA (Unequal Variances)
cat(sprintf("F(%d, %.1f) = %.2f, p = %.3f\n", welch_df1, welch_df2, welch_F, welch_p))
## F(2, 664.5) = 0.73, p = 0.482
cat(sprintf("Effect size (ω²) = %.3f\n", omega_sq))
## Effect size (ω²) = -0.001
cat("\n[2] Kruskal-Wallis Test (Nonparametric)\n")
##
## [2] Kruskal-Wallis Test (Nonparametric)
cat(sprintf("χ²(%d) = %.2f, p = %.3f\n", kruskal_results$df, kruskal_results$statistic,
kruskal_results$p))
## χ²(2) = 1.41, p = 0.495
cat(sprintf("Effect size (ε²) = %.3f\n", epsilon_sq$effsize))
## Effect size (ε²) = -0.001
# INTERPRETATION
cat("\n\n=== INTERPRETATION ===\n")
##
##
## === INTERPRETATION ===
if (welch_p < 0.05) {
cat("• Statistically significant difference found between anxiety levels\n")
} else {
cat("• No statistically significant difference between anxiety levels\n")
}
## • No statistically significant difference between anxiety levels
cat("• Both tests agree in their conclusions\n")
## • Both tests agree in their conclusions
cat("• Effect sizes suggest ", ifelse(abs(omega_sq) < 0.01, "negligible", "small"),
" practical importance\n", sep = "")
## • Effect sizes suggest negligible practical importance
We conducted both Welch’s ANOVA (accounting for potential unequal
variances) and Kruskal-Wallis tests to compare GPAs across anxiety
levels. Neither test revealed statistically significant differences
(Welch’s F(2, 664.5) = 0.73, p = .482;
Kruskal-Wallis χ²(2) = 1.41, p = .495). Effect
sizes were negligible (ω² = -0.001;
ε² = -0.001), suggesting less than 0.1% of
variance in GPA was explained by anxiety levels. The consistent null
findings across both parametric and non-parametric approaches, combined
with our large sample size (N = 1,000), provide robust
evidence that test anxiety levels are not meaningfully associated with
academic performance as measured by GPA in this population.
A means plot is a graphical representation that displays the average values (means) of a continuous variable (like GPA) across different categories or groups (like anxiety ranks) along with some measure of variability or uncertainty around those means.
Means Plots Show: the group means, a measure of uncertainty around each mean (such as confidence intervals or standard errors), and how the means shift across groups.
# Install packages if they are not already installed
required_packages <- c("ggstatsplot", "ggplot2", "car")
for (pkg in required_packages) {
if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
install.packages(pkg, dependencies = TRUE)
library(pkg, character.only = TRUE)
}
}
ggbetweenstats(data = complete_data, x = anxiety_rank, y = gpa, pch = 1, type = "nonparametric",
pairwise.display = "significant") + labs(title = "GPA Distribution by Anxiety Rank")
When designing tests or surveys—particularly those employing Likert scales—it is essential to ensure that the instrument is valid, reliable, and capable of producing meaningful data.
Math anxiety is a psychological construct characterized by feelings of fear, tension, or apprehension when engaging in mathematical tasks. Accurately measuring this construct requires a carefully developed Likert scale that reflects sound psychometric principles.
The following is a step-by-step guide to developing a robust scale for assessing math anxiety.
Test Development Steps: define the construct, draft items, obtain expert review, pilot the items, and evaluate reliability and validity.
A Likert scale measures attitudes or perceptions by asking respondents to indicate their level of agreement with statements. Common formats include 4-, 5-, and 7-point agreement scales (e.g., “strongly disagree” to “strongly agree”).
| Consideration | Recommendation |
|---|---|
| Number of points | 5–7 points optimal (balances sensitivity & reliability) |
| Labeling | Anchor all points (not just endpoints) |
| Neutral midpoint | Include unless forced choice is required |
| Direction consistency | Keep positive/negative orientation uniform |
Essential Qualities of Effective Likert Scale Question Wording
These principles build upon each other and help ensure that items are valid and interpretable.
Why This Flow Matters
Practical Example
Developing the item: “I feel safe on campus at night”
This sequence ensures each question in your Likert scale measures what you intend—without confounding factors.
Math anxiety typically involves a combination of psychological, emotional, and physical responses that individuals experience when faced with mathematical tasks or situations. These responses can vary in intensity but generally fall into the following categories:
Key Features of a Math Anxiety Scale:
| Likert Item | Polarity | Theoretical Basis | Underlying Mechanism |
|---|---|---|---|
| Cognitive (Negative Thought Patterns/Working Memory) | |||
|
(+) | Working memory overload (Ramirez et al., 2013) | Prefrontal cortex resource depletion (Eysenck, 2012) |
|
(–) | Cognitive interference reduction (Ashcraft & Kirk, 2001) | Default mode network regulation (Raichle & Gusnard, 2001) |
|
(–) | Anticipatory anxiety decrease (Grupe & Nitschke, 2013) | Amygdala–prefrontal connectivity (Phelps, 2004) |
| Affective (Emotional Responses) | |||
|
(–) | Frustration tolerance (Goetz et al., 2013) | Cognitive reappraisal capacity (Gross, 2002) |
|
(+) | Emotional reactivity (Pekrun & Linnenbrink-Garcia, 2017) | Limbic system reactivity (LeDoux, 1996) |
|
(–) | Adaptive motivation (Dweck, 2008) | Goal-directed behavior maintenance (Carver & Scheier, 2001) |
| Physiological (Bodily Reactions) | |||
|
(+) | Sympathetic arousal (Lyons & Beilock, 2012) | Sympathetic nervous system activation (Sapolsky, 2004) |
|
(–) | Physiological regulation (Porges, 2007) | Parasympathetic tone preservation (Thayer & Lane, 2012) |
|
(+) | Somatic tension (Owens et al., 2012) | Neuromuscular tension feedback (Blascovich, 2000) |
| Behavioral (Approach–Avoidance) | |||
|
(+) | Avoidance behavior (Núñez-Peña & Suarez-Pellicioni, 2016) | Negative reinforcement cycle (Skinner, 1953) |
|
(–) | Approach motivation (Elliot & Church, 2006) | Competence motivation system (White, 1959) |
|
(+) | Procrastination cycle (Solomon & Rothblum, 1984) | Temporal discounting bias (Ainslie, 1975) |
| Value Beliefs (Utility/Attainment) | |||
|
(+) | Utility value perception (Eccles, 1983) | Future-time perspective (Zimbardo & Gerrig, 1999) |
|
(+) | Attainment value concern (Wigfield & Meece, 1988) | Possible selves framework (Markus & Wurf, 1986) |
|
(+) | Performance impact (Ashcraft, 2002) | Academic self-concept formation (Shavelson & Bolus, 1976) |
| Self-Efficacy (Confidence/Persistence) | |||
|
(–) | Growth mindset (Dweck, 2008) | Attributional style (Weiner, 1985) |
|
(–) | Mastery orientation (Ames, 1992) | Challenge–threat appraisal (Blascovich & Mendes, 2001) |
|
(+) | Learned helplessness (Abramson et al., 1978) | Effort withdrawal threshold (Eccles & Wigfield, 1998) |
| Social–Cultural (Comparative/Help-Seeking) | |||
|
(–) | Social safety (Turner, 2002) | Help-seeking as coping (Karabenick, 2003) |
|
(–) | Positive reinforcement (Bandura, 1997) | Verbal persuasion effects (Bandura, 1997) |
|
(+) | Social comparison (Festinger, 1954) | Upward social comparison (Festinger, 1954) |
Each pair (X, Y) should represent an independent case, meaning there is no autocorrelation within either variable. This is particularly important when dealing with time-series data or data with a temporal component.
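When observations carry a meaningful order (for example, collection time), this independence assumption can be probed with a Durbin-Watson test on a fitted model; a sketch using the car package (only informative if the row order is meaningful):
# Durbin-Watson test for autocorrelated residuals (sketch)
library(car)
durbinWatsonTest(lm(gpa ~ anxiety, data = complete_data))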
GPA Distribution
# Density plot of GPA vs a normal curve with the same parameters
x <- na.omit(complete_data$gpa) # Remove missing values
# Set up plot area
plot(NA, xlim = c(min(x), max(x)), ylim = c(0, max(density(x)$y, dnorm(mean(x), mean(x),
sd(x))) * 1.1), xlab = "GPA", ylab = "Density", main = "Density Plot of GPA vs Normal Distribution",
frame.plot = FALSE)
# Normal Curve
curve(dnorm(x, mean = mean(x), sd = sd(x)), col = adjustcolor("lightgray", alpha.f = 0.8),
lwd = 5, add = TRUE)
# Add normal distribution label
text(mean(x), dnorm(mean(x), mean(x), sd(x)) * 0.9, col = "lightgray", bquote(N(.(round(mean(x),
2)), .(round(sd(x), 2))^2)))
# Standard deviation lines
abline(v = c(mean(x) - 2 * sd(x), mean(x) - sd(x), mean(x), mean(x) + sd(x), mean(x) +
2 * sd(x)), lty = 2, col = "gray50")
# Add SD labels
text(c(mean(x) - 2 * sd(x), mean(x) - sd(x), mean(x) + sd(x), mean(x) + 2 * sd(x)),
rep(par("usr")[4] * 0.05, 4), labels = c("-2SD", "-1SD", "+1SD", "+2SD"), pos = c(2,
2, 4, 4), col = "gray50", cex = 0.8)
# Kernel Density Plot
lines(density(x), col = adjustcolor("black", alpha.f = 0.8), lwd = 2)
# Add legend
legend("topright", legend = c("Observed GPA", "Normal Distribution"), col = c("black",
"lightgray"), lwd = c(2, 5), bty = "n")
# Add sample size info
mtext(paste("n =", length(x)), side = 1, line = 4, adj = 1, cex = 0.8)
The above code visualizes the GPA distribution and overlays it with a normal curve. The lines represent standard deviations from the mean. The kernel density plot provides a smooth estimate of the distribution.
# Histogram of GPA
x <- na.omit(complete_data$gpa)
h <- hist(x, breaks = 12, col = "lightgray", xlab = "GPA", main = "Histogram with Normal Curve")
# Add a Normal Curve (Thanks to Peter Dalgaard)
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
yfit <- yfit * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col = "black", lwd = 4)
This histogram provides another way of visualizing the GPA distribution, with the overlaid normal curve allowing for a direct comparison of the sample’s distribution with a normal distribution.
Anxiety Distribution
# Density plot of Anxiety vs a normal curve with the same parameters
x <- na.omit(complete_data$anxiety)
# Normal Curve
plot(seq(min(x), max(x), length = 1000), dnorm(seq(min(x), max(x), length = 1000),
    mean = mean(x), sd = sd(x)), col = adjustcolor("lightgray", alpha.f = 0.8), lwd = 5,
    type = "l", xlab = "Anxiety", ylab = "Density", main = "Density Plot of Anxiety vs Normal Distribution")
text(2, 1, col = "lightgray", expression(paste(Nu, "(", mu(Anxiety), ", ", sigma(Anxiety),
    ")")))
# Distribution lines
abline(v = c(mean(x) - 2 * sd(x), mean(x) - sd(x), mean(x), mean(x) + sd(x), mean(x) +
2 * sd(x)), lty = 2)
# Kernel Density Plot
lines(density(x), col = adjustcolor("black", alpha.f = 0.8), lwd = 2)
# Histogram of Anxiety
x <- na.omit(complete_data$anxiety)
h <- hist(x, breaks = 20, col = "lightgray", xlab = "Anxiety", main = "Histogram with Normal Curve")
# Add a Normal Curve (Thanks to Peter Dalgaard)
xfit <- seq(0, 30, length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
yfit <- yfit * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col = "black", lwd = 4)
Conclusion
Visual assessments (density plots and histograms) of GPA suggest that it is approximately normally distributed, meeting the assumptions for Pearson’s correlation. The anxiety scores show only mild visual deviations from normality, which are checked formally below. In cases where a distribution is clearly non-normal, especially when skewed, it is recommended to either transform the variable (for example, a square root or log transformation) or use a nonparametric alternative such as Spearman’s rank correlation:
# Apply square root transformation to Anxiety data
x_transformed_sqrt <- sqrt(x)
# Density plot of transformed Anxiety (square root) vs a normal curve
plot(seq(min(x_transformed_sqrt), max(x_transformed_sqrt), length = 1000), dnorm(seq(min(x_transformed_sqrt),
max(x_transformed_sqrt), length = 1000), mean = mean(x_transformed_sqrt), sd = sd(x_transformed_sqrt)),
col = adjustcolor("lightgray", alpha.f = 0.8), lwd = 5, type = "l", xlab = "Square Root Transformed Anxiety",
ylab = "Density", main = "Density Plot of Square Root Transformed Anxiety vs Normal Distribution")
# Distribution lines for transformed Anxiety
abline(v = c(mean(x_transformed_sqrt) - 2 * sd(x_transformed_sqrt), mean(x_transformed_sqrt) -
sd(x_transformed_sqrt), mean(x_transformed_sqrt), mean(x_transformed_sqrt) +
sd(x_transformed_sqrt), mean(x_transformed_sqrt) + 2 * sd(x_transformed_sqrt)),
lty = 2)
# Kernel Density Plot for transformed Anxiety
lines(density(x_transformed_sqrt), col = adjustcolor("black", alpha.f = 0.8), lwd = 2)
shapiro.test(complete_data$anxiety)
##
## Shapiro-Wilk normality test
##
## data: complete_data$anxiety
## W = 0.998, p-value = 0.16
paste("Skewness:", round(psych::skew(complete_data$anxiety), 3))
## [1] "Skewness: -0.025"
paste("Kurtosis:", round(psych::kurtosi(complete_data$anxiety), 3))
## [1] "Kurtosis: -0.243"
We fail to reject the null hypothesis of normality (W = 0.998,
p = .16), and the skewness (−0.025) and kurtosis (−0.243) estimates
are close to zero, indicating the anxiety scores are approximately normal.
Although the Shapiro-Wilk test did not reject normality here, we will still employ Spearman’s rank-order correlation (ρ) as a robustness check, since it does not assume normality, is insensitive to outliers, and captures monotonic (not only linear) association.
if (!require("GGally")) install.packages("GGally")
if (!require("ggpubr")) install.packages("ggpubr")
library(GGally)
library(ggpubr)
# Generate the plot
ggpairs(complete_data[, c("anxiety", "gpa")], lower = list(continuous = wrap("smooth",
method = "lm")), upper = list(continuous = function(data, mapping) {
ggally_cor(data, mapping) + stat_cor(aes(label = ..p.label..), color = "red",
label.y = 0.3)
}))
fit <- lm(gpa ~ anxiety, data = complete_data)
# Assessing outliers: none are extreme after Bonferroni adjustment
car::outlierTest(fit, n.max = 5, cutoff = Inf)  # Bonferroni p-value for most extreme obs
## rstudent unadjusted p-value Bonferroni p
## 344 -3.0425 0.0024074 NA
## 553 -2.8591 0.0043364 NA
## 431 2.5324 0.0114820 NA
## 390 2.5107 0.0122080 NA
## 143 2.5069 0.0123370 NA
# leverage plots
# id.methods:
#Label all observations with Pearson residuals greater than 2 in absolute value
# which(abs(residuals(fit, type = "pearson")) > 2)
# label largest values of Cook's distance
# cooks.distance(fit)
# select cases according to their Mahalanobis distance from (mean(x), mean(y))
# "mahal"
car::leveragePlots(fit,
main = "Regression Leverage Plots",
id.method = list("mahal", "cook"), # Better identification methods
id.n = 10,
id.col = "red")
Cook’s Distance is a widely utilized indicator of the impact of a data point in least squares regression analysis. This metric quantifies the influence of removing a particular observation. When performing regression analysis, data points that exhibit large residuals (outliers) and/or high leverage can skew the results and precision of the regression. In such cases, data points with a high Cook’s distance are regarded as worthy of further scrutiny in the analysis.
There are different opinions regarding what cut-off values to use for spotting highly influential points. A simple operational guideline of \(D_i > 1\) has been suggested (Cook & Weisberg, 1982). Others have indicated that \(D_i > 4/n\), where \(n\) is the number of observations, might be used (Bollen & Jackman, 1990).
# Cook's D plot identify D values > 4/(n-k-1), where k = 1
cutoff <- 4/((nrow(complete_data) - 2))
# Plot each diagnostic separately
plot(fit, which = 1, cook.levels = cutoff) # Residuals vs Fitted
plot(fit, which = 2, cook.levels = cutoff) # QQ plot
plot(fit, which = 3, cook.levels = cutoff) # Scale-Location
plot(fit, which = 4, cook.levels = cutoff) # Cook's distance
dim(na.omit(complete_data))
## [1] 1000 36
# Influence Plot
car::influencePlot(fit, id.method = "noteworthy", main = "Influence Plot \n gpa ~ anxiety",
    col = "red", col.sub = "red", id.col = "blue", id.n = 5, id.cex = 0.7, sub = "Circle size is proportional to Cook's Distance")
## StudRes Hat CookD
## 344 -3.04254 0.0010352 0.0047572
## 553 -2.85914 0.0022368 0.0090977
## 628 2.24484 0.0083491 0.0211283
## 782 -1.84300 0.0096454 0.0165009
## 794 -0.53798 0.0096454 0.0014104
Quantile-Quantile Plots
# Quantile-Quantile Plots
car::qqPlot(fit, main = "QQ Plot", xlab = deparse(substitute(Anxiety)), ylab = deparse(substitute(GPA))) #qq plot for studentized resid
## [1] 344 553
# Normal quantile-quantile plot
qqnorm(complete_data$gpa, main = "GPA \n Normal Q-Q Plot", pch = 16)
qqline(complete_data$gpa, col = "red", lwd = 2)
# Normal quantile-quantile plot
qqnorm(complete_data$anxiety, main = "Anxiety \n Normal Q-Q Plot", pch = 16)
qqline(complete_data$anxiety, col = "red", lwd = 2)
Both variables should ideally be normally distributed. This is assessed visually via histograms and density plots. While Pearson’s correlation is robust to violations of normality when the sample size is large, small sample sizes may require a more careful check.
mvnormtest::mshapiro.test(t(na.omit(cbind(complete_data$anxiety, complete_data$gpa))))
##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.998, p-value = 0.16
# Robustness check: Spearman's rank correlation
cor.test(complete_data$anxiety, complete_data$gpa, method = "spearman") # Spearman's rank correlation
##
## Spearman's rank correlation rho
##
## data: complete_data$anxiety and complete_data$gpa
## S = 1.65e+08, p-value = 0.78
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.0088564
The multivariate Shapiro-Wilk test showed no significant departure from normality (W = 0.998, p = .16). As a robustness check, the relationship between student anxiety scores and GPA was also examined using Spearman’s ρ, a nonparametric measure of monotonic association appropriate for non-normal data. No statistically significant correlation was found between anxiety and GPA (ρ = .009, p = .78), consistent with the Pearson result reported below. The near-zero coefficient indicates a negligible effect size; the negative association suggested by prior literature was not observed, which may reflect the simulated nature of the data or confounding variables not accounted for in this analysis.
To assess the normality of the residuals, one can examine a histogram of the residuals. A histogram that displays a symmetrical bell-shaped distribution centered at zero is a good indication that the normality assumption is met. If the histogram shows evidence of non-normality, the model’s underlying assumptions may have been violated and should be investigated further.
# distribution of studentized residuals
sresid1 <- MASS::studres(fit)
# studentized residuals histogram
hist(sresid1, freq = FALSE, main = "Distribution of Studentized Residuals", col = "lightgray",
border = "white", breaks = 24, xlim = c(-3, 3))
# normal curve
xfit <- seq(-3, 3, length = 100)
yfit <- dnorm(xfit)
lines(xfit, yfit, col = "black", lwd = 5)
# gridlines
grid(nx = NULL, ny = NULL, col = "lightgray", lty = "dotted", lwd = par("lwd"), equilogs = TRUE)
The Non-Constant Variance (NCV) Score Test (Breusch-Pagan/Cook-Weisberg test) evaluates whether regression residuals exhibit consistent variance (homoscedasticity) or varying variance (heteroscedasticity).
# Evaluate homoscedasticity non-constant error variance test
car::ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.33347, Df = 1, p = 0.564
The Breusch-Pagan test for non-constant variance indicated no evidence of heteroscedasticity (χ²(1) = 0.33, p = .56), confirming that the homoscedasticity assumption was met.
library(ggplot2)
ggplot(fit, aes(.fitted, .resid)) + geom_point(alpha = 0.6) + geom_hline(yintercept = 0,
color = "red") + geom_smooth(se = FALSE, method = "loess", color = "blue") +
labs(x = "Fitted Values", y = "Residuals", title = "Residuals vs Fitted Values",
subtitle = "Check for funnel-shaped patterns") + theme_minimal()
Pearson product-moment correlation coefficient
# More robust calculation with p-value and CI
cor_test <- cor.test(complete_data$gpa, complete_data$anxiety, method = "pearson",
exact = FALSE, conf.level = 0.95) # cor.test drops incomplete pairs automatically
# Covariance matrix
cov_matrix <- cov(complete_data[, c("gpa", "anxiety")], use = "complete.obs")
cat("Pearson Correlation Results:\n", "r =", round(cor_test$estimate, 3), "\n", "95% CI = [",
round(cor_test$conf.int[1], 3), ",", round(cor_test$conf.int[2], 3), "]\n", "t(",
cor_test$parameter, ") =", round(cor_test$statistic, 2), "\n", "p-value =", format.pval(cor_test$p.value),
"\n\n", "Covariance Results:\n", "cov(gpa, anxiety) =", round(cov_matrix[1, 2],
3))
## Pearson Correlation Results:
## r = 0.024
## 95% CI = [ -0.038 , 0.086 ]
## t( 998 ) = 0.76
## p-value = 0.448
##
## Covariance Results:
## cov(gpa, anxiety) = 0.187
Correlation Matrix
# histograms on the diagonal
panel.hist <- function(x, ...) {
# Set up plot area
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5))
# Calculate optimal number of bins using Freedman-Diaconis rule
h <- hist(x,
breaks = "FD", # Freedman-Diaconis rule
plot = FALSE)
# Scale counts to [0,1] range
y <- h$counts
y <- y/max(y, na.rm = TRUE)
# Plot histogram bars
rect(h$breaks[-length(h$breaks)], 0,
h$breaks[-1], y,
col = "dodgerblue",
border = "white")
# Add density curve (with error handling)
tryCatch({
d <- density(x, na.rm = TRUE,
bw = "SJ", # Sheather-Jones bandwidth
adjust = 1.5) # Smoother curve
d$y <- d$y/max(d$y, na.rm = TRUE)
lines(d, lwd = 2, col = "tomato")
# Add rug plot
rug(x, col = "gray30", ticksize = 0.03)
}, error = function(e) message("Density estimation failed: ", e$message))
# Add mean/median lines
abline(v = mean(x, na.rm = TRUE), col = "darkgreen", lwd = 2, lty = 2)
abline(v = median(x, na.rm = TRUE), col = "purple", lwd = 2, lty = 3)
}
# scatter plot with tolerance ellipsoids and lowess line on the lower-triangle
panel.smooth <- function(x, y, col = par("col"), bg = NA, pch = 21, cex = 0.5, col.smooth = "red",
span = 2/3, iter = 3, ...) {
require(cluster, warn.conflicts = FALSE, quietly = TRUE)
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
ok <- is.finite(x) & is.finite(y)
if (any(ok))
lines(stats::lowess(x[ok], y[ok], f = span, iter = iter), col = col.smooth,
lwd = 2, ...)
# classical and robust cov.matrix ellipsoids
X <- cbind(x, y)
C.ls <- cov(X)
m.ls <- colMeans(X)
# calculates a 99% ellipsoid
d2.99 <- qchisq(0.99, df = 2)
# classical cov.matrix ellipsoids
lines(ellipsoidPoints(C.ls, d2.99, loc = m.ls), lwd = 2, lty = 3, col = "purple")
if (require(MASS)) {
Cxy <- cov.rob(cbind(x, y))
# robust cov.matrix ellipsoids
lines(ellipsoidPoints(Cxy$cov, d2 = d2.99, loc = Cxy$center), lty = 3, lwd = 2,
col = "brown")
}
}
# correlations with significance on the upper panel
panel.cor <- function(x, y, method = "pearson", digits = 3, cex.cor = 1.2, no.col = FALSE) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- cor(x, y, method = method)
ra <- cor.test(x, y, method = method)$p.value
txt <- format(c(r, 0.123456789), digits = digits)[1]
# mark significant correlations
prefix <- ""
if (ra <= 0.1)
prefix <- "."
if (ra <= 0.05)
prefix <- "*"
if (ra <= 0.01)
prefix <- "**"
if (ra <= 0.001)
prefix <- "***"
if (no.col) {
color <- "red"
if (r < 0) {
if (ra <= 0.001)
sig <- 4 else sig <- 3
} else {
if (ra <= 0.001)
sig <- 2 else sig <- 1
}
} else {
sig <- 1
if (ra <= 0.001)
sig <- 2
color <- 2
if (r < 0)
color <- 4
}
txt <- paste(txt, prefix, sep = "\n")
if (missing(cex.cor))
cex.cor <- 0.8/strwidth(txt)
# scale text size with |r|, with a floor so near-zero correlations stay legible
text(0.5, 0.5, txt, cex = cex.cor * (0.5 + abs(r)), col = "maroon")
}
# Install and load required packages
if (!require("MASS")) install.packages("MASS")
if (!require("psych")) install.packages("psych")
library(MASS)
library(psych)
# Create enhanced matrix plot
pairs(~gpa + anxiety, data = complete_data, cex.labels = 1.5, labels = c("GPA", "Anxiety Score"),
lower.panel = panel.smooth, upper.panel = panel.cor, diag.panel = panel.hist,
main = "Enhanced Scatterplot Matrix", gap = 0.5, oma = c(3, 3, 6, 3), na.action = na.omit)
Preliminary data screening for bivariate regression is similar to that for Pearson correlation. First, the researcher examines the univariate distributions of both X and Y to determine if they are normally distributed, if there are any univariate outliers, and if the range of scores for both X and Y is sufficient to avoid issues with restricted range. Next, the researcher examines the scatter plot to evaluate whether the relationship between X and Y is linear, if the variance of Y scores is consistent across X levels, and if there are any outliers that need to be addressed. While Pearson’s r can be used for both quantitative and dichotomous X and Y variables, for bivariate regression, the Y (dependent) variable is assumed to be quantitative, and the X (predictor) variable can be either quantitative or dichotomous.
Moreover, it is essential that the sample accurately represents the population in order to make inferences, and observations of the variables should be independent of each other. This means that each score on X must be independent of other X scores, and each score on Y must be independent of other Y scores. The errors should be uncorrelated, implying that the variance-covariance matrix of the errors is diagonal, and each non-zero element is the variance of the error. The variance of the error should remain constant across observations, also known as homoscedasticity. If homoscedasticity does not hold, weighted least squares or alternative methods may need to be employed.
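The screening steps above can be sketched compactly in base R; a minimal sketch, assuming complete_data as before:
# Univariate distributions, outliers, and the bivariate scatter in one panel
op <- par(mfrow = c(2, 2))
hist(complete_data$anxiety, main = "Anxiety", xlab = "Anxiety")
hist(complete_data$gpa, main = "GPA", xlab = "GPA")
boxplot(complete_data$anxiety, horizontal = TRUE, main = "Anxiety: univariate outliers")
plot(gpa ~ anxiety, data = complete_data, main = "Linearity and spread")
par(op)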
Outliers are individual data points that deviate from the typical pattern in the data, and they can substantially affect normality and homogeneity of variance. The first step is to ensure that there are no univariate outliers in each group of the independent variable for any of the dependent variables; this assumption is analogous to the one made in one-way ANOVA, but applies to each dependent variable in a MANOVA analysis. Univariate outliers are the familiar kind observed in t-tests or ANOVAs. Multivariate outliers, by contrast, are cases with an atypical combination of scores on the dependent variables.
# Install and load required packages
if (!require("mvoutlier")) install.packages("mvoutlier")
if (!require("robustbase")) install.packages("robustbase")
library(mvoutlier)
library(ggplot2)
# Prepare data matrix (automatically handles NAs)
data_matrix <- data.matrix(na.omit(complete_data[, c("anxiety", "gpa")]))
# Run PCOUT algorithm with diagnostics
set.seed(123) # For reproducibility
out <- pcout(data_matrix,
makeplot = TRUE,
explvar = 0.95, # Explain 95% of variance
crit.M1 = 0.9, # Relaxed criteria for small datasets
crit.c1 = 5, # Default cutoff for weight function
crit.M2 = 0.9,
crit.c2 = 0.99)
# Extract results
outlier_scores <- out$wfinal01 # 0=outlier, 1=regular
outlier_flag <- out$wfinal01 < 0.5 # TRUE if outlier
cat("Outlier Summary:\n", "Total observations:", nrow(complete_data), "\n", "Definite outliers (score <0.25):",
sum(out$wfinal01 < 0.25), "\n", "Suspect observations (0.25≤score<0.5):", sum(out$wfinal01 >=
0.25 & out$wfinal01 < 0.5), "\n", "Mean outlier score:", round(mean(out$wfinal01),
3))
## Outlier Summary:
## Total observations: 1000
## Definite outliers (score <0.25): 13
## Suspect observations (0.25≤score<0.5): 0
## Mean outlier score: 0.987
# Temper outliers by winsorizing all numeric columns at the 1st/99th percentiles
complete_data <- as.data.frame(complete_data) # Ensure it's a data frame
numeric_cols <- sapply(complete_data, is.numeric)
complete_data[numeric_cols] <- apply(complete_data[numeric_cols], 2, function(x) {
q <- quantile(x, c(0.01, 0.99), na.rm = TRUE)
x[x < q[1]] <- q[1]
x[x > q[2]] <- q[2]
x
})
str(complete_data)
## 'data.frame': 1000 obs. of 36 variables:
## $ id : num 11 11 11 11 11 ...
## $ gender : Factor w/ 3 levels "female","male",..: 1 2 2 2 2 2 1 1 2 1 ...
## $ race : Factor w/ 4 levels "group A","group B",..: 1 2 3 3 1 3 2 3 4 4 ...
## $ parental_education: Ord.factor w/ 6 levels "some high school"<..: 3 3 5 1 3 5 5 3 4 2 ...
## $ lunch : Factor w/ 3 levels "free","reduced",..: 3 1 1 1 1 1 1 2 3 2 ...
## $ test_prep : Factor w/ 2 levels "completed","none": 2 1 2 2 2 1 1 1 2 2 ...
## $ math_score : num 38 69 44 64 45 38 48 68 53 82 ...
## $ reading_score : num 48 69 69 54 44 64 63 60 71 82 ...
## $ writing_score : num 70 83 48 73 41 48 68 83 89 71 ...
## $ gpa : num 1.68 2.81 1.78 2.25 1.18 1.6 2.09 2.62 2.69 3.05 ...
## $ C1 : num 4 2 4 3 3 1 4 2 2 2 ...
## $ C2 : num 5 3 3 3 4 2 4 2 2 1 ...
## $ C3 : num 3 2 3 4 3 2 4 3 2 3 ...
## $ A1 : num 3 1 4 4 4 1 4 4 3 3 ...
## $ A2 : num 3 1 4 3 3 3 4 4 4 2 ...
## $ A3 : num 4 1 5 3 4 1 3 3 3 4 ...
## $ P1 : num 2 2 4 3 3 1 2 4 5 3 ...
## $ P2 : num 3 1 5 2 4 1 2 5 4 3 ...
## $ P3 : num 3 3 3 2 5 2 2 4 3 4 ...
## $ B1 : num 2 1 3 2 4 3 4 4 4 1 ...
## $ B2 : num 4 1 3 2 4 1 4 4 4 1 ...
## $ B3 : num 3 1 2 2 4 2 3 2 4 1 ...
## $ V1 : num 3 3 2 3 4 1 3 3 1 3 ...
## $ V2 : num 4 2 2 3 3 2 2 4 1 2 ...
## $ V3 : num 3 1 2 4 2 1 2 4 2 3 ...
## $ E1 : num 4 3 4 5 4 1 4 4 2 2 ...
## $ E2 : num 4 3 4 4 4 1 3 4 2 2 ...
## $ E3 : num 3 3 3 4 5 1 4 4 2 3 ...
## $ S1 : num 4 4 3 2 5 2 2 5 3 3 ...
## $ S2 : num 4 5 5 2 4 4 4 3 2 3 ...
## $ S3 : num 4 5 5 3 4 2 2 4 2 3 ...
## $ anxiety : num 72 48 73 63 80 35 66 76 57 52 ...
## $ anxiety_rank : Ord.factor w/ 3 levels "Low"<"Medium"<..: 3 3 3 3 3 1 3 2 1 2 ...
## $ gpa_z : num -1.408 0.495 -1.24 -0.448 -2.25 ...
## $ test_z : num -1.8609 0.2555 -1.4513 -0.0858 -1.383 ...
## $ avg_test_score : num 52 73.7 53.7 63.7 43.3 ...
Mahalanobis’ distance (MD) is a statistical tool used to determine whether cases are multivariate outliers. Squared MD values are evaluated against a chi-square distribution, conventionally at a significance level of p < .001.
The table below presents the critical chi-square values for a range
of degrees of freedom (2 to 10) at a critical alpha level of
0.001.
If the maximum MD value exceeds the critical chi-square value for
\(df=k\) (k being the number of
predictor variables in the model) at a critical alpha value of
0.001, it suggests the presence of one or more multivariate
outliers.
| df | Critical value |
|---|---|
| 2 | 13.82 |
| 3 | 16.27 |
| 4 | 18.47 |
| 5 | 20.52 |
| 6 | 22.46 |
| 7 | 24.32 |
| 8 | 26.13 |
| 9 | 27.88 |
| 10 | 29.59 |
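The table can be reproduced directly from the chi-square quantile function:
# Critical chi-square values at alpha = 0.001 for df = 2 through 10
round(qchisq(0.999, df = 2:10), 2)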
# Select only numeric columns and drop NAs (or impute if preferred)
df_clean <- na.omit(complete_data[, c("anxiety", "gpa")])
# Compute Mahalanobis distances
D2 <- mahalanobis(x = df_clean, center = colMeans(df_clean), cov = cov(df_clean))
# Critical value for χ²(df=2, α=0.001) ≈ 13.82 (adjust as needed)
critical_value1 <- qchisq(0.999, df = 2) # 13.82 for α=0.001
# Calculate the critical value for a 95% confidence level (α = 0.05)
critical_value2 <- qchisq(0.95, df = 2)
# Top 5 largest distances
head(sort(D2, decreasing = TRUE), 5)
## 628 852 716 782 140
## 10.3420 9.4453 9.1146 8.5914 8.4811
# Plot with better labeling
plot(D2, pch = 19, col = ifelse(D2 > critical_value2, "red", "blue"), main = expression("Outlier Detection (Mahalanobis " *
D^2 * ")"), ylab = expression(D^2), xlab = "Observation Index")
# Add critical line
abline(h = critical_value1, col = "red", lty = 2)
abline(h = critical_value2, col = "red", lty = 2)
# Label only extreme outliers
outliers <- which(D2 > critical_value2)
if (length(outliers) > 0) {
text(outliers, D2[outliers], labels = outliers, pos = 1, col = "red", cex = 0.8)
}
plot(density(D2, bw = "nrd0", na.rm = TRUE), main = "Squared Mahalanobis distances, n = 1000, p = 2",
lwd = 4)
rug(D2)
qqplot(qchisq(ppoints(length(D2)), df = 2), D2, main = expression("Q-Q plot of Mahalanobis" *
~D^2 * " vs. quantiles of" * ~chi[2]^2), xlab = expression(chi[2]^2 * " quantiles"),
ylab = expression(D^2), xlim = c(0, 15), ylim = c(0, 15))
abline(0, 1, col = "red", lwd = 2)
# Calculate the inverse weights
inv_weights <- 1/out$wfinal
# Determine which points are above the chi-square threshold (qchisq(0.999, 2) ~ 13.82)
above_threshold <- inv_weights > 13.82
# Create border and fill color vectors
border_colors <- rep("blue", length(inv_weights)) # All points have blue border
fill_colors <- ifelse(above_threshold, "red", NA) # Only points above threshold are filled red
# Plot with colored points (pch = 21 allows separate border and fill colors)
plot(inv_weights, cex = 4, col = border_colors, bg = fill_colors, pch = 21, lwd = 1.5)
title("Outlier detection based on pcout")
text(50, 15, expression(chi[df == 2]^2 == 13.82), cex = 1, col = "red")
abline(h = 13.82, col = "red", lwd = 2, lty = 2)
# Label each point with its observation index
text(seq_along(inv_weights), inv_weights, labels = seq_along(inv_weights), col = "blue", cex = 1)
outliers <- mvoutlier::aq.plot(na.omit(data.matrix(complete_data[, c("anxiety", "gpa")])),
delta = qchisq(0.975, df = ncol(complete_data[, c("anxiety", "gpa")])), quan = 1/2,
alpha = 0.05)
# Summarize the outlier indicator rather than printing all 1000 values
table(outliers$outliers)
## 
## FALSE 
##  1000
fit <- lm(gpa ~ anxiety, data = complete_data)
summary(fit)
##
## Call:
## lm(formula = gpa ~ anxiety, data = complete_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.355 -0.415 0.012 0.413 1.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.44699 0.09366 26.13 <2e-16 ***
## anxiety 0.00110 0.00144 0.76 0.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.589 on 998 degrees of freedom
## Multiple R-squared: 0.000581, Adjusted R-squared: -0.000421
## F-statistic: 0.58 on 1 and 998 DF, p-value: 0.446
A simple linear regression was conducted to examine whether anxiety
predicted GPA. The results indicated that anxiety was not a significant
predictor of GPA, b=0.0011, SE=0.0014,
t(998)=0.76, p=.45. The intercept was
significant, b=2.45, SE=0.09,
t(998)=26.13, p<.001. The overall model
explained less than 1% of the variance in GPA,
R² = .001.
Residual diagnostics indicated that the assumptions of linearity,
homoscedasticity, and independence were reasonably met. The
Durbin-Watson test suggested no significant autocorrelation of
residuals, DW=1.93, p=.28. The Non-constant
Variance Score Test showed no evidence of heteroscedasticity,
χ²(1) = 0.36, p = .55. Cook’s distance identified
six potentially influential cases, but none appeared to unduly impact
the regression results.
In summary, anxiety was not significantly associated with GPA in this sample.
Assumption: The normality of the residuals in the linear regression model should be checked to ensure its validity. Two commonly used methods are histograms with superimposed normal curves and Normal P-P plots. The former shows the shape of the residual distribution directly, while the latter compares the cumulative probabilities of the residuals against those of a normal distribution.
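The figures in this section use histograms and Q-Q plots; for completeness, a Normal P-P plot can be built in a few lines. A minimal sketch, assuming the fit object above:
# Normal P-P plot of standardized residuals
sres <- rstandard(fit)
n_res <- length(sres)
plot(ppoints(n_res), pnorm(sort(sres)), pch = 16,
xlab = "Empirical cumulative probability", ylab = "Theoretical normal probability",
main = "Normal P-P Plot of Residuals")
abline(0, 1, col = "red", lwd = 2) # points near this line indicate normal residuals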
Four plots are produced to assess the residuals: (1) residuals versus
fitted values, (2) a Q-Q plot of standardized residuals, (3) a
scale-location plot (square roots of standardized residuals versus
fitted values), and (4) a plot of residuals versus leverage, which adds
bands corresponding to Cook’s distances of 0.5 and
1. The curvature of the red line in the residuals versus
fitted values plot should be examined to check for the presence of a
quadratic or other non-linear model. The Q-Q plot of standardized
residuals should be used to check whether the residuals are normally
distributed. The scale-location plot should be examined to check if the
variance is constant, i.e., if the standard deviation of residuals
appears to be approximately constant. If the red line in this plot is
strongly tilted up or down, that is a cause for concern.
The residuals versus leverage plot is used to identify any overly influential data points.
op = par(mfrow = c(2, 2), family = "serif")
plot(fit, main = "Plot fit", col = rgb(0, 100, 0, 50, maxColorValue = 255), pch = 16)
par(op)
# Plots empirical quantiles of a variable, or of studentized residuals from a
# linear model, against theoretical quantiles of a comparison distribution.
car::qqPlot(fit, main = "Assessing Multivariate Outliers") #qq plot for studentized resid
## [1] 431 712
The leverage of an observation is a measure of its influence on the regression model. It indicates how much the predicted value would change if the observation is shifted one unit in the y-direction. The value of leverage ranges between 0 and 1, where a point with zero leverage has no impact on the regression model, and a point with a leverage of 1 forces the regression line to pass through it.
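Leverage can be inspected directly through the hat values; a minimal sketch, using the fit object and the common 2(k + 1)/n rule of thumb:
h <- hatvalues(fit)
k <- length(coef(fit)) - 1 # number of predictors
threshold <- 2 * (k + 1)/length(h) # common rule of thumb for high leverage
head(sort(h, decreasing = TRUE)) # largest leverages
sum(h > threshold) # count of high-leverage points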
Sall (1990) and Cook & Weisberg (1991) proposed a generalization of added-variable plots for linear models with multiple-df terms. The leverage plot is a rescaled version of the usual added-variable plot when a term has only 1 df.
car::leveragePlots(fit) # leverage plots
These functions construct added-variable (also called partial-regression) plots for linear and generalized linear models.
In the field of applied statistics, a partial regression plot is a graphical tool that helps to visualize the impact of adding a new variable to a model that already includes one or more independent variables. This type of plot can also be referred to as an added variable plot, an adjusted variable plot, or an individual coefficient plot.
When conducting a linear regression with only one independent variable, a scatter plot of the response variable against the independent variable can provide insight into the relationship between the two variables. However, when there is more than one independent variable, analyzing the relationship becomes more complex. Simply plotting the response variable against each independent variable individually does not account for the influence of the other independent variables included in the model.
Partial regression plots are formed by (1) regressing the response against all predictors except \(X_i\) and keeping the residuals, (2) regressing \(X_i\) against the remaining predictors and keeping those residuals, and (3) plotting the first set of residuals against the second; a hand-built sketch follows.
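To make the construction concrete, here is a minimal sketch for a hypothetical two-predictor setting; math_score is used as the second predictor purely for illustration, since the fitted model in this section has only one predictor:
# Added-variable plot for anxiety, built by hand
fit_y <- lm(gpa ~ math_score, data = complete_data) # response on the other predictor
fit_x <- lm(anxiety ~ math_score, data = complete_data) # anxiety on the other predictor
plot(resid(fit_x), resid(fit_y), xlab = "anxiety | math_score", ylab = "gpa | math_score",
main = "Added-Variable Plot (by hand)")
abline(lm(resid(fit_y) ~ resid(fit_x)), col = "red", lwd = 2)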
car::avPlots(fit) # added-variable (partial-regression) plots
# Calculate cutoff dynamically: 4/(n-k-1), where k is the number of predictors
k <- length(coef(fit)) - 1
cutoff <- 4/(nobs(fit) - k - 1)
# Plot Cook's distance
plot(fit,
which = 4,
cook.levels = cutoff,
main = "Cook's Distance for Influence Detection"
)
# Add cutoff line
abline(h = cutoff, col = "red", lty = 2)
# Add a labeled cutoff value on a white background box
# Calculate position (10% from left edge, slightly above cutoff line)
label_x <- par("usr")[1] + diff(par("usr")[1:2])*0.1
label_y <- cutoff * 1.2
# Add white background rectangle
rect(xleft = label_x - strwidth("Cut off = 0.XXX")*0.5,
ybottom = label_y - strheight("0")*0.7,
xright = label_x + strwidth("0.XXX")*1.2,
ytop = label_y + strheight("0")*0.7,
col = "white", border = "red", lwd = 1.5, xpd = TRUE)
# Add text label with cutoff value
text(x = label_x,
y = label_y,
labels = paste0("Cut off = ", round(cutoff, 3)),
col = "red",
font = 2, # Bold
xpd = TRUE,
cex = 0.9) # Slightly smaller text size
cooks_d <- cooks.distance(fit) # Compute Cook's Distances
influential <- cooks_d > cutoff #Identify Influential Points
# Combine into a data frame
cooks_table <- data.frame(
Cooks_Distance = cooks_d,
Is_Influential = influential
)
# Display only influential observations (Cook's D > cutoff)
influential_table <- cooks_table[influential, ]
# Sort by Cook's distance (descending)
influential_table <- influential_table[order(-influential_table$Cooks_Distance), ]
# Print the table
head(influential_table)
## Cooks_Distance Is_Influential
## 628 0.0161243 TRUE
## 852 0.0132702 TRUE
## 716 0.0122607 TRUE
## 140 0.0112384 TRUE
## 782 0.0106080 TRUE
## 486 0.0098505 TRUE
Assumption: To verify that observations are independent, the Durbin-Watson statistic can be used. This statistical test detects the presence of autocorrelation, a relationship between values separated by a given time lag, in the residuals from a regression analysis. Durbin & Watson (1951) created this statistic for residuals from least squares regressions and developed bounds tests to determine if the errors are serially uncorrelated. Later, Bhargava and colleagues (Bhargava & Kilian, 1983) developed test statistics for the hypothesis that the errors in a regression model follow a unit-root process against the alternative of a stationary first-order autoregression. Positive serial correlation means that a positive error for one observation increases the likelihood of a positive error for another, while negative serial correlation implies that a positive error for one observation increases the chance of a negative error for another observation, and vice versa.
# null hypothesis: the errors are serially uncorrelated against the alternative
# that they follow a first order autoregressive process.
car::dwt(fit, max.lag = 1, simulate = TRUE, reps = 1000, method = "normal", alternative = "two.sided")
## lag Autocorrelation D-W Statistic p-value
## 1 0.032582 1.9321 0.282
## Alternative hypothesis: rho != 0
Rule of Thumb: values of the Durbin-Watson statistic near 2 indicate no autocorrelation; values well below 1 or above 3 are cause for concern.
Assumption met: with DW = 1.93 and p = .28, we fail to reject the null hypothesis of no autocorrelation. The residuals show no significant serial correlation, so the independence assumption appears satisfied.
Assumption 5: For your data to meet the homoscedasticity assumption, the variances along the line of best fit should remain relatively constant as you move along the line. To test for non-constant error variance, you can use the Breusch-Pagan test, which calculates a score test to determine if the error variance remains constant or if it varies with either the level of the response (fitted values) or a linear combination of predictors.
car::ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.362, Df = 1, p = 0.547
Chi-square statistic: 0.362
p-value: 0.547
Implications: No evidence of heteroscedasticity; the null hypothesis of constant variance is retained at the 5% level.
car::spreadLevelPlot() creates plots for examining the possible dependence of spread on level (or an extension of these plots to the studentized residuals from linear models) and suggests a spread-stabilizing power transformation, calculated as 1 minus the slope of the line fitted to the plot.
car::spreadLevelPlot(fit)
##
## Suggested power transformation: 2.2315
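As a minimal sketch (not a recommendation), the suggested power could be applied to the response and the variance test re-run:
# Refit with the suggested power transformation of the response
fit_pt <- lm(I(gpa^2.2315) ~ anxiety, data = complete_data)
car::ncvTest(fit_pt) # re-check non-constant variance after transforming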
library(MASS, warn.conflicts = FALSE, quietly = TRUE)
# studentized residuals
sresid <- studres(fit)
# studentized residuals histogram
hist(sresid, freq = FALSE, main = "Distribution of Studentized Residuals", xlim = c(-3,
3))
# normal curve
xfit <- seq(-3, 3, length = 100)
yfit <- dnorm(xfit)
lines(xfit, yfit)
# These functions construct component+residual plots (also called
# partial-residual plots) for linear and generalized linear models.
car::crPlots(fit)
summary(fit)
##
## Call:
## lm(formula = gpa ~ anxiety, data = complete_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.355 -0.415 0.012 0.413 1.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.44699 0.09366 26.13 <2e-16 ***
## anxiety 0.00110 0.00144 0.76 0.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.589 on 998 degrees of freedom
## Multiple R-squared: 0.000581, Adjusted R-squared: -0.000421
## F-statistic: 0.58 on 1 and 998 DF, p-value: 0.446
Interpretation: after winsorizing and the full set of diagnostics, the conclusion is unchanged. Anxiety does not significantly predict GPA (b = 0.0011, t(998) = 0.76, p = .45), and the model explains essentially none of the variance (R² = .001).
set.seed(123) # For reproducibility
# Generate pre-test scores (assuming normal distribution)
pre_test <- rnorm(1000, mean = 70, sd = 10)
# Generate post-test scores that show improvement
# Here we add a mean improvement of 8 points with some random variation
post_test <- pre_test + rnorm(1000, mean = 8, sd = 5)
# Ensure scores stay within reasonable bounds (0-100)
pre_test <- pmax(0, pmin(100, pre_test))
post_test <- pmax(0, pmin(100, post_test))
# Create a data frame
test_scores <- data.frame(
participant_id = 1:1000,
pre_test = round(pre_test, 1),
post_test = round(post_test, 1)
)
# View first few rows
head(test_scores)
## participant_id pre_test post_test
## 1 1 64.4 67.4
## 2 2 67.7 70.5
## 3 3 85.6 93.5
## 4 4 70.7 78.0
## 5 5 71.3 66.5
## 6 6 87.2 100.0
# Summary statistics
summary(test_scores[, c("pre_test", "post_test")])
## pre_test post_test
## Min. : 41.9 Min. : 39.2
## 1st Qu.: 63.7 1st Qu.: 70.4
## Median : 70.1 Median : 78.3
## Mean : 70.2 Mean : 78.2
## 3rd Qu.: 76.6 3rd Qu.: 85.4
## Max. :100.0 Max. :100.0
# Paired t-test with distribution plot
plot_paired_distributions(pre_test, post_test)
##
## --- Paired t-test Results ---
##
## Paired t-test
##
## data: y_vals and x_vals
## t = 51.5, df = 999, p-value <2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 7.7695 8.3856
## sample estimates:
## mean difference
## 8.0776
##
##
## Interpretation:
## • Significant difference (p = 0.000)
## • Variable 2 is significantly higher than Variable 1 (mean diff = 8.08)
## • 95% CI for difference: [7.77, 8.39]
# Compute means and CIs for plotting
df <- data.frame(
group = rep(c("pre-test", "post-test"), each = 100),
value = c(pre_test[1:100], post_test[1:100]) # Only first 100 points for the plot
)
# Calculate the confidence intervals
library(dplyr)
ci <- df %>%
group_by(group) %>%
summarise(
mean = mean(value),
lower = t.test(value)$conf.int[1],
upper = t.test(value)$conf.int[2]
)
# Check if CIs overlap
ci_overlap <- (ci$lower[1] <= ci$upper[2]) & (ci$lower[2] <= ci$upper[1])
# Define plot title based on CI overlap
plot_title <- ifelse(ci_overlap,
"CIs Overlap, No Significant Difference",
"CIs Do Not Overlap, Difference is Significant (p < 0.05)")
# Plot the means and CIs
library(ggplot2)
ggplot(ci, aes(x = group, y = mean, color = group)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, linewidth = 1.0) +
labs(title = plot_title, # Use dynamically created title
y = "Score") +
theme_minimal()
Plot Interpretation:
p < 0.05: suggests a significant difference between the distributions.
p > 0.05: suggests no significant difference between the distributions.
set.seed(123) # For reproducibility
# Generate pre-test scores (assuming normal distribution)
pre_test <- rnorm(1000, mean = 70, sd = 10)
# Generate post-test scores with overlapping confidence intervals
# Calculate pre-test CI
pre_ci <- t.test(pre_test)$conf.int
desired_post_mean <- mean(pre_test) + rnorm(1, mean = 0, sd = 1) # Small random difference
# Generate post-test scores with controlled variance to ensure CI overlap
post_test <- pre_test + rnorm(1000, mean = desired_post_mean - mean(pre_test), sd = 5)
# Ensure scores stay within reasonable bounds (0-100)
pre_test <- pmax(0, pmin(100, pre_test))
post_test <- pmax(0, pmin(100, post_test))
# Create a data frame
test_scores <- data.frame(
participant_id = 1:1000,
pre_test = round(pre_test, 1),
post_test = round(post_test, 1)
)
# Summary statistics and CI check
cat("Pre-test mean:", mean(test_scores$pre_test), "95% CI:", t.test(test_scores$pre_test)$conf.int, "\n")
## Pre-test mean: 70.159 95% CI: 69.543 70.774
cat("Post-test mean:", mean(test_scores$post_test), "95% CI:", t.test(test_scores$post_test)$conf.int, "\n")
## Post-test mean: 69.372 95% CI: 68.686 70.057
# Paired t-test results
t_test_result <- t.test(test_scores$pre_test, test_scores$post_test, paired = TRUE)
cat("\nPaired t-test p-value:", t_test_result$p.value, "\n")
##
## Paired t-test p-value: 8.6802e-07
# Check if CIs overlap
pre_ci <- t.test(test_scores$pre_test)$conf.int
post_ci <- t.test(test_scores$post_test)$conf.int
overlap <- (pre_ci[1] <= post_ci[2]) & (post_ci[1] <= pre_ci[2])
cat("Confidence intervals overlap:", overlap, "\n")
## Confidence intervals overlap: TRUE
# Dynamically set the plot title based on CI overlap. Caveat: for paired data,
# overlapping group-level CIs can coexist with a highly significant paired test
# (as here), because the paired test evaluates within-subject differences.
plot_title <- ifelse(overlap,
"CIs Overlap, No Significant Difference",
"CIs Do Not Overlap, Difference is Significant (p < 0.05)")
# Plot the distributions
plot_paired_distributions(pre_test, post_test)
##
## --- Paired t-test Results ---
##
## Paired t-test
##
## data: y_vals and x_vals
## t = -4.97, df = 999, p-value = 8.1e-07
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -1.10088 -0.47722
## sample estimates:
## mean difference
## -0.78905
##
##
## Interpretation:
## • Significant difference (p = 0.000)
## • Variable 2 is significantly lower than Variable 1 (mean diff = -0.79)
## • 95% CI for difference: [-1.10, -0.48]
# Build a small data frame for visualization
df <- data.frame(
group = rep(c("pre-test", "post-test"), each = 30),
value = c(pre_test[1:30], post_test[1:30]) # Take a subset for visualization
)
# Compute means and CIs
library(dplyr)
ci <- df %>%
group_by(group) %>%
summarise(mean = mean(value),
lower = t.test(value)$conf.int[1],
upper = t.test(value)$conf.int[2])
# Plot the means and CIs
library(ggplot2)
ggplot(ci, aes(x = group, y = mean, color = group)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, linewidth = 1.0) +
labs(title = plot_title, # Updated title based on CI overlap
y = "Score") +
theme_minimal()
set.seed(123) # For reproducibility
# Generate pre-test scores with high variability (large CI)
n <- 100 # Smaller sample size increases CI width
pre_test <- rnorm(n, mean = 70, sd = 20) # Large SD for wide CI
# Generate post-test scores with similar mean but high variability
post_test <- rnorm(n, mean = 75, sd = 20) # Small mean difference + large SD
# Ensure scores stay within bounds (0-100)
pre_test <- pmax(0, pmin(100, pre_test))
post_test <- pmax(0, pmin(100, post_test))
# Create data frame
test_scores <- data.frame(
participant_id = 1:n,
pre_test = round(pre_test, 1),
post_test = round(post_test, 1)
)
# Check summary statistics
summary(test_scores[, c("pre_test", "post_test")])
## pre_test post_test
## Min. : 23.8 Min. : 33.9
## 1st Qu.: 60.1 1st Qu.: 59.0
## Median : 71.2 Median : 70.5
## Mean : 71.3 Mean : 71.7
## 3rd Qu.: 83.8 3rd Qu.: 84.3
## Max. :100.0 Max. :100.0
# Explicitly verify CI overlap and significance
cat("\n--- Confidence Intervals ---\n")
##
## --- Confidence Intervals ---
pre_ci <- t.test(test_scores$pre_test)$conf.int
post_ci <- t.test(test_scores$post_test)$conf.int
cat("Pre-test 95% CI: [", round(pre_ci[1],1), ",", round(pre_ci[2],1), "]\n")
## Pre-test 95% CI: [ 67.9 , 74.7 ]
cat("Post-test 95% CI: [", round(post_ci[1],1), ",", round(post_ci[2],1), "]\n")
## Post-test 95% CI: [ 68.3 , 75 ]
cat("\n--- Paired t-test Results ---\n")
##
## --- Paired t-test Results ---
t_result <- t.test(test_scores$pre_test, test_scores$post_test, paired = TRUE)
print(t_result)
##
## Paired t-test
##
## data: test_scores$pre_test and test_scores$post_test
## t = -0.15, df = 99, p-value = 0.88
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -5.1674 4.4414
## sample estimates:
## mean difference
## -0.363
# Check if CIs overlap
overlap <- (pre_ci[1] <= post_ci[2]) & (post_ci[1] <= pre_ci[2])
# Update plot title based on CI overlap
plot_title <- ifelse(overlap,
"CIs Overlap, No Significant Difference (p > 0.05)",
"CIs Do Not Overlap, Difference is Significant (p < 0.05)")
# Plot the distributions with the updated title
plot_paired_distributions(pre_test, post_test)
##
## --- Paired t-test Results ---
##
## Paired t-test
##
## data: y_vals and x_vals
## t = 0.148, df = 99, p-value = 0.88
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -4.4454 5.1627
## sample estimates:
## mean difference
## 0.35867
##
##
## Interpretation:
## • No significant difference (p = 0.883)
## • 95% CI for difference: [-4.45, 5.16]
# Plot means and CIs
# Compute means and CIs
df <- data.frame(
group = rep(c("pre-test", "post-test"), each = n), # all n observations per group
value = c(pre_test, post_test) # Combine both groups
)
ci <- df %>%
group_by(group) %>%
summarise(mean = mean(value),
lower = t.test(value)$conf.int[1],
upper = t.test(value)$conf.int[2])
ggplot(ci, aes(x = group, y = mean, color = group)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, linewidth = 1.0) +
labs(title = plot_title, # Updated title based on CI overlap
y = "Score") +
theme_minimal()
#' Calculate Cohen's d Effect Size for Two Independent Groups
#'
#' Computes Cohen's d, a standardized measure of effect size for the difference
#' between two means. The function handles both independent samples and includes
#' checks for valid input.
#'
#' @param x Numeric vector of values for group 1
#' @param y Numeric vector of values for group 2
#' @param na.rm Logical indicating whether to remove NA values (default: TRUE)
#' @param hedges_correction Logical indicating whether to apply Hedges' g correction
#' for small sample sizes (default: FALSE)
#'
#' @return A numeric value representing Cohen's d effect size. The sign indicates
#' the direction of the effect (positive when x > y, negative when x < y).
#'
#' @details
#' The function calculates the pooled standard deviation version of Cohen's d,
#' which is appropriate for independent samples. When \code{hedges_correction = TRUE},
#' it applies Hedges' correction factor to reduce bias in small samples.
#'
#' @examples
#' # Basic usage
#' group1 <- c(1, 2, 3, 4, 5)
#' group2 <- c(6, 7, 8, 9, 10)
#' cohens.d(group1, group2)
#'
#' # With NA values
#' group1 <- c(1, 2, NA, 4, 5)
#' cohens.d(group1, group2, na.rm = TRUE)
#'
#' # With Hedges' correction
#' cohens.d(group1, group2, hedges_correction = TRUE)
#'
#' @references
#' Cohen, J. (1988). Statistical power analysis for the behavioral
#' sciences (2nd ed.). Hillsdale, NJ: Lawrence Erlbaum Associates.
#'
#' Hedges, L. V. (1981). Distribution theory for Glass's estimator
#' of effect size and related estimators. Journal of Educational
#' Statistics, 6(2), 107-128.
#'
#' @export
cohens.d <- function(x, y, na.rm = TRUE, hedges_correction = FALSE) {
# Input validation
if (!is.numeric(x) || !is.numeric(y)) {
stop("Both x and y must be numeric vectors")
}
if (na.rm) {
x <- x[!is.na(x)]
y <- y[!is.na(y)]
} else if (any(is.na(c(x, y)))) {
stop("NA values found and na.rm = FALSE")
}
if (length(x) < 2 || length(y) < 2) {
stop("Each group must have at least 2 observations")
}
# Calculate means and variances
mean_x <- mean(x)
mean_y <- mean(y)
var_x <- var(x)
var_y <- var(y)
n_x <- length(x)
n_y <- length(y)
# Pooled standard deviation
pooled_sd <- sqrt(((n_x - 1) * var_x + (n_y - 1) * var_y)/(n_x + n_y - 2))
# Cohen's d (with direction)
d <- (mean_x - mean_y)/pooled_sd
# Apply Hedges' correction if requested
if (hedges_correction) {
df <- n_x + n_y - 2
correction_factor <- 1 - (3/(4 * df - 1))
d <- d * correction_factor
}
return(d)
}
# save function to file
dump("cohens.d", file = "cohens.d.R")
#' Calculate Alternative Cohen's d Effect Size (N-denominator version)
#'
#' Computes an alternative version of Cohen's d that uses N (rather than N-2)
#' in the denominator of the pooled standard deviation calculation. This version
#' matches some textbook definitions of Cohen's d for independent samples.
#'
#' @param x Numeric vector of values for group 1
#' @param y Numeric vector of values for group 2
#' @param na.rm Logical indicating whether to remove NA values (default: TRUE)
#' @param absolute Logical indicating whether to return absolute value (default: TRUE)
#'
#' @return Numeric value representing the effect size. By default returns absolute value.
#'
#' @details
#' This version differs from cohens.d() in the denominator of the pooled SD calculation:
#'
#' # Standard version
#' - cohens.d(): sqrt(((n1-1)*var1 + (n2-1)*var2)/(n1+n2-2))
#'
#' # Alternative version
#' - cohens_d2(): sqrt(((n1-1)*var1 + (n2-1)*var2)/(n1+n2))
#'
#' The absolute parameter controls whether to return signed (FALSE) or absolute (TRUE) values.
#'
#' @examples
#' group1 <- c(1, 2, 3, 4, 5)
#' group2 <- c(6, 7, 8, 9, 10)
#' cohens_d2(group1, group2) # Absolute effect size
#' cohens_d2(group1, group2, absolute = FALSE) # Signed effect size
#'
#' @references
#' Cohen, J. (1988). Statistical power analysis for the behavioral sciences.
#' Hillsdale, NJ: Lawrence Erlbaum Associates.
#'
#' @seealso \code{\link{cohens.d}} for the more common N-2 denominator version
#' @export
#'
cohens_d2 <- function(x, y, na.rm = TRUE, absolute = TRUE) {
# Input validation
if (!is.numeric(x) || !is.numeric(y)) {
stop("Both x and y must be numeric vectors")
}
if (na.rm) {
x <- x[!is.na(x)]
y <- y[!is.na(y)]
} else if (any(is.na(c(x, y)))) {
stop("NA values found and na.rm = FALSE")
}
if (length(x) < 2 || length(y) < 2) {
stop("Each group must have at least 2 observations")
}
# Calculate components
n_x <- length(x)
n_y <- length(y)
var_x <- var(x)
var_y <- var(y)
# Alternative pooled SD calculation (using N rather than N-2 in
# denominator)
pooled_sd <- sqrt(((n_x - 1) * var_x + (n_y - 1) * var_y)/(n_x + n_y))
# Calculate effect size (optionally keep direction)
d <- (mean(x) - mean(y))/pooled_sd
if (absolute) {
return(abs(d))
} else {
return(d)
}
}
# save function to file
dump("cohens_d2", file = "cohens_d2.R")
#' Calculate Standard Error of the Mean Difference Between Two Independent Groups
#'
#' Computes the standard error for the difference between means of two independent samples,
#' using pooled variance. This is appropriate when comparing means between two groups
#' in t-tests and confidence interval calculations.
#'
#' @param x Numeric vector of values for group 1
#' @param y Numeric vector of values for group 2
#' @param na.rm Logical indicating whether to remove NA values (default: TRUE)
#' @param pooled Logical indicating whether to use pooled variance (default: TRUE).
#' If FALSE, uses separate variances (Welch-Satterthwaite approximation).
#'
#' @return Numeric value representing the standard error of the mean difference
#'
#' @details
#' When pooled=TRUE (default), the calculation is:
#' \deqn{SE = \sqrt{\frac{s_p^2}{n_x} + \frac{s_p^2}{n_y}}}
#' Where \eqn{s_p^2} is the pooled variance.
#'
#' When pooled=FALSE, uses Welch's adjustment:
#' \deqn{SE = \sqrt{\frac{s_x^2}{n_x} + \frac{s_y^2}{n_y}}}
#'
#' @examples
#' group1 <- c(1, 2, 3, 4, 5)
#' group2 <- c(6, 7, 8, 9, 10)
#' se.m(group1, group2) # Pooled variance
#' se.m(group1, group2, pooled = FALSE) # Welch's version
#'
#' @references
#' Satterthwaite, F. E. (1946). An approximate distribution of estimates
#' of variance components. Biometrics Bulletin, 2(6), 110-114.
#'
#' Welch, B. L. (1947). The generalization of 'Student's' problem when
#' several different population variances are involved. Biometrika,
#' 34(1-2), 28-35.
#' @export
se.m <- function(x, y, na.rm = TRUE, pooled = TRUE) {
# Input validation
if (!is.numeric(x) || !is.numeric(y)) {
stop("Both x and y must be numeric vectors")
}
if (na.rm) {
x <- x[!is.na(x)]
y <- y[!is.na(y)]
} else if (any(is.na(c(x, y)))) {
stop("NA values found and na.rm = FALSE")
}
if (length(x) < 2 || length(y) < 2) {
stop("Each group must have at least 2 observations")
}
# Calculate components
n_x <- length(x)
n_y <- length(y)
var_x <- var(x)
var_y <- var(y)
if (pooled) {
# Pooled variance version (equal variances assumed)
s_pooled <- ((n_x - 1) * var_x + (n_y - 1) * var_y)/(n_x + n_y - 2)
se <- sqrt((s_pooled/n_x) + (s_pooled/n_y))
} else {
# Welch-Satterthwaite version (unequal variances)
se <- sqrt((var_x/n_x) + (var_y/n_y))
}
return(se)
}
# save function to file
dump("se.m", file = "se.m.R")
#' Calculate Confidence Interval for Mean Difference Between Two Independent Groups
#'
#' Computes the mean difference and its confidence interval between two independent samples,
#' using either pooled or unpooled standard error estimates.
#'
#' @param x Numeric vector of values for group 1
#' @param y Numeric vector of values for group 2
#' @param na.rm Logical indicating whether to remove NA values (default: TRUE)
#' @param pooled Logical indicating whether to use pooled variance (default: TRUE).
#' If FALSE, uses separate variances (Welch-Satterthwaite approximation).
#' @param conf.level Confidence level for the interval (default: 0.95)
#' @param absolute Logical indicating whether to return absolute mean difference (default: TRUE)
#'
#' @return A list containing:
#' \itemize{
#' \item mean.difference - The difference between group means
#' \item se - Standard error of the mean difference
#' \item lower.ci - Lower bound of confidence interval
#' \item upper.ci - Upper bound of confidence interval
#' \item conf.level - The confidence level used
#' }
#'
#' @details
#' The confidence interval is calculated as:
#' \deqn{CI = \bar{x} - \bar{y} \pm t_{\alpha/2, df} \times SE}
#' Where SE is calculated either with pooled variance or Welch's method,
#' and degrees of freedom are adjusted for the Welch test when pooled=FALSE.
#'
#' @examples
#' group1 <- c(1, 2, 3, 4, 5)
#' group2 <- c(6, 7, 8, 9, 10)
#' ci(group1, group2) # Pooled variance 95% CI
#' ci(group1, group2, pooled = FALSE, conf.level = 0.99) # Welch's 99% CI
#'
#' @seealso \code{\link{se.m}} for the standard error calculation
#' @export
ci <- function(x, y, na.rm = TRUE, pooled = TRUE, conf.level = 0.95, absolute = TRUE) {
# Input validation
if (!is.numeric(x) || !is.numeric(y)) {
stop("Both x and y must be numeric vectors")
}
if (na.rm) {
x <- x[!is.na(x)]
y <- y[!is.na(y)]
} else if (any(is.na(c(x, y)))) {
stop("NA values found and na.rm = FALSE")
}
if (length(x) < 2 || length(y) < 2) {
stop("Each group must have at least 2 observations")
}
if (conf.level <= 0 || conf.level >= 1) {
stop("conf.level must be between 0 and 1")
}
# Calculate components
mean_diff <- mean(x) - mean(y)
if (absolute)
mean_diff <- abs(mean_diff)
se <- se.m(x, y, na.rm = na.rm, pooled = pooled)
# Calculate critical value
if (pooled) {
df <- length(x) + length(y) - 2
} else {
# Welch-Satterthwaite degrees of freedom
vx <- var(x)
vy <- var(y)
nx <- length(x)
ny <- length(y)
df <- (vx/nx + vy/ny)^2/((vx/nx)^2/(nx - 1) + (vy/ny)^2/(ny - 1))
}
crit_val <- qt((1 + conf.level)/2, df = df)
# Calculate confidence interval
lower <- mean_diff - crit_val * se
upper <- mean_diff + crit_val * se
return(list(mean.difference = mean_diff, se = se, lower.ci = lower, upper.ci = upper,
conf.level = conf.level, method = ifelse(pooled, "Pooled variance", "Welch's unequal variances")))
}
# save function to file
dump("ci", file = "ci.R")