Show the code
library(tidyverse)
library(lme4)
library(lmerTest)
library(emmeans)
library(lattice)
library(plotly)library(tidyverse)
library(lme4)
library(lmerTest)
library(emmeans)
library(lattice)
library(plotly)After the data setup, to prevent the files from getting too long. We are bringing in the transformed data cohort4_long_final to begin working on looking at how length of participation influence normalized gains.
Because some participants are in the program for one month and others for three, we wanted to capture their growth from the moment they entered (True Baseline) to the moment they left (Final Exit).
\[ g = \frac{\text{Last Post} - \text{First Pre}}{\text{Max Possible Score} - \text{First Pre}}\]What it represents: It shows the percentage of the “available room for growth” the participant actually covered. A student who starts at a 5 and moves to a 6 has a “perfect” gain (1.0), just like a student moving from a 3 to a 6.
cohort4_final_gains <- cohort4_long_final %>%
group_by(StudyNumber) %>%
summarize(
# 1. Participation Length
Months_Participated = n_distinct(Month_Number),
# 2. Self-Efficacy: First Pre and Last Post
First_Pre_SE = Self_Efficacy[Measure_Type == "Pre" & Month_Number == min(Month_Number)][1],
Last_Post_SE = Self_Efficacy[Measure_Type == "Post" & Month_Number == max(Month_Number)][1],
# 3. Resilience: First Pre and Last Post
First_Pre_Res = Resilience[Measure_Type == "Pre" & Month_Number == min(Month_Number)][1],
Last_Post_Res = Resilience[Measure_Type == "Post" & Month_Number == max(Month_Number)][1],
# Keep metadata
Mentoring_Category = first(Mentoring_Category),
Research_Category = first(Research_Category),
.groups = "drop"
) %>%
# 4. Calculate Normalized Gains (Hake's g)
mutate(
# Gain for Self-Efficacy (1-6 scale)
Gain_SelfEfficacy = (Last_Post_SE - First_Pre_SE) / (6 - First_Pre_SE),
Gain_SelfEfficacy = ifelse(First_Pre_SE == 6, 0, Gain_SelfEfficacy),
# Gain for Resilience (1-5 scale)
Gain_Resilience = (Last_Post_Res - First_Pre_Res) / (5 - First_Pre_Res),
Gain_Resilience = ifelse(First_Pre_Res == 5, 0, Gain_Resilience)
)
glimpse(cohort4_final_gains)Rows: 40
Columns: 10
$ StudyNumber <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
$ Months_Participated <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2,…
$ First_Pre_SE <dbl> 4.250, 4.500, 5.000, 5.250, 5.375, 5.250, 5.500, 4…
$ Last_Post_SE <dbl> 4.500, 4.750, 5.125, 6.000, 5.125, 5.000, 5.000, 5…
$ First_Pre_Res <dbl> 4.0, 3.5, 4.0, 4.5, 5.0, 4.0, 5.0, 3.5, 4.0, 5.0, …
$ Last_Post_Res <dbl> 3.5, 4.0, 4.0, 5.0, 5.0, 3.5, 4.0, 3.5, 4.0, 5.0, …
$ Mentoring_Category <chr> "mentee", "mentee", "mentee", "mentee", "mentee", …
$ Research_Category <chr> "Undergraduates", "Undergraduates", "Undergraduate…
$ Gain_SelfEfficacy <dbl> 0.1428571, 0.1666667, 0.1250000, 1.0000000, -0.400…
$ Gain_Resilience <dbl> -0.5000000, 0.3333333, 0.0000000, 1.0000000, 0.000…
This cohort4_final_gains data frame allows us to ask, “Does a participant who participated for three months have a greater gain than someone who participated for one month?” To ensure that we do the right kind of statistical test, we first want to see if our data are normally distributed
# 1. Visual Check: Histogram of Self Efficacy
ggplot(cohort4_final_gains, aes(x = Gain_SelfEfficacy)) +
geom_histogram(fill = "steelblue", color = "white", bins = 15) +
theme_minimal() +
labs(title = "Distribution of Normalized Gains for Self-efficacy")# 1. Visual Check: Histogram of Resilience
ggplot(cohort4_final_gains, aes(x = Gain_Resilience)) +
geom_histogram(fill = "steelblue", color = "white", bins = 15) +
theme_minimal() +
labs(title = "Distribution of Normalized Gains for Resilience")# 2. Statistical Check: Shapiro-Wilk Test
# p > 0.05 means your data is likely Normal.
shapiro.test(cohort4_final_gains$Gain_SelfEfficacy)
Shapiro-Wilk normality test
data: cohort4_final_gains$Gain_SelfEfficacy
W = 0.95083, p-value = 0.08096
#shapiro wilk for resilience
shapiro.test(cohort4_final_gains$Gain_Resilience)
Shapiro-Wilk normality test
data: cohort4_final_gains$Gain_Resilience
W = 0.80281, p-value = 7.758e-06
Since the Shapiro test p-value is greater than .05 for self-efficacy, we can use parametric statistics. But it is less that .05 for Resilience so a Kruskal-Wallis test is more appropriate.
# Run ANOVA
anova_final_gains <- aov(Gain_SelfEfficacy ~ as.factor(Months_Participated), data = cohort4_final_gains)
# View results
summary(anova_final_gains) Df Sum Sq Mean Sq F value Pr(>F)
as.factor(Months_Participated) 2 0.123 0.06173 0.213 0.809
Residuals 37 10.701 0.28920
# 1. Ensure Months_Participated is a factor so it creates separate boxes
cohort4_final_gains <- cohort4_final_gains %>%
mutate(Months_Factor = as.factor(Months_Participated))
# 2. Create the Plot
ggplot(cohort4_final_gains, aes(x = Months_Factor, y = Gain_SelfEfficacy, fill = Months_Factor)) +
# Add the boxplot for the summary statistics
geom_boxplot(alpha = 0.5, outlier.shape = NA) +
# Add each StudyNumber as a jittered dot
# width = 0.2 prevents the dots from spreading too far horizontally
geom_jitter(width = 0.2, size = 2, alpha = 0.7, color = "#2c3e50") +
# Styling
theme_minimal(base_size = 14) +
scale_fill_brewer(palette = "Purples") + # Subtle blue gradient for time
labs(
title = "Self-Efficacy Growth by Program Duration",
subtitle = "Normalized Gain (First Pre to Last Post)",
x = "Number of Months Participated",
y = "Normalized Gain (Hake's g)",
caption = "Each dot represents one participant (StudyNumber)"
) +
theme(legend.position = "none") # Legend is redundant since x-axis is labeledA one-way ANOVA was conducted to compare the effect of participation length on normalized self-efficacy gains. The results indicated that there were no significant differences between those who participated for 1, 2, or 3 months, \(F(2, 37) = 0.21, p = .809\).
# 2. Create the Plot
ggplot(cohort4_final_gains, aes(x = Months_Factor, y = Gain_Resilience, fill = Months_Factor)) +
# Add the boxplot for the summary statistics
geom_boxplot(alpha = 0.5, outlier.shape = NA) +
# Add each StudyNumber as a jittered dot
# width = 0.2 prevents the dots from spreading too far horizontally
geom_jitter(width = 0.2, size = 2, alpha = 0.7, color = "#2c3e50") +
# Styling
theme_minimal(base_size = 14) +
scale_fill_brewer(palette = "Blues") + # Subtle blue gradient for time
labs(
title = "Resilience Growth by Program Duration",
subtitle = "Normalized Gain (First Pre to Last Post)",
x = "Number of Months Participated",
y = "Normalized Gain (Hake's g)",
caption = "Each dot represents one participant (StudyNumber)"
) +
theme(legend.position = "none") # Legend is redundant since x-axis is labeledggsave(
filename = "Gains-Resilience.svg",
device = "svg",
width = 8, # Width in inches
height = 6, # Height in inches
units = "in"
)resilience_kw <- kruskal.test(Gain_Resilience ~ as.factor(Months_Participated),
data = cohort4_final_gains)
# View the results
print(resilience_kw)
Kruskal-Wallis rank sum test
data: Gain_Resilience by as.factor(Months_Participated)
Kruskal-Wallis chi-squared = 0.52091, df = 2, p-value = 0.7707
A Kruskal-Wallis test showed that normalized gains in resilience did not significantly differ by length of participation, \(\chi^2(2) = .521, p = .775\).
long_gains <- cohort4_final_gains %>%
select(StudyNumber, Months_Participated, Gain_SelfEfficacy, Gain_Resilience) %>%
pivot_longer(cols = starts_with("Gain"), names_to = "Measure", values_to = "Value")
# Plot
ggplot(long_gains, aes(x = as.factor(Months_Participated), y = Value, fill = Measure)) +
geom_boxplot(alpha = 0.6, outlier.shape = NA) +
geom_jitter(width = 0.15, alpha = 0.4) +
facet_wrap(~Measure, scales = "free_y",
labeller = as_labeller(c(Gain_SelfEfficacy = "Self-Efficacy (Hake's g)",
Gain_Resilience = "Resilience (Hake's g)"))) +
theme_minimal() +
scale_fill_manual(values = c("#3498db", "#2ecc71")) +
labs(
title = "Comparison of Normalized Gains by Program Duration",
x = "Months of Participation",
y = "Normalized Gain Score"
) +
theme(legend.position = "none")# Filter and test for each month group
t_test_results <- cohort4_final_gains %>%
group_by(Months_Participated) %>%
summarise(
Mean_Gain = mean(Gain_SelfEfficacy, na.rm = TRUE),
P_Value = t.test(Gain_SelfEfficacy, mu = 0)$p.value
)
print(t_test_results)# A tibble: 3 × 3
Months_Participated Mean_Gain P_Value
<int> <dbl> <dbl>
1 1 -0.00825 0.953
2 2 -0.0476 0.732
3 3 0.0929 0.612
# Non-parametric test for resilience
wilcox_results <- cohort4_final_gains %>%
group_by(Months_Participated) %>%
summarise(
Median_Gain = median(Gain_Resilience, na.rm = TRUE),
P_Value = wilcox.test(Gain_Resilience, mu = 0)$p.value
)
print(wilcox_results)# A tibble: 3 × 3
Months_Participated Median_Gain P_Value
<int> <dbl> <dbl>
1 1 0 0.850
2 2 0 0.387
3 3 0 0.586
Since the baseline remained unchanged regardless of how long participants engaged in the program, we thought it would be interesting to examine some individual trajectories. The code below creates an interactive chart showing the actual score for each and every time a participant answered the survey questions.
# Create a chronological ordering factor
indiv_traject_data <- cohort4_long_final %>%
mutate(
# Create a label like "M1-Pre", "M1-Post", etc.
Time_Label = paste0("M", Month_Number, "-", Measure_Type),
# Create a numeric sequence for the x-axis
# (Month 1 Pre = 1, Month 1 Post = 2, Month 2 Pre = 3, etc.)
Timeline = (Month_Number - 1) * 2 + ifelse(Measure_Type == "Pre", 1, 2)
) %>%
mutate(Time_Label = reorder(Time_Label, Timeline)) # Force R to keep them in order
# Create a list of IDs who stayed for 3 months
completers_3mo <- cohort4_long_final %>%
group_by(StudyNumber) %>%
filter(max(Month_Number) == 3) %>%
pull(StudyNumber) %>%
unique()
indiv_traject_data <- indiv_traject_data %>%
filter(StudyNumber %in% completers_3mo)
# 1. Create a "Shared Data" object
shared_data <- highlight_key(indiv_traject_data, ~StudyNumber)
# 2. Build the plot using the shared data
p_interactive <- ggplot(shared_data, aes(x = Timeline, y = Self_Efficacy, group = StudyNumber)) +
geom_line(color = "steelblue", alpha = 0.3) +
geom_point(alpha = 0.5) +
scale_x_continuous(breaks = 1:6,
labels = c("M1-Pre", "M1-Post", "M2-Pre", "M2-Post", "M3-Pre", "M3-Post")) +
theme_minimal()
# 3. Use ggplotly with 'highlight' options
ggplotly(p_interactive) %>%
highlight(
on = "plotly_hover", # Line lights up when you hover
color = "#45185e", # The color it turns when highlighted
selectize = TRUE, # Adds a dropdown menu to select a specific StudyNumber
persistent = FALSE
)