#Excel data cleaning - removed all responses less than 30% complete (4 post-course surveys and 11 pre-course surveys). If someone submitted more than one response at a timepoint, the most complete was kept.

dat=read.csv("POBRebootCombined.csv")
names=read.csv("POB1_StudentIDs.csv")
demo=read.csv("StudentDemo.csv")

#install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
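# A sketch of how the Excel cleaning rule above could be reproduced in R, assuming the
# raw export has a Progress column (percent complete) alongside StudentID and Timepoint.
# Illustrative only - the filtering for this dataset was actually done in Excel.
keep_most_complete <- function(df, min_progress = 30) {
  df %>%
    filter(Progress >= min_progress) %>%                  # drop responses < 30% complete
    group_by(StudentID, Timepoint) %>%
    slice_max(Progress, n = 1, with_ties = FALSE) %>%     # keep the most complete response per timepoint
    ungroup()
}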
#identify names from student IDs and create "first name" "last name" variables.
dat_ID <- merge(dat, names, by = "StudentID", all.x = TRUE)

dat_full=dat_ID #this is the dataset used in analysis

demo = demo %>%
  dplyr::rename(StudentID = Student.ID)

dat_full=merge(dat_full, demo, by = "StudentID", all.x = TRUE)

dat_full <- dat_full %>%
   dplyr::rename(Cohort.TR = Cohort) %>% #change name as this column has both the cohort and indicator of transfer status
  mutate(
    Cohort = substr(Cohort.TR, 1, 4), #only keep semester and year as new column called cohort
    Transfer = ifelse(grepl("TR", Cohort.TR), 1, 0) #create a new variable flag for transfer status
  )

#use student-reported gender and race, as there are more options
dat_full <- dat_full %>%
  mutate(Gender = case_when(
    Gender == "Female" ~ "Female",
    Gender == "Male"   ~ "Male",
    TRUE               ~ "Other"
  ))

dat_full <- dat_full %>%
  mutate(Race = case_when(
    Race == "White" ~ "White",
    TRUE               ~ "PEER"
  ))

#replace "TR student" designation with "NA" in the standardized tests and HS GPA columns
dat_full <- dat_full %>%
  mutate(
    across(c(HS.GPA_IR, SATmath, SATeng, ACT),
           ~na_if(., "TR student"))
  )

#some students have ACT, some have SAT. Convert SAT scores to ACT for most complete data. 

convert_sat_to_act <- function(sat_total) {
  case_when(
    sat_total >= 1570 ~ 36,
    sat_total >= 1530 ~ 35,
    sat_total >= 1490 ~ 34,
    sat_total >= 1450 ~ 33,
    sat_total >= 1420 ~ 32,
    sat_total >= 1390 ~ 31,
    sat_total >= 1360 ~ 30,
    sat_total >= 1330 ~ 29,
    sat_total >= 1290 ~ 28,
    sat_total >= 1250 ~ 27,
    sat_total >= 1210 ~ 26,
    sat_total >= 1170 ~ 25,
    sat_total >= 1130 ~ 24,
    sat_total >= 1090 ~ 23,
    sat_total >= 1050 ~ 22,
    sat_total >= 1010 ~ 21,
    sat_total >= 970  ~ 20,
    sat_total >= 930  ~ 19,
    sat_total >= 890  ~ 18,
    sat_total >= 850  ~ 17,
    sat_total >= 810  ~ 16,
    sat_total >= 770  ~ 15,
    sat_total >= 730  ~ 14,
    sat_total >= 690  ~ 13,
    sat_total >= 650  ~ 12,
    sat_total >= 620  ~ 11,
    sat_total >= 590  ~ 10,
    sat_total >= 560  ~ 9,
    TRUE ~ NA_real_
  )
}

# Apply conversion
dat_full <- dat_full %>%
  mutate(
    SATmath = as.numeric(SATmath),
    SATeng = as.numeric(SATeng),  # Assuming this is your SAT EBRW column
    SAT_total = SATmath + SATeng,
    ACT_equiv = convert_sat_to_act(SAT_total)
  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `SATmath = as.numeric(SATmath)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
#install.packages("readr")
library(readr)

#combine the converted ACT scores and the IR-reported ACT scores into a single usable column
dat_full <- dat_full %>%
  mutate(
    ACT_numeric = suppressWarnings(as.numeric(ACT)),
    ACT.complete = coalesce(ACT_equiv, ACT_numeric)
  ) %>%
  select(-ACT_numeric)

dat_full <- dat_full %>%
  mutate(
    Needs.Program = if_else(
      DBLight == 1 | Landers == 1 | SusPrimus == 1,
      1, 0
    )
  )
###########################
#identify all StudentIDs that have both "Pre" and "Post" observations in the Timepoint column and keep only those observations
repeated <- dat_ID %>%
  group_by(StudentID) %>%
  filter(all(c("Pre", "Post") %in% Timepoint)) %>%
  ungroup()

#keep only one Pre and one Post observation per StudentID, selecting the most complete one based on the highest value in the Progress column


dat_paired<- repeated %>%
  # Keep only students who have both "Pre" and "Post"
  group_by(StudentID) %>%
  filter(all(c("Pre", "Post") %in% Timepoint)) %>%
  # Select highest Progress per StudentID and Timepoint
  group_by(StudentID, Timepoint) %>%
  slice_max(order_by = Progress, n = 1, with_ties = FALSE) %>%
  ungroup()
#Demographics Report 

demo_report <- dat_full %>%
  select(
    StudentID, Race, Gender, STEM.major, Section, ClassStanding,
    Ethnicity_IR, FirstGen_IR, Age, State_IR, Major_IR, Minor_IR,
    HS.GPA_IR, SMCM.GPA, LABGrade_IR, Cohort, Transfer,
    ACT.complete, Needs.Program
  )

#install.packages("forcats")
library(forcats)

demo_report <- demo_report %>%
  mutate(
    StudentID     = as.character(StudentID), # IDs often better as character
    Race          = as.factor(Race),
    Gender        = as.factor(Gender),
    STEM.major    = as.factor(STEM.major),
    Section       = as.factor(Section),
    ClassStanding = as.factor(ClassStanding),
    Ethnicity_IR  = as.factor(Ethnicity_IR),
    FirstGen_IR   = as.factor(FirstGen_IR),
    Age           = as.numeric(Age),
    State_IR      = as.factor(State_IR),
    Major_IR      = as.factor(trimws(Major_IR)),
    Minor_IR      = as.factor(trimws(Minor_IR)),
    HS.GPA_IR     = as.numeric(HS.GPA_IR),
    SMCM.GPA      = as.numeric(SMCM.GPA),
    LABGrade_IR   = as.factor(trimws(LABGrade_IR)),
    Cohort        = as.factor(Cohort),
    Transfer      = as.factor(Transfer),
    ACT.complete  = as.numeric(ACT.complete),
    Needs.Program = as.factor(Needs.Program)
  )


# Count non-NA values per row, then pick the row with most non-NAs for each StudentID
demo_report <- demo_report %>%
  mutate(non_missing = rowSums(!is.na(.))) %>%
  group_by(StudentID) %>%
  slice_max(order_by = non_missing, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(-non_missing)  # remove helper column

# Get basic summary
#install.packages("vtable")
library(vtable)
## Loading required package: kableExtra
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
#Print table statistics in R Markdown
st(demo_report, out='browser')

##Grit https://www.researchgate.net/publication/6290064_Grit_Perseverance_and_Passion_for_Long-Term_Goals

#install.packages("psych")
library(psych)

grit_scale_map <- c(
  "Not at all like me"    = 1,
  "Not much like me"      = 2,
  "Somewhat like me"      = 3,
  "Mostly like me"        = 4,
  "Very much like me"     = 5
)

grit_items <- paste0("Grit", 1:12)

dat_full[grit_items] <- lapply(dat_full[grit_items], function(col) {
  as.numeric(grit_scale_map[as.character(col)])
})

grit_data <- dat_full[, paste0("Grit", 1:12)]
alpha_result <- psych::alpha(grit_data, check.keys = TRUE)
## Warning in psych::alpha(grit_data, check.keys = TRUE): Some items were negatively correlated with the first principal component and were automatically reversed.
##  This is indicated by a negative sign for the variable name.
print(alpha_result)
## 
## Reliability analysis   
## Call: psych::alpha(x = grit_data, check.keys = TRUE)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
##       0.77      0.78    0.82      0.22 3.4 0.023  3.2 0.54     0.22
## 
##     95% confidence boundaries 
##          lower alpha upper
## Feldt     0.72  0.77  0.81
## Duhachek  0.72  0.77  0.81
## 
##  Reliability if an item is dropped:
##         raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Grit1-       0.76      0.77    0.81      0.23 3.3    0.024 0.034  0.22
## Grit2        0.74      0.75    0.80      0.22 3.0    0.026 0.037  0.21
## Grit3        0.76      0.77    0.81      0.23 3.3    0.024 0.032  0.26
## Grit4-       0.76      0.77    0.82      0.23 3.3    0.024 0.038  0.22
## Grit5        0.74      0.75    0.80      0.22 3.1    0.026 0.035  0.22
## Grit6-       0.75      0.75    0.79      0.21 3.0    0.025 0.035  0.21
## Grit7        0.74      0.75    0.80      0.22 3.0    0.026 0.033  0.22
## Grit8        0.74      0.75    0.80      0.21 3.0    0.027 0.037  0.19
## Grit9-       0.75      0.76    0.81      0.22 3.1    0.025 0.037  0.20
## Grit10-      0.77      0.77    0.82      0.24 3.4    0.023 0.034  0.22
## Grit11       0.77      0.78    0.82      0.24 3.5    0.023 0.030  0.26
## Grit12-      0.75      0.75    0.80      0.21 3.0    0.025 0.033  0.20
## 
##  Item statistics 
##           n raw.r std.r r.cor r.drop mean   sd
## Grit1-  219  0.43  0.46  0.39   0.30  2.8 0.81
## Grit2   211  0.63  0.61  0.57   0.50  3.7 1.03
## Grit3   197  0.51  0.47  0.42   0.36  3.2 0.96
## Grit4-  201  0.47  0.48  0.39   0.32  3.7 0.99
## Grit5   211  0.60  0.57  0.53   0.47  3.2 1.04
## Grit6-  217  0.60  0.63  0.61   0.50  2.6 0.79
## Grit7   206  0.61  0.59  0.56   0.50  3.1 0.97
## Grit8   209  0.66  0.63  0.60   0.53  3.5 1.12
## Grit9-  215  0.58  0.57  0.51   0.43  3.2 0.93
## Grit10- 204  0.43  0.42  0.34   0.25  3.0 1.03
## Grit11  213  0.42  0.40  0.33   0.27  3.4 1.02
## Grit12- 218  0.59  0.61  0.59   0.47  2.8 0.85
## 
## Non missing response frequency for each item
##           2    3    4    5 miss
## Grit1  0.03 0.17 0.41 0.39 0.02
## Grit2  0.14 0.26 0.31 0.29 0.05
## Grit3  0.26 0.40 0.22 0.12 0.12
## Grit4  0.24 0.34 0.28 0.14 0.10
## Grit5  0.30 0.34 0.21 0.15 0.05
## Grit6  0.02 0.13 0.29 0.55 0.03
## Grit7  0.33 0.35 0.21 0.10 0.08
## Grit8  0.22 0.30 0.20 0.28 0.06
## Grit9  0.08 0.29 0.34 0.29 0.04
## Grit10 0.11 0.20 0.29 0.40 0.09
## Grit11 0.23 0.29 0.31 0.17 0.04
## Grit12 0.05 0.16 0.39 0.41 0.02

The raw Cronbach’s alpha of 0.77 indicates acceptable-to-good internal consistency, suggesting the 12 items measure the same underlying construct (grit).
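The headline figures can be pulled directly from the psych::alpha object rather than read off the printed report; the $total component (part of psych::alpha's documented return value) holds them:

alpha_result$total$raw_alpha   # raw Cronbach's alpha (0.77 here)
alpha_result$total$std.alpha   # standardized alpha
alpha_result$total$average_r   # average inter-item correlation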

reverse_items <- c("Grit1", "Grit4", "Grit6", "Grit9", "Grit10", "Grit12")
dat_full[reverse_items] <- lapply(dat_full[reverse_items], function(col) {
  6 - col
})

grit_items <- paste0("Grit", 1:12)
dat_full$Grit_Total <- rowMeans(dat_full[grit_items], na.rm = TRUE)

dat_full$Grit_Perseverance <- rowMeans(dat_full[, c("Grit1", "Grit3", "Grit5", "Grit6", "Grit8", "Grit10")], na.rm = TRUE)
dat_full$Grit_Consistency <- rowMeans(dat_full[, c("Grit2", "Grit4", "Grit7", "Grit9", "Grit11", "Grit12")], na.rm = TRUE)

library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
anova(lme(Grit_Total ~ Timepoint, random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   134 3306.925  <.0001
## Timepoint       1    75    1.503   0.224
anova(lme(Grit_Perseverance ~ Timepoint, random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##             numDF denDF   F-value p-value
## (Intercept)     1   133 2797.3183  <.0001
## Timepoint       1    75    0.1704  0.6809
anova(lme(Grit_Consistency ~ Timepoint, random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   134 3251.658  <.0001
## Timepoint       1    75    3.973  0.0499
summary(lme(Grit_Consistency ~ Timepoint, random = ~1 | StudentID, data = dat_full, na.action = na.omit))
## Linear mixed-effects model fit by REML
##   Data: dat_full 
##        AIC     BIC    logLik
##   343.7587 357.128 -167.8794
## 
## Random effects:
##  Formula: ~1 | StudentID
##         (Intercept)  Residual
## StdDev:   0.5098195 0.3219684
## 
## Fixed effects:  Grit_Consistency ~ Timepoint 
##                  Value  Std.Error  DF  t-value p-value
## (Intercept)   2.875513 0.05347397 134 53.77407  0.0000
## TimepointPre -0.107725 0.05404818  75 -1.99313  0.0499
##  Correlation: 
##              (Intr)
## TimepointPre -0.367
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -2.48280008 -0.49294970 -0.06418573  0.54877568  1.73595060 
## 
## Number of Observations: 211
## Number of Groups: 135
anova(lme(Grit_Total ~ Timepoint + Race + STEM.major + Section + Gender + FirstGen_IR + SMCM.GPA + HS.GPA_IR,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1    60 5014.133  <.0001
## Timepoint       1    60    2.297  0.1349
## Race            1    37    0.303  0.5855
## STEM.major      2    60    0.140  0.8697
## Section         8    37    1.907  0.0882
## Gender          2    60    3.783  0.0284
## FirstGen_IR     1    37   10.251  0.0028
## SMCM.GPA        1    37   55.463  <.0001
## HS.GPA_IR      67    37    1.230  0.2498
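In the model above, HS.GPA_IR enters with 67 numerator df, which suggests it is being treated as a factor in dat_full (it is converted to numeric only in demo_report). If a continuous covariate is intended, a sketch under that assumption, using a new column (HS.GPA_num, a name introduced here) so the fits above are unaffected:

dat_full$HS.GPA_num <- suppressWarnings(as.numeric(as.character(dat_full$HS.GPA_IR)))
anova(lme(Grit_Total ~ Timepoint + Race + STEM.major + Section + Gender + FirstGen_IR + SMCM.GPA + HS.GPA_num,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))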
# Overall Grit
anova(lme(Grit_Total ~ Gender*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                  numDF denDF  F-value p-value
## (Intercept)          1   134 3364.770  <.0001
## Gender               2    71    3.689  0.0299
## Timepoint            1    71    1.737  0.1917
## Gender:Timepoint     2    71    3.328  0.0415
anova(lme(Grit_Total ~  FirstGen_IR*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                       numDF denDF  F-value p-value
## (Intercept)               1   132 3388.207  <.0001
## FirstGen_IR               2   132    2.966  0.0550
## Timepoint                 1    73    1.504  0.2239
## FirstGen_IR:Timepoint     2    73    1.957  0.1486
anova(lme(Grit_Total ~ SMCM.GPA*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                    numDF denDF  F-value p-value
## (Intercept)            1   133 4091.675  <.0001
## SMCM.GPA               1   133   35.903  <.0001
## Timepoint              1    74    0.663  0.4180
## SMCM.GPA:Timepoint     1    74    2.724  0.1031
anova(lme(Grit_Total ~ LABGrade_IR,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   123 3553.153  <.0001
## LABGrade_IR    11   123    2.065  0.0277
dat_full$LABGrade_IR <- as.factor(dat_full$LABGrade_IR)

summary_data <- dat_full %>%
  group_by(LABGrade_IR) %>%
  summarise(
    mean_grit = mean(Grit_Total, na.rm = TRUE),
    se = sd(Grit_Total, na.rm = TRUE) / sqrt(n()),  # Standard error
    .groups = 'drop'
  ) %>%
  mutate(
    ci_lower = mean_grit - 1.96 * se,
    ci_upper = mean_grit + 1.96 * se
  )


plot_data <- dat_full %>%
  filter(Gender != "Other") %>%  # Remove "Other"
  mutate(Timepoint = factor(Timepoint, levels = c("Pre", "Post"))) %>%  # Set order
  group_by(Gender, Timepoint) %>%
  summarise(
    mean_grit = mean(Grit_Total, na.rm = TRUE),
    se_grit = sd(Grit_Total, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

#install.packages("ggplot2")
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Create the plot
p1=ggplot(plot_data, aes(x = Timepoint, y = mean_grit, group = Gender, color = Gender)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = mean_grit - se_grit, ymax = mean_grit + se_grit), width = 0.1) +
  labs(
    title = "Grit: Overall Score",
    x = "Timepoint",
    y = "Mean Grit Total"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
dat_full %>%
  filter(Timepoint == "Post") %>%
  ggplot(aes(x = SMCM.GPA, y = Grit_Total)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Grit Total vs SMCM GPA (Post Timepoint Only)",
    x = "SMCM GPA",
    y = "Grit Total"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Perseverance 
anova(lme(Grit_Perseverance ~ Timepoint + Race + STEM.major + Section + Gender + FirstGen_IR + SMCM.GPA + HS.GPA_IR,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1    60 3752.795  <.0001
## Timepoint       1    60    0.145  0.7048
## Race            1    36    0.768  0.3868
## STEM.major      2    60    0.690  0.5053
## Section         8    36    2.040  0.0692
## Gender          2    60    2.992  0.0577
## FirstGen_IR     1    36    7.132  0.0113
## SMCM.GPA        1    36   45.314  <.0001
## HS.GPA_IR      67    36    1.078  0.4109
anova(lme(Grit_Perseverance ~ Gender*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                  numDF denDF   F-value p-value
## (Intercept)          1   133 2823.9731  <.0001
## Gender               2    71    2.5440  0.0857
## Timepoint            1    71    0.1500  0.6996
## Gender:Timepoint     2    71    4.2413  0.0182
anova(lme(Grit_Perseverance ~ FirstGen_IR*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                       numDF denDF   F-value p-value
## (Intercept)               1   131 2852.4133  <.0001
## FirstGen_IR               2   131    2.7900  0.0651
## Timepoint                 1    73    0.1551  0.6949
## FirstGen_IR:Timepoint     2    73    1.6719  0.1950
anova(lme(Grit_Perseverance ~ SMCM.GPA*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                    numDF denDF  F-value p-value
## (Intercept)            1   132 3450.833  <.0001
## SMCM.GPA               1   132   33.109  <.0001
## Timepoint              1    74    0.008  0.9308
## SMCM.GPA:Timepoint     1    74    0.196  0.6592
plot_data2 <- dat_full %>%
  filter(Gender != "Other") %>%  # Remove "Other"
  mutate(Timepoint = factor(Timepoint, levels = c("Pre", "Post"))) %>%  # Set order
  group_by(Gender, Timepoint) %>%
  summarise(
    mean_grit = mean(Grit_Perseverance, na.rm = TRUE),
    se_grit = sd(Grit_Perseverance, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

# Create the plot
p2=ggplot(plot_data2, aes(x = Timepoint, y = mean_grit, group = Gender, color = Gender)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = mean_grit - se_grit, ymax = mean_grit + se_grit), width = 0.1) +
  labs(
    title = "Grit: Perseverance",
    x = "Timepoint",
    y = "Mean Grit Pers."
  ) +
  theme_minimal() +
  theme(legend.position = "none")

p2

anova(lme(Grit_Consistency ~ Timepoint + Race + STEM.major + Section + Gender + FirstGen_IR + SMCM.GPA + HS.GPA_IR,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1    60 4646.327  <.0001
## Timepoint       1    60    6.363  0.0143
## Race            1    37    0.241  0.6267
## STEM.major      2    60    0.564  0.5719
## Section         8    37    1.523  0.1827
## Gender          2    60    4.399  0.0165
## FirstGen_IR     1    37    8.855  0.0051
## SMCM.GPA        1    37   42.871  <.0001
## HS.GPA_IR      67    37    1.281  0.2084
anova(lme(Grit_Consistency ~ Gender*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                  numDF denDF  F-value p-value
## (Intercept)          1   134 3348.430  <.0001
## Gender               2    71    4.585  0.0134
## Timepoint            1    71    4.559  0.0362
## Gender:Timepoint     2    71    0.672  0.5137
anova(lme(Grit_Consistency ~ FirstGen_IR*Timepoint,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                       numDF denDF  F-value p-value
## (Intercept)               1   132 3302.930  <.0001
## FirstGen_IR               2   132    2.172  0.1180
## Timepoint                 1    73    3.992  0.0494
## FirstGen_IR:Timepoint     2    73    1.037  0.3598
anova(lme(Grit_Consistency ~ LABGrade_IR,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   123 3413.294  <.0001
## LABGrade_IR    11   123    1.902  0.0451
plot_data3 <- dat_full %>%
  filter(Gender != "Other") %>%  # Remove "Other"
  mutate(Timepoint = factor(Timepoint, levels = c("Pre", "Post"))) %>%  # Set order
  group_by(Gender, Timepoint) %>%
  summarise(
    mean_grit = mean(Grit_Consistency, na.rm = TRUE),
    se_grit = sd(Grit_Consistency, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

# Create the plot
p3=ggplot(plot_data3, aes(x = Timepoint, y = mean_grit, group = Gender, color = Gender)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = mean_grit - se_grit, ymax = mean_grit + se_grit), width = 0.1) +
  labs(
    title = "Grit: Consistency",
    x = "Timepoint",
    y = "Mean Grit Cons."
  ) +
  theme_minimal() +
  theme(legend.position = "none")

p3

library(patchwork)  # install.packages("patchwork") if not installed

# Assuming p1, p2, p3 are your ggplot objects

# Combine the plots
combined_plot <- p1 / (p2 + p3) + 
  plot_layout(heights = c(2, 1))  # Makes p1 taller

# Print the final layout
print(combined_plot)

A significant difference was found in the “Grit” subconstruct “Consistency of Interest”. These items include:

1. New ideas and projects sometimes distract me from previous ones.
2. I often set a goal but later choose to pursue a different one.
3. I have difficulty maintaining my focus on projects that take more than a few months to complete.
4. I become interested in new pursuits every few months.
5. I have been obsessed with a certain idea or project for a short time but later lost interest.
6. I often abandon a task once it becomes difficult.

If a student improved in Consistency of Interest but not in Perseverance of Effort, it suggests a specific kind of developmental change in how they approach long-term goals: their interests became more stable, while their sustained effort in the face of difficulty did not change. A student with this pattern might benefit from support in resilience, discipline, or coping strategies, which grit-building programs could emphasize.

##Experimental Design- First Year Undergraduate Level

https://q4b.biology.ubc.ca/concept-inventories/experimental-design-first-year-undergraduate-level/experimental-design-first-year-undergraduate-level-concept-inventory-package/

correct_answers <- c(
  "No - mice should face a random direction and the enclosure should rotate randomly",
  "It should lead to mice with variable characteristics being distributed fairly evenly",
  "The other two methods do not directly assess weight changes in mice",
  "It provides more chances to compare activity rate across different temperatures",
  "Other variables as well as temperature will differ between treatment groups",
  "Mice in each treatment group should not vary in size, age or sex (small, young, females only)",
  "The 14C water temperature treatment group",
  "Take measurements from 14 fish in one temperature group but take measurements from all 15 fish in the other temperature groups (total n=44)",
  "The weights of fish in each treatment group were too variable",
  "Sex, age, health and weight at start",
  "Use a set of weighing scales that is able to measure smaller weights",
  "You can test all four of these hypotheses",
  "Spray plain water on another group of 150 tomato plants to compare results with P1, P2 and P3 ",
  "Potting soil type and temperature",
  "If you are able to support a hypothesis (either the alternate or null hypothesis) in all three experiments",
  "Any of the above sources could affect data more than the others in any given experiment",
  "Splitting the treatment groups into thirds is unfair as field conditions might vary",
  "Count aphids on 1 randomly selected leaf from 100 plants"
)

# Corresponding column names in your dataset
question_cols <- paste0("ED", 1:18)


for (i in seq_along(question_cols)) {
  col <- question_cols[i]
  dat_full[[col]] <- as.integer(
    # trim the answer key as well; one key string carries a trailing space that would otherwise never match
    trimws(iconv(dat_full[[col]], from = "", to = "UTF-8")) == trimws(correct_answers[i])
  )
}

# Optional: Calculate total score
dat_full$EDCI_Total_Score <- rowSums(dat_full[question_cols], na.rm = TRUE)


dat_full %>%
  group_by(Timepoint) %>%
  summarise(
    Mean_Score = mean(EDCI_Total_Score, na.rm = TRUE),
    SD_Score = sd(EDCI_Total_Score, na.rm = TRUE),
    N = n()
  )
## # A tibble: 2 × 4
##   Timepoint Mean_Score SD_Score     N
##   <chr>          <dbl>    <dbl> <int>
## 1 Post            7.24     3.77   132
## 2 Pre             7.60     3.33    91
anova(lme(EDCI_Total_Score ~ Timepoint + Race + STEM.major + Section + Gender + FirstGen_IR + SMCM.GPA + HS.GPA_IR,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1    63 889.4430  <.0001
## Timepoint       1    63   1.7480  0.1909
## Race            1    37   9.6590  0.0036
## STEM.major      2    63   2.0217  0.1409
## Section         8    37   1.4900  0.1942
## Gender          2    63   0.5619  0.5730
## FirstGen_IR     1    37   0.1228  0.7280
## SMCM.GPA        1    37  22.7491  <.0001
## HS.GPA_IR      67    37   0.8162  0.7678
anova(lme(EDCI_Total_Score ~ Timepoint*Race ,  random = ~1 | StudentID,  data = dat_full,  na.action = na.omit))
##                numDF denDF  F-value p-value
## (Intercept)        1   133 869.1794  <.0001
## Timepoint          1    77   1.3332  0.2518
## Race               1   133   8.5629  0.0040
## Timepoint:Race     1    77   1.3806  0.2436
plot_data4 <- dat_full %>%
  mutate(Timepoint = factor(Timepoint, levels = c("Pre", "Post"))) %>%  # Set order
  group_by(Race, Timepoint) %>%
  summarise(
    mean = mean(EDCI_Total_Score, na.rm = TRUE),
    se = sd(EDCI_Total_Score, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

# Create the plot
ggplot(plot_data4, aes(x = Timepoint, y = mean, group = Race, color = Race)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
  labs(
    title = "Interaction of Race and Timepoint on EDCI Total",
    x = "Timepoint",
    y = "Mean EDCI"
  ) +
  theme_minimal()

The density plot of EDCI total scores shows no evidence of ceiling or floor effects. Most scores fall in the mid-range (around 7–11 out of 18), with no clustering at the maximum or minimum values. This suggests the assessment is appropriately scaled for this student group, leaving room to detect both lower and higher levels of understanding.
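The density plot referenced above is not generated earlier in this script; a minimal sketch of how it could be produced from EDCI_Total_Score (scored 0–18 above):

ggplot(dat_full, aes(x = EDCI_Total_Score)) +
  geom_density(fill = "steelblue", alpha = 0.4) +
  labs(
    title = "Density of EDCI Total Scores",
    x = "EDCI Total Score (0-18)",
    y = "Density"
  ) +
  theme_minimal()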

| Subconstruct | Items |
|---|---|
| Randomization and Bias | ED1, ED2, ED16, ED17 |
| Manipulation of Variables | ED7 |
| Measurement of Outcomes | ED3, ED4, ED11 |
| Control of Confounding | ED5, ED6, ED10, ED13, ED14 |
| Replication and Sample Size | ED8, ED15, ED18 |
| Variable Properties | ED9, ED12 |
#analysis of subconstructs

randomization_bias_items <- c("ED1", "ED2", "ED16", "ED17")
manipulation_items        <- c("ED7")
measurement_items         <- c("ED3", "ED4", "ED11")
confounding_items         <- c("ED5", "ED6", "ED10", "ED13", "ED14")
replication_items         <- c("ED8", "ED15", "ED18")
variable_props_items      <- c("ED9", "ED12")

all_items <- c(
  randomization_bias_items,
  manipulation_items,
  measurement_items,
  confounding_items,
  replication_items,
  variable_props_items
)
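
# The ED items in dat_paired were not scored against the answer key above (that loop ran
# on dat_full only), so they are assumed here to still hold the raw response text.
# A sketch of applying the same scoring before the subscale means are computed:
for (i in seq_along(question_cols)) {
  col <- question_cols[i]
  dat_paired[[col]] <- as.integer(
    trimws(iconv(dat_paired[[col]], from = "", to = "UTF-8")) == trimws(correct_answers[i])
  )
}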

ed_columns <- grep("^ED", names(dat_paired), value = TRUE)
dat_paired[ed_columns] <- lapply(dat_paired[ed_columns], function(x) as.numeric(as.character(x)))



dat_paired$Bias_Score         <- rowMeans(dat_paired[randomization_bias_items], na.rm = TRUE)
dat_paired$Manip_Score        <- rowMeans(dat_paired[manipulation_items], na.rm = TRUE)
dat_paired$Measure_Score      <- rowMeans(dat_paired[measurement_items], na.rm = TRUE)
dat_paired$Confound_Score     <- rowMeans(dat_paired[confounding_items], na.rm = TRUE)
dat_paired$Replication_Score  <- rowMeans(dat_paired[replication_items], na.rm = TRUE)
dat_paired$VariableProp_Score <- rowMeans(dat_paired[variable_props_items], na.rm = TRUE)

dat_full$Bias_Score         <- rowMeans(dat_full[randomization_bias_items], na.rm = TRUE)
dat_full$Manip_Score        <- rowMeans(dat_full[manipulation_items], na.rm = TRUE)
dat_full$Measure_Score      <- rowMeans(dat_full[measurement_items], na.rm = TRUE)
dat_full$Confound_Score     <- rowMeans(dat_full[confounding_items], na.rm = TRUE)
dat_full$Replication_Score  <- rowMeans(dat_full[replication_items], na.rm = TRUE)
dat_full$VariableProp_Score <- rowMeans(dat_full[variable_props_items], na.rm = TRUE)

dat_paired %>%
  group_by(Timepoint) %>%
  summarise(
    Mean_Bias         = mean(Bias_Score, na.rm = TRUE),
    Mean_Manip        = mean(Manip_Score, na.rm = TRUE),
    Mean_Measure      = mean(Measure_Score, na.rm = TRUE),
    Mean_Confound     = mean(Confound_Score, na.rm = TRUE),
    Mean_Replication  = mean(Replication_Score, na.rm = TRUE),
    Mean_VariableProp = mean(VariableProp_Score, na.rm = TRUE),
    N = n()
  )

anova(lme(Bias_Score ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(Manip_Score ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(Measure_Score ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(Confound_Score ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(Replication_Score ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(VariableProp_Score ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))

anova(lme(Measure_Score ~ Timepoint+Age+Race+Major+Gender+First.gen, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))


anova(lme(Bias_Score ~ Timepoint+Age+Race+Major+Gender+First.gen, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(Manip_Score ~ Timepoint+Age+Race+Major+Gender+First.gen, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(Confound_Score ~ Timepoint+Age+Race+Major+Gender+First.gen, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(Replication_Score ~ Timepoint+Age+Race+Major+Gender+First.gen, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
anova(lme(VariableProp_Score ~ Timepoint+Age+Race+Major+Gender+First.gen, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))

anova(lme(Replication_Score ~ Timepoint*Gender, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))

##Project Ownership Survey:

https://www.lifescied.org/doi/10.1187/cbe.13-06-0123

Administered only on the post-course survey.

# Scoring for POS1–POS10 (agreement scale)
ownership_map <- c(
  "Strongly disagree" = 1,
  "Somewhat disagree" = 2,
  "Neither agree nor disagree" = 3,
  "Somewhat agree" = 4,
  "Strongly agree" = 5
)

# Scoring for POS11–POS16 (emotion scale)
emotion_map <- c(
  "Very slightly" = 1,
  "Slightly" = 2,
  "Moderate" = 3,
  "Considerably" = 4,
  "Very strongly" = 5
)


# Ownership items: POS1–POS10
ownership_cols <- paste0("POS", 1:10)
dat_full[ownership_cols] <- lapply(dat_full[ownership_cols], function(x) {
  as.numeric(ownership_map[as.character(x)])
})

# Emotion items: POS11–POS16
emotion_cols <- paste0("POS", 11:16)
dat_full[emotion_cols] <- lapply(dat_full[emotion_cols], function(x) {
  as.numeric(emotion_map[as.character(x)])
})


all_pos_items <- dat_full[, paste0("POS", 1:16)]

# Calculate Cronbach's alpha
alpha_result <- psych::alpha(all_pos_items)

# View the results
print(alpha_result)
## 
## Reliability analysis   
## Call: psych::alpha(x = all_pos_items)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
##       0.88      0.89    0.94      0.34 8.1 0.011  3.3 0.72     0.33
## 
##     95% confidence boundaries 
##          lower alpha upper
## Feldt     0.86  0.88  0.91
## Duhachek  0.86  0.88  0.90
## 
##  Reliability if an item is dropped:
##       raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## POS1       0.87      0.88    0.94      0.34 7.6    0.011 0.028  0.33
## POS2       0.87      0.88    0.93      0.33 7.3    0.012 0.028  0.32
## POS3       0.88      0.89    0.94      0.34 7.7    0.011 0.031  0.33
## POS4       0.88      0.88    0.94      0.34 7.7    0.011 0.030  0.32
## POS5       0.87      0.88    0.94      0.33 7.4    0.012 0.029  0.32
## POS6       0.88      0.89    0.94      0.34 7.8    0.011 0.031  0.33
## POS7       0.87      0.88    0.94      0.33 7.2    0.012 0.030  0.32
## POS8       0.88      0.89    0.94      0.35 8.0    0.011 0.030  0.34
## POS9       0.87      0.88    0.93      0.32 7.2    0.012 0.028  0.31
## POS10      0.87      0.87    0.93      0.32 7.0    0.012 0.027  0.30
## POS11      0.88      0.89    0.94      0.34 7.7    0.011 0.030  0.33
## POS12      0.88      0.89    0.94      0.35 7.9    0.011 0.027  0.34
## POS13      0.88      0.89    0.94      0.34 7.8    0.011 0.029  0.35
## POS14      0.88      0.88    0.94      0.33 7.5    0.011 0.032  0.33
## POS15      0.89      0.89    0.94      0.36 8.3    0.011 0.026  0.35
## POS16      0.88      0.88    0.93      0.34 7.6    0.011 0.030  0.32
## 
##  Item statistics 
##         n raw.r std.r r.cor r.drop mean   sd
## POS1  108  0.69  0.63  0.62   0.59  3.3 1.20
## POS2  108  0.75  0.71  0.71   0.67  3.1 1.19
## POS3  108  0.55  0.59  0.55   0.52  4.1 0.79
## POS4  108  0.61  0.59  0.56   0.53  4.1 0.91
## POS5  107  0.72  0.67  0.65   0.63  3.8 1.12
## POS6  108  0.61  0.57  0.53   0.49  2.2 1.23
## POS7  108  0.77  0.73  0.71   0.70  2.8 1.21
## POS8  107  0.53  0.50  0.45   0.43  3.8 1.03
## POS9  108  0.79  0.76  0.77   0.71  3.8 1.20
## POS10 108  0.82  0.81  0.82   0.76  3.5 1.23
## POS11  78  0.58  0.58  0.55   0.44  3.0 0.66
## POS12  78  0.49  0.52  0.51   0.36  3.1 0.79
## POS13  75  0.54  0.54  0.53   0.40  3.0 0.74
## POS14  77  0.67  0.65  0.64   0.49  3.0 0.68
## POS15  80  0.39  0.39  0.35   0.26  3.2 0.69
## POS16  73  0.62  0.60  0.59   0.45  3.0 0.71
## 
## Non missing response frequency for each item
##          1    2    3    4    5 miss
## POS1  0.11 0.11 0.27 0.35 0.16 0.52
## POS2  0.13 0.14 0.36 0.24 0.13 0.52
## POS3  0.02 0.01 0.12 0.54 0.31 0.52
## POS4  0.01 0.06 0.10 0.44 0.39 0.52
## POS5  0.06 0.08 0.16 0.42 0.28 0.52
## POS6  0.41 0.24 0.20 0.08 0.06 0.52
## POS7  0.19 0.21 0.31 0.20 0.08 0.52
## POS8  0.04 0.07 0.22 0.40 0.27 0.52
## POS9  0.10 0.02 0.19 0.40 0.30 0.52
## POS10 0.10 0.08 0.24 0.34 0.23 0.52
## POS11 0.00 0.21 0.56 0.23 0.00 0.65
## POS12 0.00 0.27 0.37 0.36 0.00 0.65
## POS13 0.00 0.25 0.45 0.29 0.00 0.66
## POS14 0.00 0.23 0.55 0.22 0.00 0.65
## POS15 0.00 0.15 0.49 0.36 0.00 0.64
## POS16 0.00 0.26 0.51 0.23 0.00 0.67
all_pos_items <- paste0("POS", 1:16)

Cronbach’s alpha = 0.88 → good internal consistency

The 16-item Project Ownership Survey showed good internal consistency, with a Cronbach’s alpha of 0.88. This suggests that the items reliably measure a common underlying construct related to students’ sense of ownership and affect in research experiences. The average inter-item correlation (0.34) falls within the acceptable range, indicating coherent but non-redundant items.
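Because POS1–POS10 (agreement scale) and POS11–POS16 (emotion scale) use different response formats, it may also be worth checking reliability within each block; a quick sketch using the column groups defined above:

psych::alpha(dat_full[, ownership_cols])$total$raw_alpha   # ownership items POS1-POS10
psych::alpha(dat_full[, emotion_cols])$total$raw_alpha     # emotion items POS11-POS16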

all_pos_items <- dat_full[, paste0("POS", 1:16)]

# Calculate overall mean score
dat_full$Ownership_Score <- rowMeans(dat_full[ownership_cols], na.rm = TRUE)
dat_full$Emotion_Score   <- rowMeans(dat_full[emotion_cols], na.rm = TRUE)

summary(dat_full[, c("Ownership_Score", "Emotion_Score")])
##  Ownership_Score Emotion_Score  
##  Min.   :1.50    Min.   :2.000  
##  1st Qu.:3.00    1st Qu.:2.667  
##  Median :3.50    Median :3.000  
##  Mean   :3.45    Mean   :3.026  
##  3rd Qu.:4.00    3rd Qu.:3.333  
##  Max.   :5.00    Max.   :4.000  
##  NA's   :115     NA's   :126
anova(lme(Emotion_Score ~ Ownership_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##                 numDF denDF  F-value p-value
## (Intercept)         1    87 4240.424  <.0001
## Ownership_Score     1     7   41.086   4e-04
summary(lme(Emotion_Score ~ Ownership_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
## Linear mixed-effects model fit by REML
##   Data: dat_full 
##        AIC      BIC    logLik
##   124.1051 134.2782 -58.05253
## 
## Random effects:
##  Formula: ~1 | StudentID
##         (Intercept)  Residual
## StdDev:   0.3331371 0.2833528
## 
## Fixed effects:  Emotion_Score ~ Ownership_Score 
##                     Value  Std.Error DF  t-value p-value
## (Intercept)     1.3920173 0.25503887 87 5.458059   0e+00
## Ownership_Score 0.4610219 0.07192424  7 6.409826   4e-04
##  Correlation: 
##                 (Intr)
## Ownership_Score -0.984
## 
## Standardized Within-Group Residuals:
##          Min           Q1          Med           Q3          Max 
## -2.633892317 -0.385863800  0.007442814  0.469794749  1.536269445 
## 
## Number of Observations: 96
## Number of Groups: 88
library(tidyr)
dat_long <- dat_full %>%
  select(StudentID, Ownership_Score, Emotion_Score) %>%
  pivot_longer(cols = c(Ownership_Score, Emotion_Score),
               names_to = "Subscale",
               values_to = "Score")

means_df <- dat_long %>%
  group_by(Subscale) %>%
  summarise(Mean_Score = mean(Score, na.rm = TRUE), .groups = "drop")

# Step 3: Plot with density, mean lines, and mean value labels
ggplot(dat_long, aes(x = Score, fill = Subscale, color = Subscale)) +
  geom_density(alpha = 0.4, adjust = 1.2) +
  geom_vline(data = means_df, aes(xintercept = Mean_Score, color = Subscale),
             linetype = "dashed", size = 1) +
  geom_text(data = means_df, aes(x = Mean_Score, y = 0, label = round(Mean_Score, 2), color = Subscale),
            angle = 90, vjust = -0.5, hjust = -11.6, size = 4, show.legend = FALSE) +
  labs(
    title = "Density Plot of POS Subscale Scores with Means",
    x = "Score (1–5 scale)",
    y = "Density"
  ) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()
## Warning: Removed 241 rows containing non-finite outside the scale range
## (`stat_density()`).

library(ggpubr)
ggplot(dat_full, aes(x = Emotion_Score, y = Ownership_Score)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  stat_cor(method = "pearson", label.x = min(dat_full$Emotion_Score, na.rm = TRUE), 
           label.y = max(dat_full$Ownership_Score, na.rm = TRUE)) +
  labs(
    title = "Correlation Between Emotion and Ownership Scores",
    x = "Emotion Score",
    y = "Ownership Score"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 127 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 127 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 127 rows containing missing values or values outside the scale range
## (`geom_point()`).

Ownership scores were higher than emotion scores on average (means of 3.45 vs. 3.03 in the summary above): students rated their sense of ownership in the project (e.g., feeling responsible, engaged, personally invested) higher than they rated their emotional reactions (e.g., feeling joy, amazement, surprise). Ownership and emotion scores were also positively correlated (see the regression and scatterplot above).

Interpretation in context: this might suggest that the project was meaningful and engaging but did not always produce strong emotional highs (or lows), or that students express ownership in a cognitive, goal-oriented way rather than only emotionally.
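The summaries above show the size of the ownership-emotion gap, but no direct paired test of it is run in this script; a minimal sketch of one, assuming each student contributes at most one post-survey row (the POS was administered only at post):

t.test(dat_full$Ownership_Score, dat_full$Emotion_Score, paired = TRUE)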

# Correlations with subscales
anova(lme(Grit_Total ~ Ownership_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##                 numDF denDF   F-value p-value
## (Intercept)         1    95 2214.6042  <.0001
## Ownership_Score     1    11    3.8879  0.0743
anova(lme(Grit_Total ~ Emotion_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##               numDF denDF   F-value p-value
## (Intercept)       1    88 2001.4689  <.0001
## Emotion_Score     1     7    1.6232  0.2433
anova(lme(Grit_Perseverance ~ Ownership_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##                 numDF denDF   F-value p-value
## (Intercept)         1    94 1800.6793  <.0001
## Ownership_Score     1    11    3.0786  0.1071
anova(lme(Grit_Perseverance ~ Emotion_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##               numDF denDF  F-value p-value
## (Intercept)       1    87 1598.328  <.0001
## Emotion_Score     1     7    0.768  0.4099
anova(lme(Grit_Consistency ~ Ownership_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##                 numDF denDF F-value p-value
## (Intercept)         1    95 2247.36  <.0001
## Ownership_Score     1    11    3.68  0.0814
anova(lme(Grit_Consistency ~ Emotion_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##               numDF denDF   F-value p-value
## (Intercept)       1    88 2046.9984  <.0001
## Emotion_Score     1     7    0.9765   0.356
anova(lme(EDCI_Total_Score ~ Ownership_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##                 numDF denDF  F-value p-value
## (Intercept)         1    95 754.4126  <.0001
## Ownership_Score     1    11   0.1425   0.713
anova(lme(EDCI_Total_Score ~ Emotion_Score , random = ~1 | StudentID, data = dat_full, na.action = na.omit))
##               numDF denDF  F-value p-value
## (Intercept)       1    88 757.6615  <.0001
## Emotion_Score     1     7   1.0332  0.3433
anova(lm(Emotion_Score ~ Gender, data = dat_full,na.action = na.omit))
## Analysis of Variance Table
## 
## Response: Emotion_Score
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Gender     2  0.2124 0.10619  0.3644 0.6956
## Residuals 94 27.3948 0.29143
anova(lm(Ownership_Score ~ Gender, data = dat_full,na.action = na.omit))
## Analysis of Variance Table
## 
## Response: Ownership_Score
##            Df Sum Sq Mean Sq F value Pr(>F)  
## Gender      2  2.876 1.43820  2.4041 0.0953 .
## Residuals 105 62.813 0.59822                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(Ownership_Score ~ Emotion_Score, data = dat_full,na.action = na.omit))
## Analysis of Variance Table
## 
## Response: Ownership_Score
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## Emotion_Score  1 12.255 12.2554  45.382 1.283e-09 ***
## Residuals     94 25.385  0.2701                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##STEPU https://pmc.ncbi.nlm.nih.gov/articles/PMC5132356/

The STEP-U survey captures students’ values, priorities, and self-assessed learning goals related to science education.

Subcategories:

- Collaboration/Communication: Work in groups, Scientific writing (Q1-2)
- Conceptual Understanding: Acquire major scientific concepts, Understand the dynamic nature of science, Understand how science applies to everyday life (Q3-7)
- Technical Skills: Learn lab skills, Memorize facts, Remember formulas (Q5-8)
- Higher-Order Thinking: Apply quantitative reasoning, Problem-solving, Creativity and innovation (Q9-12)
- Science Literacy: Information literacy, Interdisciplinary understanding (Q13-14)

stepu_map <- c(
  "Not Important" = 1,
  "Slightly important" = 2,
  "Fairly important" = 3,
  "Important" = 4,
  "Very Important" = 5
)

# Specify STEPU columns
#Individual question analysis 
stepu_items <- paste0("STEP", 1:14)

dat_full[stepu_items] <- lapply(dat_full[stepu_items], function(col) {
  as.numeric(stepu_map[as.character(col)])
})

stepu_long <- reshape(
  dat_full[, c("Timepoint", stepu_items)],
  varying = stepu_items,
  v.names = "Score",
  timevar = "Question",
  times = stepu_items,
  ids = seq_len(nrow(dat_full)),
  direction = "long"
)

# Calculate mean scores per question per timepoint
stepu_means <- aggregate(
  Score ~ Timepoint + Question,
  data = stepu_long,
  FUN = function(x) mean(x, na.rm = TRUE)
)

# View result
print(stepu_means)
##    Timepoint Question    Score
## 1       Post    STEP1 3.816794
## 2        Pre    STEP1 3.219780
## 3       Post   STEP10 4.572519
## 4        Pre   STEP10 4.692308
## 5       Post   STEP11 4.229008
## 6        Pre   STEP11 4.263736
## 7       Post   STEP12 4.083333
## 8        Pre   STEP12 4.087912
## 9       Post   STEP13 3.992424
## 10       Pre   STEP13 3.934066
## 11      Post   STEP14 4.469697
## 12       Pre   STEP14 4.373626
## 13      Post    STEP2 4.022727
## 14       Pre    STEP2 3.865169
## 15      Post    STEP3 3.748092
## 16       Pre    STEP3 3.666667
## 17      Post    STEP4 4.213740
## 18       Pre    STEP4 4.318681
## 19      Post    STEP5 4.310606
## 20       Pre    STEP5 4.230769
## 21      Post    STEP6 4.083969
## 22       Pre    STEP6 3.988889
## 23      Post    STEP7 4.030769
## 24       Pre    STEP7 4.065934
## 25      Post    STEP8 3.462121
## 26       Pre    STEP8 3.461538
## 27      Post    STEP9 3.954198
## 28       Pre    STEP9 3.857143
dat_paired[stepu_items] <- lapply(dat_paired[stepu_items], function(col) {
  as.numeric(stepu_map[as.character(col)])
})

# Subcategory item groups
collab_items   <- c("STEP1", "STEP2")
concept_items  <- c("STEP3", "STEP4", "STEP5", "STEP6", "STEP7")
technical_items<- c("STEP5", "STEP6", "STEP7", "STEP8")
thinking_items <- c("STEP9", "STEP10", "STEP11", "STEP12")
literacy_items <- c("STEP13", "STEP14")

# Compute subscale means per student
dat_full <- dat_full %>%
  dplyr::mutate(
    Collab_Comm       = rowMeans(across(all_of(collab_items)), na.rm = TRUE),
    Concept_Under     = rowMeans(across(all_of(concept_items)), na.rm = TRUE),
    Technical_Skills  = rowMeans(across(all_of(technical_items)), na.rm = TRUE),
    HigherOrder_Think = rowMeans(across(all_of(thinking_items)), na.rm = TRUE),
    Science_Literacy  = rowMeans(across(all_of(literacy_items)), na.rm = TRUE)
  )

# Optional: view sample
head(dat_full[, c("Collab_Comm", "Concept_Under", "Technical_Skills", 
                    "HigherOrder_Think", "Science_Literacy")])
##   Collab_Comm Concept_Under Technical_Skills HigherOrder_Think Science_Literacy
## 1         4.5           4.6             4.25              4.50              4.0
## 2         3.0           3.8             3.50              3.75              4.0
## 3         5.0           3.2             4.00              4.50              4.0
## 4         2.0           2.0             2.25              4.25              4.0
## 5         3.5           2.8             2.25              4.75              3.0
## 6         4.0           4.0             4.25              4.25              4.5
dat_full %>%
group_by(Timepoint) %>%
  summarise(
    Mean_Collab_Comm       = mean(Collab_Comm, na.rm = TRUE),
    Mean_Concept_Under     = mean(Concept_Under, na.rm = TRUE),
    Mean_Technical_Skills  = mean(Technical_Skills, na.rm = TRUE),
    Mean_HigherOrder_Think = mean(HigherOrder_Think, na.rm = TRUE),
    Mean_Science_Literacy  = mean(Science_Literacy, na.rm = TRUE),
    N = n()
  )
## # A tibble: 2 × 7
##   Timepoint Mean_Collab_Comm Mean_Concept_Under Mean_Technical_Skills
##   <chr>                <dbl>              <dbl>                 <dbl>
## 1 Post                  3.92               4.08                  3.97
## 2 Pre                   3.53               4.06                  3.94
## # ℹ 3 more variables: Mean_HigherOrder_Think <dbl>,
## #   Mean_Science_Literacy <dbl>, N <int>
anova(lme(Collab_Comm ~ Timepoint, random = ~1 | StudentID, data = dat_full,na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   134 3603.196  <.0001
## Timepoint       1    78   14.013   3e-04
anova(lme(Concept_Under ~ Timepoint, random = ~1 | StudentID, data = dat_full,na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   134 4592.855  <.0001
## Timepoint       1    78    0.000  0.9852
anova(lme(Technical_Skills ~ Timepoint, random = ~1 | StudentID, data = dat_full,na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   134 3677.333  <.0001
## Timepoint       1    78    0.003  0.9564
anova(lme(HigherOrder_Think ~ Timepoint, random = ~1 | StudentID, data = dat_full,na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   134 7609.933  <.0001
## Timepoint       1    78    0.374  0.5428
anova(lme(Science_Literacy ~ Timepoint, random = ~1 | StudentID, data = dat_full,na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1   134 4697.444  <.0001
## Timepoint       1    78    0.483   0.489
###########################
# Compute subscale means per student
dat_paired <- dat_paired %>%
  mutate(
    Collab_Comm       = rowMeans(across(all_of(collab_items)), na.rm = TRUE),
    Concept_Under     = rowMeans(across(all_of(concept_items)), na.rm = TRUE),
    Technical_Skills  = rowMeans(across(all_of(technical_items)), na.rm = TRUE),
    HigherOrder_Think = rowMeans(across(all_of(thinking_items)), na.rm = TRUE),
    Science_Literacy  = rowMeans(across(all_of(literacy_items)), na.rm = TRUE)
  )

# Optional: view sample
head(dat_paired[, c("Collab_Comm", "Concept_Under", "Technical_Skills", 
                    "HigherOrder_Think", "Science_Literacy")])
## # A tibble: 6 × 5
##   Collab_Comm Concept_Under Technical_Skills HigherOrder_Think Science_Literacy
##         <dbl>         <dbl>            <dbl>             <dbl>            <dbl>
## 1           3           3.8             3.5               3.75              4  
## 2           5           3.2             4                 4.5               4  
## 3           4           4               4                 4                 4  
## 4           3           3.8             4                 4                 4  
## 5           3           4.6             4.75              4.5               4.5
## 6           4           4.8             4.75              5                 5
dat_paired %>%
  group_by(Timepoint) %>%
  summarise(
    Mean_Collab_Comm       = mean(Collab_Comm, na.rm = TRUE),
    Mean_Concept_Under     = mean(Concept_Under, na.rm = TRUE),
    Mean_Technical_Skills  = mean(Technical_Skills, na.rm = TRUE),
    Mean_HigherOrder_Think = mean(HigherOrder_Think, na.rm = TRUE),
    Mean_Science_Literacy  = mean(Science_Literacy, na.rm = TRUE),
    N = n()
  )
## # A tibble: 2 × 7
##   Timepoint Mean_Collab_Comm Mean_Concept_Under Mean_Technical_Skills
##   <chr>                <dbl>              <dbl>                 <dbl>
## 1 Post                  3.91               4.10                  4.03
## 2 Pre                   3.57               4.14                  4.06
## # ℹ 3 more variables: Mean_HigherOrder_Think <dbl>,
## #   Mean_Science_Literacy <dbl>, N <int>
anova(lme(Collab_Comm ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
##             numDF denDF   F-value p-value
## (Intercept)     1    60 1752.1770  <.0001
## Timepoint       1    60    6.6285  0.0125
anova(lme(Concept_Under ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1    60 3564.520  <.0001
## Timepoint       1    60    0.235  0.6295
anova(lme(Technical_Skills ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
##             numDF denDF   F-value p-value
## (Intercept)     1    60 2626.7657  <.0001
## Timepoint       1    60    0.0861  0.7703
anova(lme(HigherOrder_Think ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
##             numDF denDF  F-value p-value
## (Intercept)     1    60 5834.914  <.0001
## Timepoint       1    60    0.726  0.3977
anova(lme(Science_Literacy ~ Timepoint, random = ~1 | StudentID, data = dat_paired,na.action = na.omit))
##             numDF denDF   F-value p-value
## (Intercept)     1    60 2556.2781  <.0001
## Timepoint       1    60    0.1776   0.675

STEP-U Question 1 shows measurable improvement. This question asks about the importance of working in groups. The Collaboration and Communication subconstruct increased significantly from pre to post: students left the class placing more value on group work and collaboration.
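The subscale models above do not test individual items; a sketch of a per-item check for STEP1 (importance of working in groups), using the same mixed-model form as the subscale analyses:

anova(lme(STEP1 ~ Timepoint, random = ~1 | StudentID, data = dat_full, na.action = na.omit))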

##LCAS Survey - https://www.lifescied.org/doi/10.1187/cbe.15-03-0073

Compute sum totals for each of the subcategories and compare the total scores.

The final three-factor solution of the LCAS consists of the following (each factor is analyzed separately, along with an overall score):

Collaboration. This factor consists of six items that ask students to evaluate the frequency with which they engaged in activities related to collaboration (e.g., provide help to other students, discuss work with other students, or critique other students’ work) and metacognition (e.g., “reflect on what I was learning”). The response options are 1, weekly; 2, monthly; 3, one or two times; 4, never.

Discovery and relevance. The second factor consists of five items that ask students to rate their agreement with statements about whether their lab work could lead to discovery of something new, development of new arguments, or generation of information of interest to the scientific community. The response options range from 1, strongly disagree, to 6, strongly agree.

Iteration. The third factor consists of six items that ask students to rate their agreement with statements about whether they had time or direction to repeat aspects of their work, such as making revisions, changing methods, and analyzing additional data. The response options ranged from 1, strongly disagree, to 6, strongly agree.

# Define scoring maps
frequency_map <- c(
  "Never" = 1,
  "One or Two Times" = 2,
  "Monthly" = 3,
  "Weekly" = 4,
  "I don't know" = NA,
  "I prefer not to respond" = NA
)

agreement_map <- c(
  "Strongly Disagree" = 1,
  "Disagree" = 2,
  "Somewhat Disagree" = 3,
  "Somewhat Agree" = 4,
  "Agree" = 5,
  "Strongly Agree" = 6,
  "I don't know" = NA,
  "I prefer not to respond" = NA
)

# Define column groupings by response scale
iteration_items <- paste0("LCAS", 1:6)      # Frequency
discovery_items <- paste0("LCAS.B", 1:5)    # Agreement
collab_items    <- paste0("LCAS.C", 1:6)    # Agreement

# Recode each subscale properly
dat_full[iteration_items] <- lapply(dat_full[iteration_items], function(x) {
  as.numeric(frequency_map[as.character(x)])
})

dat_full[discovery_items] <- lapply(dat_full[discovery_items], function(x) {
  as.numeric(agreement_map[as.character(x)])
})

dat_full[collab_items] <- lapply(dat_full[collab_items], function(x) {
  as.numeric(agreement_map[as.character(x)])
})

safe_row_mean <- function(x) {
  if (all(is.na(x))) NA else mean(x, na.rm = TRUE)
}

dat_full <- dat_full %>%
  rowwise() %>%
  mutate(
    LCAS_Iteration_Mean = safe_row_mean(c_across(all_of(iteration_items))),
   LCAS_Discovery_Mean = safe_row_mean(c_across(all_of(discovery_items))),
   LCAS_Collaboration_Mean = safe_row_mean(c_across(all_of(collab_items))),
  LCAS_Overall_Mean = safe_row_mean(c(
  LCAS_Iteration_Mean, LCAS_Discovery_Mean, LCAS_Collaboration_Mean
    ))
  ) %>%
  ungroup()
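# The survey notes above call for sum totals per subscale, while the block above computes
# means. A sketch of sum-total versions under the same item groupings (new columns, so
# nothing downstream changes); with the default na.rm = FALSE, a student missing any item
# in a subscale gets NA rather than a deflated total.
dat_full <- dat_full %>%
  mutate(
    LCAS_Iteration_Sum     = rowSums(across(all_of(iteration_items))),
    LCAS_Discovery_Sum     = rowSums(across(all_of(discovery_items))),
    LCAS_Collaboration_Sum = rowSums(across(all_of(collab_items)))
  )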


lcas_means_long <- dat_full %>%
  select(StudentID,
         LCAS_Iteration_Mean,
         LCAS_Discovery_Mean,
         LCAS_Collaboration_Mean,
         LCAS_Overall_Mean) %>%
  pivot_longer(
    cols = starts_with("LCAS_"),
    names_to = "Subscale",
    values_to = "Mean_Score"
  )

# Step 2: Density plot
ggplot(lcas_means_long, aes(x = Mean_Score, fill = Subscale, color = Subscale)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ Subscale, scales = "free") +
  labs(
    title = "Density Plot of LCAS Subscale and Overall Mean Scores",
    x = "Mean Score",
    y = "Density"
  ) +
  theme_minimal()
## Warning: Removed 465 rows containing non-finite outside the scale range
## (`stat_density()`).

iteration_labels <- c(
  LCAS1 = "Worked collaboratively in groups",
  LCAS2 = "Helped classmates with lab activities",
  LCAS3 = "Discussed data as a group",
  LCAS4 = "Worked on shared problems",
  LCAS5 = "Gave input to group decisions",
  LCAS6 = "Shared ideas or interpretations"
)

iteration_long <- dat_full %>%
  select(all_of(iteration_items)) %>%
  pivot_longer(cols = everything(), names_to = "Item", values_to = "Score") %>%
  filter(!is.na(Score)) %>%
  mutate(Score = factor(Score, levels = 1:4)) %>%
  group_by(Item, Score) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Item) %>%
  mutate(Percent = Count / sum(Count) * 100,
         Label = paste0(round(Percent), "%"),
         Question = iteration_labels[Item])  # Apply label text

# Step 3: Plot barplots per item
ggplot(iteration_long, aes(x = Score, y = Count, fill = Score)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Label), vjust = -0.5, size = 3) +
  facet_wrap(~ Question) +
  labs(
    title = "Distribution of LCAS Iteration Responses",
    x = "Score (1 = Never, 4 = Weekly)",
    y = "Number of Students"
  ) +
  scale_fill_brewer(palette = "Blues") +
  theme_minimal() +
  theme(legend.position = "none")

# Regroup the items under the published LCAS factor names for the reliability checks:
# LCAS1-6 are the frequency-scale collaboration items, LCAS.B the discovery/relevance
# items, and LCAS.C the iteration items.
collab_items    <- paste0("LCAS", 1:6)
discovery_items <- paste0("LCAS.B", 1:5)
iteration_items <- paste0("LCAS.C", 1:6)

alpha_collab <- psych::alpha(dat_full[, collab_items])
alpha_iter   <- psych::alpha(dat_full[, iteration_items])
alpha_discovery <- psych::alpha(dat_full[, discovery_items])

print(alpha_collab$total$raw_alpha)      # Collaboration alpha
## [1] 0.8056417
print(alpha_iter$total$raw_alpha)        # Iteration alpha
## [1] 0.8298066
print(alpha_discovery$total$raw_alpha)   # Discovery alpha
## [1] 0.7973332

All three factors show acceptable Cronbach's alpha values (approximately 0.80-0.83) for these data.