Quiz Analysis

Overview:

Student quiz data was exported directly from Canvas.
Student rosters were used to assign section (treatment) to their quiz results.
There were 228 unique users for which we had quiz results
We split the quiz questions into “attention checks” and “viewpoint”

Attention checks are questions in which answers are directly from videos. Questions vary based on video presented.
Viewpoint questions are related to AI content, and are about their personal stance. All students in both treatment got the same viewpoint questions.

Any individuals who got a total score of “0” on the attention checks were removed from the analysis, as they are assumed to have not been participating in the exercise or paying attention to the content.

Creation of Cohort

library(dplyr)
library(ggplot2)
library(tidyr)
library(nlme)

s1=read.csv("Bio1010S1.csv")
s2=read.csv("Bio1010S2.csv")
s2 <- s2[, -4]

user.treatment <- bind_rows(list(s1 = s1, s2 = s2), .id = "source")
section_info <- unique(user.treatment)

quizzes=read.csv("Quiz_data_combined.csv") 
quizzes$Username=as.factor(quizzes$Username)
(unique(quizzes$Username))#228 unique users with quiz data 

#Create a "type" column that labels each question as either an "attention" question or a "viewpoint"
quizzes$Type <- ifelse(grepl("TRUE|FALSE", quizzes$Response.Text, ignore.case = TRUE), "attention", "viewpoint")

#Clean the user emails to be only the first three letters of their name and their 4 digit identifier
quizzes$User.Email <- substr(quizzes$User.Email, 1, 7)

#add in user data for each occurrence 
quizzes <- merge(quizzes, section_info[, c("SIS.Login.ID", "source")], by.x = "User.Email", by.y = "SIS.Login.ID", all.x = TRUE)


#Assess quiz responses for engagement using the "attention" check questions
###subset quiz data to only include the attention check questions 
att.check=subset(quizzes, Type == "attention")

#Create a "score" column, where if their response matches the correct response, we give them a score of 1, and otherwise they get a score of 0. 
att.check$score <- ifelse(att.check$Correct. == "TRUE", 1, 0)

# Sum total scores for each "User.Email" by "Topic"
total_scores <- att.check %>%
  group_by(User.Email, Topic) %>%
  summarize(total_score = sum(score, na.rm = TRUE)) %>%
  ungroup()

# Identify users and topics where the score is zero
zero_scores <- total_scores %>%
  filter(total_score == 0) 

# Remove these users' data for topics where they scored zero
att.check <- att.check %>%
  anti_join(zero_scores, by = c("User.Email", "Topic"))

# Calculate average score per section grouped by topic and section
avg_scores <- att.check %>%
  group_by(Topic, Section) %>%
  summarize(
    avg_score = mean(score, na.rm = TRUE),
    n = sum(!is.na(score)),
    sd_score = sd(score, na.rm = TRUE),
    se_score = sd_score / sqrt(n),
    .groups = "drop"
  )

This table prints the average score total and the standard error range for each of the topics by treatment.

table_pretty <- avg_scores %>%
  mutate(
    result = paste0(
      round(avg_score, 3),
      " ± ",
      round(se_score, 3)
    )
  ) %>%
  dplyr::select(Topic, Section, result) %>%
  tidyr::pivot_wider(
    names_from = Section,
    values_from = result
  )

table_pretty

## # A tibble: 6 × 3
##   Topic        Control       IA           
##   <chr>        <chr>         <chr>        
## 1 BioDiv       0.778 ± 0.033 0.877 ± 0.021
## 2 Digestion    1 ± 0         1 ± 0        
## 3 EnvInjustice 0.9 ± 0.025   0.981 ± 0.01 
## 4 MolBiology   0.95 ± 0.017  0.71 ± 0.028 
## 5 Respiration  0.796 ± 0.031 0.749 ± 0.027
## 6 Vaccines     0.825 ± 0.03  0.886 ± 0.02

# Create a ggplot object to visualize the data
ggplot(avg_scores, aes(x = Section, y = avg_score)) +
  geom_col(fill = "blue") +
  geom_errorbar(
    aes(ymin = avg_score - se_score, ymax = avg_score + se_score),
    width = 0.2
  ) +
  facet_wrap(~ Topic, scales = "free") +
  labs(
    title = "Average Attention Check Score per Section",
    x = "Section",
    y = "Average Score"
  ) +
  theme_minimal()

#include random variable of User.Email to account for repeated assessment
#output is for all topics combined
summary(lme(fixed = score ~ Section, random = ~ 1 | User.Email, data = att.check))

## Linear mixed-effects model fit by REML
##   Data: att.check 
##        AIC      BIC    logLik
##   1697.693 1720.543 -844.8465
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 0.0001214032 0.3519632
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error   DF  t-value p-value
## (Intercept)  0.8636884 0.01191216 2011 72.50480  0.0000
## SectionIA   -0.0138716 0.01525296  225 -0.90944  0.3641
##  Correlation: 
##           (Intr)
## SectionIA -0.781
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.4539174  0.3872892  0.4267008  0.4267012  0.4267021 
## 
## Number of Observations: 2238
## Number of Groups: 227

#assess each topic independently 
split_data <- split(att.check, att.check$Topic)

# Function to fit a linear mixed-effects model for each topic
fit_model <- function(df) {
  lme(fixed = score ~ Section, random = ~ 1 | User.Email, data = df)
}
# Apply the function to each subset of data and store the models in a list
models <- lapply(split_data, fit_model)
# To see the summary for each model
summaries <- lapply(models, summary)
# Optional: print summaries for review
summaries

## $BioDiv
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC     BIC    logLik
##   349.4332 365.399 -170.7166
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 9.492587e-06 0.3659177
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.7784810 0.02911085 201 26.741953   0.000
## SectionIA   0.0985682 0.03736569 199  2.637932   0.009
##  Correlation: 
##           (Intr)
## SectionIA -0.779
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3968481  0.3360067  0.3360067  0.6053792  0.6053792 
## 
## Number of Observations: 402
## Number of Groups: 201 
## 
## $Digestion
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -15503.43 -15489.83 7755.714
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)     Residual
## StdDev: 1.331307e-16 2.777743e-17
## 
## Fixed effects:  score ~ Section 
##             Value   Std.Error  DF       t-value p-value
## (Intercept)     1 1.44974e-17 221  6.897788e+16  0.0000
## SectionIA       0 1.86327e-17 221 -1.000000e+00  0.4207
##  Correlation: 
##           (Intr)
## SectionIA -0.778
## 
## Standardized Within-Group Residuals:
## Min  Q1 Med  Q3 Max 
##   0   0   0   0   0 
## 
## Number of Observations: 223
## Number of Groups: 223 
## 
## $EnvInjustice
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -51.04712 -35.67294 29.52356
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.484812e-05 0.2188422
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF  t-value p-value
## (Intercept) 0.9000000 0.01849554 173 48.66038   0e+00
## SectionIA   0.0806763 0.02394675 172  3.36899   9e-04
##  Correlation: 
##           (Intr)
## SectionIA -0.772
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -4.48120344  0.08829957  0.08829957  0.45695030  0.45695032 
## 
## Number of Observations: 347
## Number of Groups: 174 
## 
## $MolBiology
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   404.0105 420.1715 -198.0052
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.207599e-05 0.3828284
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.9500000 0.03026525 211 31.389139       0
## SectionIA   -0.2400763 0.03841048 209 -6.250281       0
##  Correlation: 
##           (Intr)
## SectionIA -0.788
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.4815293  0.1306068  0.1306068  0.7577189  0.7577189 
## 
## Number of Observations: 422
## Number of Groups: 211 
## 
## $Respiration
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   496.4418 512.6782 -244.2209
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.523438e-05 0.4228163
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.7964072 0.03271851 214 24.341184  0.0000
## SectionIA   -0.0473578 0.04183597 214 -1.131987  0.2589
##  Correlation: 
##           (Intr)
## SectionIA -0.782
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8835774  0.4815160  0.4815160  0.5935215  0.5935215 
## 
## Number of Observations: 430
## Number of Groups: 216 
## 
## $Vaccines
## Linear mixed-effects model fit by REML
##   Data: df 
##       AIC      BIC   logLik
##   308.808 324.8921 -150.404
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.685284e-05 0.3441218
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.8250000 0.02720522 207 30.325062  0.0000
## SectionIA   0.0608268 0.03473246 205  1.751295  0.0814
##  Correlation: 
##           (Intr)
## SectionIA -0.783
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.5741660  0.3317814  0.3317814  0.5085408  0.5085408 
## 
## Number of Observations: 414
## Number of Groups: 207

Assess viewpoint questions only

viewpoint=subset(quizzes, Type == "viewpoint")


#Remove all user info on a per topic basis if they scored a "0" on the attention check for that topic
viewpoint <- viewpoint %>%
  anti_join(zero_scores, by = c("User.Email", "Topic"))

viewpoint$Question.Text <- enc2utf8(as.character(viewpoint$Question.Text))
viewpoint$Question.Text <- iconv(viewpoint$Question.Text, to = "UTF-8", sub = "")
viewpoint$Question.Text <- trimws(viewpoint$Question.Text)
viewpoint$Question.Text <- gsub("\\s+", " ", viewpoint$Question.Text)

###add in a column called "score" with likert ranging from 1-5 (strongly disagree to strongly agree)
viewpoint$score <- case_when(
  grepl("Strongly Disagree", viewpoint$Response.Text, ignore.case = TRUE) ~ 1,
  grepl("Neither Agree nor Disagree", viewpoint$Response.Text, ignore.case = TRUE) ~ 3,
  grepl("Strongly Agree", viewpoint$Response.Text, ignore.case = TRUE) ~ 5,
  grepl("Disagree", viewpoint$Response.Text, ignore.case = TRUE) ~ 2,
  grepl("Agree", viewpoint$Response.Text, ignore.case = TRUE) ~ 4,
  TRUE ~ NA_real_
)

viewpoint <- viewpoint %>%
  mutate(
    Question.Text = case_when(
      Question.Text %in% c(
        "Marijuana is a useful medical tool that should be available only by PRESCRIPTION for treating various illnesses.",
        "Medical marijuana is a useful medical tool that should be available only by PRESCRIPTION for treating various illnesses."
      ) ~ "Medical marijuana is a useful medical tool that should be available only by PRESCRIPTION for treating various illnesses.",
      TRUE ~ Question.Text
    )
  )

#Calculate the average score per question per section

# Calculate the average score per question per section
average_scores <- viewpoint %>%
  group_by(Section, Question.Text, Topic) %>%
  summarize(avg_score = mean(score, na.rm = TRUE), .groups = "drop")

library(tidyr)

# Pivot to wide format so each Section has its own column
pivoted_scores <- average_scores %>%
  pivot_wider(names_from = Section, values_from = avg_score)

# Check column names
print(colnames(pivoted_scores))

# Calculate the difference between IA and Control
pivoted_scores <- pivoted_scores %>%
  mutate(score_difference = IA - Control)

# Optional: round numeric columns for easier reading
pivoted_scores <- pivoted_scores %>%
  mutate(
    IA = round(IA, 3),
    Control = round(Control, 3),
    score_difference = round(score_difference, 3)
  )

print(pivoted_scores)

pivoted_scores <- pivoted_scores %>%
  mutate(across(where(is.character), ~ iconv(.x, from = "", to = "UTF-8", sub = "")))

library(DT)

datatable(
  pivoted_scores,
  options = list(
    pageLength = 57,
    scrollX = TRUE,
    columnDefs = list(
      list(
        targets = 0:(ncol(pivoted_scores) - 1),
        className = "dt-left"
      )
    )
  ),
  rownames = FALSE
) %>%
  formatStyle(
    columns = names(pivoted_scores),
    whiteSpace = "normal"
  )

average_scores_topic <- viewpoint %>%
  group_by(Section, Topic) %>%
  summarize(
    avg_score = mean(score, na.rm = TRUE),
    sd_score = sd(score, na.rm = TRUE),
    n = sum(!is.na(score)),
    se_score = sd_score / sqrt(n),
    .groups = "drop"
  )

average_scores_topic

## # A tibble: 12 × 6
##    Section Topic        avg_score sd_score     n se_score
##    <chr>   <chr>            <dbl>    <dbl> <int>    <dbl>
##  1 Control BioDiv            3.80    1.09    547   0.0467
##  2 Control Digestion         3.42    1.29    207   0.0896
##  3 Control EnvInjustice      4.00    0.975   560   0.0412
##  4 Control MolBiology        3.55    1.07    480   0.0490
##  5 Control Respiration       4.05    1.07    328   0.0591
##  6 Control Vaccines          3.89    0.954   560   0.0403
##  7 IA      BioDiv            3.75    1.11    847   0.0382
##  8 IA      Digestion         3.40    1.26    330   0.0692
##  9 IA      EnvInjustice      4.03    0.894   824   0.0312
## 10 IA      MolBiology        3.51    1.03    778   0.0369
## 11 IA      Respiration       4.09    1.14    523   0.0498
## 12 IA      Vaccines          3.84    0.981   889   0.0329

plot_topic <- ggplot(average_scores_topic,
                     aes(x = Topic, y = avg_score, fill = Section)) +
  geom_col(position = position_dodge(width = 0.8)) +
  geom_errorbar(
    aes(ymin = avg_score - se_score,
        ymax = avg_score + se_score),
    position = position_dodge(width = 0.8),
    width = 0.2
  ) +
  labs(
    title = "Average Viewpoint Score by Topic and Section",
    x = "Topic",
    y = "Average Likert Score"
  ) +
  theme_minimal()

print(plot_topic)

#Create a df with an average Viewpoint score for each student by topic

user_topic_scores <- viewpoint %>%
  group_by(User.Email, Topic) %>%
  summarize(
    avg_score = mean(score, na.rm = TRUE),
    n_questions = sum(!is.na(score)),
    .groups = "drop"
  )

user_topic_scores

user_topic_scores_wide <- user_topic_scores %>%
  pivot_wider(
    names_from = Topic,
    values_from = avg_score
  )

user_topic_scores_wide

user_topic_scores_wide_collapsed <- user_topic_scores_wide %>%
  group_by(User.Email) %>%
  summarize(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  mutate(
    Average = rowMeans(across(-User.Email), na.rm = TRUE)
  )

Survey Analysis

library(lavaan)
surv=read.csv("survey.quant.final.csv")
#remove any empty rows 
surv <- surv[apply(surv, 1, function(x) !all(x == "" | is.na(x))), ]

#Remove rows with additional question text (non-data rows)
surv <- surv[-c(2, 3), ]

#only keep data rows who complete the survey and who have consented to participation
surv=subset(surv, Consent == "I Consent")
surv=subset(surv, Progress == "100")

#change construct likert scales to numeric
surv[, 88:146] <- lapply(surv[, 88:146], as.numeric)

Assign students to sections (IA vs Control)

#Q141 is the studentID in email form 
surv$Email <- tolower(substr(surv$Email, 1, 7))

surv <- merge(surv, section_info[, c("SIS.Login.ID", "source")], by.x = "Email", by.y = "SIS.Login.ID", all.x = TRUE)

Cognitive Engagement

# CE.1 through CE.8
# reverse #5-8
columns_to_reverse <- paste0("CE.", 5:8)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(CE.model <- '
CogEng =~ CE.1 + CE.2 + CE.3 + CE.4 + CE.5 + CE.6 + CE.7 + CE.8')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionCE <- cfa(CE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionCE, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionCE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to CE
(CEb.model <- '
CogEng =~ CE.1 + CE.5 + CE.6 + CE.7 + CE.8')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionCEb <- cfa(CEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionCEb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionCEb, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionCEb, "case.idx")
fscores <- lavPredict(solutionCEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(CogEng ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

## Linear mixed-effects model fit by REML
##   Data: surv 
##        AIC      BIC    logLik
##   223.1895 238.0179 -107.5947
## 
## Random effects:
##  Formula: ~1 | Email
##         (Intercept)  Residual
## StdDev:   0.1445638 0.3099086
## 
## Fixed effects:  CogEng ~ Timepoint 
##                    Value  Std.Error  DF   t-value p-value
## (Intercept)  -0.04031050 0.03029389 207 -1.330648  0.1848
## TimepointPre  0.06565277 0.03737536  94  1.756579  0.0822
##  Correlation: 
##              (Intr)
## TimepointPre -0.732
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.9509088 -0.4653659  0.1101215  0.6423174  2.0215912 
## 
## Number of Observations: 303
## Number of Groups: 208

Emotional Engagement

reverse 5-10

columns_to_reverse <- paste0("EE.", 5:10)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(EE.model <- '
EmEng =~ EE.1 + EE.2 + EE.3 + EE.4 + EE.5 + EE.6 + EE.7 + EE.8 + EE.9 + EE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionEE <- cfa(EE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionEE, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionEE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(EEb.model <- '
EmEng =~  EE.3 + EE.4 + EE.5 + EE.8 + EE.9 + EE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionEEb <- cfa(EEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionEEb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionEEb, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionEEb, "case.idx")
fscores <- lavPredict(solutionEEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(EmEng ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

Social Engagement

reverse 5-7

columns_to_reverse <- paste0("SE.", 5:7)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(SE.model <- '
SocEng =~ SE.1 + SE.2 + SE.3 + SE.4 + SE.5 + SE.6 + SE.7')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSE <- cfa(SE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSE, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(SEb.model <- '
SocEng =~ SE.1  + SE.3  + SE.5  + SE.7')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSEb <- cfa(SEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSEb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSEb, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSEb, "case.idx")
fscores <- lavPredict(solutionSEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

Adult Emotional Empathy

reverse #3

columns_to_reverse <- paste0("AEE.", 3)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(AEE.model <- '
AdEmp =~ AEE.1 + AEE.2 + AEE.3 + AEE.4 + AEE.5 + AEE.6 + AEE.7 + AEE.8 + AEE.9 + AEE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionAEE <- cfa(AEE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionAEE, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionAEE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(AEEb.model <- '
AdEmp =~ AEE.1 + AEE.2  + AEE.4  + AEE.6 + AEE.7  + AEE.9 ')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionAEEb <- cfa(AEEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionAEEb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionAEEb, "std", edge.label.cex =1, label.cex=1)


#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionAEEb, "case.idx")
fscores <- lavPredict(solutionAEEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

Interpersonal Reactivity Index

reverse 2, 3, 7, 8, 9,

columns_to_reverse <- paste0("IRI.", c(2,3,7,8,9))
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(IRI.model <- '
IntReact =~ IRI.1 + IRI.2 + IRI.3 + IRI.4 + IRI.5 + IRI.6 + IRI.7 + IRI.8 + IRI.9 + IRI.10 + IRI.11 + IRI.12 + IRI.13 + IRI.14')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionIRI <- cfa(IRI.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionIRI, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionIRI, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(IRIb.model <- '
IntReact =~   IRI.3 + IRI.4 + IRI.5  + IRI.7 + IRI.8 + IRI.9 + IRI.10 + IRI.11 + IRI.12 + IRI.13 + IRI.14')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionIRIb <- cfa(IRIb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionIRIb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionIRIb, "std", edge.label.cex =1, label.cex=1)


#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionIRIb, "case.idx")
fscores <- lavPredict(solutionIRIb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}


summary(lme(IntReact ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

Science Identity

#Create model that contains all questions related to grit
(SI.model <- '
SciId =~ SI.1 + SI.2 + SI.3 + SI.4 + SI.5')

## [1] "\nSciId =~ SI.1 + SI.2 + SI.3 + SI.4 + SI.5"

#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSI <- cfa(SI.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSI, fit.measures=T, standardized=TRUE)

## lavaan 0.6-21 ended normally after 25 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        10
## 
##                                                   Used       Total
##   Number of observations                           294         392
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                                38.708      30.035
##   Degrees of freedom                                 5           5
##   P-value (Chi-square)                           0.000       0.000
##   Scaling correction factor                                  1.289
##     Satorra-Bentler correction                                    
## 
## Model Test Baseline Model:
## 
##   Test statistic                               568.889     370.321
##   Degrees of freedom                                10          10
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.536
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.940       0.931
##   Tucker-Lewis Index (TLI)                       0.879       0.861
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.942
##   Robust Tucker-Lewis Index (TLI)                            0.883
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -2471.029   -2471.029
##   Loglikelihood unrestricted model (H1)      -2451.675   -2451.675
##                                                                   
##   Akaike (AIC)                                4962.057    4962.057
##   Bayesian (BIC)                              4998.893    4998.893
##   Sample-size adjusted Bayesian (SABIC)       4967.181    4967.181
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.151       0.131
##   90 Percent confidence interval - lower         0.109       0.093
##   90 Percent confidence interval - upper         0.198       0.172
##   P-value H_0: RMSEA <= 0.050                    0.000       0.000
##   P-value H_0: RMSEA >= 0.080                    0.997       0.985
##                                                                   
##   Robust RMSEA                                               0.148
##   90 Percent confidence interval - lower                     0.100
##   90 Percent confidence interval - upper                     0.201
##   P-value H_0: Robust RMSEA <= 0.050                         0.001
##   P-value H_0: Robust RMSEA >= 0.080                         0.988
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.045       0.045
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   SciId =~                                                              
##     SI.1              1.000                               1.253    0.791
##     SI.2              0.323    0.068    4.770    0.000    0.405    0.304
##     SI.3              0.973    0.071   13.716    0.000    1.219    0.814
##     SI.4              1.095    0.083   13.142    0.000    1.372    0.815
##     SI.5              0.893    0.082   10.942    0.000    1.119    0.651
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .SI.1              0.937    0.131    7.159    0.000    0.937    0.374
##    .SI.2              1.610    0.149   10.808    0.000    1.610    0.908
##    .SI.3              0.757    0.122    6.201    0.000    0.757    0.337
##    .SI.4              0.949    0.215    4.422    0.000    0.949    0.335
##    .SI.5              1.700    0.223    7.620    0.000    1.700    0.576
##     SciId             1.570    0.197    7.971    0.000    1.000    1.000

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSI, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSI, "case.idx")
fscores <- lavPredict(solutionSI, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(SciId ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

## Linear mixed-effects model fit by REML
##   Data: surv 
##        AIC      BIC    logLik
##   880.3411 895.0481 -436.1706
## 
## Random effects:
##  Formula: ~1 | Email
##         (Intercept)  Residual
## StdDev:   0.9961904 0.6346924
## 
## Fixed effects:  SciId ~ Timepoint 
##                   Value  Std.Error  DF   t-value p-value
## (Intercept)   0.2955271 0.09753661 202  3.029910  0.0028
## TimepointPre -0.3964644 0.08768428  90 -4.521499  0.0000
##  Correlation: 
##              (Intr)
## TimepointPre -0.573
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -1.94877302 -0.44828314 -0.08856004  0.44676570  2.50132821 
## 
## Number of Observations: 294
## Number of Groups: 203

Scientific Community Values

#Create model that contains all questions related to grit
(SCV.model <- '
SciComVal=~ SCE.1 + SCE.2 + SCE.3 + SCE.4 + SCE.5')

## [1] "\nSciComVal=~ SCE.1 + SCE.2 + SCE.3 + SCE.4 + SCE.5"

#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSCV <- cfa(SCV.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSCV, fit.measures=T, standardized=TRUE)

## lavaan 0.6-21 ended normally after 27 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        10
## 
##                                                   Used       Total
##   Number of observations                           340         392
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                                 5.143       3.842
##   Degrees of freedom                                 5           5
##   P-value (Chi-square)                           0.399       0.572
##   Scaling correction factor                                  1.339
##     Satorra-Bentler correction                                    
## 
## Model Test Baseline Model:
## 
##   Test statistic                               415.314     335.697
##   Degrees of freedom                                10          10
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.237
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000       1.000
##   Tucker-Lewis Index (TLI)                       0.999       1.007
##                                                                   
##   Robust Comparative Fit Index (CFI)                         1.000
##   Robust Tucker-Lewis Index (TLI)                            1.008
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -2225.301   -2225.301
##   Loglikelihood unrestricted model (H1)      -2222.729   -2222.729
##                                                                   
##   Akaike (AIC)                                4470.601    4470.601
##   Bayesian (BIC)                              4508.891    4508.891
##   Sample-size adjusted Bayesian (SABIC)       4477.169    4477.169
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.009       0.000
##   90 Percent confidence interval - lower         0.000       0.000
##   90 Percent confidence interval - upper         0.076       0.058
##   P-value H_0: RMSEA <= 0.050                    0.772       0.914
##   P-value H_0: RMSEA >= 0.080                    0.038       0.006
##                                                                   
##   Robust RMSEA                                               0.000
##   90 Percent confidence interval - lower                     0.000
##   90 Percent confidence interval - upper                     0.076
##   P-value H_0: Robust RMSEA <= 0.050                         0.823
##   P-value H_0: Robust RMSEA >= 0.080                         0.039
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.020       0.020
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   SciComVal =~                                                          
##     SCE.1             1.000                               0.463    0.511
##     SCE.2             1.716    0.198    8.683    0.000    0.794    0.766
##     SCE.3             1.422    0.144    9.906    0.000    0.658    0.746
##     SCE.4             1.283    0.179    7.150    0.000    0.594    0.491
##     SCE.5             1.490    0.187    7.987    0.000    0.690    0.656
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .SCE.1             0.606    0.045   13.588    0.000    0.606    0.739
##    .SCE.2             0.444    0.078    5.729    0.000    0.444    0.414
##    .SCE.3             0.346    0.060    5.755    0.000    0.346    0.444
##    .SCE.4             1.111    0.151    7.343    0.000    1.111    0.759
##    .SCE.5             0.631    0.080    7.854    0.000    0.631    0.570
##     SciComVal         0.214    0.045    4.768    0.000    1.000    1.000

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSCV, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSCV, "case.idx")
fscores <- lavPredict(solutionSCV, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(SciComVal ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

## Linear mixed-effects model fit by REML
##   Data: surv 
##        AIC      BIC    logLik
##   365.7951 381.0872 -178.8975
## 
## Random effects:
##  Formula: ~1 | Email
##         (Intercept)  Residual
## StdDev:   0.2658323 0.3209356
## 
## Fixed effects:  SciComVal ~ Timepoint 
##                    Value  Std.Error  DF   t-value p-value
## (Intercept)  -0.03692936 0.03460149 228 -1.067277  0.2870
## TimepointPre  0.05151883 0.03828727 110  1.345587  0.1812
##  Correlation: 
##              (Intr)
## TimepointPre -0.689
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.5597954 -0.4627069  0.0837014  0.5986544  1.6859895 
## 
## Number of Observations: 340
## Number of Groups: 229

#Calculate change in pre/post values for variables that passed CFA criteria

#Cognitive Engagement
CE.selected_columns <- c(1,147:149)
CogEng=surv[, CE.selected_columns]

CogEng <- CogEng %>%
  group_by(Email, Timepoint, source) %>%
  summarise(CogEng = mean(CogEng), .groups = 'drop')

  
CE.wide_data <- CogEng %>%
  pivot_wider(
    names_from = Timepoint,
    values_from = CogEng,
    names_prefix = "CogEng_"
  )

CE.wide_data <- CE.wide_data %>%
mutate(CogEng_diff = ifelse(is.na(CogEng_Pre) | is.na(CogEng_Post), NA, CogEng_Post - CogEng_Pre))

summary(lm(CogEng_diff ~ source, data = CE.wide_data, na.action = na.omit))

## 
## Call:
## lm(formula = CogEng_diff ~ source, data = CE.wide_data, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52267 -0.14393  0.05461  0.24922  1.22329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.07075    0.05821  -1.215    0.227
## sources2    -0.01114    0.09591  -0.116    0.908
## 
## Residual standard error: 0.4509 on 93 degrees of freedom
##   (155 observations deleted due to missingness)
## Multiple R-squared:  0.000145,   Adjusted R-squared:  -0.01061 
## F-statistic: 0.01348 on 1 and 93 DF,  p-value: 0.9078

###################################

ID.selected_columns <- c(1,147:148, 150)
SciID=surv[, ID.selected_columns] 

SciID <- SciID %>%
  group_by(Email, Timepoint, source) %>%
  summarise(SciId = mean(SciId), .groups = 'drop')

  
SI.wide_data <- SciID %>%
  pivot_wider(
    names_from = Timepoint,
    values_from = SciId,
    names_prefix = "SI_"
  )

SI.wide_data <- SI.wide_data %>%
mutate(SI_diff = ifelse(is.na(SI_Pre) | is.na(SI_Post), NA, SI_Post - SI_Pre))

summary(lm(SI_diff ~ source, data = SI.wide_data, na.action = na.omit))

## 
## Call:
## lm(formula = SI_diff ~ source, data = SI.wide_data, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.13889 -0.63852 -0.08524  0.49341  2.79219 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.4287     0.1127   3.803 0.000261 ***
## sources2     -0.1611     0.1963  -0.820 0.414200    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8804 on 89 degrees of freedom
##   (159 observations deleted due to missingness)
## Multiple R-squared:  0.007505,   Adjusted R-squared:  -0.003647 
## F-statistic: 0.673 on 1 and 89 DF,  p-value: 0.4142

#######################
SCV.selected_columns <- c(1,147:148, 151)
SCV=surv[, SCV.selected_columns] 

SCV <- SCV %>%
  group_by(Email, Timepoint, source) %>%
  summarise(SCV = mean(SciComVal), .groups = 'drop')

  
SCV.wide_data <- SCV %>%
  pivot_wider(
    names_from = Timepoint,
    values_from = SCV,
    names_prefix = "SCV_"
  )

SCV.wide_data <- SCV.wide_data %>%
mutate(SCV_diff = ifelse(is.na(SCV_Pre) | is.na(SCV_Post), NA, SCV_Post - SCV_Pre))

summary(lm(SCV_diff ~ source, data = SCV.wide_data, na.action = na.omit))

## 
## Call:
## lm(formula = SCV_diff ~ source, data = SCV.wide_data, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.25953 -0.27792  0.08473  0.30680  1.03728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08697    0.05394  -1.612    0.110
## sources2     0.12240    0.08729   1.402    0.164
## 
## Residual standard error: 0.4448 on 108 degrees of freedom
##   (140 observations deleted due to missingness)
## Multiple R-squared:  0.01788,    Adjusted R-squared:  0.008787 
## F-statistic: 1.966 on 1 and 108 DF,  p-value: 0.1637

#write.csv(surv, file = "quant_calc.csv", row.names = F)

for notes on how to produce and interpret semPlots, see : https://www.r-bloggers.com/2018/04/statistics-sunday-using-semplot/ https://cran.r-project.org/web/packages/semPlot/semPlot.pdf http://sachaepskamp.com/documentation/semPlot/semPaths.html

#CFI: Measures whether the model fits the data better than a more restricted baseline model. Higher is better, with okay fit > .9. Find CFI under User Model vs. Baseline model

#TLI (Tucker-Lewis index): Similar to CFI, but it penalizes overly complex models (making it more conservative than CFI). Measures whether the model fits the data better than a more restricted baseline model. Higher is better, with okay fit > .9.

#Find Test statistic, DOF, and P value under Model Test User Model (chi-squared goodness of fit test)

#RMSEA p value:The RMSEA P-value is the Probability that RMSEA <= .05. If that P-value is greater than 5% you can argue that the RMSEA value does not indicate a model rejection (the RMSEA value doesn’t reject the model if the RMSEA value is between 0 and 0.05). Usually this is useful when the RMSEA value is near the cutoff value of 0.05. If the RMSEA value is not near 0.05 one can typically ignore the confidence limit and the P-value and simply use the actual RMSEA value.

#Interpretation:

#RMSEA < 0.05: Indicates a close fit of the model to the data. #0.05 ≤ RMSEA < 0.08: Indicates a reasonable fit. #0.08 ≤ RMSEA < 0.10: Indicates a mediocre fit. #RMSEA ≥ 0.10: Indicates a poor fit.

#p-value > 0.05: The null hypothesis cannot be rejected, suggesting that the model provides a close fit to the data. #p-value ≤ 0.05: The null hypothesis is rejected, indicating that the model does not provide a close fit to the data.

#std.all are the standardized coefficients

See this publication and references within on reporting CFA results properly: https://www.researchgate.net/publication/24187223_Reporting_Practices_in_Confirmatory_Factor_Analysis_An_Overview_and_Some_Recommendations

#Does view of importance (IA.Importance) & IA.Comfort & IA.Grad.Imp change the outcome of constructs above within the IA group?

library(MASS)

post.only=filter(surv, Timepoint == "Post")
post.only$IA.Addressed <- factor(
  post.only$IA.Addressed,
  levels = c("Never", "About half the time", "Sometimes", "Most of the time", "Always"),
  ordered = TRUE
)

post.only$source <- factor(post.only$source)

model <- polr(IA.Addressed ~ source, data = post.only, Hess = TRUE, na.action = na.omit)
summary(model)

## Call:
## polr(formula = IA.Addressed ~ source, data = post.only, na.action = na.omit, 
##     Hess = TRUE)
## 
## Coefficients:
##           Value Std. Error t value
## sources2 0.2978     0.3149  0.9458
## 
## Intercepts:
##                               Value   Std. Error t value
## Never|About half the time     -1.3523  0.2346    -5.7646
## About half the time|Sometimes -0.5049  0.2046    -2.4676
## Sometimes|Most of the time     2.0816  0.2785     7.4750
## Most of the time|Always        4.4550  0.7248     6.1467
## 
## Residual Deviance: 383.2326 
## AIC: 393.2326 
## (2 observations deleted due to missingness)

table(post.only$IA.Addressed, post.only$source, useNA = "ifany")

##                      
##                       s1 s2 <NA>
##   Never               20  9    0
##   About half the time 15 10    0
##   Sometimes           50 31    1
##   Most of the time     9  8    0
##   Always               1  1    0
##   <NA>                 0  1    0

post.only$IA.Importance <- as.numeric(as.character(post.only$IA.Importance))
post.only$IA.Comfort <- as.numeric(as.character(post.only$IA.Comfort))

summary(lm(IA.Importance ~ source, data = post.only, na.action = na.omit))

## 
## Call:
## lm(formula = IA.Importance ~ source, data = post.only, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1607 -1.1607  0.0333  1.8393  3.0333 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.9667     0.2206  31.586   <2e-16 ***
## sources2      0.1940     0.3561   0.545    0.587    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.092 on 144 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.002058,   Adjusted R-squared:  -0.004873 
## F-statistic: 0.2969 on 1 and 144 DF,  p-value: 0.5867

summary(lm(IA.Comfort ~ source, data = post.only, na.action = na.omit))

## 
## Call:
## lm(formula = IA.Comfort ~ source, data = post.only, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2642 -2.0698 -0.0698  1.9302  3.9302 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.0698     0.2553   23.78   <2e-16 ***
## sources2      0.1944     0.4134    0.47    0.639    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.367 on 137 degrees of freedom
##   (17 observations deleted due to missingness)
## Multiple R-squared:  0.001611,   Adjusted R-squared:  -0.005677 
## F-statistic: 0.2211 on 1 and 137 DF,  p-value: 0.639

Qualitative Analysis

resp=read.csv("Qual_textresp.csv")
des=read.csv("Qual_Designations.csv")

des_clean <- des %>%
  filter(!if_all(starts_with("Designation."), ~ is.na(.)))

library(dplyr)
library(tidyr)
library(purrr)

des_long <- des_clean %>%
  pivot_longer(
    cols = starts_with("Designation."),
    names_to = "Topic",
    values_to = "Response"
  ) %>%
  filter(!is.na(Response)) %>%
  mutate(
    Section = factor(Section),
    Response = factor(Response),
    Topic = factor(Topic)
  )

# results <- des_long %>%
#   split(.$Topic) %>%
#   map_dfr(function(dat) {
# 
#     tab <- table(dat$Section, dat$Response)
# 
#     # remove empty rows/columns
#     tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
# 
#     # check table is still testable
#     if (nrow(tab) < 2 || ncol(tab) < 2) {
#       return(data.frame(
#         Topic = as.character(unique(dat$Topic)),
#         test = NA,
#         statistic = NA,
#         p_value = NA,
#         df = NA,
#         note = "Not enough variation"
#       ))
#     }
# 
#     chi <- suppressWarnings(chisq.test(tab))
# 
#     data.frame(
#       Topic = as.character(unique(dat$Topic)),
#       test = "Chi-square",
#       statistic = unname(chi$statistic),
#       p_value = chi$p.value,
#       df = unname(chi$parameter),
#       note = "OK"
#     )
#   })
# 
# results

results_fisher <- des_long %>%
  split(.$Topic) %>%
  map_dfr(function(dat) {

    tab <- table(dat$Section, dat$Response)
    tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]

    if (nrow(tab) < 2 || ncol(tab) < 2) {
      return(data.frame(
        Topic = as.character(unique(dat$Topic)),
        p_value = NA,
        note = "Not enough variation"
      ))
    }

    fish <- fisher.test(tab)

    data.frame(
      Topic = as.character(unique(dat$Topic)),
      p_value = fish$p.value,
      note = "Fisher exact test"
    )
  })

results_fisher

##             Topic      p_value              note
## 1  Designation.DB 0.1112633218 Fisher exact test
## 2 Designation.Env 0.4896338538 Fisher exact test
## 3  Designation.GE 0.2990936696 Fisher exact test
## 4 Designation.Mar 0.0002512388 Fisher exact test
## 5 Designation.Vac 0.1863084072 Fisher exact test
## 6 Designation.Vap 0.4164906739 Fisher exact test

des_long <- des_long %>%
  mutate(Response_ord = factor(Response,
    levels = c(
      "Negative",
      "Negative-Neutral",
      "Neutral/No opinion/Freedom",
      "Positive-Neutral",
      "Positive"
    ),
    ordered = TRUE
  ))

library(MASS)

model <- polr(Response_ord ~ Section + Topic, data = des_long, Hess = TRUE)
summary(model)

## Call:
## polr(formula = Response_ord ~ Section + Topic, data = des_long, 
##     Hess = TRUE)
## 
## Coefficients:
##                         Value Std. Error t value
## Section2              0.49686     0.1661  2.9915
## TopicDesignation.Env  0.03744     0.3008  0.1245
## TopicDesignation.GE   0.08432     0.2795  0.3017
## TopicDesignation.Mar  0.63632     0.2615  2.4336
## TopicDesignation.Vac  4.18553     0.3383 12.3705
## TopicDesignation.Vap -1.14137     0.3195 -3.5718
## 
## Intercepts:
##                                             Value   Std. Error t value
## Negative|Negative-Neutral                    0.7857  0.2142     3.6685
## Negative-Neutral|Neutral/No opinion/Freedom  2.0531  0.2334     8.7978
## Neutral/No opinion/Freedom|Positive-Neutral  2.0533  0.2334     8.7982
## Positive-Neutral|Positive                    4.0427  0.3084    13.1068
## 
## Residual Deviance: 1179.775 
## AIC: 1199.775 
## (1335 observations deleted due to missingness)

coef_table <- coef(summary(model))
p <- pnorm(abs(coef_table[, "t value"]), lower.tail = FALSE) * 2
cbind(coef_table, "p value" = p)

##                                                   Value Std. Error    t value
## Section2                                     0.49685980  0.1660929  2.9914577
## TopicDesignation.Env                         0.03744155  0.3007908  0.1244771
## TopicDesignation.GE                          0.08432139  0.2795291  0.3016551
## TopicDesignation.Mar                         0.63632010  0.2614763  2.4335670
## TopicDesignation.Vac                         4.18553063  0.3383491 12.3704509
## TopicDesignation.Vap                        -1.14137216  0.3195484 -3.5718284
## Negative|Negative-Neutral                    0.78566185  0.2141649  3.6684898
## Negative-Neutral|Neutral/No opinion/Freedom  2.05314450  0.2333707  8.7977833
## Neutral/No opinion/Freedom|Positive-Neutral  2.05327306  0.2333734  8.7982308
## Positive-Neutral|Positive                    4.04274339  0.3084459 13.1068162
##                                                  p value
## Section2                                    2.776490e-03
## TopicDesignation.Env                        9.009376e-01
## TopicDesignation.GE                         7.629150e-01
## TopicDesignation.Mar                        1.495087e-02
## TopicDesignation.Vac                        3.776951e-35
## TopicDesignation.Vap                        3.544976e-04
## Negative|Negative-Neutral                   2.439874e-04
## Negative-Neutral|Neutral/No opinion/Freedom 1.395452e-18
## Neutral/No opinion/Freedom|Positive-Neutral 1.389899e-18
## Positive-Neutral|Positive                   3.009701e-39

#why are there extra columns here?

des.table= des_long %>%
   split(.$Topic) %>%
   purrr::walk(~ print(table(.x$Section, .x$Response)))

##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       46                7                        12        1
##   2  0       24               13                         6        1
##    
##     Positive-Neutral
##   1               11
##   2                5
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       32               10                        26        1
##   2  0       17               11                        15        2
##    
##     Positive-Neutral
##   1                3
##   2                3
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       45                8                        12        1
##   2  0       24               12                         7        1
##    
##     Positive-Neutral
##   1               12
##   2                6
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       45               19                         7        2
##   2  0       13               17                         3        2
##    
##     Positive-Neutral
##   1                6
##   2               17
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0        0                4                        15       38
##   2  0        4                2                         8       25
##    
##     Positive-Neutral
##   1               22
##   2               13
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       61               12                         5        0
##   2  0       41                5                         6        1
##    
##     Positive-Neutral
##   1                1
##   2                0

When I converted the designation scale to an ordered score: The marijuana topic shows a substantial ideological framing difference between sections.

Students in Section 2 gave responses that were much less negative / more nuanced.

This suggests:

IA may be most influential when the topic is politically or culturally loaded.

Students do not automatically apply IA to all topics.

library(dplyr)

des_long <- des_long %>%
  mutate(Response_ord = factor(Response,
    levels = c(
      "Negative",
      "Negative-Neutral",
      "Neutral/No opinion/Freedom",
      "Positive-Neutral",
      "Positive"
    ),
    ordered = TRUE
  ),
  Response_num = as.numeric(Response_ord))

des_long <- des_long %>%
  mutate(
    Response = na_if(as.character(Response), "NA"),
    Section  = na_if(as.character(Section), "NA")
  ) %>%
  filter(!is.na(Response), !is.na(Section))

means <- des_long %>%
  group_by(Topic, Section) %>%
  summarise(
    mean_score = mean(Response_num, na.rm = TRUE),
    sd = sd(Response_num, na.rm = TRUE),
    n = n(),
    se = sd / sqrt(n),
    .groups = "drop"
  )

means

## # A tibble: 12 × 6
##    Topic           Section mean_score    sd     n     se
##    <fct>           <chr>        <dbl> <dbl> <int>  <dbl>
##  1 Designation.DB  1             1.68 1.20     77 0.137 
##  2 Designation.DB  2             1.74 1.09     49 0.156 
##  3 Designation.Env 1             1.5  0.960    72 0.113 
##  4 Designation.Env 2             1.85 1.20     48 0.174 
##  5 Designation.GE  1             1.73 1.22     78 0.138 
##  6 Designation.GE  2             1.79 1.15     50 0.162 
##  7 Designation.Mar 1             1.62 1.04     79 0.117 
##  8 Designation.Mar 2             2.55 1.32     52 0.184 
##  9 Designation.Vac 1             4.47 0.796    79 0.0896
## 10 Designation.Vac 2             4.20 1.25     52 0.173 
## 11 Designation.Vap 1             1.20 0.496    79 0.0558
## 12 Designation.Vap 2             1.19 0.647    53 0.0889

library(ggplot2)

ggplot(means, aes(x = Topic, y = mean_score, fill = Section)) +
  geom_col(position = position_dodge(width = 0.7)) +
  geom_errorbar(
    aes(ymin = mean_score - se, ymax = mean_score + se),
    width = .2,
    position = position_dodge(.7)
  ) +
  coord_flip() +
  labs(
    x = "Topic",
    y = "Average Ideological Framing Score"
  ) +
  theme_bw()

Across most topics (environmental justice, vaccines, GE, vaping, etc.), the average ideological framing scores were very similar between sections.

This suggests that students do not necessarily change their stance on socio-scientific issues simply because ideological awareness is introduced.

In other words:

AI instruction appears to influence how students reason, rather than what position they ultimately take.

The marijuana topic shows the clearest difference between sections.(1.62-2.55)

Students in the control section expressed more neutral or mixed framing, while the AI section tended to give more negative responses.

Possible explanation:

The AI section may have encouraged students to critically interrogate ideological narratives surrounding the issue, which may have led some students to articulate stronger evaluative positions rather than neutral ones.

Another possibility is that marijuana is a highly culturally framed issue, and the ideological discussion may have prompted students to engage more directly with those frames.

AI section shows more consistent framing

In several topics, the AI section responses cluster more tightly around specific categories (often negative or negative-neutral), while the control section spreads more across categories.

This suggests that AI instruction may have helped students:

recognize ideological framing more clearly

articulate positions more decisively

Rather than defaulting to neutral responses.

ggplot(des_long, aes(x = Section, y = Response_num, fill = Section)) +
  geom_violin(alpha = 0.4, trim = FALSE) +
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.6) +
  facet_wrap(~Topic) +
  theme_bw() +
  labs(
    y = "Ideological Framing Score",
    x = "Section",
    title = "Distribution of Ideological Framing Responses by Section"
  )

## Warning: Removed 122 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 122 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Control section uses neutral categories more often. This pattern is common when students lack an explicit analytic framework for interpreting socio-scientific issues.

Without ideological awareness instruction, students may:

hedge their answers

default to neutrality

frame responses as individual opinion rather than ideological analysis.

Looking at the qualitative responses:

AI section (Section 1)

Students more often reference:

systemic issues

ideological framing

social structures

policy implications

Responses tend to focus on how the issue is constructed or framed.

Control section (Section 2)

Students more often focus on:

individual harms

fairness

environmental consequences

personal opinion

Responses tend to focus on what the issue is, rather than how it is framed.

resp <- read.csv("Qual_textresp.csv")

library(tidytext)
library(dplyr)

words <- resp %>%
  unnest_tokens(word, Vid.EnvInj.Text)

library(stopwords)

words_clean <- words %>%
  filter(!word %in% stop_words$word)

word_counts <- words_clean %>%
  count(Section, word, sort = TRUE)

word_counts

##     Section               word  n
## 1         2      environmental 34
## 2         1      environmental 30
## 3         2             racism 30
## 4         1             racism 21
## 5         1          injustice 17
## 6         1             people 14
## 7         2        communities 14
## 8         1        communities 13
## 9         2          injustice 12
## 10        2             people 12
## 11        2             health 11
## 12        1         government 10
## 13        2      understanding 10
## 14        1              water  9
## 15        2              water  9
## 16        1          community  8
## 17        1              topic  8
## 18        1      understanding  8
## 19        2              clean  8
## 20        1        environment  7
## 21        1             health  7
## 22        2              issue  7
## 23        2       marginalized  7
## 24        2          resources  7
## 25        1              clean  6
## 26        1                due  6
## 27        1             issues  6
## 28        1               live  6
## 29        1                lot  6
## 30        2             access  6
## 31        2           affected  6
## 32        2        environment  6
## 33        1           affected  5
## 34        1             income  5
## 35        1              money  5
## 36        2          community  5
## 37        2        disparities  5
## 38        2              flint  5
## 39        2              waste  5
## 40        1             access  4
## 41        1                don  4
## 42        1           educated  4
## 43        1               feel  4
## 44        1              issue  4
## 45        1              lower  4
## 46        1       marginalized  4
## 47        1               poor  4
## 48        1          resources  4
## 49        1               time  4
## 50        2                air  4
## 51        2           concerns  4
## 52        2             crisis  4
## 53        2           exposure  4
## 54        2             issues  4
## 55        2               lack  4
## 56        2           michigan  4
## 57        2               race  4
## 58        1          attention  3
## 59        1               care  3
## 60        1               dont  3
## 61        1              flint  3
## 62        1            healthy  3
## 63        1             impact  3
## 64        1           informed  3
## 65        1           involved  3
## 66        1                low  3
## 67        1           michigan  3
## 68        1          pollution  3
## 69        1            poverty  3
## 70        1            protect  3
## 71        1               race  3
## 72        1              risks  3
## 73        1                sad  3
## 74        1            society  3
## 75        1              speak  3
## 76        1              world  3
## 77        2              color  3
## 78        2            concept  3
## 79        2               dont  3
## 80        2                due  3
## 81        2       environments  3
## 82        2              equal  3
## 83        2            hazards  3
## 84        2            healthy  3
## 85        2            leading  3
## 86        2         pollutants  3
## 87        2          pollution  3
## 88        2             social  3
## 89        2           systemic  3
## 90        1          advantage  2
## 91        1             afford  2
## 92        1                air  2
## 93        1             answer  2
## 94        1             cancer  2
## 95        1               city  2
## 96        1              class  2
## 97        1           concerns  2
## 98        1              cycle  2
## 99        1      disadvantaged  2
## 100       1          disasters  2
## 101       1         disturbing  2
## 102       1             fields  2
## 103       1              fight  2
## 104       1              found  2
## 105       1         generation  2
## 106       1             harder  2
## 107       1          hardships  2
## 108       1               hate  2
## 109       1            hazards  2
## 110       1                hit  2
## 111       1           homeless  2
## 112       1               huge  2
## 113       1       inequalities  2
## 114       1               land  2
## 115       1             looked  2
## 116       1              major  2
## 117       1              makes  2
## 118       1         minorities  2
## 119       1        necessarily  2
## 120       1             person  2
## 121       1         pollutants  2
## 122       1             poorly  2
## 123       1              rates  2
## 124       1      socioeconomic  2
## 125       1             status  2
## 126       1               stop  2
## 127       1            subject  2
## 128       1              super  2
## 129       1           systemic  2
## 130       1             topics  2
## 131       1              trash  2
## 132       1         understand  2
## 133       1              worse  2
## 134       2         addressing  2
## 135       2            adverse  2
## 136       2            biggest  2
## 137       2          burdening  2
## 138       2             cancer  2
## 139       2            country  2
## 140       2                don  2
## 141       2              earth  2
## 142       2           economic  2
## 143       2            effects  2
## 144       2             ensure  2
## 145       2          equitable  2
## 146       2       exacerbating  2
## 147       2           existing  2
## 148       2             family  2
## 149       2              green  2
## 150       2           honestly  2
## 151       2          impactful  2
## 152       2         importance  2
## 153       2          increased  2
## 154       2         industrial  2
## 155       2       inequalities  2
## 156       2            involve  2
## 157       2              leads  2
## 158       2               live  2
## 159       2                lot  2
## 160       2      neighborhoods  2
## 161       2       perpetuating  2
## 162       2               poor  2
## 163       2            poverty  2
## 164       2              races  2
## 165       2             racist  2
## 166       2              rates  2
## 167       2        respiratory  2
## 168       2              sites  2
## 169       2         situations  2
## 170       2            society  2
## 171       2             spaces  2
## 172       2             system  2
## 173       2           targeted  2
## 174       2              topic  2
## 175       2        underscores  2
## 176       2         understand  2
## 177       2           unfairly  2
## 178       2             united  2
## 179      NA             racism  2
## 180       1               2024  1
## 181       1                  3  1
## 182       1                 5x  1
## 183       1            ability  1
## 184       1          achieving  1
## 185       1         addressing  1
## 186       1           advocate  1
## 187       1             affect  1
## 188       1                age  1
## 189       1                ago  1
## 190       1       agricultural  1
## 191       1             amount  1
## 192       1            amounts  1
## 193       1            animals  1
## 194       1         apartments  1
## 195       1            arising  1
## 196       1             aspect  1
## 197       1              aware  1
## 198       1          awareness  1
## 199       1              awful  1
## 200       1                bad  1
## 201       1               bear  1
## 202       1            becasue  1
## 203       1                bit  1
## 204       1            brought  1
## 205       1              brunt  1
## 206       1              build  1
## 207       1               call  1
## 208       1             caring  1
## 209       1             caught  1
## 210       1             caused  1
## 211       1             change  1
## 212       1              cheap  1
## 213       1           chemical  1
## 214       1           children  1
## 215       1            choices  1
## 216       1             cities  1
## 217       1               clue  1
## 218       1            colored  1
## 219       1             coming  1
## 220       1          companies  1
## 221       1            company  1
## 222       1         conditions  1
## 223       1             condos  1
## 224       1         considered  1
## 225       1        contributes  1
## 226       1            country  1
## 227       1            created  1
## 228       1              crime  1
## 229       1            crucial  1
## 230       1            current  1
## 231       1             damage  1
## 232       1                day  1
## 233       1               deal  1
## 234       1              dealt  1
## 235       1             decent  1
## 236       1           decision  1
## 237       1           decrease  1
## 238       1            depends  1
## 239       1          difficult  1
## 240       1           directly  1
## 241       1     discrimination  1
## 242       1          discussed  1
## 243       1         discussing  1
## 244       1        disparities  1
## 245       1   disproportionate  1
## 246       1          diversity  1
## 247       1              doesn  1
## 248       1              drink  1
## 249       1           drinking  1
## 250       1               drop  1
## 251       1                eat  1
## 252       1           economic  1
## 253       1          economics  1
## 254       1          ecosystem  1
## 255       1          education  1
## 256       1           effected  1
## 257       1            effects  1
## 258       1          ehtically  1
## 259       1          eliminate  1
## 260       1             endure  1
## 261       1            enhance  1
## 262       1           ensuring  1
## 263       1             entire  1
## 264       1         enviroment  1
## 265       1       enviromental  1
## 266       1    environmentally  1
## 267       1     environtmental  1
## 268       1            equally  1
## 269       1          equitable  1
## 270       1        exacerbates  1
## 271       1              exact  1
## 272       1           existing  1
## 273       1         experience  1
## 274       1        experiences  1
## 275       1            exposed  1
## 276       1           exposure  1
## 277       1          extremely  1
## 278       1               eyes  1
## 279       1             facing  1
## 280       1           families  1
## 281       1              fault  1
## 282       1        financially  1
## 283       1                fix  1
## 284       1              fixes  1
## 285       1               food  1
## 286       1               form  1
## 287       1             formed  1
## 288       1             fought  1
## 289       1             freaks  1
## 290       1            garbage  1
## 291       1               goal  1
## 292       1                god  1
## 293       1            grocery  1
## 294       1           habitats  1
## 295       1            handled  1
## 296       1               hard  1
## 297       1            harmful  1
## 298       1            helping  1
## 299       1         highlights  1
## 300       1            history  1
## 301       1             honest  1
## 302       1           honestly  1
## 303       1           hopeless  1
## 304       1           horrible  1
## 305       1          hospitals  1
## 306       1              human  1
## 307       1               hurt  1
## 308       1               idea  1
## 309       1                ill  1
## 310       1          illnesses  1
## 311       1          impactful  1
## 312       1          impacting  1
## 313       1            impacts  1
## 314       1       impoverished  1
## 315       1           incident  1
## 316       1         insightful  1
## 317       1          instances  1
## 318       1         invalidate  1
## 319       1             invest  1
## 320       1            justice  1
## 321       1          knowledge  1
## 322       1      knowledgeable  1
## 323       1               lack  1
## 324       1           landfill  1
## 325       1          landfills  1
## 326       1            leading  1
## 327       1              learn  1
## 328       1            learned  1
## 329       1             levels  1
## 330       1             lights  1
## 331       1             linger  1
## 332       1              lives  1
## 333       1             living  1
## 334       1              local  1
## 335       1           location  1
## 336       1          louisiana  1
## 337       1           maintain  1
## 338       1         maintained  1
## 339       1            managed  1
## 340       1              means  1
## 341       1                 mi  1
## 342       1           minority  1
## 343       1               move  1
## 344       1           negative  1
## 345       1         negatively  1
## 346       1          officials  1
## 347       1           opinions  1
## 348       1        opportunity  1
## 349       1             passed  1
## 350       1            peoples  1
## 351       1             plants  1
## 352       1              plays  1
## 353       1           policies  1
## 354       1             poorer  1
## 355       1        predominant  1
## 356       1          prejudice  1
## 357       1           pressing  1
## 358       1          prevalent  1
## 359       1            prevent  1
## 360       1             prices  1
## 361       1           properly  1
## 362       1           provided  1
## 363       1             public  1
## 364       1              races  1
## 365       1           rational  1
## 366       1             reason  1
## 367       1             refers  1
## 368       1         regulation  1
## 369       1       reoccurrence  1
## 370       1          represent  1
## 371       1          residents  1
## 372       1        respiratory  1
## 373       1     responsibility  1
## 374       1        restaurants  1
## 375       1       restrictions  1
## 376       1             rising  1
## 377       1               risk  1
## 378       1              river  1
## 379       1              roads  1
## 380       1               role  1
## 381       1                run  1
## 382       1               save  1
## 383       1          scapegoat  1
## 384       1         segregated  1
## 385       1          sidewalks  1
## 386       1             simple  1
## 387       1          situation  1
## 388       1           solution  1
## 389       1           someting  1
## 390       1             sooner  1
## 391       1              spend  1
## 392       1             spread  1
## 393       1              stand  1
## 394       1         standpoint  1
## 395       1           status's  1
## 396       1              stems  1
## 397       1               step  1
## 398       1             stores  1
## 399       1            support  1
## 400       1          supported  1
## 401       1              takes  1
## 402       1           themself  1
## 403       1           therefor  1
## 404       1              times  1
## 405       1            today's  1
## 406       1              treat  1
## 407       1            treated  1
## 408       1             unfair  1
## 409       1             unique  1
## 410       1             united  1
## 411       1             unjust  1
## 412       1         vulnerable  1
## 413       1             weapon  1
## 414       1          willpower  1
## 415       1              wrong  1
## 416       1             zoning  1
## 417       1               <NA>  1
## 418       2         absolutely  1
## 419       2            address  1
## 420       2          addressed  1
## 421       2           adequate  1
## 422       2          advantage  1
## 423       2          affecting  1
## 424       2            affects  1
## 425       2           allowing  1
## 426       2             amount  1
## 427       2            amounts  1
## 428       2             animal  1
## 429       2             answer  1
## 430       2         apparently  1
## 431       2             aspect  1
## 432       2                bad  1
## 433       2              based  1
## 434       2             biases  1
## 435       2              birth  1
## 436       2              black  1
## 437       2              blind  1
## 438       2               born  1
## 439       2              broad  1
## 440       2             broken  1
## 441       2           building  1
## 442       2               care  1
## 443       2          causation  1
## 444       2             caused  1
## 445       2             change  1
## 446       2             choose  1
## 447       2               city  1
## 448       2              claim  1
## 449       2              class  1
## 450       2            climate  1
## 451       2       concentrated  1
## 452       2         constantly  1
## 453       2        contributes  1
## 454       2        correlation  1
## 455       2          countries  1
## 456       2              crazy  1
## 457       2            created  1
## 458       2            creates  1
## 459       2           credible  1
## 460       2             cuases  1
## 461       2            current  1
## 462       2             cycles  1
## 463       2              death  1
## 464       2             decade  1
## 465       2          depleting  1
## 466       2           deserves  1
## 467       2           destroys  1
## 468       2        destruction  1
## 469       2      developmental  1
## 470       2      disadvantaged  1
## 471       2           disagree  1
## 472       2      discriminated  1
## 473       2           disposal  1
## 474       2 disproportionately  1
## 475       2        distributed  1
## 476       2       distribution  1
## 477       2            diverse  1
## 478       2        documentary  1
## 479       2             dumped  1
## 480       2             easily  1
## 481       2         economical  1
## 482       2          ecosystem  1
## 483       2            educate  1
## 484       2           educated  1
## 485       2           effected  1
## 486       2          emissions  1
## 487       2       enviormental  1
## 488       2            equally  1
## 489       2            examine  1
## 490       2           examples  1
## 491       2            existed  1
## 492       2             exists  1
## 493       2         extinction  1
## 494       2            extreme  1
## 495       2                eye  1
## 496       2               eyes  1
## 497       2         facilities  1
## 498       2             facing  1
## 499       2            failure  1
## 500       2           failures  1
## 501       2           families  1
## 502       2               feel  1
## 503       2              field  1
## 504       2         frequently  1
## 505       2             gender  1
## 506       2             giving  1
## 507       2              god's  1
## 508       2         government  1
## 509       2            growing  1
## 510       2          happening  1
## 511       2               harm  1
## 512       2            harmful  1
## 513       2               hate  1
## 514       2          hazardous  1
## 515       2         healthcare  1
## 516       2          hierarchy  1
## 517       2             highly  1
## 518       2           horrible  1
## 519       2            housing  1
## 520       2          illnesses  1
## 521       2           imminent  1
## 522       2             impact  1
## 523       2            impacts  1
## 524       2           incident  1
## 525       2             income  1
## 526       2            incomes  1
## 527       2           industry  1
## 528       2           inequity  1
## 529       2        information  1
## 530       2     infrastructure  1
## 531       2        institution  1
## 532       2            judging  1
## 533       2            keeping  1
## 534       2              lakes  1
## 535       2              layer  1
## 536       2              leave  1
## 537       2               life  1
## 538       2             living  1
## 539       2            located  1
## 540       2                low  1
## 541       2               main  1
## 542       2         management  1
## 543       2             market  1
## 544       2             matter  1
## 545       2              means  1
## 546       2               meat  1
## 547       2            mention  1
## 548       2            minimal  1
## 549       2           minority  1
## 550       2           mistaken  1
## 551       2              money  1
## 552       2               move  1
## 553       2            natural  1
## 554       2         negatively  1
## 555       2            neglect  1
## 556       2           occupied  1
## 557       2          occurring  1
## 558       2          officials  1
## 559       2            opinion  1
## 560       2           outcomes  1
## 561       2           outwards  1
## 562       2              ozone  1
## 563       2            perfect  1
## 564       2         personally  1
## 565       2          personlly  1
## 566       2         phenomenon  1
## 567       2             phrase  1
## 568       2           physical  1
## 569       2           policies  1
## 570       2             policy  1
## 571       2           politics  1
## 572       2           pollutes  1
## 573       2          polluting  1
## 574       2              ponds  1
## 575       2             poorer  1
## 576       2         population  1
## 577       2      predominantly  1
## 578       2          prevalent  1
## 579       2        prioritized  1
## 580       2          privilege  1
## 581       2      progressively  1
## 582       2           properly  1
## 583       2               push  1
## 584       2          qualities  1
## 585       2            quicker  1
## 586       2               real  1
## 587       2           recently  1
## 588       2              refer  1
## 589       2            resolve  1
## 590       2         resource's  1
## 591       2            respond  1
## 592       2             result  1
## 593       2         ridiculous  1
## 594       2                sad  1
## 595       2             school  1
## 596       2           severely  1
## 597       2            shouldn  1
## 598       2        significant  1
## 599       2             single  1
## 600       2      socioeconomic  1
## 601       2             solved  1
## 602       2              speak  1
## 603       2            species  1
## 604       2             status  1
## 605       2             steady  1
## 606       2         stereotype  1
## 607       2           stewards  1
## 608       2            studied  1
## 609       2           superior  1
## 610       2        sustainable  1
## 611       2       systemically  1
## 612       2               tend  1
## 613       2               time  1
## 614       2            today's  1
## 615       2             toxins  1
## 616       2            treated  1
## 617       2          treatment  1
## 618       2              types  1
## 619       2             unable  1
## 620       2            unaware  1
## 621       2   underrepresented  1
## 622       2          understnd  1
## 623       2            unequal  1
## 624       2             unfair  1
## 625       2         unfamiliar  1
## 626       2            ungodly  1
## 627       2              views  1
## 628       2         vulnerable  1
## 629       2            watched  1
## 630       2              white  1
## 631       2           whomever  1
## 632       2          workplace  1
## 633       2              world  1
## 634       2             worldy  1
## 635       2           worrying  1
## 636       2              worse  1
## 637       2            worship  1
## 638       2              wrong  1
## 639      NA            aspects  1
## 640      NA             aspire  1
## 641      NA            current  1
## 642      NA            discuss  1
## 643      NA      environmental  1
## 644      NA           honestly  1
## 645      NA          impactful  1
## 646      NA          injustice  1
## 647      NA      understanding  1
## 648      NA                ups  1

top_words <- word_counts %>%
  group_by(Section) %>%
  slice_max(n, n = 20)

top_words

## # A tibble: 58 × 3
## # Groups:   Section [3]
##    Section word              n
##      <int> <chr>         <int>
##  1       1 environmental    30
##  2       1 racism           21
##  3       1 injustice        17
##  4       1 people           14
##  5       1 communities      13
##  6       1 government       10
##  7       1 water             9
##  8       1 community         8
##  9       1 topic             8
## 10       1 understanding     8
## # ℹ 48 more rows

ggplot(top_words, aes(x = reorder(word, n), y = n, fill = factor(Section))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Section, scales = "free") +
  coord_flip() +
  labs(
    x = "",
    y = "Word Frequency",
    title = "Most Common Words in Student Responses by Section"
  ) +
  theme_bw()

keywords <- c("system","policy","government","industry","media","bias","justice","freedom")

words_clean %>%
  filter(word %in% keywords) %>%
  count(Section, word)

##   Section       word  n
## 1       1 government 10
## 2       1    justice  1
## 3       2 government  1
## 4       2   industry  1
## 5       2     policy  1
## 6       2     system  2

ia_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies",
  "government","state",
  "industry","corporate","corporation",
  "power","control",
  "bias","biased",
  "media","narrative","framing",
  "justice","inequality","equity",
  "rights","freedom",
  "capitalism","market",
  "regulation","regulated"
)

library(dplyr)
library(tidyr)
library(tidytext)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  )
words <- text_long %>%
  unnest_tokens(word, Text)

data(stop_words)

words_clean <- words %>%
  anti_join(stop_words, by = "word")

ia_words <- words_clean %>%
  filter(word %in% ia_terms)

ia_counts <- ia_words %>%
  count(Section, Topic, word, sort = TRUE)

ia_counts

## # A tibble: 28 × 4
##    Section Topic           word           n
##      <int> <chr>           <chr>      <int>
##  1       1 Vid.Resp.Text   regulated     16
##  2       2 Vid.Resp.Text   regulated     12
##  3       1 Vid.EnvInj.Text government    10
##  4       2 Vid.EnvInj.Text systemic       3
##  5       2 Vid.Resp.Text   regulation     3
##  6       1 Vid.EnvInj.Text systemic       2
##  7       1 Vid.MolBio.Text control        2
##  8       1 Vid.Resp.Text   government     2
##  9       1 Vid.Resp.Text   regulation     2
## 10       2 Vid.EnvInj.Text system         2
## # ℹ 18 more rows

ia_summary <- ia_words %>%
  count(Section, Topic)

ia_summary

## # A tibble: 8 × 3
##   Section Topic                 n
##     <int> <chr>             <int>
## 1       1 Vid.EnvInj.Text      15
## 2       1 Vid.MolBio.Text       2
## 3       1 Vid.Resp.Text        22
## 4       1 Vid.Vaccines.Text     3
## 5       2 Vid.EnvInj.Text      10
## 6       2 Vid.MolBio.Text       2
## 7       2 Vid.Resp.Text        16
## 8       2 Vid.Vaccines.Text     3

library(ggplot2)

ggplot(ia_summary,
       aes(x = Topic, y = n, fill = factor(Section))) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Number of IA terms used",
    x = "Topic",
    fill = "Section"
  )

total_words <- words_clean %>%
  count(Section, Topic)

ia_rate <- ia_summary %>%
  left_join(total_words, by = c("Section","Topic"),
            suffix = c("_ia","_total")) %>%
  mutate(rate = n_ia / n_total)

ia_rate

## # A tibble: 8 × 5
##   Section Topic              n_ia n_total    rate
##     <int> <chr>             <int>   <int>   <dbl>
## 1       1 Vid.EnvInj.Text      15     620 0.0242 
## 2       1 Vid.MolBio.Text       2     534 0.00375
## 3       1 Vid.Resp.Text        22     546 0.0403 
## 4       1 Vid.Vaccines.Text     3     526 0.00570
## 5       2 Vid.EnvInj.Text      10     581 0.0172 
## 6       2 Vid.MolBio.Text       2     514 0.00389
## 7       2 Vid.Resp.Text        16     657 0.0244 
## 8       2 Vid.Vaccines.Text     3     529 0.00567

ia_rate %>%
  arrange(desc(rate))

## # A tibble: 8 × 5
##   Section Topic              n_ia n_total    rate
##     <int> <chr>             <int>   <int>   <dbl>
## 1       1 Vid.Resp.Text        22     546 0.0403 
## 2       2 Vid.Resp.Text        16     657 0.0244 
## 3       1 Vid.EnvInj.Text      15     620 0.0242 
## 4       2 Vid.EnvInj.Text      10     581 0.0172 
## 5       1 Vid.Vaccines.Text     3     526 0.00570
## 6       2 Vid.Vaccines.Text     3     529 0.00567
## 7       2 Vid.MolBio.Text       2     514 0.00389
## 8       1 Vid.MolBio.Text       2     534 0.00375

ggplot(ia_rate,
       aes(x = Topic, y = rate, fill = factor(Section))) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Rate of IA terms used",
    x = "Topic",
    fill = "Section"
  )

library(dplyr)
library(tidytext)
library(tidyr)
#install.packages("tidylo")
library(tidylo)


log_odds <- word_counts %>%
  bind_log_odds(set = Section, feature = word, n = n)

log_odds_plot <- log_odds %>%
  slice_max(abs(log_odds_weighted), n = 20) %>%
  mutate(
    more_common = ifelse(log_odds_weighted > 0, "AI", "Control"),
    word = reorder(word, log_odds_weighted)
  )

xmax <- max(abs(log_odds_plot$log_odds_weighted)) * 1.15

ggplot(log_odds_plot,
       aes(x = word, y = log_odds_weighted, fill = more_common)) +
  geom_col(width = 0.8) +
  coord_flip() +
  geom_hline(yintercept = 0, linewidth = 0.6) +
  annotate("text", x = 1, y = xmax * 0.85, label = "AI", hjust = 1, fontface = "bold") +
  annotate("text", x = 1, y = -xmax * 0.85, label = "Control", hjust = 0, fontface = "bold") +
  scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
  scale_y_continuous(limits = c(-xmax, xmax)) +
  labs(
    x = NULL,
    y = "Weighted log odds",
    fill = "More common in",
    title = "Words most characteristic of each section"
  ) +
  theme_bw() +
  theme(
    panel.grid.major.y = element_blank(),
    legend.position = "none"
  )

library(tidylo)

log_odds_topic <- words_clean %>%
  count(Topic, Section, word) %>%
  group_by(Topic) %>%
  filter(sum(n) >= 3) %>%
  bind_log_odds(set = Section, feature = word, n = n) %>%
  ungroup()

plot_topic <- log_odds_topic %>%
  group_by(Topic) %>%
  slice_max(abs(log_odds_weighted), n = 8) %>%
  mutate(more_common = ifelse(log_odds_weighted > 0, "AI", "Control"))

ggplot(plot_topic,
       aes(x = reorder(word, log_odds_weighted), y = log_odds_weighted, fill = more_common)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~Topic, scales = "free_y") +
  scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
  labs(
    x = NULL,
    y = "Weighted log odds",
    fill = "More common in",
    title = "Topic-specific words distinguishing AI and Control sections"
  ) +
  theme_bw()

The results suggest that explicit ideological awareness instruction influences students’ interpretive reasoning about socio-scientific issues, but does not consistently change the direction of their attitudes.

Students exposed to AI instruction appear more likely to:

articulate clearer positions

recognize ideological structures

analyze issues beyond individual harms.

Explicit instruction in ideological awareness does not necessarily change students’ positions on socio-scientific issues, but it appears to influence how students interpret and articulate those issues. Students in the AI section demonstrated more structured ideological reasoning, whereas students in the control section more frequently expressed neutral or opinion-based responses.

combine the qual codes with the quantitative data to look for differences.

Your data suggest:

Students can recognize ideological issues even without explicit instruction, but explicit IA instruction shifts how they reason about those issues.

Specifically, IA instruction appears to:

increase emotional engagement

strengthen science identity

encourage structural interpretation of socio-scientific issues

rather than simply increasing advocacy or opinion strength.

#focus in on long response more clearly

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","powerful","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment","pollution",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

library(dplyr)
library(tidytext)
library(tidyr)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  )

words <- text_long %>%
  unnest_tokens(word, Text)

data(stop_words)

words_clean <- words %>%
  anti_join(stop_words, by = "word")

structural_words <- words_clean %>%
  filter(word %in% structural_terms)

structural_counts <- structural_words %>%
  count(SortingID, Section, Topic)


total_words <- words_clean %>%
  count(SortingID, Section, Topic)

structural_rate <- structural_counts %>%
  left_join(total_words, by = c("SortingID","Section","Topic"),
            suffix = c("_struct","_total")) %>%
  mutate(rate = n_struct / n_total)

structural_summary <- structural_rate %>%
  filter(!is.na(Section)) %>%     # remove NA section rows
  group_by(Section, Topic) %>%
  summarise(
    mean_rate = mean(rate, na.rm = TRUE),
    sd = sd(rate, na.rm = TRUE),
    n = n(),
    se = sd / sqrt(n),            # <-- this creates the SE column
    .groups = "drop"
  )

structural_summary <- structural_summary %>%
  filter(!is.na(Section))

library(ggplot2)

ggplot(structural_summary,
       aes(x = Topic, y = mean_rate, fill = factor(Section))) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Structural reasoning rate",
    x = "Topic",
    fill = "Section",
    title = "Use of structural reasoning language in responses"
  )

lm(rate ~ Section + Topic, data = structural_rate)

## 
## Call:
## lm(formula = rate ~ Section + Topic, data = structural_rate)
## 
## Coefficients:
##            (Intercept)                 Section    TopicVid.MolBio.Text  
##               0.281361               -0.005751               -0.207970  
##     TopicVid.Resp.Text  TopicVid.Vaccines.Text  
##              -0.120820               -0.179134

high_structural <- structural_rate %>%
  arrange(desc(rate)) %>%
  slice_head(n = 10)

resp %>%
  filter(SortingID %in% high_structural$SortingID)

##    SortingID Section Timepoint   Consent              Email
## 1         15       1      Post I Consent kme0042@auburn.edu
## 2         19       2      Post I Consent emo0021@auburn.edu
## 3         28       1      Post I Consent rer0061@auburn.edu
## 4         39       2      Post I Consent Hno0009@auburn.edu
## 5         47       2      Post I Consent lrp0037@auburn.edu
## 6         48       2      Post I Consent mzs0263@auburn.edu
## 7         69       1      Post I Consent akd0082@auburn.edu
## 8         86       1      Post I Consent mzp0147@auburn.edu
## 9        104       2      Post I Consent mar0147@auburn.edu
## 10       119       1      Post I Consent meg0156@auburn.edu
##                                                                                                                             Learn.Science
## 1                                                                                                      So we can know how the world works
## 2                                                                                         To learn about how the world around them works.
## 3  Learning science is somethings that is important because it can help you learn more about how your body works and how the earth works.
## 4                                                                 To understand basic concepts of how the world and things around us work
## 5                                                                                                to understand our world and how it works
## 6                                                                                                      Because science is objective truth
## 7                                                                                                                                        
## 8                                                                                                                                        
## 9                 It is important for students to learn science because science greatly effects their own life and the world around them.
## 10                                                      It is important to learn biology so that people can have basic knowledge of life.
##                                                                                                                                                                                                                                 Benefits
## 1                                                                                                It would help people have a better understanding of the more political parts of the science field so they can  form their own opinions.
## 2                                                                                                                                                                          Make people more aware of what's going on in today's society.
## 3                                                                                                                                  Having different ideal that others might not of heard about which might open their eye to new ideals.
## 4                                                                                                                                                            Biology would be inclusive and appealing to all groups if this was included
## 5                                                                                                                                                                                                      to have a more unbiased education
## 6  With my biology class being so introductory, I'm not sure which materials would benefit from ideological awareness, but maybe making sure to emphasize the difference in sex and gender when teaching about the reproductive system. 
## 7                                                                                                                                                                          Students would know some things are not soley based on facts.
## 8                                                                                                                                                                                                                     to have awareness.
## 9                                                                                                        More ideologically aware resources would benefit biology content by helping people learn more about their own anatomy and life.
## 10                                                                                                                                                            \nI think this may be able to help students understand a few things better
##                                                                                                      Downsides
## 1                                                              You could offend people if you are not careful.
## 2                                                                      It can deteriorate from class material.
## 3   Some people not be open to learning new ideas, which might lead them to not evey listening to other ideas.
## 4                              I think some people would get annoyed with the fact that it has to be mentioned
## 5                                                                                 take time away from material
## 6                                                                                    I don't see any downsides
## 7                                                                        Some people would argue these topics.
## 8                                                                       may sway people\xd5s thought processes
## 9  One downside would be that many aspects of ideologies can lead to controversy and take away from the facts.
## 10                                                                           this could greatly offend people.
##                                                                                                                                                                                                                                      Vid.Resp.Text
## 1                                                                                                                                                                                                    I think that they should be highly regulated.
## 2                                                                                                                                                                                       Vaping and using marijuana is bad for the public's health.
## 3                                                                                                                                    Vaping and marijuana use is not good for the health of user, and the government needed to regulate them more.
## 4                                                                                                                                                           I think vaping should be banned but marijuana should be available for prescription use
## 5                                                                                                                                            I do not believe in the recreational use of vapes/marijuana because it has been proven harmful for us
## 6  I think vaping is horrible. I think that right now, marijuana use is dangerous in our state because it is not regulated because it is illegal. I think that there are safe ways to use marijuana, but I do not think that vaping is ever safe. 
## 7                                                                                                                                                                                                     I do not do it because it is bad for health.
## 8                                                                                                                                                                                         it\xd5s not good for you, it could harm those around you
## 9                                                                             Marijuana can be very beneficial to those suffering with chronic pain. However, Marijuana should be well regulated and only accessible to those with a prescription.
## 10                                                                                                                                                      I think that it has not been around long enough to know the harmful side effects of vaping
##                                                                                                                                                                                                                                      Vid.MolBio.Text
## 1                                                                                                I believe we should not allow this because it will create an even bigger divide in society because not everyone will have access to the technology.
## 2                                                                                                                                                                        I don't know enough information on human genome editing to give my opinion.
## 3                                                                                                                                                                                                          Designer babies can be bad for the world.
## 4                                                                                                                       I think that it is wrong and having a baby should be a natural process unless altering it to not have death related diseases
## 5                                                                                                                                                          I think the ethical and societal implications outweigh the other pros for genetic editing
## 6                                                                                                                                                                          I don't really know enough about this topic to make a thoughtful comment.
## 7                                                                                                                                                                                                                             Not sure what this is.
## 8                                                                                                                                                                                  i don\xd5t understand it enough to have a strong thought about it
## 9  I do not believe genetic editing is ethical if people are trying to make children look a certain way. This practice can be dangerous and is unnatural. Genetic editing should only be used when necessary to prevent a life threatening mutation.
## 10                                                                                                                                                                                                   I think that this is wrong based on my religion
##                                                                                                                                              Vid.Vaccines.Text
## 1  We should all try to get vaccinated but we shouldn\xd5t be forced to do anything we don\xd5t want to do. We also need to have correct information about it.
## 2                                                                                                                      Vaccines are good for society's health.
## 3                                                                           Misinformation has a really negative impact on whether or not people get vaccines.
## 4                                                                     I think vaccinations are important to the general public and stop disease from spreading
## 5                                                                                                       I do not know enough information on these vaccinations
## 6                                                                                I think that it is important to get vaccinated for the benefit of the public.
## 7                                                                                                   It can help people not spread diseases that they may have.
## 8                                                                       vaccines should be required, for not only the person\xd5s safety but those around them
## 9                          Vaccinations are soemtimes useful when preventing possible disease. Vaccines should be well studied and should not be experimental.
## 10                                                                                  I think that vaccines are good but they SHOULD NOT be forced on the public
##                                                                                             Vid.EnvInj.Text
## 1  I find the most concerning impact to be that these things are passed down from generation to generation.
## 2                                       I do not know enough about environmental racism to give my opinion.
## 3        Environmental racism and injustice is something that need to be worked on within the United State.
## 4                                             I don\xd5t know much about environmental racism and injustice
## 5                                                      I find environmental racism concerning and impactful
## 6                 I don't know much about environmental racism, but I do find the concept very concerning. 
## 7                                                      I am not aware of any environmental racism concerns.
## 8                                                         just take care of the environment and each other.
## 9                                                                 I am unfamiliar with environmental racism
## 10                                                              I have no clue what environmental racism is
##     X X.1 X.2 X.3 X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13 X.14 X.15
## 1  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 2  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 3  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 4  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 5  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 6  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 7  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 8  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 9  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 10 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA

summary(lm(rate ~ Section * Topic, data = structural_rate))

## 
## Call:
## lm(formula = rate ~ Section * Topic, data = structural_rate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24113 -0.08461 -0.02008  0.04872  0.40360 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.24153    0.04950   4.880 3.28e-06 ***
## Section                         0.02154    0.03210   0.671    0.503    
## TopicVid.MolBio.Text           -0.05913    0.17742  -0.333    0.740    
## TopicVid.Resp.Text             -0.03934    0.08607  -0.457    0.648    
## TopicVid.Vaccines.Text         -0.07817    0.13059  -0.599    0.551    
## Section:TopicVid.MolBio.Text   -0.09438    0.10549  -0.895    0.373    
## Section:TopicVid.Resp.Text     -0.05704    0.05731  -0.995    0.322    
## Section:TopicVid.Vaccines.Text -0.07044    0.08672  -0.812    0.418    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1376 on 121 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2532, Adjusted R-squared:   0.21 
## F-statistic: 5.862 on 7 and 121 DF,  p-value: 7.124e-06

structural_summary$Topic <- factor(
  structural_summary$Topic,
  levels = c(
    "Vid.EnvInj.Text",
    "Vid.Resp.Text",
    "Vid.Vaccines.Text",
    "Vid.MolBio.Text"
  )
)
ggplot(structural_summary,
       aes(x = Topic, y = mean_rate, color = factor(Section))) +
  geom_point(size = 4) +
  geom_errorbar(aes(ymin = mean_rate - se,
                    ymax = mean_rate + se),
                width = .1) +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Structural reasoning rate",
    x = "Topic",
    color = "Section"
  )

AI students appear more likely to:

connect science to systems

recognize social structures

interpret responsibility beyond individuals

library(dplyr)
library(tidyr)
library(tidytext)

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  ) %>%
  mutate(
    Section = na_if(as.character(Section), "NA"),
    Section = recode(Section, "1" = "AI", "2" = "Control")
  ) %>%
  filter(!is.na(Section), !is.na(Text), Text != "")


words_clean <- text_long %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word")

structural_by_response <- words_clean %>%
  group_by(SortingID, Section, Topic) %>%
  summarise(
    total_words = n(),
    structural_n = sum(word %in% structural_terms),
    structural_present = as.integer(structural_n > 0),
    structural_rate = structural_n / total_words,
    .groups = "drop"
  )

library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'lme4'

## The following object is masked from 'package:nlme':
## 
##     lmList

model_presence <- glmer(
  structural_present ~ Section + (1 | Topic) + (1 | SortingID),
  data = structural_by_response,
  family = binomial
)

summary(model_presence)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: structural_present ~ Section + (1 | Topic) + (1 | SortingID)
##    Data: structural_by_response
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     455.8     472.7    -223.9     447.8       500 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8812 -0.4372 -0.2473  0.3373  4.4931 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SortingID (Intercept) 1.040    1.020   
##  Topic     (Intercept) 2.209    1.486   
## Number of obs: 504, groups:  SortingID, 133; Topic, 4
## 
## Fixed effects:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -1.7854     0.7781  -2.295   0.0218 *
## SectionControl   0.3311     0.3119   1.061   0.2885  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## SectinCntrl -0.175

exp(fixef(model_presence))

##    (Intercept) SectionControl 
##      0.1677246      1.3924618

exp(confint(model_presence, parm = "beta_", method = "Wald"))

##                     2.5 %    97.5 %
## (Intercept)    0.03649859 0.7707571
## SectionControl 0.75555417 2.5662618

library(lme4)

model_rate <- lmer(
  structural_rate ~ Section + (1 | Topic) + (1 | SortingID),
  data = structural_by_response
)

summary(model_rate)

## Linear mixed model fit by REML ['lmerMod']
## Formula: structural_rate ~ Section + (1 | Topic) + (1 | SortingID)
##    Data: structural_by_response
## 
## REML criterion at convergence: -853.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7268 -0.3660 -0.0739  0.0050  4.9553 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.
##  SortingID (Intercept) 0.0001068 0.01033 
##  Topic     (Intercept) 0.0059033 0.07683 
##  Residual              0.0101001 0.10050 
## Number of obs: 504, groups:  SortingID, 133; Topic, 4
## 
## Fixed effects:
##                Estimate Std. Error t value
## (Intercept)    0.052119   0.038868   1.341
## SectionControl 0.008608   0.009320   0.924
## 
## Correlation of Fixed Effects:
##             (Intr)
## SectinCntrl -0.096

model_rate_topic <- lmer(
  structural_rate ~ Section * Topic + (1 | SortingID),
  data = structural_by_response
)

summary(model_rate_topic)

## Linear mixed model fit by REML ['lmerMod']
## Formula: structural_rate ~ Section * Topic + (1 | SortingID)
##    Data: structural_by_response
## 
## REML criterion at convergence: -838.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0270 -0.3505 -0.0757 -0.0184  5.1401 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.
##  SortingID (Intercept) 0.0001306 0.01143 
##  Residual              0.0099822 0.09991 
## Number of obs: 504, groups:  SortingID, 133
## 
## Fixed effects:
##                                       Estimate Std. Error t value
## (Intercept)                            0.14740    0.01202  12.264
## SectionControl                         0.05403    0.01896   2.849
## TopicVid.MolBio.Text                  -0.14300    0.01661  -8.610
## TopicVid.Resp.Text                    -0.10253    0.01645  -6.231
## TopicVid.Vaccines.Text                -0.13727    0.01641  -8.367
## SectionControl:TopicVid.MolBio.Text   -0.05513    0.02623  -2.101
## SectionControl:TopicVid.Resp.Text     -0.06425    0.02592  -2.479
## SectionControl:TopicVid.Vaccines.Text -0.05787    0.02596  -2.229
## 
## Correlation of Fixed Effects:
##             (Intr) SctnCn TV.MB. TV.R.T TV.V.T SC:TV.M SC:TV.R
## SectinCntrl -0.634                                            
## TpcVd.MlB.T -0.715  0.453                                     
## TpcVd.Rsp.T -0.722  0.458  0.522                              
## TpcVd.Vcc.T -0.724  0.459  0.524  0.529                       
## ScC:TV.MB.T  0.453 -0.714 -0.633 -0.331 -0.332                
## SctC:TV.R.T  0.458 -0.723 -0.332 -0.635 -0.336  0.523         
## SctC:TV.V.T  0.458 -0.722 -0.331 -0.334 -0.632  0.522   0.528

library(ggeffects)
library(ggplot2)



model_topic <- glmer(
  structural_present ~ Section * Topic + (1 | SortingID),
  data = structural_by_response,
  family = binomial
)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00267592 (tol = 0.002, component 1)

summary(model_topic)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: structural_present ~ Section * Topic + (1 | SortingID)
##    Data: structural_by_response
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     444.4     482.4    -213.2     426.4       495 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5620 -0.3672 -0.2174  0.2520  6.3521 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SortingID (Intercept) 1.406    1.186   
## Number of obs: 504, groups:  SortingID, 133
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                             0.3842     0.3127   1.229    0.219    
## SectionControl                          0.7216     0.5116   1.411    0.158    
## TopicVid.MolBio.Text                   -4.1764     0.7307  -5.715 1.09e-08 ***
## TopicVid.Resp.Text                     -1.6779     0.4263  -3.936 8.29e-05 ***
## TopicVid.Vaccines.Text                 -3.2420     0.5603  -5.786 7.19e-09 ***
## SectionControl:TopicVid.MolBio.Text     0.0754     0.9546   0.079    0.937    
## SectionControl:TopicVid.Resp.Text      -0.7639     0.6555  -1.165    0.244    
## SectionControl:TopicVid.Vaccines.Text  -0.6430     0.8137  -0.790    0.429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SctnCn TV.MB. TV.R.T TV.V.T SC:TV.M SC:TV.R
## SectinCntrl -0.593                                            
## TpcVd.MlB.T -0.389  0.171                                     
## TpcVd.Rsp.T -0.627  0.325  0.389                              
## TpcVd.Vcc.T -0.499  0.231  0.369  0.480                       
## ScC:TV.MB.T  0.265 -0.451 -0.646 -0.193 -0.151                
## SctC:TV.R.T  0.375 -0.656 -0.135 -0.548 -0.183  0.367         
## SctC:TV.V.T  0.306 -0.534 -0.117 -0.211 -0.538  0.309   0.434 
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00267592 (tol = 0.002, component 1)

VarCorr(model_presence)

##  Groups    Name        Std.Dev.
##  SortingID (Intercept) 1.0196  
##  Topic     (Intercept) 1.4861

student_structural <- structural_by_response %>%
  group_by(SortingID, Section) %>%
  summarise(
    mean_rate = mean(structural_rate, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(student_structural,
       aes(x = as.factor(Section), y = mean_rate)) +
  geom_violin(fill = "grey80") +
  geom_boxplot(width = .1) +
  geom_jitter(width = .1, alpha = .5) +
  theme_bw()

student_structural <- student_structural %>%
  mutate(level = case_when(
    mean_rate == 0 ~ "None",
    mean_rate < 0.05 ~ "Low",
    mean_rate < 0.10 ~ "Moderate",
    TRUE ~ "High"
  ))

student_structural <- student_structural %>%
  filter(!is.na(Section))

ggplot(student_structural,
       aes(x = level, fill = factor(Section))) +
  geom_bar(position = "dodge") +
  theme_bw() +
  labs(
    x = "Structural reasoning level",
    y = "Number of students",
    fill = "Section"
  )

student_structural$level <- factor(
  student_structural$level,
  levels = c("None","Low","Moderate","High")
)

ggplot(student_structural,
       aes(x = factor(Section), fill = level)) +
  geom_bar(position = "fill") +
  theme_bw() +
  labs(
    x = "Section",
    y = "Proportion of students",
    fill = "Structural reasoning level"
  )

table_levels <- table(student_structural$Section,
                      student_structural$level)

chisq.test(table_levels)

## 
##  Pearson's Chi-squared test
## 
## data:  table_levels
## X-squared = 1.695, df = 3, p-value = 0.638

This means many students never used structural reasoning language in any response.

That is actually a major finding:

Structural reasoning is not a default explanation style for most students.

This is very consistent with research showing students tend to default to proximate explanations (pollution, waste, sickness) rather than systemic explanations.

Test whether IA changes how students articulate structural reasoning

library(dplyr)
library(tidyr)
library(tidytext)

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  ) %>%
  mutate(
    Section = na_if(as.character(Section), "NA")
  ) %>%
  filter(!is.na(Section), !is.na(Text), Text != "")

words_clean <- text_long %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word")

structural_by_response <- words_clean %>%
  group_by(SortingID, Section, Topic) %>%
  summarise(
    total_words = n(),
    structural_n = sum(word %in% structural_terms),
    structural_present = structural_n > 0,
    structural_rate = structural_n / total_words,
    .groups = "drop"
  )

library(dplyr)
library(tidyr)
library(tidytext)

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  ) %>%
  mutate(
    Section = na_if(as.character(Section), "NA")
  ) %>%
  filter(!is.na(Section), !is.na(Text), Text != "")

words_clean <- text_long %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word")

structural_by_response <- words_clean %>%
  group_by(SortingID, Section, Topic) %>%
  summarise(
    total_words = n(),
    structural_n = sum(word %in% structural_terms),
    structural_present = structural_n > 0,
    structural_rate = structural_n / total_words,
    .groups = "drop"
  )

structural_responses <- structural_by_response %>%
  filter(structural_present)

structural_words_only <- words_clean %>%
  semi_join(structural_responses, by = c("SortingID", "Section", "Topic"))

library(tidylo)

word_counts_structural <- structural_words_only %>%
  count(Section, word) %>%
  group_by(word) %>%
  filter(sum(n) >= 3) %>%
  ungroup()

log_odds_structural <- word_counts_structural %>%
  bind_log_odds(set = Section, feature = word, n = n)

library(ggplot2)

plot_words <- log_odds_structural %>%
  slice_max(abs(log_odds_weighted), n = 20) %>%
  mutate(
    more_common = ifelse(log_odds_weighted > 0, "AI", "Control"),
    word = reorder(word, log_odds_weighted)
  )

ggplot(plot_words,
       aes(x = word, y = log_odds_weighted, fill = more_common)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
  theme_bw() +
  labs(
    x = NULL,
    y = "Weighted log odds",
    fill = "More common in",
    title = "How AI vs Control articulate structural reasoning"
  )

Check for reasoning archetypes

student_profiles <- structural_by_response %>%
  group_by(SortingID, Section) %>%
  summarise(
    n_topics = n(),
    n_structural = sum(structural_present),
    prop_structural = mean(structural_present),
    mean_rate = mean(structural_rate, na.rm = TRUE),
    max_rate = max(structural_rate, na.rm = TRUE),
    .groups = "drop"
  )
student_profiles <- student_profiles %>%
  mutate(archetype = case_when(
    n_structural == 0 ~ "None",
    prop_structural <= 0.25 ~ "Occasional",
    prop_structural <= 0.75 ~ "Mixed",
    prop_structural > 0.75 ~ "Consistent"
  ))
ggplot(student_profiles,
       aes(x = factor(Section), fill = archetype)) +
  geom_bar(position = "fill") +
  theme_bw() +
  labs(
    x = "Section",
    y = "Proportion of students",
    fill = "Reasoning archetype",
    title = "Student structural reasoning archetypes by section"
  )

archetype_tab <- table(student_profiles$Section, student_profiles$archetype)
chisq.test(archetype_tab)

## Warning in chisq.test(archetype_tab): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  archetype_tab
## X-squared = 2.1101, df = 3, p-value = 0.5499

archetype_tab

##    
##     Consistent Mixed None Occasional
##   1          1    18   30         31
##   2          0    17   17         19

ggplot(student_profiles, aes(x = factor(Section), y = prop_structural)) +
  geom_violin(fill = "grey85") +
  geom_boxplot(width = 0.15, outlier.shape = NA) +
  geom_jitter(width = 0.08, alpha = 0.6) +
  theme_bw() +
  labs(
    x = "Section",
    y = "Proportion of responses with structural reasoning",
    title = "Cross-topic consistency in structural reasoning"
  )

Madeline_Intro_bio

Abby Beatty

2024-04-25