#assign section information

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)
library(nlme)

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

s1=read.csv("Bio1010S1.csv")
s2=read.csv("Bio1010S2.csv")
s2 <- s2[, -4]

user.treatment <- bind_rows(list(s1 = s1, s2 = s2), .id = "source")
section_info <- unique(user.treatment)

Quiz Analysis

quizzes=read.csv("Quiz_data_combined.csv") #228 unique users with quiz data 

#Create a "type" column that labels each question as either an "attention" question or a "viewpoint"
quizzes$Type <- ifelse(grepl("TRUE|FALSE", quizzes$Response.Text, ignore.case = TRUE), "attention", "viewpoint")

quizzes$User.Email <- substr(quizzes$User.Email, 1, 7)

#add in user data for each occurrence 
quizzes <- merge(quizzes, section_info[, c("SIS.Login.ID", "source")], by.x = "User.Email", by.y = "SIS.Login.ID", all.x = TRUE)


#Assess quiz responses for engagement using the "attention" check questions
###subset quiz data to only include the attention check questions 
att.check=subset(quizzes, Type == "attention")

#Create a "score" column, where if their response matches the correct response, we give them a score of 1, and otherwise they get a score of 0. 
att.check$score <- ifelse(att.check$Correct. == "TRUE", 1, 0)

#get the average score for each section
att.check %>%
  group_by(Topic, Section) %>%
  summarize(average_score = mean(score))

## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.

## # A tibble: 12 × 3
## # Groups:   Topic [6]
##    Topic        Section average_score
##    <chr>        <chr>           <dbl>
##  1 BioDiv       Control         0.769
##  2 BioDiv       IA              0.863
##  3 Digestion    Control         1    
##  4 Digestion    IA              0.993
##  5 EnvInjustice Control         0.887
##  6 EnvInjustice IA              0.981
##  7 MolBiology   Control         0.927
##  8 MolBiology   IA              0.705
##  9 Respiration  Control         0.787
## 10 Respiration  IA              0.738
## 11 Vaccines     Control         0.815
## 12 Vaccines     IA              0.865

att.check %>%
  group_by(Section) %>%
  summarize(average_score = mean(score))

## # A tibble: 2 × 2
##   Section average_score
##   <chr>           <dbl>
## 1 Control         0.852
## 2 IA              0.839

# Calculate average score per section grouped by topic and section
avg_scores <- att.check %>%
  group_by(Topic, Section) %>%
  summarize(avg_score = mean(score, na.rm = TRUE))

## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.

# Create a ggplot object to visualize the data
plot <- ggplot(avg_scores, aes(x = Section, y = avg_score)) +
  geom_bar(stat = "identity", position = "dodge", fill = "blue") +
  facet_wrap(~ Topic, scales = "free") +
  labs(title = "Average Score per Section Faceted by Topic",
       x = "Section", y = "Average Score") +
  theme_minimal()

# Display the plot
print(plot)

summary(lme(fixed = score ~ Section, random = ~ 1 | User.Email, data = att.check))

## Linear mixed-effects model fit by REML
##   Data: att.check 
##      AIC      BIC    logLik
##   1852.5 1875.401 -922.2498
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)  Residual
## StdDev:  0.04760064 0.3595739
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error   DF  t-value p-value
## (Intercept)  0.8519613 0.01315351 2040 64.77062  0.0000
## SectionIA   -0.0127315 0.01684294  225 -0.75590  0.4505
##  Correlation: 
##           (Intr)
## SectionIA -0.781
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3950415  0.3748525  0.4157131  0.4565736  0.6311915 
## 
## Number of Observations: 2267
## Number of Groups: 227

split_data <- split(att.check, att.check$Topic)

# Function to fit a linear mixed-effects model for each topic
fit_model <- function(df) {
  lme(fixed = score ~ Section, random = ~ 1 | User.Email, data = df)
}
# Apply the function to each subset of data and store the models in a list
models <- lapply(split_data, fit_model)
# To see the summary for each model
summaries <- lapply(models, summary)
# Optional: print summaries for review
summaries

## $BioDiv
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   379.2015 395.2269 -185.6008
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept) Residual
## StdDev: 1.21428e-05 0.377255
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.7687500 0.02982463 204 25.775680  0.0000
## SectionIA   0.0941532 0.03825421 202  2.461251  0.0147
##  Correlation: 
##           (Intr)
## SectionIA -0.78 
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.2873209  0.3634061  0.3634061  0.6129806  0.6129806 
## 
## Number of Observations: 408
## Number of Groups: 204 
## 
## $Digestion
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC  logLik
##   -553.6341 -540.0234 280.817
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)   Residual
## StdDev:   0.0626107 0.02347944
## 
## Fixed effects:  score ~ Section 
##                  Value   Std.Error  DF   t-value p-value
## (Intercept)  1.0000000 0.007128196 222 140.28796  0.0000
## SectionIA   -0.0073529 0.009148169 222  -0.80376  0.4224
##  Correlation: 
##           (Intr)
## SectionIA -0.779
## 
## Standardized Within-Group Residuals:
##           Min            Q1           Med            Q3           Max 
## -5.212435e+00  3.782793e-14  3.861063e-02  3.861063e-02  3.861063e-02 
## 
## Number of Observations: 224
## Number of Groups: 224 
## 
## $EnvInjustice
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -21.44766 -6.050361 14.72383
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)  Residual
## StdDev:  0.02535476 0.2271112
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF  t-value p-value
## (Intercept) 0.8873239 0.01929483 174 45.98766   0e+00
## SectionIA   0.0933535 0.02505320 173  3.72621   3e-04
##  Correlation: 
##           (Intr)
## SectionIA -0.77 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -4.26657464  0.08301035  0.08301035  0.48406097  0.53760489 
## 
## Number of Observations: 349
## Number of Groups: 175 
## 
## $MolBiology
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   433.7067 449.9244 -212.8533
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept) Residual
## StdDev: 4.512674e-06 0.393839
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.9268293 0.03075366 214 30.137204       0
## SectionIA   -0.2222838 0.03915766 212 -5.676637       0
##  Correlation: 
##           (Intr)
## SectionIA -0.785
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3533203  0.1857884  0.1857884  0.7501912  0.7501912 
## 
## Number of Observations: 428
## Number of Groups: 214 
## 
## $Respiration
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   516.3417 532.6339 -254.1709
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.652571e-05 0.4292832
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.7869822 0.03302179 217 23.832212  0.0000
## SectionIA   -0.0491545 0.04219764 217 -1.164864  0.2454
##  Correlation: 
##           (Intr)
## SectionIA -0.783
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8332471  0.4962173  0.4962173  0.6107210  0.6107210 
## 
## Number of Observations: 436
## Number of Groups: 219 
## 
## $Vaccines
## Linear mixed-effects model fit by REML
##   Data: df 
##       AIC     BIC    logLik
##   354.685 370.846 -173.3425
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 8.920891e-06 0.3609937
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.8148148 0.02836234 211 28.728756  0.0000
## SectionIA   0.0505698 0.03613363 209  1.399522  0.1631
##  Correlation: 
##           (Intr)
## SectionIA -0.785
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3972292  0.3729023  0.3729023  0.5129873  0.5129873 
## 
## Number of Observations: 422
## Number of Groups: 211

#Remove data for each user by topic if they have an attention check of “0”.

# Sum total scores for each "User.Email" by "Topic"
total_scores <- att.check %>%
  group_by(User.Email, Topic) %>%
  summarize(total_score = sum(score, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'User.Email'. You can override using the
## `.groups` argument.

# Identify users and topics where the score is zero
zero_scores <- total_scores %>%
  filter(total_score == 0) 

# Remove these users' data for topics where they scored zero
att.check <- att.check %>%
  anti_join(zero_scores, by = c("User.Email", "Topic"))

# Calculate sum total scores for each "User.Email" by "topic" in the filtered data
#Check to make sure none of these are 0
sum_scores <- att.check %>%
  group_by(User.Email, Topic) %>%
  summarize(total_score = sum(score, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'User.Email'. You can override using the
## `.groups` argument.

#get the average score for each section
att.check %>%
  group_by(Topic, Section) %>%
  summarize(average_score = mean(score))

## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.

## # A tibble: 12 × 3
## # Groups:   Topic [6]
##    Topic        Section average_score
##    <chr>        <chr>           <dbl>
##  1 BioDiv       Control         0.778
##  2 BioDiv       IA              0.877
##  3 Digestion    Control         1    
##  4 Digestion    IA              1    
##  5 EnvInjustice Control         0.9  
##  6 EnvInjustice IA              0.981
##  7 MolBiology   Control         0.95 
##  8 MolBiology   IA              0.710
##  9 Respiration  Control         0.796
## 10 Respiration  IA              0.749
## 11 Vaccines     Control         0.825
## 12 Vaccines     IA              0.886

att.check %>%
  group_by(Section) %>%
  summarize(average_score = mean(score))

## # A tibble: 2 × 2
##   Section average_score
##   <chr>           <dbl>
## 1 Control         0.864
## 2 IA              0.850

library(nlme)
summary(lme(score ~ Section, random = ~ 1 | User.Email, data = att.check))

## Linear mixed-effects model fit by REML
##   Data: att.check 
##        AIC      BIC    logLik
##   1697.693 1720.543 -844.8465
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 0.0001214032 0.3519632
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error   DF  t-value p-value
## (Intercept)  0.8636884 0.01191216 2011 72.50480  0.0000
## SectionIA   -0.0138716 0.01525296  225 -0.90944  0.3641
##  Correlation: 
##           (Intr)
## SectionIA -0.781
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.4539174  0.3872892  0.4267008  0.4267012  0.4267021 
## 
## Number of Observations: 2238
## Number of Groups: 227

split_data <- split(att.check, att.check$Topic)

# Function to fit a linear mixed-effects model for each topic
fit_model <- function(df) {
  lme(fixed = score ~ Section, random = ~ 1 | User.Email, data = df)
}
# Apply the function to each subset of data and store the models in a list
models <- lapply(split_data, fit_model)
# To see the summary for each model
summaries <- lapply(models, summary)
# Optional: print summaries for review
summaries

## $BioDiv
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC     BIC    logLik
##   349.4332 365.399 -170.7166
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 9.492587e-06 0.3659177
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.7784810 0.02911085 201 26.741953   0.000
## SectionIA   0.0985682 0.03736569 199  2.637932   0.009
##  Correlation: 
##           (Intr)
## SectionIA -0.779
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3968481  0.3360067  0.3360067  0.6053792  0.6053792 
## 
## Number of Observations: 402
## Number of Groups: 201 
## 
## $Digestion
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -15503.43 -15489.83 7755.714
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)     Residual
## StdDev: 1.331307e-16 2.777743e-17
## 
## Fixed effects:  score ~ Section 
##             Value   Std.Error  DF       t-value p-value
## (Intercept)     1 1.44974e-17 221  6.897788e+16  0.0000
## SectionIA       0 1.86327e-17 221 -1.000000e+00  0.4207
##  Correlation: 
##           (Intr)
## SectionIA -0.778
## 
## Standardized Within-Group Residuals:
## Min  Q1 Med  Q3 Max 
##   0   0   0   0   0 
## 
## Number of Observations: 223
## Number of Groups: 223 
## 
## $EnvInjustice
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -51.04712 -35.67294 29.52356
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.484812e-05 0.2188422
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF  t-value p-value
## (Intercept) 0.9000000 0.01849554 173 48.66038   0e+00
## SectionIA   0.0806763 0.02394675 172  3.36899   9e-04
##  Correlation: 
##           (Intr)
## SectionIA -0.772
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -4.48120344  0.08829957  0.08829957  0.45695030  0.45695032 
## 
## Number of Observations: 347
## Number of Groups: 174 
## 
## $MolBiology
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   404.0105 420.1715 -198.0052
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.207599e-05 0.3828284
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.9500000 0.03026525 211 31.389139       0
## SectionIA   -0.2400763 0.03841048 209 -6.250281       0
##  Correlation: 
##           (Intr)
## SectionIA -0.788
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.4815293  0.1306068  0.1306068  0.7577189  0.7577189 
## 
## Number of Observations: 422
## Number of Groups: 211 
## 
## $Respiration
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   496.4418 512.6782 -244.2209
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.523438e-05 0.4228163
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.7964072 0.03271851 214 24.341184  0.0000
## SectionIA   -0.0473578 0.04183597 214 -1.131987  0.2589
##  Correlation: 
##           (Intr)
## SectionIA -0.782
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8835774  0.4815160  0.4815160  0.5935215  0.5935215 
## 
## Number of Observations: 430
## Number of Groups: 216 
## 
## $Vaccines
## Linear mixed-effects model fit by REML
##   Data: df 
##       AIC      BIC   logLik
##   308.808 324.8921 -150.404
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.685284e-05 0.3441218
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.8250000 0.02720522 207 30.325062  0.0000
## SectionIA   0.0608268 0.03473246 205  1.751295  0.0814
##  Correlation: 
##           (Intr)
## SectionIA -0.783
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.5741660  0.3317814  0.3317814  0.5085408  0.5085408 
## 
## Number of Observations: 414
## Number of Groups: 207

library(ggplot2)

# Calculate average score per section grouped by topic and section
avg_scores <- att.check %>%
  group_by(Topic, Section) %>%
  summarize(avg_score = mean(score, na.rm = TRUE))

## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.

# Create a ggplot object to visualize the data
plot <- ggplot(avg_scores, aes(x = Section, y = avg_score)) +
  geom_bar(stat = "identity", position = "dodge", fill = "blue") +
  facet_wrap(~ Topic, scales = "free") +
  labs(title = "Average Score per Section Faceted by Topic",
       x = "Section", y = "Average Score") +
  theme_minimal()

# Display the plot
print(plot)

#Assess viewpoint questions only

viewpoint=subset(quizzes, Type == "viewpoint")

#Remove all user info on a per topic basis if they scored a "0" on the attention check for that topic
viewpoint <- viewpoint %>%
  anti_join(zero_scores, by = c("User.Email", "Topic"))

###add in a column called "score" with likert ranging from 1-5 (strongly disagree to strongly agree)
viewpoint$score <- ifelse(grepl("Neither Agree nor Disagree", viewpoint$Response.Text, ignore.case = TRUE), 3,
                  ifelse(grepl("Agree", viewpoint$Response.Text, ignore.case = TRUE) & !grepl("Disagree", viewpoint$Response.Text, ignore.case = TRUE), 4,
                  ifelse(grepl("Strongly Agree", viewpoint$Response.Text, ignore.case = TRUE), 5,
                  ifelse(grepl("Strongly Disagree", viewpoint$Response.Text, ignore.case = TRUE), 1,
                  ifelse(grepl("Disagree", viewpoint$Response.Text, ignore.case = TRUE), 2, NA)))))

#Calculate the average score per question per section

average_scores <- viewpoint %>%
  group_by(Section, Question.Text, Topic) %>%
  summarize(avg_score = mean(score, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'Section', 'Question.Text'. You can
## override using the `.groups` argument.

#calculate the difference for each question between sections
library(tidyr)
# Assuming that `average_scores` is the data frame obtained from your previous calculations
# Ensure that there are exactly two Section subcategories per Question.Text
# Pivot to have each Section in separate columns
pivoted_scores <- average_scores %>%
  pivot_wider(names_from = Section, values_from = avg_score)

# Check the column names to confirm the two subcategories are present
print(colnames(pivoted_scores))

## [1] "Question.Text" "Topic"         "Control"       "IA"

# Assuming the two subcategories are, for example, 'Section1' and 'Section2'
# Calculate the difference between them for each Question.Text
pivoted_scores <- pivoted_scores %>%
  mutate(score_difference = IA - Control)

# View the results with differences
print(pivoted_scores)

## # A tibble: 47 × 5
##    Question.Text                            Topic Control    IA score_difference
##    <chr>                                    <chr>   <dbl> <dbl>            <dbl>
##  1 "Active advocacy for and support of pol… BioD…    3.81 NA            NA      
##  2 "Choosing to consume products that are … BioD…    3.91 NA            NA      
##  3 "Communities affected by environmental … EnvI…    3.81  3.87          0.0595 
##  4 "Dietary supplements, including vitamin… Dige…    3.29  3.75          0.465  
##  5 "Economic development justifies environ… EnvI…    2.97  3.03          0.0577 
##  6 "Editing the human genome is a risk to … MolB…    3.75  3.84          0.0885 
##  7 "Education about the food web and human… BioD…    3.86  3.85         -0.00773
##  8 "Environmental injustices are primarily… EnvI…    3.63  3.69          0.0607 
##  9 "Ethical considerations, such as equity… Vacc…    3.81  3.63         -0.183  
## 10 "Gene editing technology has already sa… MolB…    3.55 NA            NA      
## # ℹ 37 more rows

# library(DT)
# 
# # Create an interactive HTML table with DT
# datatable(
#   pivoted_scores,
#   options = list(
#     pageLength = 57,             # Adjust this to change the number of rows per page
#     scrollX = TRUE,              # Enables horizontal scrolling if necessary
#     columnDefs = list(
#       list(targets = 0:ncol(pivoted_scores), className = 'wrap')  # Apply text wrapping
#     )
#   )
# )

pivoted_scores$AbsValDiff <- abs(pivoted_scores$score_difference)
average_abs_val_diff <- aggregate(AbsValDiff ~ Topic, data = pivoted_scores, mean)

average_scores_topic <- viewpoint %>%
  group_by(Section, Topic) %>%
  summarize(avg_score = mean(score, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'Section'. You can override using the
## `.groups` argument.

#calculate the difference for each question between sections
library(tidyr)
# Assuming that `average_scores` is the data frame obtained from your previous calculations
# Ensure that there are exactly two Section subcategories per Question.Text
# Pivot to have each Section in separate columns
pivoted_scores <- average_scores %>%
  pivot_wider(names_from = Section, values_from = avg_score)

# Check the column names to confirm the two subcategories are present
print(colnames(pivoted_scores))

## [1] "Question.Text" "Topic"         "Control"       "IA"

# Assuming the two subcategories are, for example, 'Section1' and 'Section2'
# Calculate the difference between them for each Question.Text
pivoted_scores <- pivoted_scores %>%
  mutate(score_difference = IA - Control)

# View the results with differences
print(pivoted_scores)

## # A tibble: 47 × 5
##    Question.Text                            Topic Control    IA score_difference
##    <chr>                                    <chr>   <dbl> <dbl>            <dbl>
##  1 "Active advocacy for and support of pol… BioD…    3.81 NA            NA      
##  2 "Choosing to consume products that are … BioD…    3.91 NA            NA      
##  3 "Communities affected by environmental … EnvI…    3.81  3.87          0.0595 
##  4 "Dietary supplements, including vitamin… Dige…    3.29  3.75          0.465  
##  5 "Economic development justifies environ… EnvI…    2.97  3.03          0.0577 
##  6 "Editing the human genome is a risk to … MolB…    3.75  3.84          0.0885 
##  7 "Education about the food web and human… BioD…    3.86  3.85         -0.00773
##  8 "Environmental injustices are primarily… EnvI…    3.63  3.69          0.0607 
##  9 "Ethical considerations, such as equity… Vacc…    3.81  3.63         -0.183  
## 10 "Gene editing technology has already sa… MolB…    3.55 NA            NA      
## # ℹ 37 more rows

library(DT)
# 
# # Create an interactive HTML table with DT
# datatable(
#   pivoted_scores,
#   options = list(
#     pageLength = 57,             # Adjust this to change the number of rows per page
#     scrollX = TRUE,              # Enables horizontal scrolling if necessary
#     columnDefs = list(
#       list(targets = 0:ncol(pivoted_scores), className = 'wrap')  # Apply text wrapping
#     )
#   )
# )

Survey Analysis

library(lavaan)

## This is lavaan 0.6-21
## lavaan is FREE software! Please report any bugs.

surv=read.csv("survey.quant.final.csv")
#remove any empty rows 
surv <- surv[apply(surv, 1, function(x) !all(x == "" | is.na(x))), ]

#Remove rows with additional question text (non-data rows)
surv <- surv[-c(2, 3), ]

#only keep data rows who complete the survey and who have consented to participation
surv=subset(surv, Consent == "I Consent")
surv=subset(surv, Progress == "100")

#change construct likert scales to numeric
surv[, 88:146] <- lapply(surv[, 88:146], as.numeric)

Assign studnets to sections (IA vs Control)

#Q141 is the studentID in email form 
surv$Email <- tolower(substr(surv$Email, 1, 7))

surv <- merge(surv, section_info[, c("SIS.Login.ID", "source")], by.x = "Email", by.y = "SIS.Login.ID", all.x = TRUE)

Cognitive Engagement

# CE.1 through CE.8
# reverse #5-8
columns_to_reverse <- paste0("CE.", 5:8)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(CE.model <- '
CogEng =~ CE.1 + CE.2 + CE.3 + CE.4 + CE.5 + CE.6 + CE.7 + CE.8')

## [1] "\nCogEng =~ CE.1 + CE.2 + CE.3 + CE.4 + CE.5 + CE.6 + CE.7 + CE.8"

#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionCE <- cfa(CE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionCE, fit.measures=T, standardized=TRUE)

## lavaan 0.6-21 ended normally after 31 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        16
## 
##                                                   Used       Total
##   Number of observations                           292         392
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                               268.246     175.429
##   Degrees of freedom                                20          20
##   P-value (Chi-square)                           0.000       0.000
##   Scaling correction factor                                  1.529
##     Satorra-Bentler correction                                    
## 
## Model Test Baseline Model:
## 
##   Test statistic                               840.594     652.303
##   Degrees of freedom                                28          28
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.289
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.695       0.751
##   Tucker-Lewis Index (TLI)                       0.572       0.651
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.705
##   Robust Tucker-Lewis Index (TLI)                            0.586
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -3623.834   -3623.834
##   Loglikelihood unrestricted model (H1)      -3489.711   -3489.711
##                                                                   
##   Akaike (AIC)                                7279.669    7279.669
##   Bayesian (BIC)                              7338.497    7338.497
##   Sample-size adjusted Bayesian (SABIC)       7287.757    7287.757
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.206       0.163
##   90 Percent confidence interval - lower         0.185       0.145
##   90 Percent confidence interval - upper         0.228       0.181
##   P-value H_0: RMSEA <= 0.050                    0.000       0.000
##   P-value H_0: RMSEA >= 0.080                    1.000       1.000
##                                                                   
##   Robust RMSEA                                               0.202
##   90 Percent confidence interval - lower                     0.175
##   90 Percent confidence interval - upper                     0.230
##   P-value H_0: Robust RMSEA <= 0.050                         0.000
##   P-value H_0: Robust RMSEA >= 0.080                         1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.131       0.131
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   CogEng =~                                                             
##     CE.1              1.000                               0.777    0.627
##     CE.2              0.942    0.112    8.384    0.000    0.732    0.633
##     CE.3              0.863    0.102    8.448    0.000    0.671    0.711
##     CE.4              0.944    0.106    8.904    0.000    0.734    0.760
##     CE.5              0.999    0.145    6.870    0.000    0.776    0.493
##     CE.6              0.757    0.126    5.999    0.000    0.589    0.375
##     CE.7              1.138    0.126    9.014    0.000    0.884    0.597
##     CE.8              1.086    0.147    7.375    0.000    0.844    0.526
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .CE.1              0.934    0.151    6.189    0.000    0.934    0.607
##    .CE.2              0.801    0.095    8.427    0.000    0.801    0.599
##    .CE.3              0.440    0.058    7.556    0.000    0.440    0.494
##    .CE.4              0.393    0.059    6.695    0.000    0.393    0.422
##    .CE.5              1.881    0.164   11.440    0.000    1.881    0.757
##    .CE.6              2.120    0.202   10.506    0.000    2.120    0.860
##    .CE.7              1.414    0.181    7.833    0.000    1.414    0.644
##    .CE.8              1.858    0.170   10.963    0.000    1.858    0.723
##     CogEng            0.604    0.111    5.440    0.000    1.000    1.000

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionCE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(CEb.model <- '
CogEng =~ CE.1 + CE.5 + CE.6 + CE.7 + CE.8')

## [1] "\nCogEng =~ CE.1 + CE.5 + CE.6 + CE.7 + CE.8"

#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionCEb <- cfa(CEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionCEb, fit.measures=T, standardized=TRUE)

## lavaan 0.6-21 ended normally after 33 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        10
## 
##                                                   Used       Total
##   Number of observations                           303         392
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                                 4.797       4.300
##   Degrees of freedom                                 5           5
##   P-value (Chi-square)                           0.441       0.507
##   Scaling correction factor                                  1.116
##     Satorra-Bentler correction                                    
## 
## Model Test Baseline Model:
## 
##   Test statistic                               385.857     290.818
##   Degrees of freedom                                10          10
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.327
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000       1.000
##   Tucker-Lewis Index (TLI)                       1.001       1.005
##                                                                   
##   Robust Comparative Fit Index (CFI)                         1.000
##   Robust Tucker-Lewis Index (TLI)                            1.004
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -2556.977   -2556.977
##   Loglikelihood unrestricted model (H1)      -2554.578   -2554.578
##                                                                   
##   Akaike (AIC)                                5133.954    5133.954
##   Bayesian (BIC)                              5171.091    5171.091
##   Sample-size adjusted Bayesian (SABIC)       5139.377    5139.377
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000       0.000
##   90 Percent confidence interval - lower         0.000       0.000
##   90 Percent confidence interval - upper         0.078       0.071
##   P-value H_0: RMSEA <= 0.050                    0.774       0.834
##   P-value H_0: RMSEA >= 0.080                    0.044       0.025
##                                                                   
##   Robust RMSEA                                               0.000
##   90 Percent confidence interval - lower                     0.000
##   90 Percent confidence interval - upper                     0.078
##   P-value H_0: Robust RMSEA <= 0.050                         0.795
##   P-value H_0: Robust RMSEA >= 0.080                         0.044
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.019       0.019
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   CogEng =~                                                             
##     CE.1              1.000                               0.379    0.307
##     CE.5              2.743    0.698    3.928    0.000    1.038    0.663
##     CE.6              2.428    0.646    3.759    0.000    0.919    0.588
##     CE.7              3.127    0.801    3.905    0.000    1.184    0.798
##     CE.8              3.205    0.815    3.931    0.000    1.213    0.757
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .CE.1              1.382    0.149    9.296    0.000    1.382    0.906
##    .CE.5              1.377    0.150    9.199    0.000    1.377    0.561
##    .CE.6              1.597    0.212    7.545    0.000    1.597    0.654
##    .CE.7              0.801    0.137    5.847    0.000    0.801    0.364
##    .CE.8              1.098    0.152    7.221    0.000    1.098    0.427
##     CogEng            0.143    0.065    2.215    0.027    1.000    1.000

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionCEb, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionCEb, "case.idx")
fscores <- lavPredict(solutionCEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(CogEng ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

## Linear mixed-effects model fit by REML
##   Data: surv 
##        AIC      BIC    logLik
##   223.1895 238.0179 -107.5947
## 
## Random effects:
##  Formula: ~1 | Email
##         (Intercept)  Residual
## StdDev:   0.1445638 0.3099086
## 
## Fixed effects:  CogEng ~ Timepoint 
##                    Value  Std.Error  DF   t-value p-value
## (Intercept)  -0.04031050 0.03029389 207 -1.330648  0.1848
## TimepointPre  0.06565277 0.03737536  94  1.756579  0.0822
##  Correlation: 
##              (Intr)
## TimepointPre -0.732
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.9509088 -0.4653659  0.1101215  0.6423174  2.0215912 
## 
## Number of Observations: 303
## Number of Groups: 208

Emotional Engagement

reverse 5-10

columns_to_reverse <- paste0("EE.", 5:10)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(EE.model <- '
EmEng =~ EE.1 + EE.2 + EE.3 + EE.4 + EE.5 + EE.6 + EE.7 + EE.8 + EE.9 + EE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionEE <- cfa(EE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionEE, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionEE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(EEb.model <- '
EmEng =~  EE.3 + EE.4 + EE.5 + EE.8 + EE.9 + EE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionEEb <- cfa(EEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionEEb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionEEb, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionEEb, "case.idx")
fscores <- lavPredict(solutionEEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(EmEng ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

Social Engagement

reverse 5-7

columns_to_reverse <- paste0("SE.", 5:7)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(SE.model <- '
SocEng =~ SE.1 + SE.2 + SE.3 + SE.4 + SE.5 + SE.6 + SE.7')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSE <- cfa(SE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSE, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(SEb.model <- '
SocEng =~ SE.1  + SE.3  + SE.5  + SE.7')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSEb <- cfa(SEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSEb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSEb, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSEb, "case.idx")
fscores <- lavPredict(solutionSEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(SocEng ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

Adult Emotional Empathy

reverse #3

columns_to_reverse <- paste0("AEE.", 3)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(AEE.model <- '
AdEmp =~ AEE.1 + AEE.2 + AEE.3 + AEE.4 + AEE.5 + AEE.6 + AEE.7 + AEE.8 + AEE.9 + AEE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionAEE <- cfa(AEE.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionAEE, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionAEE, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(AEEb.model <- '
AdEmp =~ AEE.1 + AEE.2  + AEE.4  + AEE.6 + AEE.7  + AEE.9 ')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionAEEb <- cfa(AEEb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionAEEb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionAEEb, "std", edge.label.cex =1, label.cex=1)


#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionAEEb, "case.idx")
fscores <- lavPredict(solutionAEEb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(AdEmp ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

Interpersonal Reactivity Index

reverse 2, 3, 7, 8, 9,

columns_to_reverse <- paste0("IRI.", c(2,3,7,8,9))
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]

#Create model that contains all questions related to grit
(IRI.model <- '
IntReact =~ IRI.1 + IRI.2 + IRI.3 + IRI.4 + IRI.5 + IRI.6 + IRI.7 + IRI.8 + IRI.9 + IRI.10 + IRI.11 + IRI.12 + IRI.13 + IRI.14')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionIRI <- cfa(IRI.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionIRI, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionIRI, "std", edge.label.cex =1, label.cex=1)

#Create model that contains all questions related to grit
(IRIb.model <- '
IntReact =~   IRI.3 + IRI.4 + IRI.5  + IRI.7 + IRI.8 + IRI.9 + IRI.10 + IRI.11 + IRI.12 + IRI.13 + IRI.14')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionIRIb <- cfa(IRIb.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionIRIb, fit.measures=T, standardized=TRUE)

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionIRIb, "std", edge.label.cex =1, label.cex=1)


#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionIRIb, "case.idx")
fscores <- lavPredict(solutionIRIb, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(IntReact ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

Science Identity

#Create model that contains all questions related to grit
(SI.model <- '
SciId =~ SI.1 + SI.2 + SI.3 + SI.4 + SI.5')

## [1] "\nSciId =~ SI.1 + SI.2 + SI.3 + SI.4 + SI.5"

#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSI <- cfa(SI.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSI, fit.measures=T, standardized=TRUE)

## lavaan 0.6-21 ended normally after 25 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        10
## 
##                                                   Used       Total
##   Number of observations                           294         392
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                                38.708      30.035
##   Degrees of freedom                                 5           5
##   P-value (Chi-square)                           0.000       0.000
##   Scaling correction factor                                  1.289
##     Satorra-Bentler correction                                    
## 
## Model Test Baseline Model:
## 
##   Test statistic                               568.889     370.321
##   Degrees of freedom                                10          10
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.536
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.940       0.931
##   Tucker-Lewis Index (TLI)                       0.879       0.861
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.942
##   Robust Tucker-Lewis Index (TLI)                            0.883
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -2471.029   -2471.029
##   Loglikelihood unrestricted model (H1)      -2451.675   -2451.675
##                                                                   
##   Akaike (AIC)                                4962.057    4962.057
##   Bayesian (BIC)                              4998.893    4998.893
##   Sample-size adjusted Bayesian (SABIC)       4967.181    4967.181
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.151       0.131
##   90 Percent confidence interval - lower         0.109       0.093
##   90 Percent confidence interval - upper         0.198       0.172
##   P-value H_0: RMSEA <= 0.050                    0.000       0.000
##   P-value H_0: RMSEA >= 0.080                    0.997       0.985
##                                                                   
##   Robust RMSEA                                               0.148
##   90 Percent confidence interval - lower                     0.100
##   90 Percent confidence interval - upper                     0.201
##   P-value H_0: Robust RMSEA <= 0.050                         0.001
##   P-value H_0: Robust RMSEA >= 0.080                         0.988
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.045       0.045
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   SciId =~                                                              
##     SI.1              1.000                               1.253    0.791
##     SI.2              0.323    0.068    4.770    0.000    0.405    0.304
##     SI.3              0.973    0.071   13.716    0.000    1.219    0.814
##     SI.4              1.095    0.083   13.142    0.000    1.372    0.815
##     SI.5              0.893    0.082   10.942    0.000    1.119    0.651
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .SI.1              0.937    0.131    7.159    0.000    0.937    0.374
##    .SI.2              1.610    0.149   10.808    0.000    1.610    0.908
##    .SI.3              0.757    0.122    6.201    0.000    0.757    0.337
##    .SI.4              0.949    0.215    4.422    0.000    0.949    0.335
##    .SI.5              1.700    0.223    7.620    0.000    1.700    0.576
##     SciId             1.570    0.197    7.971    0.000    1.000    1.000

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSI, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSI, "case.idx")
fscores <- lavPredict(solutionSI, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(SciId ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

## Linear mixed-effects model fit by REML
##   Data: surv 
##        AIC      BIC    logLik
##   880.3411 895.0481 -436.1706
## 
## Random effects:
##  Formula: ~1 | Email
##         (Intercept)  Residual
## StdDev:   0.9961904 0.6346924
## 
## Fixed effects:  SciId ~ Timepoint 
##                   Value  Std.Error  DF   t-value p-value
## (Intercept)   0.2955271 0.09753661 202  3.029910  0.0028
## TimepointPre -0.3964644 0.08768428  90 -4.521499  0.0000
##  Correlation: 
##              (Intr)
## TimepointPre -0.573
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -1.94877302 -0.44828314 -0.08856004  0.44676570  2.50132821 
## 
## Number of Observations: 294
## Number of Groups: 203

Scientific Community Values

#Create model that contains all questions related to grit
(SCV.model <- '
SciComVal=~ SCE.1 + SCE.2 + SCE.3 + SCE.4 + SCE.5')

## [1] "\nSciComVal=~ SCE.1 + SCE.2 + SCE.3 + SCE.4 + SCE.5"

#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSCV <- cfa(SCV.model, data=surv, estimator= "MLM")

#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSCV, fit.measures=T, standardized=TRUE)

## lavaan 0.6-21 ended normally after 27 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        10
## 
##                                                   Used       Total
##   Number of observations                           340         392
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                                 5.143       3.842
##   Degrees of freedom                                 5           5
##   P-value (Chi-square)                           0.399       0.572
##   Scaling correction factor                                  1.339
##     Satorra-Bentler correction                                    
## 
## Model Test Baseline Model:
## 
##   Test statistic                               415.314     335.697
##   Degrees of freedom                                10          10
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.237
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000       1.000
##   Tucker-Lewis Index (TLI)                       0.999       1.007
##                                                                   
##   Robust Comparative Fit Index (CFI)                         1.000
##   Robust Tucker-Lewis Index (TLI)                            1.008
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -2225.301   -2225.301
##   Loglikelihood unrestricted model (H1)      -2222.729   -2222.729
##                                                                   
##   Akaike (AIC)                                4470.601    4470.601
##   Bayesian (BIC)                              4508.891    4508.891
##   Sample-size adjusted Bayesian (SABIC)       4477.169    4477.169
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.009       0.000
##   90 Percent confidence interval - lower         0.000       0.000
##   90 Percent confidence interval - upper         0.076       0.058
##   P-value H_0: RMSEA <= 0.050                    0.772       0.914
##   P-value H_0: RMSEA >= 0.080                    0.038       0.006
##                                                                   
##   Robust RMSEA                                               0.000
##   90 Percent confidence interval - lower                     0.000
##   90 Percent confidence interval - upper                     0.076
##   P-value H_0: Robust RMSEA <= 0.050                         0.823
##   P-value H_0: Robust RMSEA >= 0.080                         0.039
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.020       0.020
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   SciComVal =~                                                          
##     SCE.1             1.000                               0.463    0.511
##     SCE.2             1.716    0.198    8.683    0.000    0.794    0.766
##     SCE.3             1.422    0.144    9.906    0.000    0.658    0.746
##     SCE.4             1.283    0.179    7.150    0.000    0.594    0.491
##     SCE.5             1.490    0.187    7.987    0.000    0.690    0.656
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .SCE.1             0.606    0.045   13.588    0.000    0.606    0.739
##    .SCE.2             0.444    0.078    5.729    0.000    0.444    0.414
##    .SCE.3             0.346    0.060    5.755    0.000    0.346    0.444
##    .SCE.4             1.111    0.151    7.343    0.000    1.111    0.759
##    .SCE.5             0.631    0.080    7.854    0.000    0.631    0.570
##     SciComVal         0.214    0.045    4.768    0.000    1.000    1.000

#Produce SEM plot showing all estimates and standard deviations using the standardized values. 
  #label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSCV, "std", edge.label.cex =1, label.cex=1)

#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSCV, "case.idx")
fscores <- lavPredict(solutionSCV, type="lv")

## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
  surv[idx, fs] <- fscores[ , fs]
}

summary(lme(SciComVal ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))

## Linear mixed-effects model fit by REML
##   Data: surv 
##        AIC      BIC    logLik
##   365.7951 381.0872 -178.8975
## 
## Random effects:
##  Formula: ~1 | Email
##         (Intercept)  Residual
## StdDev:   0.2658323 0.3209356
## 
## Fixed effects:  SciComVal ~ Timepoint 
##                    Value  Std.Error  DF   t-value p-value
## (Intercept)  -0.03692936 0.03460149 228 -1.067277  0.2870
## TimepointPre  0.05151883 0.03828727 110  1.345587  0.1812
##  Correlation: 
##              (Intr)
## TimepointPre -0.689
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.5597954 -0.4627069  0.0837014  0.5986544  1.6859895 
## 
## Number of Observations: 340
## Number of Groups: 229

#Calculate change in pre/post values for variables that passed CFA criteria

#Cognitive Engagement
CE.selected_columns <- c(1,147:149)
CogEng=surv[, CE.selected_columns]

CogEng <- CogEng %>%
  group_by(Email, Timepoint, source) %>%
  summarise(CogEng = mean(CogEng), .groups = 'drop')

  
CE.wide_data <- CogEng %>%
  pivot_wider(
    names_from = Timepoint,
    values_from = CogEng,
    names_prefix = "CogEng_"
  )

CE.wide_data <- CE.wide_data %>%
mutate(CogEng_diff = ifelse(is.na(CogEng_Pre) | is.na(CogEng_Post), NA, CogEng_Post - CogEng_Pre))

summary(lm(CogEng_diff ~ source, data = CE.wide_data, na.action = na.omit))

## 
## Call:
## lm(formula = CogEng_diff ~ source, data = CE.wide_data, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52267 -0.14393  0.05461  0.24922  1.22329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.07075    0.05821  -1.215    0.227
## sources2    -0.01114    0.09591  -0.116    0.908
## 
## Residual standard error: 0.4509 on 93 degrees of freedom
##   (155 observations deleted due to missingness)
## Multiple R-squared:  0.000145,   Adjusted R-squared:  -0.01061 
## F-statistic: 0.01348 on 1 and 93 DF,  p-value: 0.9078

###################################

ID.selected_columns <- c(1,147:148, 150)
SciID=surv[, ID.selected_columns] 

SciID <- SciID %>%
  group_by(Email, Timepoint, source) %>%
  summarise(SciId = mean(SciId), .groups = 'drop')

  
SI.wide_data <- SciID %>%
  pivot_wider(
    names_from = Timepoint,
    values_from = SciId,
    names_prefix = "SI_"
  )

SI.wide_data <- SI.wide_data %>%
mutate(SI_diff = ifelse(is.na(SI_Pre) | is.na(SI_Post), NA, SI_Post - SI_Pre))

summary(lm(SI_diff ~ source, data = SI.wide_data, na.action = na.omit))

## 
## Call:
## lm(formula = SI_diff ~ source, data = SI.wide_data, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.13889 -0.63852 -0.08524  0.49341  2.79219 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.4287     0.1127   3.803 0.000261 ***
## sources2     -0.1611     0.1963  -0.820 0.414200    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8804 on 89 degrees of freedom
##   (159 observations deleted due to missingness)
## Multiple R-squared:  0.007505,   Adjusted R-squared:  -0.003647 
## F-statistic: 0.673 on 1 and 89 DF,  p-value: 0.4142

#######################
SCV.selected_columns <- c(1,147:148, 151)
SCV=surv[, SCV.selected_columns] 

SCV <- SCV %>%
  group_by(Email, Timepoint, source) %>%
  summarise(SCV = mean(SciComVal), .groups = 'drop')

  
SCV.wide_data <- SCV %>%
  pivot_wider(
    names_from = Timepoint,
    values_from = SCV,
    names_prefix = "SCV_"
  )

SCV.wide_data <- SCV.wide_data %>%
mutate(SCV_diff = ifelse(is.na(SCV_Pre) | is.na(SCV_Post), NA, SCV_Post - SCV_Pre))

summary(lm(SCV_diff ~ source, data = SCV.wide_data, na.action = na.omit))

## 
## Call:
## lm(formula = SCV_diff ~ source, data = SCV.wide_data, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.25953 -0.27792  0.08473  0.30680  1.03728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08697    0.05394  -1.612    0.110
## sources2     0.12240    0.08729   1.402    0.164
## 
## Residual standard error: 0.4448 on 108 degrees of freedom
##   (140 observations deleted due to missingness)
## Multiple R-squared:  0.01788,    Adjusted R-squared:  0.008787 
## F-statistic: 1.966 on 1 and 108 DF,  p-value: 0.1637

write.csv(surv, file = "quant_calc.csv", row.names = F)

for notes on how to produce and interpret semPlots, see : https://www.r-bloggers.com/2018/04/statistics-sunday-using-semplot/ https://cran.r-project.org/web/packages/semPlot/semPlot.pdf http://sachaepskamp.com/documentation/semPlot/semPaths.html

#CFI: Measures whether the model fits the data better than a more restricted baseline model. Higher is better, with okay fit > .9. Find CFI under User Model vs. Baseline model

#TLI (Tucker-Lewis index): Similar to CFI, but it penalizes overly complex models (making it more conservative than CFI). Measures whether the model fits the data better than a more restricted baseline model. Higher is better, with okay fit > .9.

#Find Test statistic, DOF, and P value under Model Test User Model (chi-squared goodness of fit test)

#RMSEA p value:The RMSEA P-value is the Probability that RMSEA <= .05. If that P-value is greater than 5% you can argue that the RMSEA value does not indicate a model rejection (the RMSEA value doesn’t reject the model if the RMSEA value is between 0 and 0.05). Usually this is useful when the RMSEA value is near the cutoff value of 0.05. If the RMSEA value is not near 0.05 one can typically ignore the confidence limit and the P-value and simply use the actual RMSEA value.

#Interpretation:

#RMSEA < 0.05: Indicates a close fit of the model to the data. #0.05 ≤ RMSEA < 0.08: Indicates a reasonable fit. #0.08 ≤ RMSEA < 0.10: Indicates a mediocre fit. #RMSEA ≥ 0.10: Indicates a poor fit.

#p-value > 0.05: The null hypothesis cannot be rejected, suggesting that the model provides a close fit to the data. #p-value ≤ 0.05: The null hypothesis is rejected, indicating that the model does not provide a close fit to the data.

#std.all are the standardized coefficients

See this publication and references within on reporting CFA results properly: https://www.researchgate.net/publication/24187223_Reporting_Practices_in_Confirmatory_Factor_Analysis_An_Overview_and_Some_Recommendations

#Does view of importance (IA.Importance) & IA.Comfort & IA.Grad.Imp change the outcome of constructs above within the IA group? #Same but Science.Career #Same by Gender & Race & First Gen

IA.Grad.Imp & IA.Importance & IA.Comfort pre/post

#Change in comfort per student for each Comfort. topic (bigger change for IA group?) #Does this change based on views of explicit links or demographics?

resp=read.csv("Qual_textresp.csv")
des=read.csv("Qual_Designations.csv")

des_clean <- des %>%
  filter(!if_all(starts_with("Designation."), ~ is.na(.)))

library(dplyr)
library(tidyr)
library(purrr)

des_long <- des_clean %>%
  pivot_longer(
    cols = starts_with("Designation."),
    names_to = "Topic",
    values_to = "Response"
  ) %>%
  filter(!is.na(Response)) %>%
  mutate(
    Section = factor(Section),
    Response = factor(Response),
    Topic = factor(Topic)
  )

# results <- des_long %>%
#   split(.$Topic) %>%
#   map_dfr(function(dat) {
# 
#     tab <- table(dat$Section, dat$Response)
# 
#     # remove empty rows/columns
#     tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
# 
#     # check table is still testable
#     if (nrow(tab) < 2 || ncol(tab) < 2) {
#       return(data.frame(
#         Topic = as.character(unique(dat$Topic)),
#         test = NA,
#         statistic = NA,
#         p_value = NA,
#         df = NA,
#         note = "Not enough variation"
#       ))
#     }
# 
#     chi <- suppressWarnings(chisq.test(tab))
# 
#     data.frame(
#       Topic = as.character(unique(dat$Topic)),
#       test = "Chi-square",
#       statistic = unname(chi$statistic),
#       p_value = chi$p.value,
#       df = unname(chi$parameter),
#       note = "OK"
#     )
#   })
# 
# results

results_fisher <- des_long %>%
  split(.$Topic) %>%
  map_dfr(function(dat) {

    tab <- table(dat$Section, dat$Response)
    tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]

    if (nrow(tab) < 2 || ncol(tab) < 2) {
      return(data.frame(
        Topic = as.character(unique(dat$Topic)),
        p_value = NA,
        note = "Not enough variation"
      ))
    }

    fish <- fisher.test(tab)

    data.frame(
      Topic = as.character(unique(dat$Topic)),
      p_value = fish$p.value,
      note = "Fisher exact test"
    )
  })

results_fisher

##             Topic      p_value              note
## 1  Designation.DB 0.1112633218 Fisher exact test
## 2 Designation.Env 0.4896338538 Fisher exact test
## 3  Designation.GE 0.2990936696 Fisher exact test
## 4 Designation.Mar 0.0002512388 Fisher exact test
## 5 Designation.Vac 0.1863084072 Fisher exact test
## 6 Designation.Vap 0.4164906739 Fisher exact test

des_long <- des_long %>%
  mutate(Response_ord = factor(Response,
    levels = c(
      "Negative",
      "Negative-Neutral",
      "Neutral/No opinion/Freedom",
      "Positive-Neutral",
      "Positive"
    ),
    ordered = TRUE
  ))

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

model <- polr(Response_ord ~ Section + Topic, data = des_long, Hess = TRUE)
summary(model)

## Call:
## polr(formula = Response_ord ~ Section + Topic, data = des_long, 
##     Hess = TRUE)
## 
## Coefficients:
##                         Value Std. Error t value
## Section2              0.49686     0.1661  2.9915
## TopicDesignation.Env  0.03744     0.3008  0.1245
## TopicDesignation.GE   0.08432     0.2795  0.3017
## TopicDesignation.Mar  0.63632     0.2615  2.4336
## TopicDesignation.Vac  4.18553     0.3383 12.3705
## TopicDesignation.Vap -1.14137     0.3195 -3.5718
## 
## Intercepts:
##                                             Value   Std. Error t value
## Negative|Negative-Neutral                    0.7857  0.2142     3.6685
## Negative-Neutral|Neutral/No opinion/Freedom  2.0531  0.2334     8.7978
## Neutral/No opinion/Freedom|Positive-Neutral  2.0533  0.2334     8.7982
## Positive-Neutral|Positive                    4.0427  0.3084    13.1068
## 
## Residual Deviance: 1179.775 
## AIC: 1199.775 
## (1335 observations deleted due to missingness)

coef_table <- coef(summary(model))
p <- pnorm(abs(coef_table[, "t value"]), lower.tail = FALSE) * 2
cbind(coef_table, "p value" = p)

##                                                   Value Std. Error    t value
## Section2                                     0.49685980  0.1660929  2.9914577
## TopicDesignation.Env                         0.03744155  0.3007908  0.1244771
## TopicDesignation.GE                          0.08432139  0.2795291  0.3016551
## TopicDesignation.Mar                         0.63632010  0.2614763  2.4335670
## TopicDesignation.Vac                         4.18553063  0.3383491 12.3704509
## TopicDesignation.Vap                        -1.14137216  0.3195484 -3.5718284
## Negative|Negative-Neutral                    0.78566185  0.2141649  3.6684898
## Negative-Neutral|Neutral/No opinion/Freedom  2.05314450  0.2333707  8.7977833
## Neutral/No opinion/Freedom|Positive-Neutral  2.05327306  0.2333734  8.7982308
## Positive-Neutral|Positive                    4.04274339  0.3084459 13.1068162
##                                                  p value
## Section2                                    2.776490e-03
## TopicDesignation.Env                        9.009376e-01
## TopicDesignation.GE                         7.629150e-01
## TopicDesignation.Mar                        1.495087e-02
## TopicDesignation.Vac                        3.776951e-35
## TopicDesignation.Vap                        3.544976e-04
## Negative|Negative-Neutral                   2.439874e-04
## Negative-Neutral|Neutral/No opinion/Freedom 1.395452e-18
## Neutral/No opinion/Freedom|Positive-Neutral 1.389899e-18
## Positive-Neutral|Positive                   3.009701e-39

#why are there extra columns here?

des.table= des_long %>%
   split(.$Topic) %>%
   purrr::walk(~ print(table(.x$Section, .x$Response)))

##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       46                7                        12        1
##   2  0       24               13                         6        1
##    
##     Positive-Neutral
##   1               11
##   2                5
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       32               10                        26        1
##   2  0       17               11                        15        2
##    
##     Positive-Neutral
##   1                3
##   2                3
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       45                8                        12        1
##   2  0       24               12                         7        1
##    
##     Positive-Neutral
##   1               12
##   2                6
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       45               19                         7        2
##   2  0       13               17                         3        2
##    
##     Positive-Neutral
##   1                6
##   2               17
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0        0                4                        15       38
##   2  0        4                2                         8       25
##    
##     Positive-Neutral
##   1               22
##   2               13
##    
##        Negative Negative-Neutral Neutral/No opinon/Freedom Positive
##   1  0       61               12                         5        0
##   2  0       41                5                         6        1
##    
##     Positive-Neutral
##   1                1
##   2                0

When I converted the designation scale to an ordered score: The marijuana topic shows a substantial ideological framing difference between sections.

Students in Section 2 gave responses that were much less negative / more nuanced.

This suggests:

IA may be most influential when the topic is politically or culturally loaded.

Students do not automatically apply IA to all topics.

library(dplyr)

des_long <- des_long %>%
  mutate(Response_ord = factor(Response,
    levels = c(
      "Negative",
      "Negative-Neutral",
      "Neutral/No opinion/Freedom",
      "Positive-Neutral",
      "Positive"
    ),
    ordered = TRUE
  ),
  Response_num = as.numeric(Response_ord))

des_long <- des_long %>%
  mutate(
    Response = na_if(as.character(Response), "NA"),
    Section  = na_if(as.character(Section), "NA")
  ) %>%
  filter(!is.na(Response), !is.na(Section))

means <- des_long %>%
  group_by(Topic, Section) %>%
  summarise(
    mean_score = mean(Response_num, na.rm = TRUE),
    sd = sd(Response_num, na.rm = TRUE),
    n = n(),
    se = sd / sqrt(n),
    .groups = "drop"
  )

means

## # A tibble: 12 × 6
##    Topic           Section mean_score    sd     n     se
##    <fct>           <chr>        <dbl> <dbl> <int>  <dbl>
##  1 Designation.DB  1             1.68 1.20     77 0.137 
##  2 Designation.DB  2             1.74 1.09     49 0.156 
##  3 Designation.Env 1             1.5  0.960    72 0.113 
##  4 Designation.Env 2             1.85 1.20     48 0.174 
##  5 Designation.GE  1             1.73 1.22     78 0.138 
##  6 Designation.GE  2             1.79 1.15     50 0.162 
##  7 Designation.Mar 1             1.62 1.04     79 0.117 
##  8 Designation.Mar 2             2.55 1.32     52 0.184 
##  9 Designation.Vac 1             4.47 0.796    79 0.0896
## 10 Designation.Vac 2             4.20 1.25     52 0.173 
## 11 Designation.Vap 1             1.20 0.496    79 0.0558
## 12 Designation.Vap 2             1.19 0.647    53 0.0889

library(ggplot2)

ggplot(means, aes(x = Topic, y = mean_score, fill = Section)) +
  geom_col(position = position_dodge(width = 0.7)) +
  geom_errorbar(
    aes(ymin = mean_score - se, ymax = mean_score + se),
    width = .2,
    position = position_dodge(.7)
  ) +
  coord_flip() +
  labs(
    x = "Topic",
    y = "Average Ideological Framing Score"
  ) +
  theme_bw()

Across most topics (environmental justice, vaccines, GE, vaping, etc.), the average ideological framing scores were very similar between sections.

This suggests that students do not necessarily change their stance on socio-scientific issues simply because ideological awareness is introduced.

In other words:

AI instruction appears to influence how students reason, rather than what position they ultimately take.

The marijuana topic shows the clearest difference between sections.(1.62-2.55)

Students in the control section expressed more neutral or mixed framing, while the AI section tended to give more negative responses.

Possible explanation:

The AI section may have encouraged students to critically interrogate ideological narratives surrounding the issue, which may have led some students to articulate stronger evaluative positions rather than neutral ones.

Another possibility is that marijuana is a highly culturally framed issue, and the ideological discussion may have prompted students to engage more directly with those frames.

AI section shows more consistent framing

In several topics, the AI section responses cluster more tightly around specific categories (often negative or negative-neutral), while the control section spreads more across categories.

This suggests that AI instruction may have helped students:

recognize ideological framing more clearly

articulate positions more decisively

Rather than defaulting to neutral responses.

ggplot(des_long, aes(x = Section, y = Response_num, fill = Section)) +
  geom_violin(alpha = 0.4, trim = FALSE) +
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.6) +
  facet_wrap(~Topic) +
  theme_bw() +
  labs(
    y = "Ideological Framing Score",
    x = "Section",
    title = "Distribution of Ideological Framing Responses by Section"
  )

## Warning: Removed 122 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 122 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Control section uses neutral categories more often. This pattern is common when students lack an explicit analytic framework for interpreting socio-scientific issues.

Without ideological awareness instruction, students may:

hedge their answers

default to neutrality

frame responses as individual opinion rather than ideological analysis.

Looking at the qualitative responses:

AI section (Section 1)

Students more often reference:

systemic issues

ideological framing

social structures

policy implications

Responses tend to focus on how the issue is constructed or framed.

Control section (Section 2)

Students more often focus on:

individual harms

fairness

environmental consequences

personal opinion

Responses tend to focus on what the issue is, rather than how it is framed.

resp <- read.csv("Qual_textresp.csv")

library(tidytext)
library(dplyr)

words <- resp %>%
  unnest_tokens(word, Vid.EnvInj.Text)

library(stopwords)

words_clean <- words %>%
  filter(!word %in% stop_words$word)

word_counts <- words_clean %>%
  count(Section, word, sort = TRUE)

word_counts

##     Section               word  n
## 1         2      environmental 34
## 2         1      environmental 30
## 3         2             racism 30
## 4         1             racism 21
## 5         1          injustice 17
## 6         1             people 14
## 7         2        communities 14
## 8         1        communities 13
## 9         2          injustice 12
## 10        2             people 12
## 11        2             health 11
## 12        1         government 10
## 13        2      understanding 10
## 14        1              water  9
## 15        2              water  9
## 16        1          community  8
## 17        1              topic  8
## 18        1      understanding  8
## 19        2              clean  8
## 20        1        environment  7
## 21        1             health  7
## 22        2              issue  7
## 23        2       marginalized  7
## 24        2          resources  7
## 25        1              clean  6
## 26        1                due  6
## 27        1             issues  6
## 28        1               live  6
## 29        1                lot  6
## 30        2             access  6
## 31        2           affected  6
## 32        2        environment  6
## 33        1           affected  5
## 34        1             income  5
## 35        1              money  5
## 36        2          community  5
## 37        2        disparities  5
## 38        2              flint  5
## 39        2              waste  5
## 40        1             access  4
## 41        1                don  4
## 42        1           educated  4
## 43        1               feel  4
## 44        1              issue  4
## 45        1              lower  4
## 46        1       marginalized  4
## 47        1               poor  4
## 48        1          resources  4
## 49        1               time  4
## 50        2                air  4
## 51        2           concerns  4
## 52        2             crisis  4
## 53        2           exposure  4
## 54        2             issues  4
## 55        2               lack  4
## 56        2           michigan  4
## 57        2               race  4
## 58        1          attention  3
## 59        1               care  3
## 60        1               dont  3
## 61        1              flint  3
## 62        1            healthy  3
## 63        1             impact  3
## 64        1           informed  3
## 65        1           involved  3
## 66        1                low  3
## 67        1           michigan  3
## 68        1          pollution  3
## 69        1            poverty  3
## 70        1            protect  3
## 71        1               race  3
## 72        1              risks  3
## 73        1                sad  3
## 74        1            society  3
## 75        1              speak  3
## 76        1              world  3
## 77        2              color  3
## 78        2            concept  3
## 79        2               dont  3
## 80        2                due  3
## 81        2       environments  3
## 82        2              equal  3
## 83        2            hazards  3
## 84        2            healthy  3
## 85        2            leading  3
## 86        2         pollutants  3
## 87        2          pollution  3
## 88        2             social  3
## 89        2           systemic  3
## 90        1          advantage  2
## 91        1             afford  2
## 92        1                air  2
## 93        1             answer  2
## 94        1             cancer  2
## 95        1               city  2
## 96        1              class  2
## 97        1           concerns  2
## 98        1              cycle  2
## 99        1      disadvantaged  2
## 100       1          disasters  2
## 101       1         disturbing  2
## 102       1             fields  2
## 103       1              fight  2
## 104       1              found  2
## 105       1         generation  2
## 106       1             harder  2
## 107       1          hardships  2
## 108       1               hate  2
## 109       1            hazards  2
## 110       1                hit  2
## 111       1           homeless  2
## 112       1               huge  2
## 113       1       inequalities  2
## 114       1               land  2
## 115       1             looked  2
## 116       1              major  2
## 117       1              makes  2
## 118       1         minorities  2
## 119       1        necessarily  2
## 120       1             person  2
## 121       1         pollutants  2
## 122       1             poorly  2
## 123       1              rates  2
## 124       1      socioeconomic  2
## 125       1             status  2
## 126       1               stop  2
## 127       1            subject  2
## 128       1              super  2
## 129       1           systemic  2
## 130       1             topics  2
## 131       1              trash  2
## 132       1         understand  2
## 133       1              worse  2
## 134       2         addressing  2
## 135       2            adverse  2
## 136       2            biggest  2
## 137       2          burdening  2
## 138       2             cancer  2
## 139       2            country  2
## 140       2                don  2
## 141       2              earth  2
## 142       2           economic  2
## 143       2            effects  2
## 144       2             ensure  2
## 145       2          equitable  2
## 146       2       exacerbating  2
## 147       2           existing  2
## 148       2             family  2
## 149       2              green  2
## 150       2           honestly  2
## 151       2          impactful  2
## 152       2         importance  2
## 153       2          increased  2
## 154       2         industrial  2
## 155       2       inequalities  2
## 156       2            involve  2
## 157       2              leads  2
## 158       2               live  2
## 159       2                lot  2
## 160       2      neighborhoods  2
## 161       2       perpetuating  2
## 162       2               poor  2
## 163       2            poverty  2
## 164       2              races  2
## 165       2             racist  2
## 166       2              rates  2
## 167       2        respiratory  2
## 168       2              sites  2
## 169       2         situations  2
## 170       2            society  2
## 171       2             spaces  2
## 172       2             system  2
## 173       2           targeted  2
## 174       2              topic  2
## 175       2        underscores  2
## 176       2         understand  2
## 177       2           unfairly  2
## 178       2             united  2
## 179      NA             racism  2
## 180       1               2024  1
## 181       1                  3  1
## 182       1                 5x  1
## 183       1            ability  1
## 184       1          achieving  1
## 185       1         addressing  1
## 186       1           advocate  1
## 187       1             affect  1
## 188       1                age  1
## 189       1                ago  1
## 190       1       agricultural  1
## 191       1             amount  1
## 192       1            amounts  1
## 193       1            animals  1
## 194       1         apartments  1
## 195       1            arising  1
## 196       1             aspect  1
## 197       1              aware  1
## 198       1          awareness  1
## 199       1              awful  1
## 200       1                bad  1
## 201       1               bear  1
## 202       1            becasue  1
## 203       1                bit  1
## 204       1            brought  1
## 205       1              brunt  1
## 206       1              build  1
## 207       1               call  1
## 208       1             caring  1
## 209       1             caught  1
## 210       1             caused  1
## 211       1             change  1
## 212       1              cheap  1
## 213       1           chemical  1
## 214       1           children  1
## 215       1            choices  1
## 216       1             cities  1
## 217       1               clue  1
## 218       1            colored  1
## 219       1             coming  1
## 220       1          companies  1
## 221       1            company  1
## 222       1         conditions  1
## 223       1             condos  1
## 224       1         considered  1
## 225       1        contributes  1
## 226       1            country  1
## 227       1            created  1
## 228       1              crime  1
## 229       1            crucial  1
## 230       1            current  1
## 231       1             damage  1
## 232       1                day  1
## 233       1               deal  1
## 234       1              dealt  1
## 235       1             decent  1
## 236       1           decision  1
## 237       1           decrease  1
## 238       1            depends  1
## 239       1          difficult  1
## 240       1           directly  1
## 241       1     discrimination  1
## 242       1          discussed  1
## 243       1         discussing  1
## 244       1        disparities  1
## 245       1   disproportionate  1
## 246       1          diversity  1
## 247       1              doesn  1
## 248       1              drink  1
## 249       1           drinking  1
## 250       1               drop  1
## 251       1                eat  1
## 252       1           economic  1
## 253       1          economics  1
## 254       1          ecosystem  1
## 255       1          education  1
## 256       1           effected  1
## 257       1            effects  1
## 258       1          ehtically  1
## 259       1          eliminate  1
## 260       1             endure  1
## 261       1            enhance  1
## 262       1           ensuring  1
## 263       1             entire  1
## 264       1         enviroment  1
## 265       1       enviromental  1
## 266       1    environmentally  1
## 267       1     environtmental  1
## 268       1            equally  1
## 269       1          equitable  1
## 270       1        exacerbates  1
## 271       1              exact  1
## 272       1           existing  1
## 273       1         experience  1
## 274       1        experiences  1
## 275       1            exposed  1
## 276       1           exposure  1
## 277       1          extremely  1
## 278       1               eyes  1
## 279       1             facing  1
## 280       1           families  1
## 281       1              fault  1
## 282       1        financially  1
## 283       1                fix  1
## 284       1              fixes  1
## 285       1               food  1
## 286       1               form  1
## 287       1             formed  1
## 288       1             fought  1
## 289       1             freaks  1
## 290       1            garbage  1
## 291       1               goal  1
## 292       1                god  1
## 293       1            grocery  1
## 294       1           habitats  1
## 295       1            handled  1
## 296       1               hard  1
## 297       1            harmful  1
## 298       1            helping  1
## 299       1         highlights  1
## 300       1            history  1
## 301       1             honest  1
## 302       1           honestly  1
## 303       1           hopeless  1
## 304       1           horrible  1
## 305       1          hospitals  1
## 306       1              human  1
## 307       1               hurt  1
## 308       1               idea  1
## 309       1                ill  1
## 310       1          illnesses  1
## 311       1          impactful  1
## 312       1          impacting  1
## 313       1            impacts  1
## 314       1       impoverished  1
## 315       1           incident  1
## 316       1         insightful  1
## 317       1          instances  1
## 318       1         invalidate  1
## 319       1             invest  1
## 320       1            justice  1
## 321       1          knowledge  1
## 322       1      knowledgeable  1
## 323       1               lack  1
## 324       1           landfill  1
## 325       1          landfills  1
## 326       1            leading  1
## 327       1              learn  1
## 328       1            learned  1
## 329       1             levels  1
## 330       1             lights  1
## 331       1             linger  1
## 332       1              lives  1
## 333       1             living  1
## 334       1              local  1
## 335       1           location  1
## 336       1          louisiana  1
## 337       1           maintain  1
## 338       1         maintained  1
## 339       1            managed  1
## 340       1              means  1
## 341       1                 mi  1
## 342       1           minority  1
## 343       1               move  1
## 344       1           negative  1
## 345       1         negatively  1
## 346       1          officials  1
## 347       1           opinions  1
## 348       1        opportunity  1
## 349       1             passed  1
## 350       1            peoples  1
## 351       1             plants  1
## 352       1              plays  1
## 353       1           policies  1
## 354       1             poorer  1
## 355       1        predominant  1
## 356       1          prejudice  1
## 357       1           pressing  1
## 358       1          prevalent  1
## 359       1            prevent  1
## 360       1             prices  1
## 361       1           properly  1
## 362       1           provided  1
## 363       1             public  1
## 364       1              races  1
## 365       1           rational  1
## 366       1             reason  1
## 367       1             refers  1
## 368       1         regulation  1
## 369       1       reoccurrence  1
## 370       1          represent  1
## 371       1          residents  1
## 372       1        respiratory  1
## 373       1     responsibility  1
## 374       1        restaurants  1
## 375       1       restrictions  1
## 376       1             rising  1
## 377       1               risk  1
## 378       1              river  1
## 379       1              roads  1
## 380       1               role  1
## 381       1                run  1
## 382       1               save  1
## 383       1          scapegoat  1
## 384       1         segregated  1
## 385       1          sidewalks  1
## 386       1             simple  1
## 387       1          situation  1
## 388       1           solution  1
## 389       1           someting  1
## 390       1             sooner  1
## 391       1              spend  1
## 392       1             spread  1
## 393       1              stand  1
## 394       1         standpoint  1
## 395       1           status's  1
## 396       1              stems  1
## 397       1               step  1
## 398       1             stores  1
## 399       1            support  1
## 400       1          supported  1
## 401       1              takes  1
## 402       1           themself  1
## 403       1           therefor  1
## 404       1              times  1
## 405       1            today's  1
## 406       1              treat  1
## 407       1            treated  1
## 408       1             unfair  1
## 409       1             unique  1
## 410       1             united  1
## 411       1             unjust  1
## 412       1         vulnerable  1
## 413       1             weapon  1
## 414       1          willpower  1
## 415       1              wrong  1
## 416       1             zoning  1
## 417       1               <NA>  1
## 418       2         absolutely  1
## 419       2            address  1
## 420       2          addressed  1
## 421       2           adequate  1
## 422       2          advantage  1
## 423       2          affecting  1
## 424       2            affects  1
## 425       2           allowing  1
## 426       2             amount  1
## 427       2            amounts  1
## 428       2             animal  1
## 429       2             answer  1
## 430       2         apparently  1
## 431       2             aspect  1
## 432       2                bad  1
## 433       2              based  1
## 434       2             biases  1
## 435       2              birth  1
## 436       2              black  1
## 437       2              blind  1
## 438       2               born  1
## 439       2              broad  1
## 440       2             broken  1
## 441       2           building  1
## 442       2               care  1
## 443       2          causation  1
## 444       2             caused  1
## 445       2             change  1
## 446       2             choose  1
## 447       2               city  1
## 448       2              claim  1
## 449       2              class  1
## 450       2            climate  1
## 451       2       concentrated  1
## 452       2         constantly  1
## 453       2        contributes  1
## 454       2        correlation  1
## 455       2          countries  1
## 456       2              crazy  1
## 457       2            created  1
## 458       2            creates  1
## 459       2           credible  1
## 460       2             cuases  1
## 461       2            current  1
## 462       2             cycles  1
## 463       2              death  1
## 464       2             decade  1
## 465       2          depleting  1
## 466       2           deserves  1
## 467       2           destroys  1
## 468       2        destruction  1
## 469       2      developmental  1
## 470       2      disadvantaged  1
## 471       2           disagree  1
## 472       2      discriminated  1
## 473       2           disposal  1
## 474       2 disproportionately  1
## 475       2        distributed  1
## 476       2       distribution  1
## 477       2            diverse  1
## 478       2        documentary  1
## 479       2             dumped  1
## 480       2             easily  1
## 481       2         economical  1
## 482       2          ecosystem  1
## 483       2            educate  1
## 484       2           educated  1
## 485       2           effected  1
## 486       2          emissions  1
## 487       2       enviormental  1
## 488       2            equally  1
## 489       2            examine  1
## 490       2           examples  1
## 491       2            existed  1
## 492       2             exists  1
## 493       2         extinction  1
## 494       2            extreme  1
## 495       2                eye  1
## 496       2               eyes  1
## 497       2         facilities  1
## 498       2             facing  1
## 499       2            failure  1
## 500       2           failures  1
## 501       2           families  1
## 502       2               feel  1
## 503       2              field  1
## 504       2         frequently  1
## 505       2             gender  1
## 506       2             giving  1
## 507       2              god's  1
## 508       2         government  1
## 509       2            growing  1
## 510       2          happening  1
## 511       2               harm  1
## 512       2            harmful  1
## 513       2               hate  1
## 514       2          hazardous  1
## 515       2         healthcare  1
## 516       2          hierarchy  1
## 517       2             highly  1
## 518       2           horrible  1
## 519       2            housing  1
## 520       2          illnesses  1
## 521       2           imminent  1
## 522       2             impact  1
## 523       2            impacts  1
## 524       2           incident  1
## 525       2             income  1
## 526       2            incomes  1
## 527       2           industry  1
## 528       2           inequity  1
## 529       2        information  1
## 530       2     infrastructure  1
## 531       2        institution  1
## 532       2            judging  1
## 533       2            keeping  1
## 534       2              lakes  1
## 535       2              layer  1
## 536       2              leave  1
## 537       2               life  1
## 538       2             living  1
## 539       2            located  1
## 540       2                low  1
## 541       2               main  1
## 542       2         management  1
## 543       2             market  1
## 544       2             matter  1
## 545       2              means  1
## 546       2               meat  1
## 547       2            mention  1
## 548       2            minimal  1
## 549       2           minority  1
## 550       2           mistaken  1
## 551       2              money  1
## 552       2               move  1
## 553       2            natural  1
## 554       2         negatively  1
## 555       2            neglect  1
## 556       2           occupied  1
## 557       2          occurring  1
## 558       2          officials  1
## 559       2            opinion  1
## 560       2           outcomes  1
## 561       2           outwards  1
## 562       2              ozone  1
## 563       2            perfect  1
## 564       2         personally  1
## 565       2          personlly  1
## 566       2         phenomenon  1
## 567       2             phrase  1
## 568       2           physical  1
## 569       2           policies  1
## 570       2             policy  1
## 571       2           politics  1
## 572       2           pollutes  1
## 573       2          polluting  1
## 574       2              ponds  1
## 575       2             poorer  1
## 576       2         population  1
## 577       2      predominantly  1
## 578       2          prevalent  1
## 579       2        prioritized  1
## 580       2          privilege  1
## 581       2      progressively  1
## 582       2           properly  1
## 583       2               push  1
## 584       2          qualities  1
## 585       2            quicker  1
## 586       2               real  1
## 587       2           recently  1
## 588       2              refer  1
## 589       2            resolve  1
## 590       2         resource's  1
## 591       2            respond  1
## 592       2             result  1
## 593       2         ridiculous  1
## 594       2                sad  1
## 595       2             school  1
## 596       2           severely  1
## 597       2            shouldn  1
## 598       2        significant  1
## 599       2             single  1
## 600       2      socioeconomic  1
## 601       2             solved  1
## 602       2              speak  1
## 603       2            species  1
## 604       2             status  1
## 605       2             steady  1
## 606       2         stereotype  1
## 607       2           stewards  1
## 608       2            studied  1
## 609       2           superior  1
## 610       2        sustainable  1
## 611       2       systemically  1
## 612       2               tend  1
## 613       2               time  1
## 614       2            today's  1
## 615       2             toxins  1
## 616       2            treated  1
## 617       2          treatment  1
## 618       2              types  1
## 619       2             unable  1
## 620       2            unaware  1
## 621       2   underrepresented  1
## 622       2          understnd  1
## 623       2            unequal  1
## 624       2             unfair  1
## 625       2         unfamiliar  1
## 626       2            ungodly  1
## 627       2              views  1
## 628       2         vulnerable  1
## 629       2            watched  1
## 630       2              white  1
## 631       2           whomever  1
## 632       2          workplace  1
## 633       2              world  1
## 634       2             worldy  1
## 635       2           worrying  1
## 636       2              worse  1
## 637       2            worship  1
## 638       2              wrong  1
## 639      NA            aspects  1
## 640      NA             aspire  1
## 641      NA            current  1
## 642      NA            discuss  1
## 643      NA      environmental  1
## 644      NA           honestly  1
## 645      NA          impactful  1
## 646      NA          injustice  1
## 647      NA      understanding  1
## 648      NA                ups  1

top_words <- word_counts %>%
  group_by(Section) %>%
  slice_max(n, n = 20)

top_words

## # A tibble: 58 × 3
## # Groups:   Section [3]
##    Section word              n
##      <int> <chr>         <int>
##  1       1 environmental    30
##  2       1 racism           21
##  3       1 injustice        17
##  4       1 people           14
##  5       1 communities      13
##  6       1 government       10
##  7       1 water             9
##  8       1 community         8
##  9       1 topic             8
## 10       1 understanding     8
## # ℹ 48 more rows

ggplot(top_words, aes(x = reorder(word, n), y = n, fill = factor(Section))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Section, scales = "free") +
  coord_flip() +
  labs(
    x = "",
    y = "Word Frequency",
    title = "Most Common Words in Student Responses by Section"
  ) +
  theme_bw()

keywords <- c("system","policy","government","industry","media","bias","justice","freedom")

words_clean %>%
  filter(word %in% keywords) %>%
  count(Section, word)

##   Section       word  n
## 1       1 government 10
## 2       1    justice  1
## 3       2 government  1
## 4       2   industry  1
## 5       2     policy  1
## 6       2     system  2

ia_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies",
  "government","state",
  "industry","corporate","corporation",
  "power","control",
  "bias","biased",
  "media","narrative","framing",
  "justice","inequality","equity",
  "rights","freedom",
  "capitalism","market",
  "regulation","regulated"
)

library(dplyr)
library(tidyr)
library(tidytext)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  )
words <- text_long %>%
  unnest_tokens(word, Text)

data(stop_words)

words_clean <- words %>%
  anti_join(stop_words, by = "word")

ia_words <- words_clean %>%
  filter(word %in% ia_terms)

ia_counts <- ia_words %>%
  count(Section, Topic, word, sort = TRUE)

ia_counts

## # A tibble: 28 × 4
##    Section Topic           word           n
##      <int> <chr>           <chr>      <int>
##  1       1 Vid.Resp.Text   regulated     16
##  2       2 Vid.Resp.Text   regulated     12
##  3       1 Vid.EnvInj.Text government    10
##  4       2 Vid.EnvInj.Text systemic       3
##  5       2 Vid.Resp.Text   regulation     3
##  6       1 Vid.EnvInj.Text systemic       2
##  7       1 Vid.MolBio.Text control        2
##  8       1 Vid.Resp.Text   government     2
##  9       1 Vid.Resp.Text   regulation     2
## 10       2 Vid.EnvInj.Text system         2
## # ℹ 18 more rows

ia_summary <- ia_words %>%
  count(Section, Topic)

ia_summary

## # A tibble: 8 × 3
##   Section Topic                 n
##     <int> <chr>             <int>
## 1       1 Vid.EnvInj.Text      15
## 2       1 Vid.MolBio.Text       2
## 3       1 Vid.Resp.Text        22
## 4       1 Vid.Vaccines.Text     3
## 5       2 Vid.EnvInj.Text      10
## 6       2 Vid.MolBio.Text       2
## 7       2 Vid.Resp.Text        16
## 8       2 Vid.Vaccines.Text     3

library(ggplot2)

ggplot(ia_summary,
       aes(x = Topic, y = n, fill = factor(Section))) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Number of IA terms used",
    x = "Topic",
    fill = "Section"
  )

total_words <- words_clean %>%
  count(Section, Topic)

ia_rate <- ia_summary %>%
  left_join(total_words, by = c("Section","Topic"),
            suffix = c("_ia","_total")) %>%
  mutate(rate = n_ia / n_total)

ia_rate

## # A tibble: 8 × 5
##   Section Topic              n_ia n_total    rate
##     <int> <chr>             <int>   <int>   <dbl>
## 1       1 Vid.EnvInj.Text      15     620 0.0242 
## 2       1 Vid.MolBio.Text       2     534 0.00375
## 3       1 Vid.Resp.Text        22     546 0.0403 
## 4       1 Vid.Vaccines.Text     3     526 0.00570
## 5       2 Vid.EnvInj.Text      10     581 0.0172 
## 6       2 Vid.MolBio.Text       2     514 0.00389
## 7       2 Vid.Resp.Text        16     657 0.0244 
## 8       2 Vid.Vaccines.Text     3     529 0.00567

ia_rate %>%
  arrange(desc(rate))

## # A tibble: 8 × 5
##   Section Topic              n_ia n_total    rate
##     <int> <chr>             <int>   <int>   <dbl>
## 1       1 Vid.Resp.Text        22     546 0.0403 
## 2       2 Vid.Resp.Text        16     657 0.0244 
## 3       1 Vid.EnvInj.Text      15     620 0.0242 
## 4       2 Vid.EnvInj.Text      10     581 0.0172 
## 5       1 Vid.Vaccines.Text     3     526 0.00570
## 6       2 Vid.Vaccines.Text     3     529 0.00567
## 7       2 Vid.MolBio.Text       2     514 0.00389
## 8       1 Vid.MolBio.Text       2     534 0.00375

ggplot(ia_rate,
       aes(x = Topic, y = rate, fill = factor(Section))) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Rate of IA terms used",
    x = "Topic",
    fill = "Section"
  )

library(dplyr)
library(tidytext)
library(tidyr)
#install.packages("tidylo")
library(tidylo)


log_odds <- word_counts %>%
  bind_log_odds(set = Section, feature = word, n = n)

log_odds_plot <- log_odds %>%
  slice_max(abs(log_odds_weighted), n = 20) %>%
  mutate(
    more_common = ifelse(log_odds_weighted > 0, "AI", "Control"),
    word = reorder(word, log_odds_weighted)
  )

xmax <- max(abs(log_odds_plot$log_odds_weighted)) * 1.15

ggplot(log_odds_plot,
       aes(x = word, y = log_odds_weighted, fill = more_common)) +
  geom_col(width = 0.8) +
  coord_flip() +
  geom_hline(yintercept = 0, linewidth = 0.6) +
  annotate("text", x = 1, y = xmax * 0.85, label = "AI", hjust = 1, fontface = "bold") +
  annotate("text", x = 1, y = -xmax * 0.85, label = "Control", hjust = 0, fontface = "bold") +
  scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
  scale_y_continuous(limits = c(-xmax, xmax)) +
  labs(
    x = NULL,
    y = "Weighted log odds",
    fill = "More common in",
    title = "Words most characteristic of each section"
  ) +
  theme_bw() +
  theme(
    panel.grid.major.y = element_blank(),
    legend.position = "none"
  )

library(tidylo)

log_odds_topic <- words_clean %>%
  count(Topic, Section, word) %>%
  group_by(Topic) %>%
  filter(sum(n) >= 3) %>%
  bind_log_odds(set = Section, feature = word, n = n) %>%
  ungroup()

plot_topic <- log_odds_topic %>%
  group_by(Topic) %>%
  slice_max(abs(log_odds_weighted), n = 8) %>%
  mutate(more_common = ifelse(log_odds_weighted > 0, "AI", "Control"))

ggplot(plot_topic,
       aes(x = reorder(word, log_odds_weighted), y = log_odds_weighted, fill = more_common)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~Topic, scales = "free_y") +
  scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
  labs(
    x = NULL,
    y = "Weighted log odds",
    fill = "More common in",
    title = "Topic-specific words distinguishing AI and Control sections"
  ) +
  theme_bw()

The results suggest that explicit ideological awareness instruction influences students’ interpretive reasoning about socio-scientific issues, but does not consistently change the direction of their attitudes.

Students exposed to AI instruction appear more likely to:

articulate clearer positions

recognize ideological structures

analyze issues beyond individual harms.

Explicit instruction in ideological awareness does not necessarily change students’ positions on socio-scientific issues, but it appears to influence how students interpret and articulate those issues. Students in the AI section demonstrated more structured ideological reasoning, whereas students in the control section more frequently expressed neutral or opinion-based responses.

combine the qual codes with the quantitative data to look for differences.

Your data suggest:

Students can recognize ideological issues even without explicit instruction, but explicit IA instruction shifts how they reason about those issues.

Specifically, IA instruction appears to:

increase emotional engagement

strengthen science identity

encourage structural interpretation of socio-scientific issues

rather than simply increasing advocacy or opinion strength.

#focus in on long response more clearly

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","powerful","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment","pollution",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

library(dplyr)
library(tidytext)
library(tidyr)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  )

words <- text_long %>%
  unnest_tokens(word, Text)

data(stop_words)

words_clean <- words %>%
  anti_join(stop_words, by = "word")

structural_words <- words_clean %>%
  filter(word %in% structural_terms)

structural_counts <- structural_words %>%
  count(SortingID, Section, Topic)


total_words <- words_clean %>%
  count(SortingID, Section, Topic)

structural_rate <- structural_counts %>%
  left_join(total_words, by = c("SortingID","Section","Topic"),
            suffix = c("_struct","_total")) %>%
  mutate(rate = n_struct / n_total)

structural_summary <- structural_rate %>%
  filter(!is.na(Section)) %>%     # remove NA section rows
  group_by(Section, Topic) %>%
  summarise(
    mean_rate = mean(rate, na.rm = TRUE),
    sd = sd(rate, na.rm = TRUE),
    n = n(),
    se = sd / sqrt(n),            # <-- this creates the SE column
    .groups = "drop"
  )

structural_summary <- structural_summary %>%
  filter(!is.na(Section))

library(ggplot2)

ggplot(structural_summary,
       aes(x = Topic, y = mean_rate, fill = factor(Section))) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Structural reasoning rate",
    x = "Topic",
    fill = "Section",
    title = "Use of structural reasoning language in responses"
  )

lm(rate ~ Section + Topic, data = structural_rate)

## 
## Call:
## lm(formula = rate ~ Section + Topic, data = structural_rate)
## 
## Coefficients:
##            (Intercept)                 Section    TopicVid.MolBio.Text  
##               0.281361               -0.005751               -0.207970  
##     TopicVid.Resp.Text  TopicVid.Vaccines.Text  
##              -0.120820               -0.179134

high_structural <- structural_rate %>%
  arrange(desc(rate)) %>%
  slice_head(n = 10)

resp %>%
  filter(SortingID %in% high_structural$SortingID)

##    SortingID Section Timepoint   Consent              Email
## 1         15       1      Post I Consent kme0042@auburn.edu
## 2         19       2      Post I Consent emo0021@auburn.edu
## 3         28       1      Post I Consent rer0061@auburn.edu
## 4         39       2      Post I Consent Hno0009@auburn.edu
## 5         47       2      Post I Consent lrp0037@auburn.edu
## 6         48       2      Post I Consent mzs0263@auburn.edu
## 7         69       1      Post I Consent akd0082@auburn.edu
## 8         86       1      Post I Consent mzp0147@auburn.edu
## 9        104       2      Post I Consent mar0147@auburn.edu
## 10       119       1      Post I Consent meg0156@auburn.edu
##                                                                                                                             Learn.Science
## 1                                                                                                      So we can know how the world works
## 2                                                                                         To learn about how the world around them works.
## 3  Learning science is somethings that is important because it can help you learn more about how your body works and how the earth works.
## 4                                                                 To understand basic concepts of how the world and things around us work
## 5                                                                                                to understand our world and how it works
## 6                                                                                                      Because science is objective truth
## 7                                                                                                                                        
## 8                                                                                                                                        
## 9                 It is important for students to learn science because science greatly effects their own life and the world around them.
## 10                                                      It is important to learn biology so that people can have basic knowledge of life.
##                                                                                                                                                                                                                                 Benefits
## 1                                                                                                It would help people have a better understanding of the more political parts of the science field so they can  form their own opinions.
## 2                                                                                                                                                                          Make people more aware of what's going on in today's society.
## 3                                                                                                                                  Having different ideal that others might not of heard about which might open their eye to new ideals.
## 4                                                                                                                                                            Biology would be inclusive and appealing to all groups if this was included
## 5                                                                                                                                                                                                      to have a more unbiased education
## 6  With my biology class being so introductory, I'm not sure which materials would benefit from ideological awareness, but maybe making sure to emphasize the difference in sex and gender when teaching about the reproductive system. 
## 7                                                                                                                                                                          Students would know some things are not soley based on facts.
## 8                                                                                                                                                                                                                     to have awareness.
## 9                                                                                                        More ideologically aware resources would benefit biology content by helping people learn more about their own anatomy and life.
## 10                                                                                                                                                            \nI think this may be able to help students understand a few things better
##                                                                                                      Downsides
## 1                                                              You could offend people if you are not careful.
## 2                                                                      It can deteriorate from class material.
## 3   Some people not be open to learning new ideas, which might lead them to not evey listening to other ideas.
## 4                              I think some people would get annoyed with the fact that it has to be mentioned
## 5                                                                                 take time away from material
## 6                                                                                    I don't see any downsides
## 7                                                                        Some people would argue these topics.
## 8                                                                       may sway people\xd5s thought processes
## 9  One downside would be that many aspects of ideologies can lead to controversy and take away from the facts.
## 10                                                                           this could greatly offend people.
##                                                                                                                                                                                                                                      Vid.Resp.Text
## 1                                                                                                                                                                                                    I think that they should be highly regulated.
## 2                                                                                                                                                                                       Vaping and using marijuana is bad for the public's health.
## 3                                                                                                                                    Vaping and marijuana use is not good for the health of user, and the government needed to regulate them more.
## 4                                                                                                                                                           I think vaping should be banned but marijuana should be available for prescription use
## 5                                                                                                                                            I do not believe in the recreational use of vapes/marijuana because it has been proven harmful for us
## 6  I think vaping is horrible. I think that right now, marijuana use is dangerous in our state because it is not regulated because it is illegal. I think that there are safe ways to use marijuana, but I do not think that vaping is ever safe. 
## 7                                                                                                                                                                                                     I do not do it because it is bad for health.
## 8                                                                                                                                                                                         it\xd5s not good for you, it could harm those around you
## 9                                                                             Marijuana can be very beneficial to those suffering with chronic pain. However, Marijuana should be well regulated and only accessible to those with a prescription.
## 10                                                                                                                                                      I think that it has not been around long enough to know the harmful side effects of vaping
##                                                                                                                                                                                                                                      Vid.MolBio.Text
## 1                                                                                                I believe we should not allow this because it will create an even bigger divide in society because not everyone will have access to the technology.
## 2                                                                                                                                                                        I don't know enough information on human genome editing to give my opinion.
## 3                                                                                                                                                                                                          Designer babies can be bad for the world.
## 4                                                                                                                       I think that it is wrong and having a baby should be a natural process unless altering it to not have death related diseases
## 5                                                                                                                                                          I think the ethical and societal implications outweigh the other pros for genetic editing
## 6                                                                                                                                                                          I don't really know enough about this topic to make a thoughtful comment.
## 7                                                                                                                                                                                                                             Not sure what this is.
## 8                                                                                                                                                                                  i don\xd5t understand it enough to have a strong thought about it
## 9  I do not believe genetic editing is ethical if people are trying to make children look a certain way. This practice can be dangerous and is unnatural. Genetic editing should only be used when necessary to prevent a life threatening mutation.
## 10                                                                                                                                                                                                   I think that this is wrong based on my religion
##                                                                                                                                              Vid.Vaccines.Text
## 1  We should all try to get vaccinated but we shouldn\xd5t be forced to do anything we don\xd5t want to do. We also need to have correct information about it.
## 2                                                                                                                      Vaccines are good for society's health.
## 3                                                                           Misinformation has a really negative impact on whether or not people get vaccines.
## 4                                                                     I think vaccinations are important to the general public and stop disease from spreading
## 5                                                                                                       I do not know enough information on these vaccinations
## 6                                                                                I think that it is important to get vaccinated for the benefit of the public.
## 7                                                                                                   It can help people not spread diseases that they may have.
## 8                                                                       vaccines should be required, for not only the person\xd5s safety but those around them
## 9                          Vaccinations are soemtimes useful when preventing possible disease. Vaccines should be well studied and should not be experimental.
## 10                                                                                  I think that vaccines are good but they SHOULD NOT be forced on the public
##                                                                                             Vid.EnvInj.Text
## 1  I find the most concerning impact to be that these things are passed down from generation to generation.
## 2                                       I do not know enough about environmental racism to give my opinion.
## 3        Environmental racism and injustice is something that need to be worked on within the United State.
## 4                                             I don\xd5t know much about environmental racism and injustice
## 5                                                      I find environmental racism concerning and impactful
## 6                 I don't know much about environmental racism, but I do find the concept very concerning. 
## 7                                                      I am not aware of any environmental racism concerns.
## 8                                                         just take care of the environment and each other.
## 9                                                                 I am unfamiliar with environmental racism
## 10                                                              I have no clue what environmental racism is
##     X X.1 X.2 X.3 X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13 X.14 X.15
## 1  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 2  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 3  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 4  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 5  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 6  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 7  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 8  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 9  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA
## 10 NA  NA  NA  NA  NA  NA  NA  NA  NA  NA   NA   NA   NA   NA   NA   NA

summary(lm(rate ~ Section * Topic, data = structural_rate))

## 
## Call:
## lm(formula = rate ~ Section * Topic, data = structural_rate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24113 -0.08461 -0.02008  0.04872  0.40360 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.24153    0.04950   4.880 3.28e-06 ***
## Section                         0.02154    0.03210   0.671    0.503    
## TopicVid.MolBio.Text           -0.05913    0.17742  -0.333    0.740    
## TopicVid.Resp.Text             -0.03934    0.08607  -0.457    0.648    
## TopicVid.Vaccines.Text         -0.07817    0.13059  -0.599    0.551    
## Section:TopicVid.MolBio.Text   -0.09438    0.10549  -0.895    0.373    
## Section:TopicVid.Resp.Text     -0.05704    0.05731  -0.995    0.322    
## Section:TopicVid.Vaccines.Text -0.07044    0.08672  -0.812    0.418    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1376 on 121 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2532, Adjusted R-squared:   0.21 
## F-statistic: 5.862 on 7 and 121 DF,  p-value: 7.124e-06

structural_summary$Topic <- factor(
  structural_summary$Topic,
  levels = c(
    "Vid.EnvInj.Text",
    "Vid.Resp.Text",
    "Vid.Vaccines.Text",
    "Vid.MolBio.Text"
  )
)
ggplot(structural_summary,
       aes(x = Topic, y = mean_rate, color = factor(Section))) +
  geom_point(size = 4) +
  geom_errorbar(aes(ymin = mean_rate - se,
                    ymax = mean_rate + se),
                width = .1) +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Structural reasoning rate",
    x = "Topic",
    color = "Section"
  )

AI students appear more likely to:

connect science to systems

recognize social structures

interpret responsibility beyond individuals

library(dplyr)
library(tidyr)
library(tidytext)

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  ) %>%
  mutate(
    Section = na_if(as.character(Section), "NA"),
    Section = recode(Section, "1" = "AI", "2" = "Control")
  ) %>%
  filter(!is.na(Section), !is.na(Text), Text != "")


words_clean <- text_long %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word")

structural_by_response <- words_clean %>%
  group_by(SortingID, Section, Topic) %>%
  summarise(
    total_words = n(),
    structural_n = sum(word %in% structural_terms),
    structural_present = as.integer(structural_n > 0),
    structural_rate = structural_n / total_words,
    .groups = "drop"
  )

library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'lme4'

## The following object is masked from 'package:nlme':
## 
##     lmList

model_presence <- glmer(
  structural_present ~ Section + (1 | Topic) + (1 | SortingID),
  data = structural_by_response,
  family = binomial
)

summary(model_presence)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: structural_present ~ Section + (1 | Topic) + (1 | SortingID)
##    Data: structural_by_response
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     455.8     472.7    -223.9     447.8       500 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8812 -0.4372 -0.2473  0.3373  4.4931 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SortingID (Intercept) 1.040    1.020   
##  Topic     (Intercept) 2.209    1.486   
## Number of obs: 504, groups:  SortingID, 133; Topic, 4
## 
## Fixed effects:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -1.7854     0.7781  -2.295   0.0218 *
## SectionControl   0.3311     0.3119   1.061   0.2885  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## SectinCntrl -0.175

exp(fixef(model_presence))

##    (Intercept) SectionControl 
##      0.1677246      1.3924618

exp(confint(model_presence, parm = "beta_", method = "Wald"))

##                     2.5 %    97.5 %
## (Intercept)    0.03649859 0.7707571
## SectionControl 0.75555417 2.5662618

library(lme4)

model_rate <- lmer(
  structural_rate ~ Section + (1 | Topic) + (1 | SortingID),
  data = structural_by_response
)

summary(model_rate)

## Linear mixed model fit by REML ['lmerMod']
## Formula: structural_rate ~ Section + (1 | Topic) + (1 | SortingID)
##    Data: structural_by_response
## 
## REML criterion at convergence: -853.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7268 -0.3660 -0.0739  0.0050  4.9553 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.
##  SortingID (Intercept) 0.0001068 0.01033 
##  Topic     (Intercept) 0.0059033 0.07683 
##  Residual              0.0101001 0.10050 
## Number of obs: 504, groups:  SortingID, 133; Topic, 4
## 
## Fixed effects:
##                Estimate Std. Error t value
## (Intercept)    0.052119   0.038868   1.341
## SectionControl 0.008608   0.009320   0.924
## 
## Correlation of Fixed Effects:
##             (Intr)
## SectinCntrl -0.096

model_rate_topic <- lmer(
  structural_rate ~ Section * Topic + (1 | SortingID),
  data = structural_by_response
)

summary(model_rate_topic)

## Linear mixed model fit by REML ['lmerMod']
## Formula: structural_rate ~ Section * Topic + (1 | SortingID)
##    Data: structural_by_response
## 
## REML criterion at convergence: -838.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0270 -0.3505 -0.0757 -0.0184  5.1401 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.
##  SortingID (Intercept) 0.0001306 0.01143 
##  Residual              0.0099822 0.09991 
## Number of obs: 504, groups:  SortingID, 133
## 
## Fixed effects:
##                                       Estimate Std. Error t value
## (Intercept)                            0.14740    0.01202  12.264
## SectionControl                         0.05403    0.01896   2.849
## TopicVid.MolBio.Text                  -0.14300    0.01661  -8.610
## TopicVid.Resp.Text                    -0.10253    0.01645  -6.231
## TopicVid.Vaccines.Text                -0.13727    0.01641  -8.367
## SectionControl:TopicVid.MolBio.Text   -0.05513    0.02623  -2.101
## SectionControl:TopicVid.Resp.Text     -0.06425    0.02592  -2.479
## SectionControl:TopicVid.Vaccines.Text -0.05787    0.02596  -2.229
## 
## Correlation of Fixed Effects:
##             (Intr) SctnCn TV.MB. TV.R.T TV.V.T SC:TV.M SC:TV.R
## SectinCntrl -0.634                                            
## TpcVd.MlB.T -0.715  0.453                                     
## TpcVd.Rsp.T -0.722  0.458  0.522                              
## TpcVd.Vcc.T -0.724  0.459  0.524  0.529                       
## ScC:TV.MB.T  0.453 -0.714 -0.633 -0.331 -0.332                
## SctC:TV.R.T  0.458 -0.723 -0.332 -0.635 -0.336  0.523         
## SctC:TV.V.T  0.458 -0.722 -0.331 -0.334 -0.632  0.522   0.528

library(ggeffects)
library(ggplot2)



model_topic <- glmer(
  structural_present ~ Section * Topic + (1 | SortingID),
  data = structural_by_response,
  family = binomial
)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00267592 (tol = 0.002, component 1)

summary(model_topic)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: structural_present ~ Section * Topic + (1 | SortingID)
##    Data: structural_by_response
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     444.4     482.4    -213.2     426.4       495 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5620 -0.3672 -0.2174  0.2520  6.3521 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SortingID (Intercept) 1.406    1.186   
## Number of obs: 504, groups:  SortingID, 133
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                             0.3842     0.3127   1.229    0.219    
## SectionControl                          0.7216     0.5116   1.411    0.158    
## TopicVid.MolBio.Text                   -4.1764     0.7307  -5.715 1.09e-08 ***
## TopicVid.Resp.Text                     -1.6779     0.4263  -3.936 8.29e-05 ***
## TopicVid.Vaccines.Text                 -3.2420     0.5603  -5.786 7.19e-09 ***
## SectionControl:TopicVid.MolBio.Text     0.0754     0.9546   0.079    0.937    
## SectionControl:TopicVid.Resp.Text      -0.7639     0.6555  -1.165    0.244    
## SectionControl:TopicVid.Vaccines.Text  -0.6430     0.8137  -0.790    0.429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SctnCn TV.MB. TV.R.T TV.V.T SC:TV.M SC:TV.R
## SectinCntrl -0.593                                            
## TpcVd.MlB.T -0.389  0.171                                     
## TpcVd.Rsp.T -0.627  0.325  0.389                              
## TpcVd.Vcc.T -0.499  0.231  0.369  0.480                       
## ScC:TV.MB.T  0.265 -0.451 -0.646 -0.193 -0.151                
## SctC:TV.R.T  0.375 -0.656 -0.135 -0.548 -0.183  0.367         
## SctC:TV.V.T  0.306 -0.534 -0.117 -0.211 -0.538  0.309   0.434 
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00267592 (tol = 0.002, component 1)

VarCorr(model_presence)

##  Groups    Name        Std.Dev.
##  SortingID (Intercept) 1.0196  
##  Topic     (Intercept) 1.4861

student_structural <- structural_by_response %>%
  group_by(SortingID, Section) %>%
  summarise(
    mean_rate = mean(structural_rate, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(student_structural,
       aes(x = as.factor(Section), y = mean_rate)) +
  geom_violin(fill = "grey80") +
  geom_boxplot(width = .1) +
  geom_jitter(width = .1, alpha = .5) +
  theme_bw()

student_structural <- student_structural %>%
  mutate(level = case_when(
    mean_rate == 0 ~ "None",
    mean_rate < 0.05 ~ "Low",
    mean_rate < 0.10 ~ "Moderate",
    TRUE ~ "High"
  ))

student_structural <- student_structural %>%
  filter(!is.na(Section))

ggplot(student_structural,
       aes(x = level, fill = factor(Section))) +
  geom_bar(position = "dodge") +
  theme_bw() +
  labs(
    x = "Structural reasoning level",
    y = "Number of students",
    fill = "Section"
  )

student_structural$level <- factor(
  student_structural$level,
  levels = c("None","Low","Moderate","High")
)

ggplot(student_structural,
       aes(x = factor(Section), fill = level)) +
  geom_bar(position = "fill") +
  theme_bw() +
  labs(
    x = "Section",
    y = "Proportion of students",
    fill = "Structural reasoning level"
  )

table_levels <- table(student_structural$Section,
                      student_structural$level)

chisq.test(table_levels)

## 
##  Pearson's Chi-squared test
## 
## data:  table_levels
## X-squared = 1.695, df = 3, p-value = 0.638

This means many students never used structural reasoning language in any response.

That is actually a major finding:

Structural reasoning is not a default explanation style for most students.

This is very consistent with research showing students tend to default to proximate explanations (pollution, waste, sickness) rather than systemic explanations.

Test whether IA changes how students articulate structural reasoning

library(dplyr)
library(tidyr)
library(tidytext)

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  ) %>%
  mutate(
    Section = na_if(as.character(Section), "NA")
  ) %>%
  filter(!is.na(Section), !is.na(Text), Text != "")

words_clean <- text_long %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word")

structural_by_response <- words_clean %>%
  group_by(SortingID, Section, Topic) %>%
  summarise(
    total_words = n(),
    structural_n = sum(word %in% structural_terms),
    structural_present = structural_n > 0,
    structural_rate = structural_n / total_words,
    .groups = "drop"
  )

library(dplyr)
library(tidyr)
library(tidytext)

structural_terms <- c(
  "system","systemic","structure","structural",
  "policy","policies","regulation","regulated",
  "government","state","institution","institutions",
  "industry","corporate","corporation","companies",
  "power","control",
  "inequality","inequity","equity",
  "justice","racism","discrimination",
  "community","communities","neighborhood","neighborhoods",
  "environmental","environment",
  "media","narrative","framing",
  "economic","economics","capitalism","market"
)

text_long <- resp %>%
  pivot_longer(
    cols = starts_with("Vid."),
    names_to = "Topic",
    values_to = "Text"
  ) %>%
  mutate(
    Section = na_if(as.character(Section), "NA")
  ) %>%
  filter(!is.na(Section), !is.na(Text), Text != "")

words_clean <- text_long %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word")

structural_by_response <- words_clean %>%
  group_by(SortingID, Section, Topic) %>%
  summarise(
    total_words = n(),
    structural_n = sum(word %in% structural_terms),
    structural_present = structural_n > 0,
    structural_rate = structural_n / total_words,
    .groups = "drop"
  )

structural_responses <- structural_by_response %>%
  filter(structural_present)

structural_words_only <- words_clean %>%
  semi_join(structural_responses, by = c("SortingID", "Section", "Topic"))

library(tidylo)

word_counts_structural <- structural_words_only %>%
  count(Section, word) %>%
  group_by(word) %>%
  filter(sum(n) >= 3) %>%
  ungroup()

log_odds_structural <- word_counts_structural %>%
  bind_log_odds(set = Section, feature = word, n = n)

library(ggplot2)

plot_words <- log_odds_structural %>%
  slice_max(abs(log_odds_weighted), n = 20) %>%
  mutate(
    more_common = ifelse(log_odds_weighted > 0, "AI", "Control"),
    word = reorder(word, log_odds_weighted)
  )

ggplot(plot_words,
       aes(x = word, y = log_odds_weighted, fill = more_common)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
  theme_bw() +
  labs(
    x = NULL,
    y = "Weighted log odds",
    fill = "More common in",
    title = "How AI vs Control articulate structural reasoning"
  )

Check for reasoning archetypes

student_profiles <- structural_by_response %>%
  group_by(SortingID, Section) %>%
  summarise(
    n_topics = n(),
    n_structural = sum(structural_present),
    prop_structural = mean(structural_present),
    mean_rate = mean(structural_rate, na.rm = TRUE),
    max_rate = max(structural_rate, na.rm = TRUE),
    .groups = "drop"
  )
student_profiles <- student_profiles %>%
  mutate(archetype = case_when(
    n_structural == 0 ~ "None",
    prop_structural <= 0.25 ~ "Occasional",
    prop_structural <= 0.75 ~ "Mixed",
    prop_structural > 0.75 ~ "Consistent"
  ))
ggplot(student_profiles,
       aes(x = factor(Section), fill = archetype)) +
  geom_bar(position = "fill") +
  theme_bw() +
  labs(
    x = "Section",
    y = "Proportion of students",
    fill = "Reasoning archetype",
    title = "Student structural reasoning archetypes by section"
  )

archetype_tab <- table(student_profiles$Section, student_profiles$archetype)
chisq.test(archetype_tab)

## Warning in chisq.test(archetype_tab): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  archetype_tab
## X-squared = 2.1101, df = 3, p-value = 0.5499

archetype_tab

##    
##     Consistent Mixed None Occasional
##   1          1    18   30         31
##   2          0    17   17         19

ggplot(student_profiles, aes(x = factor(Section), y = prop_structural)) +
  geom_violin(fill = "grey85") +
  geom_boxplot(width = 0.15, outlier.shape = NA) +
  geom_jitter(width = 0.08, alpha = 0.6) +
  theme_bw() +
  labs(
    x = "Section",
    y = "Proportion of responses with structural reasoning",
    title = "Cross-topic consistency in structural reasoning"
  )

Madeline_Intro_bio

Abby Beatty

2024-04-25