Overview:
library(dplyr)
library(ggplot2)
library(tidyr)
library(nlme)
s1=read.csv("Bio1010S1.csv")
s2=read.csv("Bio1010S2.csv")
s2 <- s2[, -4]
user.treatment <- bind_rows(list(s1 = s1, s2 = s2), .id = "source")
section_info <- unique(user.treatment)
quizzes=read.csv("Quiz_data_combined.csv")
quizzes$Username=as.factor(quizzes$Username)
(unique(quizzes$Username))#228 unique users with quiz data
#Create a "type" column that labels each question as either an "attention" question or a "viewpoint"
quizzes$Type <- ifelse(grepl("TRUE|FALSE", quizzes$Response.Text, ignore.case = TRUE), "attention", "viewpoint")
#Clean the user emails to be only the first three letters of their name and their 4 digit identifier
quizzes$User.Email <- substr(quizzes$User.Email, 1, 7)
#add in user data for each occurrence
quizzes <- merge(quizzes, section_info[, c("SIS.Login.ID", "source")], by.x = "User.Email", by.y = "SIS.Login.ID", all.x = TRUE)
#Assess quiz responses for engagement using the "attention" check questions
###subset quiz data to only include the attention check questions
att.check=subset(quizzes, Type == "attention")
#Create a "score" column, where if their response matches the correct response, we give them a score of 1, and otherwise they get a score of 0.
att.check$score <- ifelse(att.check$Correct. == "TRUE", 1, 0)
# Sum total scores for each "User.Email" by "Topic"
total_scores <- att.check %>%
group_by(User.Email, Topic) %>%
summarize(total_score = sum(score, na.rm = TRUE)) %>%
ungroup()
# Identify users and topics where the score is zero
zero_scores <- total_scores %>%
filter(total_score == 0)
# Remove these users' data for topics where they scored zero
att.check <- att.check %>%
anti_join(zero_scores, by = c("User.Email", "Topic"))
# Calculate average score per section grouped by topic and section
avg_scores <- att.check %>%
group_by(Topic, Section) %>%
summarize(
avg_score = mean(score, na.rm = TRUE),
n = sum(!is.na(score)),
sd_score = sd(score, na.rm = TRUE),
se_score = sd_score / sqrt(n),
.groups = "drop"
)
This table prints the average score total and the standard error range for each of the topics by treatment.
table_pretty <- avg_scores %>%
mutate(
result = paste0(
round(avg_score, 3),
" ± ",
round(se_score, 3)
)
) %>%
dplyr::select(Topic, Section, result) %>%
tidyr::pivot_wider(
names_from = Section,
values_from = result
)
table_pretty
## # A tibble: 6 × 3
## Topic Control IA
## <chr> <chr> <chr>
## 1 BioDiv 0.778 ± 0.033 0.877 ± 0.021
## 2 Digestion 1 ± 0 1 ± 0
## 3 EnvInjustice 0.9 ± 0.025 0.981 ± 0.01
## 4 MolBiology 0.95 ± 0.017 0.71 ± 0.028
## 5 Respiration 0.796 ± 0.031 0.749 ± 0.027
## 6 Vaccines 0.825 ± 0.03 0.886 ± 0.02
# Create a ggplot object to visualize the data
ggplot(avg_scores, aes(x = Section, y = avg_score)) +
geom_col(fill = "blue") +
geom_errorbar(
aes(ymin = avg_score - se_score, ymax = avg_score + se_score),
width = 0.2
) +
facet_wrap(~ Topic, scales = "free") +
labs(
title = "Average Attention Check Score per Section",
x = "Section",
y = "Average Score"
) +
theme_minimal()
#include random variable of User.Email to account for repeated assessment
#output is for all topics combined
summary(lme(fixed = score ~ Section, random = ~ 1 | User.Email, data = att.check))
## Linear mixed-effects model fit by REML
## Data: att.check
## AIC BIC logLik
## 1697.693 1720.543 -844.8465
##
## Random effects:
## Formula: ~1 | User.Email
## (Intercept) Residual
## StdDev: 0.0001214032 0.3519632
##
## Fixed effects: score ~ Section
## Value Std.Error DF t-value p-value
## (Intercept) 0.8636884 0.01191216 2011 72.50480 0.0000
## SectionIA -0.0138716 0.01525296 225 -0.90944 0.3641
## Correlation:
## (Intr)
## SectionIA -0.781
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.4539174 0.3872892 0.4267008 0.4267012 0.4267021
##
## Number of Observations: 2238
## Number of Groups: 227
#assess each topic independently
split_data <- split(att.check, att.check$Topic)
# Function to fit a linear mixed-effects model for each topic
fit_model <- function(df) {
lme(fixed = score ~ Section, random = ~ 1 | User.Email, data = df)
}
# Apply the function to each subset of data and store the models in a list
models <- lapply(split_data, fit_model)
# To see the summary for each model
summaries <- lapply(models, summary)
# Optional: print summaries for review
summaries
## $BioDiv
## Linear mixed-effects model fit by REML
## Data: df
## AIC BIC logLik
## 349.4332 365.399 -170.7166
##
## Random effects:
## Formula: ~1 | User.Email
## (Intercept) Residual
## StdDev: 9.492587e-06 0.3659177
##
## Fixed effects: score ~ Section
## Value Std.Error DF t-value p-value
## (Intercept) 0.7784810 0.02911085 201 26.741953 0.000
## SectionIA 0.0985682 0.03736569 199 2.637932 0.009
## Correlation:
## (Intr)
## SectionIA -0.779
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.3968481 0.3360067 0.3360067 0.6053792 0.6053792
##
## Number of Observations: 402
## Number of Groups: 201
##
## $Digestion
## Linear mixed-effects model fit by REML
## Data: df
## AIC BIC logLik
## -15503.43 -15489.83 7755.714
##
## Random effects:
## Formula: ~1 | User.Email
## (Intercept) Residual
## StdDev: 1.331307e-16 2.777743e-17
##
## Fixed effects: score ~ Section
## Value Std.Error DF t-value p-value
## (Intercept) 1 1.44974e-17 221 6.897788e+16 0.0000
## SectionIA 0 1.86327e-17 221 -1.000000e+00 0.4207
## Correlation:
## (Intr)
## SectionIA -0.778
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## 0 0 0 0 0
##
## Number of Observations: 223
## Number of Groups: 223
##
## $EnvInjustice
## Linear mixed-effects model fit by REML
## Data: df
## AIC BIC logLik
## -51.04712 -35.67294 29.52356
##
## Random effects:
## Formula: ~1 | User.Email
## (Intercept) Residual
## StdDev: 1.484812e-05 0.2188422
##
## Fixed effects: score ~ Section
## Value Std.Error DF t-value p-value
## (Intercept) 0.9000000 0.01849554 173 48.66038 0e+00
## SectionIA 0.0806763 0.02394675 172 3.36899 9e-04
## Correlation:
## (Intr)
## SectionIA -0.772
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -4.48120344 0.08829957 0.08829957 0.45695030 0.45695032
##
## Number of Observations: 347
## Number of Groups: 174
##
## $MolBiology
## Linear mixed-effects model fit by REML
## Data: df
## AIC BIC logLik
## 404.0105 420.1715 -198.0052
##
## Random effects:
## Formula: ~1 | User.Email
## (Intercept) Residual
## StdDev: 1.207599e-05 0.3828284
##
## Fixed effects: score ~ Section
## Value Std.Error DF t-value p-value
## (Intercept) 0.9500000 0.03026525 211 31.389139 0
## SectionIA -0.2400763 0.03841048 209 -6.250281 0
## Correlation:
## (Intr)
## SectionIA -0.788
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.4815293 0.1306068 0.1306068 0.7577189 0.7577189
##
## Number of Observations: 422
## Number of Groups: 211
##
## $Respiration
## Linear mixed-effects model fit by REML
## Data: df
## AIC BIC logLik
## 496.4418 512.6782 -244.2209
##
## Random effects:
## Formula: ~1 | User.Email
## (Intercept) Residual
## StdDev: 1.523438e-05 0.4228163
##
## Fixed effects: score ~ Section
## Value Std.Error DF t-value p-value
## (Intercept) 0.7964072 0.03271851 214 24.341184 0.0000
## SectionIA -0.0473578 0.04183597 214 -1.131987 0.2589
## Correlation:
## (Intr)
## SectionIA -0.782
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -1.8835774 0.4815160 0.4815160 0.5935215 0.5935215
##
## Number of Observations: 430
## Number of Groups: 216
##
## $Vaccines
## Linear mixed-effects model fit by REML
## Data: df
## AIC BIC logLik
## 308.808 324.8921 -150.404
##
## Random effects:
## Formula: ~1 | User.Email
## (Intercept) Residual
## StdDev: 1.685284e-05 0.3441218
##
## Fixed effects: score ~ Section
## Value Std.Error DF t-value p-value
## (Intercept) 0.8250000 0.02720522 207 30.325062 0.0000
## SectionIA 0.0608268 0.03473246 205 1.751295 0.0814
## Correlation:
## (Intr)
## SectionIA -0.783
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.5741660 0.3317814 0.3317814 0.5085408 0.5085408
##
## Number of Observations: 414
## Number of Groups: 207
viewpoint=subset(quizzes, Type == "viewpoint")
#Remove all user info on a per topic basis if they scored a "0" on the attention check for that topic
viewpoint <- viewpoint %>%
anti_join(zero_scores, by = c("User.Email", "Topic"))
viewpoint$Question.Text <- enc2utf8(as.character(viewpoint$Question.Text))
viewpoint$Question.Text <- iconv(viewpoint$Question.Text, to = "UTF-8", sub = "")
viewpoint$Question.Text <- trimws(viewpoint$Question.Text)
viewpoint$Question.Text <- gsub("\\s+", " ", viewpoint$Question.Text)
###add in a column called "score" with likert ranging from 1-5 (strongly disagree to strongly agree)
viewpoint$score <- case_when(
grepl("Strongly Disagree", viewpoint$Response.Text, ignore.case = TRUE) ~ 1,
grepl("Neither Agree nor Disagree", viewpoint$Response.Text, ignore.case = TRUE) ~ 3,
grepl("Strongly Agree", viewpoint$Response.Text, ignore.case = TRUE) ~ 5,
grepl("Disagree", viewpoint$Response.Text, ignore.case = TRUE) ~ 2,
grepl("Agree", viewpoint$Response.Text, ignore.case = TRUE) ~ 4,
TRUE ~ NA_real_
)
viewpoint <- viewpoint %>%
mutate(
Question.Text = case_when(
Question.Text %in% c(
"Marijuana is a useful medical tool that should be available only by PRESCRIPTION for treating various illnesses.",
"Medical marijuana is a useful medical tool that should be available only by PRESCRIPTION for treating various illnesses."
) ~ "Medical marijuana is a useful medical tool that should be available only by PRESCRIPTION for treating various illnesses.",
TRUE ~ Question.Text
)
)
#Calculate the average score per question per section
# Calculate the average score per question per section
average_scores <- viewpoint %>%
group_by(Section, Question.Text, Topic) %>%
summarize(avg_score = mean(score, na.rm = TRUE), .groups = "drop")
library(tidyr)
# Pivot to wide format so each Section has its own column
pivoted_scores <- average_scores %>%
pivot_wider(names_from = Section, values_from = avg_score)
# Check column names
print(colnames(pivoted_scores))
# Calculate the difference between IA and Control
pivoted_scores <- pivoted_scores %>%
mutate(score_difference = IA - Control)
# Optional: round numeric columns for easier reading
pivoted_scores <- pivoted_scores %>%
mutate(
IA = round(IA, 3),
Control = round(Control, 3),
score_difference = round(score_difference, 3)
)
print(pivoted_scores)
pivoted_scores <- pivoted_scores %>%
mutate(across(where(is.character), ~ iconv(.x, from = "", to = "UTF-8", sub = "")))
library(DT)
datatable(
pivoted_scores,
options = list(
pageLength = 57,
scrollX = TRUE,
columnDefs = list(
list(
targets = 0:(ncol(pivoted_scores) - 1),
className = "dt-left"
)
)
),
rownames = FALSE
) %>%
formatStyle(
columns = names(pivoted_scores),
whiteSpace = "normal"
)
average_scores_topic <- viewpoint %>%
group_by(Section, Topic) %>%
summarize(
avg_score = mean(score, na.rm = TRUE),
sd_score = sd(score, na.rm = TRUE),
n = sum(!is.na(score)),
se_score = sd_score / sqrt(n),
.groups = "drop"
)
average_scores_topic
## # A tibble: 12 × 6
## Section Topic avg_score sd_score n se_score
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 Control BioDiv 3.80 1.09 547 0.0467
## 2 Control Digestion 3.42 1.29 207 0.0896
## 3 Control EnvInjustice 4.00 0.975 560 0.0412
## 4 Control MolBiology 3.55 1.07 480 0.0490
## 5 Control Respiration 4.05 1.07 328 0.0591
## 6 Control Vaccines 3.89 0.954 560 0.0403
## 7 IA BioDiv 3.75 1.11 847 0.0382
## 8 IA Digestion 3.40 1.26 330 0.0692
## 9 IA EnvInjustice 4.03 0.894 824 0.0312
## 10 IA MolBiology 3.51 1.03 778 0.0369
## 11 IA Respiration 4.09 1.14 523 0.0498
## 12 IA Vaccines 3.84 0.981 889 0.0329
plot_topic <- ggplot(average_scores_topic,
aes(x = Topic, y = avg_score, fill = Section)) +
geom_col(position = position_dodge(width = 0.8)) +
geom_errorbar(
aes(ymin = avg_score - se_score,
ymax = avg_score + se_score),
position = position_dodge(width = 0.8),
width = 0.2
) +
labs(
title = "Average Viewpoint Score by Topic and Section",
x = "Topic",
y = "Average Likert Score"
) +
theme_minimal()
print(plot_topic)
#Create a df with an average Viewpoint score for each student by topic
user_topic_scores <- viewpoint %>%
group_by(User.Email, Topic) %>%
summarize(
avg_score = mean(score, na.rm = TRUE),
n_questions = sum(!is.na(score)),
.groups = "drop"
)
user_topic_scores
user_topic_scores_wide <- user_topic_scores %>%
pivot_wider(
names_from = Topic,
values_from = avg_score
)
user_topic_scores_wide
user_topic_scores_wide_collapsed <- user_topic_scores_wide %>%
group_by(User.Email) %>%
summarize(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
mutate(
Average = rowMeans(across(-User.Email), na.rm = TRUE)
)
library(lavaan)
surv=read.csv("survey.quant.final.csv")
#remove any empty rows
surv <- surv[apply(surv, 1, function(x) !all(x == "" | is.na(x))), ]
#Remove rows with additional question text (non-data rows)
surv <- surv[-c(2, 3), ]
#only keep data rows who complete the survey and who have consented to participation
surv=subset(surv, Consent == "I Consent")
surv=subset(surv, Progress == "100")
#change construct likert scales to numeric
surv[, 88:146] <- lapply(surv[, 88:146], as.numeric)
#Q141 is the studentID in email form
surv$Email <- tolower(substr(surv$Email, 1, 7))
surv <- merge(surv, section_info[, c("SIS.Login.ID", "source")], by.x = "Email", by.y = "SIS.Login.ID", all.x = TRUE)
# CE.1 through CE.8
# reverse #5-8
columns_to_reverse <- paste0("CE.", 5:8)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]
#Create model that contains all questions related to grit
(CE.model <- '
CogEng =~ CE.1 + CE.2 + CE.3 + CE.4 + CE.5 + CE.6 + CE.7 + CE.8')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionCE <- cfa(CE.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionCE, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionCE, "std", edge.label.cex =1, label.cex=1)
#Create model that contains all questions related to CE
(CEb.model <- '
CogEng =~ CE.1 + CE.5 + CE.6 + CE.7 + CE.8')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionCEb <- cfa(CEb.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionCEb, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionCEb, "std", edge.label.cex =1, label.cex=1)
#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionCEb, "case.idx")
fscores <- lavPredict(solutionCEb, type="lv")
## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
surv[idx, fs] <- fscores[ , fs]
}
summary(lme(CogEng ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))
## Linear mixed-effects model fit by REML
## Data: surv
## AIC BIC logLik
## 223.1895 238.0179 -107.5947
##
## Random effects:
## Formula: ~1 | Email
## (Intercept) Residual
## StdDev: 0.1445638 0.3099086
##
## Fixed effects: CogEng ~ Timepoint
## Value Std.Error DF t-value p-value
## (Intercept) -0.04031050 0.03029389 207 -1.330648 0.1848
## TimepointPre 0.06565277 0.03737536 94 1.756579 0.0822
## Correlation:
## (Intr)
## TimepointPre -0.732
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.9509088 -0.4653659 0.1101215 0.6423174 2.0215912
##
## Number of Observations: 303
## Number of Groups: 208
columns_to_reverse <- paste0("EE.", 5:10)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]
#Create model that contains all questions related to grit
(EE.model <- '
EmEng =~ EE.1 + EE.2 + EE.3 + EE.4 + EE.5 + EE.6 + EE.7 + EE.8 + EE.9 + EE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionEE <- cfa(EE.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionEE, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionEE, "std", edge.label.cex =1, label.cex=1)
#Create model that contains all questions related to grit
(EEb.model <- '
EmEng =~ EE.3 + EE.4 + EE.5 + EE.8 + EE.9 + EE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionEEb <- cfa(EEb.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionEEb, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionEEb, "std", edge.label.cex =1, label.cex=1)
#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionEEb, "case.idx")
fscores <- lavPredict(solutionEEb, type="lv")
## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
surv[idx, fs] <- fscores[ , fs]
}
summary(lme(EmEng ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))
columns_to_reverse <- paste0("AEE.", 3)
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]
#Create model that contains all questions related to grit
(AEE.model <- '
AdEmp =~ AEE.1 + AEE.2 + AEE.3 + AEE.4 + AEE.5 + AEE.6 + AEE.7 + AEE.8 + AEE.9 + AEE.10')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionAEE <- cfa(AEE.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionAEE, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionAEE, "std", edge.label.cex =1, label.cex=1)
#Create model that contains all questions related to grit
(AEEb.model <- '
AdEmp =~ AEE.1 + AEE.2 + AEE.4 + AEE.6 + AEE.7 + AEE.9 ')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionAEEb <- cfa(AEEb.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionAEEb, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionAEEb, "std", edge.label.cex =1, label.cex=1)
#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionAEEb, "case.idx")
fscores <- lavPredict(solutionAEEb, type="lv")
## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
surv[idx, fs] <- fscores[ , fs]
}
columns_to_reverse <- paste0("IRI.", c(2,3,7,8,9))
surv[columns_to_reverse] <- 8 - surv[columns_to_reverse]
#Create model that contains all questions related to grit
(IRI.model <- '
IntReact =~ IRI.1 + IRI.2 + IRI.3 + IRI.4 + IRI.5 + IRI.6 + IRI.7 + IRI.8 + IRI.9 + IRI.10 + IRI.11 + IRI.12 + IRI.13 + IRI.14')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionIRI <- cfa(IRI.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionIRI, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionIRI, "std", edge.label.cex =1, label.cex=1)
#Create model that contains all questions related to grit
(IRIb.model <- '
IntReact =~ IRI.3 + IRI.4 + IRI.5 + IRI.7 + IRI.8 + IRI.9 + IRI.10 + IRI.11 + IRI.12 + IRI.13 + IRI.14')
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionIRIb <- cfa(IRIb.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionIRIb, fit.measures=T, standardized=TRUE)
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionIRIb, "std", edge.label.cex =1, label.cex=1)
#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionIRIb, "case.idx")
fscores <- lavPredict(solutionIRIb, type="lv")
## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
surv[idx, fs] <- fscores[ , fs]
}
summary(lme(IntReact ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))
#Create model that contains all questions related to grit
(SI.model <- '
SciId =~ SI.1 + SI.2 + SI.3 + SI.4 + SI.5')
## [1] "\nSciId =~ SI.1 + SI.2 + SI.3 + SI.4 + SI.5"
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSI <- cfa(SI.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSI, fit.measures=T, standardized=TRUE)
## lavaan 0.6-21 ended normally after 25 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 10
##
## Used Total
## Number of observations 294 392
##
## Model Test User Model:
## Standard Scaled
## Test Statistic 38.708 30.035
## Degrees of freedom 5 5
## P-value (Chi-square) 0.000 0.000
## Scaling correction factor 1.289
## Satorra-Bentler correction
##
## Model Test Baseline Model:
##
## Test statistic 568.889 370.321
## Degrees of freedom 10 10
## P-value 0.000 0.000
## Scaling correction factor 1.536
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.940 0.931
## Tucker-Lewis Index (TLI) 0.879 0.861
##
## Robust Comparative Fit Index (CFI) 0.942
## Robust Tucker-Lewis Index (TLI) 0.883
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -2471.029 -2471.029
## Loglikelihood unrestricted model (H1) -2451.675 -2451.675
##
## Akaike (AIC) 4962.057 4962.057
## Bayesian (BIC) 4998.893 4998.893
## Sample-size adjusted Bayesian (SABIC) 4967.181 4967.181
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.151 0.131
## 90 Percent confidence interval - lower 0.109 0.093
## 90 Percent confidence interval - upper 0.198 0.172
## P-value H_0: RMSEA <= 0.050 0.000 0.000
## P-value H_0: RMSEA >= 0.080 0.997 0.985
##
## Robust RMSEA 0.148
## 90 Percent confidence interval - lower 0.100
## 90 Percent confidence interval - upper 0.201
## P-value H_0: Robust RMSEA <= 0.050 0.001
## P-value H_0: Robust RMSEA >= 0.080 0.988
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.045 0.045
##
## Parameter Estimates:
##
## Standard errors Robust.sem
## Information Expected
## Information saturated (h1) model Structured
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## SciId =~
## SI.1 1.000 1.253 0.791
## SI.2 0.323 0.068 4.770 0.000 0.405 0.304
## SI.3 0.973 0.071 13.716 0.000 1.219 0.814
## SI.4 1.095 0.083 13.142 0.000 1.372 0.815
## SI.5 0.893 0.082 10.942 0.000 1.119 0.651
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .SI.1 0.937 0.131 7.159 0.000 0.937 0.374
## .SI.2 1.610 0.149 10.808 0.000 1.610 0.908
## .SI.3 0.757 0.122 6.201 0.000 0.757 0.337
## .SI.4 0.949 0.215 4.422 0.000 0.949 0.335
## .SI.5 1.700 0.223 7.620 0.000 1.700 0.576
## SciId 1.570 0.197 7.971 0.000 1.000 1.000
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSI, "std", edge.label.cex =1, label.cex=1)
#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSI, "case.idx")
fscores <- lavPredict(solutionSI, type="lv")
## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
surv[idx, fs] <- fscores[ , fs]
}
summary(lme(SciId ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))
## Linear mixed-effects model fit by REML
## Data: surv
## AIC BIC logLik
## 880.3411 895.0481 -436.1706
##
## Random effects:
## Formula: ~1 | Email
## (Intercept) Residual
## StdDev: 0.9961904 0.6346924
##
## Fixed effects: SciId ~ Timepoint
## Value Std.Error DF t-value p-value
## (Intercept) 0.2955271 0.09753661 202 3.029910 0.0028
## TimepointPre -0.3964644 0.08768428 90 -4.521499 0.0000
## Correlation:
## (Intr)
## TimepointPre -0.573
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -1.94877302 -0.44828314 -0.08856004 0.44676570 2.50132821
##
## Number of Observations: 294
## Number of Groups: 203
#Create model that contains all questions related to grit
(SCV.model <- '
SciComVal=~ SCE.1 + SCE.2 + SCE.3 + SCE.4 + SCE.5')
## [1] "\nSciComVal=~ SCE.1 + SCE.2 + SCE.3 + SCE.4 + SCE.5"
#run confirmatory factor analysis on those questions using the MLM mode explained above
solutionSCV <- cfa(SCV.model, data=surv, estimator= "MLM")
#Collect summary output, including all fitness measures based on the standardized outputs.
summary(solutionSCV, fit.measures=T, standardized=TRUE)
## lavaan 0.6-21 ended normally after 27 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 10
##
## Used Total
## Number of observations 340 392
##
## Model Test User Model:
## Standard Scaled
## Test Statistic 5.143 3.842
## Degrees of freedom 5 5
## P-value (Chi-square) 0.399 0.572
## Scaling correction factor 1.339
## Satorra-Bentler correction
##
## Model Test Baseline Model:
##
## Test statistic 415.314 335.697
## Degrees of freedom 10 10
## P-value 0.000 0.000
## Scaling correction factor 1.237
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 1.000 1.000
## Tucker-Lewis Index (TLI) 0.999 1.007
##
## Robust Comparative Fit Index (CFI) 1.000
## Robust Tucker-Lewis Index (TLI) 1.008
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -2225.301 -2225.301
## Loglikelihood unrestricted model (H1) -2222.729 -2222.729
##
## Akaike (AIC) 4470.601 4470.601
## Bayesian (BIC) 4508.891 4508.891
## Sample-size adjusted Bayesian (SABIC) 4477.169 4477.169
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.009 0.000
## 90 Percent confidence interval - lower 0.000 0.000
## 90 Percent confidence interval - upper 0.076 0.058
## P-value H_0: RMSEA <= 0.050 0.772 0.914
## P-value H_0: RMSEA >= 0.080 0.038 0.006
##
## Robust RMSEA 0.000
## 90 Percent confidence interval - lower 0.000
## 90 Percent confidence interval - upper 0.076
## P-value H_0: Robust RMSEA <= 0.050 0.823
## P-value H_0: Robust RMSEA >= 0.080 0.039
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.020 0.020
##
## Parameter Estimates:
##
## Standard errors Robust.sem
## Information Expected
## Information saturated (h1) model Structured
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## SciComVal =~
## SCE.1 1.000 0.463 0.511
## SCE.2 1.716 0.198 8.683 0.000 0.794 0.766
## SCE.3 1.422 0.144 9.906 0.000 0.658 0.746
## SCE.4 1.283 0.179 7.150 0.000 0.594 0.491
## SCE.5 1.490 0.187 7.987 0.000 0.690 0.656
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .SCE.1 0.606 0.045 13.588 0.000 0.606 0.739
## .SCE.2 0.444 0.078 5.729 0.000 0.444 0.414
## .SCE.3 0.346 0.060 5.755 0.000 0.346 0.444
## .SCE.4 1.111 0.151 7.343 0.000 1.111 0.759
## .SCE.5 0.631 0.080 7.854 0.000 0.631 0.570
## SciComVal 0.214 0.045 4.768 0.000 1.000 1.000
#Produce SEM plot showing all estimates and standard deviations using the standardized values.
#label and text sizes were increased from default 0.8 to 1.0
semPlot::semPaths(solutionSCV, "std", edge.label.cex =1, label.cex=1)
#Create index file of the factor scores collected from cfa. These will be adjust values to meet the loading determined for each survey question
idx <- lavInspect(solutionSCV, "case.idx")
fscores <- lavPredict(solutionSCV, type="lv")
## loop over factors and merge them with the original data set. All missing values will be input as "NA".
for (fs in colnames(fscores)) {
surv[idx, fs] <- fscores[ , fs]
}
summary(lme(SciComVal ~ Timepoint, random = ~ 1 | Email, data = surv, na.action = na.omit))
## Linear mixed-effects model fit by REML
## Data: surv
## AIC BIC logLik
## 365.7951 381.0872 -178.8975
##
## Random effects:
## Formula: ~1 | Email
## (Intercept) Residual
## StdDev: 0.2658323 0.3209356
##
## Fixed effects: SciComVal ~ Timepoint
## Value Std.Error DF t-value p-value
## (Intercept) -0.03692936 0.03460149 228 -1.067277 0.2870
## TimepointPre 0.05151883 0.03828727 110 1.345587 0.1812
## Correlation:
## (Intr)
## TimepointPre -0.689
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.5597954 -0.4627069 0.0837014 0.5986544 1.6859895
##
## Number of Observations: 340
## Number of Groups: 229
#Calculate change in pre/post values for variables that passed CFA criteria
#Cognitive Engagement
CE.selected_columns <- c(1,147:149)
CogEng=surv[, CE.selected_columns]
CogEng <- CogEng %>%
group_by(Email, Timepoint, source) %>%
summarise(CogEng = mean(CogEng), .groups = 'drop')
CE.wide_data <- CogEng %>%
pivot_wider(
names_from = Timepoint,
values_from = CogEng,
names_prefix = "CogEng_"
)
CE.wide_data <- CE.wide_data %>%
mutate(CogEng_diff = ifelse(is.na(CogEng_Pre) | is.na(CogEng_Post), NA, CogEng_Post - CogEng_Pre))
summary(lm(CogEng_diff ~ source, data = CE.wide_data, na.action = na.omit))
##
## Call:
## lm(formula = CogEng_diff ~ source, data = CE.wide_data, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.52267 -0.14393 0.05461 0.24922 1.22329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.07075 0.05821 -1.215 0.227
## sources2 -0.01114 0.09591 -0.116 0.908
##
## Residual standard error: 0.4509 on 93 degrees of freedom
## (155 observations deleted due to missingness)
## Multiple R-squared: 0.000145, Adjusted R-squared: -0.01061
## F-statistic: 0.01348 on 1 and 93 DF, p-value: 0.9078
###################################
ID.selected_columns <- c(1,147:148, 150)
SciID=surv[, ID.selected_columns]
SciID <- SciID %>%
group_by(Email, Timepoint, source) %>%
summarise(SciId = mean(SciId), .groups = 'drop')
SI.wide_data <- SciID %>%
pivot_wider(
names_from = Timepoint,
values_from = SciId,
names_prefix = "SI_"
)
SI.wide_data <- SI.wide_data %>%
mutate(SI_diff = ifelse(is.na(SI_Pre) | is.na(SI_Post), NA, SI_Post - SI_Pre))
summary(lm(SI_diff ~ source, data = SI.wide_data, na.action = na.omit))
##
## Call:
## lm(formula = SI_diff ~ source, data = SI.wide_data, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.13889 -0.63852 -0.08524 0.49341 2.79219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4287 0.1127 3.803 0.000261 ***
## sources2 -0.1611 0.1963 -0.820 0.414200
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8804 on 89 degrees of freedom
## (159 observations deleted due to missingness)
## Multiple R-squared: 0.007505, Adjusted R-squared: -0.003647
## F-statistic: 0.673 on 1 and 89 DF, p-value: 0.4142
#######################
SCV.selected_columns <- c(1,147:148, 151)
SCV=surv[, SCV.selected_columns]
SCV <- SCV %>%
group_by(Email, Timepoint, source) %>%
summarise(SCV = mean(SciComVal), .groups = 'drop')
SCV.wide_data <- SCV %>%
pivot_wider(
names_from = Timepoint,
values_from = SCV,
names_prefix = "SCV_"
)
SCV.wide_data <- SCV.wide_data %>%
mutate(SCV_diff = ifelse(is.na(SCV_Pre) | is.na(SCV_Post), NA, SCV_Post - SCV_Pre))
summary(lm(SCV_diff ~ source, data = SCV.wide_data, na.action = na.omit))
##
## Call:
## lm(formula = SCV_diff ~ source, data = SCV.wide_data, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.25953 -0.27792 0.08473 0.30680 1.03728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08697 0.05394 -1.612 0.110
## sources2 0.12240 0.08729 1.402 0.164
##
## Residual standard error: 0.4448 on 108 degrees of freedom
## (140 observations deleted due to missingness)
## Multiple R-squared: 0.01788, Adjusted R-squared: 0.008787
## F-statistic: 1.966 on 1 and 108 DF, p-value: 0.1637
#write.csv(surv, file = "quant_calc.csv", row.names = F)
for notes on how to produce and interpret semPlots, see : https://www.r-bloggers.com/2018/04/statistics-sunday-using-semplot/ https://cran.r-project.org/web/packages/semPlot/semPlot.pdf http://sachaepskamp.com/documentation/semPlot/semPaths.html
#CFI: Measures whether the model fits the data better than a more restricted baseline model. Higher is better, with okay fit > .9. Find CFI under User Model vs. Baseline model
#TLI (Tucker-Lewis index): Similar to CFI, but it penalizes overly complex models (making it more conservative than CFI). Measures whether the model fits the data better than a more restricted baseline model. Higher is better, with okay fit > .9.
#Find Test statistic, DOF, and P value under Model Test User Model (chi-squared goodness of fit test)
#RMSEA p value:The RMSEA P-value is the Probability that RMSEA <= .05. If that P-value is greater than 5% you can argue that the RMSEA value does not indicate a model rejection (the RMSEA value doesn’t reject the model if the RMSEA value is between 0 and 0.05). Usually this is useful when the RMSEA value is near the cutoff value of 0.05. If the RMSEA value is not near 0.05 one can typically ignore the confidence limit and the P-value and simply use the actual RMSEA value.
#Interpretation:
#RMSEA < 0.05: Indicates a close fit of the model to the data. #0.05 ≤ RMSEA < 0.08: Indicates a reasonable fit. #0.08 ≤ RMSEA < 0.10: Indicates a mediocre fit. #RMSEA ≥ 0.10: Indicates a poor fit.
#p-value > 0.05: The null hypothesis cannot be rejected, suggesting that the model provides a close fit to the data. #p-value ≤ 0.05: The null hypothesis is rejected, indicating that the model does not provide a close fit to the data.
#std.all are the standardized coefficients
See this publication and references within on reporting CFA results properly: https://www.researchgate.net/publication/24187223_Reporting_Practices_in_Confirmatory_Factor_Analysis_An_Overview_and_Some_Recommendations
#Does view of importance (IA.Importance) & IA.Comfort & IA.Grad.Imp change the outcome of constructs above within the IA group?
library(MASS)
post.only=filter(surv, Timepoint == "Post")
post.only$IA.Addressed <- factor(
post.only$IA.Addressed,
levels = c("Never", "About half the time", "Sometimes", "Most of the time", "Always"),
ordered = TRUE
)
post.only$source <- factor(post.only$source)
model <- polr(IA.Addressed ~ source, data = post.only, Hess = TRUE, na.action = na.omit)
summary(model)
## Call:
## polr(formula = IA.Addressed ~ source, data = post.only, na.action = na.omit,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## sources2 0.2978 0.3149 0.9458
##
## Intercepts:
## Value Std. Error t value
## Never|About half the time -1.3523 0.2346 -5.7646
## About half the time|Sometimes -0.5049 0.2046 -2.4676
## Sometimes|Most of the time 2.0816 0.2785 7.4750
## Most of the time|Always 4.4550 0.7248 6.1467
##
## Residual Deviance: 383.2326
## AIC: 393.2326
## (2 observations deleted due to missingness)
table(post.only$IA.Addressed, post.only$source, useNA = "ifany")
##
## s1 s2 <NA>
## Never 20 9 0
## About half the time 15 10 0
## Sometimes 50 31 1
## Most of the time 9 8 0
## Always 1 1 0
## <NA> 0 1 0
post.only$IA.Importance <- as.numeric(as.character(post.only$IA.Importance))
post.only$IA.Comfort <- as.numeric(as.character(post.only$IA.Comfort))
summary(lm(IA.Importance ~ source, data = post.only, na.action = na.omit))
##
## Call:
## lm(formula = IA.Importance ~ source, data = post.only, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1607 -1.1607 0.0333 1.8393 3.0333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.9667 0.2206 31.586 <2e-16 ***
## sources2 0.1940 0.3561 0.545 0.587
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.092 on 144 degrees of freedom
## (10 observations deleted due to missingness)
## Multiple R-squared: 0.002058, Adjusted R-squared: -0.004873
## F-statistic: 0.2969 on 1 and 144 DF, p-value: 0.5867
summary(lm(IA.Comfort ~ source, data = post.only, na.action = na.omit))
##
## Call:
## lm(formula = IA.Comfort ~ source, data = post.only, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2642 -2.0698 -0.0698 1.9302 3.9302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0698 0.2553 23.78 <2e-16 ***
## sources2 0.1944 0.4134 0.47 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.367 on 137 degrees of freedom
## (17 observations deleted due to missingness)
## Multiple R-squared: 0.001611, Adjusted R-squared: -0.005677
## F-statistic: 0.2211 on 1 and 137 DF, p-value: 0.639
resp=read.csv("Qual_textresp.csv")
des=read.csv("Qual_Designations.csv")
des_clean <- des %>%
filter(!if_all(starts_with("Designation."), ~ is.na(.)))
library(dplyr)
library(tidyr)
library(purrr)
des_long <- des_clean %>%
pivot_longer(
cols = starts_with("Designation."),
names_to = "Topic",
values_to = "Response"
) %>%
filter(!is.na(Response)) %>%
mutate(
Section = factor(Section),
Response = factor(Response),
Topic = factor(Topic)
)
# results <- des_long %>%
# split(.$Topic) %>%
# map_dfr(function(dat) {
#
# tab <- table(dat$Section, dat$Response)
#
# # remove empty rows/columns
# tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
#
# # check table is still testable
# if (nrow(tab) < 2 || ncol(tab) < 2) {
# return(data.frame(
# Topic = as.character(unique(dat$Topic)),
# test = NA,
# statistic = NA,
# p_value = NA,
# df = NA,
# note = "Not enough variation"
# ))
# }
#
# chi <- suppressWarnings(chisq.test(tab))
#
# data.frame(
# Topic = as.character(unique(dat$Topic)),
# test = "Chi-square",
# statistic = unname(chi$statistic),
# p_value = chi$p.value,
# df = unname(chi$parameter),
# note = "OK"
# )
# })
#
# results
results_fisher <- des_long %>%
split(.$Topic) %>%
map_dfr(function(dat) {
tab <- table(dat$Section, dat$Response)
tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
if (nrow(tab) < 2 || ncol(tab) < 2) {
return(data.frame(
Topic = as.character(unique(dat$Topic)),
p_value = NA,
note = "Not enough variation"
))
}
fish <- fisher.test(tab)
data.frame(
Topic = as.character(unique(dat$Topic)),
p_value = fish$p.value,
note = "Fisher exact test"
)
})
results_fisher
## Topic p_value note
## 1 Designation.DB 0.1112633218 Fisher exact test
## 2 Designation.Env 0.4896338538 Fisher exact test
## 3 Designation.GE 0.2990936696 Fisher exact test
## 4 Designation.Mar 0.0002512388 Fisher exact test
## 5 Designation.Vac 0.1863084072 Fisher exact test
## 6 Designation.Vap 0.4164906739 Fisher exact test
des_long <- des_long %>%
mutate(Response_ord = factor(Response,
levels = c(
"Negative",
"Negative-Neutral",
"Neutral/No opinion/Freedom",
"Positive-Neutral",
"Positive"
),
ordered = TRUE
))
library(MASS)
model <- polr(Response_ord ~ Section + Topic, data = des_long, Hess = TRUE)
summary(model)
## Call:
## polr(formula = Response_ord ~ Section + Topic, data = des_long,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Section2 0.49686 0.1661 2.9915
## TopicDesignation.Env 0.03744 0.3008 0.1245
## TopicDesignation.GE 0.08432 0.2795 0.3017
## TopicDesignation.Mar 0.63632 0.2615 2.4336
## TopicDesignation.Vac 4.18553 0.3383 12.3705
## TopicDesignation.Vap -1.14137 0.3195 -3.5718
##
## Intercepts:
## Value Std. Error t value
## Negative|Negative-Neutral 0.7857 0.2142 3.6685
## Negative-Neutral|Neutral/No opinion/Freedom 2.0531 0.2334 8.7978
## Neutral/No opinion/Freedom|Positive-Neutral 2.0533 0.2334 8.7982
## Positive-Neutral|Positive 4.0427 0.3084 13.1068
##
## Residual Deviance: 1179.775
## AIC: 1199.775
## (1335 observations deleted due to missingness)
coef_table <- coef(summary(model))
p <- pnorm(abs(coef_table[, "t value"]), lower.tail = FALSE) * 2
cbind(coef_table, "p value" = p)
## Value Std. Error t value
## Section2 0.49685980 0.1660929 2.9914577
## TopicDesignation.Env 0.03744155 0.3007908 0.1244771
## TopicDesignation.GE 0.08432139 0.2795291 0.3016551
## TopicDesignation.Mar 0.63632010 0.2614763 2.4335670
## TopicDesignation.Vac 4.18553063 0.3383491 12.3704509
## TopicDesignation.Vap -1.14137216 0.3195484 -3.5718284
## Negative|Negative-Neutral 0.78566185 0.2141649 3.6684898
## Negative-Neutral|Neutral/No opinion/Freedom 2.05314450 0.2333707 8.7977833
## Neutral/No opinion/Freedom|Positive-Neutral 2.05327306 0.2333734 8.7982308
## Positive-Neutral|Positive 4.04274339 0.3084459 13.1068162
## p value
## Section2 2.776490e-03
## TopicDesignation.Env 9.009376e-01
## TopicDesignation.GE 7.629150e-01
## TopicDesignation.Mar 1.495087e-02
## TopicDesignation.Vac 3.776951e-35
## TopicDesignation.Vap 3.544976e-04
## Negative|Negative-Neutral 2.439874e-04
## Negative-Neutral|Neutral/No opinion/Freedom 1.395452e-18
## Neutral/No opinion/Freedom|Positive-Neutral 1.389899e-18
## Positive-Neutral|Positive 3.009701e-39
#why are there extra columns here?
des.table= des_long %>%
split(.$Topic) %>%
purrr::walk(~ print(table(.x$Section, .x$Response)))
##
## Negative Negative-Neutral Neutral/No opinon/Freedom Positive
## 1 0 46 7 12 1
## 2 0 24 13 6 1
##
## Positive-Neutral
## 1 11
## 2 5
##
## Negative Negative-Neutral Neutral/No opinon/Freedom Positive
## 1 0 32 10 26 1
## 2 0 17 11 15 2
##
## Positive-Neutral
## 1 3
## 2 3
##
## Negative Negative-Neutral Neutral/No opinon/Freedom Positive
## 1 0 45 8 12 1
## 2 0 24 12 7 1
##
## Positive-Neutral
## 1 12
## 2 6
##
## Negative Negative-Neutral Neutral/No opinon/Freedom Positive
## 1 0 45 19 7 2
## 2 0 13 17 3 2
##
## Positive-Neutral
## 1 6
## 2 17
##
## Negative Negative-Neutral Neutral/No opinon/Freedom Positive
## 1 0 0 4 15 38
## 2 0 4 2 8 25
##
## Positive-Neutral
## 1 22
## 2 13
##
## Negative Negative-Neutral Neutral/No opinon/Freedom Positive
## 1 0 61 12 5 0
## 2 0 41 5 6 1
##
## Positive-Neutral
## 1 1
## 2 0
When I converted the designation scale to an ordered score: The marijuana topic shows a substantial ideological framing difference between sections.
Students in Section 2 gave responses that were much less negative / more nuanced.
This suggests:
IA may be most influential when the topic is politically or culturally loaded.
Students do not automatically apply IA to all topics.
library(dplyr)
des_long <- des_long %>%
mutate(Response_ord = factor(Response,
levels = c(
"Negative",
"Negative-Neutral",
"Neutral/No opinion/Freedom",
"Positive-Neutral",
"Positive"
),
ordered = TRUE
),
Response_num = as.numeric(Response_ord))
des_long <- des_long %>%
mutate(
Response = na_if(as.character(Response), "NA"),
Section = na_if(as.character(Section), "NA")
) %>%
filter(!is.na(Response), !is.na(Section))
means <- des_long %>%
group_by(Topic, Section) %>%
summarise(
mean_score = mean(Response_num, na.rm = TRUE),
sd = sd(Response_num, na.rm = TRUE),
n = n(),
se = sd / sqrt(n),
.groups = "drop"
)
means
## # A tibble: 12 × 6
## Topic Section mean_score sd n se
## <fct> <chr> <dbl> <dbl> <int> <dbl>
## 1 Designation.DB 1 1.68 1.20 77 0.137
## 2 Designation.DB 2 1.74 1.09 49 0.156
## 3 Designation.Env 1 1.5 0.960 72 0.113
## 4 Designation.Env 2 1.85 1.20 48 0.174
## 5 Designation.GE 1 1.73 1.22 78 0.138
## 6 Designation.GE 2 1.79 1.15 50 0.162
## 7 Designation.Mar 1 1.62 1.04 79 0.117
## 8 Designation.Mar 2 2.55 1.32 52 0.184
## 9 Designation.Vac 1 4.47 0.796 79 0.0896
## 10 Designation.Vac 2 4.20 1.25 52 0.173
## 11 Designation.Vap 1 1.20 0.496 79 0.0558
## 12 Designation.Vap 2 1.19 0.647 53 0.0889
library(ggplot2)
ggplot(means, aes(x = Topic, y = mean_score, fill = Section)) +
geom_col(position = position_dodge(width = 0.7)) +
geom_errorbar(
aes(ymin = mean_score - se, ymax = mean_score + se),
width = .2,
position = position_dodge(.7)
) +
coord_flip() +
labs(
x = "Topic",
y = "Average Ideological Framing Score"
) +
theme_bw()
Across most topics (environmental justice, vaccines, GE, vaping, etc.), the average ideological framing scores were very similar between sections.
This suggests that students do not necessarily change their stance on socio-scientific issues simply because ideological awareness is introduced.
In other words:
AI instruction appears to influence how students reason, rather than what position they ultimately take.
The marijuana topic shows the clearest difference between sections.(1.62-2.55)
Students in the control section expressed more neutral or mixed framing, while the AI section tended to give more negative responses.
Possible explanation:
The AI section may have encouraged students to critically interrogate ideological narratives surrounding the issue, which may have led some students to articulate stronger evaluative positions rather than neutral ones.
Another possibility is that marijuana is a highly culturally framed issue, and the ideological discussion may have prompted students to engage more directly with those frames.
AI section shows more consistent framing
In several topics, the AI section responses cluster more tightly around specific categories (often negative or negative-neutral), while the control section spreads more across categories.
This suggests that AI instruction may have helped students:
recognize ideological framing more clearly
articulate positions more decisively
Rather than defaulting to neutral responses.
ggplot(des_long, aes(x = Section, y = Response_num, fill = Section)) +
geom_violin(alpha = 0.4, trim = FALSE) +
geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.6) +
facet_wrap(~Topic) +
theme_bw() +
labs(
y = "Ideological Framing Score",
x = "Section",
title = "Distribution of Ideological Framing Responses by Section"
)
## Warning: Removed 122 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 122 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Control section uses neutral categories more often. This pattern is common when students lack an explicit analytic framework for interpreting socio-scientific issues.
Without ideological awareness instruction, students may:
hedge their answers
default to neutrality
frame responses as individual opinion rather than ideological analysis.
Looking at the qualitative responses:
AI section (Section 1)
Students more often reference:
systemic issues
ideological framing
social structures
policy implications
Responses tend to focus on how the issue is constructed or framed.
Control section (Section 2)
Students more often focus on:
individual harms
fairness
environmental consequences
personal opinion
Responses tend to focus on what the issue is, rather than how it is framed.
resp <- read.csv("Qual_textresp.csv")
library(tidytext)
library(dplyr)
words <- resp %>%
unnest_tokens(word, Vid.EnvInj.Text)
library(stopwords)
words_clean <- words %>%
filter(!word %in% stop_words$word)
word_counts <- words_clean %>%
count(Section, word, sort = TRUE)
word_counts
## Section word n
## 1 2 environmental 34
## 2 1 environmental 30
## 3 2 racism 30
## 4 1 racism 21
## 5 1 injustice 17
## 6 1 people 14
## 7 2 communities 14
## 8 1 communities 13
## 9 2 injustice 12
## 10 2 people 12
## 11 2 health 11
## 12 1 government 10
## 13 2 understanding 10
## 14 1 water 9
## 15 2 water 9
## 16 1 community 8
## 17 1 topic 8
## 18 1 understanding 8
## 19 2 clean 8
## 20 1 environment 7
## 21 1 health 7
## 22 2 issue 7
## 23 2 marginalized 7
## 24 2 resources 7
## 25 1 clean 6
## 26 1 due 6
## 27 1 issues 6
## 28 1 live 6
## 29 1 lot 6
## 30 2 access 6
## 31 2 affected 6
## 32 2 environment 6
## 33 1 affected 5
## 34 1 income 5
## 35 1 money 5
## 36 2 community 5
## 37 2 disparities 5
## 38 2 flint 5
## 39 2 waste 5
## 40 1 access 4
## 41 1 don 4
## 42 1 educated 4
## 43 1 feel 4
## 44 1 issue 4
## 45 1 lower 4
## 46 1 marginalized 4
## 47 1 poor 4
## 48 1 resources 4
## 49 1 time 4
## 50 2 air 4
## 51 2 concerns 4
## 52 2 crisis 4
## 53 2 exposure 4
## 54 2 issues 4
## 55 2 lack 4
## 56 2 michigan 4
## 57 2 race 4
## 58 1 attention 3
## 59 1 care 3
## 60 1 dont 3
## 61 1 flint 3
## 62 1 healthy 3
## 63 1 impact 3
## 64 1 informed 3
## 65 1 involved 3
## 66 1 low 3
## 67 1 michigan 3
## 68 1 pollution 3
## 69 1 poverty 3
## 70 1 protect 3
## 71 1 race 3
## 72 1 risks 3
## 73 1 sad 3
## 74 1 society 3
## 75 1 speak 3
## 76 1 world 3
## 77 2 color 3
## 78 2 concept 3
## 79 2 dont 3
## 80 2 due 3
## 81 2 environments 3
## 82 2 equal 3
## 83 2 hazards 3
## 84 2 healthy 3
## 85 2 leading 3
## 86 2 pollutants 3
## 87 2 pollution 3
## 88 2 social 3
## 89 2 systemic 3
## 90 1 advantage 2
## 91 1 afford 2
## 92 1 air 2
## 93 1 answer 2
## 94 1 cancer 2
## 95 1 city 2
## 96 1 class 2
## 97 1 concerns 2
## 98 1 cycle 2
## 99 1 disadvantaged 2
## 100 1 disasters 2
## 101 1 disturbing 2
## 102 1 fields 2
## 103 1 fight 2
## 104 1 found 2
## 105 1 generation 2
## 106 1 harder 2
## 107 1 hardships 2
## 108 1 hate 2
## 109 1 hazards 2
## 110 1 hit 2
## 111 1 homeless 2
## 112 1 huge 2
## 113 1 inequalities 2
## 114 1 land 2
## 115 1 looked 2
## 116 1 major 2
## 117 1 makes 2
## 118 1 minorities 2
## 119 1 necessarily 2
## 120 1 person 2
## 121 1 pollutants 2
## 122 1 poorly 2
## 123 1 rates 2
## 124 1 socioeconomic 2
## 125 1 status 2
## 126 1 stop 2
## 127 1 subject 2
## 128 1 super 2
## 129 1 systemic 2
## 130 1 topics 2
## 131 1 trash 2
## 132 1 understand 2
## 133 1 worse 2
## 134 2 addressing 2
## 135 2 adverse 2
## 136 2 biggest 2
## 137 2 burdening 2
## 138 2 cancer 2
## 139 2 country 2
## 140 2 don 2
## 141 2 earth 2
## 142 2 economic 2
## 143 2 effects 2
## 144 2 ensure 2
## 145 2 equitable 2
## 146 2 exacerbating 2
## 147 2 existing 2
## 148 2 family 2
## 149 2 green 2
## 150 2 honestly 2
## 151 2 impactful 2
## 152 2 importance 2
## 153 2 increased 2
## 154 2 industrial 2
## 155 2 inequalities 2
## 156 2 involve 2
## 157 2 leads 2
## 158 2 live 2
## 159 2 lot 2
## 160 2 neighborhoods 2
## 161 2 perpetuating 2
## 162 2 poor 2
## 163 2 poverty 2
## 164 2 races 2
## 165 2 racist 2
## 166 2 rates 2
## 167 2 respiratory 2
## 168 2 sites 2
## 169 2 situations 2
## 170 2 society 2
## 171 2 spaces 2
## 172 2 system 2
## 173 2 targeted 2
## 174 2 topic 2
## 175 2 underscores 2
## 176 2 understand 2
## 177 2 unfairly 2
## 178 2 united 2
## 179 NA racism 2
## 180 1 2024 1
## 181 1 3 1
## 182 1 5x 1
## 183 1 ability 1
## 184 1 achieving 1
## 185 1 addressing 1
## 186 1 advocate 1
## 187 1 affect 1
## 188 1 age 1
## 189 1 ago 1
## 190 1 agricultural 1
## 191 1 amount 1
## 192 1 amounts 1
## 193 1 animals 1
## 194 1 apartments 1
## 195 1 arising 1
## 196 1 aspect 1
## 197 1 aware 1
## 198 1 awareness 1
## 199 1 awful 1
## 200 1 bad 1
## 201 1 bear 1
## 202 1 becasue 1
## 203 1 bit 1
## 204 1 brought 1
## 205 1 brunt 1
## 206 1 build 1
## 207 1 call 1
## 208 1 caring 1
## 209 1 caught 1
## 210 1 caused 1
## 211 1 change 1
## 212 1 cheap 1
## 213 1 chemical 1
## 214 1 children 1
## 215 1 choices 1
## 216 1 cities 1
## 217 1 clue 1
## 218 1 colored 1
## 219 1 coming 1
## 220 1 companies 1
## 221 1 company 1
## 222 1 conditions 1
## 223 1 condos 1
## 224 1 considered 1
## 225 1 contributes 1
## 226 1 country 1
## 227 1 created 1
## 228 1 crime 1
## 229 1 crucial 1
## 230 1 current 1
## 231 1 damage 1
## 232 1 day 1
## 233 1 deal 1
## 234 1 dealt 1
## 235 1 decent 1
## 236 1 decision 1
## 237 1 decrease 1
## 238 1 depends 1
## 239 1 difficult 1
## 240 1 directly 1
## 241 1 discrimination 1
## 242 1 discussed 1
## 243 1 discussing 1
## 244 1 disparities 1
## 245 1 disproportionate 1
## 246 1 diversity 1
## 247 1 doesn 1
## 248 1 drink 1
## 249 1 drinking 1
## 250 1 drop 1
## 251 1 eat 1
## 252 1 economic 1
## 253 1 economics 1
## 254 1 ecosystem 1
## 255 1 education 1
## 256 1 effected 1
## 257 1 effects 1
## 258 1 ehtically 1
## 259 1 eliminate 1
## 260 1 endure 1
## 261 1 enhance 1
## 262 1 ensuring 1
## 263 1 entire 1
## 264 1 enviroment 1
## 265 1 enviromental 1
## 266 1 environmentally 1
## 267 1 environtmental 1
## 268 1 equally 1
## 269 1 equitable 1
## 270 1 exacerbates 1
## 271 1 exact 1
## 272 1 existing 1
## 273 1 experience 1
## 274 1 experiences 1
## 275 1 exposed 1
## 276 1 exposure 1
## 277 1 extremely 1
## 278 1 eyes 1
## 279 1 facing 1
## 280 1 families 1
## 281 1 fault 1
## 282 1 financially 1
## 283 1 fix 1
## 284 1 fixes 1
## 285 1 food 1
## 286 1 form 1
## 287 1 formed 1
## 288 1 fought 1
## 289 1 freaks 1
## 290 1 garbage 1
## 291 1 goal 1
## 292 1 god 1
## 293 1 grocery 1
## 294 1 habitats 1
## 295 1 handled 1
## 296 1 hard 1
## 297 1 harmful 1
## 298 1 helping 1
## 299 1 highlights 1
## 300 1 history 1
## 301 1 honest 1
## 302 1 honestly 1
## 303 1 hopeless 1
## 304 1 horrible 1
## 305 1 hospitals 1
## 306 1 human 1
## 307 1 hurt 1
## 308 1 idea 1
## 309 1 ill 1
## 310 1 illnesses 1
## 311 1 impactful 1
## 312 1 impacting 1
## 313 1 impacts 1
## 314 1 impoverished 1
## 315 1 incident 1
## 316 1 insightful 1
## 317 1 instances 1
## 318 1 invalidate 1
## 319 1 invest 1
## 320 1 justice 1
## 321 1 knowledge 1
## 322 1 knowledgeable 1
## 323 1 lack 1
## 324 1 landfill 1
## 325 1 landfills 1
## 326 1 leading 1
## 327 1 learn 1
## 328 1 learned 1
## 329 1 levels 1
## 330 1 lights 1
## 331 1 linger 1
## 332 1 lives 1
## 333 1 living 1
## 334 1 local 1
## 335 1 location 1
## 336 1 louisiana 1
## 337 1 maintain 1
## 338 1 maintained 1
## 339 1 managed 1
## 340 1 means 1
## 341 1 mi 1
## 342 1 minority 1
## 343 1 move 1
## 344 1 negative 1
## 345 1 negatively 1
## 346 1 officials 1
## 347 1 opinions 1
## 348 1 opportunity 1
## 349 1 passed 1
## 350 1 peoples 1
## 351 1 plants 1
## 352 1 plays 1
## 353 1 policies 1
## 354 1 poorer 1
## 355 1 predominant 1
## 356 1 prejudice 1
## 357 1 pressing 1
## 358 1 prevalent 1
## 359 1 prevent 1
## 360 1 prices 1
## 361 1 properly 1
## 362 1 provided 1
## 363 1 public 1
## 364 1 races 1
## 365 1 rational 1
## 366 1 reason 1
## 367 1 refers 1
## 368 1 regulation 1
## 369 1 reoccurrence 1
## 370 1 represent 1
## 371 1 residents 1
## 372 1 respiratory 1
## 373 1 responsibility 1
## 374 1 restaurants 1
## 375 1 restrictions 1
## 376 1 rising 1
## 377 1 risk 1
## 378 1 river 1
## 379 1 roads 1
## 380 1 role 1
## 381 1 run 1
## 382 1 save 1
## 383 1 scapegoat 1
## 384 1 segregated 1
## 385 1 sidewalks 1
## 386 1 simple 1
## 387 1 situation 1
## 388 1 solution 1
## 389 1 someting 1
## 390 1 sooner 1
## 391 1 spend 1
## 392 1 spread 1
## 393 1 stand 1
## 394 1 standpoint 1
## 395 1 status's 1
## 396 1 stems 1
## 397 1 step 1
## 398 1 stores 1
## 399 1 support 1
## 400 1 supported 1
## 401 1 takes 1
## 402 1 themself 1
## 403 1 therefor 1
## 404 1 times 1
## 405 1 today's 1
## 406 1 treat 1
## 407 1 treated 1
## 408 1 unfair 1
## 409 1 unique 1
## 410 1 united 1
## 411 1 unjust 1
## 412 1 vulnerable 1
## 413 1 weapon 1
## 414 1 willpower 1
## 415 1 wrong 1
## 416 1 zoning 1
## 417 1 <NA> 1
## 418 2 absolutely 1
## 419 2 address 1
## 420 2 addressed 1
## 421 2 adequate 1
## 422 2 advantage 1
## 423 2 affecting 1
## 424 2 affects 1
## 425 2 allowing 1
## 426 2 amount 1
## 427 2 amounts 1
## 428 2 animal 1
## 429 2 answer 1
## 430 2 apparently 1
## 431 2 aspect 1
## 432 2 bad 1
## 433 2 based 1
## 434 2 biases 1
## 435 2 birth 1
## 436 2 black 1
## 437 2 blind 1
## 438 2 born 1
## 439 2 broad 1
## 440 2 broken 1
## 441 2 building 1
## 442 2 care 1
## 443 2 causation 1
## 444 2 caused 1
## 445 2 change 1
## 446 2 choose 1
## 447 2 city 1
## 448 2 claim 1
## 449 2 class 1
## 450 2 climate 1
## 451 2 concentrated 1
## 452 2 constantly 1
## 453 2 contributes 1
## 454 2 correlation 1
## 455 2 countries 1
## 456 2 crazy 1
## 457 2 created 1
## 458 2 creates 1
## 459 2 credible 1
## 460 2 cuases 1
## 461 2 current 1
## 462 2 cycles 1
## 463 2 death 1
## 464 2 decade 1
## 465 2 depleting 1
## 466 2 deserves 1
## 467 2 destroys 1
## 468 2 destruction 1
## 469 2 developmental 1
## 470 2 disadvantaged 1
## 471 2 disagree 1
## 472 2 discriminated 1
## 473 2 disposal 1
## 474 2 disproportionately 1
## 475 2 distributed 1
## 476 2 distribution 1
## 477 2 diverse 1
## 478 2 documentary 1
## 479 2 dumped 1
## 480 2 easily 1
## 481 2 economical 1
## 482 2 ecosystem 1
## 483 2 educate 1
## 484 2 educated 1
## 485 2 effected 1
## 486 2 emissions 1
## 487 2 enviormental 1
## 488 2 equally 1
## 489 2 examine 1
## 490 2 examples 1
## 491 2 existed 1
## 492 2 exists 1
## 493 2 extinction 1
## 494 2 extreme 1
## 495 2 eye 1
## 496 2 eyes 1
## 497 2 facilities 1
## 498 2 facing 1
## 499 2 failure 1
## 500 2 failures 1
## 501 2 families 1
## 502 2 feel 1
## 503 2 field 1
## 504 2 frequently 1
## 505 2 gender 1
## 506 2 giving 1
## 507 2 god's 1
## 508 2 government 1
## 509 2 growing 1
## 510 2 happening 1
## 511 2 harm 1
## 512 2 harmful 1
## 513 2 hate 1
## 514 2 hazardous 1
## 515 2 healthcare 1
## 516 2 hierarchy 1
## 517 2 highly 1
## 518 2 horrible 1
## 519 2 housing 1
## 520 2 illnesses 1
## 521 2 imminent 1
## 522 2 impact 1
## 523 2 impacts 1
## 524 2 incident 1
## 525 2 income 1
## 526 2 incomes 1
## 527 2 industry 1
## 528 2 inequity 1
## 529 2 information 1
## 530 2 infrastructure 1
## 531 2 institution 1
## 532 2 judging 1
## 533 2 keeping 1
## 534 2 lakes 1
## 535 2 layer 1
## 536 2 leave 1
## 537 2 life 1
## 538 2 living 1
## 539 2 located 1
## 540 2 low 1
## 541 2 main 1
## 542 2 management 1
## 543 2 market 1
## 544 2 matter 1
## 545 2 means 1
## 546 2 meat 1
## 547 2 mention 1
## 548 2 minimal 1
## 549 2 minority 1
## 550 2 mistaken 1
## 551 2 money 1
## 552 2 move 1
## 553 2 natural 1
## 554 2 negatively 1
## 555 2 neglect 1
## 556 2 occupied 1
## 557 2 occurring 1
## 558 2 officials 1
## 559 2 opinion 1
## 560 2 outcomes 1
## 561 2 outwards 1
## 562 2 ozone 1
## 563 2 perfect 1
## 564 2 personally 1
## 565 2 personlly 1
## 566 2 phenomenon 1
## 567 2 phrase 1
## 568 2 physical 1
## 569 2 policies 1
## 570 2 policy 1
## 571 2 politics 1
## 572 2 pollutes 1
## 573 2 polluting 1
## 574 2 ponds 1
## 575 2 poorer 1
## 576 2 population 1
## 577 2 predominantly 1
## 578 2 prevalent 1
## 579 2 prioritized 1
## 580 2 privilege 1
## 581 2 progressively 1
## 582 2 properly 1
## 583 2 push 1
## 584 2 qualities 1
## 585 2 quicker 1
## 586 2 real 1
## 587 2 recently 1
## 588 2 refer 1
## 589 2 resolve 1
## 590 2 resource's 1
## 591 2 respond 1
## 592 2 result 1
## 593 2 ridiculous 1
## 594 2 sad 1
## 595 2 school 1
## 596 2 severely 1
## 597 2 shouldn 1
## 598 2 significant 1
## 599 2 single 1
## 600 2 socioeconomic 1
## 601 2 solved 1
## 602 2 speak 1
## 603 2 species 1
## 604 2 status 1
## 605 2 steady 1
## 606 2 stereotype 1
## 607 2 stewards 1
## 608 2 studied 1
## 609 2 superior 1
## 610 2 sustainable 1
## 611 2 systemically 1
## 612 2 tend 1
## 613 2 time 1
## 614 2 today's 1
## 615 2 toxins 1
## 616 2 treated 1
## 617 2 treatment 1
## 618 2 types 1
## 619 2 unable 1
## 620 2 unaware 1
## 621 2 underrepresented 1
## 622 2 understnd 1
## 623 2 unequal 1
## 624 2 unfair 1
## 625 2 unfamiliar 1
## 626 2 ungodly 1
## 627 2 views 1
## 628 2 vulnerable 1
## 629 2 watched 1
## 630 2 white 1
## 631 2 whomever 1
## 632 2 workplace 1
## 633 2 world 1
## 634 2 worldy 1
## 635 2 worrying 1
## 636 2 worse 1
## 637 2 worship 1
## 638 2 wrong 1
## 639 NA aspects 1
## 640 NA aspire 1
## 641 NA current 1
## 642 NA discuss 1
## 643 NA environmental 1
## 644 NA honestly 1
## 645 NA impactful 1
## 646 NA injustice 1
## 647 NA understanding 1
## 648 NA ups 1
top_words <- word_counts %>%
group_by(Section) %>%
slice_max(n, n = 20)
top_words
## # A tibble: 58 × 3
## # Groups: Section [3]
## Section word n
## <int> <chr> <int>
## 1 1 environmental 30
## 2 1 racism 21
## 3 1 injustice 17
## 4 1 people 14
## 5 1 communities 13
## 6 1 government 10
## 7 1 water 9
## 8 1 community 8
## 9 1 topic 8
## 10 1 understanding 8
## # ℹ 48 more rows
ggplot(top_words, aes(x = reorder(word, n), y = n, fill = factor(Section))) +
geom_col(show.legend = FALSE) +
facet_wrap(~Section, scales = "free") +
coord_flip() +
labs(
x = "",
y = "Word Frequency",
title = "Most Common Words in Student Responses by Section"
) +
theme_bw()
keywords <- c("system","policy","government","industry","media","bias","justice","freedom")
words_clean %>%
filter(word %in% keywords) %>%
count(Section, word)
## Section word n
## 1 1 government 10
## 2 1 justice 1
## 3 2 government 1
## 4 2 industry 1
## 5 2 policy 1
## 6 2 system 2
ia_terms <- c(
"system","systemic","structure","structural",
"policy","policies",
"government","state",
"industry","corporate","corporation",
"power","control",
"bias","biased",
"media","narrative","framing",
"justice","inequality","equity",
"rights","freedom",
"capitalism","market",
"regulation","regulated"
)
library(dplyr)
library(tidyr)
library(tidytext)
text_long <- resp %>%
pivot_longer(
cols = starts_with("Vid."),
names_to = "Topic",
values_to = "Text"
)
words <- text_long %>%
unnest_tokens(word, Text)
data(stop_words)
words_clean <- words %>%
anti_join(stop_words, by = "word")
ia_words <- words_clean %>%
filter(word %in% ia_terms)
ia_counts <- ia_words %>%
count(Section, Topic, word, sort = TRUE)
ia_counts
## # A tibble: 28 × 4
## Section Topic word n
## <int> <chr> <chr> <int>
## 1 1 Vid.Resp.Text regulated 16
## 2 2 Vid.Resp.Text regulated 12
## 3 1 Vid.EnvInj.Text government 10
## 4 2 Vid.EnvInj.Text systemic 3
## 5 2 Vid.Resp.Text regulation 3
## 6 1 Vid.EnvInj.Text systemic 2
## 7 1 Vid.MolBio.Text control 2
## 8 1 Vid.Resp.Text government 2
## 9 1 Vid.Resp.Text regulation 2
## 10 2 Vid.EnvInj.Text system 2
## # ℹ 18 more rows
ia_summary <- ia_words %>%
count(Section, Topic)
ia_summary
## # A tibble: 8 × 3
## Section Topic n
## <int> <chr> <int>
## 1 1 Vid.EnvInj.Text 15
## 2 1 Vid.MolBio.Text 2
## 3 1 Vid.Resp.Text 22
## 4 1 Vid.Vaccines.Text 3
## 5 2 Vid.EnvInj.Text 10
## 6 2 Vid.MolBio.Text 2
## 7 2 Vid.Resp.Text 16
## 8 2 Vid.Vaccines.Text 3
library(ggplot2)
ggplot(ia_summary,
aes(x = Topic, y = n, fill = factor(Section))) +
geom_col(position = "dodge") +
coord_flip() +
theme_bw() +
labs(
y = "Number of IA terms used",
x = "Topic",
fill = "Section"
)
total_words <- words_clean %>%
count(Section, Topic)
ia_rate <- ia_summary %>%
left_join(total_words, by = c("Section","Topic"),
suffix = c("_ia","_total")) %>%
mutate(rate = n_ia / n_total)
ia_rate
## # A tibble: 8 × 5
## Section Topic n_ia n_total rate
## <int> <chr> <int> <int> <dbl>
## 1 1 Vid.EnvInj.Text 15 620 0.0242
## 2 1 Vid.MolBio.Text 2 534 0.00375
## 3 1 Vid.Resp.Text 22 546 0.0403
## 4 1 Vid.Vaccines.Text 3 526 0.00570
## 5 2 Vid.EnvInj.Text 10 581 0.0172
## 6 2 Vid.MolBio.Text 2 514 0.00389
## 7 2 Vid.Resp.Text 16 657 0.0244
## 8 2 Vid.Vaccines.Text 3 529 0.00567
ia_rate %>%
arrange(desc(rate))
## # A tibble: 8 × 5
## Section Topic n_ia n_total rate
## <int> <chr> <int> <int> <dbl>
## 1 1 Vid.Resp.Text 22 546 0.0403
## 2 2 Vid.Resp.Text 16 657 0.0244
## 3 1 Vid.EnvInj.Text 15 620 0.0242
## 4 2 Vid.EnvInj.Text 10 581 0.0172
## 5 1 Vid.Vaccines.Text 3 526 0.00570
## 6 2 Vid.Vaccines.Text 3 529 0.00567
## 7 2 Vid.MolBio.Text 2 514 0.00389
## 8 1 Vid.MolBio.Text 2 534 0.00375
ggplot(ia_rate,
aes(x = Topic, y = rate, fill = factor(Section))) +
geom_col(position = "dodge") +
coord_flip() +
theme_bw() +
labs(
y = "Rate of IA terms used",
x = "Topic",
fill = "Section"
)
library(dplyr)
library(tidytext)
library(tidyr)
#install.packages("tidylo")
library(tidylo)
log_odds <- word_counts %>%
bind_log_odds(set = Section, feature = word, n = n)
log_odds_plot <- log_odds %>%
slice_max(abs(log_odds_weighted), n = 20) %>%
mutate(
more_common = ifelse(log_odds_weighted > 0, "AI", "Control"),
word = reorder(word, log_odds_weighted)
)
xmax <- max(abs(log_odds_plot$log_odds_weighted)) * 1.15
ggplot(log_odds_plot,
aes(x = word, y = log_odds_weighted, fill = more_common)) +
geom_col(width = 0.8) +
coord_flip() +
geom_hline(yintercept = 0, linewidth = 0.6) +
annotate("text", x = 1, y = xmax * 0.85, label = "AI", hjust = 1, fontface = "bold") +
annotate("text", x = 1, y = -xmax * 0.85, label = "Control", hjust = 0, fontface = "bold") +
scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
scale_y_continuous(limits = c(-xmax, xmax)) +
labs(
x = NULL,
y = "Weighted log odds",
fill = "More common in",
title = "Words most characteristic of each section"
) +
theme_bw() +
theme(
panel.grid.major.y = element_blank(),
legend.position = "none"
)
library(tidylo)
log_odds_topic <- words_clean %>%
count(Topic, Section, word) %>%
group_by(Topic) %>%
filter(sum(n) >= 3) %>%
bind_log_odds(set = Section, feature = word, n = n) %>%
ungroup()
plot_topic <- log_odds_topic %>%
group_by(Topic) %>%
slice_max(abs(log_odds_weighted), n = 8) %>%
mutate(more_common = ifelse(log_odds_weighted > 0, "AI", "Control"))
ggplot(plot_topic,
aes(x = reorder(word, log_odds_weighted), y = log_odds_weighted, fill = more_common)) +
geom_col() +
coord_flip() +
facet_wrap(~Topic, scales = "free_y") +
scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
labs(
x = NULL,
y = "Weighted log odds",
fill = "More common in",
title = "Topic-specific words distinguishing AI and Control sections"
) +
theme_bw()
The results suggest that explicit ideological awareness instruction influences students’ interpretive reasoning about socio-scientific issues, but does not consistently change the direction of their attitudes.
Students exposed to AI instruction appear more likely to:
articulate clearer positions
recognize ideological structures
analyze issues beyond individual harms.
Explicit instruction in ideological awareness does not necessarily change students’ positions on socio-scientific issues, but it appears to influence how students interpret and articulate those issues. Students in the AI section demonstrated more structured ideological reasoning, whereas students in the control section more frequently expressed neutral or opinion-based responses.
Your data suggest:
Students can recognize ideological issues even without explicit instruction, but explicit IA instruction shifts how they reason about those issues.
Specifically, IA instruction appears to:
increase emotional engagement
strengthen science identity
encourage structural interpretation of socio-scientific issues
rather than simply increasing advocacy or opinion strength.
#focus in on long response more clearly
structural_terms <- c(
"system","systemic","structure","structural",
"policy","policies","regulation","regulated",
"government","state","institution","institutions",
"industry","corporate","corporation","companies",
"power","powerful","control",
"inequality","inequity","equity",
"justice","racism","discrimination",
"community","communities","neighborhood","neighborhoods",
"environmental","environment","pollution",
"media","narrative","framing",
"economic","economics","capitalism","market"
)
library(dplyr)
library(tidytext)
library(tidyr)
text_long <- resp %>%
pivot_longer(
cols = starts_with("Vid."),
names_to = "Topic",
values_to = "Text"
)
words <- text_long %>%
unnest_tokens(word, Text)
data(stop_words)
words_clean <- words %>%
anti_join(stop_words, by = "word")
structural_words <- words_clean %>%
filter(word %in% structural_terms)
structural_counts <- structural_words %>%
count(SortingID, Section, Topic)
total_words <- words_clean %>%
count(SortingID, Section, Topic)
structural_rate <- structural_counts %>%
left_join(total_words, by = c("SortingID","Section","Topic"),
suffix = c("_struct","_total")) %>%
mutate(rate = n_struct / n_total)
structural_summary <- structural_rate %>%
filter(!is.na(Section)) %>% # remove NA section rows
group_by(Section, Topic) %>%
summarise(
mean_rate = mean(rate, na.rm = TRUE),
sd = sd(rate, na.rm = TRUE),
n = n(),
se = sd / sqrt(n), # <-- this creates the SE column
.groups = "drop"
)
structural_summary <- structural_summary %>%
filter(!is.na(Section))
library(ggplot2)
ggplot(structural_summary,
aes(x = Topic, y = mean_rate, fill = factor(Section))) +
geom_col(position = "dodge") +
coord_flip() +
theme_bw() +
labs(
y = "Structural reasoning rate",
x = "Topic",
fill = "Section",
title = "Use of structural reasoning language in responses"
)
lm(rate ~ Section + Topic, data = structural_rate)
##
## Call:
## lm(formula = rate ~ Section + Topic, data = structural_rate)
##
## Coefficients:
## (Intercept) Section TopicVid.MolBio.Text
## 0.281361 -0.005751 -0.207970
## TopicVid.Resp.Text TopicVid.Vaccines.Text
## -0.120820 -0.179134
high_structural <- structural_rate %>%
arrange(desc(rate)) %>%
slice_head(n = 10)
resp %>%
filter(SortingID %in% high_structural$SortingID)
## SortingID Section Timepoint Consent Email
## 1 15 1 Post I Consent kme0042@auburn.edu
## 2 19 2 Post I Consent emo0021@auburn.edu
## 3 28 1 Post I Consent rer0061@auburn.edu
## 4 39 2 Post I Consent Hno0009@auburn.edu
## 5 47 2 Post I Consent lrp0037@auburn.edu
## 6 48 2 Post I Consent mzs0263@auburn.edu
## 7 69 1 Post I Consent akd0082@auburn.edu
## 8 86 1 Post I Consent mzp0147@auburn.edu
## 9 104 2 Post I Consent mar0147@auburn.edu
## 10 119 1 Post I Consent meg0156@auburn.edu
## Learn.Science
## 1 So we can know how the world works
## 2 To learn about how the world around them works.
## 3 Learning science is somethings that is important because it can help you learn more about how your body works and how the earth works.
## 4 To understand basic concepts of how the world and things around us work
## 5 to understand our world and how it works
## 6 Because science is objective truth
## 7
## 8
## 9 It is important for students to learn science because science greatly effects their own life and the world around them.
## 10 It is important to learn biology so that people can have basic knowledge of life.
## Benefits
## 1 It would help people have a better understanding of the more political parts of the science field so they can form their own opinions.
## 2 Make people more aware of what's going on in today's society.
## 3 Having different ideal that others might not of heard about which might open their eye to new ideals.
## 4 Biology would be inclusive and appealing to all groups if this was included
## 5 to have a more unbiased education
## 6 With my biology class being so introductory, I'm not sure which materials would benefit from ideological awareness, but maybe making sure to emphasize the difference in sex and gender when teaching about the reproductive system.
## 7 Students would know some things are not soley based on facts.
## 8 to have awareness.
## 9 More ideologically aware resources would benefit biology content by helping people learn more about their own anatomy and life.
## 10 \nI think this may be able to help students understand a few things better
## Downsides
## 1 You could offend people if you are not careful.
## 2 It can deteriorate from class material.
## 3 Some people not be open to learning new ideas, which might lead them to not evey listening to other ideas.
## 4 I think some people would get annoyed with the fact that it has to be mentioned
## 5 take time away from material
## 6 I don't see any downsides
## 7 Some people would argue these topics.
## 8 may sway people\xd5s thought processes
## 9 One downside would be that many aspects of ideologies can lead to controversy and take away from the facts.
## 10 this could greatly offend people.
## Vid.Resp.Text
## 1 I think that they should be highly regulated.
## 2 Vaping and using marijuana is bad for the public's health.
## 3 Vaping and marijuana use is not good for the health of user, and the government needed to regulate them more.
## 4 I think vaping should be banned but marijuana should be available for prescription use
## 5 I do not believe in the recreational use of vapes/marijuana because it has been proven harmful for us
## 6 I think vaping is horrible. I think that right now, marijuana use is dangerous in our state because it is not regulated because it is illegal. I think that there are safe ways to use marijuana, but I do not think that vaping is ever safe.
## 7 I do not do it because it is bad for health.
## 8 it\xd5s not good for you, it could harm those around you
## 9 Marijuana can be very beneficial to those suffering with chronic pain. However, Marijuana should be well regulated and only accessible to those with a prescription.
## 10 I think that it has not been around long enough to know the harmful side effects of vaping
## Vid.MolBio.Text
## 1 I believe we should not allow this because it will create an even bigger divide in society because not everyone will have access to the technology.
## 2 I don't know enough information on human genome editing to give my opinion.
## 3 Designer babies can be bad for the world.
## 4 I think that it is wrong and having a baby should be a natural process unless altering it to not have death related diseases
## 5 I think the ethical and societal implications outweigh the other pros for genetic editing
## 6 I don't really know enough about this topic to make a thoughtful comment.
## 7 Not sure what this is.
## 8 i don\xd5t understand it enough to have a strong thought about it
## 9 I do not believe genetic editing is ethical if people are trying to make children look a certain way. This practice can be dangerous and is unnatural. Genetic editing should only be used when necessary to prevent a life threatening mutation.
## 10 I think that this is wrong based on my religion
## Vid.Vaccines.Text
## 1 We should all try to get vaccinated but we shouldn\xd5t be forced to do anything we don\xd5t want to do. We also need to have correct information about it.
## 2 Vaccines are good for society's health.
## 3 Misinformation has a really negative impact on whether or not people get vaccines.
## 4 I think vaccinations are important to the general public and stop disease from spreading
## 5 I do not know enough information on these vaccinations
## 6 I think that it is important to get vaccinated for the benefit of the public.
## 7 It can help people not spread diseases that they may have.
## 8 vaccines should be required, for not only the person\xd5s safety but those around them
## 9 Vaccinations are soemtimes useful when preventing possible disease. Vaccines should be well studied and should not be experimental.
## 10 I think that vaccines are good but they SHOULD NOT be forced on the public
## Vid.EnvInj.Text
## 1 I find the most concerning impact to be that these things are passed down from generation to generation.
## 2 I do not know enough about environmental racism to give my opinion.
## 3 Environmental racism and injustice is something that need to be worked on within the United State.
## 4 I don\xd5t know much about environmental racism and injustice
## 5 I find environmental racism concerning and impactful
## 6 I don't know much about environmental racism, but I do find the concept very concerning.
## 7 I am not aware of any environmental racism concerns.
## 8 just take care of the environment and each other.
## 9 I am unfamiliar with environmental racism
## 10 I have no clue what environmental racism is
## X X.1 X.2 X.3 X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13 X.14 X.15
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 7 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 9 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 10 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
summary(lm(rate ~ Section * Topic, data = structural_rate))
##
## Call:
## lm(formula = rate ~ Section * Topic, data = structural_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24113 -0.08461 -0.02008 0.04872 0.40360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.24153 0.04950 4.880 3.28e-06 ***
## Section 0.02154 0.03210 0.671 0.503
## TopicVid.MolBio.Text -0.05913 0.17742 -0.333 0.740
## TopicVid.Resp.Text -0.03934 0.08607 -0.457 0.648
## TopicVid.Vaccines.Text -0.07817 0.13059 -0.599 0.551
## Section:TopicVid.MolBio.Text -0.09438 0.10549 -0.895 0.373
## Section:TopicVid.Resp.Text -0.05704 0.05731 -0.995 0.322
## Section:TopicVid.Vaccines.Text -0.07044 0.08672 -0.812 0.418
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1376 on 121 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2532, Adjusted R-squared: 0.21
## F-statistic: 5.862 on 7 and 121 DF, p-value: 7.124e-06
structural_summary$Topic <- factor(
structural_summary$Topic,
levels = c(
"Vid.EnvInj.Text",
"Vid.Resp.Text",
"Vid.Vaccines.Text",
"Vid.MolBio.Text"
)
)
ggplot(structural_summary,
aes(x = Topic, y = mean_rate, color = factor(Section))) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = mean_rate - se,
ymax = mean_rate + se),
width = .1) +
coord_flip() +
theme_bw() +
labs(
y = "Structural reasoning rate",
x = "Topic",
color = "Section"
)
AI students appear more likely to:
connect science to systems
recognize social structures
interpret responsibility beyond individuals
library(dplyr)
library(tidyr)
library(tidytext)
structural_terms <- c(
"system","systemic","structure","structural",
"policy","policies","regulation","regulated",
"government","state","institution","institutions",
"industry","corporate","corporation","companies",
"power","control",
"inequality","inequity","equity",
"justice","racism","discrimination",
"community","communities","neighborhood","neighborhoods",
"environmental","environment",
"media","narrative","framing",
"economic","economics","capitalism","market"
)
text_long <- resp %>%
pivot_longer(
cols = starts_with("Vid."),
names_to = "Topic",
values_to = "Text"
) %>%
mutate(
Section = na_if(as.character(Section), "NA"),
Section = recode(Section, "1" = "AI", "2" = "Control")
) %>%
filter(!is.na(Section), !is.na(Text), Text != "")
words_clean <- text_long %>%
unnest_tokens(word, Text) %>%
anti_join(stop_words, by = "word")
structural_by_response <- words_clean %>%
group_by(SortingID, Section, Topic) %>%
summarise(
total_words = n(),
structural_n = sum(word %in% structural_terms),
structural_present = as.integer(structural_n > 0),
structural_rate = structural_n / total_words,
.groups = "drop"
)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
##
## lmList
model_presence <- glmer(
structural_present ~ Section + (1 | Topic) + (1 | SortingID),
data = structural_by_response,
family = binomial
)
summary(model_presence)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: structural_present ~ Section + (1 | Topic) + (1 | SortingID)
## Data: structural_by_response
##
## AIC BIC logLik -2*log(L) df.resid
## 455.8 472.7 -223.9 447.8 500
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8812 -0.4372 -0.2473 0.3373 4.4931
##
## Random effects:
## Groups Name Variance Std.Dev.
## SortingID (Intercept) 1.040 1.020
## Topic (Intercept) 2.209 1.486
## Number of obs: 504, groups: SortingID, 133; Topic, 4
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7854 0.7781 -2.295 0.0218 *
## SectionControl 0.3311 0.3119 1.061 0.2885
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## SectinCntrl -0.175
exp(fixef(model_presence))
## (Intercept) SectionControl
## 0.1677246 1.3924618
exp(confint(model_presence, parm = "beta_", method = "Wald"))
## 2.5 % 97.5 %
## (Intercept) 0.03649859 0.7707571
## SectionControl 0.75555417 2.5662618
library(lme4)
model_rate <- lmer(
structural_rate ~ Section + (1 | Topic) + (1 | SortingID),
data = structural_by_response
)
summary(model_rate)
## Linear mixed model fit by REML ['lmerMod']
## Formula: structural_rate ~ Section + (1 | Topic) + (1 | SortingID)
## Data: structural_by_response
##
## REML criterion at convergence: -853.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7268 -0.3660 -0.0739 0.0050 4.9553
##
## Random effects:
## Groups Name Variance Std.Dev.
## SortingID (Intercept) 0.0001068 0.01033
## Topic (Intercept) 0.0059033 0.07683
## Residual 0.0101001 0.10050
## Number of obs: 504, groups: SortingID, 133; Topic, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.052119 0.038868 1.341
## SectionControl 0.008608 0.009320 0.924
##
## Correlation of Fixed Effects:
## (Intr)
## SectinCntrl -0.096
model_rate_topic <- lmer(
structural_rate ~ Section * Topic + (1 | SortingID),
data = structural_by_response
)
summary(model_rate_topic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: structural_rate ~ Section * Topic + (1 | SortingID)
## Data: structural_by_response
##
## REML criterion at convergence: -838.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0270 -0.3505 -0.0757 -0.0184 5.1401
##
## Random effects:
## Groups Name Variance Std.Dev.
## SortingID (Intercept) 0.0001306 0.01143
## Residual 0.0099822 0.09991
## Number of obs: 504, groups: SortingID, 133
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.14740 0.01202 12.264
## SectionControl 0.05403 0.01896 2.849
## TopicVid.MolBio.Text -0.14300 0.01661 -8.610
## TopicVid.Resp.Text -0.10253 0.01645 -6.231
## TopicVid.Vaccines.Text -0.13727 0.01641 -8.367
## SectionControl:TopicVid.MolBio.Text -0.05513 0.02623 -2.101
## SectionControl:TopicVid.Resp.Text -0.06425 0.02592 -2.479
## SectionControl:TopicVid.Vaccines.Text -0.05787 0.02596 -2.229
##
## Correlation of Fixed Effects:
## (Intr) SctnCn TV.MB. TV.R.T TV.V.T SC:TV.M SC:TV.R
## SectinCntrl -0.634
## TpcVd.MlB.T -0.715 0.453
## TpcVd.Rsp.T -0.722 0.458 0.522
## TpcVd.Vcc.T -0.724 0.459 0.524 0.529
## ScC:TV.MB.T 0.453 -0.714 -0.633 -0.331 -0.332
## SctC:TV.R.T 0.458 -0.723 -0.332 -0.635 -0.336 0.523
## SctC:TV.V.T 0.458 -0.722 -0.331 -0.334 -0.632 0.522 0.528
library(ggeffects)
library(ggplot2)
model_topic <- glmer(
structural_present ~ Section * Topic + (1 | SortingID),
data = structural_by_response,
family = binomial
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00267592 (tol = 0.002, component 1)
summary(model_topic)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: structural_present ~ Section * Topic + (1 | SortingID)
## Data: structural_by_response
##
## AIC BIC logLik -2*log(L) df.resid
## 444.4 482.4 -213.2 426.4 495
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5620 -0.3672 -0.2174 0.2520 6.3521
##
## Random effects:
## Groups Name Variance Std.Dev.
## SortingID (Intercept) 1.406 1.186
## Number of obs: 504, groups: SortingID, 133
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3842 0.3127 1.229 0.219
## SectionControl 0.7216 0.5116 1.411 0.158
## TopicVid.MolBio.Text -4.1764 0.7307 -5.715 1.09e-08 ***
## TopicVid.Resp.Text -1.6779 0.4263 -3.936 8.29e-05 ***
## TopicVid.Vaccines.Text -3.2420 0.5603 -5.786 7.19e-09 ***
## SectionControl:TopicVid.MolBio.Text 0.0754 0.9546 0.079 0.937
## SectionControl:TopicVid.Resp.Text -0.7639 0.6555 -1.165 0.244
## SectionControl:TopicVid.Vaccines.Text -0.6430 0.8137 -0.790 0.429
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SctnCn TV.MB. TV.R.T TV.V.T SC:TV.M SC:TV.R
## SectinCntrl -0.593
## TpcVd.MlB.T -0.389 0.171
## TpcVd.Rsp.T -0.627 0.325 0.389
## TpcVd.Vcc.T -0.499 0.231 0.369 0.480
## ScC:TV.MB.T 0.265 -0.451 -0.646 -0.193 -0.151
## SctC:TV.R.T 0.375 -0.656 -0.135 -0.548 -0.183 0.367
## SctC:TV.V.T 0.306 -0.534 -0.117 -0.211 -0.538 0.309 0.434
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00267592 (tol = 0.002, component 1)
VarCorr(model_presence)
## Groups Name Std.Dev.
## SortingID (Intercept) 1.0196
## Topic (Intercept) 1.4861
student_structural <- structural_by_response %>%
group_by(SortingID, Section) %>%
summarise(
mean_rate = mean(structural_rate, na.rm = TRUE),
.groups = "drop"
)
ggplot(student_structural,
aes(x = as.factor(Section), y = mean_rate)) +
geom_violin(fill = "grey80") +
geom_boxplot(width = .1) +
geom_jitter(width = .1, alpha = .5) +
theme_bw()
student_structural <- student_structural %>%
mutate(level = case_when(
mean_rate == 0 ~ "None",
mean_rate < 0.05 ~ "Low",
mean_rate < 0.10 ~ "Moderate",
TRUE ~ "High"
))
student_structural <- student_structural %>%
filter(!is.na(Section))
ggplot(student_structural,
aes(x = level, fill = factor(Section))) +
geom_bar(position = "dodge") +
theme_bw() +
labs(
x = "Structural reasoning level",
y = "Number of students",
fill = "Section"
)
student_structural$level <- factor(
student_structural$level,
levels = c("None","Low","Moderate","High")
)
ggplot(student_structural,
aes(x = factor(Section), fill = level)) +
geom_bar(position = "fill") +
theme_bw() +
labs(
x = "Section",
y = "Proportion of students",
fill = "Structural reasoning level"
)
table_levels <- table(student_structural$Section,
student_structural$level)
chisq.test(table_levels)
##
## Pearson's Chi-squared test
##
## data: table_levels
## X-squared = 1.695, df = 3, p-value = 0.638
This means many students never used structural reasoning language in any response.
That is actually a major finding:
Structural reasoning is not a default explanation style for most students.
This is very consistent with research showing students tend to default to proximate explanations (pollution, waste, sickness) rather than systemic explanations.
library(dplyr)
library(tidyr)
library(tidytext)
structural_terms <- c(
"system","systemic","structure","structural",
"policy","policies","regulation","regulated",
"government","state","institution","institutions",
"industry","corporate","corporation","companies",
"power","control",
"inequality","inequity","equity",
"justice","racism","discrimination",
"community","communities","neighborhood","neighborhoods",
"environmental","environment",
"media","narrative","framing",
"economic","economics","capitalism","market"
)
text_long <- resp %>%
pivot_longer(
cols = starts_with("Vid."),
names_to = "Topic",
values_to = "Text"
) %>%
mutate(
Section = na_if(as.character(Section), "NA")
) %>%
filter(!is.na(Section), !is.na(Text), Text != "")
words_clean <- text_long %>%
unnest_tokens(word, Text) %>%
anti_join(stop_words, by = "word")
structural_by_response <- words_clean %>%
group_by(SortingID, Section, Topic) %>%
summarise(
total_words = n(),
structural_n = sum(word %in% structural_terms),
structural_present = structural_n > 0,
structural_rate = structural_n / total_words,
.groups = "drop"
)
library(dplyr)
library(tidyr)
library(tidytext)
structural_terms <- c(
"system","systemic","structure","structural",
"policy","policies","regulation","regulated",
"government","state","institution","institutions",
"industry","corporate","corporation","companies",
"power","control",
"inequality","inequity","equity",
"justice","racism","discrimination",
"community","communities","neighborhood","neighborhoods",
"environmental","environment",
"media","narrative","framing",
"economic","economics","capitalism","market"
)
text_long <- resp %>%
pivot_longer(
cols = starts_with("Vid."),
names_to = "Topic",
values_to = "Text"
) %>%
mutate(
Section = na_if(as.character(Section), "NA")
) %>%
filter(!is.na(Section), !is.na(Text), Text != "")
words_clean <- text_long %>%
unnest_tokens(word, Text) %>%
anti_join(stop_words, by = "word")
structural_by_response <- words_clean %>%
group_by(SortingID, Section, Topic) %>%
summarise(
total_words = n(),
structural_n = sum(word %in% structural_terms),
structural_present = structural_n > 0,
structural_rate = structural_n / total_words,
.groups = "drop"
)
structural_responses <- structural_by_response %>%
filter(structural_present)
structural_words_only <- words_clean %>%
semi_join(structural_responses, by = c("SortingID", "Section", "Topic"))
library(tidylo)
word_counts_structural <- structural_words_only %>%
count(Section, word) %>%
group_by(word) %>%
filter(sum(n) >= 3) %>%
ungroup()
log_odds_structural <- word_counts_structural %>%
bind_log_odds(set = Section, feature = word, n = n)
library(ggplot2)
plot_words <- log_odds_structural %>%
slice_max(abs(log_odds_weighted), n = 20) %>%
mutate(
more_common = ifelse(log_odds_weighted > 0, "AI", "Control"),
word = reorder(word, log_odds_weighted)
)
ggplot(plot_words,
aes(x = word, y = log_odds_weighted, fill = more_common)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = c("AI" = "#D95F5F", "Control" = "#2C7FB8")) +
theme_bw() +
labs(
x = NULL,
y = "Weighted log odds",
fill = "More common in",
title = "How AI vs Control articulate structural reasoning"
)
student_profiles <- structural_by_response %>%
group_by(SortingID, Section) %>%
summarise(
n_topics = n(),
n_structural = sum(structural_present),
prop_structural = mean(structural_present),
mean_rate = mean(structural_rate, na.rm = TRUE),
max_rate = max(structural_rate, na.rm = TRUE),
.groups = "drop"
)
student_profiles <- student_profiles %>%
mutate(archetype = case_when(
n_structural == 0 ~ "None",
prop_structural <= 0.25 ~ "Occasional",
prop_structural <= 0.75 ~ "Mixed",
prop_structural > 0.75 ~ "Consistent"
))
ggplot(student_profiles,
aes(x = factor(Section), fill = archetype)) +
geom_bar(position = "fill") +
theme_bw() +
labs(
x = "Section",
y = "Proportion of students",
fill = "Reasoning archetype",
title = "Student structural reasoning archetypes by section"
)
archetype_tab <- table(student_profiles$Section, student_profiles$archetype)
chisq.test(archetype_tab)
## Warning in chisq.test(archetype_tab): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: archetype_tab
## X-squared = 2.1101, df = 3, p-value = 0.5499
archetype_tab
##
## Consistent Mixed None Occasional
## 1 1 18 30 31
## 2 0 17 17 19
ggplot(student_profiles, aes(x = factor(Section), y = prop_structural)) +
geom_violin(fill = "grey85") +
geom_boxplot(width = 0.15, outlier.shape = NA) +
geom_jitter(width = 0.08, alpha = 0.6) +
theme_bw() +
labs(
x = "Section",
y = "Proportion of responses with structural reasoning",
title = "Cross-topic consistency in structural reasoning"
)
Social Engagement