Quiz Analysis

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
# Load the combined quiz export (228 unique users with quiz data).
quizzes <- read.csv("Quiz_data_combined.csv")

# Label each response by question type: text containing TRUE or FALSE
# (matched case-insensitively) marks an "attention" check; every other
# response is treated as a "viewpoint" question.
quizzes$Type <- ifelse(
  grepl("TRUE|FALSE", quizzes$Response.Text, ignore.case = TRUE),
  "attention",
  "viewpoint"
)

# Engagement is assessed on the attention-check rows only.
att.check <- quizzes[quizzes$Type == "attention", ]

# score is 1 when the Correct. column equals "TRUE" and 0 otherwise.
att.check$score <- as.numeric(att.check$Correct. == "TRUE")

# Mean attention-check score for every Topic x Section combination.
summarise(group_by(att.check, Topic, Section), average_score = mean(score))
## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## # Groups:   Topic [6]
##    Topic        Section average_score
##    <chr>        <chr>           <dbl>
##  1 BioDiv       Control         0.769
##  2 BioDiv       IA              0.863
##  3 Digestion    Control         1    
##  4 Digestion    IA              0.993
##  5 EnvInjustice Control         0.887
##  6 EnvInjustice IA              0.981
##  7 MolBiology   Control         0.927
##  8 MolBiology   IA              0.705
##  9 Respiration  Control         0.787
## 10 Respiration  IA              0.738
## 11 Vaccines     Control         0.815
## 12 Vaccines     IA              0.865
# Mean attention-check score per Section, pooled over all topics.
summarise(group_by(att.check, Section), average_score = mean(score))
## # A tibble: 2 × 2
##   Section average_score
##   <chr>           <dbl>
## 1 Control         0.852
## 2 IA              0.839
# Mean score for each Topic x Section cell (missing scores ignored); this
# table is the input to the faceted bar chart.
avg_scores <- summarise(
  group_by(att.check, Topic, Section),
  avg_score = mean(score, na.rm = TRUE)
)
## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.
# Bar chart of the mean score per Section, drawn as one facet per Topic.
# scales = "free" lets every facet choose its own axis ranges.
plot <- ggplot(avg_scores, aes(x = Section, y = avg_score)) +
  geom_col(position = "dodge", fill = "blue") +
  facet_wrap(~ Topic, scales = "free") +
  ggtitle("Average Score per Section Faceted by Topic") +
  xlab("Section") +
  ylab("Average Score") +
  theme_minimal()

# Render the chart.
print(plot)

# Linear mixed-effects model of score on Section across all topics, with a
# random intercept for each User.Email.
summary(
  lme(
    fixed = score ~ Section,
    random = ~ 1 | User.Email,
    data = att.check
  )
)
## Linear mixed-effects model fit by REML
##   Data: att.check 
##      AIC      BIC    logLik
##   1852.5 1875.401 -922.2498
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)  Residual
## StdDev:  0.04760064 0.3595739
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error   DF  t-value p-value
## (Intercept)  0.8519613 0.01315351 2040 64.77062  0.0000
## SectionIA   -0.0127315 0.01684294  225 -0.75590  0.4505
##  Correlation: 
##           (Intr)
## SectionIA -0.781
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3950415  0.3748525  0.4157131  0.4565736  0.6311915 
## 
## Number of Observations: 2267
## Number of Groups: 227
# One data frame of attention-check rows per Topic.
split_data <- split(att.check, att.check$Topic)

# Fit score ~ Section with a random intercept per User.Email.
#   df: data frame holding score, Section and User.Email columns.
#   Returns the fitted lme model object.
fit_model <- function(df) {
  lme(score ~ Section, random = ~ 1 | User.Email, data = df)
}

# Fit one model per topic; the list keeps the topic names from split().
models <- lapply(split_data, fit_model)

# Summarise every fitted model.
summaries <- lapply(models, summary)

# Print all per-topic summaries.
summaries
## $BioDiv
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   379.2015 395.2269 -185.6008
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept) Residual
## StdDev: 4.557599e-06 0.377255
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.7687500 0.02982463 204 25.775680  0.0000
## SectionIA   0.0941532 0.03825421 202  2.461251  0.0147
##  Correlation: 
##           (Intr)
## SectionIA -0.78 
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.2873209  0.3634061  0.3634061  0.6129806  0.6129806 
## 
## Number of Observations: 408
## Number of Groups: 204 
## 
## $Digestion
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC  logLik
##   -553.6341 -540.0234 280.817
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)   Residual
## StdDev:  0.06261081 0.02347914
## 
## Fixed effects:  score ~ Section 
##                  Value   Std.Error  DF   t-value p-value
## (Intercept)  1.0000000 0.007128196 222 140.28796  0.0000
## SectionIA   -0.0073529 0.009148169 222  -0.80376  0.4224
##  Correlation: 
##           (Intr)
## SectionIA -0.779
## 
## Standardized Within-Group Residuals:
##           Min            Q1           Med            Q3           Max 
## -5.212369e+00  2.364275e-14  3.861014e-02  3.861014e-02  3.861014e-02 
## 
## Number of Observations: 224
## Number of Groups: 224 
## 
## $EnvInjustice
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -21.44766 -6.050361 14.72383
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)  Residual
## StdDev:  0.02535476 0.2271112
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF  t-value p-value
## (Intercept) 0.8873239 0.01929483 174 45.98766   0e+00
## SectionIA   0.0933535 0.02505320 173  3.72621   3e-04
##  Correlation: 
##           (Intr)
## SectionIA -0.77 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -4.26657464  0.08301035  0.08301035  0.48406097  0.53760489 
## 
## Number of Observations: 349
## Number of Groups: 175 
## 
## $MolBiology
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   433.7067 449.9244 -212.8533
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept) Residual
## StdDev: 1.519978e-05 0.393839
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.9268293 0.03075366 214 30.137204       0
## SectionIA   -0.2222838 0.03915766 212 -5.676637       0
##  Correlation: 
##           (Intr)
## SectionIA -0.785
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3533203  0.1857884  0.1857884  0.7501912  0.7501912 
## 
## Number of Observations: 428
## Number of Groups: 214 
## 
## $Respiration
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   516.3417 532.6339 -254.1709
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.652548e-05 0.4292832
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.7869822 0.03302179 217 23.832212  0.0000
## SectionIA   -0.0491545 0.04219764 217 -1.164864  0.2454
##  Correlation: 
##           (Intr)
## SectionIA -0.783
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8332471  0.4962173  0.4962173  0.6107210  0.6107210 
## 
## Number of Observations: 436
## Number of Groups: 219 
## 
## $Vaccines
## Linear mixed-effects model fit by REML
##   Data: df 
##       AIC     BIC    logLik
##   354.685 370.846 -173.3425
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 3.733875e-05 0.3609937
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.8148148 0.02836234 211 28.728755  0.0000
## SectionIA   0.0505698 0.03613364 209  1.399522  0.1631
##  Correlation: 
##           (Intr)
## SectionIA -0.785
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3972292  0.3729023  0.3729023  0.5129873  0.5129873 
## 
## Number of Observations: 422
## Number of Groups: 211

# Remove a user's data for a topic when their attention-check total for that
# topic is 0.

# Total attention-check score for every (User.Email, Topic) pair, with
# missing scores ignored.
total_scores <- ungroup(
  summarise(
    group_by(att.check, User.Email, Topic),
    total_score = sum(score, na.rm = TRUE)
  )
)
## `summarise()` has grouped output by 'User.Email'. You can override using the
## `.groups` argument.
# (User.Email, Topic) pairs whose attention-check total is zero.
zero_scores <- total_scores %>%
  filter(total_score == 0) %>%
  select(User.Email, Topic)

# Drop every attention-check row that belongs to one of those pairs.
att.check <- att.check %>%
  anti_join(zero_scores, by = c("User.Email", "Topic"))

# Recompute the per-user, per-topic totals on the filtered data.
sum_scores <- att.check %>%
  group_by(User.Email, Topic) %>%
  summarize(total_score = sum(score, na.rm = TRUE)) %>%
  ungroup()

# Enforce the sanity check instead of relying on eyeballing sum_scores:
# no remaining (user, topic) pair may have a total of 0.
stopifnot(all(sum_scores$total_score != 0))
## `summarise()` has grouped output by 'User.Email'. You can override using the
## `.groups` argument.
# Mean attention-check score per Topic x Section on the filtered data.
summarise(group_by(att.check, Topic, Section), average_score = mean(score))
## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## # Groups:   Topic [6]
##    Topic        Section average_score
##    <chr>        <chr>           <dbl>
##  1 BioDiv       Control         0.778
##  2 BioDiv       IA              0.877
##  3 Digestion    Control         1    
##  4 Digestion    IA              1    
##  5 EnvInjustice Control         0.9  
##  6 EnvInjustice IA              0.981
##  7 MolBiology   Control         0.95 
##  8 MolBiology   IA              0.710
##  9 Respiration  Control         0.796
## 10 Respiration  IA              0.749
## 11 Vaccines     Control         0.825
## 12 Vaccines     IA              0.886
# Mean attention-check score per Section on the filtered data.
summarise(group_by(att.check, Section), average_score = mean(score))
## # A tibble: 2 × 2
##   Section average_score
##   <chr>           <dbl>
## 1 Control         0.864
## 2 IA              0.850
library(nlme)
# Same mixed model as before (score ~ Section, random intercept per
# User.Email), now fitted to the filtered attention-check data.
summary(
  lme(
    fixed = score ~ Section,
    random = ~ 1 | User.Email,
    data = att.check
  )
)
## Linear mixed-effects model fit by REML
##   Data: att.check 
##        AIC      BIC    logLik
##   1697.693 1720.543 -844.8465
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 0.0001212476 0.3519632
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error   DF  t-value p-value
## (Intercept)  0.8636884 0.01191216 2011 72.50480  0.0000
## SectionIA   -0.0138716 0.01525296  225 -0.90944  0.3641
##  Correlation: 
##           (Intr)
## SectionIA -0.781
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.4539173  0.3872892  0.4267008  0.4267012  0.4267021 
## 
## Number of Observations: 2238
## Number of Groups: 227
# Re-split the filtered attention-check rows into one data frame per Topic.
split_data <- split(att.check, att.check$Topic)

# Fit score ~ Section with a random intercept per User.Email.
#   df: data frame holding score, Section and User.Email columns.
#   Returns the fitted lme model object.
fit_model <- function(df) {
  lme(score ~ Section, random = ~ 1 | User.Email, data = df)
}

# Fit one model per topic; the list keeps the topic names from split().
models <- lapply(split_data, fit_model)

# Summarise every fitted model.
summaries <- lapply(models, summary)

# Print all per-topic summaries.
summaries
## $BioDiv
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC     BIC    logLik
##   349.4332 365.399 -170.7166
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 8.703664e-06 0.3659177
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.7784810 0.02911085 201 26.741953   0.000
## SectionIA   0.0985682 0.03736569 199  2.637932   0.009
##  Correlation: 
##           (Intr)
## SectionIA -0.779
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3968481  0.3360067  0.3360067  0.6053792  0.6053792 
## 
## Number of Observations: 402
## Number of Groups: 201 
## 
## $Digestion
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -15313.71 -15300.11 7660.853
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)     Residual
## StdDev: 2.084409e-16 1.388871e-17
## 
## Fixed effects:  score ~ Section 
##             Value    Std.Error  DF       t-value p-value
## (Intercept)     1 2.226914e-17 221  4.490518e+16  0.0000
## SectionIA       0 2.862128e-17 221 -1.000000e+00  0.4207
##  Correlation: 
##           (Intr)
## SectionIA -0.778
## 
## Standardized Within-Group Residuals:
## Min  Q1 Med  Q3 Max 
##   0   0   0   0   0 
## 
## Number of Observations: 223
## Number of Groups: 223 
## 
## $EnvInjustice
## Linear mixed-effects model fit by REML
##   Data: df 
##         AIC       BIC   logLik
##   -51.04712 -35.67294 29.52356
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)  Residual
## StdDev: 1.48505e-05 0.2188422
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF  t-value p-value
## (Intercept) 0.9000000 0.01849554 173 48.66038   0e+00
## SectionIA   0.0806763 0.02394675 172  3.36899   9e-04
##  Correlation: 
##           (Intr)
## SectionIA -0.772
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -4.48120344  0.08829957  0.08829957  0.45695030  0.45695032 
## 
## Number of Observations: 347
## Number of Groups: 174 
## 
## $MolBiology
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   404.0105 420.1715 -198.0052
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.207588e-05 0.3828284
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.9500000 0.03026525 211 31.389139       0
## SectionIA   -0.2400763 0.03841048 209 -6.250281       0
##  Correlation: 
##           (Intr)
## SectionIA -0.788
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.4815293  0.1306068  0.1306068  0.7577189  0.7577189 
## 
## Number of Observations: 422
## Number of Groups: 211 
## 
## $Respiration
## Linear mixed-effects model fit by REML
##   Data: df 
##        AIC      BIC    logLik
##   496.4418 512.6782 -244.2209
## 
## Random effects:
##  Formula: ~1 | User.Email
##         (Intercept)  Residual
## StdDev: 7.42436e-06 0.4228163
## 
## Fixed effects:  score ~ Section 
##                  Value  Std.Error  DF   t-value p-value
## (Intercept)  0.7964072 0.03271851 214 24.341184  0.0000
## SectionIA   -0.0473578 0.04183597 214 -1.131987  0.2589
##  Correlation: 
##           (Intr)
## SectionIA -0.782
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8835774  0.4815160  0.4815160  0.5935215  0.5935215 
## 
## Number of Observations: 430
## Number of Groups: 216 
## 
## $Vaccines
## Linear mixed-effects model fit by REML
##   Data: df 
##       AIC      BIC   logLik
##   308.808 324.8921 -150.404
## 
## Random effects:
##  Formula: ~1 | User.Email
##          (Intercept)  Residual
## StdDev: 1.685294e-05 0.3441218
## 
## Fixed effects:  score ~ Section 
##                 Value  Std.Error  DF   t-value p-value
## (Intercept) 0.8250000 0.02720522 207 30.325062  0.0000
## SectionIA   0.0608268 0.03473246 205  1.751295  0.0814
##  Correlation: 
##           (Intr)
## SectionIA -0.783
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.5741660  0.3317814  0.3317814  0.5085408  0.5085408 
## 
## Number of Observations: 414
## Number of Groups: 207
library(ggplot2)

# Mean score for each Topic x Section cell on the filtered data (missing
# scores ignored).
avg_scores <- summarise(
  group_by(att.check, Topic, Section),
  avg_score = mean(score, na.rm = TRUE)
)
## `summarise()` has grouped output by 'Topic'. You can override using the
## `.groups` argument.
# Bar chart of the mean score per Section, one facet per Topic, for the
# filtered data. scales = "free" lets every facet pick its own axis ranges.
plot <- ggplot(avg_scores, aes(x = Section, y = avg_score)) +
  geom_col(position = "dodge", fill = "blue") +
  facet_wrap(~ Topic, scales = "free") +
  ggtitle("Average Score per Section Faceted by Topic") +
  xlab("Section") +
  ylab("Average Score") +
  theme_minimal()

# Render the chart.
print(plot)

# Assess viewpoint questions only.
viewpoint <- subset(quizzes, Type == "viewpoint")

# Drop a user's viewpoint responses for any topic on which their
# attention-check total was 0.
viewpoint <- viewpoint %>%
  anti_join(zero_scores, by = c("User.Email", "Topic"))

# Convert Likert response text to a 1-5 score (1 = Strongly Disagree,
# 5 = Strongly Agree); text matching none of the patterns yields NA.
#   response_text: character vector of response labels.
#   Returns a numeric vector the same length as response_text.
# Matching is case-insensitive, so "Disagree" also matches the pattern
# "Agree"; the order of the tests below is therefore significant.
# "Strongly Agree" must be tested before the plain "Agree" rule, otherwise
# it is caught by that rule and scored 4 instead of 5.
likert_score <- function(response_text) {
  has <- function(pattern) grepl(pattern, response_text, ignore.case = TRUE)
  case_when(
    has("Neither Agree nor Disagree") ~ 3,
    has("Strongly Agree") ~ 5,
    has("Agree") & !has("Disagree") ~ 4,
    has("Strongly Disagree") ~ 1,
    has("Disagree") ~ 2,
    TRUE ~ NA_real_
  )
}

viewpoint$score <- likert_score(viewpoint$Response.Text)

# Average score per question (and topic) within each section, ignoring NA
# scores.
average_scores <- viewpoint %>%
  group_by(Section, Question.Text, Topic) %>%
  summarize(avg_score = mean(score, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Section', 'Question.Text'. You can
## override using the `.groups` argument.
# Difference between the two sections for each question.
library(tidyr)

# Reshape the per-question averages so that every Section value becomes its
# own column, leaving one row per Question.Text/Topic combination.
pivoted_scores <- pivot_wider(
  average_scores,
  names_from = Section,
  values_from = avg_score
)

# Show the column names; the "Control" and "IA" columns are used next.
print(colnames(pivoted_scores))
## [1] "Question.Text" "Topic"         "Control"       "IA"
# IA minus Control for each question; the result is NA when either section
# has no average for that question.
pivoted_scores <- mutate(pivoted_scores, score_difference = IA - Control)

# Print the table including the differences.
print(pivoted_scores)
## # A tibble: 47 × 5
##    Question.Text                            Topic Control    IA score_difference
##    <chr>                                    <chr>   <dbl> <dbl>            <dbl>
##  1 "Active advocacy for and support of pol… BioD…    3.81 NA            NA      
##  2 "Choosing to consume products that are … BioD…    3.91 NA            NA      
##  3 "Communities affected by environmental … EnvI…    3.81  3.87          0.0595 
##  4 "Dietary supplements, including vitamin… Dige…    3.29  3.75          0.465  
##  5 "Economic development justifies environ… EnvI…    2.97  3.03          0.0577 
##  6 "Editing the human genome is a risk to … MolB…    3.75  3.84          0.0885 
##  7 "Education about the food web and human… BioD…    3.86  3.85         -0.00773
##  8 "Environmental injustices are primarily… EnvI…    3.63  3.69          0.0607 
##  9 "Ethical considerations, such as equity… Vacc…    3.81  3.63         -0.183  
## 10 "Gene editing technology has already sa… MolB…    3.55 NA            NA      
## # ℹ 37 more rows
library(DT)

# Interactive HTML table of the per-question section comparison.
datatable(
  pivoted_scores,
  options = list(
    # Number of rows shown per page.
    pageLength = 57,
    # Allow horizontal scrolling when the table is too wide.
    scrollX = TRUE,
    columnDefs = list(
      # Attach the 'wrap' class to column indices 0 through ncol.
      list(targets = seq(0, ncol(pivoted_scores)), className = "wrap")
    )
  )
)
# Average viewpoint score per Section within each Topic, ignoring NA scores.
average_scores_topic <- viewpoint %>%
  group_by(Section, Topic) %>%
  summarize(avg_score = mean(score, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Section'. You can override using the
## `.groups` argument.
# Calculate the difference between sections for each topic.
library(tidyr)
# Pivot the per-topic averages so each Section becomes its own column.
# This must use `average_scores_topic`; pivoting `average_scores` (the
# per-question table) only reproduces the per-question comparison.
pivoted_scores <- average_scores_topic %>%
  pivot_wider(names_from = Section, values_from = avg_score)

# Show the column names; the "Control" and "IA" columns are used next.
print(colnames(pivoted_scores))

# IA minus Control for each topic.
pivoted_scores <- pivoted_scores %>%
  mutate(score_difference = IA - Control)

# Print the per-topic table including the differences.
print(pivoted_scores)
# NOTE: the output previously recorded at this point was a copy of the
# per-question table (47 rows keyed by Question.Text), produced by pivoting
# the wrong data frame. Re-render the document to regenerate the per-topic
# output.
library(DT)

# Interactive HTML table of the per-topic section comparison.
datatable(
  pivoted_scores,
  options = list(
    pageLength = 57,             # Number of rows shown per page
    scrollX = TRUE,              # Enables horizontal scrolling if necessary
    columnDefs = list(
      list(targets = 0:ncol(pivoted_scores), className = 'wrap')  # Apply text wrapping
    )
  )
)

# Merge the available viewpoint scores and data?

# Read the two Bio1010 section files (S1, S2) and the pre-survey responses.
s1 <- read.csv("Bio1010S1.csv")
s2 <- read.csv("Bio1010S2.csv")
pre <- read.csv("PreResponses.csv")

library("dplyr")

# Stack both section files, recording the origin of each row in `source`,
# then discard the fifth column of the stacked table.
combined_df <- bind_rows(s1 = s1, s2 = s2, .id = "source")
combined_df <- combined_df[, -5]

# Discard the first two rows of the survey export.
# NOTE(review): presumably extra header rows from the survey tool - confirm.
pre <- pre[-(1:2), ]

# Q141 is the student ID in email form: keep its first 7 characters,
# lower-cased, so it can be compared with SIS.Login.ID.
pre$Q141 <- tolower(substr(pre$Q141, 1, 7))

# Keep respondents who consented (Q1.2) and whose Progress is "100".
pre <- pre[which(pre$Q1.2 == "I Consent" & pre$Progress == "100"), ]

# Flag every row of the stacked table whose login ID appears among the
# retained pre-survey respondents.
combined_df$Pre_Survey <- if_else(
  combined_df$SIS.Login.ID %in% unique(pre$Q141),
  "yes",
  "no"
)

write.csv(combined_df, "combined_df.csv", row.names = FALSE)

# Distinct IDs on each side of the match.
unique_Q141 <- unique(pre$Q141)
unique_SIS_Login_ID <- unique(combined_df$SIS.Login.ID)

# Survey IDs (Q141) that have no matching SIS.Login.ID, shown as a table.
values_not_present <- setdiff(unique_Q141, unique_SIS_Login_ID)
as.data.frame(values_not_present)

# Rows of the stacked table flagged as having a pre-survey response.
pre_only <- combined_df[combined_df$Pre_Survey == "yes", ]
# Re-read the two section files and load the post-survey responses. Because
# `combined_df` is rebuilt from the raw files here, the Pre_Survey column
# added earlier is not carried over, and no column is dropped this time.
s1 <- read.csv("Bio1010S1.csv")
s2 <- read.csv("Bio1010S2.csv")
post <- read.csv("PostResponses.csv")

library("dplyr")

# Stack both section files, recording the origin of each row in `source`.
combined_df <- bind_rows(s1 = s1, s2 = s2, .id = "source")

# Discard the first two rows of the survey export.
# NOTE(review): presumably extra header rows from the survey tool - confirm.
post <- post[-(1:2), ]

# Q141 is the student ID in email form: keep its first 7 characters,
# lower-cased, so it can be compared with SIS.Login.ID.
post$Q141 <- tolower(substr(post$Q141, 1, 7))

# Keep respondents who consented (Q1.2) and whose Progress is "100".
post <- post[which(post$Q1.2 == "I Consent" & post$Progress == "100"), ]

# Flag every row of the stacked table whose login ID appears among the
# retained post-survey respondents.
combined_df$Post_Survey <- if_else(
  combined_df$SIS.Login.ID %in% unique(post$Q141),
  "yes",
  "no"
)

write.csv(combined_df, "combined_df_post.csv", row.names = FALSE)

# Distinct IDs on each side of the match.
unique_Q141 <- unique(post$Q141)
unique_SIS_Login_ID <- unique(combined_df$SIS.Login.ID)

# Survey IDs (Q141) that have no matching SIS.Login.ID, shown as a table.
values_not_present <- setdiff(unique_Q141, unique_SIS_Login_ID)
as.data.frame(values_not_present)

# Rows of the stacked table flagged as having a post-survey response.
post_only <- combined_df[combined_df$Post_Survey == "yes", ]

# archive combined

# write.csv(combined_df, "combined_archived.csv")

# `combined_df` was rebuilt from the raw section files for the post-survey
# step, so it carries Post_Survey but not Pre_Survey. Recompute the
# pre-survey flag from `pre` here; without it the subset below fails with
# "object 'Pre_Survey' not found".
combined <- combined_df %>%
  mutate(
    Pre_Survey = if_else(SIS.Login.ID %in% unique(pre$Q141), "yes", "no")
  )

# Participants with both a pre- and a post-survey response
# (author's note: 141 total participants have both).
both <- subset(combined, Pre_Survey == "yes" & Post_Survey == "yes")

Variable key: Q134_1 = age; Q138 = year; Q140 = major; W146 = country of birth; Q149_1 = Q2.4 = gender; Q2.6 = Race/Ethnicity; Q2.7 = First Gen

# Stack the pre- and post-survey responses; `source` records which survey
# each row came from.
pre_post_all <- bind_rows(pre = pre, post = post, .id = "source")

# Keep respondents who consented (Q1.2) and whose Progress is "100".
pre_post_all <- pre_post_all[
  which(pre_post_all$Q1.2 == "I Consent" & pre_post_all$Progress == "100"),
]

# Distinct respondent IDs across both surveys, printed for inspection
# (author's note: 250 in "all", 141 for people who did "both").
unique_pre_post_all <- unique(pre_post_all$Q141)
unique_pre_post_all


# Number of rows for each distinct Q2.4 value.
count(pre_post_all, Q2.4)

# Number of rows for each distinct Q138 value.
count(pre_post_all, Q138)