#Loading packages

library(lme4)
## Loading required package: Matrix
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(lavaan)
## This is lavaan 0.6-21
## lavaan is FREE software! Please report any bugs.
library(asherR)
library(reshape2)

#Loading data

source("../Scripts/S25_Chem1041_Data_Combination.R", chdir = T)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'janitor'
## 
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
## 
## 
## |-------------------------------------|
## | misty 0.8.1 (2026-03-06)            |
## | Miscellaneous Functions T. Yanagida |
## |-------------------------------------|
## 
## 
## Attaching package: 'misty'
## 
## 
## The following object is masked from 'package:janitor':
## 
##     crosstab
## 
## 
## Rows: 983 Columns: 115
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): StartDate, EndDate, IPAddress, RecordedDate, ResponseId, Recipient...
## dbl (87): Status, Progress, Duration (in seconds), Finished, LocationLatitud...
## num  (1): race_b
## lgl  (9): RecipientLastName, RecipientFirstName, ExternalReference, userName...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10125 Columns: 52
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): StartDate, EndDate, IPAddress, RecordedDate, ResponseId, Recipient...
## dbl (39): Status, Progress, Duration (in seconds), Finished, LocationLatitud...
## lgl  (3): RecipientLastName, RecipientFirstName, ExternalReference
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 758 Columns: 123
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): StartDate, EndDate, IPAddress, RecordedDate, ResponseId, Recipient...
## dbl (96): Status, Progress, Duration (in seconds), Finished, LocationLatitud...
## lgl  (9): RecipientLastName, RecipientFirstName, ExternalReference, userName...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 974 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): student, loginid, recipient_email, stud_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 888 Columns: 89
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (39): loginid, Recitation Assignment - WEEK 1 - PRACTICE (21920682), Rec...
## dbl (50): FINAL LETTER GRADE (22222814), Initial Knowledge Check Completion ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 861 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): student, recipient_email, s25_enrollment_status, s25_program_code,...
## dbl (14): s25_last_term_enrolled, s25_uc_gpa_inst, s25_gpa, s25_college_code...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 861 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): recipient_email, f25_last_term_enrolled, f25_enrollment_status, f2...
## dbl (12): f25_uc_gpa_inst, f25_gpa, f25_college_code, f25_credits_taken, f25...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 861 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): recipient_email, s26_last_term_enrolled, s26_enrollment_status, s2...
## dbl (12): s26_uc_gpa_inst, s26_gpa, s26_college_code, s26_credits_taken, s26...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 847 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): stud_id
## dbl (13): te_code_coursework_b, te_code_performance_b, te_code_comprehension...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 655 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): stud_id
## dbl (24): mediadescribe_stem_code_f, mediadescribe_fiction_code_f, mediadesc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(recipient_email, student, loginid, stud_id)`
## Joining with `by = join_by(stud_id)`
## Warning in psych::alpha(chem_1041_baseline[, c("bel1_b", "bel2_b", "bel3_b", : Some items were negatively correlated with the first principal component and were automatically reversed.
##  This is indicated by a negative sign for the variable name.
## Warning in psych::alpha(chem_1041_diary[, c("sr1_eoc", "sr2_eoc", "sr3_eoc")], : Some items were negatively correlated with the first principal component and were automatically reversed.
##  This is indicated by a negative sign for the variable name.
## Warning in psych::alpha(chem_1041_final[, c("bel1_f", "bel2_f", "bel3_f", : Some items were negatively correlated with the first principal component and were automatically reversed.
##  This is indicated by a negative sign for the variable name.
## Warning in psych::alpha(chem_1041_final[, c("dr1_f", "dr2_f", "dr3_f", "dr4_f", : Some items were negatively correlated with the first principal component and were automatically reversed.
##  This is indicated by a negative sign for the variable name.

#Pivoting grades to long and creating wave

library(dplyr)
library(tidyr)

chem_1041_grades_long <- chem_1041_grades %>%
  select(stud_id,
         homework_week_2_21905056,
         homework_week_3_21905058,
         homework_week_4_22032186,
         homework_week_5_22046326,
         homework_week_6_22056124,
         homework_week_7_22063384,
         homework_week_8_22078166,
         homework_week_9_22078190,
         homework_week_11_22111120,
         homework_week_12_22127630,
         homework_week_13_22142184,
         homework_week_14_22149232,
         homework_week_15_22172800) %>%
  pivot_longer(
    cols = -stud_id,
    names_to = "hw_var",
    values_to = "homework"
  ) %>%
  mutate(
    wave = case_when(
      hw_var == "homework_week_2_21905056"  ~ 1,
      hw_var == "homework_week_3_21905058"  ~ 2,
      hw_var == "homework_week_4_22032186"  ~ 3,
      hw_var == "homework_week_5_22046326"  ~ 4,
      hw_var == "homework_week_6_22056124"  ~ 5,
      hw_var == "homework_week_7_22063384"  ~ 6,
      hw_var == "homework_week_8_22078166"  ~ 7,
      hw_var == "homework_week_9_22078190"  ~ 8,
      hw_var == "homework_week_11_22111120" ~ 9,
      hw_var == "homework_week_12_22127630" ~ 10,
      hw_var == "homework_week_13_22142184" ~ 11,
      hw_var == "homework_week_14_22149232" ~ 12,
      hw_var == "homework_week_15_22172800" ~ 13
    )
  ) %>%
  select(stud_id, wave, homework)

chem_1041_diary <- chem_1041_diary %>%
  left_join(chem_1041_grades_long, by = c("stud_id", "wave"))

#Creating wave 0 variable

chem_1041_diary <- chem_1041_diary %>% mutate(wave_c = wave - 1)

#Creating weekly variable for forecasting

chem_1041_diary <- chem_1041_diary %>%
  arrange(stud_id, wave) %>%
  group_by(stud_id) %>%
  mutate(
    comp_eoc_next = lead(comp_eoc),
    pred_error = comp_nw_eoc - comp_eoc_next,

    # person mean (between-person component)
    pred_error_pm = mean(pred_error, na.rm = TRUE),

    # group-mean centered (within-person deviation)
    pred_error_wc = pred_error - pred_error_pm
  ) %>%
  ungroup()

chem_1041_diary <- chem_1041_diary %>% mutate(pred_comp_eoc_abs = abs(pred_error))

chem_1041_diary <- chem_1041_diary %>% mutate(
  competence_calibration = case_when(
  pred_error > 0  ~ "overconfident",
  pred_error < 0  ~ "underconfident",
  pred_error == 0 ~ "accurate"
))

#Creating a lagged homework variable

chem_1041_diary <- chem_1041_diary %>%
  arrange(stud_id, wave) %>%
  group_by(stud_id) %>%
  mutate(homework_next = lead(homework)) %>%
  ungroup()

#Joining Data

chem_bf <- left_join(chem_1041_baseline, chem_1041_final, by = "stud_id")
chem_bf <- left_join(chem_bf, chem_1041_grades, by = "stud_id")
#chem_bf_diary_only <- left_join(chem_1040_diary_wide, chem_bf, by = "stud_id")

chem_all <- left_join(chem_1041_diary, chem_bf, by = "stud_id")

# Extract the first row
# first_row <- chem_1041_final[1, ]
# names_column <- data.frame(Names = unlist(first_row))

# write.csv(chem_1041_final, "s25_chem_1041_final.csv")
# write.csv(chem_1041_baseline, "s25_chem_1041_baseline.csv")
# write.csv(chem_1041_diary, "s25_chem_1041_diary.csv")
# write.csv(chem_1041_grades, "s25_chem_1041_grades.csv")
# write.csv(chem_1041_diary_wide, "s25_chem_1041_diary_wide.csv")

#Model

library(lme4)

# #Performance during the current week
# model <- lmer(
#   homework ~ pred_comp_eoc_wc + pred_comp_eoc_pm +
#              urm + female + fg + hs_gpa_scaled_grandZ + wave_c + (1 + wave_c || stud_id),
#   data = chem_all, control = lmerControl(optimizer = "bobyqa")
# )
# summary(model)
# #When a student is more overconfident than usual, they perform slightly better that same week. Students who are generally more overconfident across the semester perform worse overall.

#Performance during the following week
model2 <- lmer(
  homework_next ~ pred_error_wc + pred_error_pm +
                  urm + female + fg + hs_gpa_scaled_grandZ + wave_c + (1 + wave_c | stud_id),
  data = chem_all, control = lmerControl(optimizer = "bobyqa")
)

summary(model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: homework_next ~ pred_error_wc + pred_error_pm + urm + female +  
##     fg + hs_gpa_scaled_grandZ + wave_c + (1 + wave_c | stud_id)
##    Data: chem_all
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 35223
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.6870 -0.0075  0.0953  0.3234  4.4934 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr  
##  stud_id  (Intercept) 4.37541  2.0917         
##           wave_c      0.03394  0.1842   -0.18 
##  Residual             5.60266  2.3670         
## Number of obs: 7293, groups:  stud_id, 734
## 
## Fixed effects:
##                        Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)            13.77313    0.14848  716.62165  92.763  < 2e-16 ***
## pred_error_wc          -0.04228    0.03398 6342.80166  -1.244  0.21348    
## pred_error_pm          -0.34653    0.12474  656.38145  -2.778  0.00563 ** 
## urm                    -0.52926    0.24405  641.73263  -2.169  0.03047 *  
## female                  0.07590    0.17753  627.12812   0.428  0.66916    
## fg                      0.13525    0.24265  631.01059   0.557  0.57745    
## hs_gpa_scaled_grandZ    2.48716    0.30270  644.41851   8.216 1.15e-15 ***
## wave_c                 -0.07542    0.01086  501.05439  -6.946 1.17e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) prd_rrr_w prd_rrr_p urm    female fg     hs___Z
## pred_rrr_wc -0.012                                                
## pred_rrr_pm -0.172  0.000                                         
## urm         -0.139  0.004    -0.025                               
## female      -0.708  0.002     0.142    -0.074                     
## fg          -0.118 -0.003    -0.007    -0.213 -0.109              
## hs_gp_scl_Z  0.039  0.002    -0.135     0.123 -0.185  0.111       
## wave_c      -0.288  0.030     0.009     0.003  0.002  0.004 -0.006
#No week-to-week effect. Between-person: Students who are more overconfident on average across the semester tend to perform worse in subsequent weeks. Chronic overconfidence is maladaptive even though momentary fluctuations are not.
#Calibration does not drive learning or adjustment across weeks, but more overconfident students overall perform worse, suggesting poor self-knowledge/regulation

#Overall bias test

psych::describe(chem_all$pred_error, na.rm = TRUE)
##    vars    n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 7995 0.08 1.06      0    0.05 0.49  -6   6    12 0.32     3.54 0.01
#Is it signficant
bias_model <- lmer(
  pred_error ~ 1 + (1 | stud_id),
  data = chem_all
)

summary(bias_model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pred_error ~ 1 + (1 | stud_id)
##    Data: chem_all
## 
## REML criterion at convergence: 22206.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7568 -0.3883 -0.0170  0.3663  6.7021 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stud_id  (Intercept) 0.3539   0.5949  
##  Residual             0.7950   0.8916  
## Number of obs: 7995, groups:  stud_id, 825
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   0.08950    0.02326 779.27219   3.848 0.000129 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Students on average significantly overestimate their competence week-to-week

#Does bias change over time
bias_time <- lmer(
  pred_error ~ wave_c + (1 | stud_id),
  data = chem_all
)

summary(bias_time)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pred_error ~ wave_c + (1 | stud_id)
##    Data: chem_all
## 
## REML criterion at convergence: 22207
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7769 -0.3801 -0.0241  0.3680  6.6782 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stud_id  (Intercept) 0.3533   0.5944  
##  Residual             0.7942   0.8912  
## Number of obs: 7995, groups:  stud_id, 825
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  1.350e-01  2.772e-02  1.568e+03   4.873 1.21e-06 ***
## wave_c      -8.933e-03  2.963e-03  7.313e+03  -3.015  0.00258 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr)
## wave_c -0.545
#Early semester there is more overconfidence; later in the semester it is closer to accurate

#Who is most likely to be miscalibrated

library(lme4)

bias_model2 <- lmer(
  pred_error ~ urm + female + fg + hs_gpa_scaled_grandZ + (1 | stud_id),
  data = chem_all
)

summary(bias_model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pred_error ~ urm + female + fg + hs_gpa_scaled_grandZ + (1 |  
##     stud_id)
##    Data: chem_all
## 
## REML criterion at convergence: 20875.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7290 -0.3890 -0.0231  0.3805  6.7774 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stud_id  (Intercept) 0.3369   0.5804  
##  Residual             0.7894   0.8885  
## Number of obs: 7541, groups:  stud_id, 776
## 
## Fixed effects:
##                        Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)            0.208982   0.039655 741.433445   5.270 1.79e-07 ***
## urm                    0.049000   0.068474 761.884985   0.716 0.474460    
## female                -0.213184   0.049738 731.575478  -4.286 2.06e-05 ***
## fg                     0.003283   0.068111 748.903493   0.048 0.961570    
## hs_gpa_scaled_grandZ   0.309370   0.084232 766.377601   3.673 0.000257 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) urm    female fg    
## urm         -0.150                     
## female      -0.734 -0.072              
## fg          -0.123 -0.223 -0.110       
## hs_gp_scl_Z  0.031  0.128 -0.176  0.116
#Women are less overconfient than men (or more underconfident relative to men)
#Higher achieving students are more overconfident on average