Setting up Data

Participants who submitted utility: We have 20 participants who responded to immediate follow-up questions regarding the utility of CARE. There are 91 response data points for the utility survey.

System logs of participants who have at least two practice sessions during MRT (independent of having submitted utility measures): We have 22 participants who had at least two practice sessions. There are 157 data points for those who had at least two practices.

Merging utility with system logs of participants who have at least two practice sessions during MRT: We have 18 participants who responded to immediate follow-up questions regarding the utility of CARE. There are 77 response data points for the utility survey.

# Preview the first rows of df_merged, which carries both the system-log
# columns (session duration, utterance counts, next-session info) and the
# utility-survey columns (q1_insight, q2_valuable, combined_q1_q2).
head(df_merged)
##   px                current_chat_code show_ai_feedback
## 1 P1 4e46082d86de41f992e775dad8e1452b                0
## 2 P1 3b24a874839c444685284fa57b82113e                1
## 3 P1 51dd1fc0ad2c4798a6814dfd692c9a77                1
## 4 P1 b38a5cce414342f28e2cfc6626435a17                1
## 5 P1 4af5bd3af5144657850a4cc2ca047837                0
## 6 P1 953278ec313d471aa2ccc50f63606acf                0
##   show_utterance_reflections                created_at
## 1                          0 2025-02-04 16:47:50+00:00
## 2                          0 2025-02-11 02:35:53+00:00
## 3                          1 2025-02-19 22:42:59+00:00
## 4                          1 2025-02-19 23:01:19+00:00
## 5                          1 2025-02-22 19:27:21+00:00
## 6                          0 2025-02-22 19:42:23+00:00
##   current_session_duration_seconds current_utterance_count
## 1                              642                      31
## 2                              556                      54
## 3                              559                      24
## 4                              338                      14
## 5                              504                      25
## 6                              368                      20
##                     next_chat_code time_between_sessions next_utterance_count
## 1 3b24a874839c444685284fa57b82113e                553041                   54
## 2 176042c01bb242169f0eed4b2d9f5713                   445                   32
## 3 b38a5cce414342f28e2cfc6626435a17                   540                   14
## 4 eb3d2a81af8e42779e823eef271a3963                245438                   18
## 5 953278ec313d471aa2ccc50f63606acf                   398                   20
## 6 218adcbe3afd482386c9329acbecdbde                710045                   23
##   next_session_duration_seconds                          condition
## 1                           556           Session Reflections Only
## 2                           647   AI Feedback, Session Reflections
## 3                           338 AI Feedback, Utterance Reflections
## 4                           486 AI Feedback, Utterance Reflections
## 5                           368         Utterance Reflections Only
## 6                           643           Session Reflections Only
##   next_attempt_within_24_hours q1_insight q2_valuable combined_q1_q2 avail
## 1                        False          6           6            6.0     1
## 2                         True          5           6            5.5     1
## 3                         True          6           6            6.0     1
## 4                        False          6           6            6.0     1
## 5                         True          5           6            5.5     1
## 6                        False          6           6            6.0     1
# Preview the first rows of df_utility: one row per utility-survey response,
# with the treatment indicators, condition label, and availability flag.
head(df_utility)
##   px                        chat_code q1_insight q2_valuable combined_q1_q2
## 1 P1 4e46082d86de41f992e775dad8e1452b          6           6            6.0
## 2 P1 3b24a874839c444685284fa57b82113e          5           6            5.5
## 3 P1 51dd1fc0ad2c4798a6814dfd692c9a77          6           6            6.0
## 4 P1 b38a5cce414342f28e2cfc6626435a17          6           6            6.0
## 5 P1 4af5bd3af5144657850a4cc2ca047837          5           6            5.5
## 6 P1 953278ec313d471aa2ccc50f63606acf          6           6            6.0
##                  created_at show_ai_feedback show_utterance_reflections
## 1 2025-02-04 16:47:50+00:00                0                          0
## 2 2025-02-11 02:35:53+00:00                1                          0
## 3 2025-02-19 22:42:59+00:00                1                          1
## 4 2025-02-19 23:01:19+00:00                1                          1
## 5 2025-02-22 19:27:21+00:00                0                          1
## 6 2025-02-22 19:42:23+00:00                0                          0
##                            condition avail
## 1           Session Reflections Only     1
## 2   AI Feedback, Session Reflections     1
## 3 AI Feedback, Utterance Reflections     1
## 4 AI Feedback, Utterance Reflections     1
## 5         Utterance Reflections Only     1
## 6           Session Reflections Only     1

Test 1: Causal Excursion Effect Estimation for MRT with Continuous Outcomes - No controls

We’ll just perform an effect estimation of the binary intervention, whether AI feedback was shown or not, on the proximal outcome of utility.

# Test 1a: marginal causal excursion effect of showing AI feedback on the
# continuous utility outcome, fit on the utility-survey data (df_utility).
# wcls() is presumably from the MRTAnalysis package; the library() call is
# not visible in this part of the document.
#
#   id                - participant identifier column ("px")
#   outcome           - combined_q1_q2; in the previewed rows it equals the
#                       mean of q1_insight and q2_valuable
#   treatment         - show_ai_feedback, a 0/1 indicator
#   rand_prob         - constant randomization probability of 0.5
#   moderator_formula - ~1: no moderators, so one marginal effect is reported
#   control_formula   - ~1: intercept only, no control covariates
#   availability      - "avail" column (1 in every previewed row)
fit1 <- wcls(
    data = df_utility,
    id = "px",
    outcome = "combined_q1_q2",
    treatment = "show_ai_feedback",
    rand_prob = 0.5,
    moderator_formula = ~1,
    control_formula = ~1,
    availability = "avail"
)
## Constant randomization probability 0.5 is used.
## Constant numerator probability 0.5 is used.
# Report the estimated effect with its 95% CI, standard error, and p-value.
summary(fit1)
## $call
## wcls(data = df_utility, id = "px", outcome = "combined_q1_q2", 
##     treatment = "show_ai_feedback", rand_prob = 0.5, moderator_formula = ~1, 
##     control_formula = ~1, availability = "avail")
## 
## $causal_excursion_effect
##             Estimate 95% LCL 95% UCL StdErr Hotelling df1 df2 p-value
## (Intercept)    0.587   0.113    1.06  0.225      6.78   1  18   0.018

Next, we’ll perform effect estimation of the binary intervention, whether utterance reflections were used or not (i.e., session reflections instead), on the proximal outcome of utility.

# Test 1b: same specification as fit1 (df_utility, no moderators, no
# controls, constant randomization probability 0.5), but the treatment is
# show_utterance_reflections. Per the condition labels in the data preview,
# rows with show_utterance_reflections = 0 are "Session Reflections" rows.
fit2 <- wcls(
    data = df_utility,
    id = "px",
    outcome = "combined_q1_q2",
    treatment = "show_utterance_reflections",
    rand_prob = 0.5,
    moderator_formula = ~1,
    control_formula = ~1,
    availability = "avail"
)
## Constant randomization probability 0.5 is used.
## Constant numerator probability 0.5 is used.
# Report the estimated effect with its 95% CI, standard error, and p-value.
summary(fit2)
## $call
## wcls(data = df_utility, id = "px", outcome = "combined_q1_q2", 
##     treatment = "show_utterance_reflections", rand_prob = 0.5, 
##     moderator_formula = ~1, control_formula = ~1, availability = "avail")
## 
## $causal_excursion_effect
##             Estimate 95% LCL 95% UCL StdErr Hotelling df1 df2 p-value
## (Intercept)    0.276   -0.67    1.22   0.45     0.375   1  18   0.548

Test 2: Estimation with Utility Proximal Variable, with system-log variables as Controls

Prior to analysis, we should do some transformations of the system-log variables.

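Again, we’ll estimate the effect of the binary intervention, whether AI feedback was shown or not, on the proximal outcome of utility. This time, we’ll control for several system-log variables, adding them one at a time: current_session_duration_seconds_log, current_utterance_count_log, and time_between_sessions_days. Below, we see that the effect estimates all hover around marginal significance (p ≈ 0.05), with wider CIs than in Test 1 (StdErr ≈ 0.27 vs. 0.225). So these controls don’t seem to help reduce the standard error. Note, however, that these models are fit on df_merged (18 participants, 77 observations) rather than df_utility (20 participants, 91 observations), so part of the loss in precision may simply reflect the smaller sample.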

# Test 2a: effect of show_ai_feedback on combined_q1_q2, now fit on
# df_merged (utility responses joined with system logs) and adjusting for
# one control covariate, current_session_duration_seconds_log.
# The *_log column is presumably a log transform of the raw duration; the
# transformation code is not shown in this part of the document.
# NOTE(review): confirm that the current session's duration is not itself
# affected by the treatment at this decision point before using it as a
# control.
fit3 <- wcls(
    data = df_merged,
    id = "px",
    outcome = "combined_q1_q2",
    treatment = "show_ai_feedback",
    rand_prob = 0.5,
    moderator_formula = ~1,
    control_formula = ~current_session_duration_seconds_log,
    availability = "avail"
)
## Constant randomization probability 0.5 is used.
## Constant numerator probability 0.5 is used.
# Report the estimated effect with its 95% CI, standard error, and p-value.
summary(fit3)
## $call
## wcls(data = df_merged, id = "px", outcome = "combined_q1_q2", 
##     treatment = "show_ai_feedback", rand_prob = 0.5, moderator_formula = ~1, 
##     control_formula = ~current_session_duration_seconds_log, 
##     availability = "avail")
## 
## $causal_excursion_effect
##             Estimate 95% LCL 95% UCL StdErr Hotelling df1 df2 p-value
## (Intercept)    0.596  0.0155    1.18  0.272      4.79   1  15  0.0449
# Test 2b: same model as fit3 with a second control covariate added,
# current_utterance_count_log (presumably a log transform of
# current_utterance_count; the transformation code is not shown here).
fit4 <- wcls(
    data = df_merged,
    id = "px",
    outcome = "combined_q1_q2",
    treatment = "show_ai_feedback",
    rand_prob = 0.5,
    moderator_formula = ~1,
    control_formula = ~current_session_duration_seconds_log + current_utterance_count_log,
    availability = "avail"
)
## Constant randomization probability 0.5 is used.
## Constant numerator probability 0.5 is used.
# Report the estimated effect with its 95% CI, standard error, and p-value.
summary(fit4)
## $call
## wcls(data = df_merged, id = "px", outcome = "combined_q1_q2", 
##     treatment = "show_ai_feedback", rand_prob = 0.5, moderator_formula = ~1, 
##     control_formula = ~current_session_duration_seconds_log + 
##         current_utterance_count_log, availability = "avail")
## 
## $causal_excursion_effect
##             Estimate 95% LCL 95% UCL StdErr Hotelling df1 df2 p-value
## (Intercept)    0.599  0.0169    1.18  0.271      4.87   1  14  0.0445
# Test 2c: same model as fit4 with a third control covariate added,
# time_between_sessions_days.
# NOTE(review): in head(df_merged), time_between_sessions for row 1
# (553041 s) equals the gap from the end of the 2025-02-04 session to the
# start of the next session on 2025-02-11. If time_between_sessions_days is
# derived from that column, it is realized after the treatment (and possibly
# after the utility response) at this decision point. Controls in a WCLS
# model are expected to be pre-treatment covariates, so confirm this
# variable is appropriate here.
fit5 <- wcls(
    data = df_merged,
    id = "px",
    outcome = "combined_q1_q2",
    treatment = "show_ai_feedback",
    rand_prob = 0.5,
    moderator_formula = ~1,
    control_formula = ~current_session_duration_seconds_log + current_utterance_count_log + time_between_sessions_days,
    availability = "avail"
)
## Constant randomization probability 0.5 is used.
## Constant numerator probability 0.5 is used.
# Report the estimated effect with its 95% CI, standard error, and p-value.
summary(fit5)
## $call
## wcls(data = df_merged, id = "px", outcome = "combined_q1_q2", 
##     treatment = "show_ai_feedback", rand_prob = 0.5, moderator_formula = ~1, 
##     control_formula = ~current_session_duration_seconds_log + 
##         current_utterance_count_log + time_between_sessions_days, 
##     availability = "avail")
## 
## $causal_excursion_effect
##             Estimate 95% LCL 95% UCL StdErr Hotelling df1 df2 p-value
## (Intercept)     0.55 -0.0403    1.14  0.273      4.05   1  13  0.0653

Test 3: Binary proximal outcome of trying another practice in the next 24 hours

Taking the outcome to be whether the participant attempts another practice session within the next 24 hours, does assignment to the “AI feedback” condition make a further practice attempt more likely?

# Test 3: marginal effect of show_ai_feedback on the binary outcome
# next_attempt_within_24_hours, fit on the system-log data (df_system) with
# no moderators and no control covariates. emee() is presumably the
# binary-outcome estimator from the MRTAnalysis package.
# NOTE(review): in head(df_merged) this outcome prints as "True"/"False";
# confirm that df_system stores it as numeric 0/1, as a binary-outcome
# estimator would expect.
# NOTE(review): the MRTAnalysis documentation describes emee effects on the
# log relative risk scale; confirm before interpreting the estimate as a
# difference in probabilities.
fit6 <- emee(
    data = df_system,
    id = "px",
    outcome = "next_attempt_within_24_hours",
    treatment = "show_ai_feedback",
    rand_prob = 0.5,
    moderator_formula = ~1,
    control_formula = ~1,
    availability = "avail"
)
## Constant randomization probability 0.5 is used.
## Constant numerator probability 0.5 is used.
# Report the estimated effect with its 95% CI, standard error, and p-value.
summary(fit6)
## $call
## emee(data = df_system, id = "px", outcome = "next_attempt_within_24_hours", 
##     treatment = "show_ai_feedback", rand_prob = 0.5, moderator_formula = ~1, 
##     control_formula = ~1, availability = "avail")
## 
## $causal_excursion_effect
##             Estimate 95% LCL 95% UCL StdErr t_value df p-value
## (Intercept)    0.184  -0.304   0.672  0.234   0.786 20   0.441