Data Processing: Varsity Tutors, Kent SD

Author

Alexis Davila

Clean Data

Import

Packages selected for inspecting and cleaning Varsity Tutors, district data from Kent School District.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import roster.

roster <- read_csv("roster_vt_kent.csv")
Rows: 638 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): Grade, School, MOY i-Ready Math Composite Scaled Score*, MOY i-Rea...
dbl  (5): Student Number, Sped, MLE, Low Income, LAP
lgl  (1): Treatment

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect data variables. Data file appears to contain all requested elements - except for MLL Level (“Mulit-lingual learner level on a scale of 1 to 6 in SY25-26 based on beginning of year assessment). MLE, or multilingual education status, presents in place of ELL status. Low Income presents in place of Free/Reduced Price Lunch status. Sped, or Special Education, presents in place of IEP status.

colnames(roster)
 [1] "Student Number"                          
 [2] "Grade"                                   
 [3] "School"                                  
 [4] "Treatment"                               
 [5] "MOY i-Ready Math Composite Scaled Score*"
 [6] "MOY i-Ready Math Math Test Date*"        
 [7] "EOY i-Ready Math Composite Scaled Score*"
 [8] "EOY i-Ready Math Math Test Date*"        
 [9] "MOY i-Ready ELA Composite Scaled Score*" 
[10] "MOY i-Ready ELA Test Date*"              
[11] "EOY i-Ready ELA Composite  Scaled Score*"
[12] "EOY i-Ready ELA Test Date*"              
[13] "Gender Code"                             
[14] "Sped"                                    
[15] "MLE"                                     
[16] "Low Income"                              
[17] "Race/Ethnicity"                          
[18] "LAP"                                     

Change column headers for roster.

roster = roster %>%
  rename(
    student_id = "Student Number",
    grade = "Grade", 
    school = "School", 
    treat = "Treatment", 
    
    moy_math_comp = "MOY i-Ready Math Composite Scaled Score*", 
    moy_math_date = "MOY i-Ready Math Math Test Date*", 
    moy_ela_comp = "MOY i-Ready ELA Composite Scaled Score*", 
    moy_ela_date = "MOY i-Ready ELA Test Date*", 
    
    eoy_math_comp = "EOY i-Ready Math Composite Scaled Score*", 
    eoy_math_date = "EOY i-Ready Math Math Test Date*", 
    eoy_ela_comp = "EOY i-Ready ELA Composite  Scaled Score*", 
    eoy_ela_date = "EOY i-Ready ELA Test Date*", 
    
    gender = "Gender Code", 
    sped = "Sped", 
    mll = "MLE", #multilingual education/multilingual learners
    low_inc = "Low Income", 
    race_eth = "Race/Ethnicity", 
    low_ap = "LAP" #low academic performance
    
  )
colnames(roster)
 [1] "student_id"    "grade"         "school"        "treat"        
 [5] "moy_math_comp" "moy_math_date" "eoy_math_comp" "eoy_math_date"
 [9] "moy_ela_comp"  "moy_ela_date"  "eoy_ela_comp"  "eoy_ela_date" 
[13] "gender"        "sped"          "mll"           "low_inc"      
[17] "race_eth"      "low_ap"       

Inspect duplicates

Identify duplicate student records within roster file.

# frequency table of student_id occurrences
id_counts_roster = roster %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
  filter(n > 1)

# view results
print(dupes_roster)
# A tibble: 2 × 2
  student_id     n
       <dbl> <int>
1     380845     2
2     386424     2
#export
write.csv(dupes_roster, file = "duplicate_ids_roster_vt_kent.csv", row.names = FALSE)

Inspect duplicate student records in roster. Observation: for a given duplicate student record, it appears to be the same individual (e.g., same school, same grade, same scores).

dup_preview_roster = roster %>%
  group_by(student_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_roster)
# A tibble: 4 × 18
  student_id grade school        treat moy_math_comp moy_math_date eoy_math_comp
       <dbl> <chr> <chr>         <lgl> <chr>         <chr>         <chr>        
1     380845 08    Meridian Mid… NA    #N/A          #N/A          587          
2     386424 08    Meridian Mid… NA    #N/A          #N/A          510          
3     386424 08    Meridian Mid… NA    #N/A          #N/A          510          
4     380845 08    Meridian Mid… NA    #N/A          #N/A          587          
# ℹ 11 more variables: eoy_math_date <chr>, moy_ela_comp <chr>,
#   moy_ela_date <chr>, eoy_ela_comp <chr>, eoy_ela_date <chr>, gender <chr>,
#   sped <dbl>, mll <dbl>, low_inc <dbl>, race_eth <chr>, low_ap <dbl>

Decision: keep first occurrence of student ID based on current row order (N=636).

roster_unique = roster %>%
  distinct(student_id, .keep_all = TRUE)

print(roster_unique)
# A tibble: 636 × 18
   student_id grade school       treat moy_math_comp moy_math_date eoy_math_comp
        <dbl> <chr> <chr>        <lgl> <chr>         <chr>         <chr>        
 1     391033 07    Meridian Mi… NA    484           1/6/2026      490          
 2     412115 08    Meridian Mi… NA    384           1/7/2026      409          
 3     425285 06    Meridian Mi… NA    468           1/6/2026      #N/A         
 4     373185 07    Meridian Mi… NA    533           1/7/2026      549          
 5     394412 06    Meridian Mi… NA    571           1/7/2026      568          
 6     389860 07    Meridian Mi… NA    530           1/7/2026      549          
 7     389442 07    Meridian Mi… NA    467           1/6/2026      454          
 8     384402 07    Meridian Mi… NA    506           1/7/2026      496          
 9     382167 07    Meridian Mi… NA    481           1/7/2026      477          
10     417702 06    Meridian Mi… NA    505           1/6/2026      499          
# ℹ 626 more rows
# ℹ 11 more variables: eoy_math_date <chr>, moy_ela_comp <chr>,
#   moy_ela_date <chr>, eoy_ela_comp <chr>, eoy_ela_date <chr>, gender <chr>,
#   sped <dbl>, mll <dbl>, low_inc <dbl>, race_eth <chr>, low_ap <dbl>

Merge

Insert treatment status

Import roster from VT that details which students have accounts in their system

#import ID crosswalk from VT
vt_roster = read_csv("vt_roster_all_districts.csv")
Rows: 1496 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): District, School
dbl (2): District ID, Varsity Tutors ID

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# rename vars
vt_roster = vt_roster %>%
  rename(
    district_vt = "District",
    school_vt = "School",
    student_id = "District ID", 
    vt_acct_id = "Varsity Tutors ID"
  )

Add treatment status. Equals 1 if student id appears in roster from VT (N=88). All else equals 0.

merge_vt_kent_unclean = roster_unique %>%
  mutate(
    treat = ifelse(student_id %in% vt_roster$student_id, 1, 0)
  )

table(merge_vt_kent_unclean$treat)

  0   1 
548  88 

Inspect Merged Data

Missing Data Summary

Recode “#N/A” to NA, missing value, in R.

merge_vt_kent_unclean[merge_vt_kent_unclean == "#N/A"] = NA

sum(merge_vt_kent_unclean == "#N/A", na.rm = TRUE)
[1] 0

Percentage missing for each variable. Observations: All other demographic info is present for all students. Some missing data for test info.

missing_summary = merge_vt_kent_unclean %>%
  summarise(across(
    everything(),
    ~ mean(is.na(.)) * 100
  )) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "percent_missing"
  ) %>%
  arrange(desc(percent_missing))

print(missing_summary)
# A tibble: 18 × 2
   variable      percent_missing
   <chr>                   <dbl>
 1 eoy_math_comp            4.25
 2 eoy_math_date            4.25
 3 moy_ela_comp             4.25
 4 moy_ela_date             4.25
 5 moy_math_comp            3.77
 6 moy_math_date            3.77
 7 eoy_ela_comp             3.46
 8 eoy_ela_date             3.46
 9 student_id               0   
10 grade                    0   
11 school                   0   
12 treat                    0   
13 gender                   0   
14 sped                     0   
15 mll                      0   
16 low_inc                  0   
17 race_eth                 0   
18 low_ap                   0   

Percentage of student who have both MOY and EOY data overall

#indicator for having BOTH MOY and EOY scores
merge_vt_kent_test_info = merge_vt_kent_unclean %>%
  mutate(
    both_math = !is.na(moy_math_comp) & !is.na(eoy_math_comp),
    both_ela  = !is.na(moy_ela_comp)  & !is.na(eoy_ela_comp),
    all_tests = both_math & both_ela
  )

#pct across schools
overall_pct = merge_vt_kent_test_info %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2))

print(overall_pct)
# A tibble: 1 × 3
  total_students students_with_all_tests percent_with_all_tests
           <int>                   <int>                  <dbl>
1            636                     593                   93.2

Percentage of student who have both MOY and EOY data by school Note: only 1 school site from Kent SD, Meridian Middle.

by_school_pct = merge_vt_kent_test_info %>%
  group_by(school) %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2)) %>%
  arrange(desc(percent_with_all_tests))

print(by_school_pct)
# A tibble: 1 × 4
  school          total_students students_with_all_tests percent_with_all_tests
  <chr>                    <int>                   <int>                  <dbl>
1 Meridian Middle            636                     593                   93.2

Dummies for test info

both_math: dummy for students with complete math test info across MOY and EOY.

table(merge_vt_kent_test_info$both_math)

FALSE  TRUE 
   37   599 
#complete math: MOY and EOY present
merge_vt_kent_test_info$both_math = ifelse(merge_vt_kent_test_info$both_math, 1, 0)
table(merge_vt_kent_test_info$both_math) #check counts

  0   1 
 37 599 

both_ela: dummy for students with complete ELA test info across MOY and EOY..

table(merge_vt_kent_test_info$both_ela)

FALSE  TRUE 
   31   605 
#complete ELA: MOY and EOY present
merge_vt_kent_test_info$both_ela = ifelse(merge_vt_kent_test_info$both_ela, 1, 0)
table(merge_vt_kent_test_info$both_ela) #check counts

  0   1 
 31 605 

all_tests: dummy for students with both math and ELA test info complete across MOY and EOY.

table(merge_vt_kent_test_info$all_tests)

FALSE  TRUE 
   43   593 
#complete math and ELA: MOY and EOY present
merge_vt_kent_test_info$all_tests = ifelse(merge_vt_kent_test_info$all_tests, 1, 0)
table(merge_vt_kent_test_info$all_tests) #check counts

  0   1 
 43 593 

math_moy_missing_eoy_present: dummy for students who have EOY math test info but MOY test info is missing.

merge_vt_kent_test_info = merge_vt_kent_test_info %>%
  mutate(
    moy_math_present = !is.na(moy_math_comp) & !is.na(moy_math_date),
    eoy_math_present = !is.na(eoy_math_comp) & !is.na(eoy_math_date),
    
    math_moy_missing_eoy_present = case_when(
      !moy_math_present &  eoy_math_present ~ 1,   # moy missing, eoy present
       moy_math_present &  eoy_math_present ~ 0,   # both present
      !moy_math_present & !eoy_math_present ~ NA_real_  # both missing
    )
  )
table(merge_vt_kent_test_info$math_moy_missing_eoy_present)

  0   1 
599  10 

ela_moy_missing_eoy_present: dummy for students who have EOY ELA test info but MOY test info is missing.

merge_vt_kent_test_info = merge_vt_kent_test_info %>%
  mutate(
    moy_ela_present = !is.na(moy_ela_comp) & !is.na(moy_ela_date),
    eoy_ela_present = !is.na(eoy_ela_comp) & !is.na(eoy_ela_date),
    
    ela_moy_missing_eoy_present = case_when(
      !moy_ela_present &  eoy_ela_present ~ 1,   # moy missing, eoy present
       moy_ela_present &  eoy_ela_present ~ 0,   # both present
      !moy_ela_present & !eoy_ela_present ~ NA_real_  # both missing
    )
  )
table(merge_vt_kent_test_info$ela_moy_missing_eoy_present)

  0   1 
605   9 

###Test info summary

Proportion of math test info present (MOY present and EOY present, grouped by treatment status).

summary_math_tests_present = merge_vt_kent_test_info %>%
  mutate(
    moy_math_present = !is.na(moy_math_comp) & !is.na(moy_math_date),
    eoy_math_present = !is.na(eoy_math_comp) & !is.na(eoy_math_date)
  ) %>%
  group_by(treat) %>%
  summarise(
    n_students = n(),
    
    moy_count = sum(moy_math_present),
    moy_prop  = mean(moy_math_present),
    
    eoy_count = sum(eoy_math_present),
    eoy_prop  = mean(eoy_math_present),
    
    .groups = "drop"
  )

# View result
print(summary_math_tests_present)
# A tibble: 2 × 6
  treat n_students moy_count moy_prop eoy_count eoy_prop
  <dbl>      <int>     <int>    <dbl>     <int>    <dbl>
1     0        548       524    0.956       526    0.960
2     1         88        88    1            83    0.943

Proportion of ELA test info present (MOY present and EOY present, grouped by treatment status).

summary_ela_tests_present = merge_vt_kent_test_info %>%
  mutate(
    moy_ela_present = !is.na(moy_ela_comp) & !is.na(moy_ela_date),
    eoy_ela_present = !is.na(eoy_ela_comp) & !is.na(eoy_ela_date)
  ) %>%
  group_by(treat) %>%
  summarise(
    n_students = n(),
    
    moy_count = sum(moy_ela_present),
    moy_prop  = mean(moy_ela_present),
    
    eoy_count = sum(eoy_ela_present),
    eoy_prop  = mean(eoy_ela_present),
    
    .groups = "drop"
  )

# View result
print(summary_ela_tests_present)
# A tibble: 2 × 6
  treat n_students moy_count moy_prop eoy_count eoy_prop
  <dbl>      <int>     <int>    <dbl>     <int>    <dbl>
1     0        548       521    0.951       527    0.962
2     1         88        88    1            87    0.989

Data Summary

Generate school-level means for MOY ELA and Math assessment data, number of students, plus distribution across demographics.

#create function to streamline calculations
calc_pct = function(x) {
  prop.table(table(x)) * 100
}

#school-level means and counts for TEST data
merge_vt_kent_test_info = merge_vt_kent_test_info %>%
  mutate(across(c(moy_math_comp, #transform all scores to numeric value
                  moy_ela_comp, 
                  eoy_math_comp, 
                  eoy_ela_comp), as.numeric))

school_means = merge_vt_kent_test_info %>%
  group_by(school)%>%
  summarise(
    n_students = n(),
    mean_moy_math = mean(moy_math_comp, na.rm = TRUE),
    mean_moy_ela  = mean(moy_ela_comp, na.rm = TRUE), 
    mean_eoy_math = mean(eoy_math_comp, na.rm = TRUE),
    mean_eoy_ela  = mean(eoy_ela_comp, na.rm = TRUE), 
  )

# DEMOGRAPHICS

# race/ethnicity
race_pct = merge_vt_kent_test_info %>%
  group_by(school, race_eth) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = race_eth,
    values_from = pct,
    names_prefix = "race_"
  )

#gender
gender_pct = merge_vt_kent_test_info %>%
  group_by(school, gender) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = gender,
    values_from = pct,
    names_prefix = "gender_"
  )

#low income status
low_inc_pct = merge_vt_kent_test_info %>%
  group_by(school, low_inc) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = low_inc,
    values_from = pct,
    names_prefix = "low_inc_"
  )

#multilingual learner
mll_pct = merge_vt_kent_test_info %>%
  group_by(school, mll) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = mll,
    values_from = pct,
    names_prefix = "mll_"
  )

#at-risk students
low_ap_pct = merge_vt_kent_test_info %>%
  group_by(school, low_ap) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = low_ap,
    values_from = pct,
    names_prefix = "low_ap_"
  )

#sped students
sped_pct = merge_vt_kent_test_info %>%
  group_by(school, sped) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = sped,
    values_from = pct,
    names_prefix = "sped_"
  )

# COMBINE summaries
school_summary = school_means %>%
  left_join(race_pct,   by = "school") %>%
  left_join(gender_pct, by = "school") %>%
  left_join(low_inc_pct,   by = "school") %>%
  left_join(mll_pct,    by = "school") %>%
  left_join(low_ap_pct, by = "school")%>%
  left_join(sped_pct, by = "school")

school_summary = school_summary %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))

print(school_summary)
# A tibble: 1 × 23
  school        n_students mean_moy_math mean_moy_ela mean_eoy_math mean_eoy_ela
  <chr>              <dbl>         <dbl>        <dbl>         <dbl>        <dbl>
1 Meridian Mid…        636          487.         583.          494.         588.
# ℹ 17 more variables: race_Asian <dbl>,
#   `race_Black or African American` <dbl>, `race_Hispanic/Latino` <dbl>,
#   `race_Native Hawaiian or Other Pacific Islander` <dbl>,
#   `race_Two or More Races` <dbl>, race_White <dbl>, gender_F <dbl>,
#   gender_M <dbl>, gender_NB <dbl>, low_inc_0 <dbl>, low_inc_1 <dbl>,
#   mll_0 <dbl>, mll_1 <dbl>, low_ap_0 <dbl>, low_ap_1 <dbl>, sped_0 <dbl>,
#   sped_1 <dbl>
write.csv(school_summary, "test_summary_by_school_vt_kent.csv", row.names = FALSE)

Reformat Data

Create dummy variables

Create numeric binary for gender. Observation: 1 NB student; NB student will present as NA under the male dummy.

table(merge_vt_kent_test_info$gender)

  F   M  NB 
285 350   1 
merge_vt_kent_test_info = merge_vt_kent_test_info %>%
  mutate(
    male = case_when(
      gender == "M" ~ 1,
      gender == "F" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(merge_vt_kent_test_info$male)

  0   1 
285 350 

Create numeric binaries for each race.

table(merge_vt_kent_test_info$race_eth)

                                    Asian 
                                      212 
                Black or African American 
                                      101 
                          Hispanic/Latino 
                                      149 
Native Hawaiian or Other Pacific Islander 
                                       14 
                        Two or More Races 
                                       50 
                                    White 
                                      110 
merge_vt_kent_test_info = merge_vt_kent_test_info %>%
  mutate(
    race_A     = ifelse(race_eth == "Asian", 1, ifelse(is.na(race_eth), NA, 0)),
    race_B     = ifelse(race_eth == "Black or African American", 1, ifelse(is.na(race_eth), NA, 0)),
    race_H     = ifelse(race_eth == "Hispanic/Latino", 1, ifelse(is.na(race_eth), NA, 0)),
    race_Multi     = ifelse(race_eth == "Two or More Races", 1, ifelse(is.na(race_eth), NA, 0)),
    race_P     = ifelse(race_eth == "Native Hawaiian or Other Pacific Islander", 1, ifelse(is.na(race_eth), NA, 0)),
    race_W     = ifelse(race_eth == "White", 1, ifelse(is.na(race_eth), NA, 0))
  )
table(merge_vt_kent_test_info$race_B) #quick check coding on race_B

  0   1 
535 101 

Merge Usage Data

Clean Kent Usage Data

Import usage data from VT.

#import kent usage data from VT
vt_usage_kent = read_csv("vt_usage_kent.csv")
Rows: 1792 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sch_name*, Sch_BOY_date*, Sch_EOY_date
dbl (7): Stu_ID_VT*, Num_days_active*, Num_sessions_AI, Num_sessions_human, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(vt_usage_kent)
 [1] "Sch_name*"                      "Sch_BOY_date*"                 
 [3] "Sch_EOY_date"                   "Stu_ID_VT*"                    
 [5] "Num_days_active*"               "Num_sessions_AI"               
 [7] "Num_sessions_human"             "Num_classesviewed"             
 [9] "Num_essaysreviewed"             "Num_AI_practiceproblemsessions"

Rename columns in usage file. Prepare for merging.

# rename vars
vt_usage_kent = vt_usage_kent %>%
  rename(
    school_vt = "Sch_name*",
    school_BOY_date = "Sch_BOY_date*",
    school_EOY_date = "Sch_EOY_date", 
    vt_acct_id = "Stu_ID_VT*", 
    num_days_active = "Num_days_active*",
    num_sessions_AI = "Num_sessions_AI",
    num_sessions_human = "Num_sessions_human",
    num_classesviewed = "Num_classesviewed",
    num_essaysreviewed = "Num_essaysreviewed",
    num_AI_practiceproblemsessions = "Num_AI_practiceproblemsessions"
  )
print(vt_usage_kent)
# A tibble: 1,792 × 10
   school_vt       school_BOY_date school_EOY_date vt_acct_id num_days_active
   <chr>           <chr>           <chr>                <dbl>           <dbl>
 1 Meridian Middle 2/2/26          4/14/26           13151928              18
 2 Meridian Middle 2/2/26          4/14/26           13151799              27
 3 Meridian Middle 2/2/26          4/14/26           13151928              18
 4 Meridian Middle 2/2/26          4/14/26           13151849              21
 5 Meridian Middle 2/2/26          4/14/26           13151928              18
 6 Meridian Middle 2/2/26          4/14/26           13152086              24
 7 Meridian Middle 2/2/26          4/14/26           13151694              26
 8 Meridian Middle 2/2/26          4/14/26           13152119              23
 9 Meridian Middle 2/2/26          4/14/26           13151685              28
10 Meridian Middle 2/2/26          4/14/26           13152119              23
# ℹ 1,782 more rows
# ℹ 5 more variables: num_sessions_AI <dbl>, num_sessions_human <dbl>,
#   num_classesviewed <dbl>, num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>

Identify duplicate student records within usage data file (N=88).

# frequency table of vt_acct_id occurrences
id_counts_vt_usage_kent = vt_usage_kent %>%
  group_by(vt_acct_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only acct that appear > 1
dupes_vt_usage_kent = id_counts_vt_usage_kent %>%
  filter(n > 1)

# view results
print(dupes_vt_usage_kent)
# A tibble: 88 × 2
   vt_acct_id     n
        <dbl> <int>
 1   13151682    37
 2   13151686    30
 3   13151692    30
 4   13151845    30
 5   13151945    30
 6   13152099    29
 7   13151685    28
 8   13151808    28
 9   13151698    27
10   13151799    27
# ℹ 78 more rows
#export
write.csv(dupes_vt_usage_kent, file = "duplicate_acct_ids_usage_vt_kent.csv", row.names = FALSE)

Inspect duplicate student records in usage file (based on VT Account ID - not student ID). Observation: for a given duplicate student record, school name,num_days_active, BOY date and EOY date will match but usage data differs across the duplicate records.

dup_preview_usage_kent = vt_usage_kent %>%
  group_by(vt_acct_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_usage_kent)
# A tibble: 1,792 × 10
   school_vt       school_BOY_date school_EOY_date vt_acct_id num_days_active
   <chr>           <chr>           <chr>                <dbl>           <dbl>
 1 Meridian Middle 2/2/26          4/14/26           13151928              18
 2 Meridian Middle 2/2/26          4/14/26           13151799              27
 3 Meridian Middle 2/2/26          4/14/26           13151928              18
 4 Meridian Middle 2/2/26          4/14/26           13151849              21
 5 Meridian Middle 2/2/26          4/14/26           13151928              18
 6 Meridian Middle 2/2/26          4/14/26           13152086              24
 7 Meridian Middle 2/2/26          4/14/26           13151694              26
 8 Meridian Middle 2/2/26          4/14/26           13152119              23
 9 Meridian Middle 2/2/26          4/14/26           13151685              28
10 Meridian Middle 2/2/26          4/14/26           13152119              23
# ℹ 1,782 more rows
# ℹ 5 more variables: num_sessions_AI <dbl>, num_sessions_human <dbl>,
#   num_classesviewed <dbl>, num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>

Decision: Collapse rows for each student ID. Per VT, these instances are rows that only capture one of the students multiple days active and need to be compressed.

Basic check of randomly selected student, 13151681. Inspect sum of Num_sessions_AI before collapsing.

sum(vt_usage_kent$num_sessions_AI[vt_usage_kent$vt_acct_id == 13151681], na.rm = TRUE)
[1] 22

Collapse rows.

vt_usage_kent_collapsed = vt_usage_kent %>%
  group_by(vt_acct_id) %>%
  summarise(
    num_sessions_AI = sum(num_sessions_AI, na.rm = TRUE),
    num_sessions_human = sum(num_sessions_human, na.rm = TRUE),
    num_classesviewed = sum(num_classesviewed, na.rm = TRUE),
    num_essaysreviewed = sum(num_essaysreviewed, na.rm = TRUE),
    num_AI_practiceproblemsessions = sum(num_AI_practiceproblemsessions, na.rm = TRUE),
    
    # keep other columns (takes first non-NA value)
    across(-c(num_sessions_AI,
              num_sessions_human,
              num_classesviewed,
              num_essaysreviewed,
              num_AI_practiceproblemsessions),
           ~ first(.x[!is.na(.x)])),
    
    .groups = "drop"
  )

print(vt_usage_kent_collapsed)
# A tibble: 88 × 10
   vt_acct_id num_sessions_AI num_sessions_human num_classesviewed
        <dbl>           <dbl>              <dbl>             <dbl>
 1   13151681              22                  3                 9
 2   13151682              62                  0                29
 3   13151683              33                  0                35
 4   13151684              20                  0                 0
 5   13151685              38                  0                10
 6   13151686              60                  0                57
 7   13151688              31                  0                 0
 8   13151690              24                  0                 0
 9   13151691              23                  2                 6
10   13151692              71                  1                 6
# ℹ 78 more rows
# ℹ 6 more variables: num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>, school_vt <chr>,
#   school_BOY_date <chr>, school_EOY_date <chr>, num_days_active <dbl>

Check number of IDs that appear more than once.

sum(table(vt_usage_kent_collapsed$vt_acct_id) > 1)
[1] 0

Basic check of randomly selected student, 13151681. Inspect sum of Num_sessions_AI after collapsing.

sum(vt_usage_kent_collapsed$num_sessions_AI[vt_usage_kent_collapsed$vt_acct_id == 13151681], na.rm = TRUE)
[1] 22

Append student ID from crosswalk file to usage file.

vt_usage_kent_collapsed = vt_usage_kent_collapsed %>%
  left_join(vt_roster %>% select(vt_acct_id, student_id),
            by = "vt_acct_id")
print(vt_usage_kent_collapsed)
# A tibble: 88 × 11
   vt_acct_id num_sessions_AI num_sessions_human num_classesviewed
        <dbl>           <dbl>              <dbl>             <dbl>
 1   13151681              22                  3                 9
 2   13151682              62                  0                29
 3   13151683              33                  0                35
 4   13151684              20                  0                 0
 5   13151685              38                  0                10
 6   13151686              60                  0                57
 7   13151688              31                  0                 0
 8   13151690              24                  0                 0
 9   13151691              23                  2                 6
10   13151692              71                  1                 6
# ℹ 78 more rows
# ℹ 7 more variables: num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>, school_vt <chr>,
#   school_BOY_date <chr>, school_EOY_date <chr>, num_days_active <dbl>,
#   student_id <dbl>

Merge usage data to student Kent data with test indicator information.

merge_vt_kent_usage_test_info = merge_vt_kent_test_info %>%
  left_join(vt_usage_kent_collapsed, by = "student_id")

print(merge_vt_kent_usage_test_info)
# A tibble: 636 × 44
   student_id grade school       treat moy_math_comp moy_math_date eoy_math_comp
        <dbl> <chr> <chr>        <dbl>         <dbl> <chr>                 <dbl>
 1     391033 07    Meridian Mi…     0           484 1/6/2026                490
 2     412115 08    Meridian Mi…     0           384 1/7/2026                409
 3     425285 06    Meridian Mi…     1           468 1/6/2026                 NA
 4     373185 07    Meridian Mi…     0           533 1/7/2026                549
 5     394412 06    Meridian Mi…     0           571 1/7/2026                568
 6     389860 07    Meridian Mi…     0           530 1/7/2026                549
 7     389442 07    Meridian Mi…     0           467 1/6/2026                454
 8     384402 07    Meridian Mi…     0           506 1/7/2026                496
 9     382167 07    Meridian Mi…     0           481 1/7/2026                477
10     417702 06    Meridian Mi…     0           505 1/6/2026                499
# ℹ 626 more rows
# ℹ 37 more variables: eoy_math_date <chr>, moy_ela_comp <dbl>,
#   moy_ela_date <chr>, eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender <chr>,
#   sped <dbl>, mll <dbl>, low_inc <dbl>, race_eth <chr>, low_ap <dbl>,
#   both_math <dbl>, both_ela <dbl>, all_tests <dbl>, moy_math_present <lgl>,
#   eoy_math_present <lgl>, math_moy_missing_eoy_present <dbl>,
#   moy_ela_present <lgl>, eoy_ela_present <lgl>, …

Check for unmatched students.

merge_vt_kent_usage_test_info %>% filter(is.na(student_id)) # no NAs; no unmatched students. 
# A tibble: 0 × 44
# ℹ 44 variables: student_id <dbl>, grade <chr>, school <chr>, treat <dbl>,
#   moy_math_comp <dbl>, moy_math_date <chr>, eoy_math_comp <dbl>,
#   eoy_math_date <chr>, moy_ela_comp <dbl>, moy_ela_date <chr>,
#   eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender <chr>, sped <dbl>,
#   mll <dbl>, low_inc <dbl>, race_eth <chr>, low_ap <dbl>, both_math <dbl>,
#   both_ela <dbl>, all_tests <dbl>, moy_math_present <lgl>,
#   eoy_math_present <lgl>, math_moy_missing_eoy_present <dbl>, …

Count of students with any usage logged (defined as num_days_active => 1) by treatment

merge_vt_kent_usage_test_info$num_days_active = as.numeric(merge_vt_kent_usage_test_info$num_days_active)

#create new any_usage for new tabulation                                                           
summary_usage_by_treat = merge_vt_kent_usage_test_info %>%
  mutate(any_usage = ifelse(num_days_active >= 1, 1, 0)) 
  #for treat=0 cases, any_usage will be NA

#for many treat=0 cases, recode any_usage from NA to 0. 
summary_usage_by_treat$any_usage[is.na(summary_usage_by_treat$any_usage)] <- 0

summary_usage_by_treat = summary_usage_by_treat %>%
  group_by(treat) %>%
  summarise(
    students_with_usage = sum(any_usage == 1, na.rm = TRUE),
    students_no_usage = sum(any_usage == 0, na.rm = TRUE),
    .groups = "drop"
  )

print(summary_usage_by_treat)
# A tibble: 2 × 3
  treat students_with_usage students_no_usage
  <dbl>               <int>             <int>
1     0                   0               548
2     1                  88                 0

Export files for e2i Coach

Save e2i file with all matched students from the merge, regardless of missing test info.

#select columns ready for e2i
vt_kent_e2i_usage_test_info = merge_vt_kent_usage_test_info %>%
  dplyr::select(student_id, 
                grade, 
                school, 
                moy_math_comp, moy_math_date, 
                moy_ela_comp,moy_ela_date, 
                eoy_math_comp, eoy_math_date,
                eoy_ela_comp, eoy_ela_date,
                male, sped, mll, low_inc,low_ap,
                treat,
                race_A, race_B, race_H, race_Multi, race_P, race_W,
                both_math, both_ela, all_tests, 
                math_moy_missing_eoy_present, ela_moy_missing_eoy_present, 
                
                vt_acct_id,
                num_days_active,num_sessions_AI, num_sessions_human,
                num_classesviewed, num_essaysreviewed, num_AI_practiceproblemsessions, 
                school_BOY_date, school_EOY_date, 
                )

#export e2i with test indicators, all matched students from the merge, regardless of missing test info (all_tests == 1 or 0)
write.csv(vt_kent_e2i_usage_test_info, file = "vt_kent_e2i_all_students.csv", row.names = FALSE)

Save e2i file with only containing matched students from the merge with complete test info.

table(vt_kent_e2i_usage_test_info$all_tests)

  0   1 
 43 593 
#subset: keep only students with complete test info (all_tests == 1)
vt_kent_e2i_all_tests = vt_kent_e2i_usage_test_info %>% filter(all_tests == 1)
print(vt_kent_e2i_all_tests) # N=593
# A tibble: 593 × 37
   student_id grade school moy_math_comp moy_math_date moy_ela_comp moy_ela_date
        <dbl> <chr> <chr>          <dbl> <chr>                <dbl> <chr>       
 1     391033 07    Merid…           484 1/6/2026               616 1/14/2026   
 2     412115 08    Merid…           384 1/7/2026               474 1/13/2026   
 3     373185 07    Merid…           533 1/7/2026               666 1/13/2026   
 4     394412 06    Merid…           571 1/7/2026               641 1/12/2026   
 5     389860 07    Merid…           530 1/7/2026               620 1/13/2026   
 6     389442 07    Merid…           467 1/6/2026               529 1/13/2026   
 7     384402 07    Merid…           506 1/7/2026               593 1/13/2026   
 8     382167 07    Merid…           481 1/7/2026               576 1/14/2026   
 9     417702 06    Merid…           505 1/6/2026               584 1/12/2026   
10     415707 08    Merid…           442 1/6/2026               555 1/12/2026   
# ℹ 583 more rows
# ℹ 30 more variables: eoy_math_comp <dbl>, eoy_math_date <chr>,
#   eoy_ela_comp <dbl>, eoy_ela_date <chr>, male <dbl>, sped <dbl>, mll <dbl>,
#   low_inc <dbl>, low_ap <dbl>, treat <dbl>, race_A <dbl>, race_B <dbl>,
#   race_H <dbl>, race_Multi <dbl>, race_P <dbl>, race_W <dbl>,
#   both_math <dbl>, both_ela <dbl>, all_tests <dbl>,
#   math_moy_missing_eoy_present <dbl>, ela_moy_missing_eoy_present <dbl>, …

Count of treatment students among the subset with complete test info.

table(vt_kent_e2i_all_tests$treat)

  0   1 
510  83 
#export e2i only containing matched students from the merge with complete test info (all_tests == 1)
write.csv(vt_kent_e2i_all_tests, file = "vt_kent_e2i_complete_tests.csv", row.names = FALSE)