Data Processing: Varsity Tutors, William Penn SD

Author

Alexis Davila

Clean Data

Import

Packages selected for inspecting and cleaning Varsity Tutors, district data from William Penn School District.

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import roster.

roster <- read_csv("roster_vt_penn.csv")

New names:
Rows: 1279 Columns: 41
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(10): School Name*, MOY MAP Math Math Test Date*, EOY MAP Math Math Test... dbl
(7): Student ID*, Student Grade Level*, Treatment*, MOY MAP Math Compos... lgl
(24): ...18, ...19, ...20, ...21, ...22, ...23, ...24, ...25, ...26, ......
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
• `` -> `...24`
• `` -> `...25`
• `` -> `...26`
• `` -> `...27`
• `` -> `...28`
• `` -> `...29`
• `` -> `...30`
• `` -> `...31`
• `` -> `...32`
• `` -> `...33`
• `` -> `...34`
• `` -> `...35`
• `` -> `...36`
• `` -> `...37`
• `` -> `...38`
• `` -> `...39`
• `` -> `...40`
• `` -> `...41`

Inspect data variables. Data file appears to contain all requested elements - except for Academically at-risk.

colnames(roster)

 [1] "Student ID*"                         
 [2] "Student Grade Level*"                
 [3] "School Name*"                        
 [4] "Treatment*"                          
 [5] "MOY MAP Math Composite Scaled Score*"
 [6] "MOY MAP Math Math Test Date*"        
 [7] "EOY MAP Math Composite Scaled Score*"
 [8] "EOY MAP Math Math Test Date*"        
 [9] "MOY MAP ELA Composite Scaled Score*" 
[10] "MOY MAP ELA Test Date*"              
[11] "EOY MAP ELA Composite  Scaled Score*"
[12] "EOY MAP ELA Test Date*"              
[13] "Gender"                              
[14] "IEP"                                 
[15] "ELL"                                 
[16] "FRPL"                                
[17] "Race/Ethnicity"                      
[18] "...18"                               
[19] "...19"                               
[20] "...20"                               
[21] "...21"                               
[22] "...22"                               
[23] "...23"                               
[24] "...24"                               
[25] "...25"                               
[26] "...26"                               
[27] "...27"                               
[28] "...28"                               
[29] "...29"                               
[30] "...30"                               
[31] "...31"                               
[32] "...32"                               
[33] "...33"                               
[34] "...34"                               
[35] "...35"                               
[36] "...36"                               
[37] "...37"                               
[38] "...38"                               
[39] "...39"                               
[40] "...40"                               
[41] "...41"

Change column headers for roster.

roster = roster [, -c(18:41)] #drop empty columns
 
roster = roster %>%
  rename(
    student_id = "Student ID*",
    grade = "Student Grade Level*", 
    school = "School Name*", 
    treat = "Treatment*", 
    
    moy_math_comp = "MOY MAP Math Composite Scaled Score*", 
    moy_math_date = "MOY MAP Math Math Test Date*", 
    moy_ela_comp = "MOY MAP ELA Composite Scaled Score*", 
    moy_ela_date = "MOY MAP ELA Test Date*", 
    
    eoy_math_comp = "EOY MAP Math Composite Scaled Score*", 
    eoy_math_date = "EOY MAP Math Math Test Date*", 
    eoy_ela_comp = "EOY MAP ELA Composite  Scaled Score*", 
    eoy_ela_date = "EOY MAP ELA Test Date*", 
    
    gender = "Gender", 
    ell = "ELL", 
    frpl = "FRPL", 
    race_eth = "Race/Ethnicity", 
    iep = "IEP" 
    
  )
colnames(roster)

 [1] "student_id"    "grade"         "school"        "treat"        
 [5] "moy_math_comp" "moy_math_date" "eoy_math_comp" "eoy_math_date"
 [9] "moy_ela_comp"  "moy_ela_date"  "eoy_ela_comp"  "eoy_ela_date" 
[13] "gender"        "iep"           "ell"           "frpl"         
[17] "race_eth"

Inspect duplicates

Identify duplicate student records within roster file. No duplicates to report.

# frequency table of student_id occurrences
id_counts_roster = roster %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
  filter(n > 1)

# view results
print(dupes_roster)

# A tibble: 0 × 2
# ℹ 2 variables: student_id <dbl>, n <int>

#export - NA: no duplicates to report
#write.csv(dupes_roster, file = "duplicate_ids_roster_vt_penn.csv", row.names = FALSE)

Merge

Insert treatment status

Import roster from VT that details which students have accounts in their system.

#import ID crosswalk from VT
vt_roster = read_csv("vt_roster_all_districts.csv")

Rows: 1496 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): District, School
dbl (2): District ID, Varsity Tutors ID

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# rename vars
vt_roster = vt_roster %>%
  rename(
    district_vt = "District",
    school_vt = "School",
    student_id = "District ID", 
    vt_acct_id = "Varsity Tutors ID"
  )

Add treatment status. Equals 1 if student id appears in roster from VT (N=445). All else equals 0.

merge_vt_penn_unclean = roster %>%
  mutate(
    treat = ifelse(student_id %in% vt_roster$student_id, 1, 0)
  )

table(merge_vt_penn_unclean$treat)


  0   1 
834 445

Inspect Merged Data

Missing Data Summary

Percentage missing for each variable. Observations: All other demographic info is present for all students. Some missing data for test info.

missing_summary = merge_vt_penn_unclean %>%
  summarise(across(
    everything(),
    ~ mean(is.na(.)) * 100
  )) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "percent_missing"
  ) %>%
  arrange(desc(percent_missing))

print(missing_summary)

# A tibble: 17 × 2
   variable      percent_missing
   <chr>                   <dbl>
 1 moy_ela_comp             5.39
 2 moy_ela_date             5.39
 3 moy_math_comp            5.08
 4 moy_math_date            5.08
 5 eoy_ela_comp             3.60
 6 eoy_ela_date             3.60
 7 eoy_math_comp            3.13
 8 eoy_math_date            3.13
 9 student_id               0   
10 grade                    0   
11 school                   0   
12 treat                    0   
13 gender                   0   
14 iep                      0   
15 ell                      0   
16 frpl                     0   
17 race_eth                 0

Percentage of student who have both MOY and EOY data overall

#indicator for having BOTH MOY and EOY scores
merge_vt_penn_test_info = merge_vt_penn_unclean %>%
  mutate(
    both_math = !is.na(moy_math_comp) & !is.na(eoy_math_comp),
    both_ela  = !is.na(moy_ela_comp)  & !is.na(eoy_ela_comp),
    all_tests = both_math & both_ela
  )

#pct across schools
overall_pct = merge_vt_penn_test_info %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2))

print(overall_pct)

# A tibble: 1 × 3
  total_students students_with_all_tests percent_with_all_tests
           <int>                   <int>                  <dbl>
1           1279                    1194                   93.4

Percentage of student who have both MOY and EOY data by school

by_school_pct = merge_vt_penn_test_info %>%
  group_by(school) %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2)) %>%
  arrange(desc(percent_with_all_tests))

print(by_school_pct)

# A tibble: 8 × 4
  school            total_students students_with_all_te…¹ percent_with_all_tests
  <chr>                      <int>                  <int>                  <dbl>
1 Colwyn Elementar…             72                     71                   98.6
2 Bell Avenue Elem…            128                    123                   96.1
3 East Lansdowne E…            155                    148                   95.5
4 Park Lane Elemen…            170                    161                   94.7
5 Walnut Street El…            147                    137                   93.2
6 W.B. Evans Eleme…            179                    166                   92.7
7 Ardmore Avenue E…            311                    284                   91.3
8 Aldan Elementary…            117                    104                   88.9
# ℹ abbreviated name: ¹students_with_all_tests

Dummies for test info

both_math: dummy for students with complete math test info across MOY and EOY.

table(merge_vt_penn_test_info$both_math)


FALSE  TRUE 
   76  1203

#complete math: MOY and EOY present
merge_vt_penn_test_info$both_math = ifelse(merge_vt_penn_test_info$both_math, 1, 0)
table(merge_vt_penn_test_info$both_math) #check counts


   0    1 
  76 1203

both_ela: dummy for students with complete ELA test info across MOY and EOY..

table(merge_vt_penn_test_info$both_ela)


FALSE  TRUE 
   83  1196

#complete ELA: MOY and EOY present
merge_vt_penn_test_info$both_ela = ifelse(merge_vt_penn_test_info$both_ela, 1, 0)
table(merge_vt_penn_test_info$both_ela) #check counts


   0    1 
  83 1196

all_tests: dummy for students with both math and ELA test info complete across MOY and EOY.

table(merge_vt_penn_test_info$all_tests)


FALSE  TRUE 
   85  1194

#complete math and ELA: MOY and EOY present
merge_vt_penn_test_info$all_tests = ifelse(merge_vt_penn_test_info$all_tests, 1, 0)
table(merge_vt_penn_test_info$all_tests) #check counts


   0    1 
  85 1194

math_moy_missing_eoy_present: dummy for students who have EOY math test info but MOY test info is missing.

merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(
    moy_math_present = !is.na(moy_math_comp) & !is.na(moy_math_date),
    eoy_math_present = !is.na(eoy_math_comp) & !is.na(eoy_math_date),
    
    math_moy_missing_eoy_present = case_when(
      !moy_math_present &  eoy_math_present ~ 1,   # moy missing, eoy present
       moy_math_present &  eoy_math_present ~ 0,   # both present
      !moy_math_present & !eoy_math_present ~ NA_real_  # both missing
    )
  )
table(merge_vt_penn_test_info$math_moy_missing_eoy_present)


   0    1 
1203   36

ela_moy_missing_eoy_present: dummy for students who have EOY ELA test info but MOY test info is missing.

merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(
    moy_ela_present = !is.na(moy_ela_comp) & !is.na(moy_ela_date),
    eoy_ela_present = !is.na(eoy_ela_comp) & !is.na(eoy_ela_date),
    
    ela_moy_missing_eoy_present = case_when(
      !moy_ela_present &  eoy_ela_present ~ 1,   # moy missing, eoy present
       moy_ela_present &  eoy_ela_present ~ 0,   # both present
      !moy_ela_present & !eoy_ela_present ~ NA_real_  # both missing
    )
  )
table(merge_vt_penn_test_info$ela_moy_missing_eoy_present)


   0    1 
1196   37

###Test info summary

Proportion of math test info present (MOY present and EOY present, grouped by treatment status).

summary_math_tests_present = merge_vt_penn_test_info %>%
  mutate(
    moy_math_present = !is.na(moy_math_comp) & !is.na(moy_math_date),
    eoy_math_present = !is.na(eoy_math_comp) & !is.na(eoy_math_date)
  ) %>%
  group_by(treat) %>%
  summarise(
    n_students = n(),
    
    moy_count = sum(moy_math_present),
    moy_prop  = mean(moy_math_present),
    
    eoy_count = sum(eoy_math_present),
    eoy_prop  = mean(eoy_math_present),
    
    .groups = "drop"
  )

# View result
print(summary_math_tests_present)

# A tibble: 2 × 6
  treat n_students moy_count moy_prop eoy_count eoy_prop
  <dbl>      <int>     <int>    <dbl>     <int>    <dbl>
1     0        834       780    0.935       802    0.962
2     1        445       434    0.975       437    0.982

Proportion of ELA test info present (MOY present and EOY present, grouped by treatment status).

summary_ela_tests_present = merge_vt_penn_test_info %>%
  mutate(
    moy_ela_present = !is.na(moy_ela_comp) & !is.na(moy_ela_date),
    eoy_ela_present = !is.na(eoy_ela_comp) & !is.na(eoy_ela_date)
  ) %>%
  group_by(treat) %>%
  summarise(
    n_students = n(),
    
    moy_count = sum(moy_ela_present),
    moy_prop  = mean(moy_ela_present),
    
    eoy_count = sum(eoy_ela_present),
    eoy_prop  = mean(eoy_ela_present),
    
    .groups = "drop"
  )

# View result
print(summary_ela_tests_present)

# A tibble: 2 × 6
  treat n_students moy_count moy_prop eoy_count eoy_prop
  <dbl>      <int>     <int>    <dbl>     <int>    <dbl>
1     0        834       777    0.932       796    0.954
2     1        445       433    0.973       437    0.982

Data Summary

Generate school-level means for MOY ELA and Math assessment data, number of students, plus distribution across demographics.

#create function to streamline calculations
calc_pct = function(x) {
  prop.table(table(x)) * 100
}

#school-level means and counts for TEST data
merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(across(c(moy_math_comp, #transform all scores to numeric value
                  moy_ela_comp, 
                  eoy_math_comp, 
                  eoy_ela_comp), as.numeric))

school_means = merge_vt_penn_test_info %>%
  group_by(school)%>%
  summarise(
    n_students = n(),
    mean_moy_math = mean(moy_math_comp, na.rm = TRUE),
    mean_moy_ela  = mean(moy_ela_comp, na.rm = TRUE), 
    mean_eoy_math = mean(eoy_math_comp, na.rm = TRUE),
    mean_eoy_ela  = mean(eoy_ela_comp, na.rm = TRUE), 
  )

# DEMOGRAPHICS

# race/ethnicity
race_pct = merge_vt_penn_test_info %>%
  group_by(school, race_eth) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = race_eth,
    values_from = pct,
    names_prefix = "race_"
  )

#gender
gender_pct = merge_vt_penn_test_info %>%
  group_by(school, gender) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = gender,
    values_from = pct,
    names_prefix = "gender_"
  )

#frpl
frpl_pct = merge_vt_penn_test_info %>%
  group_by(school, frpl) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = frpl,
    values_from = pct,
    names_prefix = "frpl_"
  )

#ell
ell_pct = merge_vt_penn_test_info %>%
  group_by(school, ell) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = ell,
    values_from = pct,
    names_prefix = "ell_"
  )

#iep 
iep_pct = merge_vt_penn_test_info %>%
  group_by(school, iep) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = iep,
    values_from = pct,
    names_prefix = "iep_"
  )

# COMBINE summaries
school_summary = school_means %>%
  left_join(race_pct,   by = "school") %>%
  left_join(gender_pct, by = "school") %>%
  left_join(frpl_pct,   by = "school") %>%
  left_join(ell_pct,    by = "school") %>%
  left_join(iep_pct, by = "school")

school_summary = school_summary %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))

print(school_summary)

# A tibble: 8 × 21
  school        n_students mean_moy_math mean_moy_ela mean_eoy_math mean_eoy_ela
  <chr>              <dbl>         <dbl>        <dbl>         <dbl>        <dbl>
1 Aldan Elemen…        117          201.         198.          204.         198.
2 Ardmore Aven…        311          202.         194.          208.         196.
3 Bell Avenue …        128          204.         194.          211.         194.
4 Colwyn Eleme…         72          203.         199.          211.         202.
5 East Lansdow…        155          206.         202.          214.         205.
6 Park Lane El…        170          202.         195.          207.         193.
7 W.B. Evans E…        179          205.         199.          213.         201.
8 Walnut Stree…        147          199.         194.          205          195.
# ℹ 15 more variables: `race_AMERICAN INDIAN/ALASKAN` <dbl>, race_ASIAN <dbl>,
#   `race_BLACK/AFRICAN AMERICAN` <dbl>, race_HISPANIC <dbl>,
#   `race_NATIVE HAWAIIAN` <dbl>, `race_WHITE/CAUCASIAN` <dbl>,
#   `race_MULTI RACIAL - NON HISPANIC` <dbl>, gender_F <dbl>, gender_M <dbl>,
#   frpl_0 <dbl>, frpl_Y <dbl>, ell_N <dbl>, ell_Y <dbl>, iep_N <dbl>,
#   iep_Y <dbl>

write.csv(school_summary, "test_summary_by_school_vt_penn.csv", row.names = FALSE)

Reformat Data

Create dummy variables

Create numeric binary for gender. Observation: binary variable already present, prime for male dummy coding.

table(merge_vt_penn_test_info$gender)


  F   M 
667 612

merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(
    male = case_when(
      gender == "M" ~ 1,
      gender == "F" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(merge_vt_penn_test_info$male)


  0   1 
667 612

Create numeric binary for IEP.

table(merge_vt_penn_test_info$iep)


   N    Y 
1009  270

merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(
    iep = case_when(
      iep == "Y" ~ 1,
      iep == "N" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(merge_vt_penn_test_info$iep)


   0    1 
1009  270

Create numeric binary for ELL.

table(merge_vt_penn_test_info$ell)


   N    Y 
1175  104

merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(
    ell = case_when(
      ell == "Y" ~ 1,
      ell == "N" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(merge_vt_penn_test_info$ell)


   0    1 
1175  104

Create numeric binary for FRPL.

table(merge_vt_penn_test_info$frpl)


  0   Y 
353 926

merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(
    frpl = case_when(
      frpl == "Y" ~ 1,
      frpl == "0" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(merge_vt_penn_test_info$frpl)


  0   1 
353 926

Create numeric binaries for each race.

table(merge_vt_penn_test_info$race_eth)


    AMERICAN INDIAN/ALASKAN                       ASIAN 
                         22                          31 
     BLACK/AFRICAN AMERICAN                    HISPANIC 
                       1101                          43 
MULTI RACIAL - NON HISPANIC             NATIVE HAWAIIAN 
                          1                           8 
            WHITE/CAUCASIAN 
                         73

merge_vt_penn_test_info = merge_vt_penn_test_info %>%
  mutate(
    race_A     = ifelse(race_eth == "ASIAN", 1, ifelse(is.na(race_eth), NA, 0)),
    race_B     = ifelse(race_eth == "BLACK/AFRICAN AMERICAN", 1, ifelse(is.na(race_eth), NA, 0)),
    race_H     = ifelse(race_eth == "HISPANIC", 1, ifelse(is.na(race_eth), NA, 0)),
    race_Multi     = ifelse(race_eth == "MULTI RACIAL - NON HISPANIC", 1, ifelse(is.na(race_eth), NA, 0)),
    race_NH     = ifelse(race_eth == "NATIVE HAWAIIAN", 1, ifelse(is.na(race_eth), NA, 0)),
    race_AI     = ifelse(race_eth == "AMERICAN INDIAN/ALASKAN", 1, ifelse(is.na(race_eth), NA, 0)),
    race_W     = ifelse(race_eth == "WHITE/CAUCASIAN", 1, ifelse(is.na(race_eth), NA, 0))
  )
table(merge_vt_penn_test_info$race_B) #quick check coding on race_B


   0    1 
 178 1101

Merge Usage Data

Clean Penn Usage Data

Import usage data from VT.

#import penn usage data from VT
vt_usage_penn = read_csv("vt_usage_penn.csv")

Rows: 1648 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sch_name*, Sch_BOY_date*, Sch_EOY_date
dbl (7): Stu_ID_VT*, Num_days_active*, Num_sessions_AI, Num_sessions_human, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

colnames(vt_usage_penn)

 [1] "Sch_name*"                      "Sch_BOY_date*"                 
 [3] "Sch_EOY_date"                   "Stu_ID_VT*"                    
 [5] "Num_days_active*"               "Num_sessions_AI"               
 [7] "Num_sessions_human"             "Num_classesviewed"             
 [9] "Num_essaysreviewed"             "Num_AI_practiceproblemsessions"

Rename columns in usage file. Prepare for merging.

# rename vars
vt_usage_penn = vt_usage_penn %>%
  rename(
    school_vt = "Sch_name*",
    school_BOY_date = "Sch_BOY_date*",
    school_EOY_date = "Sch_EOY_date", 
    vt_acct_id = "Stu_ID_VT*", 
    num_days_active = "Num_days_active*",
    num_sessions_AI = "Num_sessions_AI",
    num_sessions_human = "Num_sessions_human",
    num_classesviewed = "Num_classesviewed",
    num_essaysreviewed = "Num_essaysreviewed",
    num_AI_practiceproblemsessions = "Num_AI_practiceproblemsessions"
  )
print(vt_usage_penn)

# A tibble: 1,648 × 10
   school_vt          school_BOY_date school_EOY_date vt_acct_id num_days_active
   <chr>              <chr>           <chr>                <dbl>           <dbl>
 1 Bell Avenue Eleme… 1/5/26          5/24/26           13134752               3
 2 Bell Avenue Eleme… 1/5/26          5/24/26           13183824              15
 3 Bell Avenue Eleme… 1/5/26          5/24/26           13183824              15
 4 Bell Avenue Eleme… 1/5/26          5/24/26           13155212               8
 5 Bell Avenue Eleme… 1/5/26          5/24/26           13193600              16
 6 Bell Avenue Eleme… 1/5/26          5/24/26           13154761               9
 7 Bell Avenue Eleme… 1/5/26          5/24/26           13155203              NA
 8 Bell Avenue Eleme… 1/5/26          5/24/26           13130626               3
 9 Bell Avenue Eleme… 1/5/26          5/24/26           13155895              16
10 Bell Avenue Eleme… 1/5/26          5/24/26           13152761              12
# ℹ 1,638 more rows
# ℹ 5 more variables: num_sessions_AI <dbl>, num_sessions_human <dbl>,
#   num_classesviewed <dbl>, num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>

Identify duplicate student records within usage data file (N=280).

# frequency table of vt_acct_id occurrences
id_counts_vt_usage_penn = vt_usage_penn %>%
  group_by(vt_acct_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only acct that appear > 1
dupes_vt_usage_penn = id_counts_vt_usage_penn %>%
  filter(n > 1)

# view results
print(dupes_vt_usage_penn)

# A tibble: 280 × 2
   vt_acct_id     n
        <dbl> <int>
 1         NA   200
 2   13143440    15
 3   13143446    14
 4   13144034    14
 5   13143434    13
 6   13143436    13
 7   13143370    11
 8   13143438    11
 9   13130599    10
10   13143445    10
# ℹ 270 more rows

#export
write.csv(dupes_vt_usage_penn, file = "duplicate_acct_ids_usage_vt_penn.csv", row.names = FALSE)

Inspect duplicate student records in usage file (based on VT Account ID - not student ID). Observation: for a given duplicate student record, school name,num_days_active, BOY date and EOY date will match but usage data differs across the duplicate records.

dup_preview_usage_penn = vt_usage_penn %>%
  group_by(vt_acct_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_usage_penn)

# A tibble: 1,572 × 10
   school_vt          school_BOY_date school_EOY_date vt_acct_id num_days_active
   <chr>              <chr>           <chr>                <dbl>           <dbl>
 1 Bell Avenue Eleme… 1/5/26          5/24/26           13134752               3
 2 Bell Avenue Eleme… 1/5/26          5/24/26           13183824              15
 3 Bell Avenue Eleme… 1/5/26          5/24/26           13183824              15
 4 Bell Avenue Eleme… 1/5/26          5/24/26           13155212               8
 5 Bell Avenue Eleme… 1/5/26          5/24/26           13193600              16
 6 Bell Avenue Eleme… 1/5/26          5/24/26           13154761               9
 7 Bell Avenue Eleme… 1/5/26          5/24/26           13130626               3
 8 Bell Avenue Eleme… 1/5/26          5/24/26           13155895              16
 9 Bell Avenue Eleme… 1/5/26          5/24/26           13152761              12
10 Bell Avenue Eleme… 1/5/26          5/24/26           13130597               4
# ℹ 1,562 more rows
# ℹ 5 more variables: num_sessions_AI <dbl>, num_sessions_human <dbl>,
#   num_classesviewed <dbl>, num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>

Decision: Collapse rows for each student ID. Per VT, these instances are rows that only capture one of the students multiple days active and need to be compressed.

Basic check of randomly selected student, 13130591. Inspect sum of Num_sessions_AI before collapsing.

sum(vt_usage_penn$num_sessions_AI[vt_usage_penn$vt_acct_id == 13130591], na.rm = TRUE)

[1] 6

Collapse rows.

vt_usage_penn_collapsed = vt_usage_penn %>%
  group_by(vt_acct_id) %>%
  summarise(
    num_sessions_AI = sum(num_sessions_AI, na.rm = TRUE),
    num_sessions_human = sum(num_sessions_human, na.rm = TRUE),
    num_classesviewed = sum(num_classesviewed, na.rm = TRUE),
    num_essaysreviewed = sum(num_essaysreviewed, na.rm = TRUE),
    num_AI_practiceproblemsessions = sum(num_AI_practiceproblemsessions, na.rm = TRUE),
    
    # keep other columns (take first non-NA value)
    across(-c(num_sessions_AI,
              num_sessions_human,
              num_classesviewed,
              num_essaysreviewed,
              num_AI_practiceproblemsessions),
           ~ first(.x[!is.na(.x)])),
    
    .groups = "drop"
  )

print(vt_usage_penn_collapsed)

# A tibble: 356 × 10
   vt_acct_id num_sessions_AI num_sessions_human num_classesviewed
        <dbl>           <dbl>              <dbl>             <dbl>
 1   13125602               4                  0                 4
 2   13125757               0                  2                 2
 3   13126268               0                  6                 6
 4   13127796               0                  7                 7
 5   13130564               0                  0                 0
 6   13130590               0                  0                 0
 7   13130591               6                  3                 9
 8   13130592               0                  7                 7
 9   13130594               0                  3                 3
10   13130595               3                  3                 6
# ℹ 346 more rows
# ℹ 6 more variables: num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>, school_vt <chr>,
#   school_BOY_date <chr>, school_EOY_date <chr>, num_days_active <dbl>

Check number of IDs that appear more than once.

sum(table(vt_usage_penn_collapsed$vt_acct_id) > 1)

[1] 0

Basic check of randomly selected student, 13130591. Inspect sum of Num_sessions_AI after collapsing.

sum(vt_usage_penn_collapsed$num_sessions_AI[vt_usage_penn_collapsed$vt_acct_id == 13130591], na.rm = TRUE)

[1] 6

Append student ID from crosswalk file to usage file.

vt_usage_penn_collapsed = vt_usage_penn_collapsed %>%
  left_join(vt_roster %>% select(vt_acct_id, student_id),
            by = "vt_acct_id")
print(vt_usage_penn_collapsed)

# A tibble: 356 × 11
   vt_acct_id num_sessions_AI num_sessions_human num_classesviewed
        <dbl>           <dbl>              <dbl>             <dbl>
 1   13125602               4                  0                 4
 2   13125757               0                  2                 2
 3   13126268               0                  6                 6
 4   13127796               0                  7                 7
 5   13130564               0                  0                 0
 6   13130590               0                  0                 0
 7   13130591               6                  3                 9
 8   13130592               0                  7                 7
 9   13130594               0                  3                 3
10   13130595               3                  3                 6
# ℹ 346 more rows
# ℹ 7 more variables: num_essaysreviewed <dbl>,
#   num_AI_practiceproblemsessions <dbl>, school_vt <chr>,
#   school_BOY_date <chr>, school_EOY_date <chr>, num_days_active <dbl>,
#   student_id <dbl>

Merge usage data to student Penn data with test indicator information.

merge_vt_penn_usage_test_info = merge_vt_penn_test_info %>%
  left_join(vt_usage_penn_collapsed, by = "student_id")

print(merge_vt_penn_usage_test_info)

# A tibble: 1,279 × 44
   student_id grade school       treat moy_math_comp moy_math_date eoy_math_comp
        <dbl> <dbl> <chr>        <dbl>         <dbl> <chr>                 <dbl>
 1     203802     6 Colwyn Elem…     1           186 1/9/2026                190
 2     203862     6 Colwyn Elem…     1           223 1/8/2026                242
 3     204106     6 Colwyn Elem…     1           230 1/13/2026               226
 4     204166     6 Colwyn Elem…     1           210 1/8/2026                214
 5     204211     6 Colwyn Elem…     1           181 1/8/2026                195
 6     204755     5 Colwyn Elem…     1           226 1/6/2026                234
 7     204759     5 Colwyn Elem…     1           205 1/6/2026                211
 8     204769     5 Colwyn Elem…     1           209 1/6/2026                214
 9     204787     5 Colwyn Elem…     1           170 1/6/2026                173
10     204843     5 Colwyn Elem…     1           213 1/6/2026                221
# ℹ 1,269 more rows
# ℹ 37 more variables: eoy_math_date <chr>, moy_ela_comp <dbl>,
#   moy_ela_date <chr>, eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender <chr>,
#   iep <dbl>, ell <dbl>, frpl <dbl>, race_eth <chr>, both_math <dbl>,
#   both_ela <dbl>, all_tests <dbl>, moy_math_present <lgl>,
#   eoy_math_present <lgl>, math_moy_missing_eoy_present <dbl>,
#   moy_ela_present <lgl>, eoy_ela_present <lgl>, …

Check for unmatched students.

merge_vt_penn_usage_test_info %>% filter(is.na(student_id)) # no NAs; no unmatched students.

# A tibble: 0 × 44
# ℹ 44 variables: student_id <dbl>, grade <dbl>, school <chr>, treat <dbl>,
#   moy_math_comp <dbl>, moy_math_date <chr>, eoy_math_comp <dbl>,
#   eoy_math_date <chr>, moy_ela_comp <dbl>, moy_ela_date <chr>,
#   eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender <chr>, iep <dbl>, ell <dbl>,
#   frpl <dbl>, race_eth <chr>, both_math <dbl>, both_ela <dbl>,
#   all_tests <dbl>, moy_math_present <lgl>, eoy_math_present <lgl>,
#   math_moy_missing_eoy_present <dbl>, moy_ela_present <lgl>, …

Count of students with any usage logged (defined as num_days_active => 1) by treatment.

merge_vt_penn_usage_test_info$num_days_active = as.numeric(merge_vt_penn_usage_test_info$num_days_active)

#create new any_usage for new tabulation                                                           
summary_usage_by_treat = merge_vt_penn_usage_test_info %>%
  mutate(any_usage = ifelse(num_days_active >= 1, 1, 0)) 
  #for treat=0 cases, any_usage will be NA

#for many treat=0 cases, recode any_usage from NA to 0. 
summary_usage_by_treat$any_usage[is.na(summary_usage_by_treat$any_usage)] <- 0

summary_usage_by_treat = summary_usage_by_treat %>%
  group_by(treat) %>%
  summarise(
    students_with_usage = sum(any_usage == 1, na.rm = TRUE),
    students_no_usage = sum(any_usage == 0, na.rm = TRUE),
    .groups = "drop"
  )

print(summary_usage_by_treat)

# A tibble: 2 × 3
  treat students_with_usage students_no_usage
  <dbl>               <int>             <int>
1     0                   0               834
2     1                 333               112

Export files for e2i Coach

Save e2i file with all matched students from the merge, regardless of missing test info.

#select columns ready for e2i
vt_penn_e2i_usage_test_info = merge_vt_penn_usage_test_info %>%
  dplyr::select(student_id, 
                grade, 
                school, 
                moy_math_comp, moy_math_date, 
                moy_ela_comp,moy_ela_date, 
                eoy_math_comp, eoy_math_date,
                eoy_ela_comp, eoy_ela_date,
                male, iep, ell, frpl,
                treat,
                race_A, race_B, race_H, race_Multi, race_NH, race_AI, race_W,
                both_math, both_ela, all_tests, 
                math_moy_missing_eoy_present, ela_moy_missing_eoy_present, 
                
                vt_acct_id,
                num_days_active,num_sessions_AI, num_sessions_human,
                num_classesviewed, num_essaysreviewed, num_AI_practiceproblemsessions, 
                school_BOY_date, school_EOY_date, 
                )

#export e2i with test indicators, all matched students from the merge, regardless of missing test info (all_tests == 1 or 0)
write.csv(vt_penn_e2i_usage_test_info, file = "vt_penn_e2i_all_students.csv", row.names = FALSE)

Save e2i file with only containing matched students from the merge with complete test info.

table(vt_penn_e2i_usage_test_info$all_tests)


   0    1 
  85 1194

#subset: keep only students with complete test info (all_tests == 1)
vt_penn_e2i_all_tests = vt_penn_e2i_usage_test_info %>% filter(all_tests == 1)
print(vt_penn_e2i_all_tests) # N=1194

# A tibble: 1,194 × 37
   student_id grade school moy_math_comp moy_math_date moy_ela_comp moy_ela_date
        <dbl> <dbl> <chr>          <dbl> <chr>                <dbl> <chr>       
 1     203802     6 Colwy…           186 1/9/2026               181 1/7/2026    
 2     203862     6 Colwy…           223 1/8/2026               230 1/7/2026    
 3     204106     6 Colwy…           230 1/13/2026              226 1/7/2026    
 4     204166     6 Colwy…           210 1/8/2026               212 1/7/2026    
 5     204211     6 Colwy…           181 1/8/2026               231 1/7/2026    
 6     204755     5 Colwy…           226 1/6/2026               219 1/7/2026    
 7     204759     5 Colwy…           205 1/6/2026               199 1/7/2026    
 8     204769     5 Colwy…           209 1/6/2026               215 1/7/2026    
 9     204787     5 Colwy…           170 1/6/2026               170 1/7/2026    
10     204843     5 Colwy…           213 1/6/2026               208 1/7/2026    
# ℹ 1,184 more rows
# ℹ 30 more variables: eoy_math_comp <dbl>, eoy_math_date <chr>,
#   eoy_ela_comp <dbl>, eoy_ela_date <chr>, male <dbl>, iep <dbl>, ell <dbl>,
#   frpl <dbl>, treat <dbl>, race_A <dbl>, race_B <dbl>, race_H <dbl>,
#   race_Multi <dbl>, race_NH <dbl>, race_AI <dbl>, race_W <dbl>,
#   both_math <dbl>, both_ela <dbl>, all_tests <dbl>,
#   math_moy_missing_eoy_present <dbl>, ela_moy_missing_eoy_present <dbl>, …

Count of treatment students among the subset with complete test info.

table(vt_penn_e2i_all_tests$treat)


  0   1 
765 429

#export e2i only containing matched students from the merge with complete test info (all_tests == 1)
write.csv(vt_penn_e2i_all_tests, file = "vt_penn_e2i_complete_tests.csv", row.names = FALSE)