Data Processing: Varsity Tutors, Cumberland

Author

Alexis Davila

Clean Data

Import

Packages selected for inspecting and cleaning Varsity Tutors, district data from Cumberland schools.

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import roster.

roster <- read_csv("roster_vt_cumber.csv")

Rows: 14447 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): school
dbl (3): school_code, student_id, grade

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Change column headers for roster.

roster = roster %>%
  rename(
    grade_ros = "grade",
    school_ros = "school",
  )

Import Middle of Year data request.

moy_map = read_csv("moy_map_vt_cumber.csv")

Rows: 10572 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): School Name, MOY MAP Math Math Test Date*, MOY MAP ELA Test Date*, ...
dbl (7): Student ID, Student Grade Level, MOY MAP Math Composite Scaled Scor...
lgl (2): Treatment, FRPL

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

view(moy_map)

Change column headers for moy_map.

moy_map = moy_map %>%
  rename(
    student_id = "Student ID",
    grade_moy = "Student Grade Level", 
    school_moy = "School Name", 
    treat_moy = "Treatment", 
    moy_math_comp = "MOY MAP Math Composite Scaled Score*", 
    moy_math_date = "MOY MAP Math Math Test Date*", 
    moy_ela_comp = "MOY MAP ELA Composite Scaled Score*", 
    moy_ela_date = "MOY MAP ELA Test Date*", 
    gender_moy = "Gender", 
    iep_moy = "IEP", 
    ell_moy = "ELL", 
    frpl_moy = "FRPL", 
    race_eth_moy = "Race/Ethnicity", 
    at_risk_moy = "Academically at-risk"
    
  )

Import End of Year data request.

eoy_map = read_csv("eoy_map_vt_cumber.csv")

Rows: 8810 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): School Name, EOY MAP Math Math Test Date*, EOY MAP ELA Test Date*, ...
dbl (7): Student ID, Student Grade Level, EOY MAP Math Composite Scaled Scor...
lgl (2): Treatment, FRPL

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

view(eoy_map)

Change column headers for eoy_map.

eoy_map = eoy_map %>%
  rename(
    student_id = "Student ID",
    grade_eoy = "Student Grade Level", 
    school_eoy = "School Name", 
    treat_eoy = "Treatment", 
    eoy_math_comp = "EOY MAP Math Composite Scaled Score*", 
    eoy_math_date = "EOY MAP Math Math Test Date*", 
    eoy_ela_comp = "EOY MAP ELA Composite  Scaled Score*", 
    eoy_ela_date = "EOY MAP ELA Test Date*", 
    gender_eoy = "Gender", 
    iep_eoy = "IEP", 
    ell_eoy = "ELL", 
    frpl_eoy = "FRPL", 
    race_eth_eoy = "Race/Ethnicity", 
    at_risk_eoy = "Academically at-risk"
    
  )

Inspect duplicates

Roster

Identify duplicate student records within roster file.

# frequency table of student_id occurrences
id_counts_roster = roster %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
  filter(n > 1)

# view results
print(dupes_roster)

# A tibble: 1,425 × 2
   student_id     n
        <dbl> <int>
 1 2257769627    16
 2 3284674675    16
 3 5761715848    15
 4 7877815913    15
 5 8922727837    15
 6 4254112513    14
 7 5251233515    14
 8 3568287471    13
 9 4526196894    13
10 6564849621    13
# ℹ 1,415 more rows

#export
write.csv(dupes_roster, file = "duplicate_ids_roster_vt_cumber.csv", row.names = FALSE)

Inspect duplicate student records in roster. Observation: for a given duplicate student record, it appears to be the same individual (e.g., same school, same grade)

dup_preview_roster = roster %>%
  group_by(student_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_roster)

# A tibble: 5,315 × 4
   school_code school_ros           student_id grade_ros
         <dbl> <chr>                     <dbl>     <dbl>
 1      260336 Anne Chesnutt Middle 9581845356         6
 2      260336 Anne Chesnutt Middle 9581845356         6
 3      260336 Anne Chesnutt Middle 9581845356         6
 4      260336 Anne Chesnutt Middle 5396535695         6
 5      260336 Anne Chesnutt Middle 5396535695         6
 6      260336 Anne Chesnutt Middle 4978175585         6
 7      260336 Anne Chesnutt Middle 4978175585         6
 8      260336 Anne Chesnutt Middle 6429225941         6
 9      260336 Anne Chesnutt Middle 6429225941         6
10      260336 Anne Chesnutt Middle 6429225941         6
# ℹ 5,305 more rows

Decision: keep first occurrence of student ID based on current row order (N=10,557).

roster_unique = roster %>%
  distinct(student_id, .keep_all = TRUE)

print(roster_unique)

# A tibble: 10,557 × 4
   school_code school_ros           student_id grade_ros
         <dbl> <chr>                     <dbl>     <dbl>
 1      260336 Anne Chesnutt Middle 2287512462         6
 2      260336 Anne Chesnutt Middle 4431265988         6
 3      260336 Anne Chesnutt Middle 9779738622         6
 4      260336 Anne Chesnutt Middle 3968556194         6
 5      260336 Anne Chesnutt Middle 1925959163         6
 6      260336 Anne Chesnutt Middle 2638992577         6
 7      260336 Anne Chesnutt Middle 8925824655         6
 8      260336 Anne Chesnutt Middle 4634477858         6
 9      260336 Anne Chesnutt Middle 9836237186         6
10      260336 Anne Chesnutt Middle 1961537885         6
# ℹ 10,547 more rows

MOY

Identify duplicate student records within MOY file.

# frequency table of student_id occurrences
id_counts_moy = moy_map %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_moy = id_counts_moy %>%
  filter(n > 1)

# view list of duplicate IDs
print(dupes_moy)

# A tibble: 1 × 2
  student_id     n
       <dbl> <int>
1 7117828382     2

#export
write.csv(dupes_moy, file = "duplicate_ids_moy_vt_cumber.csv", row.names = FALSE)

Inspect duplicate student record in the MOY file. Observation: Student records contain the same demographic info and math test info, however ELA test info (date and score) differ. Decision: Considering the student has complete test info in both observations, keep only the first student record (i.e., take the duplicate row with the earlier ELA test info).

dup_preview_moy = moy_map %>%
  group_by(student_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_moy)

# A tibble: 2 × 14
  student_id grade_moy school_moy          treat_moy moy_math_comp moy_math_date
       <dbl>     <dbl> <chr>               <lgl>             <dbl> <chr>        
1 7117828382         7 Lewis Chapel Middle NA                  204 1/8/2026     
2 7117828382         7 Lewis Chapel Middle NA                  204 1/8/2026     
# ℹ 8 more variables: moy_ela_comp <dbl>, moy_ela_date <chr>, gender_moy <chr>,
#   iep_moy <dbl>, ell_moy <dbl>, frpl_moy <lgl>, race_eth_moy <chr>,
#   at_risk_moy <dbl>

Remove duplicate student record in MOY file following above decision.

# remove duplicates entirely
moy_no_dup = moy_map %>%
  group_by(student_id) %>%
  filter(n() == 1) %>%   # keep only IDs that appear once
  ungroup()

# pull out duplicate record with earliest ELA info to keep
moy_dup_keep = moy_map %>%
  filter(student_id == "7117828382",
         moy_ela_date == "1/7/2026")

# append moy w/out dupes to selected dup record
moy_unique = bind_rows(moy_no_dup, moy_dup_keep)

print(moy_unique)

# A tibble: 10,571 × 14
   student_id grade_moy school_moy         treat_moy moy_math_comp moy_math_date
        <dbl>     <dbl> <chr>              <lgl>             <dbl> <chr>        
 1 2287512462         6 Anne Chesnutt Mid… NA                  225 1/14/2026    
 2 4431265988         6 Anne Chesnutt Mid… NA                  201 1/14/2026    
 3 9779738622         6 Anne Chesnutt Mid… NA                  235 1/15/2026    
 4 3968556194         6 Anne Chesnutt Mid… NA                  211 1/14/2026    
 5 1925959163         6 Anne Chesnutt Mid… NA                  226 1/14/2026    
 6 2638992577         6 Anne Chesnutt Mid… NA                  213 1/14/2026    
 7 8925824655         6 Anne Chesnutt Mid… NA                  201 1/14/2026    
 8 4634477858         6 Anne Chesnutt Mid… NA                  228 1/14/2026    
 9 9836237186         6 Anne Chesnutt Mid… NA                  194 1/14/2026    
10 1961537885         6 Anne Chesnutt Mid… NA                  249 1/14/2026    
# ℹ 10,561 more rows
# ℹ 8 more variables: moy_ela_comp <dbl>, moy_ela_date <chr>, gender_moy <chr>,
#   iep_moy <dbl>, ell_moy <dbl>, frpl_moy <lgl>, race_eth_moy <chr>,
#   at_risk_moy <dbl>

EOY

Identify duplicate student records within EOY file.

# frequency table of student_id occurrences
id_counts_eoy = eoy_map %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_eoy = id_counts_eoy %>%
  filter(n > 1)

# view list of duplicate IDs
print(dupes_eoy)

# A tibble: 1 × 2
  student_id     n
       <dbl> <int>
1 8354674797     2

#export
write.csv(dupes_eoy, file = "duplicate_ids_eoy_vt_cumber.csv", row.names = FALSE)

dup_preview_eoy = eoy_map %>%
  group_by(student_id) %>%
  filter(n() > 1) %>% 
  ungroup()

print(dup_preview_eoy)

# A tibble: 2 × 14
  student_id grade_eoy school_eoy         treat_eoy eoy_math_comp eoy_math_date
       <dbl>     <dbl> <chr>              <lgl>             <dbl> <chr>        
1 8354674797         6 Spring Lake Middle NA                  206 4/17/2026    
2 8354674797         6 Spring Lake Middle NA                  206 4/17/2026    
# ℹ 8 more variables: eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender_eoy <chr>,
#   iep_eoy <dbl>, ell_eoy <dbl>, frpl_eoy <lgl>, race_eth_eoy <chr>,
#   at_risk_eoy <dbl>

Remove duplicate student record in EOY file.

# remove duplicates entirely
eoy_no_dup = eoy_map %>%
  group_by(student_id) %>%
  filter(n() == 1) %>%   # keep only IDs that appear once
  ungroup()

# pull out duplicate record with earliest ELA info to keep
eoy_dup_keep = eoy_map %>%
  filter(student_id == "8354674797",
         eoy_ela_date == "4/20/2026")

# append eoy w/out dupes to selected dup record
eoy_unique = bind_rows(eoy_no_dup, eoy_dup_keep)

# preview result
print(eoy_unique)

# A tibble: 8,809 × 14
   student_id grade_eoy school_eoy         treat_eoy eoy_math_comp eoy_math_date
        <dbl>     <dbl> <chr>              <lgl>             <dbl> <chr>        
 1 7729773724         8 Cumberland Virtua… NA                  229 4/16/2026    
 2 4694886258         8 Cumberland Virtua… NA                  240 4/15/2026    
 3 1836192339         8 Cumberland Virtua… NA                  268 4/15/2026    
 4 7362151357         8 Cumberland Virtua… NA                  224 4/15/2026    
 5 4215536561         8 Cumberland Virtua… NA                  199 4/15/2026    
 6 3532188732         8 Cumberland Virtua… NA                  252 4/15/2026    
 7 3482512388         8 Cumberland Virtua… NA                  283 4/15/2026    
 8 4294541369         8 Cumberland Virtua… NA                  236 4/15/2026    
 9 4855429317         8 Cumberland Virtua… NA                  241 4/15/2026    
10 1582791228         8 Cumberland Virtua… NA                  239 4/15/2026    
# ℹ 8,799 more rows
# ℹ 8 more variables: eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender_eoy <chr>,
#   iep_eoy <dbl>, ell_eoy <dbl>, frpl_eoy <lgl>, race_eth_eoy <chr>,
#   at_risk_eoy <dbl>

Merge

Merge MOY to roster.

merge_roster_moy = roster_unique %>%
  left_join(moy_unique, by = "student_id")

Check unmatched records in intermediary data: 257 students in the roster with no appearance in MOY.

#check students in roster with NO match in MOY
missing_moy = roster_unique %>%
  anti_join(moy_unique, by = "student_id")
cat("Number of roster students missing MOY data:", nrow(missing_moy), "\n")

Number of roster students missing MOY data: 257

Check unexpected records (MOY): 271 students present in the MOY file that do NOT present in the roster.

# check in MOY data NOT in roster
unexpected_moy_records = moy_unique %>%
  anti_join(roster_unique, by = "student_id")

cat("Number of MOY records not in roster:", nrow(unexpected_moy_records), "\n")

Number of MOY records not in roster: 271

Match intermediary with EOY records by student ID.

merge_cumber = merge_roster_moy %>%
  left_join(eoy_unique, by = "student_id")

Check unmatched records in final merge: 1753 students in the intermediary file, merge_roster_moy, with no appearance in EOY.

#check students in merge_roster_moy with NO match in EOY
missing_eoy = merge_roster_moy %>%
  anti_join(eoy_unique, by = "student_id")

cat("Number of students missing EOY data from merge_roster_moy:", nrow(missing_eoy), "\n")

Number of students missing EOY data from merge_roster_moy: 1753

Check unexpected records (EOY): 5 students present in the EOY file that not NOT present in the roster.

# check in EOY file NOT in roster (unexpected records)
unexpected_eoy_records = eoy_unique %>%
  anti_join(roster_unique, by = "student_id")


cat("Number of EOY records not in roster:", nrow(unexpected_eoy_records), "\n")

Number of EOY records not in roster: 5

Export unmatched (missing) and unexpected (extra) records.

write.csv(missing_moy, file =  "missing_moy_records_vt_cumber.csv", row.names = FALSE)
write.csv(unexpected_moy_records, file = "extra_moy_records_vt_cumber.csv", row.names = FALSE)
write.csv(missing_eoy, file = "missing_eoy_records_vt_cumber.csv", row.names = FALSE)
write.csv(unexpected_eoy_records, file = "extra_eoy_records_vt_cumber.csv", row.names = FALSE)

Clean merge

Keep rows where ID presents across the de-duped roster, moy file, and eoy file (N=8,646).

roster_moy_matched = roster_unique %>%
  inner_join(moy_unique, by = "student_id")

merge_vt_cumber_unclean = roster_moy_matched %>%
  inner_join(eoy_unique, by = "student_id")

print(merge_vt_cumber_unclean)

# A tibble: 8,648 × 30
   school_code school_ros    student_id grade_ros grade_moy school_moy treat_moy
         <dbl> <chr>              <dbl>     <dbl>     <dbl> <chr>      <lgl>    
 1      260336 Anne Chesnut… 2287512462         6         6 Anne Ches… NA       
 2      260336 Anne Chesnut… 4431265988         6         6 Anne Ches… NA       
 3      260336 Anne Chesnut… 9779738622         6         6 Anne Ches… NA       
 4      260336 Anne Chesnut… 3968556194         6         6 Anne Ches… NA       
 5      260336 Anne Chesnut… 1925959163         6         6 Anne Ches… NA       
 6      260336 Anne Chesnut… 2638992577         6         6 Anne Ches… NA       
 7      260336 Anne Chesnut… 8925824655         6         6 Anne Ches… NA       
 8      260336 Anne Chesnut… 4634477858         6         6 Anne Ches… NA       
 9      260336 Anne Chesnut… 9836237186         6         6 Anne Ches… NA       
10      260336 Anne Chesnut… 1961537885         6         6 Anne Ches… NA       
# ℹ 8,638 more rows
# ℹ 23 more variables: moy_math_comp <dbl>, moy_math_date <chr>,
#   moy_ela_comp <dbl>, moy_ela_date <chr>, gender_moy <chr>, iep_moy <dbl>,
#   ell_moy <dbl>, frpl_moy <lgl>, race_eth_moy <chr>, at_risk_moy <dbl>,
#   grade_eoy <dbl>, school_eoy <chr>, treat_eoy <lgl>, eoy_math_comp <dbl>,
#   eoy_math_date <chr>, eoy_ela_comp <dbl>, eoy_ela_date <chr>,
#   gender_eoy <chr>, iep_eoy <dbl>, ell_eoy <dbl>, frpl_eoy <lgl>, …

Check for requested data elements.

colnames(merge_vt_cumber_unclean)

 [1] "school_code"   "school_ros"    "student_id"    "grade_ros"    
 [5] "grade_moy"     "school_moy"    "treat_moy"     "moy_math_comp"
 [9] "moy_math_date" "moy_ela_comp"  "moy_ela_date"  "gender_moy"   
[13] "iep_moy"       "ell_moy"       "frpl_moy"      "race_eth_moy" 
[17] "at_risk_moy"   "grade_eoy"     "school_eoy"    "treat_eoy"    
[21] "eoy_math_comp" "eoy_math_date" "eoy_ela_comp"  "eoy_ela_date" 
[25] "gender_eoy"    "iep_eoy"       "ell_eoy"       "frpl_eoy"     
[29] "race_eth_eoy"  "at_risk_eoy"

Insert treatment status

Import roster from VT that details which students have accounts in their system

#import ID crosswalk from VT
vt_roster = read_csv("vt_roster_all_districts.csv")

Rows: 1496 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): District, School
dbl (2): District ID, Varsity Tutors ID

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# rename vars
vt_roster = vt_roster %>%
  rename(
    district_vt = "District",
    school_vt = "School",
    student_id = "District ID", 
    vt_acct_id = "Varsity Tutors ID"
  )

Add treatment status. Equals 1 if student id appears in roster from VT (N=870). All else equals 0.

merge_vt_cumber_unclean = merge_vt_cumber_unclean %>%
  mutate(
    treat = ifelse(student_id %in% vt_roster$student_id, 1, 0)
  )

table(merge_vt_cumber_unclean$treat)


   0    1 
7778  870

Inspect demographics (MOY v EOY)

GENDER: From MOY to EOY, there was 1 additional F student and 1 less M student, which suggests 1 student changed their gender identification.

gender_summary  = merge_vt_cumber_unclean  %>%
  pivot_longer(
    cols = c(gender_moy, gender_eoy),
    names_to = "time",
    values_to = "gender"
  ) %>%
  mutate(time = ifelse(time == "gender_moy", "moy", "eoy")) %>%
  filter(gender %in% c("M", "F")) %>%
  count(time, gender) %>%
  group_by(time) %>%
  mutate(
    total = sum(n),
    pct = n / total
  ) %>%
  arrange(time, gender)

print(gender_summary)

# A tibble: 4 × 5
# Groups:   time [2]
  time  gender     n total   pct
  <chr> <chr>  <int> <int> <dbl>
1 eoy   F       4188  8648 0.484
2 eoy   M       4460  8648 0.516
3 moy   F       4187  8648 0.484
4 moy   M       4461  8648 0.516

IEPs: 5 additional students gained IEPs at the end of year.

iep_summary = merge_vt_cumber_unclean %>%
  summarise(
    moy_iep_n = sum(iep_moy == 1, na.rm = TRUE),
    eoy_iep_n = sum(iep_eoy == 1, na.rm = TRUE),

    total_students = n(),

    moy_iep_pct =  moy_iep_n / total_students,
    eoy_iep_pct =  eoy_iep_n / total_students,

    change_n = eoy_iep_n - moy_iep_n,
    change_pct = eoy_iep_pct - moy_iep_pct
  )

print(iep_summary)

# A tibble: 1 × 7
  moy_iep_n eoy_iep_n total_students moy_iep_pct eoy_iep_pct change_n change_pct
      <int>     <int>          <int>       <dbl>       <dbl>    <int>      <dbl>
1      1245      1250           8648       0.144       0.145        5   0.000578

ELLs: 2 additional students gained ELL status by EOY.

ell_summary = merge_vt_cumber_unclean %>%
  summarise(
    moy_ell_n = sum(ell_moy == 1, na.rm = TRUE),
    eoy_ell_n = sum(ell_eoy == 1, na.rm = TRUE),

    total_students = n(),

    moy_ell_pct =  moy_ell_n / total_students,
    eoy_ell_pct =  eoy_ell_n / total_students,

    change_n = eoy_ell_n - moy_ell_n,
    change_pct = eoy_ell_pct - moy_ell_pct
  )

print(ell_summary)

# A tibble: 1 × 7
  moy_ell_n eoy_ell_n total_students moy_ell_pct eoy_ell_pct change_n change_pct
      <int>     <int>          <int>       <dbl>       <dbl>    <int>      <dbl>
1       372       374           8648      0.0430      0.0432        2   0.000231

FRPL: No students identified as having FRPL in MOY or EOY.

frpl_summary = merge_vt_cumber_unclean %>%
  summarise(
    moy_frpl_n = sum(frpl_moy == 1, na.rm = TRUE),
    eoy_frpl_n = sum(frpl_eoy == 1, na.rm = TRUE),

    total_students = n(),

    moy_frpl_pct =  moy_frpl_n / total_students,
    eoy_frpl_pct =  eoy_frpl_n / total_students,

    change_n = eoy_frpl_n - moy_frpl_n,
    change_pct = eoy_frpl_pct - moy_frpl_pct
  )

print(frpl_summary)

# A tibble: 1 × 7
  moy_frpl_n eoy_frpl_n total_students moy_frpl_pct eoy_frpl_pct change_n
       <int>      <int>          <int>        <dbl>        <dbl>    <int>
1          0          0           8648            0            0        0
# ℹ 1 more variable: change_pct <dbl>

AT-RISK: No change in status between MOY to EOY.

atrisk_summary = merge_vt_cumber_unclean %>%
  summarise(
    moy_atrisk_n = sum(at_risk_moy == 1, na.rm = TRUE),
    eoy_atrisk_n = sum(at_risk_eoy == 1, na.rm = TRUE),

    total_students = n(),

    moy_atrisk_pct =  moy_atrisk_n / total_students,
    eoy_atrisk_pct =  eoy_atrisk_n / total_students,

    change_n = eoy_atrisk_n - moy_atrisk_n,
    change_pct = eoy_atrisk_pct - moy_atrisk_pct
  )

print(atrisk_summary)

# A tibble: 1 × 7
  moy_atrisk_n eoy_atrisk_n total_students moy_atrisk_pct eoy_atrisk_pct
         <int>        <int>          <int>          <dbl>          <dbl>
1         2669         2669           8648          0.309          0.309
# ℹ 2 more variables: change_n <int>, change_pct <dbl>

RACE-ETHNICITY: Following categories appeared to have stayed more or less the same: A, B, W. There was a significant drop in the count of H students, suggesting that the district and/or families re-identified away from H in favor of other new categories: I, Multi, and P.

MOY: inspect racial categories and basic counts

table(merge_vt_cumber_unclean$race_eth_moy)


   A    B    H    P    W 
 180 4075 2385   46 1962

EOY: inspect racial categories and basic counts

table(merge_vt_cumber_unclean$race_eth_eoy)


    A     B     H     I Multi     P     W 
  181  4075  1467   100   817    46  1962

Decision: Keep racial/ethnicity categorizations and gender identifications from EOY file. All other demographic data keep from MOY file.

final_vt_cumber = merge_vt_cumber_unclean %>%
  dplyr::select(school_code, 
                student_id, 
                grade_moy, 
                school_moy, 
                moy_math_comp, moy_math_date, 
                moy_ela_comp,moy_ela_date, 
                eoy_math_comp, eoy_math_date,
                eoy_ela_comp, eoy_ela_date,
                gender_eoy, iep_moy, ell_moy,frpl_moy, race_eth_eoy, at_risk_moy,
                treat)

colnames(final_vt_cumber)

 [1] "school_code"   "student_id"    "grade_moy"     "school_moy"   
 [5] "moy_math_comp" "moy_math_date" "moy_ela_comp"  "moy_ela_date" 
 [9] "eoy_math_comp" "eoy_math_date" "eoy_ela_comp"  "eoy_ela_date" 
[13] "gender_eoy"    "iep_moy"       "ell_moy"       "frpl_moy"     
[17] "race_eth_eoy"  "at_risk_moy"   "treat"

Clean column names for selected demographics.

final_vt_cumber = final_vt_cumber %>%
  rename(
    grade = "grade_moy", 
    school = "school_moy", 
    gender = "gender_eoy", 
    iep = "iep_moy", 
    ell = "ell_moy", 
    frpl = "frpl_moy", 
    race_eth = "race_eth_eoy", 
    at_risk = "at_risk_moy"
  )
colnames(final_vt_cumber)

 [1] "school_code"   "student_id"    "grade"         "school"       
 [5] "moy_math_comp" "moy_math_date" "moy_ela_comp"  "moy_ela_date" 
 [9] "eoy_math_comp" "eoy_math_date" "eoy_ela_comp"  "eoy_ela_date" 
[13] "gender"        "iep"           "ell"           "frpl"         
[17] "race_eth"      "at_risk"       "treat"

Inspect Merged Data

Missing Data Summary

Percentage missing for each variable. Note: percentage missing for at_risk probably indicates the percent NOT at-risk. Observations: FRPL may be missing for all students or district forgot to add to file. All other demographic info is present for all students. Some missing data for test info.

missing_summary = final_vt_cumber %>%
  summarise(across(
    everything(),
    ~ mean(is.na(.)) * 100
  )) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "percent_missing"
  ) %>%
  arrange(desc(percent_missing))

print(missing_summary)

# A tibble: 19 × 2
   variable      percent_missing
   <chr>                   <dbl>
 1 frpl                   100   
 2 at_risk                 69.1 
 3 eoy_math_comp           12.7 
 4 eoy_math_date           12.7 
 5 moy_ela_comp             8.51
 6 moy_ela_date             8.51
 7 moy_math_comp            6.93
 8 moy_math_date            6.93
 9 eoy_ela_comp             6.81
10 eoy_ela_date             6.81
11 school_code              0   
12 student_id               0   
13 grade                    0   
14 school                   0   
15 gender                   0   
16 iep                      0   
17 ell                      0   
18 race_eth                 0   
19 treat                    0

Percentage of student who have both MOY and EOY data overall

#indicator for having BOTH MOY and EOY scores
final_vt_cumber_test_info = final_vt_cumber %>%
  mutate(
    both_math = !is.na(moy_math_comp) & !is.na(eoy_math_comp),
    both_ela  = !is.na(moy_ela_comp)  & !is.na(eoy_ela_comp),
    all_tests = both_math & both_ela
  )

#pct across schools
overall_pct = final_vt_cumber_test_info %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2))

print(overall_pct)

# A tibble: 1 × 3
  total_students students_with_all_tests percent_with_all_tests
           <int>                   <int>                  <dbl>
1           8648                    6311                   73.0

Percentage of student who have both MOY and EOY data by school

by_school_pct = final_vt_cumber_test_info %>%
  group_by(school) %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2)) %>%
  arrange(desc(percent_with_all_tests))

print(by_school_pct)

# A tibble: 17 × 4
   school           total_students students_with_all_te…¹ percent_with_all_tests
   <chr>                     <int>                  <int>                  <dbl>
 1 Seventy-First C…            387                    384                   99.2
 2 Cumberland Virt…            126                    124                   98.4
 3 Anne Chesnutt M…            492                    482                   98.0
 4 South View Midd…            564                    525                   93.1
 5 R Max Abbott Mi…            699                    635                   90.8
 6 Lewis Chapel Mi…            509                    441                   86.6
 7 Gray's Creek Mi…            979                    829                   84.7
 8 Luther Nick Jer…            515                    389                   75.5
 9 Douglas Byrd Mi…            644                    476                   73.9
10 Hope Mills Midd…            407                    300                   73.7
11 Pine Forest Mid…            631                    464                   73.5
12 Westover Middle             647                    469                   72.5
13 Spring Lake Mid…            428                    268                   62.6
14 Mac Williams Mi…            805                    412                   51.2
15 Howard Learning…             55                     19                   34.6
16 John R Griffin …            559                     94                   16.8
17 Reid Ross Class…            201                      0                    0  
# ℹ abbreviated name: ¹students_with_all_tests

Dummies for test info

both_math: dummy for students with complete math test info across MOY and EOY.

table(final_vt_cumber_test_info$both_math)


FALSE  TRUE 
 1610  7038

#complete math: MOY and EOY present
final_vt_cumber_test_info$both_math = ifelse(final_vt_cumber_test_info$both_math, 1, 0)
table(final_vt_cumber_test_info$both_math) #check counts


   0    1 
1610 7038

both_ela: dummy for students with complete ELA test info across MOY and EOY..

table(final_vt_cumber_test_info$both_ela)


FALSE  TRUE 
 1215  7433

#complete ELA: MOY and EOY present
final_vt_cumber_test_info$both_ela = ifelse(final_vt_cumber_test_info$both_ela, 1, 0)
table(final_vt_cumber_test_info$both_ela) #check counts


   0    1 
1215 7433

all_tests: dummy for students with both math and ELA test info complete across MOY and EOY.

table(final_vt_cumber_test_info$all_tests)


FALSE  TRUE 
 2337  6311

#complete math and ELA: MOY and EOY present
final_vt_cumber_test_info$all_tests = ifelse(final_vt_cumber_test_info$all_tests, 1, 0)
table(final_vt_cumber_test_info$all_tests) #check counts


   0    1 
2337 6311

math_moy_missing_eoy_present: dummy for students who have EOY math test info but MOY test info is missing.

final_vt_cumber_test_info <- final_vt_cumber_test_info %>%
  mutate(
    moy_math_present = !is.na(moy_math_comp) & !is.na(moy_math_date),
    eoy_math_present = !is.na(eoy_math_comp) & !is.na(eoy_math_date),
    
    math_moy_missing_eoy_present = case_when(
      !moy_math_present &  eoy_math_present ~ 1,   # moy missing, eoy present
       moy_math_present &  eoy_math_present ~ 0,   # both present
      !moy_math_present & !eoy_math_present ~ NA_real_  # both missing
    )
  )
table(final_vt_cumber_test_info$math_moy_missing_eoy_present)


   0    1 
7038  512

ela_moy_missing_eoy_present: dummy for students who have EOY ELA test info but MOY test info is missing.

final_vt_cumber_test_info <- final_vt_cumber_test_info %>%
  mutate(
    moy_ela_present = !is.na(moy_ela_comp) & !is.na(moy_ela_date),
    eoy_ela_present = !is.na(eoy_ela_comp) & !is.na(eoy_ela_date),
    
    ela_moy_missing_eoy_present = case_when(
      !moy_ela_present &  eoy_ela_present ~ 1,   # moy missing, eoy present
       moy_ela_present &  eoy_ela_present ~ 0,   # both present
      !moy_ela_present & !eoy_ela_present ~ NA_real_  # both missing
    )
  )
table(final_vt_cumber_test_info$ela_moy_missing_eoy_present)


   0    1 
7433  626

###Test info summary

Proportion of math test info present (MOY present and EOY present, grouped by treatment status).

summary_math_tests_present <- final_vt_cumber_test_info %>%
  mutate(
    moy_math_present = !is.na(moy_math_comp) & !is.na(moy_math_date),
    eoy_math_present = !is.na(eoy_math_comp) & !is.na(eoy_math_date)
  ) %>%
  group_by(treat) %>%
  summarise(
    n_students = n(),
    
    moy_count = sum(moy_math_present),
    moy_prop  = mean(moy_math_present),
    
    eoy_count = sum(eoy_math_present),
    eoy_prop  = mean(eoy_math_present),
    
    .groups = "drop"
  )

# View result
print(summary_math_tests_present)

# A tibble: 2 × 6
  treat n_students moy_count moy_prop eoy_count eoy_prop
  <dbl>      <int>     <int>    <dbl>     <int>    <dbl>
1     0       7778      7262    0.934      6736    0.866
2     1        870       787    0.905       814    0.936

Proportion of ELA test info present (MOY present and EOY present, grouped by treatment status).

summary_ela_tests_present <- final_vt_cumber_test_info %>%
  mutate(
    moy_ela_present = !is.na(moy_ela_comp) & !is.na(moy_ela_date),
    eoy_ela_present = !is.na(eoy_ela_comp) & !is.na(eoy_ela_date)
  ) %>%
  group_by(treat) %>%
  summarise(
    n_students = n(),
    
    moy_count = sum(moy_ela_present),
    moy_prop  = mean(moy_ela_present),
    
    eoy_count = sum(eoy_ela_present),
    eoy_prop  = mean(eoy_ela_present),
    
    .groups = "drop"
  )

# View result
print(summary_ela_tests_present)

# A tibble: 2 × 6
  treat n_students moy_count moy_prop eoy_count eoy_prop
  <dbl>      <int>     <int>    <dbl>     <int>    <dbl>
1     0       7778      7168    0.922      7285    0.937
2     1        870       744    0.855       774    0.890

Data Summary

Generate school-level means for MOY ELA and Math assessment data, number of students, percentages of students by race/ethnicity, gender, and FRPL-status.

#create function to streamline calculations
calc_pct = function(x) {
  prop.table(table(x)) * 100
}

#school-level means and counts for TEST data
school_means = final_vt_cumber_test_info %>%
  group_by(school) %>%
  summarise(
    n_students = n(),
    mean_moy_math = mean(moy_math_comp, na.rm = TRUE),
    mean_moy_ela  = mean(moy_ela_comp, na.rm = TRUE)
  )

# DEMOGRAPHICS

# race/ethnicity
race_pct = final_vt_cumber_test_info %>%
  group_by(school, race_eth) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = race_eth,
    values_from = pct,
    names_prefix = "race_"
  )

#gender
gender_pct = final_vt_cumber_test_info %>%
  group_by(school, gender) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = gender,
    values_from = pct,
    names_prefix = "gender_"
  )

#free reduced price lunch
frpl_pct = final_vt_cumber_test_info %>%
  group_by(school, frpl) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = frpl,
    values_from = pct,
    names_prefix = "frpl_"
  )

#english language learner
ell_pct = final_vt_cumber_test_info %>%
  group_by(school, ell) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = ell,
    values_from = pct,
    names_prefix = "ell_"
  )

#at-risk students
atrisk_pct = final_vt_cumber_test_info %>%
  group_by(school, at_risk) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = at_risk,
    values_from = pct,
    names_prefix = "atrisk_"
  )

# COMBINE summaries
school_summary = school_means %>%
  left_join(race_pct,   by = "school") %>%
  left_join(gender_pct, by = "school") %>%
  left_join(frpl_pct,   by = "school") %>%
  left_join(ell_pct,    by = "school") %>%
  left_join(atrisk_pct, by = "school")

school_summary = school_summary %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))

print(school_summary)

# A tibble: 17 × 18
   school      n_students mean_moy_math mean_moy_ela race_A race_B race_H race_I
   <chr>            <dbl>         <dbl>        <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 Anne Chesn…        492          220.         214.   0.41   62.6   19.7   0.81
 2 Cumberland…        126          239.         226.   1.59   43.6   23.0   0.79
 3 Douglas By…        644          214.         210.   1.24   50     22.7   1.4 
 4 Gray's Cre…        979          222.         215.   1.33   28.4   15.1   2.25
 5 Hope Mills…        407          221.         214.   0.74   41.0   18.4   1.23
 6 Howard Lea…         55          204.         202.  NA      74.6   16.4  NA   
 7 John R Gri…        559          227.         216.   7.51   25.8   18.1   1.61
 8 Lewis Chap…        509          213.         207.   1.18   67.2   15.7   0.59
 9 Luther Nic…        515          216.         210.   0.78   71.1   13.4   0.58
10 Mac Willia…        805          222.         213.   0.87   24.5   13.2   2.24
11 Pine Fores…        631          219.         213.   2.85   39.3   16.5   0.63
12 R Max Abbo…        699          219.         214.   3.72   42.8   15.3   0.86
13 Reid Ross …        201          NaN          NaN   NA      63.7   11.0  NA   
14 Seventy-Fi…        387          228.         220.   5.43   53.8   15.5   0.26
15 South View…        564          215.         211.   1.06   54.6   19.3   1.77
16 Spring Lak…        428          216.         211.   1.17   59.8   21.3   0.23
17 Westover M…        647          218.         212.   2.78   63.1   17.6   0.62
# ℹ 10 more variables: race_Multi <dbl>, race_P <dbl>, race_W <dbl>,
#   gender_F <dbl>, gender_M <dbl>, frpl_NA <dbl>, ell_0 <dbl>, ell_1 <dbl>,
#   atrisk_1 <dbl>, atrisk_NA <dbl>

write.csv(school_summary, "test_summary_by_school_vt_cumber.csv", row.names = FALSE)

Reformat Data

Create dummy variables

Create numeric binary for gender.

table(final_vt_cumber_test_info$gender)


   F    M 
4188 4460

final_vt_cumber_test_info = final_vt_cumber_test_info %>%
  mutate(
    male = case_when(
      gender == "M" ~ 1,
      gender == "F" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(final_vt_cumber_test_info$male)


   0    1 
4188 4460

Create numeric binaries for each race.

table(final_vt_cumber_test_info$race_eth)


    A     B     H     I Multi     P     W 
  181  4075  1467   100   817    46  1962

final_vt_cumber_test_info = final_vt_cumber_test_info %>%
  mutate(
    race_A     = ifelse(race_eth == "A", 1, ifelse(is.na(race_eth), NA, 0)),
    race_B     = ifelse(race_eth == "B", 1, ifelse(is.na(race_eth), NA, 0)),
    race_H     = ifelse(race_eth == "H", 1, ifelse(is.na(race_eth), NA, 0)),
    race_I     = ifelse(race_eth == "I", 1, ifelse(is.na(race_eth), NA, 0)),
    race_Multi     = ifelse(race_eth == "Multi", 1, ifelse(is.na(race_eth), NA, 0)),
    race_P     = ifelse(race_eth == "P", 1, ifelse(is.na(race_eth), NA, 0)),
    race_W     = ifelse(race_eth == "W", 1, ifelse(is.na(race_eth), NA, 0))
  )
table(final_vt_cumber_test_info$race_B) #quick check coding on race_B


   0    1 
4573 4075

Export files for e2i Coach

Save e2i file with all matched students from the merge, regardless of missing test info.

#select columns ready for e2i
vt_cumber_e2i_test_info = final_vt_cumber_test_info %>%
  dplyr::select(school_code, 
                student_id, 
                grade, 
                school, 
                moy_math_comp, moy_math_date, 
                moy_ela_comp,moy_ela_date, 
                eoy_math_comp, eoy_math_date,
                eoy_ela_comp, eoy_ela_date,
                male, iep, ell,frpl, at_risk,
                treat,
                race_A, race_B, race_H, race_I, race_Multi, race_P, race_W,
                both_math, both_ela, all_tests, 
                math_moy_missing_eoy_present, ela_moy_missing_eoy_present)

#export e2i with test indicators, all matched students from the merge, regardless of missing test info (all_tests == 1 or 0)
write.csv(vt_cumber_e2i_test_info, file = "vt_cumber_e2i_all_students.csv", row.names = FALSE)

Save e2i file with only containing matched students from the merge with complete test info.

table(final_vt_cumber_test_info$all_tests)


   0    1 
2337 6311

#subset: keep only students with complete test info (all_tests == 1)
vt_cumber_e2i_all_tests = vt_cumber_e2i_test_info %>% filter(all_tests == 1)
print(vt_cumber_e2i_all_tests) # N=6311

# A tibble: 6,311 × 30
   school_code student_id grade school  moy_math_comp moy_math_date moy_ela_comp
         <dbl>      <dbl> <dbl> <chr>           <dbl> <chr>                <dbl>
 1      260336 2287512462     6 Anne C…           225 1/14/2026              215
 2      260336 4431265988     6 Anne C…           201 1/14/2026              200
 3      260336 9779738622     6 Anne C…           235 1/15/2026              235
 4      260336 3968556194     6 Anne C…           211 1/14/2026              225
 5      260336 1925959163     6 Anne C…           226 1/14/2026              226
 6      260336 2638992577     6 Anne C…           213 1/14/2026              209
 7      260336 8925824655     6 Anne C…           201 1/14/2026              209
 8      260336 4634477858     6 Anne C…           228 1/14/2026              217
 9      260336 9836237186     6 Anne C…           194 1/14/2026              198
10      260336 1961537885     6 Anne C…           249 1/14/2026              237
# ℹ 6,301 more rows
# ℹ 23 more variables: moy_ela_date <chr>, eoy_math_comp <dbl>,
#   eoy_math_date <chr>, eoy_ela_comp <dbl>, eoy_ela_date <chr>, male <dbl>,
#   iep <dbl>, ell <dbl>, frpl <lgl>, at_risk <dbl>, treat <dbl>, race_A <dbl>,
#   race_B <dbl>, race_H <dbl>, race_I <dbl>, race_Multi <dbl>, race_P <dbl>,
#   race_W <dbl>, both_math <dbl>, both_ela <dbl>, all_tests <dbl>,
#   math_moy_missing_eoy_present <dbl>, ela_moy_missing_eoy_present <dbl>

Count of treatment students among the subset with complete test info.

table(vt_cumber_e2i_all_tests$treat)


   0    1 
5708  603

#export e2i only containing matched students from the merge with complete test info (all_tests == 1)
write.csv(vt_cumber_e2i_all_tests, file = "vt_cumber_e2i_complete_tests.csv", row.names = FALSE)