Data Processing: Varsity Tutors, Cumberland

Author

Alexis Davila

Clean Data

Import

Packages selected for inspecting and cleaning Varsity Tutors, district data from Cumberland schools.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import roster.

roster <- read_csv("roster_vt_cumber.csv")
Rows: 14447 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): school
dbl (3): school_code, student_id, grade

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Change column headers for roster.

roster = roster %>%
  rename(
    grade_ros = "grade",
    school_ros = "school",
  )

Import Middle of Year data request.

moy_map = read_csv("moy_map_vt_cumber.csv")
Rows: 10572 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): School Name, MOY MAP Math Math Test Date*, MOY MAP ELA Test Date*, ...
dbl (7): Student ID, Student Grade Level, MOY MAP Math Composite Scaled Scor...
lgl (2): Treatment, FRPL

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
view(moy_map)

Change column headers for moy_map.

moy_map = moy_map %>%
  rename(
    student_id = "Student ID",
    grade_moy = "Student Grade Level", 
    school_moy = "School Name", 
    treat_moy = "Treatment", 
    moy_math_comp = "MOY MAP Math Composite Scaled Score*", 
    moy_math_date = "MOY MAP Math Math Test Date*", 
    moy_ela_comp = "MOY MAP ELA Composite Scaled Score*", 
    moy_ela_date = "MOY MAP ELA Test Date*", 
    gender_moy = "Gender", 
    iep_moy = "IEP", 
    ell_moy = "ELL", 
    frpl_moy = "FRPL", 
    race_eth_moy = "Race/Ethnicity", 
    at_risk_moy = "Academically at-risk"
    
  )

Import End of Year data request.

eoy_map = read_csv("eoy_map_vt_cumber.csv")
Rows: 8810 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): School Name, EOY MAP Math Math Test Date*, EOY MAP ELA Test Date*, ...
dbl (7): Student ID, Student Grade Level, EOY MAP Math Composite Scaled Scor...
lgl (2): Treatment, FRPL

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
view(eoy_map)

Change column headers for eoy_map.

eoy_map = eoy_map %>%
  rename(
    student_id = "Student ID",
    grade_eoy = "Student Grade Level", 
    school_eoy = "School Name", 
    treat_eoy = "Treatment", 
    eoy_math_comp = "EOY MAP Math Composite Scaled Score*", 
    eoy_math_date = "EOY MAP Math Math Test Date*", 
    eoy_ela_comp = "EOY MAP ELA Composite  Scaled Score*", 
    eoy_ela_date = "EOY MAP ELA Test Date*", 
    gender_eoy = "Gender", 
    iep_eoy = "IEP", 
    ell_eoy = "ELL", 
    frpl_eoy = "FRPL", 
    race_eth_eoy = "Race/Ethnicity", 
    at_risk_eoy = "Academically at-risk"
    
  )

Inspect duplicates

Roster

Identify duplicate student records within roster file.

# frequency table of student_id occurrences
id_counts_roster = roster %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
  filter(n > 1)

# view results
print(dupes_roster)
# A tibble: 1,425 × 2
   student_id     n
        <dbl> <int>
 1 2257769627    16
 2 3284674675    16
 3 5761715848    15
 4 7877815913    15
 5 8922727837    15
 6 4254112513    14
 7 5251233515    14
 8 3568287471    13
 9 4526196894    13
10 6564849621    13
# ℹ 1,415 more rows
#export
write.csv(dupes_roster, file = "duplicate_ids_roster_vt_cumber.csv", row.names = FALSE)

Inspect duplicate student records in roster. Observation: for a given duplicate student record, it appears to be the same individual (e.g., same school, same grade)

dup_preview_roster = roster %>%
  group_by(student_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_roster)
# A tibble: 5,315 × 4
   school_code school_ros           student_id grade_ros
         <dbl> <chr>                     <dbl>     <dbl>
 1      260336 Anne Chesnutt Middle 9581845356         6
 2      260336 Anne Chesnutt Middle 9581845356         6
 3      260336 Anne Chesnutt Middle 9581845356         6
 4      260336 Anne Chesnutt Middle 5396535695         6
 5      260336 Anne Chesnutt Middle 5396535695         6
 6      260336 Anne Chesnutt Middle 4978175585         6
 7      260336 Anne Chesnutt Middle 4978175585         6
 8      260336 Anne Chesnutt Middle 6429225941         6
 9      260336 Anne Chesnutt Middle 6429225941         6
10      260336 Anne Chesnutt Middle 6429225941         6
# ℹ 5,305 more rows

Decision: keep first occurrence of student ID based on current row order (N=10,557).

roster_unique = roster %>%
  distinct(student_id, .keep_all = TRUE)

print(roster_unique)
# A tibble: 10,557 × 4
   school_code school_ros           student_id grade_ros
         <dbl> <chr>                     <dbl>     <dbl>
 1      260336 Anne Chesnutt Middle 2287512462         6
 2      260336 Anne Chesnutt Middle 4431265988         6
 3      260336 Anne Chesnutt Middle 9779738622         6
 4      260336 Anne Chesnutt Middle 3968556194         6
 5      260336 Anne Chesnutt Middle 1925959163         6
 6      260336 Anne Chesnutt Middle 2638992577         6
 7      260336 Anne Chesnutt Middle 8925824655         6
 8      260336 Anne Chesnutt Middle 4634477858         6
 9      260336 Anne Chesnutt Middle 9836237186         6
10      260336 Anne Chesnutt Middle 1961537885         6
# ℹ 10,547 more rows

MOY

Identify duplicate student records within MOY file.

# frequency table of student_id occurrences
id_counts_moy = moy_map %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_moy = id_counts_moy %>%
  filter(n > 1)

# view list of duplicate IDs
print(dupes_moy)
# A tibble: 1 × 2
  student_id     n
       <dbl> <int>
1 7117828382     2
#export
write.csv(dupes_moy, file = "duplicate_ids_moy_vt_cumber.csv", row.names = FALSE)

Inspect duplicate student record in the MOY file. Observation: Student records contain the same demographic info and math test info, however ELA test info (date and score) differ. Decision: Remove student duplicate record.

dup_preview_moy = moy_map %>%
  group_by(student_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_moy)
# A tibble: 2 × 14
  student_id grade_moy school_moy          treat_moy moy_math_comp moy_math_date
       <dbl>     <dbl> <chr>               <lgl>             <dbl> <chr>        
1 7117828382         7 Lewis Chapel Middle NA                  204 1/8/2026     
2 7117828382         7 Lewis Chapel Middle NA                  204 1/8/2026     
# ℹ 8 more variables: moy_ela_comp <dbl>, moy_ela_date <chr>, gender_moy <chr>,
#   iep_moy <dbl>, ell_moy <dbl>, frpl_moy <lgl>, race_eth_moy <chr>,
#   at_risk_moy <dbl>

Remove duplicate student record in MOY file.

moy_unique <- moy_map %>%
  group_by(student_id) %>%
  filter(n() == 1) %>% # identify IDs that appear exactly once
  ungroup()

print(moy_unique)
# A tibble: 10,570 × 14
   student_id grade_moy school_moy         treat_moy moy_math_comp moy_math_date
        <dbl>     <dbl> <chr>              <lgl>             <dbl> <chr>        
 1 2287512462         6 Anne Chesnutt Mid… NA                  225 1/14/2026    
 2 4431265988         6 Anne Chesnutt Mid… NA                  201 1/14/2026    
 3 9779738622         6 Anne Chesnutt Mid… NA                  235 1/15/2026    
 4 3968556194         6 Anne Chesnutt Mid… NA                  211 1/14/2026    
 5 1925959163         6 Anne Chesnutt Mid… NA                  226 1/14/2026    
 6 2638992577         6 Anne Chesnutt Mid… NA                  213 1/14/2026    
 7 8925824655         6 Anne Chesnutt Mid… NA                  201 1/14/2026    
 8 4634477858         6 Anne Chesnutt Mid… NA                  228 1/14/2026    
 9 9836237186         6 Anne Chesnutt Mid… NA                  194 1/14/2026    
10 1961537885         6 Anne Chesnutt Mid… NA                  249 1/14/2026    
# ℹ 10,560 more rows
# ℹ 8 more variables: moy_ela_comp <dbl>, moy_ela_date <chr>, gender_moy <chr>,
#   iep_moy <dbl>, ell_moy <dbl>, frpl_moy <lgl>, race_eth_moy <chr>,
#   at_risk_moy <dbl>

EOY

Identify duplicate student records within EOY file.

# frequency table of student_id occurrences
id_counts_eoy = eoy_map %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_eoy = id_counts_eoy %>%
  filter(n > 1)

# view list of duplicate IDs
print(dupes_eoy)
# A tibble: 1 × 2
  student_id     n
       <dbl> <int>
1 8354674797     2
#export
write.csv(dupes_eoy, file = "duplicate_ids_eoy_vt_cumber.csv", row.names = FALSE)

Inspect duplicate student record in the MOY file. Observation: Student records contain the same demographic info and math test info, however ELA test info (date and score) differ. Decision: Remove student duplicate record.

dup_preview_eoy = eoy_map %>%
  group_by(student_id) %>%
  filter(n() > 1) %>% 
  ungroup()

print(dup_preview_eoy)
# A tibble: 2 × 14
  student_id grade_eoy school_eoy         treat_eoy eoy_math_comp eoy_math_date
       <dbl>     <dbl> <chr>              <lgl>             <dbl> <chr>        
1 8354674797         6 Spring Lake Middle NA                  206 4/17/2026    
2 8354674797         6 Spring Lake Middle NA                  206 4/17/2026    
# ℹ 8 more variables: eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender_eoy <chr>,
#   iep_eoy <dbl>, ell_eoy <dbl>, frpl_eoy <lgl>, race_eth_eoy <chr>,
#   at_risk_eoy <dbl>

Remove duplicate student record in EOY file.

eoy_unique <- eoy_map %>%
  group_by(student_id) %>%
  filter(n() == 1) %>% # identify IDs that appear exactly once
  ungroup()
# preview result
print(eoy_unique)
# A tibble: 8,808 × 14
   student_id grade_eoy school_eoy         treat_eoy eoy_math_comp eoy_math_date
        <dbl>     <dbl> <chr>              <lgl>             <dbl> <chr>        
 1 7729773724         8 Cumberland Virtua… NA                  229 4/16/2026    
 2 4694886258         8 Cumberland Virtua… NA                  240 4/15/2026    
 3 1836192339         8 Cumberland Virtua… NA                  268 4/15/2026    
 4 7362151357         8 Cumberland Virtua… NA                  224 4/15/2026    
 5 4215536561         8 Cumberland Virtua… NA                  199 4/15/2026    
 6 3532188732         8 Cumberland Virtua… NA                  252 4/15/2026    
 7 3482512388         8 Cumberland Virtua… NA                  283 4/15/2026    
 8 4294541369         8 Cumberland Virtua… NA                  236 4/15/2026    
 9 4855429317         8 Cumberland Virtua… NA                  241 4/15/2026    
10 1582791228         8 Cumberland Virtua… NA                  239 4/15/2026    
# ℹ 8,798 more rows
# ℹ 8 more variables: eoy_ela_comp <dbl>, eoy_ela_date <chr>, gender_eoy <chr>,
#   iep_eoy <dbl>, ell_eoy <dbl>, frpl_eoy <lgl>, race_eth_eoy <chr>,
#   at_risk_eoy <dbl>

Merge

Merge MOY to roster.

merge_roster_moy = roster_unique %>%
  left_join(moy_unique, by = "student_id")

Check unmatched records in intermediary data: 258 students in the roster with no appearance in MOY.

#check students in roster with NO match in MOY
missing_moy = roster_unique %>%
  anti_join(moy_unique, by = "student_id")
cat("Number of roster students missing MOY data:", nrow(missing_moy), "\n")
Number of roster students missing MOY data: 258 

Check unexpected records (MOY): 271 students present in the MOY file that do NOT present in the roster.

# check in MOY data NOT in roster
unexpected_moy_records = moy_unique %>%
  anti_join(roster_unique, by = "student_id")

cat("Number of MOY records not in roster:", nrow(unexpected_moy_records), "\n")
Number of MOY records not in roster: 271 

Match intermediary with EOY records by student ID.

merge_cumber = merge_roster_moy %>%
  left_join(eoy_unique, by = "student_id")

Check unmatched records in final merge: 1754 students in the intermediary file, merge_roster_moy, with no appearance in EOY.

#check students in merge_roster_moy with NO match in EOY
missing_eoy = merge_roster_moy %>%
  anti_join(eoy_unique, by = "student_id")

cat("Number of students missing EOY data from merge_roster_moy:", nrow(missing_eoy), "\n")
Number of students missing EOY data from merge_roster_moy: 1754 

Check unexpected records (EOY): 5 students present in the EOY file that not NOT present in the roster.

# check in EOY file NOT in roster (unexpected records)
unexpected_eoy_records = eoy_unique %>%
  anti_join(roster_unique, by = "student_id")


cat("Number of EOY records not in roster:", nrow(unexpected_eoy_records), "\n")
Number of EOY records not in roster: 5 

Export unmatched (missing) and unexpected (extra) records.

write.csv(missing_moy, file =  "missing_moy_records_vt_cumber.csv", row.names = FALSE)
write.csv(unexpected_moy_records, file = "extra_moy_records_vt_cumber.csv", row.names = FALSE)
write.csv(missing_eoy, file = "missing_eoy_records_vt_cumber.csv", row.names = FALSE)
write.csv(unexpected_eoy_records, file = "extra_eoy_records_vt_cumber.csv", row.names = FALSE)

Clean merge

Keep rows where ID presents across the de-duped roster, moy file, and eoy file.

roster_moy_matched = roster_unique %>%
  inner_join(moy_unique, by = "student_id")

final_vt_cumber_unclean = roster_moy_matched %>%
  inner_join(eoy_unique, by = "student_id")

Check for requested data elements. Keep only necessary columns (only keep the latest demographic info from eoy file). No missing columns following the data request.

final_vt_cumber = final_vt_cumber_unclean %>%
  dplyr::select(school_code, 
                student_id, 
                grade_eoy, 
                school_eoy, 
                treat_eoy, 
                moy_math_comp, moy_math_date, 
                moy_ela_comp,moy_ela_date, 
                eoy_math_comp, eoy_math_date,
                eoy_ela_comp, eoy_ela_date,
                gender_eoy, iep_eoy, ell_eoy,frpl_eoy, race_eth_eoy, at_risk_eoy)

colnames(final_vt_cumber)
 [1] "school_code"   "student_id"    "grade_eoy"     "school_eoy"   
 [5] "treat_eoy"     "moy_math_comp" "moy_math_date" "moy_ela_comp" 
 [9] "moy_ela_date"  "eoy_math_comp" "eoy_math_date" "eoy_ela_comp" 
[13] "eoy_ela_date"  "gender_eoy"    "iep_eoy"       "ell_eoy"      
[17] "frpl_eoy"      "race_eth_eoy"  "at_risk_eoy"  

Export clean, merged data.

write.csv(final_vt_cumber, file = "merged_clean_vt_cumber.csv", row.names = FALSE)

Inspect Merged Data

Missing Data Summary

Calculate the percentage missing for each variable. Note: percentage missing for at_risk probably indicates the percent NOT at-risk. Observations: treatment status is missing for all students; FRPL may be missing for all students or district forgot to add to file. All other demographic info is present for all students. Some missing data for test info.

missing_summary = final_vt_cumber %>%
  summarise(across(
    everything(),
    ~ mean(is.na(.)) * 100
  )) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "percent_missing"
  ) %>%
  arrange(desc(percent_missing))

print(missing_summary)
# A tibble: 19 × 2
   variable      percent_missing
   <chr>                   <dbl>
 1 treat_eoy              100   
 2 frpl_eoy               100   
 3 at_risk_eoy             69.1 
 4 eoy_math_comp           12.7 
 5 eoy_math_date           12.7 
 6 moy_ela_comp             8.51
 7 moy_ela_date             8.51
 8 moy_math_comp            6.93
 9 moy_math_date            6.93
10 eoy_ela_comp             6.81
11 eoy_ela_date             6.81
12 school_code              0   
13 student_id               0   
14 grade_eoy                0   
15 school_eoy               0   
16 gender_eoy               0   
17 iep_eoy                  0   
18 ell_eoy                  0   
19 race_eth_eoy             0   

Calculate percentage of student who have both MOY and EOY data overall.

#indicator for having BOTH MOY and EOY scores
final_both_scores_vt_cumber = final_vt_cumber %>%
  mutate(
    both_math = !is.na(moy_math_comp) & !is.na(eoy_math_comp),
    both_ela  = !is.na(moy_ela_comp)  & !is.na(eoy_ela_comp),
    all_tests = both_math & both_ela
  )

#pct across schools
overall_pct = final_both_scores_vt_cumber %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2))

print(overall_pct)
# A tibble: 1 × 3
  total_students students_with_all_tests percent_with_all_tests
           <int>                   <int>                  <dbl>
1           8646                    6309                   73.0

Calculate percentage of student who have both MOY and EOY data by school.

by_school_pct = final_both_scores_vt_cumber %>%
  group_by(school_eoy) %>%
  summarise(
    total_students = n(),
    students_with_all_tests = sum(all_tests),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2)) %>%
  arrange(desc(percent_with_all_tests))

print(by_school_pct)
# A tibble: 17 × 4
   school_eoy       total_students students_with_all_te…¹ percent_with_all_tests
   <chr>                     <int>                  <int>                  <dbl>
 1 Seventy-First C…            387                    384                   99.2
 2 Cumberland Virt…            127                    125                   98.4
 3 Anne Chesnutt M…            491                    481                   98.0
 4 South View Midd…            567                    529                   93.3
 5 R Max Abbott Mi…            685                    630                   92.0
 6 Lewis Chapel Mi…            502                    439                   87.4
 7 Gray's Creek Mi…            979                    830                   84.8
 8 Luther Nick Jer…            515                    384                   74.6
 9 Pine Forest Mid…            629                    463                   73.6
10 Douglas Byrd Mi…            632                    464                   73.4
11 Hope Mills Midd…            410                    300                   73.2
12 Westover Middle             630                    461                   73.2
13 Spring Lake Mid…            423                    268                   63.4
14 Mac Williams Mi…            796                    411                   51.6
15 Howard Learning…            115                     48                   41.7
16 John R Griffin …            558                     92                   16.5
17 Reid Ross Class…            200                      0                    0  
# ℹ abbreviated name: ¹​students_with_all_tests

Data Summary

Generate school-level means for MOY ELA and Math assessment data, number of students, percentages of students by race/ethnicity, gender, and FRPL-status.

#create function to streamline calculations
calc_pct = function(x) {
  prop.table(table(x)) * 100
}

#school-level means and counts for TEST data
school_means = final_both_scores_vt_cumber %>%
  group_by(school_eoy) %>%
  summarise(
    n_students = n(),
    mean_moy_math = mean(moy_math_comp, na.rm = TRUE),
    mean_moy_ela  = mean(moy_ela_comp, na.rm = TRUE)
  )

# DEMOGRAPHICS

# race/ethnicity
race_pct = final_both_scores_vt_cumber %>%
  group_by(school_eoy, race_eth_eoy) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school_eoy) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = race_eth_eoy,
    values_from = pct,
    names_prefix = "race_"
  )

#gender
gender_pct = final_both_scores_vt_cumber %>%
  group_by(school_eoy, gender_eoy) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school_eoy) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = gender_eoy,
    values_from = pct,
    names_prefix = "gender_"
  )

#free reduced price lunch
frpl_pct = final_both_scores_vt_cumber %>%
  group_by(school_eoy, frpl_eoy) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school_eoy) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = frpl_eoy,
    values_from = pct,
    names_prefix = "frpl_"
  )

#english language learner
ell_pct = final_both_scores_vt_cumber %>%
  group_by(school_eoy, ell_eoy) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school_eoy) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = ell_eoy,
    values_from = pct,
    names_prefix = "ell_"
  )

#at-risk students
atrisk_pct = final_both_scores_vt_cumber %>%
  group_by(school_eoy, at_risk_eoy) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school_eoy) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = at_risk_eoy,
    values_from = pct,
    names_prefix = "atrisk_"
  )

# COMBINE summaries
school_summary = school_means %>%
  left_join(race_pct,   by = "school_eoy") %>%
  left_join(gender_pct, by = "school_eoy") %>%
  left_join(frpl_pct,   by = "school_eoy") %>%
  left_join(ell_pct,    by = "school_eoy") %>%
  left_join(atrisk_pct, by = "school_eoy")

school_summary = school_summary %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))

print(school_summary)
# A tibble: 17 × 18
   school_eoy  n_students mean_moy_math mean_moy_ela race_A race_B race_H race_I
   <chr>            <dbl>         <dbl>        <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 Anne Chesn…        491          220.         214.   0.41   62.7   19.8   0.81
 2 Cumberland…        127          238.         226.   1.57   42.5   23.6   0.79
 3 Douglas By…        632          214.         210.   1.27   49.2   22.8   1.42
 4 Gray's Cre…        979          222.         215.   1.33   28.5   15.1   2.25
 5 Hope Mills…        410          222.         214.   0.73   41.0   18.3   1.22
 6 Howard Lea…        115          209.         203.   0.87   76.5   10.4   0.87
 7 John R Gri…        558          227.         216.   7.53   26.0   18.1   1.61
 8 Lewis Chap…        502          213.         208.   1.2    68.1   15.3   0.6 
 9 Luther Nic…        515          216.         211.   0.78   69.9   14.2   0.78
10 Mac Willia…        796          222.         213.   0.88   23.9   13.3   2.26
11 Pine Fores…        629          219.         213.   2.7    39.4   16.5   0.64
12 R Max Abbo…        685          219.         214.   3.8    41.9   15.5   0.58
13 Reid Ross …        200          NaN          NaN   NA      64     11    NA   
14 Seventy-Fi…        387          228.         220.   5.43   53.8   15.5   0.26
15 South View…        567          216.         211.   1.06   54.8   19.2   1.76
16 Spring Lak…        423          216.         210.   1.18   59.3   21.3   0.24
17 Westover M…        630          218.         212.   2.86   63.0   17.6   0.63
# ℹ 10 more variables: race_Multi <dbl>, race_P <dbl>, race_W <dbl>,
#   gender_F <dbl>, gender_M <dbl>, frpl_NA <dbl>, ell_0 <dbl>, ell_1 <dbl>,
#   atrisk_1 <dbl>, atrisk_NA <dbl>
write.csv(school_summary, "summary_by_school_vt_cumber.csv", row.names = FALSE)

Reformat Data

Create dummy variables

Create numeric binary for gender.

final_both_scores_vt_cumber = final_both_scores_vt_cumber %>%
  mutate(
    male = case_when(
      gender_eoy == "M" ~ 1,
      gender_eoy == "F" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )

Create numeric binaries for each race.

final_both_scores_vt_cumber = final_both_scores_vt_cumber %>%
  mutate(
    race_A     = ifelse(race_eth_eoy == "A", 1, ifelse(is.na(race_eth_eoy), NA, 0)),
    race_B     = ifelse(race_eth_eoy == "B", 1, ifelse(is.na(race_eth_eoy), NA, 0)),
    race_H     = ifelse(race_eth_eoy == "H", 1, ifelse(is.na(race_eth_eoy), NA, 0)),
    race_I     = ifelse(race_eth_eoy == "I", 1, ifelse(is.na(race_eth_eoy), NA, 0)),
    race_Multi = ifelse(race_eth_eoy == "Multi", 1, ifelse(is.na(race_eth_eoy), NA, 0)),
    race_R     = ifelse(race_eth_eoy == "R", 1, ifelse(is.na(race_eth_eoy), NA, 0)),
    race_W     = ifelse(race_eth_eoy == "W", 1, ifelse(is.na(race_eth_eoy), NA, 0))
  )

Clean columns

Rename columns and keep only those needed for e2i Coach.

final_both_scores_vt_cumber = final_both_scores_vt_cumber %>%
  rename(
    grade = "grade_eoy", 
    school = "school_eoy", 
    treat = "treat_eoy", 
    gender = "gender_eoy", 
    iep = "iep_eoy", 
    ell = "ell_eoy", 
    frpl = "frpl_eoy", 
    race_eth = "race_eth_eoy", 
    at_risk = "at_risk_eoy"
    
  )
#export final_both_scores_vt_cumber with dummies original vars, and, all_tests indicators
write.csv(final_both_scores_vt_cumber, file = "vt_cumber_dummies_test indicators.csv", row.names = FALSE)

#keep columns for e2i
vt_cumber_e2i = final_both_scores_vt_cumber %>%
  dplyr::select(school_code, 
                student_id, 
                grade, 
                school, 
                treat, 
                moy_math_comp, moy_math_date, 
                moy_ela_comp,moy_ela_date, 
                eoy_math_comp, eoy_math_date,
                eoy_ela_comp, eoy_ela_date,
                male, iep, ell,frpl, at_risk, 
                race_A, race_B, race_H, race_I, race_Multi, race_R, race_W)
colnames(vt_cumber_e2i)
 [1] "school_code"   "student_id"    "grade"         "school"       
 [5] "treat"         "moy_math_comp" "moy_math_date" "moy_ela_comp" 
 [9] "moy_ela_date"  "eoy_math_comp" "eoy_math_date" "eoy_ela_comp" 
[13] "eoy_ela_date"  "male"          "iep"           "ell"          
[17] "frpl"          "at_risk"       "race_A"        "race_B"       
[21] "race_H"        "race_I"        "race_Multi"    "race_R"       
[25] "race_W"       

Save final data file with only the necessary columns for e2i.

write.csv(vt_cumber_e2i, file = "vt_cumber_e2i.csv", row.names = FALSE)