Data Processing: Third Learning Space, NHA

Author

Alexis Davila

Clean District Data

Import

Packages selected for inspecting and cleaning Third Learning Space, National Heritage Academy data.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import roster.

roster <- read_csv("TSL Impact Evaluation Data.csv")
Rows: 4340 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (7): School Name*, Date of BOY MAP Math Test*, Date of MOY MAP Math Tes...
dbl (10): Student ID*, Student Grade Level*, Teacher ID*, BOY MAP Math Scale...
lgl  (1): Treatment*

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(roster)
 [1] "Student ID*"                "Student Grade Level*"      
 [3] "School Name*"               "Teacher ID*"               
 [5] "Treatment*"                 "BOY MAP Math Scaled Score*"
 [7] "Date of BOY MAP Math Test*" "MOY MAP Math Scaled Score*"
 [9] "Date of MOY MAP Math Test*" "EOY MAP Math Scaled Score*"
[11] "Date of EOY MAP Math Test*" "Gender"                    
[13] "IEP"                        "ELL"                       
[15] "FRPL"                       "Race"                      
[17] "Ethnicity"                  "Attendance Rate"           

Change column headers for roster.

roster = roster %>%
  rename(
    student_id = "Student ID*",
    teacher_id = "Teacher ID*",
    grade = "Student Grade Level*", 
    school = "School Name*", 
    treat = "Treatment*", 
    
    boy_math_score = "BOY MAP Math Scaled Score*", 
    boy_math_date = "Date of BOY MAP Math Test*", 
    
    moy_math_score = "MOY MAP Math Scaled Score*", 
    moy_math_date = "Date of MOY MAP Math Test*", 

    eoy_math_score = "EOY MAP Math Scaled Score*", 
    eoy_math_date = "Date of EOY MAP Math Test*", 
    
    gender = "Gender", 
    iep = "IEP", 
    ell = "ELL", 
    frpl = "FRPL", 
    race = "Race",
    ethn = "Ethnicity", 
    attend_rate = "Attendance Rate"
  )
colnames(roster)
 [1] "student_id"     "grade"          "school"         "teacher_id"    
 [5] "treat"          "boy_math_score" "boy_math_date"  "moy_math_score"
 [9] "moy_math_date"  "eoy_math_score" "eoy_math_date"  "gender"        
[13] "iep"            "ell"            "frpl"           "race"          
[17] "ethn"           "attend_rate"   

Inspect duplicates

Roster

Identify duplicate student records within roster file.

# frequency table of student_id occurrences
id_counts_roster = roster %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
  filter(n > 1)

# view results
print(dupes_roster)
# A tibble: 8 × 2
  student_id     n
       <dbl> <int>
1     566870     2
2     605643     2
3     632862     2
4     670363     2
5     680297     2
6     732197     2
7     732203     2
8     832717     2
#export
write.csv(dupes_roster, file = "duplicate_ids_roster_tsl_nha.csv", row.names = FALSE)

Observation: Each duplicate student ID appears exactly 2 times. For a given duplicate student record, it appears to be the same individual (e.g., same demographics, same teacher), however, the duplicate record is tied to 2 different schools. These students have taken one test at one school site and completed their EOY test date at a different school site. This suggests that the students have transferred some point between BOY and EOY testing.

dup_preview_roster = roster %>%
  group_by(student_id) %>%
  filter(n() > 1) %>%
  ungroup()

print(dup_preview_roster)
# A tibble: 16 × 18
   student_id grade school         teacher_id treat boy_math_score boy_math_date
        <dbl> <dbl> <chr>               <dbl> <lgl>          <dbl> <chr>        
 1     566870     7 Walton Charte…      19525 NA                NA <NA>         
 2     566870     7 Oakside Prep …      19525 NA               216 9/10/2025    
 3     605643     8 Walton Charte…      21216 NA               229 8/28/2025    
 4     605643     8 Oakside Prep …      21216 NA                NA <NA>         
 5     632862     7 Oakside Prep …      23257 NA                NA <NA>         
 6     632862     7 Walton Charte…      23257 NA               204 8/28/2025    
 7     670363     3 Linden Charte…      18378 NA               196 9/29/2025    
 8     670363     4 Linden Charte…      18378 NA                NA <NA>         
 9     680297     3 Oakside Prep …      23154 NA                NA <NA>         
10     680297     3 Walton Charte…      23154 NA               181 8/28/2025    
11     732197     5 Walton Charte…      17341 NA                NA <NA>         
12     732197     5 Oakside Prep …      17341 NA               198 9/3/2025     
13     732203     8 Walton Charte…      23087 NA                NA <NA>         
14     732203     8 Oakside Prep …      23087 NA               199 9/10/2025    
15     832717     3 Burton Glen C…      18916 NA                NA <NA>         
16     832717     3 Holly Park Ac…      18916 NA               172 8/21/2025    
# ℹ 11 more variables: moy_math_score <dbl>, moy_math_date <chr>,
#   eoy_math_score <dbl>, eoy_math_date <chr>, gender <chr>, iep <dbl>,
#   ell <dbl>, frpl <dbl>, race <chr>, ethn <chr>, attend_rate <dbl>

Decision: Collapse rows to obtain full test data but keep the school, grade, and demographics where EOY test was taken. Total observations reduced from 4340 to 4332 (8 duplicate student records removed).

#placeholder code
roster_unique = roster %>%
  group_by(student_id) %>%
  summarize(
    # Pull BOY values (non-missing)
    boy_math_score = first(na.omit(boy_math_score)),
    boy_math_date  = first(na.omit(boy_math_date)),
    
    # Pull EOY values (non-missing)
    eoy_math_score = first(na.omit(eoy_math_score)),
    eoy_math_date  = first(na.omit(eoy_math_date)),
    
    # Identify the EOY row (where EOY score exists)
    school      = school[!is.na(eoy_math_score)][1],
    grade       = grade[!is.na(eoy_math_score)][1],
    teacher_id = teacher_id [!is.na(eoy_math_score)][1],
    moy_math_score       = moy_math_score[!is.na(eoy_math_score)][1],
    moy_math_date       = moy_math_date[!is.na(eoy_math_score)][1],
    gender      = gender[!is.na(eoy_math_score)][1],
    iep         = iep[!is.na(eoy_math_score)][1],
    ell         = ell[!is.na(eoy_math_score)][1],
    frpl        = frpl[!is.na(eoy_math_score)][1],
    race        = race[!is.na(eoy_math_score)][1],
    ethn        = ethn[!is.na(eoy_math_score)][1],
    attend_rate = attend_rate[!is.na(eoy_math_score)][1],
    
    .groups = "drop"
  )

  
#decision to be made
print(roster_unique)
# A tibble: 4,332 × 17
   student_id boy_math_score boy_math_date eoy_math_score eoy_math_date school  
        <dbl>          <dbl> <chr>                  <dbl> <chr>         <chr>   
 1     366259            202 9/3/2025                 207 5/18/2026     Laurus …
 2     430015            218 9/3/2025                 217 5/20/2026     Laurus …
 3     445701            204 9/29/2025                217 5/26/2026     Linden …
 4     461310            170 9/10/2025                173 6/2/2026      Burton …
 5     471757            224 9/24/2025                236 5/20/2026     North S…
 6     472379            209 9/8/2025                 226 6/2/2026      Oakside…
 7     473083            225 9/23/2025                228 5/18/2026     Walton …
 8     473087            204 9/3/2025                 199 5/18/2026     Walton …
 9     474393            212 9/10/2025                 NA <NA>          <NA>    
10     475569            244 8/28/2025                255 5/18/2026     Walton …
# ℹ 4,322 more rows
# ℹ 11 more variables: grade <dbl>, teacher_id <dbl>, moy_math_score <dbl>,
#   moy_math_date <chr>, gender <chr>, iep <dbl>, ell <dbl>, frpl <dbl>,
#   race <chr>, ethn <chr>, attend_rate <dbl>

Du-dupe check. Confirmed: no more duplicate records.

roster_unique %>%
  count(student_id) %>%
  filter(n > 1)
# A tibble: 0 × 2
# ℹ 2 variables: student_id <dbl>, n <int>

Merge Usage Data

Clean NHA Usage Data

Import usage data from TSL.

#import NHA usage data from TSL
tsl_nha_usage = read_csv("TSL Usage Data.csv")
New names:
Rows: 999 Columns: 24
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(5): SIS ID, School Name, School Clever ID, Session start date, Conf_ra... dbl
(8): TSL Student ID, Count of Check Out Confidence Answer, Total CO Con... num
(1): Num_minutes* lgl (10): ...15, ...16, ...17, ...18, ...19, ...20, ...21,
...22, ...23, ...24
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...15`
• `` -> `...16`
• `` -> `...17`
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
• `` -> `...24`
colnames(tsl_nha_usage)
 [1] "TSL Student ID"                       
 [2] "SIS ID"                               
 [3] "School Name"                          
 [4] "School Clever ID"                     
 [5] "Session start date"                   
 [6] "Count of Check Out Confidence Answer" 
 [7] "Total CO Confidence Same Or Improved" 
 [8] "Conf_rating* % same or improved"      
 [9] "Conf_rating* same or improved score/5"
[10] "Enjoy_rating*"                        
[11] "Num_session*"                         
[12] "Num_minutes*"                         
[13] "Active_participation*"                
[14] "Total LOs Completed"                  
[15] "...15"                                
[16] "...16"                                
[17] "...17"                                
[18] "...18"                                
[19] "...19"                                
[20] "...20"                                
[21] "...21"                                
[22] "...22"                                
[23] "...23"                                
[24] "...24"                                

Rename columns in usage file. Prepare for merging.

tsl_nha_usage = tsl_nha_usage [, -c(15:24)] #drop empty columns
# rename vars
tsl_nha_usage = tsl_nha_usage %>%
  rename(
    student_id = "SIS ID",
    tsl_acct_id = "TSL Student ID", 
    
    school_tsl = "School Name",
    school_code_tsl = "School Clever ID",
    session_BOY_date = "Session start date",
    
    n_co_confid = "Count of Check Out Confidence Answer",
    n_co_confid_same_imp = "Total CO Confidence Same Or Improved",
    perc_confid_same_imp ="Conf_rating* % same or improved",
    score_confid_same_imp = "Conf_rating* same or improved score/5",
    enjoy_rate = "Enjoy_rating*",
    num_session = "Num_session*", 
    num_min = "Num_minutes*", 
    activ_partic = "Active_participation*",
    n_lo_complete = "Total LOs Completed"
  )
print(tsl_nha_usage)
# A tibble: 999 × 14
   tsl_acct_id student_id       school_tsl      school_code_tsl session_BOY_date
         <dbl> <chr>            <chr>           <chr>           <chr>           
 1     1002128 Student Archived Burton Glen Ch… 54c7fe26275d78… 03/02/2026      
 2     1003094 Student Archived Winterfield Ve… 54c7fe26275d78… 25/02/2026      
 3     1002836 854674           Holly Park Aca… 54c7fe26275d78… 26/02/2026      
 4     1002811 737634           Holly Park Aca… 54c7fe26275d78… 26/02/2026      
 5     1002951 688666           Laurus Academy  54c7fe26275d78… 03/02/2026      
 6     1002905 625958           Laurus Academy  54c7fe26275d78… 16/03/2026      
 7     1002113 614413           Burton Glen Ch… 54c7fe26275d78… 02/02/2026      
 8     1001718 592802           Linden Charter… 54c7fe26275d78… 18/02/2026      
 9     1001628 Student Archived Oakside Prep A… 54c7fe26275d78… 28/01/2026      
10     1002111 Student Archived Burton Glen Ch… 54c7fe26275d78… 02/02/2026      
# ℹ 989 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
#   perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
#   num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>

Identify duplicate student records within usage data file (N=0).

# frequency table of tsl_acct_id occurrences
id_counts_tsl_usage_nha = tsl_nha_usage %>%
  group_by(tsl_acct_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only acct that appear > 1
dupes_tsl_usage_nha = id_counts_tsl_usage_nha %>%
  filter(n > 1)

# view results
print(dupes_tsl_usage_nha)
# A tibble: 1 × 2
  tsl_acct_id     n
        <dbl> <int>
1          NA   346
#export
#write.csv(dupes_tsl_usage_nha, file = "duplicate_acct_ids_usage_tsl_nha.csv", row.names = FALSE)

Inspect rows without TSL Account ID (N=346). Observation: these cases are all blanks carried over from csv conversion.

tsl_nha_usage %>%
  filter(is.na(tsl_acct_id))
# A tibble: 346 × 14
   tsl_acct_id student_id school_tsl school_code_tsl session_BOY_date
         <dbl> <chr>      <chr>      <chr>           <chr>           
 1          NA <NA>       <NA>       <NA>            <NA>            
 2          NA <NA>       <NA>       <NA>            <NA>            
 3          NA <NA>       <NA>       <NA>            <NA>            
 4          NA <NA>       <NA>       <NA>            <NA>            
 5          NA <NA>       <NA>       <NA>            <NA>            
 6          NA <NA>       <NA>       <NA>            <NA>            
 7          NA <NA>       <NA>       <NA>            <NA>            
 8          NA <NA>       <NA>       <NA>            <NA>            
 9          NA <NA>       <NA>       <NA>            <NA>            
10          NA <NA>       <NA>       <NA>            <NA>            
# ℹ 336 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
#   perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
#   num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>

Decision: Remove rows where TSL Account ID is missing.

tsl_nha_usage = tsl_nha_usage %>%
  filter(!is.na(tsl_acct_id))
print(tsl_nha_usage)
# A tibble: 653 × 14
   tsl_acct_id student_id       school_tsl      school_code_tsl session_BOY_date
         <dbl> <chr>            <chr>           <chr>           <chr>           
 1     1002128 Student Archived Burton Glen Ch… 54c7fe26275d78… 03/02/2026      
 2     1003094 Student Archived Winterfield Ve… 54c7fe26275d78… 25/02/2026      
 3     1002836 854674           Holly Park Aca… 54c7fe26275d78… 26/02/2026      
 4     1002811 737634           Holly Park Aca… 54c7fe26275d78… 26/02/2026      
 5     1002951 688666           Laurus Academy  54c7fe26275d78… 03/02/2026      
 6     1002905 625958           Laurus Academy  54c7fe26275d78… 16/03/2026      
 7     1002113 614413           Burton Glen Ch… 54c7fe26275d78… 02/02/2026      
 8     1001718 592802           Linden Charter… 54c7fe26275d78… 18/02/2026      
 9     1001628 Student Archived Oakside Prep A… 54c7fe26275d78… 28/01/2026      
10     1002111 Student Archived Burton Glen Ch… 54c7fe26275d78… 02/02/2026      
# ℹ 643 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
#   perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
#   num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>

Inspect rows where student ID notes “Student Archived”.

tsl_nha_usage %>%
  filter(student_id != "Student Archived")
# A tibble: 629 × 14
   tsl_acct_id student_id school_tsl            school_code_tsl session_BOY_date
         <dbl> <chr>      <chr>                 <chr>           <chr>           
 1     1002836 854674     Holly Park Academy    54c7fe26275d78… 26/02/2026      
 2     1002811 737634     Holly Park Academy    54c7fe26275d78… 26/02/2026      
 3     1002951 688666     Laurus Academy        54c7fe26275d78… 03/02/2026      
 4     1002905 625958     Laurus Academy        54c7fe26275d78… 16/03/2026      
 5     1002113 614413     Burton Glen Charter … 54c7fe26275d78… 02/02/2026      
 6     1001718 592802     Linden Charter Acade… 54c7fe26275d78… 18/02/2026      
 7     1002843 847934     Holly Park Academy    54c7fe26275d78… 04/03/2026      
 8     1003634 845181     Laurus Academy        54c7fe26275d78… 09/02/2026      
 9     1003044 843138     Stambaugh Charter Ac… 54c7fe26275d78… 05/02/2026      
10     1003096 840127     Winterfield Venture … 54c7fe26275d78… 17/02/2026      
# ℹ 619 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
#   perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
#   num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>

Decision: Remove student rows where student ID notes “Student Archived”(N=629).

tsl_nha_usage = tsl_nha_usage %>%
  filter(student_id != "Student Archived")
print(tsl_nha_usage)
# A tibble: 629 × 14
   tsl_acct_id student_id school_tsl            school_code_tsl session_BOY_date
         <dbl> <chr>      <chr>                 <chr>           <chr>           
 1     1002836 854674     Holly Park Academy    54c7fe26275d78… 26/02/2026      
 2     1002811 737634     Holly Park Academy    54c7fe26275d78… 26/02/2026      
 3     1002951 688666     Laurus Academy        54c7fe26275d78… 03/02/2026      
 4     1002905 625958     Laurus Academy        54c7fe26275d78… 16/03/2026      
 5     1002113 614413     Burton Glen Charter … 54c7fe26275d78… 02/02/2026      
 6     1001718 592802     Linden Charter Acade… 54c7fe26275d78… 18/02/2026      
 7     1002843 847934     Holly Park Academy    54c7fe26275d78… 04/03/2026      
 8     1003634 845181     Laurus Academy        54c7fe26275d78… 09/02/2026      
 9     1003044 843138     Stambaugh Charter Ac… 54c7fe26275d78… 05/02/2026      
10     1003096 840127     Winterfield Venture … 54c7fe26275d78… 17/02/2026      
# ℹ 619 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
#   perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
#   num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>

Insert treatment status

Add treatment status. Equals 1 if student id appears in roster from TSL (N=629). All else equals 0.

roster_unique = roster_unique %>%
  mutate(
    treat = ifelse(student_id %in% tsl_nha_usage$student_id, 1, 0)
  )

table(roster_unique$treat)

   0    1 
3703  629 

Inspect for any duplicate student IDs amongst the treated. Zero duplicates.

roster_unique %>%
  filter(treat == 1) %>%
  group_by(student_id) %>%
  filter(n() > 1)
# A tibble: 0 × 18
# Groups:   student_id [0]
# ℹ 18 variables: student_id <dbl>, boy_math_score <dbl>, boy_math_date <chr>,
#   eoy_math_score <dbl>, eoy_math_date <chr>, school <chr>, grade <dbl>,
#   teacher_id <dbl>, moy_math_score <dbl>, moy_math_date <chr>, gender <chr>,
#   iep <dbl>, ell <dbl>, frpl <dbl>, race <chr>, ethn <chr>,
#   attend_rate <dbl>, treat <dbl>

Merge

Merge usage data to student NHA data.

#ensure student_id is same format for merging
roster_unique$student_id = as.character(roster_unique$student_id)
tsl_nha_usage$student_id = as.character(tsl_nha_usage$student_id)

merge_tsl_nha = roster_unique %>%
  left_join(tsl_nha_usage, by = "student_id")

print(merge_tsl_nha)
# A tibble: 4,332 × 31
   student_id boy_math_score boy_math_date eoy_math_score eoy_math_date school  
   <chr>               <dbl> <chr>                  <dbl> <chr>         <chr>   
 1 366259                202 9/3/2025                 207 5/18/2026     Laurus …
 2 430015                218 9/3/2025                 217 5/20/2026     Laurus …
 3 445701                204 9/29/2025                217 5/26/2026     Linden …
 4 461310                170 9/10/2025                173 6/2/2026      Burton …
 5 471757                224 9/24/2025                236 5/20/2026     North S…
 6 472379                209 9/8/2025                 226 6/2/2026      Oakside…
 7 473083                225 9/23/2025                228 5/18/2026     Walton …
 8 473087                204 9/3/2025                 199 5/18/2026     Walton …
 9 474393                212 9/10/2025                 NA <NA>          <NA>    
10 475569                244 8/28/2025                255 5/18/2026     Walton …
# ℹ 4,322 more rows
# ℹ 25 more variables: grade <dbl>, teacher_id <dbl>, moy_math_score <dbl>,
#   moy_math_date <chr>, gender <chr>, iep <dbl>, ell <dbl>, frpl <dbl>,
#   race <chr>, ethn <chr>, attend_rate <dbl>, treat <dbl>, tsl_acct_id <dbl>,
#   school_tsl <chr>, school_code_tsl <chr>, session_BOY_date <chr>,
#   n_co_confid <dbl>, n_co_confid_same_imp <dbl>, perc_confid_same_imp <chr>,
#   score_confid_same_imp <dbl>, enjoy_rate <dbl>, num_session <dbl>, …

Missing Data Inspections

Percentage missing for each variable. Observations: Most variables from the TSL file are missing, as expected; after merging the district roster with TSL usage file, not all students will be in the TSL system. Looking at district data columns, most of the data is present (test info, demographics, etc). The repeating missing percentage (6.37%) across EOY test info variables, school, grade, and demographics suggests there is a small group of students missing this info. The large majority of students are missing MOY test info. No student entry is missing an ID or treatment status, though.

missing_summary = merge_tsl_nha %>%
  summarise(across(
    everything(),
    ~ mean(is.na(.)) * 100
  )) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "percent_missing"
  ) %>%
  arrange(desc(percent_missing))

print(missing_summary)
# A tibble: 31 × 2
   variable              percent_missing
   <chr>                           <dbl>
 1 enjoy_rate                       89.3
 2 n_co_confid                      87.7
 3 n_co_confid_same_imp             87.7
 4 perc_confid_same_imp             87.7
 5 score_confid_same_imp            87.7
 6 session_BOY_date                 86.3
 7 num_session                      86.3
 8 num_min                          86.3
 9 activ_partic                     86.3
10 n_lo_complete                    86.3
# ℹ 21 more rows
write.csv(missing_summary, file = "tsl_nha_missing_summary.csv", row.names = FALSE)

Percentage of student who have both BOY and EOY data and ALL math data (BOY, MOY, EOY) overall

#indicator for having BOTH MOY and EOY scores
merge_tsl_nha_test_info = merge_tsl_nha %>%
  mutate(
    boy_eoy_math = !is.na(boy_math_score) & !is.na(eoy_math_score),
    all_math = !is.na(boy_math_score) & !is.na(moy_math_score) # incl. MOY
            & !is.na(eoy_math_score),
  )

#pct across schools
overall_pct = merge_tsl_nha_test_info %>%
  summarise(
    total_students = n(),
    
    students_with_boy_eoy = sum(boy_eoy_math), 
    percent_with_boy_eoy = sum(students_with_boy_eoy / total_students) * 100, 
    
    
    students_with_all_tests = sum(all_math),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_boy_eoy = round(percent_with_boy_eoy, 2))%>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2))

print(overall_pct)
# A tibble: 1 × 5
  total_students students_with_boy_eoy percent_with_boy_eoy
           <int>                 <int>                <dbl>
1           4332                  3846                 88.8
# ℹ 2 more variables: students_with_all_tests <int>,
#   percent_with_all_tests <dbl>

Percentage of student who have both BOY and EOY data by school Assuming MOY is not of interest? Only 21% of the sample have all test info, including MOY test.

by_school_pct = merge_tsl_nha_test_info %>%
  group_by(school) %>%
  summarise(
    total_students = n(),
    students_with_boy_eoy = sum(boy_eoy_math),
    percent_with_boy_eoy = (students_with_boy_eoy / total_students) * 100
  ) %>%
  mutate(percent_with_boy_eoy = round(percent_with_boy_eoy, 2)) %>%
  arrange(desc(percent_with_boy_eoy))

print(by_school_pct)
# A tibble: 12 × 4
   school              total_students students_with_boy_eoy percent_with_boy_eoy
   <chr>                        <int>                 <int>                <dbl>
 1 Walton Charter Aca…            504                   496                 98.4
 2 Linden Charter Aca…            457                   447                 97.8
 3 Burton Glen Charte…            372                   360                 96.8
 4 Holly Park Academy             307                   295                 96.1
 5 Oakside Prep Acade…            554                   531                 95.8
 6 Stambaugh Charter …            237                   226                 95.4
 7 Laurus Academy                 458                   434                 94.8
 8 North Saginaw Char…            435                   401                 92.2
 9 Windemere Park Cha…            340                   312                 91.8
10 Bennett Venture Ac…            204                   187                 91.7
11 Winterfield Ventur…            188                   157                 83.5
12 <NA>                           276                     0                  0  

Dummies for test info (complete and missing indicators)

boy_eoy_math: dummy for students with complete math test info for BOY and EOY.

table(merge_tsl_nha_test_info$boy_eoy_math)

FALSE  TRUE 
  486  3846 
#complete math: BOY and EOY present
merge_tsl_nha_test_info$boy_eoy_math = ifelse(merge_tsl_nha_test_info$boy_eoy_math, 1, 0)
table(merge_tsl_nha_test_info$boy_eoy_math) #check counts

   0    1 
 486 3846 

math_boy_missing_eoy_present: dummy for students who have EOY math test info but BOY test info is missing.

merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
  mutate(
    boy_math_present = !is.na(boy_math_score) & !is.na(boy_math_date),
    eoy_math_present = !is.na(eoy_math_score) & !is.na(eoy_math_date),
    
    math_boy_missing_eoy_present = case_when(
      !boy_math_present &  eoy_math_present ~ 1,   # moy missing, eoy present
       boy_math_present &  eoy_math_present ~ 0,   # both present
      !boy_math_present & !eoy_math_present ~ NA_real_  # both missing
    )
  )
table(merge_tsl_nha_test_info$math_boy_missing_eoy_present)

   0    1 
3846  210 

###Test info summary

Proportion of math test info present (BOY present and EOY present, grouped by treatment status).

summary_math_tests_present = merge_tsl_nha_test_info %>%
  mutate(
    boy_math_present = !is.na(boy_math_score) & !is.na(boy_math_date),
    eoy_math_present = !is.na(eoy_math_score) & !is.na(eoy_math_date)
  ) %>%
  group_by(treat) %>%
  summarise(
    n_students = n(),
    
    boy_count = sum(boy_math_present),
    boy_prop  = mean(boy_math_present),
    
    eoy_count = sum(eoy_math_present),
    eoy_prop  = mean(eoy_math_present),
    
    .groups = "drop"
  )

# View result
print(summary_math_tests_present)
# A tibble: 2 × 6
  treat n_students boy_count boy_prop eoy_count eoy_prop
  <dbl>      <int>     <int>    <dbl>     <int>    <dbl>
1     0       3703      3502    0.946      3438    0.928
2     1        629       619    0.984       618    0.983

Data Summary

Generate school-level means for BOY assessment data, number of students, plus distribution across demographics.

#create function to streamline calculations
calc_pct = function(x) {
  prop.table(table(x)) * 100
}

#school-level means and counts for TEST data, including attend rate avg
merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
  mutate(across(c(boy_math_score, #transform all scores to numeric value
                  eoy_math_score, 
                  attend_rate), as.numeric))

school_means = merge_tsl_nha_test_info %>%
  group_by(school)%>%
  summarise(
    n_students = n(),
    mean_boy_math = mean(boy_math_score, na.rm = TRUE),
    mean_eoy_math = mean(eoy_math_score, na.rm = TRUE),
    attend_rate = mean(attend_rate, na.rm =  TRUE)
  )

# DEMOGRAPHICS

# race
race_pct = merge_tsl_nha_test_info %>%
  group_by(school, race) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = race,
    values_from = pct,
    names_prefix = "race_"
  )

#ethn BEFORE CLEANING
ethn_pct = merge_tsl_nha_test_info %>%
  group_by(school, ethn) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = ethn,
    values_from = pct,
    names_prefix = "ethn_"
  )


#gender
gender_pct = merge_tsl_nha_test_info %>%
  group_by(school, gender) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = gender,
    values_from = pct,
    names_prefix = "gender_"
  )

#frpl
frpl_pct = merge_tsl_nha_test_info %>%
  group_by(school, frpl) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = frpl,
    values_from = pct,
    names_prefix = "frpl_"
  )

#ell
ell_pct = merge_tsl_nha_test_info %>%
  group_by(school, ell) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = ell,
    values_from = pct,
    names_prefix = "ell_"
  )


#iep students
iep_pct = merge_tsl_nha_test_info %>%
  group_by(school, iep) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = iep,
    values_from = pct,
    names_prefix = "iep_"
  )

# COMBINE summaries
school_summary = school_means %>%
  left_join(race_pct,   by = "school") %>%
  left_join(ethn_pct,   by = "school") %>%
  left_join(gender_pct, by = "school") %>%
  left_join(frpl_pct,   by = "school") %>%
  left_join(ell_pct,    by = "school") %>%
  left_join(iep_pct, by = "school")

school_summary = school_summary %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))

print(school_summary)
# A tibble: 12 × 30
   school                     n_students mean_boy_math mean_eoy_math attend_rate
   <chr>                           <dbl>         <dbl>         <dbl>       <dbl>
 1 Bennett Venture Academy           204          196.          210.        0.87
 2 Burton Glen Charter Acade…        372          198.          212.        0.9 
 3 Holly Park Academy                307          194.          206.        0.91
 4 Laurus Academy                    458          201.          213.        0.88
 5 Linden Charter Academy            457          202.          213.        0.88
 6 North Saginaw Charter Aca…        435          200.          209.        0.9 
 7 Oakside Prep Academy              554          197.          209.        0.9 
 8 Stambaugh Charter Academy         237          197.          211.        0.91
 9 Walton Charter Academy            504          198.          212.        0.92
10 Windemere Park Charter Ac…        340          200.          215.        0.9 
11 Winterfield Venture Acade…        188          197.          210.        0.87
12 <NA>                              276          196.          NaN       NaN   
# ℹ 25 more variables: `race_American Indian or Alaskan Native` <dbl>,
#   race_Asian <dbl>, `race_Black or African American` <dbl>, race_White <dbl>,
#   `race_Native Hawaiian or Pacific Islander` <dbl>, race_NA <dbl>,
#   `ethn_American Indian or Alaskan Native` <dbl>, ethn_Asian <dbl>,
#   `ethn_Black or African American` <dbl>, ethn_Hispanic <dbl>,
#   ethn_White <dbl>, `ethn_Native Hawaiian or Pacific Islander` <dbl>,
#   ethn_NA <dbl>, gender_Female <dbl>, gender_Male <dbl>, gender_NA <dbl>, …
write.csv(school_summary, "test_summary_by_school_tsl_nha.csv", row.names = FALSE)

Reformat Data

Create dummy variables

Create numeric binary for gender.

table(merge_tsl_nha_test_info$gender)

Female   Male 
  2070   1986 
merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
  mutate(
    male = case_when(
      gender == "Male" ~ 1,
      gender == "Female" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(merge_tsl_nha_test_info$male)

   0    1 
2070 1986 

Create numeric binaries for each race.

table(merge_tsl_nha_test_info$race)

  American Indian or Alaskan Native                               Asian 
                                121                                  44 
          Black or African American Native Hawaiian or Pacific Islander 
                               2913                                  49 
                              White 
                                929 
merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
  mutate(
    race_AI     = ifelse(race == "American Indian or Alaskan Native", 1, ifelse(is.na(race), NA, 0)),
    race_A     = ifelse(race == "Asian", 1, ifelse(is.na(race), NA, 0)),
    race_B     = ifelse(race == "Black or African American", 1, ifelse(is.na(race), NA, 0)),
    race_NH     = ifelse(race == "Native Hawaiian or Pacific Islander", 1, ifelse(is.na(race), NA, 0)),
    race_W     = ifelse(race == "White", 1, ifelse(is.na(race), NA, 0))
  )
table(merge_tsl_nha_test_info$race_B) #quick check coding on race_B

   0    1 
1143 2913 

Create dummy for ethnicity. Observation: Ethn is often a duplication of Race, and Hispanic never appears under Race. Recode ethnicity into a Hispanic indicator?

table(merge_tsl_nha_test_info$ethn)

  American Indian or Alaskan Native                               Asian 
                                 39                                  41 
          Black or African American                            Hispanic 
                               2723                                 916 
Native Hawaiian or Pacific Islander                               White 
                                  4                                 333 

Decision: Keep Race as-is; Mutate Ethn to Hispanic indicator.

merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
  mutate(
    hisp_eth = ifelse(
      is.na(ethn),
      NA,
      ifelse(ethn == "Hispanic", 1, 0)
    )
  )

table(merge_tsl_nha_test_info$hisp_eth)#check recoding

   0    1 
3140  916 

Create num_weeks of implemention

Reformatted session_BOY_date given by TSL (start date of treatment), from DD/MM/YYYY → MM/DD/YYYY.

#transform all date vars as dates
merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
  mutate(
    session_BOY_date = as.Date(session_BOY_date, format = "%d/%m/%Y"), 
    eoy_math_date = as.Date(eoy_math_date, format = "%m/%d/%Y"), 
    moy_math_date = as.Date(moy_math_date, format = "%m/%d/%Y"), 
    boy_math_date = as.Date(boy_math_date, format = "%m/%d/%Y"), 
  )

Create num_weeks by taking the difference in time between session start date and EOY assessment date. Average implementation period was about 14 weeks.

merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
  mutate(
    num_weeks = as.numeric(difftime(eoy_math_date, session_BOY_date, units = "weeks"))
  )
mean(merge_tsl_nha_test_info$num_weeks, na.rm = TRUE)
[1] 14.2735

Clean strings

Drop “%” string from perc_confid_same_imp.

merge_tsl_nha_test_info$perc_confid_same_imp = as.numeric(sub("%$", "", merge_tsl_nha_test_info$perc_confid_same_imp))
mean(merge_tsl_nha_test_info$perc_confid_same_imp, na.rm = TRUE)
[1] 82.40602

Export files for e2i Coach

Save e2i file with all matched students from the merge, regardless of missing test info.

#select columns ready for e2i
tsl_nha_e2i_usage_test_info = merge_tsl_nha_test_info %>%
  dplyr::select(student_id, 
                teacher_id,
                grade, 
                school,
                boy_math_score, boy_math_date,
                moy_math_score, moy_math_date,
                eoy_math_score, eoy_math_date,
                male, iep, ell, frpl, attend_rate,
                treat,
                race_AI, race_A, race_B, race_NH, race_W,hisp_eth,
                boy_eoy_math,#1/0 indicator: boy and eoy math info present
                math_boy_missing_eoy_present, 
                
                tsl_acct_id, school_code_tsl, 
                session_BOY_date, #start date of TSL/treat
                n_co_confid, n_co_confid_same_imp, perc_confid_same_imp, 
                score_confid_same_imp, enjoy_rate,activ_partic,
                n_lo_complete
                )

#export e2i with test indicators, all matched students from the merge, regardless of missing test info (all_tests == 1 or 0)
write.csv(tsl_nha_e2i_usage_test_info, file = "tsl_nha_e2i_all_students.csv", row.names = FALSE)

Save e2i file with only containing matched students from the merge with complete test info (all_tests means student has all BOY and EOY scores and dates).

table(tsl_nha_e2i_usage_test_info$boy_eoy_math)

   0    1 
 486 3846 
#subset: keep only students with complete test info (all_tests == 1)
tsl_nha_e2i_all_tests = tsl_nha_e2i_usage_test_info %>% filter(boy_eoy_math == 1)
print(tsl_nha_e2i_all_tests) # N=3846
# A tibble: 3,846 × 34
   student_id teacher_id grade school               boy_math_score boy_math_date
   <chr>           <dbl> <dbl> <chr>                         <dbl> <date>       
 1 366259          19461     7 Laurus Academy                  202 2025-09-03   
 2 430015          13501     8 Laurus Academy                  218 2025-09-03   
 3 445701           4891     8 Linden Charter Acad…            204 2025-09-29   
 4 461310           4416     7 Burton Glen Charter…            170 2025-09-10   
 5 471757          13405     8 North Saginaw Chart…            224 2025-09-24   
 6 472379           6179     8 Oakside Prep Academy            209 2025-09-08   
 7 473083          23087     8 Walton Charter Acad…            225 2025-09-23   
 8 473087          23087     8 Walton Charter Acad…            204 2025-09-03   
 9 475569          23087     8 Walton Charter Acad…            244 2025-08-28   
10 475590          23087     8 Walton Charter Acad…            202 2025-09-02   
# ℹ 3,836 more rows
# ℹ 28 more variables: moy_math_score <dbl>, moy_math_date <date>,
#   eoy_math_score <dbl>, eoy_math_date <date>, male <dbl>, iep <dbl>,
#   ell <dbl>, frpl <dbl>, attend_rate <dbl>, treat <dbl>, race_AI <dbl>,
#   race_A <dbl>, race_B <dbl>, race_NH <dbl>, race_W <dbl>, hisp_eth <dbl>,
#   boy_eoy_math <dbl>, math_boy_missing_eoy_present <dbl>, tsl_acct_id <dbl>,
#   school_code_tsl <chr>, session_BOY_date <date>, n_co_confid <dbl>, …

Count of treatment students among the subset with complete test info.

table(tsl_nha_e2i_all_tests$treat)

   0    1 
3238  608 
#export e2i only containing matched students from the merge with complete test info (all_tests == 1)
write.csv(tsl_nha_e2i_all_tests, file = "tsl_nha_e2i_complete_tests.csv", row.names = FALSE)