Clean District Data
Import
Packages selected for inspecting and cleaning Third Learning Space, National Heritage Academy data.
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.2.1 ✔ readr 2.2.0
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.3 ✔ tibble 3.3.1
✔ lubridate 1.9.5 ✔ tidyr 1.3.2
✔ purrr 1.2.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Import roster.
roster <- read_csv ("TSL Impact Evaluation Data.csv" )
Rows: 4340 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): School Name*, Date of BOY MAP Math Test*, Date of MOY MAP Math Tes...
dbl (10): Student ID*, Student Grade Level*, Teacher ID*, BOY MAP Math Scale...
lgl (1): Treatment*
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] "Student ID*" "Student Grade Level*"
[3] "School Name*" "Teacher ID*"
[5] "Treatment*" "BOY MAP Math Scaled Score*"
[7] "Date of BOY MAP Math Test*" "MOY MAP Math Scaled Score*"
[9] "Date of MOY MAP Math Test*" "EOY MAP Math Scaled Score*"
[11] "Date of EOY MAP Math Test*" "Gender"
[13] "IEP" "ELL"
[15] "FRPL" "Race"
[17] "Ethnicity" "Attendance Rate"
Change column headers for roster.
roster = roster %>%
rename (
student_id = "Student ID*" ,
teacher_id = "Teacher ID*" ,
grade = "Student Grade Level*" ,
school = "School Name*" ,
treat = "Treatment*" ,
boy_math_score = "BOY MAP Math Scaled Score*" ,
boy_math_date = "Date of BOY MAP Math Test*" ,
moy_math_score = "MOY MAP Math Scaled Score*" ,
moy_math_date = "Date of MOY MAP Math Test*" ,
eoy_math_score = "EOY MAP Math Scaled Score*" ,
eoy_math_date = "Date of EOY MAP Math Test*" ,
gender = "Gender" ,
iep = "IEP" ,
ell = "ELL" ,
frpl = "FRPL" ,
race = "Race" ,
ethn = "Ethnicity" ,
attend_rate = "Attendance Rate"
)
colnames (roster)
[1] "student_id" "grade" "school" "teacher_id"
[5] "treat" "boy_math_score" "boy_math_date" "moy_math_score"
[9] "moy_math_date" "eoy_math_score" "eoy_math_date" "gender"
[13] "iep" "ell" "frpl" "race"
[17] "ethn" "attend_rate"
Inspect duplicates
Roster
Identify duplicate student records within roster file.
# frequency table of student_id occurrences
id_counts_roster = roster %>%
group_by (student_id) %>%
summarise (n = n ()) %>%
arrange (desc (n))
# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
filter (n > 1 )
# view results
print (dupes_roster)
# A tibble: 8 × 2
student_id n
<dbl> <int>
1 566870 2
2 605643 2
3 632862 2
4 670363 2
5 680297 2
6 732197 2
7 732203 2
8 832717 2
#export
write.csv (dupes_roster, file = "duplicate_ids_roster_tsl_nha.csv" , row.names = FALSE )
Observation: Each duplicate student ID appears exactly 2 times. For a given duplicate student record, it appears to be the same individual (e.g., same demographics, same teacher), however, the duplicate record is tied to 2 different schools. These students have taken one test at one school site and completed their EOY test date at a different school site. This suggests that the students have transferred some point between BOY and EOY testing.
dup_preview_roster = roster %>%
group_by (student_id) %>%
filter (n () > 1 ) %>%
ungroup ()
print (dup_preview_roster)
# A tibble: 16 × 18
student_id grade school teacher_id treat boy_math_score boy_math_date
<dbl> <dbl> <chr> <dbl> <lgl> <dbl> <chr>
1 566870 7 Walton Charte… 19525 NA NA <NA>
2 566870 7 Oakside Prep … 19525 NA 216 9/10/2025
3 605643 8 Walton Charte… 21216 NA 229 8/28/2025
4 605643 8 Oakside Prep … 21216 NA NA <NA>
5 632862 7 Oakside Prep … 23257 NA NA <NA>
6 632862 7 Walton Charte… 23257 NA 204 8/28/2025
7 670363 3 Linden Charte… 18378 NA 196 9/29/2025
8 670363 4 Linden Charte… 18378 NA NA <NA>
9 680297 3 Oakside Prep … 23154 NA NA <NA>
10 680297 3 Walton Charte… 23154 NA 181 8/28/2025
11 732197 5 Walton Charte… 17341 NA NA <NA>
12 732197 5 Oakside Prep … 17341 NA 198 9/3/2025
13 732203 8 Walton Charte… 23087 NA NA <NA>
14 732203 8 Oakside Prep … 23087 NA 199 9/10/2025
15 832717 3 Burton Glen C… 18916 NA NA <NA>
16 832717 3 Holly Park Ac… 18916 NA 172 8/21/2025
# ℹ 11 more variables: moy_math_score <dbl>, moy_math_date <chr>,
# eoy_math_score <dbl>, eoy_math_date <chr>, gender <chr>, iep <dbl>,
# ell <dbl>, frpl <dbl>, race <chr>, ethn <chr>, attend_rate <dbl>
Decision : Collapse rows to obtain full test data but keep the school, grade, and demographics where EOY test was taken. Total observations reduced from 4340 to 4332 (8 duplicate student records removed).
#placeholder code
roster_unique = roster %>%
group_by (student_id) %>%
summarize (
# Pull BOY values (non-missing)
boy_math_score = first (na.omit (boy_math_score)),
boy_math_date = first (na.omit (boy_math_date)),
# Pull EOY values (non-missing)
eoy_math_score = first (na.omit (eoy_math_score)),
eoy_math_date = first (na.omit (eoy_math_date)),
# Identify the EOY row (where EOY score exists)
school = school[! is.na (eoy_math_score)][1 ],
grade = grade[! is.na (eoy_math_score)][1 ],
teacher_id = teacher_id [! is.na (eoy_math_score)][1 ],
moy_math_score = moy_math_score[! is.na (eoy_math_score)][1 ],
moy_math_date = moy_math_date[! is.na (eoy_math_score)][1 ],
gender = gender[! is.na (eoy_math_score)][1 ],
iep = iep[! is.na (eoy_math_score)][1 ],
ell = ell[! is.na (eoy_math_score)][1 ],
frpl = frpl[! is.na (eoy_math_score)][1 ],
race = race[! is.na (eoy_math_score)][1 ],
ethn = ethn[! is.na (eoy_math_score)][1 ],
attend_rate = attend_rate[! is.na (eoy_math_score)][1 ],
.groups = "drop"
)
#decision to be made
print (roster_unique)
# A tibble: 4,332 × 17
student_id boy_math_score boy_math_date eoy_math_score eoy_math_date school
<dbl> <dbl> <chr> <dbl> <chr> <chr>
1 366259 202 9/3/2025 207 5/18/2026 Laurus …
2 430015 218 9/3/2025 217 5/20/2026 Laurus …
3 445701 204 9/29/2025 217 5/26/2026 Linden …
4 461310 170 9/10/2025 173 6/2/2026 Burton …
5 471757 224 9/24/2025 236 5/20/2026 North S…
6 472379 209 9/8/2025 226 6/2/2026 Oakside…
7 473083 225 9/23/2025 228 5/18/2026 Walton …
8 473087 204 9/3/2025 199 5/18/2026 Walton …
9 474393 212 9/10/2025 NA <NA> <NA>
10 475569 244 8/28/2025 255 5/18/2026 Walton …
# ℹ 4,322 more rows
# ℹ 11 more variables: grade <dbl>, teacher_id <dbl>, moy_math_score <dbl>,
# moy_math_date <chr>, gender <chr>, iep <dbl>, ell <dbl>, frpl <dbl>,
# race <chr>, ethn <chr>, attend_rate <dbl>
Du-dupe check. Confirmed: no more duplicate records.
roster_unique %>%
count (student_id) %>%
filter (n > 1 )
# A tibble: 0 × 2
# ℹ 2 variables: student_id <dbl>, n <int>
Merge Usage Data
Clean NHA Usage Data
Import usage data from TSL.
#import NHA usage data from TSL
tsl_nha_usage = read_csv ("TSL Usage Data.csv" )
New names:
Rows: 999 Columns: 24
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(5): SIS ID, School Name, School Clever ID, Session start date, Conf_ra... dbl
(8): TSL Student ID, Count of Check Out Confidence Answer, Total CO Con... num
(1): Num_minutes* lgl (10): ...15, ...16, ...17, ...18, ...19, ...20, ...21,
...22, ...23, ...24
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...15`
• `` -> `...16`
• `` -> `...17`
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
• `` -> `...24`
[1] "TSL Student ID"
[2] "SIS ID"
[3] "School Name"
[4] "School Clever ID"
[5] "Session start date"
[6] "Count of Check Out Confidence Answer"
[7] "Total CO Confidence Same Or Improved"
[8] "Conf_rating* % same or improved"
[9] "Conf_rating* same or improved score/5"
[10] "Enjoy_rating*"
[11] "Num_session*"
[12] "Num_minutes*"
[13] "Active_participation*"
[14] "Total LOs Completed"
[15] "...15"
[16] "...16"
[17] "...17"
[18] "...18"
[19] "...19"
[20] "...20"
[21] "...21"
[22] "...22"
[23] "...23"
[24] "...24"
Rename columns in usage file. Prepare for merging.
tsl_nha_usage = tsl_nha_usage [, - c (15 : 24 )] #drop empty columns
# rename vars
tsl_nha_usage = tsl_nha_usage %>%
rename (
student_id = "SIS ID" ,
tsl_acct_id = "TSL Student ID" ,
school_tsl = "School Name" ,
school_code_tsl = "School Clever ID" ,
session_BOY_date = "Session start date" ,
n_co_confid = "Count of Check Out Confidence Answer" ,
n_co_confid_same_imp = "Total CO Confidence Same Or Improved" ,
perc_confid_same_imp = "Conf_rating* % same or improved" ,
score_confid_same_imp = "Conf_rating* same or improved score/5" ,
enjoy_rate = "Enjoy_rating*" ,
num_session = "Num_session*" ,
num_min = "Num_minutes*" ,
activ_partic = "Active_participation*" ,
n_lo_complete = "Total LOs Completed"
)
print (tsl_nha_usage)
# A tibble: 999 × 14
tsl_acct_id student_id school_tsl school_code_tsl session_BOY_date
<dbl> <chr> <chr> <chr> <chr>
1 1002128 Student Archived Burton Glen Ch… 54c7fe26275d78… 03/02/2026
2 1003094 Student Archived Winterfield Ve… 54c7fe26275d78… 25/02/2026
3 1002836 854674 Holly Park Aca… 54c7fe26275d78… 26/02/2026
4 1002811 737634 Holly Park Aca… 54c7fe26275d78… 26/02/2026
5 1002951 688666 Laurus Academy 54c7fe26275d78… 03/02/2026
6 1002905 625958 Laurus Academy 54c7fe26275d78… 16/03/2026
7 1002113 614413 Burton Glen Ch… 54c7fe26275d78… 02/02/2026
8 1001718 592802 Linden Charter… 54c7fe26275d78… 18/02/2026
9 1001628 Student Archived Oakside Prep A… 54c7fe26275d78… 28/01/2026
10 1002111 Student Archived Burton Glen Ch… 54c7fe26275d78… 02/02/2026
# ℹ 989 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
# perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
# num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>
Identify duplicate student records within usage data file (N=0).
# frequency table of tsl_acct_id occurrences
id_counts_tsl_usage_nha = tsl_nha_usage %>%
group_by (tsl_acct_id) %>%
summarise (n = n ()) %>%
arrange (desc (n))
# filter only acct that appear > 1
dupes_tsl_usage_nha = id_counts_tsl_usage_nha %>%
filter (n > 1 )
# view results
print (dupes_tsl_usage_nha)
# A tibble: 1 × 2
tsl_acct_id n
<dbl> <int>
1 NA 346
#export
#write.csv(dupes_tsl_usage_nha, file = "duplicate_acct_ids_usage_tsl_nha.csv", row.names = FALSE)
Inspect rows without TSL Account ID (N=346). Observation: these cases are all blanks carried over from csv conversion.
tsl_nha_usage %>%
filter (is.na (tsl_acct_id))
# A tibble: 346 × 14
tsl_acct_id student_id school_tsl school_code_tsl session_BOY_date
<dbl> <chr> <chr> <chr> <chr>
1 NA <NA> <NA> <NA> <NA>
2 NA <NA> <NA> <NA> <NA>
3 NA <NA> <NA> <NA> <NA>
4 NA <NA> <NA> <NA> <NA>
5 NA <NA> <NA> <NA> <NA>
6 NA <NA> <NA> <NA> <NA>
7 NA <NA> <NA> <NA> <NA>
8 NA <NA> <NA> <NA> <NA>
9 NA <NA> <NA> <NA> <NA>
10 NA <NA> <NA> <NA> <NA>
# ℹ 336 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
# perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
# num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>
Decision : Remove rows where TSL Account ID is missing.
tsl_nha_usage = tsl_nha_usage %>%
filter (! is.na (tsl_acct_id))
print (tsl_nha_usage)
# A tibble: 653 × 14
tsl_acct_id student_id school_tsl school_code_tsl session_BOY_date
<dbl> <chr> <chr> <chr> <chr>
1 1002128 Student Archived Burton Glen Ch… 54c7fe26275d78… 03/02/2026
2 1003094 Student Archived Winterfield Ve… 54c7fe26275d78… 25/02/2026
3 1002836 854674 Holly Park Aca… 54c7fe26275d78… 26/02/2026
4 1002811 737634 Holly Park Aca… 54c7fe26275d78… 26/02/2026
5 1002951 688666 Laurus Academy 54c7fe26275d78… 03/02/2026
6 1002905 625958 Laurus Academy 54c7fe26275d78… 16/03/2026
7 1002113 614413 Burton Glen Ch… 54c7fe26275d78… 02/02/2026
8 1001718 592802 Linden Charter… 54c7fe26275d78… 18/02/2026
9 1001628 Student Archived Oakside Prep A… 54c7fe26275d78… 28/01/2026
10 1002111 Student Archived Burton Glen Ch… 54c7fe26275d78… 02/02/2026
# ℹ 643 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
# perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
# num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>
Inspect rows where student ID notes “Student Archived”.
tsl_nha_usage %>%
filter (student_id != "Student Archived" )
# A tibble: 629 × 14
tsl_acct_id student_id school_tsl school_code_tsl session_BOY_date
<dbl> <chr> <chr> <chr> <chr>
1 1002836 854674 Holly Park Academy 54c7fe26275d78… 26/02/2026
2 1002811 737634 Holly Park Academy 54c7fe26275d78… 26/02/2026
3 1002951 688666 Laurus Academy 54c7fe26275d78… 03/02/2026
4 1002905 625958 Laurus Academy 54c7fe26275d78… 16/03/2026
5 1002113 614413 Burton Glen Charter … 54c7fe26275d78… 02/02/2026
6 1001718 592802 Linden Charter Acade… 54c7fe26275d78… 18/02/2026
7 1002843 847934 Holly Park Academy 54c7fe26275d78… 04/03/2026
8 1003634 845181 Laurus Academy 54c7fe26275d78… 09/02/2026
9 1003044 843138 Stambaugh Charter Ac… 54c7fe26275d78… 05/02/2026
10 1003096 840127 Winterfield Venture … 54c7fe26275d78… 17/02/2026
# ℹ 619 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
# perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
# num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>
Decision : Remove student rows where student ID notes “Student Archived”(N=629).
tsl_nha_usage = tsl_nha_usage %>%
filter (student_id != "Student Archived" )
print (tsl_nha_usage)
# A tibble: 629 × 14
tsl_acct_id student_id school_tsl school_code_tsl session_BOY_date
<dbl> <chr> <chr> <chr> <chr>
1 1002836 854674 Holly Park Academy 54c7fe26275d78… 26/02/2026
2 1002811 737634 Holly Park Academy 54c7fe26275d78… 26/02/2026
3 1002951 688666 Laurus Academy 54c7fe26275d78… 03/02/2026
4 1002905 625958 Laurus Academy 54c7fe26275d78… 16/03/2026
5 1002113 614413 Burton Glen Charter … 54c7fe26275d78… 02/02/2026
6 1001718 592802 Linden Charter Acade… 54c7fe26275d78… 18/02/2026
7 1002843 847934 Holly Park Academy 54c7fe26275d78… 04/03/2026
8 1003634 845181 Laurus Academy 54c7fe26275d78… 09/02/2026
9 1003044 843138 Stambaugh Charter Ac… 54c7fe26275d78… 05/02/2026
10 1003096 840127 Winterfield Venture … 54c7fe26275d78… 17/02/2026
# ℹ 619 more rows
# ℹ 9 more variables: n_co_confid <dbl>, n_co_confid_same_imp <dbl>,
# perc_confid_same_imp <chr>, score_confid_same_imp <dbl>, enjoy_rate <dbl>,
# num_session <dbl>, num_min <dbl>, activ_partic <dbl>, n_lo_complete <dbl>
Insert treatment status
Add treatment status. Equals 1 if student id appears in roster from TSL (N=629). All else equals 0.
roster_unique = roster_unique %>%
mutate (
treat = ifelse (student_id %in% tsl_nha_usage$ student_id, 1 , 0 )
)
table (roster_unique$ treat)
Inspect for any duplicate student IDs amongst the treated. Zero duplicates.
roster_unique %>%
filter (treat == 1 ) %>%
group_by (student_id) %>%
filter (n () > 1 )
# A tibble: 0 × 18
# Groups: student_id [0]
# ℹ 18 variables: student_id <dbl>, boy_math_score <dbl>, boy_math_date <chr>,
# eoy_math_score <dbl>, eoy_math_date <chr>, school <chr>, grade <dbl>,
# teacher_id <dbl>, moy_math_score <dbl>, moy_math_date <chr>, gender <chr>,
# iep <dbl>, ell <dbl>, frpl <dbl>, race <chr>, ethn <chr>,
# attend_rate <dbl>, treat <dbl>
Merge
Merge usage data to student NHA data.
#ensure student_id is same format for merging
roster_unique$ student_id = as.character (roster_unique$ student_id)
tsl_nha_usage$ student_id = as.character (tsl_nha_usage$ student_id)
merge_tsl_nha = roster_unique %>%
left_join (tsl_nha_usage, by = "student_id" )
print (merge_tsl_nha)
# A tibble: 4,332 × 31
student_id boy_math_score boy_math_date eoy_math_score eoy_math_date school
<chr> <dbl> <chr> <dbl> <chr> <chr>
1 366259 202 9/3/2025 207 5/18/2026 Laurus …
2 430015 218 9/3/2025 217 5/20/2026 Laurus …
3 445701 204 9/29/2025 217 5/26/2026 Linden …
4 461310 170 9/10/2025 173 6/2/2026 Burton …
5 471757 224 9/24/2025 236 5/20/2026 North S…
6 472379 209 9/8/2025 226 6/2/2026 Oakside…
7 473083 225 9/23/2025 228 5/18/2026 Walton …
8 473087 204 9/3/2025 199 5/18/2026 Walton …
9 474393 212 9/10/2025 NA <NA> <NA>
10 475569 244 8/28/2025 255 5/18/2026 Walton …
# ℹ 4,322 more rows
# ℹ 25 more variables: grade <dbl>, teacher_id <dbl>, moy_math_score <dbl>,
# moy_math_date <chr>, gender <chr>, iep <dbl>, ell <dbl>, frpl <dbl>,
# race <chr>, ethn <chr>, attend_rate <dbl>, treat <dbl>, tsl_acct_id <dbl>,
# school_tsl <chr>, school_code_tsl <chr>, session_BOY_date <chr>,
# n_co_confid <dbl>, n_co_confid_same_imp <dbl>, perc_confid_same_imp <chr>,
# score_confid_same_imp <dbl>, enjoy_rate <dbl>, num_session <dbl>, …
Missing Data Inspections
Percentage missing for each variable. Observations: Most variables from the TSL file are missing, as expected; after merging the district roster with TSL usage file, not all students will be in the TSL system. Looking at district data columns, most of the data is present (test info, demographics, etc). The repeating missing percentage (6.37%) across EOY test info variables, school, grade, and demographics suggests there is a small group of students missing this info. The large majority of students are missing MOY test info. No student entry is missing an ID or treatment status, though.
missing_summary = merge_tsl_nha %>%
summarise (across (
everything (),
~ mean (is.na (.)) * 100
)) %>%
pivot_longer (
cols = everything (),
names_to = "variable" ,
values_to = "percent_missing"
) %>%
arrange (desc (percent_missing))
print (missing_summary)
# A tibble: 31 × 2
variable percent_missing
<chr> <dbl>
1 enjoy_rate 89.3
2 n_co_confid 87.7
3 n_co_confid_same_imp 87.7
4 perc_confid_same_imp 87.7
5 score_confid_same_imp 87.7
6 session_BOY_date 86.3
7 num_session 86.3
8 num_min 86.3
9 activ_partic 86.3
10 n_lo_complete 86.3
# ℹ 21 more rows
write.csv (missing_summary, file = "tsl_nha_missing_summary.csv" , row.names = FALSE )
Percentage of student who have both BOY and EOY data and ALL math data (BOY, MOY, EOY) overall
#indicator for having BOTH MOY and EOY scores
merge_tsl_nha_test_info = merge_tsl_nha %>%
mutate (
boy_eoy_math = ! is.na (boy_math_score) & ! is.na (eoy_math_score),
all_math = ! is.na (boy_math_score) & ! is.na (moy_math_score) # incl. MOY
& ! is.na (eoy_math_score),
)
#pct across schools
overall_pct = merge_tsl_nha_test_info %>%
summarise (
total_students = n (),
students_with_boy_eoy = sum (boy_eoy_math),
percent_with_boy_eoy = sum (students_with_boy_eoy / total_students) * 100 ,
students_with_all_tests = sum (all_math),
percent_with_all_tests = (students_with_all_tests / total_students) * 100
) %>%
mutate (percent_with_boy_eoy = round (percent_with_boy_eoy, 2 ))%>%
mutate (percent_with_all_tests = round (percent_with_all_tests, 2 ))
print (overall_pct)
# A tibble: 1 × 5
total_students students_with_boy_eoy percent_with_boy_eoy
<int> <int> <dbl>
1 4332 3846 88.8
# ℹ 2 more variables: students_with_all_tests <int>,
# percent_with_all_tests <dbl>
Percentage of student who have both BOY and EOY data by school Assuming MOY is not of interest? Only 21% of the sample have all test info, including MOY test.
by_school_pct = merge_tsl_nha_test_info %>%
group_by (school) %>%
summarise (
total_students = n (),
students_with_boy_eoy = sum (boy_eoy_math),
percent_with_boy_eoy = (students_with_boy_eoy / total_students) * 100
) %>%
mutate (percent_with_boy_eoy = round (percent_with_boy_eoy, 2 )) %>%
arrange (desc (percent_with_boy_eoy))
print (by_school_pct)
# A tibble: 12 × 4
school total_students students_with_boy_eoy percent_with_boy_eoy
<chr> <int> <int> <dbl>
1 Walton Charter Aca… 504 496 98.4
2 Linden Charter Aca… 457 447 97.8
3 Burton Glen Charte… 372 360 96.8
4 Holly Park Academy 307 295 96.1
5 Oakside Prep Acade… 554 531 95.8
6 Stambaugh Charter … 237 226 95.4
7 Laurus Academy 458 434 94.8
8 North Saginaw Char… 435 401 92.2
9 Windemere Park Cha… 340 312 91.8
10 Bennett Venture Ac… 204 187 91.7
11 Winterfield Ventur… 188 157 83.5
12 <NA> 276 0 0
Dummies for test info (complete and missing indicators)
boy_eoy_math : dummy for students with complete math test info for BOY and EOY.
table (merge_tsl_nha_test_info$ boy_eoy_math)
#complete math: BOY and EOY present
merge_tsl_nha_test_info$ boy_eoy_math = ifelse (merge_tsl_nha_test_info$ boy_eoy_math, 1 , 0 )
table (merge_tsl_nha_test_info$ boy_eoy_math) #check counts
math_boy_missing_eoy_present : dummy for students who have EOY math test info but BOY test info is missing.
merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
mutate (
boy_math_present = ! is.na (boy_math_score) & ! is.na (boy_math_date),
eoy_math_present = ! is.na (eoy_math_score) & ! is.na (eoy_math_date),
math_boy_missing_eoy_present = case_when (
! boy_math_present & eoy_math_present ~ 1 , # moy missing, eoy present
boy_math_present & eoy_math_present ~ 0 , # both present
! boy_math_present & ! eoy_math_present ~ NA_real_ # both missing
)
)
table (merge_tsl_nha_test_info$ math_boy_missing_eoy_present)
###Test info summary
Proportion of math test info present (BOY present and EOY present, grouped by treatment status).
summary_math_tests_present = merge_tsl_nha_test_info %>%
mutate (
boy_math_present = ! is.na (boy_math_score) & ! is.na (boy_math_date),
eoy_math_present = ! is.na (eoy_math_score) & ! is.na (eoy_math_date)
) %>%
group_by (treat) %>%
summarise (
n_students = n (),
boy_count = sum (boy_math_present),
boy_prop = mean (boy_math_present),
eoy_count = sum (eoy_math_present),
eoy_prop = mean (eoy_math_present),
.groups = "drop"
)
# View result
print (summary_math_tests_present)
# A tibble: 2 × 6
treat n_students boy_count boy_prop eoy_count eoy_prop
<dbl> <int> <int> <dbl> <int> <dbl>
1 0 3703 3502 0.946 3438 0.928
2 1 629 619 0.984 618 0.983
Data Summary
Generate school-level means for BOY assessment data, number of students, plus distribution across demographics.
#create function to streamline calculations
calc_pct = function (x) {
prop.table (table (x)) * 100
}
#school-level means and counts for TEST data, including attend rate avg
merge_tsl_nha_test_info = merge_tsl_nha_test_info %>%
mutate (across (c (boy_math_score, #transform all scores to numeric value
eoy_math_score,
attend_rate), as.numeric))
school_means = merge_tsl_nha_test_info %>%
group_by (school)%>%
summarise (
n_students = n (),
mean_boy_math = mean (boy_math_score, na.rm = TRUE ),
mean_eoy_math = mean (eoy_math_score, na.rm = TRUE ),
attend_rate = mean (attend_rate, na.rm = TRUE )
)
# DEMOGRAPHICS
# race
race_pct = merge_tsl_nha_test_info %>%
group_by (school, race) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = race,
values_from = pct,
names_prefix = "race_"
)
#ethn BEFORE CLEANING
ethn_pct = merge_tsl_nha_test_info %>%
group_by (school, ethn) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = ethn,
values_from = pct,
names_prefix = "ethn_"
)
#gender
gender_pct = merge_tsl_nha_test_info %>%
group_by (school, gender) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = gender,
values_from = pct,
names_prefix = "gender_"
)
#frpl
frpl_pct = merge_tsl_nha_test_info %>%
group_by (school, frpl) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = frpl,
values_from = pct,
names_prefix = "frpl_"
)
#ell
ell_pct = merge_tsl_nha_test_info %>%
group_by (school, ell) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = ell,
values_from = pct,
names_prefix = "ell_"
)
#iep students
iep_pct = merge_tsl_nha_test_info %>%
group_by (school, iep) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = iep,
values_from = pct,
names_prefix = "iep_"
)
# COMBINE summaries
school_summary = school_means %>%
left_join (race_pct, by = "school" ) %>%
left_join (ethn_pct, by = "school" ) %>%
left_join (gender_pct, by = "school" ) %>%
left_join (frpl_pct, by = "school" ) %>%
left_join (ell_pct, by = "school" ) %>%
left_join (iep_pct, by = "school" )
school_summary = school_summary %>%
mutate (across (where (is.numeric), ~ round (.x, 2 )))
print (school_summary)
# A tibble: 12 × 30
school n_students mean_boy_math mean_eoy_math attend_rate
<chr> <dbl> <dbl> <dbl> <dbl>
1 Bennett Venture Academy 204 196. 210. 0.87
2 Burton Glen Charter Acade… 372 198. 212. 0.9
3 Holly Park Academy 307 194. 206. 0.91
4 Laurus Academy 458 201. 213. 0.88
5 Linden Charter Academy 457 202. 213. 0.88
6 North Saginaw Charter Aca… 435 200. 209. 0.9
7 Oakside Prep Academy 554 197. 209. 0.9
8 Stambaugh Charter Academy 237 197. 211. 0.91
9 Walton Charter Academy 504 198. 212. 0.92
10 Windemere Park Charter Ac… 340 200. 215. 0.9
11 Winterfield Venture Acade… 188 197. 210. 0.87
12 <NA> 276 196. NaN NaN
# ℹ 25 more variables: `race_American Indian or Alaskan Native` <dbl>,
# race_Asian <dbl>, `race_Black or African American` <dbl>, race_White <dbl>,
# `race_Native Hawaiian or Pacific Islander` <dbl>, race_NA <dbl>,
# `ethn_American Indian or Alaskan Native` <dbl>, ethn_Asian <dbl>,
# `ethn_Black or African American` <dbl>, ethn_Hispanic <dbl>,
# ethn_White <dbl>, `ethn_Native Hawaiian or Pacific Islander` <dbl>,
# ethn_NA <dbl>, gender_Female <dbl>, gender_Male <dbl>, gender_NA <dbl>, …
write.csv (school_summary, "test_summary_by_school_tsl_nha.csv" , row.names = FALSE )
Export files for e2i Coach
Save e2i file with all matched students from the merge, regardless of missing test info.
#select columns ready for e2i
tsl_nha_e2i_usage_test_info = merge_tsl_nha_test_info %>%
dplyr:: select (student_id,
teacher_id,
grade,
school,
boy_math_score, boy_math_date,
moy_math_score, moy_math_date,
eoy_math_score, eoy_math_date,
male, iep, ell, frpl, attend_rate,
treat,
race_AI, race_A, race_B, race_NH, race_W,hisp_eth,
boy_eoy_math,#1/0 indicator: boy and eoy math info present
math_boy_missing_eoy_present,
tsl_acct_id, school_code_tsl,
session_BOY_date, #start date of TSL/treat
n_co_confid, n_co_confid_same_imp, perc_confid_same_imp,
score_confid_same_imp, enjoy_rate,activ_partic,
n_lo_complete
)
#export e2i with test indicators, all matched students from the merge, regardless of missing test info (all_tests == 1 or 0)
write.csv (tsl_nha_e2i_usage_test_info, file = "tsl_nha_e2i_all_students.csv" , row.names = FALSE )
Save e2i file with only containing matched students from the merge with complete test info (all_tests means student has all BOY and EOY scores and dates).
table (tsl_nha_e2i_usage_test_info$ boy_eoy_math)
#subset: keep only students with complete test info (all_tests == 1)
tsl_nha_e2i_all_tests = tsl_nha_e2i_usage_test_info %>% filter (boy_eoy_math == 1 )
print (tsl_nha_e2i_all_tests) # N=3846
# A tibble: 3,846 × 34
student_id teacher_id grade school boy_math_score boy_math_date
<chr> <dbl> <dbl> <chr> <dbl> <date>
1 366259 19461 7 Laurus Academy 202 2025-09-03
2 430015 13501 8 Laurus Academy 218 2025-09-03
3 445701 4891 8 Linden Charter Acad… 204 2025-09-29
4 461310 4416 7 Burton Glen Charter… 170 2025-09-10
5 471757 13405 8 North Saginaw Chart… 224 2025-09-24
6 472379 6179 8 Oakside Prep Academy 209 2025-09-08
7 473083 23087 8 Walton Charter Acad… 225 2025-09-23
8 473087 23087 8 Walton Charter Acad… 204 2025-09-03
9 475569 23087 8 Walton Charter Acad… 244 2025-08-28
10 475590 23087 8 Walton Charter Acad… 202 2025-09-02
# ℹ 3,836 more rows
# ℹ 28 more variables: moy_math_score <dbl>, moy_math_date <date>,
# eoy_math_score <dbl>, eoy_math_date <date>, male <dbl>, iep <dbl>,
# ell <dbl>, frpl <dbl>, attend_rate <dbl>, treat <dbl>, race_AI <dbl>,
# race_A <dbl>, race_B <dbl>, race_NH <dbl>, race_W <dbl>, hisp_eth <dbl>,
# boy_eoy_math <dbl>, math_boy_missing_eoy_present <dbl>, tsl_acct_id <dbl>,
# school_code_tsl <chr>, session_BOY_date <date>, n_co_confid <dbl>, …
Count of treatment students among the subset with complete test info.
table (tsl_nha_e2i_all_tests$ treat)
#export e2i only containing matched students from the merge with complete test info (all_tests == 1)
write.csv (tsl_nha_e2i_all_tests, file = "tsl_nha_e2i_complete_tests.csv" , row.names = FALSE )