Identify duplicate student records within roster file. No duplicates to report.
# frequency table of student_id occurrencesid_counts_roster = roster %>%group_by(student_id) %>%summarise(n =n()) %>%arrange(desc(n))# filter only IDs that appear > 1dupes_roster = id_counts_roster %>%filter(n >1)# view resultsprint(dupes_roster)
# A tibble: 0 × 2
# ℹ 2 variables: student_id <dbl>, n <int>
#export - NA: no duplicates to report#write.csv(dupes_roster, file = "duplicate_ids_roster_vt_penn.csv", row.names = FALSE)
Merge
Insert treatment status
Import roster from VT that details which students have accounts in their system.
#import ID crosswalk from VTvt_roster =read_csv("vt_roster_all_districts.csv")
Rows: 1496 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): District, School
dbl (2): District ID, Varsity Tutors ID
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_tests: dummy for students with both math and ELA test info complete across MOY and EOY.
table(merge_vt_penn_test_info$all_tests)
FALSE TRUE
85 1194
#complete math and ELA: MOY and EOY presentmerge_vt_penn_test_info$all_tests =ifelse(merge_vt_penn_test_info$all_tests, 1, 0)table(merge_vt_penn_test_info$all_tests) #check counts
0 1
85 1194
math_moy_missing_eoy_present: dummy for students who have EOY math test info but MOY test info is missing.
#import penn usage data from VTvt_usage_penn =read_csv("vt_usage_penn.csv")
Rows: 1648 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sch_name*, Sch_BOY_date*, Sch_EOY_date
dbl (7): Stu_ID_VT*, Num_days_active*, Num_sessions_AI, Num_sessions_human, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Inspect duplicate student records in usage file (based on VT Account ID - not student ID). Observation: for a given duplicate student record, school name,num_days_active, BOY date and EOY date will match but usage data differs across the duplicate records.
# A tibble: 1,572 × 10
school_vt school_BOY_date school_EOY_date vt_acct_id num_days_active
<chr> <chr> <chr> <dbl> <dbl>
1 Bell Avenue Eleme… 1/5/26 5/24/26 13134752 3
2 Bell Avenue Eleme… 1/5/26 5/24/26 13183824 15
3 Bell Avenue Eleme… 1/5/26 5/24/26 13183824 15
4 Bell Avenue Eleme… 1/5/26 5/24/26 13155212 8
5 Bell Avenue Eleme… 1/5/26 5/24/26 13193600 16
6 Bell Avenue Eleme… 1/5/26 5/24/26 13154761 9
7 Bell Avenue Eleme… 1/5/26 5/24/26 13130626 3
8 Bell Avenue Eleme… 1/5/26 5/24/26 13155895 16
9 Bell Avenue Eleme… 1/5/26 5/24/26 13152761 12
10 Bell Avenue Eleme… 1/5/26 5/24/26 13130597 4
# ℹ 1,562 more rows
# ℹ 5 more variables: num_sessions_AI <dbl>, num_sessions_human <dbl>,
# num_classesviewed <dbl>, num_essaysreviewed <dbl>,
# num_AI_practiceproblemsessions <dbl>
Decision: Collapse rows for each student ID. Per VT, these instances are rows that only capture one of the students multiple days active and need to be compressed.
Basic check of randomly selected student, 13130591. Inspect sum of Num_sessions_AI before collapsing.
Save e2i file with all matched students from the merge, regardless of missing test info.
#select columns ready for e2ivt_penn_e2i_usage_test_info = merge_vt_penn_usage_test_info %>% dplyr::select(student_id, grade, school, moy_math_comp, moy_math_date, moy_ela_comp,moy_ela_date, eoy_math_comp, eoy_math_date, eoy_ela_comp, eoy_ela_date, male, iep, ell, frpl, treat, race_A, race_B, race_H, race_Multi, race_NH, race_AI, race_W, both_math, both_ela, all_tests, math_moy_missing_eoy_present, ela_moy_missing_eoy_present, vt_acct_id, num_days_active,num_sessions_AI, num_sessions_human, num_classesviewed, num_essaysreviewed, num_AI_practiceproblemsessions, school_BOY_date, school_EOY_date, )#export e2i with test indicators, all matched students from the merge, regardless of missing test info (all_tests == 1 or 0)write.csv(vt_penn_e2i_usage_test_info, file ="vt_penn_e2i_all_students.csv", row.names =FALSE)
Save e2i file with only containing matched students from the merge with complete test info.
table(vt_penn_e2i_usage_test_info$all_tests)
0 1
85 1194
#subset: keep only students with complete test info (all_tests == 1)vt_penn_e2i_all_tests = vt_penn_e2i_usage_test_info %>%filter(all_tests ==1)print(vt_penn_e2i_all_tests) # N=1194
Count of treatment students among the subset with complete test info.
table(vt_penn_e2i_all_tests$treat)
0 1
765 429
#export e2i only containing matched students from the merge with complete test info (all_tests == 1)write.csv(vt_penn_e2i_all_tests, file ="vt_penn_e2i_complete_tests.csv", row.names =FALSE)