Data Processing: ALTER-Math, Seminole SD

Author

Alexis Davila

Clean Data

Import

Packages selected for inspecting and cleaning ALTER-Math, Seminole School District.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import roster. Observation: Comparing data columns to what was listed in the data request, the following variables were not received: Attendance rate, Num_session, TH_supplied, TH_opened, and Transfer (an important outcome variable) .

roster <- read_csv("ALTER-Math_Data_Mathematica.csv")
Rows: 1562 Columns: 33
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (5): Student_ID*, School_Name*, Teacher_ID*, Gender, Race/Ethnicity
dbl  (25): Student_Grade*, Treatment*, PM2 _Score*, PM3_Score*, ELL, FRPL, N...
lgl   (1): IEP
dttm  (2): PM2_Date*, PM3_Date*

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(roster)
 [1] "Student_ID*"                         "Student_Grade*"                     
 [3] "School_Name*"                        "Teacher_ID*"                        
 [5] "Treatment*"                          "PM2 _Score*"                        
 [7] "PM2_Date*"                           "PM3_Score*"                         
 [9] "PM3_Date*"                           "Gender"                             
[11] "IEP"                                 "ELL"                                
[13] "FRPL"                                "Race/Ethnicity"                     
[15] "Num_minutes"                         "Hints_supplied"                     
[17] "Hints_opened"                        "Hints_engaged"                      
[19] "STT_used"                            "STT_revised"                        
[21] "Conceptual_understanding"            "Procedural_fluency"                 
[23] "Prop_tutor_giving_explanation"       "Prop_tutor_giving_instruction"      
[25] "Prop_tutor_giving_answer"            "Prop_tutor_polite_expressions"      
[27] "Prop_tutor_praising_and_encouraging" "Prop_tutor_confusion"               
[29] "Prop_tutor_confirmatory_feedback"    "Prop_accepting_reject"              
[31] "Prop_commanding"                     "Total_messages"                     
[33] "Total_conversations"                

Change column headers for roster.

roster = roster %>%
  rename(
    student_id = "Student_ID*",
    teacher_id = "Teacher_ID*",
    grade = "Student_Grade*", 
    school = "School_Name*", 
    treat = "Treatment*", 
    
    moy_score = "PM2 _Score*", 
    moy_date = "PM2_Date*", 
    eoy_score = "PM3_Score*", 
    eoy_date = "PM3_Date*", 
    
    gender = "Gender", 
    iep = "IEP", 
    ell = "ELL", 
    frpl = "FRPL", 
    race = "Race/Ethnicity",
   
    num_min = "Num_minutes", 
    hint_supplied = "Hints_supplied", 
    hints_opened = "Hints_opened", 
    hints_engaged = "Hints_engaged",
    stt_used = "STT_used", stt_revised = "STT_revised", 
    concept_under = "Conceptual_understanding", 
    procedural_flu = "Procedural_fluency", 
    prop_tutor_give_explan = "Prop_tutor_giving_explanation",
    prop_tutor_give_instruct = "Prop_tutor_giving_instruction", 
    prop_tutor_give_answer = "Prop_tutor_giving_answer",
    prop_tutor_polite_expres = "Prop_tutor_polite_expressions", 
    prop_tutor_praise_encour = "Prop_tutor_praising_and_encouraging", 
    prop_tutor_confusion = "Prop_tutor_confusion",
    prop_tutor_confirm_feed = "Prop_tutor_confirmatory_feedback",
    prop_accept_reject = "Prop_accepting_reject", 
    prop_command = "Prop_commanding", 
    total_msg = "Total_messages",
    total_convers = "Total_conversations"  
  )
colnames(roster)
 [1] "student_id"               "grade"                   
 [3] "school"                   "teacher_id"              
 [5] "treat"                    "moy_score"               
 [7] "moy_date"                 "eoy_score"               
 [9] "eoy_date"                 "gender"                  
[11] "iep"                      "ell"                     
[13] "frpl"                     "race"                    
[15] "num_min"                  "hint_supplied"           
[17] "hints_opened"             "hints_engaged"           
[19] "stt_used"                 "stt_revised"             
[21] "concept_under"            "procedural_flu"          
[23] "prop_tutor_give_explan"   "prop_tutor_give_instruct"
[25] "prop_tutor_give_answer"   "prop_tutor_polite_expres"
[27] "prop_tutor_praise_encour" "prop_tutor_confusion"    
[29] "prop_tutor_confirm_feed"  "prop_accept_reject"      
[31] "prop_command"             "total_msg"               
[33] "total_convers"           

Inspect duplicates

Identify duplicate student records within roster file.

# frequency table of student_id occurrences
id_counts_roster = roster %>%
  group_by(student_id) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
  filter(n > 1)

# view results
print(dupes_roster)
# A tibble: 0 × 2
# ℹ 2 variables: student_id <chr>, n <int>

No duplicates to report.

Missing Data Inspections

Percentage missing for each variable. Observations: IEP status is missing for all students. Many treatment-related variables are missing, as expected (repeating proportion of 52.11%); this is a merged file with treated and non-treated students. All students appears to have complete test data and demographics, including teacher ID.

missing_summary = roster %>%
  summarise(across(
    everything(),
    ~ mean(is.na(.)) * 100
  )) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "percent_missing"
  ) %>%
  arrange(desc(percent_missing))

print(missing_summary)
# A tibble: 33 × 2
   variable               percent_missing
   <chr>                            <dbl>
 1 iep                              100  
 2 num_min                           52.1
 3 hint_supplied                     52.1
 4 hints_opened                      52.1
 5 hints_engaged                     52.1
 6 stt_used                          52.1
 7 stt_revised                       52.1
 8 concept_under                     52.1
 9 procedural_flu                    52.1
10 prop_tutor_give_explan            52.1
# ℹ 23 more rows
write.csv(missing_summary, file = "alter_sem_missing_summary.csv", row.names = FALSE)

Percentage of student who have both MOY and EOY data. All students across all schools and all treatment groups have all test scores in. No need to create dummies for missing test info.

#indicator for having BOTH MOY and EOY scores
roster = roster %>%
  mutate(
    all_math = !is.na(moy_score) & !is.na(eoy_score)
  )

#pct across schools
overall_pct = roster %>%
  summarise(
    total_students = n(),
    
    students_with_all_tests = sum(all_math),
    percent_with_all_tests = (students_with_all_tests / total_students) * 100
  ) %>%
  mutate(percent_with_all_tests = round(percent_with_all_tests, 2))

print(overall_pct)
# A tibble: 1 × 3
  total_students students_with_all_tests percent_with_all_tests
           <int>                   <int>                  <dbl>
1           1562                    1562                    100

Data Summary

Count of treatment students versus non-treated student.

table(roster$treat)

  0   1 
814 748 

Generate school-level means for MOY assessment data, number of students, plus distribution across demographics.

#create function to streamline calculations
calc_pct = function(x) {
  prop.table(table(x)) * 100
}

#school-level means and counts for TEST data, including attend rate avg
roster = roster %>%
  mutate(across(c(moy_score, #transform all scores to numeric value
                  eoy_score), as.numeric))

school_means = roster %>%
  group_by(school)%>%
  summarise(
    n_students = n(),
    mean_moy_math = mean(moy_score, na.rm = TRUE),
    mean_eoy_math = mean(eoy_score, na.rm = TRUE)
  )

# DEMOGRAPHICS

# race & ethnicity
race_pct = roster %>%
  group_by(school, race) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = race,
    values_from = pct,
    names_prefix = "race_"
  )


#gender
gender_pct = roster %>%
  group_by(school, gender) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = gender,
    values_from = pct,
    names_prefix = "gender_"
  )

#frpl
frpl_pct = roster %>%
  group_by(school, frpl) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = frpl,
    values_from = pct,
    names_prefix = "frpl_"
  )

#ell
ell_pct = roster %>%
  group_by(school, ell) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = ell,
    values_from = pct,
    names_prefix = "ell_"
  )


#iep students
iep_pct = roster %>%
  group_by(school, iep) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(school) %>%
  mutate(pct = (n / sum(n)) * 100) %>%
  select(-n) %>%
  pivot_wider(
    names_from = iep,
    values_from = pct,
    names_prefix = "iep_"
  )

# COMBINE summaries
school_summary = school_means %>%
  left_join(race_pct,   by = "school") %>%
  left_join(gender_pct, by = "school") %>%
  left_join(frpl_pct,   by = "school") %>%
  left_join(ell_pct,    by = "school") %>%
  left_join(iep_pct, by = "school")

school_summary = school_summary %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))

print(school_summary)
# A tibble: 17 × 18
   school           n_students mean_moy_math mean_eoy_math race_Asian race_Black
   <chr>                 <dbl>         <dbl>         <dbl>      <dbl>      <dbl>
 1 Casselberry Ele…         86          237.          252.      10.5        1.16
 2 English Estates…         48          235.          255.      18.8       14.6 
 3 Goldsboro Eleme…         48          238.          250.      12.5        2.08
 4 Greenwood Lakes…        167          239.          248.       2.99      13.2 
 5 Highlands Eleme…         53          236.          255.      30.2       11.3 
 6 Keeth Elementar…         47          237.          247.      25.5        8.51
 7 Lake Mary Eleme…        112          233.          251.       6.25       6.25
 8 Midway Elementa…        143          237.          250       18.9       18.9 
 9 Millennium Midd…        194          239.          244.       3.61      23.7 
10 Partin Elementa…         35          236.          249.      22.9        8.57
11 Pine Crest Elem…         33          239.          248.      12.1        3.03
12 Rainbow Element…        125          240.          254.       1.6       10.4 
13 Red Bug Element…         67          236.          247.       5.97      NA   
14 Sterling Park E…         44          234.          253.       4.55      25   
15 Teague Middle S…        241          236.          239.       4.15      18.7 
16 Wekiva Elementa…         55          238.          257.      30.9        3.64
17 Wilson Elementa…         64          237.          247.       3.12       3.12
# ℹ 12 more variables: race_Hispanic <dbl>,
#   `race_Native Hawaiian/Pacific Islander` <dbl>,
#   `race_Two or More Races` <dbl>, race_White <dbl>,
#   `race_American Indian/Alaska Native` <dbl>, gender_Female <dbl>,
#   gender_Male <dbl>, frpl_0 <dbl>, frpl_1 <dbl>, ell_0 <dbl>, ell_1 <dbl>,
#   iep_NA <dbl>
write.csv(school_summary, "test_summary_by_school_alter_sem.csv", row.names = FALSE)

Reformat Data

Create dummy variables

Create numeric binary for gender.

table(roster$gender)

Female   Male 
   722    840 
roster = roster %>%
  mutate(
    male = case_when(
      gender == "Male" ~ 1,
      gender == "Female" ~ 0,
      TRUE ~ NA_real_   # handles missing or unexpected values
    )
  )
table(roster$male)

  0   1 
722 840 

Create numeric binaries for each race.

table(roster$race)

   American Indian/Alaska Native                            Asian 
                              11                              147 
                           Black                         Hispanic 
                             198                              454 
Native Hawaiian/Pacific Islander                Two or More Races 
                               2                              114 
                           White 
                             636 
roster = roster %>%
  mutate(
    race_AI     = ifelse(race == "American Indian/Alaska Native", 1, ifelse(is.na(race), NA, 0)),
    race_A     = ifelse(race == "Asian", 1, ifelse(is.na(race), NA, 0)),
    race_B     = ifelse(race == "Black", 1, ifelse(is.na(race), NA, 0)),
    race_H    = ifelse(race == "Hispanic", 1, ifelse(is.na(race), NA, 0)),
    race_NH     = ifelse(race == "Native Hawaiian/Pacific Islander", 1, ifelse(is.na(race), NA, 0)),
    race_Multi =  ifelse(race == "Two or More Races", 1, ifelse(is.na(race), NA, 0)),
    race_W     = ifelse(race == "White", 1, ifelse(is.na(race), NA, 0))
  )
table(roster$race_B) #quick check coding on race_B

   0    1 
1364  198 

Export files for e2i Coach

Save e2i file with all students but only select the columns needed for analysis.

#select columns ready for e2i
roster_e2i = roster %>%
  dplyr::select(student_id, 
                teacher_id,
                grade, 
                school,
                moy_score, moy_date,
                eoy_score, eoy_date,
                male, iep, ell, frpl,
                treat,
                race_AI, race_A, race_B, race_H, race_NH, race_Multi, race_W,
                
                num_min,
                hint_supplied, hints_opened, hints_engaged,
                stt_used, stt_revised, 
                concept_under, procedural_flu, 
                prop_tutor_give_explan,
                prop_tutor_give_instruct, 
                prop_tutor_give_answer,
                prop_tutor_polite_expres, 
                prop_tutor_praise_encour, 
                prop_tutor_confusion,
                prop_tutor_confirm_feed,
                prop_accept_reject, 
                prop_command, 
                total_msg,total_convers
                )

#export e2i
write.csv(roster_e2i, file = "alter_sem_all_students_complete_tests.csv", row.names = FALSE)

Since all student have complete test info, there’s no need to save e2i file with only containing students with complete test info.

table(roster$all_math)

TRUE 
1562