Clean Data
Import
Packages selected for inspecting and cleaning ALTER-Math, Seminole School District.
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.2.1 ✔ readr 2.2.0
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.3 ✔ tibble 3.3.1
✔ lubridate 1.9.5 ✔ tidyr 1.3.2
✔ purrr 1.2.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Import roster. Observation : Comparing data columns to what was listed in the data request, the following variables were not received: Attendance rate, Num_session, TH_supplied, TH_opened, and Transfer (an important outcome variable) .
roster <- read_csv ("ALTER-Math_Data_Mathematica.csv" )
Rows: 1562 Columns: 33
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Student_ID*, School_Name*, Teacher_ID*, Gender, Race/Ethnicity
dbl (25): Student_Grade*, Treatment*, PM2 _Score*, PM3_Score*, ELL, FRPL, N...
lgl (1): IEP
dttm (2): PM2_Date*, PM3_Date*
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] "Student_ID*" "Student_Grade*"
[3] "School_Name*" "Teacher_ID*"
[5] "Treatment*" "PM2 _Score*"
[7] "PM2_Date*" "PM3_Score*"
[9] "PM3_Date*" "Gender"
[11] "IEP" "ELL"
[13] "FRPL" "Race/Ethnicity"
[15] "Num_minutes" "Hints_supplied"
[17] "Hints_opened" "Hints_engaged"
[19] "STT_used" "STT_revised"
[21] "Conceptual_understanding" "Procedural_fluency"
[23] "Prop_tutor_giving_explanation" "Prop_tutor_giving_instruction"
[25] "Prop_tutor_giving_answer" "Prop_tutor_polite_expressions"
[27] "Prop_tutor_praising_and_encouraging" "Prop_tutor_confusion"
[29] "Prop_tutor_confirmatory_feedback" "Prop_accepting_reject"
[31] "Prop_commanding" "Total_messages"
[33] "Total_conversations"
Change column headers for roster.
roster = roster %>%
rename (
student_id = "Student_ID*" ,
teacher_id = "Teacher_ID*" ,
grade = "Student_Grade*" ,
school = "School_Name*" ,
treat = "Treatment*" ,
moy_score = "PM2 _Score*" ,
moy_date = "PM2_Date*" ,
eoy_score = "PM3_Score*" ,
eoy_date = "PM3_Date*" ,
gender = "Gender" ,
iep = "IEP" ,
ell = "ELL" ,
frpl = "FRPL" ,
race = "Race/Ethnicity" ,
num_min = "Num_minutes" ,
hint_supplied = "Hints_supplied" ,
hints_opened = "Hints_opened" ,
hints_engaged = "Hints_engaged" ,
stt_used = "STT_used" , stt_revised = "STT_revised" ,
concept_under = "Conceptual_understanding" ,
procedural_flu = "Procedural_fluency" ,
prop_tutor_give_explan = "Prop_tutor_giving_explanation" ,
prop_tutor_give_instruct = "Prop_tutor_giving_instruction" ,
prop_tutor_give_answer = "Prop_tutor_giving_answer" ,
prop_tutor_polite_expres = "Prop_tutor_polite_expressions" ,
prop_tutor_praise_encour = "Prop_tutor_praising_and_encouraging" ,
prop_tutor_confusion = "Prop_tutor_confusion" ,
prop_tutor_confirm_feed = "Prop_tutor_confirmatory_feedback" ,
prop_accept_reject = "Prop_accepting_reject" ,
prop_command = "Prop_commanding" ,
total_msg = "Total_messages" ,
total_convers = "Total_conversations"
)
colnames (roster)
[1] "student_id" "grade"
[3] "school" "teacher_id"
[5] "treat" "moy_score"
[7] "moy_date" "eoy_score"
[9] "eoy_date" "gender"
[11] "iep" "ell"
[13] "frpl" "race"
[15] "num_min" "hint_supplied"
[17] "hints_opened" "hints_engaged"
[19] "stt_used" "stt_revised"
[21] "concept_under" "procedural_flu"
[23] "prop_tutor_give_explan" "prop_tutor_give_instruct"
[25] "prop_tutor_give_answer" "prop_tutor_polite_expres"
[27] "prop_tutor_praise_encour" "prop_tutor_confusion"
[29] "prop_tutor_confirm_feed" "prop_accept_reject"
[31] "prop_command" "total_msg"
[33] "total_convers"
Inspect duplicates
Identify duplicate student records within roster file.
# frequency table of student_id occurrences
id_counts_roster = roster %>%
group_by (student_id) %>%
summarise (n = n ()) %>%
arrange (desc (n))
# filter only IDs that appear > 1
dupes_roster = id_counts_roster %>%
filter (n > 1 )
# view results
print (dupes_roster)
# A tibble: 0 × 2
# ℹ 2 variables: student_id <chr>, n <int>
No duplicates to report.
Missing Data Inspections
Percentage missing for each variable. Observations: IEP status is missing for all students. Many treatment-related variables are missing, as expected (repeating proportion of 52.11%); this is a merged file with treated and non-treated students. All students appears to have complete test data and demographics, including teacher ID.
missing_summary = roster %>%
summarise (across (
everything (),
~ mean (is.na (.)) * 100
)) %>%
pivot_longer (
cols = everything (),
names_to = "variable" ,
values_to = "percent_missing"
) %>%
arrange (desc (percent_missing))
print (missing_summary)
# A tibble: 33 × 2
variable percent_missing
<chr> <dbl>
1 iep 100
2 num_min 52.1
3 hint_supplied 52.1
4 hints_opened 52.1
5 hints_engaged 52.1
6 stt_used 52.1
7 stt_revised 52.1
8 concept_under 52.1
9 procedural_flu 52.1
10 prop_tutor_give_explan 52.1
# ℹ 23 more rows
write.csv (missing_summary, file = "alter_sem_missing_summary.csv" , row.names = FALSE )
Percentage of student who have both MOY and EOY data . All students across all schools and all treatment groups have all test scores in. No need to create dummies for missing test info.
#indicator for having BOTH MOY and EOY scores
roster = roster %>%
mutate (
all_math = ! is.na (moy_score) & ! is.na (eoy_score)
)
#pct across schools
overall_pct = roster %>%
summarise (
total_students = n (),
students_with_all_tests = sum (all_math),
percent_with_all_tests = (students_with_all_tests / total_students) * 100
) %>%
mutate (percent_with_all_tests = round (percent_with_all_tests, 2 ))
print (overall_pct)
# A tibble: 1 × 3
total_students students_with_all_tests percent_with_all_tests
<int> <int> <dbl>
1 1562 1562 100
Data Summary
Count of treatment students versus non-treated student.
Generate school-level means for MOY assessment data, number of students, plus distribution across demographics.
#create function to streamline calculations
calc_pct = function (x) {
prop.table (table (x)) * 100
}
#school-level means and counts for TEST data, including attend rate avg
roster = roster %>%
mutate (across (c (moy_score, #transform all scores to numeric value
eoy_score), as.numeric))
school_means = roster %>%
group_by (school)%>%
summarise (
n_students = n (),
mean_moy_math = mean (moy_score, na.rm = TRUE ),
mean_eoy_math = mean (eoy_score, na.rm = TRUE )
)
# DEMOGRAPHICS
# race & ethnicity
race_pct = roster %>%
group_by (school, race) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = race,
values_from = pct,
names_prefix = "race_"
)
#gender
gender_pct = roster %>%
group_by (school, gender) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = gender,
values_from = pct,
names_prefix = "gender_"
)
#frpl
frpl_pct = roster %>%
group_by (school, frpl) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = frpl,
values_from = pct,
names_prefix = "frpl_"
)
#ell
ell_pct = roster %>%
group_by (school, ell) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = ell,
values_from = pct,
names_prefix = "ell_"
)
#iep students
iep_pct = roster %>%
group_by (school, iep) %>%
summarise (n = n (), .groups = "drop" ) %>%
group_by (school) %>%
mutate (pct = (n / sum (n)) * 100 ) %>%
select (- n) %>%
pivot_wider (
names_from = iep,
values_from = pct,
names_prefix = "iep_"
)
# COMBINE summaries
school_summary = school_means %>%
left_join (race_pct, by = "school" ) %>%
left_join (gender_pct, by = "school" ) %>%
left_join (frpl_pct, by = "school" ) %>%
left_join (ell_pct, by = "school" ) %>%
left_join (iep_pct, by = "school" )
school_summary = school_summary %>%
mutate (across (where (is.numeric), ~ round (.x, 2 )))
print (school_summary)
# A tibble: 17 × 18
school n_students mean_moy_math mean_eoy_math race_Asian race_Black
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Casselberry Ele… 86 237. 252. 10.5 1.16
2 English Estates… 48 235. 255. 18.8 14.6
3 Goldsboro Eleme… 48 238. 250. 12.5 2.08
4 Greenwood Lakes… 167 239. 248. 2.99 13.2
5 Highlands Eleme… 53 236. 255. 30.2 11.3
6 Keeth Elementar… 47 237. 247. 25.5 8.51
7 Lake Mary Eleme… 112 233. 251. 6.25 6.25
8 Midway Elementa… 143 237. 250 18.9 18.9
9 Millennium Midd… 194 239. 244. 3.61 23.7
10 Partin Elementa… 35 236. 249. 22.9 8.57
11 Pine Crest Elem… 33 239. 248. 12.1 3.03
12 Rainbow Element… 125 240. 254. 1.6 10.4
13 Red Bug Element… 67 236. 247. 5.97 NA
14 Sterling Park E… 44 234. 253. 4.55 25
15 Teague Middle S… 241 236. 239. 4.15 18.7
16 Wekiva Elementa… 55 238. 257. 30.9 3.64
17 Wilson Elementa… 64 237. 247. 3.12 3.12
# ℹ 12 more variables: race_Hispanic <dbl>,
# `race_Native Hawaiian/Pacific Islander` <dbl>,
# `race_Two or More Races` <dbl>, race_White <dbl>,
# `race_American Indian/Alaska Native` <dbl>, gender_Female <dbl>,
# gender_Male <dbl>, frpl_0 <dbl>, frpl_1 <dbl>, ell_0 <dbl>, ell_1 <dbl>,
# iep_NA <dbl>
write.csv (school_summary, "test_summary_by_school_alter_sem.csv" , row.names = FALSE )
Export files for e2i Coach
Save e2i file with all students but only select the columns needed for analysis.
#select columns ready for e2i
roster_e2i = roster %>%
dplyr:: select (student_id,
teacher_id,
grade,
school,
moy_score, moy_date,
eoy_score, eoy_date,
male, iep, ell, frpl,
treat,
race_AI, race_A, race_B, race_H, race_NH, race_Multi, race_W,
num_min,
hint_supplied, hints_opened, hints_engaged,
stt_used, stt_revised,
concept_under, procedural_flu,
prop_tutor_give_explan,
prop_tutor_give_instruct,
prop_tutor_give_answer,
prop_tutor_polite_expres,
prop_tutor_praise_encour,
prop_tutor_confusion,
prop_tutor_confirm_feed,
prop_accept_reject,
prop_command,
total_msg,total_convers
)
#export e2i
write.csv (roster_e2i, file = "alter_sem_all_students_complete_tests.csv" , row.names = FALSE )
Since all student have complete test info, there’s no need to save e2i file with only containing students with complete test info.