This script starts with the New_Mexico_SGP_LONG_Data_2024.RData object generated with the SGP package.
#clear the global environment
rm(list = ls())
Load data
#load data object
load("Data/New_Mexico_SGP_LONG_Data_2024.RData")
Define dataframe
data <- get("New_Mexico_SGP_LONG_Data_2024")
It then runs a sequence of functions to group and summarize by the subgroups required:
1.All
2.Female
3.Male
4.Hispanic
5.White
6.Black
7.Asian
8.Native
9.Multirace
10.FRL
11.NotFRL
12.DirecrCert
13.NotDirectCert
14.SwD
15.NotSwD
16.EL
17.NotEL
18.Migrant
19.Homeless
20.Military
21.Foster
22.AnyNative
23.AnyBlack
Statewide numerators, denominators, and rates.
District-level numerators, denominators, and rates.
School-level numerators, denominators, and rates.
View all variables
data <- data[1:34]
names(data)
## [1] "VALID_CASE" "CONTENT_AREA"
## [3] "YEAR" "ID"
## [5] "GRADE_ENROLLED" "SCALE_SCORE"
## [7] "ACHIEVEMENT_LEVEL" "GRADE"
## [9] "SCHOOL_NUMBER" "AccountabilityModel"
## [11] "ExclAccountabiltyReason" "Test"
## [13] "Hispanic.4" "White.5"
## [15] "Black.6" "Asian.7"
## [17] "Native.8" "Multirace.9"
## [19] "FRL.10" "NotFRL.11"
## [21] "DirectCert.12" "NotDirectCert.13"
## [23] "SwD.14" "NotSwD.15"
## [25] "EL.16" "NotEL.17"
## [27] "Migrant.18" "Homeless.19"
## [29] "Military.20" "Any_Native.22"
## [31] "Any_Black.23" "Foster.21"
## [33] "GENDER" "SGP"
convert factors to integers
cols_to_update <- colnames(data)[13:32]
for (col in cols_to_update) {
data[[col]] <- as.integer(data[[col]])
if (!all(is.na(data[[col]])) && max(data[[col]], na.rm = TRUE) > 1) {
data[[col]] <- data[[col]] - 1
}
}
Count NAs. This list will point to demographic variables with missing data to fill in.
# Calculate the sum of NAs for each variable and filter out variables with no NAs
na_summary <- sapply(data, function(x) sum(is.na(x)))
# Filter to keep only variables where NA count is greater than 0
na_summary <- na_summary[na_summary > 0]
# Display the resulting filtered dataframe
print(na_summary)
## SCHOOL_NUMBER Hispanic.4 White.5 Black.6
## 13471 21519 1711 1711
## Asian.7 Native.8 Multirace.9 FRL.10
## 1711 1711 1711 33567
## NotFRL.11 DirectCert.12 NotDirectCert.13 SwD.14
## 33567 1711 1711 1711
## NotSwD.15 EL.16 NotEL.17 Migrant.18
## 1711 68022 68022 1711
## Homeless.19 Military.20 Any_Native.22 Any_Black.23
## 1711 1711 263527 263527
## Foster.21 GENDER SGP
## 263527 1943 57442
data$Female.2 <- ifelse(data$GENDER == "Female", 1, 0)
data$Male.3 <- ifelse(data$GENDER == "Male", 1, 0)
data <- data %>%
relocate(Female.2, Male.3, .after = Test)
#examine var counts
table(data$GENDER, data$Female.2, useNA = "always")
##
## 0 1 <NA>
## Female 0 128436 0
## Male 133148 0 0
## <NA> 0 0 1943
table(data$GENDER, data$AccountabilityModel, useNA = "always")
##
## 0 1 <NA>
## Female 5158 123278 0
## Male 5039 128109 0
## <NA> 1872 71 0
data <- data %>%
select(-GENDER)
Replace the following with values matching ID and StID from AVE
#before
na_count.1 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count.1
## Female.2 Male.3 Hispanic.4 White.5
## 1943 1943 21519 1711
## Black.6 Asian.7 Native.8 Multirace.9
## 1711 1711 1711 1711
## FRL.10 NotFRL.11 DirectCert.12 NotDirectCert.13
## 33567 33567 1711 1711
## SwD.14 NotSwD.15 EL.16 NotEL.17
## 1711 1711 68022 68022
## Migrant.18 Homeless.19 Military.20 Any_Native.22
## 1711 1711 1711 263527
## Any_Black.23 Foster.21
## 263527 263527
ave24 <- read_csv("SY 2023-24, AVE, All Demographic and Attendance Records, V20240924 SH.csv")
## New names:
## Rows: 336242 Columns: 62
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (22): SchoolYear, DISTRICT_NAME, LOCATION_NAME, STUDENT_FIRST_NM, STUDEN... dbl
## (38): ...1, STUDENT_ID, Disnum, LOCATION_ID, Schnum, SY2023_SchoolLevel_... lgl
## (2): RACE4_CODE, RACE5_CODE
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
cols_to_update <- colnames(data)[13:34]
# Loop through the column names and update the values in data from ave24
for (col in cols_to_update) {
data[[col]] <- ifelse(
!is.na(data[[col]]),
data[[col]],
ave24[[col]][match(data$ID, ave24$STUDENT_ID)]
)
}
#after
na_count.2 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count <- cbind(na_count.1, na_count.2)
na_count
## na_count.1 na_count.2
## Female.2 1943 1466
## Male.3 1943 1466
## Hispanic.4 21519 2417
## White.5 1711 1309
## Black.6 1711 1309
## Asian.7 1711 1309
## Native.8 1711 1309
## Multirace.9 1711 1309
## FRL.10 33567 3695
## NotFRL.11 33567 3695
## DirectCert.12 1711 1309
## NotDirectCert.13 1711 1309
## SwD.14 1711 1309
## NotSwD.15 1711 1309
## EL.16 68022 5356
## NotEL.17 68022 5356
## Migrant.18 1711 1309
## Homeless.19 1711 1309
## Military.20 1711 1309
## Any_Native.22 263527 13564
## Any_Black.23 263527 13564
## Foster.21 263527 13564
Keep NAs only for Female.2, Male.3 and race categories.
# List of columns where you want to replace NAs with 0
cols_to_update <- c("Hispanic.4","FRL.10","NotFRL.11", "DirectCert.12","NotDirectCert.13", "SwD.14", "NotSwD.15","EL.16","NotEL.17", "Migrant.18","Homeless.19","Military.20","Foster.21")
# Replace NAs with 0 in the specified columns
# Loop through each column in the cols_to_update vector
for (col in cols_to_update) {
# Replace NA with 0 in the current column
data[[col]] <- ifelse(is.na(data[[col]]), 0, data[[col]])
}
#final
na_count.3 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count <- cbind(na_count.1, na_count.2, na_count.3)
na_count
## na_count.1 na_count.2 na_count.3
## Female.2 1943 1466 1466
## Male.3 1943 1466 1466
## Hispanic.4 21519 2417 0
## White.5 1711 1309 1309
## Black.6 1711 1309 1309
## Asian.7 1711 1309 1309
## Native.8 1711 1309 1309
## Multirace.9 1711 1309 1309
## FRL.10 33567 3695 0
## NotFRL.11 33567 3695 0
## DirectCert.12 1711 1309 0
## NotDirectCert.13 1711 1309 0
## SwD.14 1711 1309 0
## NotSwD.15 1711 1309 0
## EL.16 68022 5356 0
## NotEL.17 68022 5356 0
## Migrant.18 1711 1309 0
## Homeless.19 1711 1309 0
## Military.20 1711 1309 0
## Any_Native.22 263527 13564 13564
## Any_Black.23 263527 13564 13564
## Foster.21 263527 13564 0
#drop records with missing SCHOOL_NUMBER
data <- data[!is.na(data$SCHOOL_NUMBER),]
#add a district code
data$DISTCODE <- floor(data$SCHOOL_NUMBER/1000)
#add a statewide code
data$STATEWIDE <- 999999
#move these columns in front of SCHOOL_NUMBER
data <- data %>%
relocate(STATEWIDE, DISTCODE, .after = "GRADE")
#a function to call to summarize at level
summarize_level <- function(level_var) { # Step 1: Summarize by level_var and CONTENT_AREA
level_summary_df <- data %>%
group_by(.data[[level_var]], CONTENT_AREA) %>%
summarize(
median_SGP = median(SGP, na.rm = TRUE),
count = n(),
.groups = 'drop'
) %>%
mutate(group_value = "All.1", # Store the primary grouping value
level_value = .data[[level_var]], # Store the level variable value
group = "All.1", # Store the grouping variable name
level = level_var, # Store the level variable name
content = CONTENT_AREA) %>% # Store the content area
select(level, level_value, content, group, group_value, count, median_SGP)
return(level_summary_df)
}
#a function to call to summarize at level and group
summarize_group <- function(level_var, group_var) { # Step 1: Summarize by level_var, group_var, and CONTENT_AREA
group_summary_df <- data %>%
group_by(.data[[level_var]], .data[[group_var]], CONTENT_AREA) %>%
summarize(
median_SGP = median(SGP, na.rm = TRUE),
count = n(),
.groups = 'drop'
) %>%
mutate(group_value = .data[[group_var]], # Store the primary grouping value
level_value = .data[[level_var]], # Store the level variable value
group = group_var, # Store the grouping variable name
level = level_var, # Store the level variable name
content = CONTENT_AREA) %>% # Store the content area
select(level, level_value, content, group, group_value, count, median_SGP)
return(group_summary_df)
}
#a list of grouping variables to use at all levels
grouping_vars <- names(data[, 15:36])
#a list of levels
level_vars <- c("STATEWIDE", "DISTCODE", "SCHOOL_NUMBER")
growth <- data.frame() #initialize the df for growth
#loop through levels first to calculate the 1.All counts and median SGPs
for (lv in level_vars) {
df_level <- summarize_level(lv)
#loop through groups at each level for counts and median SGPs
df_group <- do.call(rbind, lapply(grouping_vars, function(var) summarize_group(lv, var)))
#put them all together into one df
growth <- rbind(growth, df_level, df_group)
}
#remove the group value = 0 counts and SGPs for all groups and levels
#this is a reduntant "not" category of the group
#keep the group value is NA counts and SGPS for those groups that retained NA
growth <- growth %>%
filter(group_value != 0 | is.na(group_value))
#add "NA." to the beginning of the group identifier with NAs as group value
growth$group <- ifelse(is.na(growth$group_value), paste0("NA.",growth$group), growth$group)
growth_long <- growth %>%
#remove the redundant group_value variable now that it's been added to group variable
select(-group_value) %>%
#change the names to match conventions
rename(
Level = level,
SchNumb = level_value,
Subgroup = group,
Subject = content,
"N" = count,
"Median SGP" = median_SGP
)
#Sorting columns correctly in the wide format presents a problem without changing the format of the col names
# a function to extract number, pad it with zeros, and move it to the front of the group name
replace_subgroup <- function(x) {
parts <- regmatches(x, regexec("(.*)\\.(\\d+)$", x))[[1]] # Extract parts (text and number)
if (length(parts) > 1) {
sprintf("%02d.%s", as.numeric(parts[3]), parts[2]) # Format as "number.text"
} else {
x # Return the original value if no match is found (no period or number)
}
}
# modify the Subgroup variable by applying the function
growth_long$Subgroup <- sapply(growth_long$Subgroup, replace_subgroup)
#wide format provided to each district/charter school for embargo and used in index calculations
growth_wide <- growth_long %>%
pivot_wider(
names_from = Subgroup,
values_from = c("Median SGP", "N"),
names_glue = "{Subgroup} {.value}"
) %>%
select(1:3, sort(names(.)[4:57]))
names(growth_wide)
## [1] "Level" "SchNumb"
## [3] "Subject" "01.All Median SGP"
## [5] "01.All N" "02.Female Median SGP"
## [7] "02.Female N" "02.NA.Female Median SGP"
## [9] "02.NA.Female N" "03.Male Median SGP"
## [11] "03.Male N" "03.NA.Male Median SGP"
## [13] "03.NA.Male N" "04.Hispanic Median SGP"
## [15] "04.Hispanic N" "05.White Median SGP"
## [17] "05.White N" "06.Black Median SGP"
## [19] "06.Black N" "07.Asian Median SGP"
## [21] "07.Asian N" "08.Native Median SGP"
## [23] "08.Native N" "09.Multirace Median SGP"
## [25] "09.Multirace N" "10.FRL Median SGP"
## [27] "10.FRL N" "11.NotFRL Median SGP"
## [29] "11.NotFRL N" "12.DirectCert Median SGP"
## [31] "12.DirectCert N" "13.NotDirectCert Median SGP"
## [33] "13.NotDirectCert N" "14.SwD Median SGP"
## [35] "14.SwD N" "15.NotSwD Median SGP"
## [37] "15.NotSwD N" "16.EL Median SGP"
## [39] "16.EL N" "17.NotEL Median SGP"
## [41] "17.NotEL N" "18.Migrant Median SGP"
## [43] "18.Migrant N" "19.Homeless Median SGP"
## [45] "19.Homeless N" "20.Military Median SGP"
## [47] "20.Military N" "21.Foster Median SGP"
## [49] "21.Foster N" "22.Any_Native Median SGP"
## [51] "22.Any_Native N" "22.NA.Any_Native Median SGP"
## [53] "22.NA.Any_Native N" "23.Any_Black Median SGP"
## [55] "23.Any_Black N" "23.NA.Any_Black Median SGP"
## [57] "23.NA.Any_Black N"
##Write as .csv and save as .Rdata
growth_wide[] <- lapply(growth_wide, as.character)
write.csv(growth_wide, "OUT/AMD SY 2023-24, Academic Growth, 20240926 SH.csv", row.names = FALSE)
save(growth_wide, file = "OUT/AMD SY 2023-24, Academic Growth, 20240926 SH.RData")
####prepare data for individual report to district and schools
data_out <- lapply(data, as.character)
write.csv(data_out, "OUT/SY 2023-24, AMD, Academic Growth Student-Level All, 20240926 SH.csv")