SGP NM by Level, Group, Content Area

Prepare

This script starts with the New_Mexico_SGP_LONG_Data_2024.RData object generated with the SGP package.

#clear the global environment
rm(list = ls())

Load the necessary packages

Load data

#load data object
load("Data/New_Mexico_SGP_LONG_Data_2024.RData")

Define dataframe

data <- get("New_Mexico_SGP_LONG_Data_2024")

It then runs a sequence of functions to group and summarize by the subgroups required:

Reporting Requirements

Each file that reports on student subgroups should include the following groups:

1.All
2.Female
3.Male
4.Hispanic
5.White
6.Black
7.Asian
8.Native
9.Multirace
10.FRL
11.NotFRL
12.DirecrCert
13.NotDirectCert
14.SwD
15.NotSwD
16.EL
17.NotEL
18.Migrant
19.Homeless
20.Military
21.Foster
22.AnyNative
23.AnyBlack

All indicators for which data is available must include:

Statewide numerators, denominators, and rates.
District-level numerators, denominators, and rates.
School-level numerators, denominators, and rates.

Content areas include: - ELA - MATHEMATICS

Start by examining the grouping variables.

View all variables

data <- data[1:34]
names(data)

##  [1] "VALID_CASE"              "CONTENT_AREA"           
##  [3] "YEAR"                    "ID"                     
##  [5] "GRADE_ENROLLED"          "SCALE_SCORE"            
##  [7] "ACHIEVEMENT_LEVEL"       "GRADE"                  
##  [9] "SCHOOL_NUMBER"           "AccountabilityModel"    
## [11] "ExclAccountabiltyReason" "Test"                   
## [13] "Hispanic.4"              "White.5"                
## [15] "Black.6"                 "Asian.7"                
## [17] "Native.8"                "Multirace.9"            
## [19] "FRL.10"                  "NotFRL.11"              
## [21] "DirectCert.12"           "NotDirectCert.13"       
## [23] "SwD.14"                  "NotSwD.15"              
## [25] "EL.16"                   "NotEL.17"               
## [27] "Migrant.18"              "Homeless.19"            
## [29] "Military.20"             "Any_Native.22"          
## [31] "Any_Black.23"            "Foster.21"              
## [33] "GENDER"                  "SGP"

convert factors to integers

cols_to_update <- colnames(data)[13:32]

for (col in cols_to_update) {
  data[[col]] <- as.integer(data[[col]])
  if (!all(is.na(data[[col]])) && max(data[[col]], na.rm = TRUE) > 1) {
    data[[col]] <- data[[col]] - 1
  }
}

Count NAs. This list will point to demographic variables with missing data to fill in.

# Calculate the sum of NAs for each variable and filter out variables with no NAs
na_summary <- sapply(data, function(x) sum(is.na(x)))

# Filter to keep only variables where NA count is greater than 0
na_summary <- na_summary[na_summary > 0]

# Display the resulting filtered dataframe
print(na_summary)

##    SCHOOL_NUMBER       Hispanic.4          White.5          Black.6 
##            13471            21519             1711             1711 
##          Asian.7         Native.8      Multirace.9           FRL.10 
##             1711             1711             1711            33567 
##        NotFRL.11    DirectCert.12 NotDirectCert.13           SwD.14 
##            33567             1711             1711             1711 
##        NotSwD.15            EL.16         NotEL.17       Migrant.18 
##             1711            68022            68022             1711 
##      Homeless.19      Military.20    Any_Native.22     Any_Black.23 
##             1711             1711           263527           263527 
##        Foster.21           GENDER              SGP 
##           263527             1943            57442

Transform GENDER into Female & Male dichotomous variables

data$Female.2 <- ifelse(data$GENDER == "Female", 1, 0)
data$Male.3 <- ifelse(data$GENDER == "Male", 1, 0)

data <- data %>%
  relocate(Female.2, Male.3, .after = Test)

#examine var counts
table(data$GENDER, data$Female.2, useNA = "always")

##         
##               0      1   <NA>
##   Female      0 128436      0
##   Male   133148      0      0
##   <NA>        0      0   1943

table(data$GENDER, data$AccountabilityModel, useNA = "always")

##         
##               0      1   <NA>
##   Female   5158 123278      0
##   Male     5039 128109      0
##   <NA>     1872     71      0

data <- data %>%
  select(-GENDER)

Merge final AVE demographics

Replace the following with values matching ID and StID from AVE

#before

na_count.1 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count.1

##         Female.2           Male.3       Hispanic.4          White.5 
##             1943             1943            21519             1711 
##          Black.6          Asian.7         Native.8      Multirace.9 
##             1711             1711             1711             1711 
##           FRL.10        NotFRL.11    DirectCert.12 NotDirectCert.13 
##            33567            33567             1711             1711 
##           SwD.14        NotSwD.15            EL.16         NotEL.17 
##             1711             1711            68022            68022 
##       Migrant.18      Homeless.19      Military.20    Any_Native.22 
##             1711             1711             1711           263527 
##     Any_Black.23        Foster.21 
##           263527           263527

ave24 <- read_csv("SY 2023-24, AVE, All Demographic and Attendance Records, V20240924 SH.csv")

## New names:
## Rows: 336242 Columns: 62
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (22): SchoolYear, DISTRICT_NAME, LOCATION_NAME, STUDENT_FIRST_NM, STUDEN... dbl
## (38): ...1, STUDENT_ID, Disnum, LOCATION_ID, Schnum, SY2023_SchoolLevel_... lgl
## (2): RACE4_CODE, RACE5_CODE
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

cols_to_update <- colnames(data)[13:34]

# Loop through the column names and update the values in data from ave24
for (col in cols_to_update) {
  data[[col]] <- ifelse(
    !is.na(data[[col]]), 
    data[[col]], 
    ave24[[col]][match(data$ID, ave24$STUDENT_ID)]
  )
}

#after

na_count.2 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count <- cbind(na_count.1, na_count.2)
na_count

##                  na_count.1 na_count.2
## Female.2               1943       1466
## Male.3                 1943       1466
## Hispanic.4            21519       2417
## White.5                1711       1309
## Black.6                1711       1309
## Asian.7                1711       1309
## Native.8               1711       1309
## Multirace.9            1711       1309
## FRL.10                33567       3695
## NotFRL.11             33567       3695
## DirectCert.12          1711       1309
## NotDirectCert.13       1711       1309
## SwD.14                 1711       1309
## NotSwD.15              1711       1309
## EL.16                 68022       5356
## NotEL.17              68022       5356
## Migrant.18             1711       1309
## Homeless.19            1711       1309
## Military.20            1711       1309
## Any_Native.22        263527      13564
## Any_Black.23         263527      13564
## Foster.21            263527      13564

Keep NAs only for Female.2, Male.3 and race categories.

# List of columns where you want to replace NAs with 0
cols_to_update <- c("Hispanic.4","FRL.10","NotFRL.11", "DirectCert.12","NotDirectCert.13", "SwD.14", "NotSwD.15","EL.16","NotEL.17", "Migrant.18","Homeless.19","Military.20","Foster.21")

# Replace NAs with 0 in the specified columns

# Loop through each column in the cols_to_update vector
for (col in cols_to_update) {
  # Replace NA with 0 in the current column
  data[[col]] <- ifelse(is.na(data[[col]]), 0, data[[col]])
}

#final
na_count.3 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count <- cbind(na_count.1, na_count.2, na_count.3)
na_count

##                  na_count.1 na_count.2 na_count.3
## Female.2               1943       1466       1466
## Male.3                 1943       1466       1466
## Hispanic.4            21519       2417          0
## White.5                1711       1309       1309
## Black.6                1711       1309       1309
## Asian.7                1711       1309       1309
## Native.8               1711       1309       1309
## Multirace.9            1711       1309       1309
## FRL.10                33567       3695          0
## NotFRL.11             33567       3695          0
## DirectCert.12          1711       1309          0
## NotDirectCert.13       1711       1309          0
## SwD.14                 1711       1309          0
## NotSwD.15              1711       1309          0
## EL.16                 68022       5356          0
## NotEL.17              68022       5356          0
## Migrant.18             1711       1309          0
## Homeless.19            1711       1309          0
## Military.20            1711       1309          0
## Any_Native.22        263527      13564      13564
## Any_Black.23         263527      13564      13564
## Foster.21            263527      13564          0

Add a district and statewide code

#drop records with missing SCHOOL_NUMBER
data <- data[!is.na(data$SCHOOL_NUMBER),]

#add a district code
data$DISTCODE <- floor(data$SCHOOL_NUMBER/1000)

#add a statewide code
data$STATEWIDE <- 999999

#move these columns in front of SCHOOL_NUMBER
data <- data %>%
  relocate(STATEWIDE, DISTCODE, .after = "GRADE")

Aggregate by level, by group, by content area

#a function to call to summarize at level

summarize_level <- function(level_var) {  # Step 1: Summarize by level_var and CONTENT_AREA
  level_summary_df <- data %>%
    group_by(.data[[level_var]], CONTENT_AREA) %>%
    summarize(
      median_SGP = median(SGP, na.rm = TRUE),
      count = n(),
      .groups = 'drop'
    ) %>%
    mutate(group_value = "All.1",  # Store the primary grouping value
           level_value = .data[[level_var]],     # Store the level variable value
           group = "All.1",                   # Store the grouping variable name
           level = level_var,                  # Store the level variable name
           content = CONTENT_AREA) %>%        # Store the content area
    select(level, level_value, content, group, group_value,  count, median_SGP)
  return(level_summary_df)
}

#a function to call to summarize at level and group
summarize_group <- function(level_var, group_var) {  # Step 1: Summarize by level_var, group_var, and CONTENT_AREA
  group_summary_df <- data %>%
    group_by(.data[[level_var]], .data[[group_var]], CONTENT_AREA) %>%
    summarize(
      median_SGP = median(SGP, na.rm = TRUE),
      count = n(),
      .groups = 'drop'
    ) %>%
    mutate(group_value = .data[[group_var]],  # Store the primary grouping value
           level_value = .data[[level_var]],     # Store the level variable value
           group = group_var,                   # Store the grouping variable name
           level = level_var,                  # Store the level variable name
           content = CONTENT_AREA) %>%        # Store the content area
    select(level, level_value, content, group, group_value,  count, median_SGP)  
    
  return(group_summary_df)
}

#a list of grouping variables to use at all levels
grouping_vars <- names(data[, 15:36])
#a list of levels
level_vars <- c("STATEWIDE", "DISTCODE", "SCHOOL_NUMBER")
growth <- data.frame() #initialize the df for growth

#loop through levels first to calculate the 1.All counts and median SGPs
for (lv in level_vars) {
  df_level <- summarize_level(lv)
  #loop through groups at each level for counts and median SGPs
  df_group <- do.call(rbind, lapply(grouping_vars, function(var) summarize_group(lv, var)))
  #put them all together into one df
  growth <- rbind(growth, df_level, df_group)
}

#remove the group value = 0 counts and SGPs for all groups and levels
#this is a reduntant "not" category of the group
#keep the group value is NA counts and SGPS for those groups that retained NA
growth <- growth %>%
  filter(group_value != 0 | is.na(group_value))

#add "NA." to the beginning of the group identifier with NAs as group value
growth$group <- ifelse(is.na(growth$group_value), paste0("NA.",growth$group), growth$group)

growth_long <- growth %>%
  #remove the redundant group_value variable now that it's been added to group variable
  select(-group_value) %>% 
  #change the names to match conventions
  rename(
    Level = level,
    SchNumb = level_value,
    Subgroup = group,
    Subject = content,
    "N" = count,
    "Median SGP" = median_SGP
  ) 

#Sorting columns correctly in the wide format presents a problem without changing the format of the col names

# a function to extract number, pad it with zeros, and move it to the front of the group name

replace_subgroup <- function(x) {
  parts <- regmatches(x, regexec("(.*)\\.(\\d+)$", x))[[1]]  # Extract parts (text and number)
  if (length(parts) > 1) {
    sprintf("%02d.%s", as.numeric(parts[3]), parts[2])  # Format as "number.text"
  } else {
    x  # Return the original value if no match is found (no period or number)
  }
}

# modify the Subgroup variable by applying the function
growth_long$Subgroup <- sapply(growth_long$Subgroup, replace_subgroup)


#wide format provided to each district/charter school for embargo and used in index calculations
growth_wide <- growth_long %>%
  pivot_wider(
    names_from = Subgroup,
    values_from = c("Median SGP", "N"),
    names_glue = "{Subgroup} {.value}"
  ) %>%
  select(1:3, sort(names(.)[4:57]))

names(growth_wide)

##  [1] "Level"                       "SchNumb"                    
##  [3] "Subject"                     "01.All Median SGP"          
##  [5] "01.All N"                    "02.Female Median SGP"       
##  [7] "02.Female N"                 "02.NA.Female Median SGP"    
##  [9] "02.NA.Female N"              "03.Male Median SGP"         
## [11] "03.Male N"                   "03.NA.Male Median SGP"      
## [13] "03.NA.Male N"                "04.Hispanic Median SGP"     
## [15] "04.Hispanic N"               "05.White Median SGP"        
## [17] "05.White N"                  "06.Black Median SGP"        
## [19] "06.Black N"                  "07.Asian Median SGP"        
## [21] "07.Asian N"                  "08.Native Median SGP"       
## [23] "08.Native N"                 "09.Multirace Median SGP"    
## [25] "09.Multirace N"              "10.FRL Median SGP"          
## [27] "10.FRL N"                    "11.NotFRL Median SGP"       
## [29] "11.NotFRL N"                 "12.DirectCert Median SGP"   
## [31] "12.DirectCert N"             "13.NotDirectCert Median SGP"
## [33] "13.NotDirectCert N"          "14.SwD Median SGP"          
## [35] "14.SwD N"                    "15.NotSwD Median SGP"       
## [37] "15.NotSwD N"                 "16.EL Median SGP"           
## [39] "16.EL N"                     "17.NotEL Median SGP"        
## [41] "17.NotEL N"                  "18.Migrant Median SGP"      
## [43] "18.Migrant N"                "19.Homeless Median SGP"     
## [45] "19.Homeless N"               "20.Military Median SGP"     
## [47] "20.Military N"               "21.Foster Median SGP"       
## [49] "21.Foster N"                 "22.Any_Native Median SGP"   
## [51] "22.Any_Native N"             "22.NA.Any_Native Median SGP"
## [53] "22.NA.Any_Native N"          "23.Any_Black Median SGP"    
## [55] "23.Any_Black N"              "23.NA.Any_Black Median SGP" 
## [57] "23.NA.Any_Black N"

##Write as .csv and save as .Rdata

growth_wide[] <- lapply(growth_wide, as.character)

write.csv(growth_wide, "OUT/AMD SY 2023-24, Academic Growth, 20240926 SH.csv", row.names = FALSE)

save(growth_wide, file = "OUT/AMD SY 2023-24, Academic Growth, 20240926 SH.RData")

####prepare data for individual report to district and schools

data_out <- lapply(data, as.character)

write.csv(data_out, "OUT/SY 2023-24, AMD, Academic Growth Student-Level All, 20240926 SH.csv")