Prepare

This script starts with the New_Mexico_SGP_LONG_Data_2024.RData object generated with the SGP package.

#clear the global environment
rm(list = ls())

Load the necessary packages

Load data

#load data object
load("Data/New_Mexico_SGP_LONG_Data_2024.RData")

Define dataframe

data <- get("New_Mexico_SGP_LONG_Data_2024")

It then runs a sequence of functions to group and summarize by the subgroups required:

Reporting Requirements

Each file that reports on student subgroups should include the following groups:

  • 1.All

  • 2.Female

  • 3.Male

  • 4.Hispanic

  • 5.White

  • 6.Black

  • 7.Asian

  • 8.Native

  • 9.Multirace

  • 10.FRL

  • 11.NotFRL

  • 12.DirecrCert

  • 13.NotDirectCert

  • 14.SwD

  • 15.NotSwD

  • 16.EL

  • 17.NotEL

  • 18.Migrant

  • 19.Homeless

  • 20.Military

  • 21.Foster

  • 22.AnyNative

  • 23.AnyBlack

All indicators for which data is available must include:

  • Statewide numerators, denominators, and rates.

  • District-level numerators, denominators, and rates.

  • School-level numerators, denominators, and rates.

Content areas include: - ELA - MATHEMATICS

Start by examining the grouping variables.

View all variables

data <- data[1:34]
names(data)
##  [1] "VALID_CASE"              "CONTENT_AREA"           
##  [3] "YEAR"                    "ID"                     
##  [5] "GRADE_ENROLLED"          "SCALE_SCORE"            
##  [7] "ACHIEVEMENT_LEVEL"       "GRADE"                  
##  [9] "SCHOOL_NUMBER"           "AccountabilityModel"    
## [11] "ExclAccountabiltyReason" "Test"                   
## [13] "Hispanic.4"              "White.5"                
## [15] "Black.6"                 "Asian.7"                
## [17] "Native.8"                "Multirace.9"            
## [19] "FRL.10"                  "NotFRL.11"              
## [21] "DirectCert.12"           "NotDirectCert.13"       
## [23] "SwD.14"                  "NotSwD.15"              
## [25] "EL.16"                   "NotEL.17"               
## [27] "Migrant.18"              "Homeless.19"            
## [29] "Military.20"             "Any_Native.22"          
## [31] "Any_Black.23"            "Foster.21"              
## [33] "GENDER"                  "SGP"

convert factors to integers

cols_to_update <- colnames(data)[13:32]

for (col in cols_to_update) {
  data[[col]] <- as.integer(data[[col]])
  if (!all(is.na(data[[col]])) && max(data[[col]], na.rm = TRUE) > 1) {
    data[[col]] <- data[[col]] - 1
  }
}

Count NAs. This list will point to demographic variables with missing data to fill in.

# Calculate the sum of NAs for each variable and filter out variables with no NAs
na_summary <- sapply(data, function(x) sum(is.na(x)))

# Filter to keep only variables where NA count is greater than 0
na_summary <- na_summary[na_summary > 0]

# Display the resulting filtered dataframe
print(na_summary)
##    SCHOOL_NUMBER       Hispanic.4          White.5          Black.6 
##            13471            21519             1711             1711 
##          Asian.7         Native.8      Multirace.9           FRL.10 
##             1711             1711             1711            33567 
##        NotFRL.11    DirectCert.12 NotDirectCert.13           SwD.14 
##            33567             1711             1711             1711 
##        NotSwD.15            EL.16         NotEL.17       Migrant.18 
##             1711            68022            68022             1711 
##      Homeless.19      Military.20    Any_Native.22     Any_Black.23 
##             1711             1711           263527           263527 
##        Foster.21           GENDER              SGP 
##           263527             1943            57442

Transform GENDER into Female & Male dichotomous variables

data$Female.2 <- ifelse(data$GENDER == "Female", 1, 0)
data$Male.3 <- ifelse(data$GENDER == "Male", 1, 0)

data <- data %>%
  relocate(Female.2, Male.3, .after = Test)

#examine var counts
table(data$GENDER, data$Female.2, useNA = "always")
##         
##               0      1   <NA>
##   Female      0 128436      0
##   Male   133148      0      0
##   <NA>        0      0   1943
table(data$GENDER, data$AccountabilityModel, useNA = "always")
##         
##               0      1   <NA>
##   Female   5158 123278      0
##   Male     5039 128109      0
##   <NA>     1872     71      0
data <- data %>%
  select(-GENDER)

Merge final AVE demographics

Replace the following with values matching ID and StID from AVE

#before

na_count.1 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count.1
##         Female.2           Male.3       Hispanic.4          White.5 
##             1943             1943            21519             1711 
##          Black.6          Asian.7         Native.8      Multirace.9 
##             1711             1711             1711             1711 
##           FRL.10        NotFRL.11    DirectCert.12 NotDirectCert.13 
##            33567            33567             1711             1711 
##           SwD.14        NotSwD.15            EL.16         NotEL.17 
##             1711             1711            68022            68022 
##       Migrant.18      Homeless.19      Military.20    Any_Native.22 
##             1711             1711             1711           263527 
##     Any_Black.23        Foster.21 
##           263527           263527
ave24 <- read_csv("SY 2023-24, AVE, All Demographic and Attendance Records, V20240924 SH.csv")
## New names:
## Rows: 336242 Columns: 62
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (22): SchoolYear, DISTRICT_NAME, LOCATION_NAME, STUDENT_FIRST_NM, STUDEN... dbl
## (38): ...1, STUDENT_ID, Disnum, LOCATION_ID, Schnum, SY2023_SchoolLevel_... lgl
## (2): RACE4_CODE, RACE5_CODE
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
cols_to_update <- colnames(data)[13:34]

# Loop through the column names and update the values in data from ave24
for (col in cols_to_update) {
  data[[col]] <- ifelse(
    !is.na(data[[col]]), 
    data[[col]], 
    ave24[[col]][match(data$ID, ave24$STUDENT_ID)]
  )
}
#after

na_count.2 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count <- cbind(na_count.1, na_count.2)
na_count
##                  na_count.1 na_count.2
## Female.2               1943       1466
## Male.3                 1943       1466
## Hispanic.4            21519       2417
## White.5                1711       1309
## Black.6                1711       1309
## Asian.7                1711       1309
## Native.8               1711       1309
## Multirace.9            1711       1309
## FRL.10                33567       3695
## NotFRL.11             33567       3695
## DirectCert.12          1711       1309
## NotDirectCert.13       1711       1309
## SwD.14                 1711       1309
## NotSwD.15              1711       1309
## EL.16                 68022       5356
## NotEL.17              68022       5356
## Migrant.18             1711       1309
## Homeless.19            1711       1309
## Military.20            1711       1309
## Any_Native.22        263527      13564
## Any_Black.23         263527      13564
## Foster.21            263527      13564

Keep NAs only for Female.2, Male.3 and race categories.

# List of columns where you want to replace NAs with 0
cols_to_update <- c("Hispanic.4","FRL.10","NotFRL.11", "DirectCert.12","NotDirectCert.13", "SwD.14", "NotSwD.15","EL.16","NotEL.17", "Migrant.18","Homeless.19","Military.20","Foster.21")

# Replace NAs with 0 in the specified columns

# Loop through each column in the cols_to_update vector
for (col in cols_to_update) {
  # Replace NA with 0 in the current column
  data[[col]] <- ifelse(is.na(data[[col]]), 0, data[[col]])
}

#final
na_count.3 <- sapply(data[, 13:34], function(x) sum(is.na(x), na.rm = TRUE))
na_count <- cbind(na_count.1, na_count.2, na_count.3)
na_count
##                  na_count.1 na_count.2 na_count.3
## Female.2               1943       1466       1466
## Male.3                 1943       1466       1466
## Hispanic.4            21519       2417          0
## White.5                1711       1309       1309
## Black.6                1711       1309       1309
## Asian.7                1711       1309       1309
## Native.8               1711       1309       1309
## Multirace.9            1711       1309       1309
## FRL.10                33567       3695          0
## NotFRL.11             33567       3695          0
## DirectCert.12          1711       1309          0
## NotDirectCert.13       1711       1309          0
## SwD.14                 1711       1309          0
## NotSwD.15              1711       1309          0
## EL.16                 68022       5356          0
## NotEL.17              68022       5356          0
## Migrant.18             1711       1309          0
## Homeless.19            1711       1309          0
## Military.20            1711       1309          0
## Any_Native.22        263527      13564      13564
## Any_Black.23         263527      13564      13564
## Foster.21            263527      13564          0

Add a district and statewide code

#drop records with missing SCHOOL_NUMBER
data <- data[!is.na(data$SCHOOL_NUMBER),]

#add a district code
data$DISTCODE <- floor(data$SCHOOL_NUMBER/1000)

#add a statewide code
data$STATEWIDE <- 999999

#move these columns in front of SCHOOL_NUMBER
data <- data %>%
  relocate(STATEWIDE, DISTCODE, .after = "GRADE")

Aggregate by level, by group, by content area

#a function to call to summarize at level

summarize_level <- function(level_var) {  # Step 1: Summarize by level_var and CONTENT_AREA
  level_summary_df <- data %>%
    group_by(.data[[level_var]], CONTENT_AREA) %>%
    summarize(
      median_SGP = median(SGP, na.rm = TRUE),
      count = n(),
      .groups = 'drop'
    ) %>%
    mutate(group_value = "All.1",  # Store the primary grouping value
           level_value = .data[[level_var]],     # Store the level variable value
           group = "All.1",                   # Store the grouping variable name
           level = level_var,                  # Store the level variable name
           content = CONTENT_AREA) %>%        # Store the content area
    select(level, level_value, content, group, group_value,  count, median_SGP)
  return(level_summary_df)
}

#a function to call to summarize at level and group
summarize_group <- function(level_var, group_var) {  # Step 1: Summarize by level_var, group_var, and CONTENT_AREA
  group_summary_df <- data %>%
    group_by(.data[[level_var]], .data[[group_var]], CONTENT_AREA) %>%
    summarize(
      median_SGP = median(SGP, na.rm = TRUE),
      count = n(),
      .groups = 'drop'
    ) %>%
    mutate(group_value = .data[[group_var]],  # Store the primary grouping value
           level_value = .data[[level_var]],     # Store the level variable value
           group = group_var,                   # Store the grouping variable name
           level = level_var,                  # Store the level variable name
           content = CONTENT_AREA) %>%        # Store the content area
    select(level, level_value, content, group, group_value,  count, median_SGP)  
    
  return(group_summary_df)
}

#a list of grouping variables to use at all levels
grouping_vars <- names(data[, 15:36])
#a list of levels
level_vars <- c("STATEWIDE", "DISTCODE", "SCHOOL_NUMBER")
growth <- data.frame() #initialize the df for growth

#loop through levels first to calculate the 1.All counts and median SGPs
for (lv in level_vars) {
  df_level <- summarize_level(lv)
  #loop through groups at each level for counts and median SGPs
  df_group <- do.call(rbind, lapply(grouping_vars, function(var) summarize_group(lv, var)))
  #put them all together into one df
  growth <- rbind(growth, df_level, df_group)
}

#remove the group value = 0 counts and SGPs for all groups and levels
#this is a reduntant "not" category of the group
#keep the group value is NA counts and SGPS for those groups that retained NA
growth <- growth %>%
  filter(group_value != 0 | is.na(group_value))

#add "NA." to the beginning of the group identifier with NAs as group value
growth$group <- ifelse(is.na(growth$group_value), paste0("NA.",growth$group), growth$group)

growth_long <- growth %>%
  #remove the redundant group_value variable now that it's been added to group variable
  select(-group_value) %>% 
  #change the names to match conventions
  rename(
    Level = level,
    SchNumb = level_value,
    Subgroup = group,
    Subject = content,
    "N" = count,
    "Median SGP" = median_SGP
  ) 

#Sorting columns correctly in the wide format presents a problem without changing the format of the col names

# a function to extract number, pad it with zeros, and move it to the front of the group name

replace_subgroup <- function(x) {
  parts <- regmatches(x, regexec("(.*)\\.(\\d+)$", x))[[1]]  # Extract parts (text and number)
  if (length(parts) > 1) {
    sprintf("%02d.%s", as.numeric(parts[3]), parts[2])  # Format as "number.text"
  } else {
    x  # Return the original value if no match is found (no period or number)
  }
}

# modify the Subgroup variable by applying the function
growth_long$Subgroup <- sapply(growth_long$Subgroup, replace_subgroup)


#wide format provided to each district/charter school for embargo and used in index calculations
growth_wide <- growth_long %>%
  pivot_wider(
    names_from = Subgroup,
    values_from = c("Median SGP", "N"),
    names_glue = "{Subgroup} {.value}"
  ) %>%
  select(1:3, sort(names(.)[4:57]))

names(growth_wide)
##  [1] "Level"                       "SchNumb"                    
##  [3] "Subject"                     "01.All Median SGP"          
##  [5] "01.All N"                    "02.Female Median SGP"       
##  [7] "02.Female N"                 "02.NA.Female Median SGP"    
##  [9] "02.NA.Female N"              "03.Male Median SGP"         
## [11] "03.Male N"                   "03.NA.Male Median SGP"      
## [13] "03.NA.Male N"                "04.Hispanic Median SGP"     
## [15] "04.Hispanic N"               "05.White Median SGP"        
## [17] "05.White N"                  "06.Black Median SGP"        
## [19] "06.Black N"                  "07.Asian Median SGP"        
## [21] "07.Asian N"                  "08.Native Median SGP"       
## [23] "08.Native N"                 "09.Multirace Median SGP"    
## [25] "09.Multirace N"              "10.FRL Median SGP"          
## [27] "10.FRL N"                    "11.NotFRL Median SGP"       
## [29] "11.NotFRL N"                 "12.DirectCert Median SGP"   
## [31] "12.DirectCert N"             "13.NotDirectCert Median SGP"
## [33] "13.NotDirectCert N"          "14.SwD Median SGP"          
## [35] "14.SwD N"                    "15.NotSwD Median SGP"       
## [37] "15.NotSwD N"                 "16.EL Median SGP"           
## [39] "16.EL N"                     "17.NotEL Median SGP"        
## [41] "17.NotEL N"                  "18.Migrant Median SGP"      
## [43] "18.Migrant N"                "19.Homeless Median SGP"     
## [45] "19.Homeless N"               "20.Military Median SGP"     
## [47] "20.Military N"               "21.Foster Median SGP"       
## [49] "21.Foster N"                 "22.Any_Native Median SGP"   
## [51] "22.Any_Native N"             "22.NA.Any_Native Median SGP"
## [53] "22.NA.Any_Native N"          "23.Any_Black Median SGP"    
## [55] "23.Any_Black N"              "23.NA.Any_Black Median SGP" 
## [57] "23.NA.Any_Black N"

##Write as .csv and save as .Rdata

growth_wide[] <- lapply(growth_wide, as.character)

write.csv(growth_wide, "OUT/AMD SY 2023-24, Academic Growth, 20240926 SH.csv", row.names = FALSE)

save(growth_wide, file = "OUT/AMD SY 2023-24, Academic Growth, 20240926 SH.RData")

####prepare data for individual report to district and schools

data_out <- lapply(data, as.character)

write.csv(data_out, "OUT/SY 2023-24, AMD, Academic Growth Student-Level All, 20240926 SH.csv")