Notes from conversation with Istation:

#import CSV files to dataframes for beginning and end of year
boyReading <- read.csv("BOY_readingK3SY24.csv")
eoyReading <- read.csv("EOY_readingK3SY24.csv")

#how many records from beginning of the year tests?
nrow(boyReading)
## [1] 100151
#how many records from end of the year tests?
nrow(eoyReading)
## [1] 110122
#remove records with missing data

boyReading <- boyReading[complete.cases(boyReading[, c("STU_STATE_ID", "Overall_LEVEL", "Overall_SCORE", "Overall_PTILE", "Overall_ADMIN_DATE")]), ]

eoyReading <- eoyReading[complete.cases(eoyReading[, c("STU_STATE_ID", "Overall_LEVEL", "Overall_SCORE", "Overall_PTILE", "Overall_ADMIN_DATE")]), ]


#finding that there are duplicate administrations for many students
boyDuplicates <- boyReading %>%
  group_by(STU_STATE_ID) %>%
  filter(n() > 1)

eoyDuplicates <- eoyReading %>%
  group_by(STU_STATE_ID) %>%
  filter(n() > 1)

How many students tested more than once BOY or EOY?

#beginning of the year
nrow(boyDuplicates)
## [1] 82398
#end of the year
nrow(eoyDuplicates)
## [1] 101198
#taking the earliest performance across BOY duplicate administrations
boyReadingEarliest <- boyReading %>%
  arrange(STU_STATE_ID, Overall_ADMIN_DATE) %>% #arrange by ID then date
  distinct(STU_STATE_ID, .keep_all = TRUE) #keep the first occurance of each ID


#taking the highest performance across EOY duplicate administrations
eoyReadingBest <- eoyReading %>% 
  group_by(STU_STATE_ID) %>%
  summarize(
    Max_Overall_LEVEL = max(Overall_LEVEL),
    Max_Overall_PTILE = max(Overall_PTILE),
    Max_Overall_SCORE = max(Overall_SCORE),
    countAdmin = n()
  )

How many unique students’ test scores are in each of BOY and EOY?

#beginning of the year
nrow(boyReadingEarliest)
## [1] 58856
#end of the year
nrow(eoyReadingBest)
## [1] 59468

Only use students with test scores at BOY and EOY .

#Now check to make sure they both contain only the incommon students 

common_ids <- intersect(boyReadingEarliest$STU_STATE_ID, eoyReadingBest$STU_STATE_ID)

#how many students in common?
print(length(common_ids))
## [1] 56445
boy <- boyReadingEarliest[boyReadingEarliest$STU_STATE_ID %in% common_ids, ]

eoy <- eoyReadingBest[eoyReadingBest$STU_STATE_ID %in% common_ids, ]

How many students with both BOY and EOY test scores?

#beginning of the year
nrow(boy)
## [1] 56445
#beginning of year percentage with beginning of year tests also?
(nrow(boy)/nrow(boyReadingEarliest))*100
## [1] 95.90356
#end of the year number
nrow(eoy)
## [1] 56445
#end of the year percentage with end of year tests also?
(nrow(eoy)/nrow(eoyReadingBest))*100
## [1] 94.91659

This methodology results in a measure of actual change in proficiency of the same group of students over the year. Make sure there are no NAs.

# Check for NA values in STU_STATE_ID columns
sum(is.na(boyReadingEarliest$STU_STATE_ID))
## [1] 0
sum(is.na(eoyReadingBest$STU_STATE_ID))
## [1] 0

Rate of students K-3 scoring proficient on BOY English Reading test

boyP <- boy[boy$Overall_LEVEL >=3, ]
boyPRate <- nrow(boyP)/nrow(boy)
print(boyPRate)
## [1] 0.4439011

Rate of students K-3 scoring proficient on EOY English Reading test

eoyP <- eoy[eoy$Max_Overall_LEVEL >=3, ]
eoyPRate <- nrow(eoyP)/nrow(eoy)
print(eoyPRate)
## [1] 0.5320046

Change in proficiency rate from EOY to BOY

eoyPRate - boyPRate
## [1] 0.08810346

How many more students scored proficient at the end of the year?

nrow(eoy[eoy$Max_Overall_LEVEL >=3, ])-nrow(boy[boy$Overall_LEVEL >=3, ])
## [1] 4973