RQ1: To what extent (if any) do raw scores on standards-based CFAs predict performance on corresponding standards-based questions on the NC Check-In 2 (NCCI) and, ultimately, standards mastery?
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
# Load the CFA (common formative assessment) math scores. The echoed column
# specification lists an unnamed first column (repaired to `...1`) plus one
# score column per standard: `NBT1&2`, NBT4, NBT5 and NBT7.
CFAData <- read_csv(
  file = "/cloud/project/5Clean_Data_CFAs - Math (5).csv"
)
## New names:
## * `` -> ...1
## Rows: 66 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): ...1, NBT1&2, NBT4, NBT5, NBT7
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the NC Check-In 2 item-level responses (one column per item, holding
# the answer letter each student chose). The long import-dialog name is kept
# alongside CheckInData so any code that refers to it keeps working.
X5Fourth_Grade_Dataset_NCCheckIn2_Mathematics_copy_3 <- read_csv(
  file = "5Fourth Grade Dataset_NCCheckIn2_Mathematics copy 3.csv"
)
CheckInData <- X5Fourth_Grade_Dataset_NCCheckIn2_Mathematics_copy_3
## New names:
## * `` -> ...2
## * `` -> ...3
## Rows: 66 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (25): Item_3, Item_15, Item_17, Item_20, Item_24, Item_2, Item_7, Item_1...
## dbl (1): Student
## lgl (2): ...2, ...3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the 20 scored items. Column order matters: the answer key below
# is matched to these columns by position, five items per standard.
CheckInData <- CheckInData[, c(
  "Item_3", "Item_15", "Item_17", "Item_20", "Item_24",  # NBT.2CheckIn items
  "Item_2", "Item_7", "Item_10", "Item_14", "Item_22",   # NBT.4CheckIn items
  "Item_1", "Item_8", "Item_11", "Item_12", "Item_23",   # NBT.5CheckIn items
  "Item_4", "Item_6", "Item_13", "Item_16", "Item_19"    # NBT.7CheckIn items
)]

# Answer key, in the same order as the columns selected above.
CorrectAnswers <- c(
  "D", "B", "C", "D", "C",
  "A", "D", "A", "D", "A",
  "A", "D", "C", "B", "D",
  "D", "B", "C", "D", "B"
)

# Compare every response column with its key entry (MARGIN = 2 sweeps the
# key across columns). TRUE marks a correct answer; a missing response
# stays NA.
NCCheckIn_Is_Correct <- sweep(
  CheckInData,
  MARGIN = 2,
  STATS = CorrectAnswers,
  FUN = "=="
)
# Percent of a standard's Check-In items that each student answered
# correctly. `items` is the character vector of item columns for one standard.
# NOTE(review): na.rm = TRUE drops missing responses from the sum while the
# denominator stays at the full item count, so a blank response is scored as
# incorrect — confirm that is the intended treatment.
percent_correct <- function(items) {
  n_right <- rowSums(NCCheckIn_Is_Correct[, items], na.rm = TRUE)
  (n_right / length(items)) * 100
}

NBT.2CheckIn <- percent_correct(
  c("Item_3", "Item_15", "Item_17", "Item_20", "Item_24")
)
NBT.4CheckIn <- percent_correct(
  c("Item_2", "Item_7", "Item_10", "Item_14", "Item_22")
)
NBT.5CheckIn <- percent_correct(
  c("Item_1", "Item_8", "Item_11", "Item_12", "Item_23")
)
NBT.7CheckIn <- percent_correct(
  c("Item_4", "Item_6", "Item_13", "Item_16", "Item_19")
)
# Turn each per-standard Check-In percent vector into a one-column data frame
# (the column keeps the vector's name) and add a row-position ID.
# The ID is taken from the data with row_number() rather than a hard-coded
# 1:66, so the script still runs if the number of students changes.
NBT.2CheckIn <- data.frame(NBT.2CheckIn) %>%
  mutate(ID = row_number())
NBT.4CheckIn <- data.frame(NBT.4CheckIn) %>%
  mutate(ID = row_number())
NBT.5CheckIn <- data.frame(NBT.5CheckIn) %>%
  mutate(ID = row_number())
NBT.7CheckIn <- data.frame(NBT.7CheckIn) %>%
  mutate(ID = row_number())
# Put each student's NBT5 CFA score next to their NBT5 Check-In percent.
# bind_cols() pairs rows purely by position.
# NOTE(review): assumes both files list students in the same order — confirm.
CFA_NBT.5 <- CFAData %>%
  select(NBT5)
CombinedNBT.5 <- CFA_NBT.5 %>%
  bind_cols(NBT.5CheckIn)
CombinedNBT.5
## # A tibble: 66 × 3
## NBT5 NBT.5CheckIn ID
## <dbl> <dbl> <int>
## 1 90 40 1
## 2 100 100 2
## 3 90 100 3
## 4 60 0 4
## 5 100 80 5
## 6 100 100 6
## 7 80 40 7
## 8 100 80 8
## 9 100 80 9
## 10 100 60 10
## # … with 56 more rows
# Put each student's `NBT1&2` CFA score next to their NBT.2 Check-In percent.
# bind_cols() pairs rows purely by position.
# NOTE(review): assumes both files list students in the same order — confirm.
CFA_NBT.2 <- CFAData %>%
  select(`NBT1&2`)
CombinedNBT.2 <- CFA_NBT.2 %>%
  bind_cols(NBT.2CheckIn)
CombinedNBT.2
## # A tibble: 66 × 3
## `NBT1&2` NBT.2CheckIn ID
## <dbl> <dbl> <int>
## 1 90 20 1
## 2 100 80 2
## 3 100 60 3
## 4 NA 0 4
## 5 80 60 5
## 6 90 100 6
## 7 NA 40 7
## 8 60 60 8
## 9 100 100 9
## 10 80 40 10
## # … with 56 more rows
# Put each student's NBT4 CFA score next to their NBT4 Check-In percent.
# bind_cols() pairs rows purely by position.
# NOTE(review): assumes both files list students in the same order — confirm.
CFA_NBT.4 <- CFAData %>%
  select(NBT4)
CombinedNBT.4 <- CFA_NBT.4 %>%
  bind_cols(NBT.4CheckIn)
CombinedNBT.4
## # A tibble: 66 × 3
## NBT4 NBT.4CheckIn ID
## <dbl> <dbl> <int>
## 1 90 60 1
## 2 100 100 2
## 3 100 60 3
## 4 50 40 4
## 5 90 80 5
## 6 100 100 6
## 7 60 20 7
## 8 100 60 8
## 9 90 100 9
## 10 100 100 10
## # … with 56 more rows
# Put each student's NBT7 CFA score next to their NBT7 Check-In percent.
# bind_cols() pairs rows purely by position.
# NOTE(review): assumes both files list students in the same order — confirm.
CFA_NBT.7 <- CFAData %>%
  select(NBT7)
CombinedNBT.7 <- CFA_NBT.7 %>%
  bind_cols(NBT.7CheckIn)
CombinedNBT.7
## # A tibble: 66 × 3
## NBT7 NBT.7CheckIn ID
## <dbl> <dbl> <int>
## 1 80 40 1
## 2 90 60 2
## 3 90 40 3
## 4 NA 40 4
## 5 80 60 5
## 6 80 40 6
## 7 90 40 7
## 8 100 80 8
## 9 80 60 9
## 10 80 20 10
## # … with 56 more rows
# Wide table: the four per-standard CFA/Check-In pairs side by side. Each
# input carries its own ID column, so bind_cols() repairs the duplicates to
# ID...3, ID...6, ID...9 and ID...12.
MegaData <- bind_cols(
  CombinedNBT.2,
  CombinedNBT.4,
  CombinedNBT.5,
  CombinedNBT.7
)
## New names:
## * ID -> ID...3
## * ID -> ID...6
## * ID -> ID...9
## * ID -> ID...12
# Give the combined NBT1&2 CFA column a syntactic name.
MegaData <- MegaData %>%
  rename(NBT2 = `NBT1&2`)

# Keep a single ID column (the first repaired copy) plus every CFA/Check-In
# score column; the other three duplicate ID columns are dropped.
MegaData2 <- MegaData %>%
  select(
    NBT2, NBT.2CheckIn, `ID...3`,
    NBT4, NBT.4CheckIn,
    NBT5, NBT.5CheckIn,
    NBT7, NBT.7CheckIn
  )
# Reshape to long form: one row per student per standard, with shared column
# names (ID...3, CFA, CheckIn, standard) so the four pieces can be stacked.
trial <- MegaData2 %>%
  select(`ID...3`, CFA = NBT2, CheckIn = NBT.2CheckIn) %>%
  mutate(standard = "NBT2")

trial4 <- MegaData2 %>%
  select(`ID...3`, CFA = NBT4, CheckIn = NBT.4CheckIn) %>%
  mutate(standard = "NBT4")

trial5 <- MegaData2 %>%
  select(`ID...3`, CFA = NBT5, CheckIn = NBT.5CheckIn) %>%
  mutate(standard = "NBT5")

trial7 <- MegaData2 %>%
  select(`ID...3`, CFA = NBT7, CheckIn = NBT.7CheckIn) %>%
  mutate(standard = "NBT7")

# Stack the four standards into the pooled ("holistic") data set.
VerticalDataFrame <- bind_rows(
  trial,
  trial4,
  trial5,
  trial7
)
## New names:
## * ID...3 -> ID
## New names:
## * ID...3 -> ID
## New names:
## * ID...3 -> ID
## New names:
## * ID...3 -> ID
# Fitted linear-model line of Check-In percent on CFA score for NBT2.
# Rows with a missing score are dropped by the smoother (see the warning).
ggplot(data = MegaData, mapping = aes(x = NBT2, y = NBT.2CheckIn)) +
  geom_smooth(method = lm) +
  labs(
    title = "Ability of CFA Data to Predict Student Performance on NC Check-Ins for Standard NBT2",
    x = "CFA NBT2",
    y = "Check-In NBT2"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
# Fitted linear-model line of Check-In percent on CFA score for NBT4.
# Rows with a missing score are dropped by the smoother (see the warning).
ggplot(data = MegaData, mapping = aes(x = NBT4, y = NBT.4CheckIn)) +
  geom_smooth(method = lm) +
  labs(
    title = "Ability of CFA Data to Predict Student Performance on NC Check-Ins for Standard NBT4",
    x = "CFA NBT4",
    y = "Check-In NBT4"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
# Fitted linear-model line of Check-In percent on CFA score for NBT5.
# Rows with a missing score are dropped by the smoother (see the warning).
ggplot(data = MegaData, mapping = aes(x = NBT5, y = NBT.5CheckIn)) +
  geom_smooth(method = lm) +
  labs(
    title = "Ability of CFA Data to Predict Student Performance on NC Check-Ins for Standard NBT5",
    x = "CFA NBT5",
    y = "Check-In NBT5"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
# Fitted linear-model line of Check-In percent on CFA score for NBT7.
# Rows with a missing score are dropped by the smoother (see the warning).
ggplot(data = MegaData, mapping = aes(x = NBT7, y = NBT.7CheckIn)) +
  geom_smooth(method = lm) +
  labs(
    title = "Ability of CFA Data to Predict Student Performance on NC Check-Ins for Standard NBT7",
    x = "CFA NBT7",
    y = "Check-In NBT7"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
Originally, I intended to run a secondary analysis comparing subgroups within the data. However, when examining the subgroups, I determined that doing so would raise ethical concerns because the subgroups are so small that there is little anonymity. In the interest of protecting my students’ identities and avoiding any potential for labeling students according to their performance, I chose to keep only the two analyses shown in this section: the standards-based analysis and the holistic analysis. Furthermore, since the overall dataset is so small (66 students), there is little to be gained from segmenting these data further: doing so would create datasets so small that the reliability of the linear regression model would be significantly reduced.
Next, using the VerticalDataFrame, I created a geom_smooth plot showing the predictive power of the CFA data for the Check-In data, holistically. I used a linear regression model to plot the relationship with CFA on the x-axis and CheckIn on the y-axis.
# Pooled view: one fitted linear-model line of Check-In percent on CFA score
# across all four standards (the stacked rows of VerticalDataFrame).
ggplot(data = VerticalDataFrame) +
  geom_smooth(
    mapping = aes(x = CFA, y = CheckIn),
    method = lm,
    color = "blue"
  ) +
  labs(
    title = "Ability of CFA Data to Predict Student Performance on NC Check-Ins",
    x = "CFA Data",
    y = "Check-In Data"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 20 rows containing non-finite values (stat_smooth).
Based on both the standard-specific linear regression models and the holistic linear regression model (using the VerticalDataFrame), I was able to see that the CFA data are generally predictive of the CheckIn data.
However, there are two caveats to this statement. First, the scores on the CheckIns were generally lower than the scores on the CFAs by approximately 30 points. Therefore, even students who consistently received 90s and 100s on the CFAs scored around a 70 on the CheckIn, according to the holistic plot. Second, the regression model for standard NBT5 did not show a predictive relationship between the CFA and CheckIn; in fact, it showed an inverse relationship. Therefore, the holistic plot is somewhat skewed due to the inverse relationship created by NBT5. It could be hypothesized, therefore, that the slope of the holistic plot would be steeper if the NBT5 data were removed.
This analysis will prove useful to my team because it generally confirms that our CFAs are useful assessments in terms of preparing our students for standardized assessments. Further, it shows that SchoolNet, the assessment item bank that we use when generating the CFAs, is a basically reliable source for test questions. Additionally, I will use this analysis to encourage my team to look more closely at the NBT5 questions that were tested on the CFA versus those that were tested on the CheckIn to try to determine why the CFA data were not predictive for this standard.
With the exception of NBT5, this analysis shows that performance on our SchoolNet-based CFAs is a good predictor of performance on state-created standardized assessments. The analysis also creates opportunities for further item-based analysis, particularly regarding standard NBT5.