Loading Data

Loading libraries

library(tidyverse)
library(magrittr)
#library(scales)
library(modelr)

Reading raw score data for last year’s tests

tracker <- read_csv("raw_tracker.csv") #### Read from LMS 
 #### Folder GoH Testing > Cohort17-21 > AY17-18 Class 9 BL-ML-EL
glimpse(tracker)
Observations: 5,260
Variables: 9
$ `Centre Name` <chr> "AMSSS Agroha, Hisar", "AMSSS Agroha, Hisar", "AMSSS Agroha, Hisar",...
$ `Student ID`  <chr> "HI10100001", "HI10100003", "HI10100005", "HI10100007", "HI10100009"...
$ ENG_BL        <int> 12, 8, 13, 13, 13, 11, NA, 12, 11, NA, NA, 8, NA, 10, NA, 8, 5, 3, 1...
$ MTH_BL        <int> 9, 12, 10, 20, 10, 13, 9, 14, 10, 14, 18, 18, 15, 15, 22, 12, 12, 9,...
$ MTH_EL        <int> 29, NA, 23, NA, NA, NA, NA, 35, 19, NA, NA, 38, NA, NA, 23, 12, NA, ...
$ MTH_ML        <int> 15, 8, 12, 16, 11, 11, 11, 22, NA, 13, NA, 22, NA, 17, 13, 8, 9, 7, ...
$ SCI_BL        <int> 9, 19, 15, 17, 26, 20, 9, 15, 16, 20, 17, 19, 15, 19, 23, 13, 8, 14,...
$ SCI_EL        <int> 13, NA, 9, NA, NA, NA, NA, 16, 8, NA, NA, 23, NA, NA, 8, 12, NA, NA,...
$ SCI_ML        <int> 12, 14, 15, 15, 11, 10, 8, 15, NA, 11, NA, 19, NA, 12, 11, 12, 12, 1...

Reading scores for Baseline 2 along with subject and grade level scores

df10 <- read_csv("df10_raw.csv") #### Evaluated from OMR data
#### Folder GoH Testing > AY18-19 > Baseline Eng Medium 59 Schools [Oct]
 
glimpse(df10)
Observations: 3,361
Variables: 13
$ X1           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20...
$ Student.ID   <chr> "HI10100051", "HI10100023", "HI10100119", "HI10100131", "HI10100049",...
$ Sheet.ID     <chr> "100673", "100643", "100644", "100645", "100646", "100648", "100649",...
$ Set          <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",...
$ School.Name  <chr> "AMSSS Agroha 10th.xlsx", "AMSSS Agroha 10th.xlsx", "AMSSS Agroha 10t...
$ Grade        <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...
$ Total        <int> 57, 62, 47, 44, 57, 70, 66, 52, 53, 54, 66, 42, 56, 60, 33, 58, 46, 5...
$ MTH_BL2      <int> 34, 35, 24, 18, 35, 38, 36, 24, 27, 28, 37, 28, 32, 30, 18, 29, 25, 3...
$ SCI_BL2      <int> 23, 27, 23, 26, 22, 32, 30, 28, 26, 26, 29, 14, 24, 30, 15, 29, 21, 2...
$ MTH_BL2_AT9  <int> 15, 13, 9, 8, 14, 16, 15, 9, 10, 10, 16, 9, 11, 10, 6, 11, 8, 12, 6, ...
$ SCI_BL2_AT9  <int> 8, 13, 9, 11, 11, 15, 12, 11, 9, 9, 12, 4, 10, 12, 7, 11, 6, 12, 7, 3...
$ MTH_BL2_BEL9 <int> 19, 22, 15, 10, 21, 22, 21, 15, 17, 18, 21, 19, 21, 20, 12, 18, 17, 1...
$ SCI_BL2_BEL9 <int> 15, 14, 14, 15, 11, 17, 18, 17, 17, 17, 17, 10, 14, 18, 8, 18, 15, 17...

Data Cleaning

Joining both tables

tracker %<>%
  inner_join(df10, by = c(`Student ID` = "Student.ID")) #### Notice " " for common variables
 (tracker)

Removing unnecessary columns

tracker %<>%
  select(-MTH_ML, -SCI_ML, -MTH_EL, -SCI_EL, -ENG_BL, -X1, -Sheet.ID, -Set, -School.Name, -Grade, -Total)
 (tracker)

Keeping only students who gave both tests

tracker %<>%
  filter(!is.na(MTH_BL)) %>%
  filter(!is.na(MTH_BL2))
dim(tracker)
[1] 2701   10

Combining Math and Science Scores to get Baseline 1 and Baseline 2 (Grade 8 and below only) scores

gain <- tracker %>%
  mutate(BL1 = (MTH_BL + SCI_BL)) %>%
  mutate(BL2_BEL9 = (MTH_BL2_BEL9 + SCI_BL2_BEL9)) %>%
  select(`Centre Name`, `Student ID`, BL1, BL2_BEL9)
 (gain)

Absolute Improvement on Normalized Scores

Min Max Scaling of Raw Scores to a range of [0,100]

normalize <- function(x) {
    return (round(100*(x - min(x)) / (max(x) - min(x))))
  }
gain %<>%
  bind_cols(as.data.frame(lapply(gain["BL1"], normalize))) %>%
  bind_cols(as.data.frame(lapply(gain["BL2_BEL9"], normalize))) %>%
  rename(norm_BL1 = BL11) %>%
  rename(norm_BL2_BEL9 = BL2_BEL91)
 (gain)

Average score per school and improvement

school_gain <- gain %>%
  group_by(`Centre Name`) %>%
  summarise(count = n(),
            Avg_Norm_BL1 = round(mean(norm_BL1)),
            Avg_Norm_BL2_BEL9 = round(mean(norm_BL2_BEL9)),
            Delta_Avg = Avg_Norm_BL2_BEL9 - Avg_Norm_BL1
            ) %>%
  arrange(Delta_Avg)
school_gain

Plotting normalized scores by student

gain %>%
  ggplot(aes(x = norm_BL1, y = norm_BL2_BEL9 )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = `Centre Name`)) + 
  geom_smooth(method = "lm", se=FALSE) +      #### Removes the 95% confidence interval
  theme(legend.position="none") +
  xlim(0,100) +
  ylim(0,100)

Plotting school-wise average scores

school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2_BEL9, label = `Centre Name` )) +     #### No quotes needed in aes x and y arguement
  geom_point() + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  geom_text(check_overlap = TRUE, size = 2) +
  theme(legend.position="none")

Relative gain within school-groups based on performance

Let’s classify these schools into 3 categories based on their Avg Score in BL1

school_gain %<>%
  mutate(Performance = if_else(Avg_Norm_BL1 < 35, "Low", 
                               if_else(Avg_Norm_BL1 < 45, "Mid", "High")
                               )
         )
school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2_BEL9, label = `Centre Name` )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = factor(Performance), fill = factor(Performance))) + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  geom_text(check_overlap = TRUE, size = 2) +
  theme(legend.position="none") +
  scale_colour_manual(values = c("Low" = "red", "Mid" = "blue", "High" = "green"))

Mapping school performance to student data

gain %<>%
  left_join(school_gain, by = "Centre Name") %>%
  select(-count, -Avg_Norm_BL1, -Avg_Norm_BL2_BEL9, -Delta_Avg)
(gain)

Checking student gain at Low performing schools first

low_performing <- gain %>%
  filter(Performance == "Low")
#### Getting linear regression fit
low_mod <- lm(norm_BL2_BEL9 ~ norm_BL1, data = low_performing)
low_performing %<>% 
  add_predictions(low_mod) %>%
  add_residuals(low_mod)
low_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) )
low_performing %<>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))
#### Count of UP / DOWN and SAME per school for Low performing schools
low_performing %>%
  group_by(`Centre Name`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%               #### Replace missing values from spread with 0
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Checking student gain of Middle Performing Schools

mid_performing <- gain %>%
  filter(Performance == "Mid")
#### Getting linear regression fit
mid_mod <- lm(norm_BL2_BEL9 ~ norm_BL1, data = mid_performing)
mid_performing %<>% 
  add_predictions(mid_mod) %>%
  add_residuals(mid_mod)
mid_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))
#### Count of UP / DOWN and SAME per school for Low performing schools
mid_performing %>%
  group_by(`Centre Name`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Checking student gain of High Performing Schools

high_performing <- gain %>%
  filter(Performance == "High")
#### Getting linear regression fit
high_mod <- lm(norm_BL2_BEL9 ~ norm_BL1, data = high_performing)
high_performing %<>% 
  add_predictions(high_mod) %>%
  add_residuals(high_mod)
high_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))
#### Count of UP / DOWN and SAME per school for Low performing schools
high_performing %>%
  group_by(`Centre Name`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)
---
title: "Baseline Scores Comparison - 2017 v 2018"
output: html_notebook
---

## Loading Data
#### Loading libraries
```{r echo=TRUE, warning=FALSE, message=FALSE}
library(tidyverse)
library(magrittr)
#library(scales)
library(modelr)
```

#### Reading raw score data for last year's tests
```{r echo=TRUE, message=FALSE}
tracker <- read_csv("raw_tracker.csv") #### Read from LMS 
 #### Folder GoH Testing > Cohort17-21 > AY17-18 Class 9 BL-ML-EL

glimpse(tracker)
```

#### Reading scores for Baseline 2 along with subject and grade level scores
```{r echo=TRUE, message=FALSE, warning=FALSE}
df10 <- read_csv("df10_raw.csv") #### Evaluated from OMR data
#### Folder GoH Testing > AY18-19 > Baseline Eng Medium 59 Schools [Oct]
 
glimpse(df10)
```

## Data Cleaning
#### Joining both tables
```{r echo=TRUE}
tracker %<>%
  inner_join(df10, by = c(`Student ID` = "Student.ID")) #### Notice " " for common variables

 (tracker)
```

#### Removing unnecessary columns
```{r echo=TRUE}
tracker %<>%
  select(-MTH_ML, -SCI_ML, -MTH_EL, -SCI_EL, -ENG_BL, -X1, -Sheet.ID, -Set, -School.Name, -Grade, -Total)

 (tracker)
```

#### Keeping only students who gave both tests
```{r echo=TRUE}
tracker %<>%
  filter(!is.na(MTH_BL)) %>%
  filter(!is.na(MTH_BL2))

dim(tracker)
```

#### Combining Math and Science Scores to get Baseline 1 and Baseline 2 (Grade 8 and below only) scores
```{r echo=TRUE}
gain <- tracker %>%
  mutate(BL1 = (MTH_BL + SCI_BL)) %>%
  mutate(BL2_BEL9 = (MTH_BL2_BEL9 + SCI_BL2_BEL9)) %>%
  select(`Centre Name`, `Student ID`, BL1, BL2_BEL9)

 (gain)
```

## Absolute Improvement on Normalized Scores
#### Min Max Scaling of Raw Scores to a range of [0,100]
```{r echo=TRUE}
normalize <- function(x) {
    return (round(100*(x - min(x)) / (max(x) - min(x))))
  }

gain %<>%
  bind_cols(as.data.frame(lapply(gain["BL1"], normalize))) %>%
  bind_cols(as.data.frame(lapply(gain["BL2_BEL9"], normalize))) %>%
  rename(norm_BL1 = BL11) %>%
  rename(norm_BL2_BEL9 = BL2_BEL91)

 (gain)
```

#### Average score per school and improvement
```{r echo=TRUE}
school_gain <- gain %>%
  group_by(`Centre Name`) %>%
  summarise(count = n(),
            Avg_Norm_BL1 = round(mean(norm_BL1)),
            Avg_Norm_BL2_BEL9 = round(mean(norm_BL2_BEL9)),
            Delta_Avg = Avg_Norm_BL2_BEL9 - Avg_Norm_BL1
            ) %>%
  arrange(Delta_Avg)

school_gain
```


#### Plotting normalized scores by student
```{r echo=TRUE}
gain %>%
  ggplot(aes(x = norm_BL1, y = norm_BL2_BEL9 )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = `Centre Name`)) + 
  geom_smooth(method = "lm", se=FALSE) +      #### Removes the 95% confidence interval
  theme(legend.position="none") +
  xlim(0,100) +
  ylim(0,100)
```

#### Plotting school-wise average scores
```{r echo=TRUE}
school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2_BEL9, label = `Centre Name` )) +     #### No quotes needed in aes x and y arguement
  geom_point() + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  geom_text(check_overlap = TRUE, size = 2) +
  theme(legend.position="none")
```

## Relative gain within school-groups based on performance 
#### Let's classify these schools into 3 categories based on their Avg Score in BL1
```{r echo=TRUE}
school_gain %<>%
  mutate(Performance = if_else(Avg_Norm_BL1 < 35, "Low", 
                               if_else(Avg_Norm_BL1 < 45, "Mid", "High")
                               )
         )

school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2_BEL9, label = `Centre Name` )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = factor(Performance), fill = factor(Performance))) + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  geom_text(check_overlap = TRUE, size = 2) +
  theme(legend.position="none") +
  scale_colour_manual(values = c("Low" = "red", "Mid" = "blue", "High" = "green"))
```

#### Mapping school performance to student data
```{r echo=TRUE}
gain %<>%
  left_join(school_gain, by = "Centre Name") %>%
  select(-count, -Avg_Norm_BL1, -Avg_Norm_BL2_BEL9, -Delta_Avg)

(gain)
```


#### Checking student gain at Low performing schools first
```{r echo=TRUE}
low_performing <- gain %>%
  filter(Performance == "Low")
#### Getting linear regression fit
low_mod <- lm(norm_BL2_BEL9 ~ norm_BL1, data = low_performing)

low_performing %<>% 
  add_predictions(low_mod) %>%
  add_residuals(low_mod)

low_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) )

low_performing %<>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
low_performing %>%
  group_by(`Centre Name`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%               #### Replace missing values from spread with 0
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)
```

#### Checking student gain of Middle Performing Schools
```{r echo=TRUE}
mid_performing <- gain %>%
  filter(Performance == "Mid")
#### Getting linear regression fit
mid_mod <- lm(norm_BL2_BEL9 ~ norm_BL1, data = mid_performing)

mid_performing %<>% 
  add_predictions(mid_mod) %>%
  add_residuals(mid_mod)

mid_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
mid_performing %>%
  group_by(`Centre Name`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)
```

#### Checking student gain of High Performing Schools
```{r echo=TRUE}
high_performing <- gain %>%
  filter(Performance == "High")
#### Getting linear regression fit
high_mod <- lm(norm_BL2_BEL9 ~ norm_BL1, data = high_performing)

high_performing %<>% 
  add_predictions(high_mod) %>%
  add_residuals(high_mod)

high_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
high_performing %>%
  group_by(`Centre Name`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)
```
