Comparison of Baseline scores

Loading Data
Download final scores from LMS
Filter “OR-D-10-18”, “OR-D-20-18”, spread by code & rename to MTH_BL2, SCI_BL2
Data Cleaning
Absolute Improvement on Normalized Scores
Get school-wise intervention factor
Adding school type to School wise and student wise gain
Let’s try a boxplots for the 2 groups
Adding histograms
Relative gain within school-groups based on performance
Write to CSV
Regression with categorical variable of Intervention
Let’s try including Endline Scores
Normalizing all scores
Adding School Type
Linear model with categorical variable
Up/down

Loading Data

Loading libraries

library(tidyverse)
library(magrittr)
library(RCurl)
library(modelr)
library(gridExtra)

Reading raw score data for last year’s tests

tracker <- read_csv("raw_tracker.csv") #### Read from LMS 
 #### Folder GoH Testing > Cohort17-21 > AY17-18 Class 9 BL-ML-EL

glimpse(tracker)

## Observations: 5,260
## Variables: 9
## $ `Centre Name` <chr> "AMSSS Agroha, Hisar", "AMSSS Agroha, Hisar", "AMS…
## $ `Student ID`  <chr> "HI10100001", "HI10100003", "HI10100005", "HI10100…
## $ ENG_BL        <int> 12, 8, 13, 13, 13, 11, NA, 12, 11, NA, NA, 8, NA, …
## $ MTH_BL        <int> 9, 12, 10, 20, 10, 13, 9, 14, 10, 14, 18, 18, 15, …
## $ MTH_EL        <int> 29, NA, 23, NA, NA, NA, NA, 35, 19, NA, NA, 38, NA…
## $ MTH_ML        <int> 15, 8, 12, 16, 11, 11, 11, 22, NA, 13, NA, 22, NA,…
## $ SCI_BL        <int> 9, 19, 15, 17, 26, 20, 9, 15, 16, 20, 17, 19, 15, …
## $ SCI_EL        <int> 13, NA, 9, NA, NA, NA, NA, 16, 8, NA, NA, 23, NA, …
## $ SCI_ML        <int> 12, 14, 15, 15, 11, 10, 8, 15, NA, 11, NA, 19, NA,…

Download final scores from LMS

scores <- read.delim(
  textConnection(
getURL("https://s3-ap-southeast-1.amazonaws.com/avanti-dump/dashboard-dumps/onlinescores/onlinetestdump.tsv")),
  header = TRUE, stringsAsFactors = FALSE)
 
head(scores)

Filter “OR-D-10-18”, “OR-D-20-18”, spread by code & rename to MTH_BL2, SCI_BL2

bl_scores <- scores %>% filter(Test.Code %in% c("OR-D-10-18", "OR-D-20-18"))

df10 <- bl_scores %>% select(Student.ID, Test.Score, Test.Code) %>%
  spread(key = Test.Code, value = Test.Score) %>%
  rename(MTH_BL2 = `OR-D-10-18`, SCI_BL2 = `OR-D-20-18`)

head(df10)

Data Cleaning

Joining both tables

tracker %<>%
  inner_join(df10, by = c(`Student ID` = "Student.ID")) #### Notice " " for common variables

sample_n(tracker, 10)

Removing unnecessary columns

tracker %<>%
  select(-MTH_ML, -SCI_ML, -ENG_BL)

head(tracker)

Keeping only students who gave both tests

gain <- tracker %>%
  filter(!is.na(MTH_BL)) %>%
  filter(!is.na(MTH_BL2))

dim(gain)

## [1] 2716    8

Combining Math and Science Scores to get Baseline 1 and Baseline 2

gain %<>%
  mutate(BL1 = (MTH_BL + SCI_BL)) %>%
  mutate(BL2 = (MTH_BL2 + SCI_BL2)) %>%
  select(`Centre Name`, `Student ID`, BL1, BL2)

sample_n(gain, 10)

Absolute Improvement on Normalized Scores

Min Max Scaling of Raw Scores to a range of [0,100]

normalize <- function(x) {
    return (round(100*(x - min(x)) / (max(x) - min(x))))
  }

gain %<>%
  bind_cols(as.data.frame(lapply(gain["BL1"], normalize))) %>%
  bind_cols(as.data.frame(lapply(gain["BL2"], normalize))) %>%
  rename(norm_BL1 = BL11) %>%
  rename(norm_BL2 = BL21)

head(gain)

Average score per school and improvement

school_gain <- gain %>%
  group_by(`Centre Name`) %>%
  summarise(count = n(),
            Avg_Norm_BL1 = round(mean(norm_BL1)),
            Avg_Norm_BL2 = round(mean(norm_BL2)),
            Delta_Avg = Avg_Norm_BL2 - Avg_Norm_BL1
            ) %>%
  arrange(Delta_Avg)

school_gain

Get school-wise intervention factor

ss <- read_csv("59_Assessment_Report_2018 - Copy of Sheet1.csv")

## Warning: Missing column names filled in: 'X7' [7]

## Parsed with column specification:
## cols(
##   Schools = col_character(),
##   `Baseline score 2017` = col_character(),
##   `Baseline score 2018` = col_character(),
##   Increase = col_character(),
##   `School Type` = col_character(),
##   `Reason (infra/teacher)` = col_character(),
##   X7 = col_character(),
##   `Percentage increase` = col_character()
## )

head(ss)

Adding school type to School wise and student wise gain

school_gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))
gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))

head(school_gain)

Plotting normalized scores by student

gain %>%
  ggplot(aes(x = norm_BL1, y = norm_BL2 )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = as.factor(`School Type`))) + 
  geom_smooth(method = "lm", se=FALSE) +      #### Removes the 95% confidence interval
  #theme(legend.position="none") +
  xlim(0,100) +
  ylim(0,100)

Let’s try a boxplots for the 2 groups

bl1 <- gain %>%
  ggplot() +     #### No quotes needed in aes x and y arguement
  geom_boxplot(aes(x = as.factor(`School Type`), y = norm_BL1)) 
  #theme(legend.position="none") +

bl2 <- gain %>%
  ggplot() +     #### No quotes needed in aes x and y arguement
  geom_boxplot(aes(x = as.factor(`School Type`), y = norm_BL2)) 

grid.arrange(bl1, bl2, nrow = 2)

Adding histograms

bl1_hist <- gain %>%
  ggplot(aes(x = norm_BL1)) +     #### No quotes needed in aes x and y arguement
  geom_density(aes(colour = as.factor(`School Type`), y = (..density..) )) +
  ylab("Density")
  #theme(legend.position="none") +

bl2_hist <- gain %>%
  ggplot(aes(x = norm_BL2)) +     #### No quotes needed in aes x and y arguement
  geom_density(aes(colour = as.factor(`School Type`), y = (..density..) )) +
  ylab("Density")

grid.arrange(bl1_hist, bl2_hist, nrow = 2)

Plotting school-wise average scores

school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2, label = `Centre Name`, colour = as.factor(`School Type`) )) +     #### No quotes needed in aes x and y arguement
  geom_point() + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  #theme(legend.position="none") +
  geom_text(check_overlap = TRUE, size = 2)

Relative gain within school-groups based on performance

Let’s classify these schools into 3 categories based on their Avg Score in BL1

school_gain %<>%
  mutate(Performance = if_else(Avg_Norm_BL1 < 35, "Low", 
                               if_else(Avg_Norm_BL1 < 45, "Mid", "High")
                               )
         )

school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2, label = `Centre Name` )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = factor(Performance), fill = factor(Performance), shape = as.factor(`School Type`))) + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  geom_text(check_overlap = TRUE, size = 2) +
  theme(legend.position="none") +
  scale_colour_manual(values = c("Low" = "red", "Mid" = "blue", "High" = "green"))

Mapping school performance to student data

gain %<>%
  left_join(select(school_gain, - `School Type`) , by = "Centre Name") %>%
  select(-count, -Avg_Norm_BL1, -Avg_Norm_BL2,  -Delta_Avg)

(gain)

Checking student gain at Low performing schools first

low_performing <- gain %>%
  filter(Performance == "Low")
#### Getting linear regression fit
low_mod <- lm(norm_BL2 ~ norm_BL1, data = low_performing)

low_performing %<>% 
  add_predictions(low_mod) %>%
  add_residuals(low_mod)

low_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) )

low_performing %<>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
low_performing %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%               #### Replace missing values from spread with 0
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Checking student gain of Middle Performing Schools

mid_performing <- gain %>%
  filter(Performance == "Mid")
#### Getting linear regression fit
mid_mod <- lm(norm_BL2 ~ norm_BL1, data = mid_performing)

mid_performing %<>% 
  add_predictions(mid_mod) %>%
  add_residuals(mid_mod)

mid_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
mid_performing %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Checking student gain of High Performing Schools

high_performing <- gain %>%
  filter(Performance == "High")
#### Getting linear regression fit
high_mod <- lm(norm_BL2 ~ norm_BL1, data = high_performing)

high_performing %<>% 
  add_predictions(high_mod) %>%
  add_residuals(high_mod)

high_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
high_performing %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Write to CSV

write_csv(high_performing, "high_performing.csv")
write_csv(low_performing, "low_performing.csv")
write_csv(mid_performing, "mid_performing.csv")

Regression with categorical variable of Intervention

cat_mod <- lm(norm_BL2 ~ norm_BL1 + as.factor(`School Type`), data = gain)

coef(cat_mod)

##                            (Intercept) 
##                             17.0436764 
##                               norm_BL1 
##                              0.7946415 
## as.factor(`School Type`)Only workbooks 
##                             -3.0084193

Negative coefficeint if intervention has only workbooks available !!

Let’s try including Endline Scores

el_gain <- tracker %>%
  filter(!is.na(MTH_BL)) %>%
  filter(!is.na(MTH_BL2)) %>%
  filter(!is.na(MTH_EL))

el_gain %<>%
  mutate(BL1 = (MTH_BL + SCI_BL)) %>%
  mutate(BL2 = (MTH_BL2 + SCI_BL2)) %>%
  mutate(EL = MTH_EL+SCI_EL) %>%
  select(`Centre Name`, `Student ID`, BL1, EL, BL2 )

head(el_gain)

Normalizing all scores

normalize <- function(x) {
    return (round(100*(x - min(x)) / (max(x) - min(x))))
  }

el_gain %<>%
  bind_cols(as.data.frame(lapply(el_gain["BL1"], normalize))) %>%
  bind_cols(as.data.frame(lapply(el_gain["EL"], normalize))) %>%
  bind_cols(as.data.frame(lapply(el_gain["BL2"], normalize))) %>%
  rename(norm_BL1 = BL11) %>%
  rename(norm_EL = EL1) %>%
  rename(norm_BL2 = BL21)

head(el_gain)

Adding School Type

el_gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))

head(el_gain)

Linear model with categorical variable

cat_el_mod <- lm(norm_BL2 ~ norm_BL1 + norm_EL + as.factor(`School Type`), data = el_gain)

coef(cat_el_mod)

##                            (Intercept) 
##                              9.5912565 
##                               norm_BL1 
##                              0.3738467 
##                                norm_EL 
##                              0.7727426 
## as.factor(`School Type`)Only workbooks 
##                             -2.8933468

Up/down

el_gain %<>% 
  add_predictions(cat_el_mod) %>%
  add_residuals(cat_el_mod) %>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 10, "UP", (if_else(delta > -10, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
el_gain %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Comparison of Baseline scores - 2017 v 2018

Pritesh Shrivastava

23 April, 2019

Loading Data

Loading libraries

Reading raw score data for last year’s tests

Download final scores from LMS

Filter “OR-D-10-18”, “OR-D-20-18”, spread by code & rename to MTH_BL2, SCI_BL2

Data Cleaning

Joining both tables

Removing unnecessary columns

Keeping only students who gave both tests

Combining Math and Science Scores to get Baseline 1 and Baseline 2

Absolute Improvement on Normalized Scores

Min Max Scaling of Raw Scores to a range of [0,100]

Average score per school and improvement

Get school-wise intervention factor

Adding school type to School wise and student wise gain

Plotting normalized scores by student

Let’s try a boxplots for the 2 groups

Adding histograms

Plotting school-wise average scores

Relative gain within school-groups based on performance

Let’s classify these schools into 3 categories based on their Avg Score in BL1

Mapping school performance to student data

Checking student gain at Low performing schools first

Checking student gain of Middle Performing Schools

Checking student gain of High Performing Schools

Write to CSV

Regression with categorical variable of Intervention

Let’s try including Endline Scores

Normalizing all scores

Adding School Type

Linear model with categorical variable

Up/down