Loading Data

Loading libraries

library(tidyverse)
library(magrittr)
library(RCurl)
library(modelr)
library(gridExtra)

Reading raw score data for last year’s tests

tracker <- read_csv("raw_tracker.csv") #### Read from LMS 
 #### Folder GoH Testing > Cohort17-21 > AY17-18 Class 9 BL-ML-EL

glimpse(tracker)
## Observations: 5,260
## Variables: 9
## $ `Centre Name` <chr> "AMSSS Agroha, Hisar", "AMSSS Agroha, Hisar", "AMS…
## $ `Student ID`  <chr> "HI10100001", "HI10100003", "HI10100005", "HI10100…
## $ ENG_BL        <int> 12, 8, 13, 13, 13, 11, NA, 12, 11, NA, NA, 8, NA, …
## $ MTH_BL        <int> 9, 12, 10, 20, 10, 13, 9, 14, 10, 14, 18, 18, 15, …
## $ MTH_EL        <int> 29, NA, 23, NA, NA, NA, NA, 35, 19, NA, NA, 38, NA…
## $ MTH_ML        <int> 15, 8, 12, 16, 11, 11, 11, 22, NA, 13, NA, 22, NA,…
## $ SCI_BL        <int> 9, 19, 15, 17, 26, 20, 9, 15, 16, 20, 17, 19, 15, …
## $ SCI_EL        <int> 13, NA, 9, NA, NA, NA, NA, 16, 8, NA, NA, 23, NA, …
## $ SCI_ML        <int> 12, 14, 15, 15, 11, 10, 8, 15, NA, 11, NA, 19, NA,…

Download final scores from LMS

scores <- read.delim(
  textConnection(
getURL("https://s3-ap-southeast-1.amazonaws.com/avanti-dump/dashboard-dumps/onlinescores/onlinetestdump.tsv")),
  header = TRUE, stringsAsFactors = FALSE)
 
head(scores)

Filter “OR-D-10-18”, “OR-D-20-18”, spread by code & rename to MTH_BL2, SCI_BL2

bl_scores <- scores %>% filter(Test.Code %in% c("OR-D-10-18", "OR-D-20-18"))

df10 <- bl_scores %>% select(Student.ID, Test.Score, Test.Code) %>%
  spread(key = Test.Code, value = Test.Score) %>%
  rename(MTH_BL2 = `OR-D-10-18`, SCI_BL2 = `OR-D-20-18`)

head(df10)

Data Cleaning

Joining both tables

tracker %<>%
  inner_join(df10, by = c(`Student ID` = "Student.ID")) #### Notice " " for common variables

sample_n(tracker, 10)

Removing unnecessary columns

tracker %<>%
  select(-MTH_ML, -SCI_ML, -ENG_BL)

head(tracker)

Keeping only students who gave both tests

gain <- tracker %>%
  filter(!is.na(MTH_BL)) %>%
  filter(!is.na(MTH_BL2))

dim(gain)
## [1] 2716    8

Combining Math and Science Scores to get Baseline 1 and Baseline 2

gain %<>%
  mutate(BL1 = (MTH_BL + SCI_BL)) %>%
  mutate(BL2 = (MTH_BL2 + SCI_BL2)) %>%
  select(`Centre Name`, `Student ID`, BL1, BL2)

sample_n(gain, 10)

Absolute Improvement on Normalized Scores

Min Max Scaling of Raw Scores to a range of [0,100]

normalize <- function(x) {
    return (round(100*(x - min(x)) / (max(x) - min(x))))
  }

gain %<>%
  bind_cols(as.data.frame(lapply(gain["BL1"], normalize))) %>%
  bind_cols(as.data.frame(lapply(gain["BL2"], normalize))) %>%
  rename(norm_BL1 = BL11) %>%
  rename(norm_BL2 = BL21)

head(gain)

Average score per school and improvement

school_gain <- gain %>%
  group_by(`Centre Name`) %>%
  summarise(count = n(),
            Avg_Norm_BL1 = round(mean(norm_BL1)),
            Avg_Norm_BL2 = round(mean(norm_BL2)),
            Delta_Avg = Avg_Norm_BL2 - Avg_Norm_BL1
            ) %>%
  arrange(Delta_Avg)

school_gain

Get school-wise intervention factor

ss <- read_csv("59_Assessment_Report_2018 - Copy of Sheet1.csv")
## Warning: Missing column names filled in: 'X7' [7]
## Parsed with column specification:
## cols(
##   Schools = col_character(),
##   `Baseline score 2017` = col_character(),
##   `Baseline score 2018` = col_character(),
##   Increase = col_character(),
##   `School Type` = col_character(),
##   `Reason (infra/teacher)` = col_character(),
##   X7 = col_character(),
##   `Percentage increase` = col_character()
## )
head(ss)

Adding school type to School wise and student wise gain

school_gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))
gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))

head(school_gain)

Plotting normalized scores by student

gain %>%
  ggplot(aes(x = norm_BL1, y = norm_BL2 )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = as.factor(`School Type`))) + 
  geom_smooth(method = "lm", se=FALSE) +      #### Removes the 95% confidence interval
  #theme(legend.position="none") +
  xlim(0,100) +
  ylim(0,100)

Let’s try a boxplots for the 2 groups

bl1 <- gain %>%
  ggplot() +     #### No quotes needed in aes x and y arguement
  geom_boxplot(aes(x = as.factor(`School Type`), y = norm_BL1)) 
  #theme(legend.position="none") +

bl2 <- gain %>%
  ggplot() +     #### No quotes needed in aes x and y arguement
  geom_boxplot(aes(x = as.factor(`School Type`), y = norm_BL2)) 

grid.arrange(bl1, bl2, nrow = 2)

Adding histograms

bl1_hist <- gain %>%
  ggplot(aes(x = norm_BL1)) +     #### No quotes needed in aes x and y arguement
  geom_density(aes(colour = as.factor(`School Type`), y = (..density..) )) +
  ylab("Density")
  #theme(legend.position="none") +

bl2_hist <- gain %>%
  ggplot(aes(x = norm_BL2)) +     #### No quotes needed in aes x and y arguement
  geom_density(aes(colour = as.factor(`School Type`), y = (..density..) )) +
  ylab("Density")

grid.arrange(bl1_hist, bl2_hist, nrow = 2)

Plotting school-wise average scores

school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2, label = `Centre Name`, colour = as.factor(`School Type`) )) +     #### No quotes needed in aes x and y arguement
  geom_point() + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  #theme(legend.position="none") +
  geom_text(check_overlap = TRUE, size = 2)

Relative gain within school-groups based on performance

Let’s classify these schools into 3 categories based on their Avg Score in BL1

school_gain %<>%
  mutate(Performance = if_else(Avg_Norm_BL1 < 35, "Low", 
                               if_else(Avg_Norm_BL1 < 45, "Mid", "High")
                               )
         )

school_gain %>%
  ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2, label = `Centre Name` )) +     #### No quotes needed in aes x and y arguement
  geom_point(aes(colour = factor(Performance), fill = factor(Performance), shape = as.factor(`School Type`))) + 
  #geom_smooth(method = "lm", se=FALSE) +     #### Removes the 95% confidence interval
  #xlim(0,100) +
  #ylim(0,100)
  geom_text(check_overlap = TRUE, size = 2) +
  theme(legend.position="none") +
  scale_colour_manual(values = c("Low" = "red", "Mid" = "blue", "High" = "green"))

Mapping school performance to student data

gain %<>%
  left_join(select(school_gain, - `School Type`) , by = "Centre Name") %>%
  select(-count, -Avg_Norm_BL1, -Avg_Norm_BL2,  -Delta_Avg)

(gain)

Checking student gain at Low performing schools first

low_performing <- gain %>%
  filter(Performance == "Low")
#### Getting linear regression fit
low_mod <- lm(norm_BL2 ~ norm_BL1, data = low_performing)

low_performing %<>% 
  add_predictions(low_mod) %>%
  add_residuals(low_mod)

low_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) )

low_performing %<>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
low_performing %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%               #### Replace missing values from spread with 0
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Checking student gain of Middle Performing Schools

mid_performing <- gain %>%
  filter(Performance == "Mid")
#### Getting linear regression fit
mid_mod <- lm(norm_BL2 ~ norm_BL1, data = mid_performing)

mid_performing %<>% 
  add_predictions(mid_mod) %>%
  add_residuals(mid_mod)

mid_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
mid_performing %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Checking student gain of High Performing Schools

high_performing <- gain %>%
  filter(Performance == "High")
#### Getting linear regression fit
high_mod <- lm(norm_BL2 ~ norm_BL1, data = high_performing)

high_performing %<>% 
  add_predictions(high_mod) %>%
  add_residuals(high_mod)

high_performing %<>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
high_performing %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)

Write to CSV

write_csv(high_performing, "high_performing.csv")
write_csv(low_performing, "low_performing.csv")
write_csv(mid_performing, "mid_performing.csv")

Regression with categorical variable of Intervention

cat_mod <- lm(norm_BL2 ~ norm_BL1 + as.factor(`School Type`), data = gain)

coef(cat_mod)
##                            (Intercept) 
##                             17.0436764 
##                               norm_BL1 
##                              0.7946415 
## as.factor(`School Type`)Only workbooks 
##                             -3.0084193

Negative coefficeint if intervention has only workbooks available !!

Let’s try including Endline Scores

el_gain <- tracker %>%
  filter(!is.na(MTH_BL)) %>%
  filter(!is.na(MTH_BL2)) %>%
  filter(!is.na(MTH_EL))

el_gain %<>%
  mutate(BL1 = (MTH_BL + SCI_BL)) %>%
  mutate(BL2 = (MTH_BL2 + SCI_BL2)) %>%
  mutate(EL = MTH_EL+SCI_EL) %>%
  select(`Centre Name`, `Student ID`, BL1, EL, BL2 )

head(el_gain)

Normalizing all scores

normalize <- function(x) {
    return (round(100*(x - min(x)) / (max(x) - min(x))))
  }

el_gain %<>%
  bind_cols(as.data.frame(lapply(el_gain["BL1"], normalize))) %>%
  bind_cols(as.data.frame(lapply(el_gain["EL"], normalize))) %>%
  bind_cols(as.data.frame(lapply(el_gain["BL2"], normalize))) %>%
  rename(norm_BL1 = BL11) %>%
  rename(norm_EL = EL1) %>%
  rename(norm_BL2 = BL21)

head(el_gain)

Adding School Type

el_gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))

head(el_gain)

Linear model with categorical variable

cat_el_mod <- lm(norm_BL2 ~ norm_BL1 + norm_EL + as.factor(`School Type`), data = el_gain)

coef(cat_el_mod)
##                            (Intercept) 
##                              9.5912565 
##                               norm_BL1 
##                              0.3738467 
##                                norm_EL 
##                              0.7727426 
## as.factor(`School Type`)Only workbooks 
##                             -2.8933468

Up/down

el_gain %<>% 
  add_predictions(cat_el_mod) %>%
  add_residuals(cat_el_mod) %>%
  mutate(delta = round(100*resid / norm_BL1) ) %>%
  mutate(movement = if_else(delta > 10, "UP", (if_else(delta > -10, "SAME", "DOWN"))))

#### Count of UP / DOWN and SAME per school for Low performing schools
el_gain %>%
  group_by(`School Type`, movement) %>%
  summarise(countCommonStudents = n() ) %>%
  spread(movement, countCommonStudents) %>%
  replace(., is.na(.), 0) %>%
  mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
  arrange(NET_perc_UP)