library(tidyverse)
library(magrittr)
library(RCurl)
library(modelr)
library(gridExtra)tracker <- read_csv("raw_tracker.csv") #### Read from LMS
#### Folder GoH Testing > Cohort17-21 > AY17-18 Class 9 BL-ML-EL
glimpse(tracker)## Observations: 5,260
## Variables: 9
## $ `Centre Name` <chr> "AMSSS Agroha, Hisar", "AMSSS Agroha, Hisar", "AMS…
## $ `Student ID` <chr> "HI10100001", "HI10100003", "HI10100005", "HI10100…
## $ ENG_BL <int> 12, 8, 13, 13, 13, 11, NA, 12, 11, NA, NA, 8, NA, …
## $ MTH_BL <int> 9, 12, 10, 20, 10, 13, 9, 14, 10, 14, 18, 18, 15, …
## $ MTH_EL <int> 29, NA, 23, NA, NA, NA, NA, 35, 19, NA, NA, 38, NA…
## $ MTH_ML <int> 15, 8, 12, 16, 11, 11, 11, 22, NA, 13, NA, 22, NA,…
## $ SCI_BL <int> 9, 19, 15, 17, 26, 20, 9, 15, 16, 20, 17, 19, 15, …
## $ SCI_EL <int> 13, NA, 9, NA, NA, NA, NA, 16, 8, NA, NA, 23, NA, …
## $ SCI_ML <int> 12, 14, 15, 15, 11, 10, 8, 15, NA, 11, NA, 19, NA,…
scores <- read.delim(
textConnection(
getURL("https://s3-ap-southeast-1.amazonaws.com/avanti-dump/dashboard-dumps/onlinescores/onlinetestdump.tsv")),
header = TRUE, stringsAsFactors = FALSE)
head(scores)bl_scores <- scores %>% filter(Test.Code %in% c("OR-D-10-18", "OR-D-20-18"))
df10 <- bl_scores %>% select(Student.ID, Test.Score, Test.Code) %>%
spread(key = Test.Code, value = Test.Score) %>%
rename(MTH_BL2 = `OR-D-10-18`, SCI_BL2 = `OR-D-20-18`)
head(df10)tracker %<>%
inner_join(df10, by = c(`Student ID` = "Student.ID")) #### Notice " " for common variables
sample_n(tracker, 10)tracker %<>%
select(-MTH_ML, -SCI_ML, -ENG_BL)
head(tracker)gain <- tracker %>%
filter(!is.na(MTH_BL)) %>%
filter(!is.na(MTH_BL2))
dim(gain)## [1] 2716 8
gain %<>%
mutate(BL1 = (MTH_BL + SCI_BL)) %>%
mutate(BL2 = (MTH_BL2 + SCI_BL2)) %>%
select(`Centre Name`, `Student ID`, BL1, BL2)
sample_n(gain, 10)normalize <- function(x) {
return (round(100*(x - min(x)) / (max(x) - min(x))))
}
gain %<>%
bind_cols(as.data.frame(lapply(gain["BL1"], normalize))) %>%
bind_cols(as.data.frame(lapply(gain["BL2"], normalize))) %>%
rename(norm_BL1 = BL11) %>%
rename(norm_BL2 = BL21)
head(gain)school_gain <- gain %>%
group_by(`Centre Name`) %>%
summarise(count = n(),
Avg_Norm_BL1 = round(mean(norm_BL1)),
Avg_Norm_BL2 = round(mean(norm_BL2)),
Delta_Avg = Avg_Norm_BL2 - Avg_Norm_BL1
) %>%
arrange(Delta_Avg)
school_gainss <- read_csv("59_Assessment_Report_2018 - Copy of Sheet1.csv")## Warning: Missing column names filled in: 'X7' [7]
## Parsed with column specification:
## cols(
## Schools = col_character(),
## `Baseline score 2017` = col_character(),
## `Baseline score 2018` = col_character(),
## Increase = col_character(),
## `School Type` = col_character(),
## `Reason (infra/teacher)` = col_character(),
## X7 = col_character(),
## `Percentage increase` = col_character()
## )
head(ss)school_gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))
gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))
head(school_gain)gain %>%
ggplot(aes(x = norm_BL1, y = norm_BL2 )) + #### No quotes needed in aes x and y arguement
geom_point(aes(colour = as.factor(`School Type`))) +
geom_smooth(method = "lm", se=FALSE) + #### Removes the 95% confidence interval
#theme(legend.position="none") +
xlim(0,100) +
ylim(0,100)bl1 <- gain %>%
ggplot() + #### No quotes needed in aes x and y arguement
geom_boxplot(aes(x = as.factor(`School Type`), y = norm_BL1))
#theme(legend.position="none") +
bl2 <- gain %>%
ggplot() + #### No quotes needed in aes x and y arguement
geom_boxplot(aes(x = as.factor(`School Type`), y = norm_BL2))
grid.arrange(bl1, bl2, nrow = 2)bl1_hist <- gain %>%
ggplot(aes(x = norm_BL1)) + #### No quotes needed in aes x and y arguement
geom_density(aes(colour = as.factor(`School Type`), y = (..density..) )) +
ylab("Density")
#theme(legend.position="none") +
bl2_hist <- gain %>%
ggplot(aes(x = norm_BL2)) + #### No quotes needed in aes x and y arguement
geom_density(aes(colour = as.factor(`School Type`), y = (..density..) )) +
ylab("Density")
grid.arrange(bl1_hist, bl2_hist, nrow = 2)school_gain %>%
ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2, label = `Centre Name`, colour = as.factor(`School Type`) )) + #### No quotes needed in aes x and y arguement
geom_point() +
#geom_smooth(method = "lm", se=FALSE) + #### Removes the 95% confidence interval
#xlim(0,100) +
#ylim(0,100)
#theme(legend.position="none") +
geom_text(check_overlap = TRUE, size = 2)school_gain %<>%
mutate(Performance = if_else(Avg_Norm_BL1 < 35, "Low",
if_else(Avg_Norm_BL1 < 45, "Mid", "High")
)
)
school_gain %>%
ggplot(aes(x = Avg_Norm_BL1, y = Avg_Norm_BL2, label = `Centre Name` )) + #### No quotes needed in aes x and y arguement
geom_point(aes(colour = factor(Performance), fill = factor(Performance), shape = as.factor(`School Type`))) +
#geom_smooth(method = "lm", se=FALSE) + #### Removes the 95% confidence interval
#xlim(0,100) +
#ylim(0,100)
geom_text(check_overlap = TRUE, size = 2) +
theme(legend.position="none") +
scale_colour_manual(values = c("Low" = "red", "Mid" = "blue", "High" = "green"))gain %<>%
left_join(select(school_gain, - `School Type`) , by = "Centre Name") %>%
select(-count, -Avg_Norm_BL1, -Avg_Norm_BL2, -Delta_Avg)
(gain)low_performing <- gain %>%
filter(Performance == "Low")
#### Getting linear regression fit
low_mod <- lm(norm_BL2 ~ norm_BL1, data = low_performing)
low_performing %<>%
add_predictions(low_mod) %>%
add_residuals(low_mod)
low_performing %<>%
mutate(delta = round(100*resid / norm_BL1) )
low_performing %<>%
mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))
#### Count of UP / DOWN and SAME per school for Low performing schools
low_performing %>%
group_by(`School Type`, movement) %>%
summarise(countCommonStudents = n() ) %>%
spread(movement, countCommonStudents) %>%
replace(., is.na(.), 0) %>% #### Replace missing values from spread with 0
mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
arrange(NET_perc_UP)mid_performing <- gain %>%
filter(Performance == "Mid")
#### Getting linear regression fit
mid_mod <- lm(norm_BL2 ~ norm_BL1, data = mid_performing)
mid_performing %<>%
add_predictions(mid_mod) %>%
add_residuals(mid_mod)
mid_performing %<>%
mutate(delta = round(100*resid / norm_BL1) ) %>%
mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))
#### Count of UP / DOWN and SAME per school for Low performing schools
mid_performing %>%
group_by(`School Type`, movement) %>%
summarise(countCommonStudents = n() ) %>%
spread(movement, countCommonStudents) %>%
replace(., is.na(.), 0) %>%
mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
arrange(NET_perc_UP)high_performing <- gain %>%
filter(Performance == "High")
#### Getting linear regression fit
high_mod <- lm(norm_BL2 ~ norm_BL1, data = high_performing)
high_performing %<>%
add_predictions(high_mod) %>%
add_residuals(high_mod)
high_performing %<>%
mutate(delta = round(100*resid / norm_BL1) ) %>%
mutate(movement = if_else(delta > 20, "UP", (if_else(delta > -20, "SAME", "DOWN"))))
#### Count of UP / DOWN and SAME per school for Low performing schools
high_performing %>%
group_by(`School Type`, movement) %>%
summarise(countCommonStudents = n() ) %>%
spread(movement, countCommonStudents) %>%
replace(., is.na(.), 0) %>%
mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
arrange(NET_perc_UP)write_csv(high_performing, "high_performing.csv")
write_csv(low_performing, "low_performing.csv")
write_csv(mid_performing, "mid_performing.csv")cat_mod <- lm(norm_BL2 ~ norm_BL1 + as.factor(`School Type`), data = gain)
coef(cat_mod)## (Intercept)
## 17.0436764
## norm_BL1
## 0.7946415
## as.factor(`School Type`)Only workbooks
## -3.0084193
Negative coefficeint if intervention has only workbooks available !!
el_gain <- tracker %>%
filter(!is.na(MTH_BL)) %>%
filter(!is.na(MTH_BL2)) %>%
filter(!is.na(MTH_EL))
el_gain %<>%
mutate(BL1 = (MTH_BL + SCI_BL)) %>%
mutate(BL2 = (MTH_BL2 + SCI_BL2)) %>%
mutate(EL = MTH_EL+SCI_EL) %>%
select(`Centre Name`, `Student ID`, BL1, EL, BL2 )
head(el_gain)normalize <- function(x) {
return (round(100*(x - min(x)) / (max(x) - min(x))))
}
el_gain %<>%
bind_cols(as.data.frame(lapply(el_gain["BL1"], normalize))) %>%
bind_cols(as.data.frame(lapply(el_gain["EL"], normalize))) %>%
bind_cols(as.data.frame(lapply(el_gain["BL2"], normalize))) %>%
rename(norm_BL1 = BL11) %>%
rename(norm_EL = EL1) %>%
rename(norm_BL2 = BL21)
head(el_gain)el_gain %<>% left_join(select(ss, Schools, `School Type`), by = c("Centre Name" = "Schools"))
head(el_gain)cat_el_mod <- lm(norm_BL2 ~ norm_BL1 + norm_EL + as.factor(`School Type`), data = el_gain)
coef(cat_el_mod)## (Intercept)
## 9.5912565
## norm_BL1
## 0.3738467
## norm_EL
## 0.7727426
## as.factor(`School Type`)Only workbooks
## -2.8933468
el_gain %<>%
add_predictions(cat_el_mod) %>%
add_residuals(cat_el_mod) %>%
mutate(delta = round(100*resid / norm_BL1) ) %>%
mutate(movement = if_else(delta > 10, "UP", (if_else(delta > -10, "SAME", "DOWN"))))
#### Count of UP / DOWN and SAME per school for Low performing schools
el_gain %>%
group_by(`School Type`, movement) %>%
summarise(countCommonStudents = n() ) %>%
spread(movement, countCommonStudents) %>%
replace(., is.na(.), 0) %>%
mutate(NET_perc_UP = round(100*(UP-DOWN) / (UP+DOWN+SAME))) %>%
arrange(NET_perc_UP)