This notebook uses data outputs from the “ASER variance over time” notebook to create a combined graph showing the variance decomposition for all subject + grade + district/state + changes/levels combinations. It also saves various other tables used in the working paper and simulates the effect of noise on state rankings.
library(tidyverse)
## -- Attaching packages --------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
int_path <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures/intermediate"
output <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures"
Import all of the intermediate data, stacking the files of each type into a single table. (The combined tables are saved as CSV files below.)
file_type <- c("rho", "corr dbl", "var deco", "bar data", "alt ests")
tables <- list()
for (type in file_type) {
  # List every intermediate file whose name matches this type, read each one,
  # and row-bind the results into a single tibble per type
  tables[[type]] <- tibble(path = list.files(path = int_path, pattern = type)) %>%
    rowwise() %>%
    summarise(read_csv(file.path(int_path, path)))
}
## Parsed with column specification:
## cols(
##   rho = col_double(),
##   subject = col_character(),
##   state_grade = col_double(),
##   lag = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   level = col_character(),
##   values = col_double(),
##   subject = col_character(),
##   state_grade = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   state_or_dist = col_character(),
##   changes_or_levels = col_character(),
##   subject = col_character(),
##   state_grade = col_double(),
##   Persistent = col_double(),
##   `Transitory sampling` = col_double(),
##   `Transitory other` = col_double(),
##   total_var = col_double(),
##   share_pers = col_double(),
##   share_samp = col_double(),
##   share_other = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   state_or_dist = col_character(),
##   changes_or_levels = col_character(),
##   bar_part = col_character(),
##   value = col_double(),
##   subject = col_character(),
##   state_grade = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   changes_levels = col_character(),
##   grade = col_double(),
##   subject = col_character(),
##   share_persist = col_double(),
##   share_sampling = col_double(),
##   share_other = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
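The rowwise()/summarise() pattern above works, but it emits one parsing message per file. A quieter alternative sketch using purrr::map_dfr (attached with the tidyverse); col_types = cols() suppresses readr's messages, and tables_quiet is a hypothetical name so the original tables object is untouched:
# Sketch: read and stack each file type without the per-file parsing messages
tables_quiet <- file_type %>%
  purrr::set_names() %>%
  map(~ list.files(path = int_path, pattern = .x, full.names = TRUE) %>%
        map_dfr(read_csv, col_types = cols()))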
# Reshape the rho data from long to wide and then save
tables[['rho']] %>%
  pivot_wider(names_from = c("subject", "state_grade"), values_from = rho) %>%
  write_csv(file.path(output, "rho.csv"))
# Fix up the alt ests table so that I can merge it with the var deco table
alt_ests <- tables[["alt ests"]] %>%
  rename(changes_or_levels = changes_levels, state_grade = grade,
         alt_share_persist = share_persist, alt_share_sampling = share_sampling,
         alt_share_other = share_other) %>%
  mutate(changes_or_levels = str_to_sentence(changes_or_levels), state_or_dist = "State")
tables[["var deco"]] %>%
  filter(!((state_or_dist == "District") & (state_grade == 5))) %>%
  left_join(alt_ests) %>%
  write_csv(file.path(output, "var deco.csv"))
## Joining, by = c("state_or_dist", "changes_or_levels", "subject", "state_grade")
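To make the join deterministic and silence that message, the keys it reports can be passed explicitly:
# Same join as above, with the keys spelled out
tables[["var deco"]] %>%
  filter(!((state_or_dist == "District") & (state_grade == 5))) %>%
  left_join(alt_ests,
            by = c("state_or_dist", "changes_or_levels", "subject", "state_grade")) %>%
  write_csv(file.path(output, "var deco.csv"))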
# Build the plotting data and add lag-0 rows (autocorrelation is 1 at lag 0)
rho <- tables[["rho"]] %>%
  mutate(sub_grade = paste(subject, state_grade)) %>%
  select(-subject, -state_grade) %>%
  add_row(sub_grade = "math 3", lag = 0, rho = 1) %>%
  add_row(sub_grade = "math 5", lag = 0, rho = 1) %>%
  add_row(sub_grade = "reading 3", lag = 0, rho = 1) %>%
  add_row(sub_grade = "reading 5", lag = 0, rho = 1)
ggplot(rho, aes(x = lag, y = rho, colour = sub_grade)) +
  geom_line() +
  scale_colour_discrete(name = "Subject and grade") +
  labs(y = "correlation", title = "Decay in autocorrelation by lag")
ggsave("Correlation decay.png", width = 6, height = 3, path = output)
bar_data <- tables[["bar data"]] %>%
  filter(!((state_or_dist == "District") & (state_grade == 5))) %>%
  mutate(state_or_dist = paste(state_or_dist, "grade", state_grade))
# Change the bar_part variable to an ordered factor so the components stack in a fixed order
bar_data <- bar_data %>%
  mutate(bar_part_ordered = factor(bar_part, ordered = TRUE,
                                   levels = c("Transitory other", "Transitory sampling", "Persistent")))
ggplot(bar_data, aes(fill = bar_part_ordered, y = value, x = changes_or_levels)) +
  geom_bar(position = "stack", stat = "identity") +
  facet_grid(subject ~ state_or_dist) +
  scale_fill_manual(values = c("red", "orange", "blue")) +
  labs(fill = "Variance component", x = "", y = "Variance")
ggsave("Combined bar chart.png", width = 7, height = 6, path = output)
Using the estimates from the variance decomposition, use simulation to estimate the effect this noise would have on the accuracy of splitting states into a bottom quartile, middle 50%, and top quartile.
!! WARNING: THE CODE BELOW IS STILL PRETTY MESSY. TAKE CARE IN USING IT.
# Inputs
n <- 100000            # number of simulated units
centile <- .25         # share of units in the top (and bottom) group
subject <- "reading"
levels_changes <- "Changes"
state_dist <- "District"
grade <- 3
dist_type <- "normal"
# Read in and grab the appropriate variance figures
# NOTE: the `subject` input is not applied in this filter; the [1] below takes the
# first matching row, so check that it corresponds to the intended subject
df <- tables[["var deco"]] %>%
  filter((state_or_dist == state_dist) & (changes_or_levels == levels_changes) & (state_grade == grade))
var_pers <- df$Persistent[1]
var_trans <- df$`Transitory sampling`[1] + df$`Transitory other`[1]
print(var_pers/var_trans)
## [1] 0.3215349
# Create a dataframe of draws using the estimates of variance and the appropriate distribution type
if (dist_type == "normal") {
  draws <- data.frame(pers = rnorm(n, mean = 0, sd = var_pers^.5),
                      trans = rnorm(n, mean = 0, sd = var_trans^.5))
} else if (dist_type == "beta") {
  # For a beta, var = shape1*shape2 / [(shape1 + shape2)^2 * (shape1 + shape2 + 1)],
  # and a mean of .5 implies shape1 = shape2. To simplify, I assume the mean is .5.
  # With shape1 = shape2 = shape, var = shape^2 / [4*shape^2*(2*shape + 1)] = 1/(8*shape + 4),
  # so shape = 1/(8*var) - 1/2.
  shape_pers <- (1/(8*var_pers)) - .5
  shape_trans <- (1/(8*var_trans)) - .5
  draws <- data.frame(pers = rbeta(n, shape_pers, shape_pers),
                      trans = rbeta(n, shape_trans, shape_trans))
} else {
  stop("Couldn't match dist_type")
}
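# Quick numeric check of the beta shape formula above (a sketch; any variance
# below 1/4 works): s = 1/(8*v) - 1/2 should give a Beta(s, s) variance of v
v_check <- 0.01
s_check <- 1/(8*v_check) - .5
s_check*s_check / ((s_check + s_check)^2 * (s_check + s_check + 1))  # = 0.01, matching v_check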
# Create y, which is the measured score
draws <- draws %>% mutate(y = pers + trans)
# Create rank of measured score
draws$y_rank[order(draws$y)] <- 1:nrow(draws)
# Create rank of "true" score
draws$pers_rank[order(draws$pers)] <- 1:nrow(draws)
# Create binaries for whether true and measured scores are in the top group
draws <- draws %>%
  mutate(top_cent_pers = (pers_rank > (1-centile)*n), top_cent_y = (y_rank > (1-centile)*n))
# Calculate accuracy as the share of units measured in the top group that are truly
# in it (the two groups have the same size, so this also equals the share of truly
# top units that the measured ranking places in the top group)
accuracy <- mean(draws$top_cent_pers[draws$top_cent_y])
cat("Reporting accuracy as the proportion of those actually in the top centile which would be ranked as being in the top centile.\n\n")
## Reporting accuracy as the proportion of those actually in the top centile which would be ranked as being in the top centile.
cat("Inputs:::: \n")
## Inputs::::
cat("centile: ", centile, "\n")
## centile: 0.25
cat("Distribution: ", dist_type, "\n")
## Distribution: normal
cat("States or districts: ", state_dist, "\n")
## States or districts: District
cat("Subject: ", subject, "\n")
## Subject: reading
cat("Deltas or levels:", levels_changes, "\n\n")
## Deltas or levels: Changes
cat("accuracy is", accuracy)
## accuracy is 0.47548
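For the normal case, this simulated accuracy has a closed-form counterpart: with r = corr(pers, y) = sqrt(var_pers / (var_pers + var_trans)) and q the standard normal (1 - centile) quantile, accuracy = P(Z1 > q, Z2 > q) / centile for standard bivariate normals with correlation r. A minimal sketch of the check, assuming the mvtnorm package is installed (it is not used elsewhere in this notebook):
library(mvtnorm)
r <- sqrt(var_pers / (var_pers + var_trans))  # corr between true and measured score
q <- qnorm(1 - centile)                       # standardised top-group cutoff
joint <- pmvnorm(lower = c(q, q), upper = c(Inf, Inf),
                 sigma = matrix(c(1, r, r, 1), nrow = 2))
as.numeric(joint) / centile                   # should be close to `accuracy`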