This notebook uses data outputs from the “ASER variance over time” notebook to create a combined graph showing the variance decomposition for all subject + grade + district/state + changes/levels combinations. It also saves various other tables used in the working paper and simulates the effect of noise on state rankings.
library(tidyverse)
## -- Attaching packages --------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
int_path <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures/intermediate"
output <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures"
Import all of the intermediate data, stacking the files of each type into a single table. (The combined tables are saved as CSV files below.)
file_type <- c("rho", "corr dbl", "var deco", "bar data", "alt ests")
tables <- list()
for (type in file_type) {
  # List every intermediate file whose name matches this type, read each one,
  # and row-bind the results into a single tibble per type
  tables[[type]] <- tibble(path = list.files(path = int_path, pattern = type)) %>%
    rowwise() %>%
    summarise(read_csv(file.path(int_path, path)))
}
## Parsed with column specification:
## cols(
##   rho = col_double(),
##   subject = col_character(),
##   state_grade = col_double(),
##   lag = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   level = col_character(),
##   values = col_double(),
##   subject = col_character(),
##   state_grade = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   state_or_dist = col_character(),
##   changes_or_levels = col_character(),
##   subject = col_character(),
##   state_grade = col_double(),
##   Persistent = col_double(),
##   `Transitory sampling` = col_double(),
##   `Transitory other` = col_double(),
##   total_var = col_double(),
##   share_pers = col_double(),
##   share_samp = col_double(),
##   share_other = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   state_or_dist = col_character(),
##   changes_or_levels = col_character(),
##   bar_part = col_character(),
##   value = col_double(),
##   subject = col_character(),
##   state_grade = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
## Parsed with column specification:
## cols(
##   changes_levels = col_character(),
##   grade = col_double(),
##   subject = col_character(),
##   share_persist = col_double(),
##   share_sampling = col_double(),
##   share_other = col_double()
## )
## (the same message repeats for each of the four matching files)
## `summarise()` ungrouping output (override with `.groups` argument)
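The rowwise()/summarise() pattern above works, but it emits one parsing message per file. A quieter alternative sketch using purrr::map_dfr (attached with the tidyverse); col_types = cols() suppresses readr's messages, and tables_quiet is a hypothetical name so the original tables object is untouched:
# Sketch: read and stack each file type without the per-file parsing messages
tables_quiet <- file_type %>%
  purrr::set_names() %>%
  map(~ list.files(path = int_path, pattern = .x, full.names = TRUE) %>%
        map_dfr(read_csv, col_types = cols()))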
# Reshape the rho data from long to wide and then save
tables[['rho']] %>%
  pivot_wider(names_from = c("subject", "state_grade"), values_from = rho) %>%
  write_csv(file.path(output, "rho.csv"))
# Fix up the alt ests table so that I can merge it with the var deco table
alt_ests <- tables[["alt ests"]] %>%
  rename(changes_or_levels = changes_levels, state_grade = grade,
         alt_share_persist = share_persist, alt_share_sampling = share_sampling,
         alt_share_other = share_other) %>%
  mutate(changes_or_levels = str_to_sentence(changes_or_levels), state_or_dist = "State")
tables[["var deco"]] %>%
  filter(!((state_or_dist == "District") & (state_grade == 5))) %>%
  left_join(alt_ests) %>%
  write_csv(file.path(output, "var deco.csv"))
## Joining, by = c("state_or_dist", "changes_or_levels", "subject", "state_grade")
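To make the join deterministic and silence that message, the keys it reports can be passed explicitly:
# Same join as above, with the keys spelled out
tables[["var deco"]] %>%
  filter(!((state_or_dist == "District") & (state_grade == 5))) %>%
  left_join(alt_ests,
            by = c("state_or_dist", "changes_or_levels", "subject", "state_grade")) %>%
  write_csv(file.path(output, "var deco.csv"))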
# Build the plotting data and add lag-0 rows (autocorrelation is 1 at lag 0)
rho <- tables[["rho"]] %>%
  mutate(sub_grade = paste(subject, state_grade)) %>%
  select(-subject, -state_grade) %>%
  add_row(sub_grade = "math 3", lag = 0, rho = 1) %>%
  add_row(sub_grade = "math 5", lag = 0, rho = 1) %>%
  add_row(sub_grade = "reading 3", lag = 0, rho = 1) %>%
  add_row(sub_grade = "reading 5", lag = 0, rho = 1)
ggplot(rho, aes(x = lag, y = rho, colour = sub_grade)) +
  geom_line() +
  scale_colour_discrete(name = "Subject and grade") +
  labs(y = "correlation", title = "Decay in autocorrelation by lag")
ggsave("Correlation decay.png", width = 6, height = 3, path = output)
bar_data <- tables[["bar data"]] %>%
  filter(!((state_or_dist == "District") & (state_grade == 5))) %>%
  mutate(state_or_dist = paste(state_or_dist, "grade", state_grade))
# Change the bar_part variable to an ordered factor so the components stack in a fixed order
bar_data <- bar_data %>%
  mutate(bar_part_ordered = factor(bar_part, ordered = TRUE,
                                   levels = c("Transitory other", "Transitory sampling", "Persistent")))
ggplot(bar_data, aes(fill = bar_part_ordered, y = value, x = changes_or_levels)) +
  geom_bar(position = "stack", stat = "identity") +
  facet_grid(subject ~ state_or_dist) +
  scale_fill_manual(values = c("red", "orange", "blue")) +
  labs(fill = "Variance component", x = "", y = "Variance")
ggsave("Combined bar chart.png", width = 7, height = 6, path = output)
Using the estimates from the variance decomposition, use simulation to estimate the effect this noise would have on the accuracy of splitting states into a bottom quartile, middle 50%, and top quartile.
!! WARNING: THE CODE BELOW IS STILL PRETTY MESSY. TAKE CARE IN USING IT.
# Inputs
n <- 100000            # number of simulated units
centile <- .25         # share of units in the top (and bottom) group
subject <- "reading"
levels_changes <- "Changes"
state_dist <- "District"
grade <- 3
dist_type <- "normal"
# Read in and grab the appropriate variance figures
# NOTE: the `subject` input is not applied in this filter; the [1] below takes the
# first matching row, so check that it corresponds to the intended subject
df <- tables[["var deco"]] %>%
  filter((state_or_dist == state_dist) & (changes_or_levels == levels_changes) & (state_grade == grade))
var_pers <- df$Persistent[1]
var_trans <- df$`Transitory sampling`[1] + df$`Transitory other`[1]
print(var_pers/var_trans)
## [1] 0.3215349
# Create a dataframe of draws using the estimates of variance and the appropriate distribution type
if (dist_type == "normal") {
  draws <- data.frame(pers = rnorm(n, mean = 0, sd = var_pers^.5),
                      trans = rnorm(n, mean = 0, sd = var_trans^.5))
} else if (dist_type == "beta") {
  # For a beta, var = shape1*shape2 / [(shape1 + shape2)^2 * (shape1 + shape2 + 1)],
  # and a mean of .5 implies shape1 = shape2. To simplify, I assume the mean is .5.
  # With shape1 = shape2 = shape, var = shape^2 / [4*shape^2*(2*shape + 1)] = 1/(8*shape + 4),
  # so shape = 1/(8*var) - 1/2.
  shape_pers <- (1/(8*var_pers)) - .5
  shape_trans <- (1/(8*var_trans)) - .5
  draws <- data.frame(pers = rbeta(n, shape_pers, shape_pers),
                      trans = rbeta(n, shape_trans, shape_trans))
} else {
  stop("Couldn't match dist_type")
}
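# Quick numeric check of the beta shape formula above (a sketch; any variance
# below 1/4 works): s = 1/(8*v) - 1/2 should give a Beta(s, s) variance of v
v_check <- 0.01
s_check <- 1/(8*v_check) - .5
s_check*s_check / ((s_check + s_check)^2 * (s_check + s_check + 1))  # = 0.01, matching v_check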
# Create y, which is the measured score
draws <- draws %>% mutate(y = pers + trans)
# Create rank of measured score
draws$y_rank[order(draws$y)] <- 1:nrow(draws)
# Create rank of "true" score
draws$pers_rank[order(draws$pers)] <- 1:nrow(draws)
# Create binaries for whether true and measured scores are in the top group
draws <- draws %>%
  mutate(top_cent_pers = (pers_rank > (1-centile)*n), top_cent_y = (y_rank > (1-centile)*n))
# Calculate accuracy as the share of units measured in the top group that are truly
# in it (the two groups have the same size, so this also equals the share of truly
# top units that the measured ranking places in the top group)
accuracy <- mean(draws$top_cent_pers[draws$top_cent_y])
cat("Reporting accuracy as the proportion of those actually in the top centile which would be ranked as being in the top centile.\n\n")
## Reporting accuracy as the proportion of those actually in the top centile which would be ranked as being in the top centile.
cat("Inputs:::: \n")
## Inputs::::
cat("centile: ", centile, "\n")
## centile: 0.25
cat("Distribution: ", dist_type, "\n")
## Distribution: normal
cat("States or districts: ", state_dist, "\n")
## States or districts: District
cat("Subject: ", subject, "\n")
## Subject: reading
cat("Deltas or levels:", levels_changes, "\n\n")
## Deltas or levels: Changes
cat("accuracy is", accuracy)
## accuracy is 0.47548
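For the normal case, this simulated accuracy has a closed-form counterpart: with r = corr(pers, y) = sqrt(var_pers / (var_pers + var_trans)) and q the standard normal (1 - centile) quantile, accuracy = P(Z1 > q, Z2 > q) / centile for standard bivariate normals with correlation r. A minimal sketch of the check, assuming the mvtnorm package is installed (it is not used elsewhere in this notebook):
library(mvtnorm)
r <- sqrt(var_pers / (var_pers + var_trans))  # corr between true and measured score
q <- qnorm(1 - centile)                       # standardised top-group cutoff
joint <- pmvnorm(lower = c(q, q), upper = c(Inf, Inf),
                 sigma = matrix(c(1, r, r, 1), nrow = 2))
as.numeric(joint) / centile                   # should be close to `accuracy`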