This notebook generates outputs which are used in the section “Assess ASER’s internal reliability” of the working paper “Assessing the Assessments”.
Pretty much all of the data used in the analysis is on the web (in my public datasets git repo), so to run this locally you just need to install the relevant packages and change the one folder path in the setup chunk.
Unfortunately, generating the full set of results in the working paper from this notebook is a bit of a pain. Each run of the notebook performs the analysis for only one subject + grade combination of ASER data, so to generate all the results included in the working paper you need to run the notebook four times – once for each combination of grade and subject for which we have longitudinal ASER data (grades 3 and 5, math and reading).
More precisely, to generate the tables, graphs, and figures for the working paper you should:
In addition, note down the results from the regression of grade 5 scores on grade 3 scores. This regression doesn’t change from run to run, so you only need to note it down once.
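If you would rather not do the four runs by hand, here is a minimal sketch of how you could script them. It assumes this notebook is saved as "aser_reliability.Rmd" (a hypothetical name) and that the subject and state_grade assignments in the setup chunk below have been moved into YAML params – neither of which is true of the notebook as written.
library(rmarkdown)
# Hypothetical file name and params -- adjust to match your local copy of the notebook
combos <- expand.grid(subject = c("math", "reading"),
                      state_grade = c(3, 5),
                      stringsAsFactors = FALSE)
for (i in seq_len(nrow(combos))) {
  render("aser_reliability.Rmd",
         params = list(subject = combos$subject[i],
                       state_grade = combos$state_grade[i]),
         output_file = paste0("aser_reliability_", combos$subject[i], "_",
                              combos$state_grade[i], ".html"))
}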
We use ASER data from the ASER district pages for the years 2006 to 2011. These are the only years for which ASER has publicly released district-level data (to my knowledge). All of these district pages include four learning outcome variables:
We use the last two learning outcomes – those for children in classes 3, 4, and 5. In theory, these learning outcomes should be more stable. In addition, they are similar to the learning outcome used in the ASER vs. NAS analysis.
Most of the analyses in this notebook are based on the ASER Trends over Time report, which includes ASER figures by state (and for the country as a whole) for the years 2006 to 2014. We use the following variables from this dataset:
library(tidyverse); library(broom)
## -- Attaching packages ---------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
output <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures"
dists <- read_csv("https://raw.githubusercontent.com/dougj892/public-datasets/master/aser_district_all_6.csv")
## Parsed with column specification:
## cols(
## State = col_character(),
## state_abbr = col_character(),
## District = col_character(),
## year = col_double(),
## std12_letters_and_up = col_double(),
## std12_numbers_and_up = col_double(),
## std35_std1_and_up = col_double(),
## std35_subtraction_and_up = col_double(),
## private_6_14 = col_double(),
## out_of_school_6_14 = col_double(),
## in_school_3_4 = col_double()
## )
states <- read_csv("https://raw.githubusercontent.com/dougj892/public-datasets/master/ASER%20trends%20over%20time.csv") %>%
select(year, State, state_abbr, ends_with("all"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## State = col_character(),
## state_abbr = col_character()
## )
## See spec(...) for full column specifications.
##### IMPORTANT: SET SUBJECT AND GRADE TO BE USED FOR THE ENTIRE ANALYSIS HERE ###
subject <- "math"
state_grade <- 3
if (subject == "math") {
dists$score <- dists$std35_subtraction_and_up
if (state_grade ==3) {
states$score <- states$std3_subtract_all
} else if (state_grade ==5) {
states$score <- states$std5_divis_all
}
} else if (subject == "reading") {
dists$score <- dists$std35_std1_and_up
if (state_grade ==3) {
states$score <- states$std3_read_std1_all
} else if (state_grade == 5) {
states$score <- states$std5_read_std2_all
}
} else {
stop("subject must be either 'math' or 'reading'")
}
### IMPORTANT -- THIS IS WHERE I SPECIFY DISTRICT SAMPLING VARIANCE ###
var_dist_sampling <- .0016
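# (For reference: a sampling variance of .0016 on the 0-1 scale corresponds to a standard error of sqrt(.0016) = .04, i.e. about 4 percentage points.)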
Create graphs showing ASER state math and reading scores over time.
# reshape to long
states_long <- states %>%
select(state_abbr, year, starts_with("std")) %>%
pivot_longer(cols = starts_with("std"), names_to = "series", values_to = "pct") %>%
mutate(grade = str_extract(series, "\\d"),
sector = str_extract(series, "(govt)|(pvt)|(all)"),
subject = str_extract(series, "(read)|(subtract)|(divis)")) %>%
mutate(subject = if_else(subject != "read", "math", "read"))
df_read <- states_long %>% filter((sector == "all") & (subject == "read"))
df_math <- states_long %>% filter((sector == "all") & (subject == "math"))
ggplot(df_read, aes(x = year, y = pct, color = grade)) + geom_line() +
theme(axis.text.x = element_blank())+
facet_wrap( ~ state_abbr, nrow = 3) +
labs(title = "Grade 3 and 5 reading levels")
## Warning: Removed 2 row(s) containing missing values (geom_path).
ggsave("aser reading over time.png", path = output)
## Saving 7 x 5 in image
## Warning: Removed 2 row(s) containing missing values (geom_path).
ggplot(df_math, aes(x = year, y = pct, color = grade)) + geom_line() +
theme(axis.text.x = element_blank())+
facet_wrap( ~ state_abbr, nrow = 3) +
labs(title = "Grade 3 and 5 math levels")
## Warning: Removed 4 row(s) containing missing values (geom_path).
ggsave("aser math over time.png", path = output)
## Saving 7 x 5 in image
## Warning: Removed 4 row(s) containing missing values (geom_path).
To test for cohort effects, I regress the change in grade 5 scores on the twice-lagged change in grade 3 scores (the cohort in grade 3 in year \(t-2\) is roughly the cohort in grade 5 in year \(t\), so a genuine cohort effect should show up as a positive coefficient).
# Create dataset with appropriate lags
lags <- states %>%
group_by(state_abbr) %>%
mutate(read3_lag2 = lag(std3_read_std1_all, n = 2, order_by = year),
read3_lag3 = lag(std3_read_std1_all, n = 3, order_by = year),
read5_lag1 = lag(std5_read_std2_all, n = 1, order_by = year),
delta_read3_lag2 = read3_lag2-read3_lag3,
delta_read5 = std5_read_std2_all-read5_lag1,
math3_lag2 = lag(std3_subtract_all, n = 2, order_by = year),
math3_lag3 = lag(std3_subtract_all, n = 3, order_by = year),
math5_lag1 = lag(std5_divis_all, n = 1, order_by = year),
delta_math3_lag2 = math3_lag2 - math3_lag3,
delta_math5 =std5_divis_all-math5_lag1)
fit_read <- lm(delta_read5 ~ delta_read3_lag2 - 1, lags)
summary(fit_read)
##
## Call:
## lm(formula = delta_read5 ~ delta_read3_lag2 - 1, data = lags)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.202025 -0.049925 -0.006036 0.034492 0.151110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## delta_read3_lag2 -0.03579 0.06347 -0.564 0.574
##
## Residual standard error: 0.06898 on 174 degrees of freedom
## (119 observations deleted due to missingness)
## Multiple R-squared: 0.001824, Adjusted R-squared: -0.003912
## F-statistic: 0.318 on 1 and 174 DF, p-value: 0.5735
fit_math <- lm(delta_math5 ~ delta_math3_lag2 - 1, lags)
summary(fit_math)
##
## Call:
## lm(formula = delta_math5 ~ delta_math3_lag2 - 1, data = lags)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30503 -0.05673 -0.01062 0.01928 0.19508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## delta_math3_lag2 -0.04451 0.06911 -0.644 0.52
##
## Residual standard error: 0.08032 on 151 degrees of freedom
## (142 observations deleted due to missingness)
## Multiple R-squared: 0.00274, Adjusted R-squared: -0.003865
## F-statistic: 0.4148 on 1 and 151 DF, p-value: 0.5205
write_csv(rbind(tidy(fit_read), tidy(fit_math)), file.path(output, "cohort analysis.csv"))
Estimate \(corr(\Delta y_t , \Delta y_{t-1} )\) by using the autocorrelation of district deltas.
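A quick sketch of the logic behind this step (following the Kane and Staiger style decomposition used throughout): write each district score as a persistent component plus a serially uncorrelated transitory term, \(y_t = \mu_t + \varepsilon_t\). Adjacent changes then share only the term \(-\varepsilon_{t-1}\), so
\[cov(\Delta y_t , \Delta y_{t-1}) = -\sigma^2_{\varepsilon} \quad\Rightarrow\quad \sigma^2_{\varepsilon} = -corr(\Delta y_t , \Delta y_{t-1})\,Var(\Delta y_t),\]
which is why the last line of the chunk below estimates the transitory variance as minus the autocorrelation of the deltas times the variance of the deltas.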
# Create variable for deltas and first and second lag of the delta
dist_deltas <- dists %>%
group_by(State, District) %>%
mutate(delta = score - lag(score, order_by = year)) %>%
mutate(delta_lagged = lag(delta, order_by = year), delta_dbl_lag = lag(delta, n= 2, order_by = year)) %>%
arrange(State, District, year)
# Calculate autocorrelation of delta by district and then average over all districts
# Note that since the data is already grouped by State and district, we are calculating
# autocorrelation for each district and then averaging across all districts
corr_dist_deltas <- dist_deltas %>%
filter(year >= 2008) %>%
summarise(auto = cor(delta, delta_lagged)) %>%
ungroup() %>%
summarise(mean_auto=mean(auto)) %>% .$mean_auto
## `summarise()` regrouping output by 'State' (override with `.groups` argument)
# Calculate the autocorrelation in one go for all pairs of delta and delta_lagged.
# Note that we are not doing any averaging here.
# This should spit out a very similar value and it does.
check_corr_dist_deltas <- dist_deltas %>%
ungroup() %>%
filter(year >= 2008) %>%
summarise(auto = cor(delta, delta_lagged))
# Calculate the overall variance of the deltas
# Note that we divide by 100 here because the district figures are in percentage points (we want the 0-1 scale).
# We didn't need to do this above because correlations are scale invariant.
var_dist_deltas <- var(dist_deltas$delta/100, na.rm = TRUE)
# Calculate overall variance of the district values.
# Note that we don't want to first calculate the variance of district scores by state and then average;
# that would substantially underestimate the variance of the district scores.
# (For intuition, note that districts are treated as iid in the model above.)
var_dist_levels <- var(dists$score/100, na.rm = TRUE)
# Calculate variance of the transitory component
var_dist_epsilon <- -corr_dist_deltas*var_dist_deltas
Check that \(corr(\Delta y_t , \Delta y_{t-2} )=0\) – under the model, non-adjacent changes share no transitory term, so this correlation should be roughly zero – and that letters and numbers are correlated.
# Calculate the correlation between delta and its double lag, pooling across all districts
corr_dist_dbl_lag <- dist_deltas %>% filter(year >= 2009) %>%
ungroup() %>%
summarize(auto =cor(delta,delta_dbl_lag, use = "pairwise.complete.obs")) %>%
.$auto
Based on the code above, the final values for the key parameters for the district level are…
From these figures, we can calculate:
Estimate \(corr(\Delta y_t , \Delta y_{t-1} )\) at state level by average autocorrelation of state deltas.
# Create variable for year on year change for each state
state_deltas <- states %>%
group_by(State) %>%
mutate(delta = score - lag(score, order_by = year)) %>%
mutate(delta_lagged = lag(delta, order_by = year), delta_dbl_lag = lag(delta,n=2, order_by = year)) %>%
arrange(State, year)
# Calculate average autocorrelation of delta
corr_state_deltas <- state_deltas %>%
filter(year >= 2008) %>%
summarise(auto = cor(delta, delta_lagged, use = "pairwise.complete.obs")) %>%
ungroup() %>%
summarise(mean_auto=mean(auto)) %>% .$mean_auto
## `summarise()` ungrouping output (override with `.groups` argument)
# Calculate the correlation between delta and delta lagged for the entire dataset (without first calculating for each state and then averaging).
# This should be about the same as when we first calculate for each state and then average -- and it is.
check_corr_state_deltas <- state_deltas %>% filter(year >= 2008) %>%
ungroup() %>%
summarize(auto =cor(delta,delta_lagged, use = "pairwise.complete.obs")) %>%
.$auto
# calculate the variance of the deltas
var_state_deltas <- var(state_deltas$delta, na.rm = TRUE)
# calculate the variance of the levels
var_state_levels <- var(state_deltas$score, na.rm = TRUE)
# calculate variance of transitory component
var_state_epsilon <- -corr_state_deltas*var_state_deltas
Calculate the average state-level sampling variance. Note that since we only need the overall average, there is no need to merge the district counts back onto the state data by state name.
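To spell out what the next chunk does (a sketch, treating each state figure as roughly the unweighted mean of its sampled districts): each state's sampling variance is the district sampling variance divided by the number of districts \(n_s\) in that state, and the overall figure is the average across the \(S\) states,
\[\sigma^2_{sampling} \approx \frac{1}{S}\sum_{s=1}^{S}\frac{\sigma^2_{sampling,\,dist}}{n_s}.\]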
var_state_sampling <- dists %>% filter(year ==2008) %>%
count(State) %>%
mutate(sampling_var = var_dist_sampling/n) %>%
summarize(mean_sampling = mean(sampling_var)) %>% .$mean_sampling
Check that there is no correlation with double lags at the state level either.
corr_state_dbl_lag <- state_deltas %>% filter(year >= 2009) %>%
ungroup() %>%
summarize(auto =cor(delta,delta_dbl_lag, use = "pairwise.complete.obs")) %>%
.$auto
temp <- state_deltas %>% filter(year >= 2009) %>%
group_by(State) %>%
summarize(auto =cor(delta,delta_dbl_lag, use = "pairwise.complete.obs"))
## `summarise()` ungrouping output (override with `.groups` argument)
# Save the value of the correlation with double lags at both state and district level for use in the working paper
temp_df_dbl <- tibble(level = c("state", "district"), values = c(corr_state_dbl_lag, corr_dist_dbl_lag))
temp_df_dbl$subject <- subject
temp_df_dbl$state_grade <- state_grade
write_csv(temp_df_dbl, file.path(output, "intermediate", paste(subject, "-", state_grade, "- corr dbl lag.csv")))
Since the correlation between current changes and twice-lagged changes is not 0, we don’t use these values, but we consolidate them here for reference:
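For reference, the shares computed below follow from the definitions above, with \(\rho_{\Delta}\) denoting the autocorrelation of the state-level changes (so that \(\sigma^2_{\varepsilon} = -\rho_{\Delta}\,Var(\Delta y_t)\)):
\[\text{changes:}\quad share_{pers} = 1 + 2\rho_{\Delta}, \qquad share_{samp} = \frac{2\,\sigma^2_{sampling}}{Var(\Delta y_t)}, \qquad share_{other} = 1 - share_{pers} - share_{samp}\]
\[\text{levels:}\quad share_{pers} = 1 + \rho_{\Delta}\,\frac{Var(\Delta y_t)}{\sigma^2_y}, \qquad share_{samp} = \frac{\sigma^2_{sampling}}{\sigma^2_y}, \qquad share_{other} = 1 - share_{pers} - share_{samp}\]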
# Generate method 1 estimates of the share of variance from persistent, sampling, and other components
change_pct_pers <- 1+2*corr_state_deltas
change_pct_samp <- 2*var_state_sampling/var_state_deltas
change_pct_other <- 1-change_pct_pers-change_pct_samp
level_pct_pers <- 1+corr_state_deltas*var_state_deltas/var_state_levels
level_pct_samp <- var_state_sampling/var_state_levels
level_pct_other <- 1-level_pct_pers-level_pct_samp
# save consolidated method 1 estimates as a csv
meth1_df <- tribble(
~changes_levels, ~grade, ~subject, ~share_persist, ~share_sampling, ~share_other,
"changes", state_grade, subject, change_pct_pers, change_pct_samp, change_pct_other,
"levels", state_grade, subject, level_pct_pers, level_pct_samp, level_pct_other,
)
write_csv(meth1_df, file.path(output, "intermediate", paste(subject, state_grade, " alt ests.csv")))
Calculate the share of variance due to persistent effects by looking at how the correlation in levels decays over time. Note that I could theoretically calculate more lags (since the state data cover 2006 to 2014), but the highest-order lags would each be based on only a single pair of years (e.g. lag 8 would compare only 2014 with 2006) rather than an average over multiple pairs of years.
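One way to read the calculation below (a sketch, under the additional assumption that the persistent component decays geometrically at some annual rate \(\delta\)): for lags \(k \geq 1\) the transitory terms drop out of the covariance, so
\[\rho_k \equiv corr(y_t , y_{t-k}) = \frac{\sigma^2_{persistent}}{\sigma^2_y}\,\delta^{k}, \qquad \frac{\rho_k}{\rho_{k-1}} = \delta \;\;(k \geq 2), \qquad \sigma^2_{persistent} = \frac{\rho_1}{\delta}\,\sigma^2_y.\]
The code estimates \(\delta\) as the average of the ratios \(\rho_k / \rho_{k-1}\).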
state_lags <- states %>%
group_by(State) %>%
mutate(read_lag1 = lag(score, order_by = year),
read_lag2 = lag(score, n= 2, order_by = year),
read_lag3 = lag(score, n =3, order_by = year),
read_lag4 = lag(score, n =4, order_by = year),
read_lag5 = lag(score, n =5, order_by = year)) %>%
ungroup()
rho = c()
# for each of the lags, calculate the correlation between current and the lag and store in vector rho
for (lag in seq(1,5)) {
rho <- c(rho, cor(state_lags$score, state_lags[[paste("read_lag",as.character(lag),sep="")]], use = "pairwise.complete.obs"))
}
# Save the values of rho for use in the working paper
temp_df <- tibble(rho = rho)
temp_df$subject <- subject
temp_df$state_grade <- state_grade
temp_df$lag <- 1:5
write_csv(temp_df, file.path(output, "intermediate", paste(subject, "-", state_grade, "- rho.csv")))
# Calculate the variance of the persistent component by comparing rho_1 with the rate at which rho decays thereafter
var_state_levels_persistent2 <- rho[1]/mean(rho/lag(rho), na.rm = TRUE)*var_state_levels
# Calculate the share of variance coming from other transitory sources.
# We will need this to calculate the variance breakup for changes
var_state_levels_other2 <- var_state_levels - var_state_levels_persistent2 - var_state_sampling
# Save a graph showing the decay of the correlation
quick_df <- tibble(lag = seq(0,5), correlation = c(1,rho))
ggplot(quick_df, aes(x= lag, y = correlation)) +
geom_line() +
geom_point() +
ylim(0,1)
# ggsave(paste(subject, "-", state_grade, "- correlation_decay.png"), width = 5, height = 6 , path = output)
With method 2, we have the same values for \(Var(\Delta y_t)\), \(\sigma^2_y\), and \(\sigma^2_{sampling}\). Instead of calculating \(\sigma^2_{\varepsilon}\), we calculate \(\sigma^2_{persistent}\) (from the decay of the lagged correlations) and \(\sigma^2_{other} = \sigma^2_y - \sigma^2_{persistent} - \sigma^2_{sampling}\), and then use these values to decompose the variance of both levels and changes.
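Concretely, the state bars in the figure below are constructed as
\[\text{levels:}\quad \sigma^2_y = \sigma^2_{persistent} + \sigma^2_{sampling} + \sigma^2_{other},\]
\[\text{changes:}\quad Var(\Delta y_t) = \big[Var(\Delta y_t) - 2(\sigma^2_{sampling} + \sigma^2_{other})\big] + 2\,\sigma^2_{sampling} + 2\,\sigma^2_{other},\]
with the bracketed term treated as the persistent share of the change variance.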
We now have all the inputs we need to replicate figure 4 from Kane and Staiger.
# Create a tibble to store the variance components. The placeholder values (.0016) are overwritten below.
df_bar <- tibble(state_or_dist = rep(c("State","District"),each =6),
changes_or_levels =rep(rep(c("Changes","Levels"), each =3), times= 2),
bar_part = factor(rep(c("Persistent","Transitory sampling", "Transitory other"),4), levels = c("Transitory other","Transitory sampling", "Persistent")),
value = rep(.0016,12)
)
# Update the values for the district changes bar
df_bar$value[(df_bar$state_or_dist == "District") & (df_bar$changes_or_levels == "Changes") & (df_bar$bar_part == "Persistent")] <- var_dist_deltas - 2*var_dist_epsilon
df_bar$value[(df_bar$state_or_dist == "District") & (df_bar$changes_or_levels == "Changes") & (df_bar$bar_part == "Transitory sampling")] <- 2*var_dist_sampling
df_bar$value[(df_bar$state_or_dist == "District") & (df_bar$changes_or_levels == "Changes") & (df_bar$bar_part == "Transitory other")] <- 2*var_dist_epsilon - 2*var_dist_sampling
# Update the values for the district levels bar
df_bar$value[(df_bar$state_or_dist == "District") & (df_bar$changes_or_levels == "Levels") & (df_bar$bar_part == "Persistent")] <- var_dist_levels - var_dist_epsilon
df_bar$value[(df_bar$state_or_dist == "District") & (df_bar$changes_or_levels == "Levels") & (df_bar$bar_part == "Transitory sampling")] <- var_dist_sampling
df_bar$value[(df_bar$state_or_dist == "District") & (df_bar$changes_or_levels == "Levels") & (df_bar$bar_part == "Transitory other")] <- var_dist_epsilon - var_dist_sampling
### STATE DATA ###
# CHANGES
df_bar$value[(df_bar$state_or_dist == "State") & (df_bar$changes_or_levels == "Changes") & (df_bar$bar_part == "Persistent")] <- var_state_deltas-2*(var_state_sampling+var_state_levels_other2)
df_bar$value[(df_bar$state_or_dist == "State") & (df_bar$changes_or_levels == "Changes") & (df_bar$bar_part == "Transitory sampling")] <- 2*var_state_sampling
df_bar$value[(df_bar$state_or_dist == "State") & (df_bar$changes_or_levels == "Changes") & (df_bar$bar_part == "Transitory other")] <- 2*var_state_levels_other2
# LEVELS
df_bar$value[(df_bar$state_or_dist == "State") & (df_bar$changes_or_levels == "Levels") & (df_bar$bar_part == "Persistent")] <- var_state_levels_persistent2
df_bar$value[(df_bar$state_or_dist == "State") & (df_bar$changes_or_levels == "Levels") & (df_bar$bar_part == "Transitory sampling")] <- var_state_sampling
df_bar$value[(df_bar$state_or_dist == "State") & (df_bar$changes_or_levels == "Levels") & (df_bar$bar_part == "Transitory other")] <- var_state_levels_other2
# I have created this graph so that you can see it in the notebook
# The final version, which combines both subjects, is created in a separate notebook
ggplot(df_bar, aes(fill = bar_part, y=value, x= changes_or_levels)) +
geom_bar(position="stack", stat ="identity") +
facet_grid(~ state_or_dist) +
scale_fill_manual(values = c("red", "orange", "blue"))+
labs(fill = "Variance component", x = "")
# ggsave(paste(subject, "-", state_grade," - variance_decomposition.png"), width = 5, height = 6 , path = output)
df_bar$subject <- subject
df_bar$state_grade <- state_grade
write_csv(df_bar, file.path(output, "intermediate", paste(subject, "-", state_grade, "- bar data.csv")))
Take the output that was used to create the bar graph, reshape it, and save it as a CSV.
df_final <- df_bar %>%
pivot_wider(names_from = "bar_part", values_from= "value") %>%
mutate(total_var = `Persistent`+`Transitory sampling`+`Transitory other`) %>%
mutate(share_pers = `Persistent`/total_var, share_samp = `Transitory sampling`/total_var, share_other = `Transitory other`/total_var)
df_final$subject <- subject
df_final$state_grade <- state_grade
write_csv(df_final, file.path(output, "intermediate", paste(subject, "-", state_grade, "- var deco.csv")))