This notebook performs calculations and creates graphs for the section “Comparison of ASER, NAS, and IHDS” in the paper “Assessing the Assessments: Taking Stock of Learning Outcomes Data in India.”
Others who would like to run this notebook locally will need to:
library(tidyverse); library(haven); library(survey); library(ggrepel); library(broom); library(ICC)
## -- Attaching packages ---------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
ihds_ind_dir <- "C:/Users/dougj/Documents/Data/IHDS/IHDS 2012/DS0001"
figures <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures"
# import ASER and NAS data
aser_nas <- read_csv("https://raw.githubusercontent.com/dougj892/public-datasets/master/ASER%202018%20and%20NAS%202017%20govt%20school%20grade%203%20reading.csv")
## Parsed with column specification:
## cols(
## State = col_character(),
## state_abbr = col_character(),
## ASER_2012 = col_double(),
## ASER_2014 = col_double(),
## ASER_2016 = col_double(),
## ASER_2018 = col_double(),
## NAS = col_double(),
## aser_rank = col_double(),
## nas_rank = col_double()
## )
# Create graph
ggplot(data = aser_nas,aes(x=aser_rank,y=nas_rank,label= state_abbr)) +
geom_abline(intercept = 0, slope = 1, color="orange") +
geom_point(color="darkblue") +
theme_bw() +
labs(title="State Rankings Based on Language Results for Std III Students (Rural)", x = "Rank in ASER (2018)", y = "Rank in NAS (2017)") +
scale_y_continuous(breaks=c(1:28)) + scale_x_continuous(breaks=c(1:28)) + theme(panel.grid.minor = element_blank(), panel.grid.major = element_line(colour = "gray",size=0.1)) +
geom_label_repel()
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_label_repel).
# Save graph
ggsave("aser_nas_lang_ranking.png", width = 9, height = 6 , path = figures)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_label_repel).
Given the huge variation in state GDP per capita, one would expect that there would be some correlation in NSDP per capita and learning outcomes. Surprisingly, this is not the case.
# Import NSDP data
nsdp <- read_csv("https://raw.githubusercontent.com/dougj892/public-datasets/master/state%20GDP%20per%20capita.csv")
## Parsed with column specification:
## cols(
## State = col_character(),
## `NSDP per capita` = col_double(),
## `Data year` = col_character()
## )
# Merge NSDP data with ASER and NAS data
aser_nas_nsdp <- aser_nas %>% left_join(nsdp, by = "State")
# Check that there are no states which didn't match
print(paste("Number of unmatched states", sum(is.na(aser_nas_nsdp$`NSDP per capita`))))
## [1] "Number of unmatched states 0"
# Create a ranking for NSDP
aser_nas_nsdp$gdp_rank <- NA
aser_nas_nsdp$gdp_rank[order(aser_nas_nsdp$`NSDP per capita`, decreasing = TRUE)] <- 1:nrow(aser_nas_nsdp)
# Calculate pairwise correlations for the 3 datasets
temp <- aser_nas_nsdp %>% select(ASER_2018, 'NSDP per capita', NAS)
print("Pearson (regular) correlation:")
## [1] "Pearson (regular) correlation:"
cor(temp, method = "pearson", use = "pairwise.complete.obs")
## ASER_2018 NSDP per capita NAS
## ASER_2018 1.0000000 0.4154964 0.1872901
## NSDP per capita 0.4154964 1.0000000 0.0515147
## NAS 0.1872901 0.0515147 1.0000000
print("Spearman (rank) correlation")
## [1] "Spearman (rank) correlation"
cor(temp, method = "spearman", use = "pairwise.complete.obs")
## ASER_2018 NSDP per capita NAS
## ASER_2018 1.0000000 0.3758205 0.1267176
## NSDP per capita 0.3758205 1.0000000 0.1382236
## NAS 0.1267176 0.1382236 1.0000000
# Calculate correlations with p-values for null rho = 0
cor.test(temp$ASER_2018,temp$`NSDP per capita`, use = "pairwise.complete.obs", method = "pearson")
##
## Pearson's product-moment correlation
##
## data: temp$ASER_2018 and temp$`NSDP per capita`
## t = 2.284, df = 25, p-value = 0.03113
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04213542 0.68703188
## sample estimates:
## cor
## 0.4154964
cor.test(temp$ASER_2018,temp$`NSDP per capita`, use = "pairwise.complete.obs", method = "spearman")
## Warning in cor.test.default(temp$ASER_2018, temp$`NSDP per capita`, use =
## "pairwise.complete.obs", : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: temp$ASER_2018 and temp$`NSDP per capita`
## S = 2044.8, p-value = 0.05337
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3758205
cor.test(temp$NAS,temp$`NSDP per capita`, use = "pairwise.complete.obs", method = "pearson")
##
## Pearson's product-moment correlation
##
## data: temp$NAS and temp$`NSDP per capita`
## t = 0.26302, df = 26, p-value = 0.7946
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3278634 0.4165853
## sample estimates:
## cor
## 0.0515147
cor.test(temp$NAS,temp$`NSDP per capita`, use = "pairwise.complete.obs", method = "spearman")
## Warning in cor.test.default(temp$NAS, temp$`NSDP per capita`, use =
## "pairwise.complete.obs", : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: temp$NAS and temp$`NSDP per capita`
## S = 3148.9, p-value = 0.483
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1382236
aser_trends <- read_csv("https://raw.githubusercontent.com/dougj892/public-datasets/master/ASER%20trends%20over%20time.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## State = col_character(),
## state_abbr = col_character()
## )
## See spec(...) for full column specifications.
corrs <- aser_trends %>%
group_by(year) %>%
summarise(pearson = cor(std3_read_std1_all, std3_subtract_all, use="pairwise.complete.obs", method="pearson"),
spearman = cor(std3_read_std1_all, std3_subtract_all, use="pairwise.complete.obs", method="spearman"))
## `summarise()` ungrouping output (override with `.groups` argument)
corrs %>% summarise_all(mean, na.rm = TRUE)
## # A tibble: 1 x 3
## year pearson spearman
## <dbl> <dbl> <dbl>
## 1 2011. 0.819 0.798
Calculate percentage of students with ASER (TA8B) == 4 (i.e. can read std 2 level text) for rural (URBAN2011 == 0) govt school (CS4 ==2) or private aided (CS4 == 3) students in grades (TA4) 2 to 5 (there are too few students if we restrict only to grade 3) by state. warning: this code takes a long time
ind_file <- file.path(ihds_ind_dir, "36151-0001-Data.dta")
# read in just those variables that i need
# this is much faster than reading in everything and then selecting
ihds <- read_dta(ind_file, col_select = c(STATEID, DISTID, PSUID, HHID, HHSPLITID, PERSONID, IDPSU, WT, RO3, RO7, RO5, URBAN2011, starts_with("CS"), starts_with("TA"), starts_with("ED")))
ihds <- ihds %>% mutate(psu_expanded = paste(STATEID, DISTID, PSUID, sep ="-"), hh_expanded = paste(STATEID, DISTID, PSUID, HHID, HHSPLITID, sep ="-"))
# confirm that TA4 (class) is not NA if TA8B is not NA --> there are only 38 instances when TA8B is not NA but TA4 is NA
ihds %>% filter(!is.na(TA8B)) %>% count(TA4)
## # A tibble: 11 x 2
## TA4 n
## <dbl> <int>
## 1 0 185
## 2 1 1205
## 3 2 1981
## 4 3 2571
## 5 4 2613
## 6 5 2001
## 7 6 949
## 8 7 242
## 9 8 57
## 10 9 15
## 11 NA 38
# drop the one row with missing values for weights
ihds <- ihds %>% filter(!is.na(WT))
# create variable for ASER at level 4
ihds <- ihds %>% mutate(ASER4 = (TA8B ==4)) %>% mutate(State = as_factor(STATEID))
# use the survey package to set the survey design. I will use the ihds_svy object to calculate CIs
ihds_svy <- svydesign(id =~ psu_expanded + hh_expanded, weights =~ WT, data = ihds)
# use statsby to get the % of selected kids who achieve level 4 on ASER reading by state
# note that I am not sure if subsetting within statsby is kosher, but the standard errors should be ok
# more or less ok regardless
ihds_scores <- svyby(~ASER4, ~State, subset(ihds_svy, !is.na(TA8B) & (CS4 == 2 | CS4 == 3) & (TA4 >= 2 & TA4 <= 5) & (URBAN2011 == 0)), svymean, na.rm=TRUE)
# convert to a tibble
ihds_scores <- as.tibble(ihds_scores) %>% select(State, ASER4TRUE, se.ASER4TRUE) %>% rename(ihds = ASER4TRUE, ihds_se = se.ASER4TRUE)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Unsure, but I think this gets rid of the space and number at the end of the state name
ihds_scores$State <- sub("...$", "", ihds_scores$State)
# replace the standard errors with NA is SE == 0
ihds_scores$ihds[ihds_scores$ihds_se == 0] <- NA
# replace Orissa with Odisha in state
ihds_scores$State[ihds_scores$State == "Orissa"] <- "Odisha"
ihds_scores$ihds <- round(ihds_scores$ihds, 3)*100
scores <- aser_nas %>% select(State, ASER_2018, NAS) %>%
full_join(ihds_scores, by = "State")
# drop if IHDS_ASER is NA
# scores <- scores %>% filter(!is.na(IHDS_ASER)) %>% arrange(ASER) %>% select(State, ASER, IHDS_ASER, NAS, obs)
# calculate correlation matrix --> while there are a few outliers, overall the ASER data and IHDS data match Ok
temp <- scores %>% select(ASER_2018, ihds, NAS)
cor(temp, method = "pearson", use = "pairwise.complete.obs")
## ASER_2018 ihds NAS
## ASER_2018 1.0000000 0.617984954 0.187290066
## ihds 0.6179850 1.000000000 0.003204316
## NAS 0.1872901 0.003204316 1.000000000
cor(temp, method = "spearman", use = "pairwise.complete.obs")
## ASER_2018 ihds NAS
## ASER_2018 1.0000000 0.60152502 0.12671756
## ihds 0.6015250 1.00000000 -0.03106467
## NAS 0.1267176 -0.03106467 1.00000000
# Same as above, but with p-values
cor.test(temp$NAS,temp$ihds, use = "pairwise.complete.obs", method = "pearson")
##
## Pearson's product-moment correlation
##
## data: temp$NAS and temp$ihds
## t = 0.01433, df = 20, p-value = 0.9887
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4189701 0.4242396
## sample estimates:
## cor
## 0.003204316
cor.test(temp$ASER_2018,temp$ihds, use = "pairwise.complete.obs", method = "pearson")
##
## Pearson's product-moment correlation
##
## data: temp$ASER_2018 and temp$ihds
## t = 3.5153, df = 20, p-value = 0.002176
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2655702 0.8247157
## sample estimates:
## cor
## 0.617985
cor.test(temp$NAS,temp$ASER_2018, use = "pairwise.complete.obs", method = "pearson")
##
## Pearson's product-moment correlation
##
## data: temp$NAS and temp$ASER_2018
## t = 0.95332, df = 25, p-value = 0.3496
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2074917 0.5296102
## sample estimates:
## cor
## 0.1872901
cor.test(temp$NAS,temp$ihds, use = "pairwise.complete.obs", method = "spearman")
## Warning in cor.test.default(temp$NAS, temp$ihds, use =
## "pairwise.complete.obs", : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: temp$NAS and temp$ihds
## S = 1826, p-value = 0.8908
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.03106467
cor.test(temp$ASER_2018,temp$ihds, use = "pairwise.complete.obs", method = "spearman")
## Warning in cor.test.default(temp$ASER_2018, temp$ihds, use =
## "pairwise.complete.obs", : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: temp$ASER_2018 and temp$ihds
## S = 705.7, p-value = 0.003063
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.601525
cor.test(temp$NAS,temp$ASER_2018, use = "pairwise.complete.obs", method = "spearman")
## Warning in cor.test.default(temp$NAS, temp$ASER_2018, use =
## "pairwise.complete.obs", : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: temp$NAS and temp$ASER_2018
## S = 2860.9, p-value = 0.5288
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1267176
Calculate ASER ICC using the ICCest function from the ICC package as well manually using the output from the ANOVA table. (Note that the manual method assumes constant number of children per village which is not accurate but yields a similar result.)
# Calculate ICC
icc_df <- ihds %>%
filter(!is.na(TA8B) & (TA4 >= 2 & TA4 <= 5) & (URBAN2011 == 0))
ICCest(x = psu_expanded, y = TA8B, data = icc_df)
## $ICC
## [1] 0.182186
##
## $LowerCI
## [1] 0.1582673
##
## $UpperCI
## [1] 0.2073873
##
## $N
## [1] 1295
##
## $k
## [1] 4.933094
##
## $varw
## [1] 1.525748
##
## $vara
## [1] 0.339894
# Verify this by calculating the ICC manually from the ANOVA table
fit <- lm(TA8B~psu_expanded, data = icc_df)
anova_est <- tidy(anova(fit))
ms_between <- anova_est$meansq[anova_est$term == "psu_expanded"]
ms_resid <- anova_est$meansq[anova_est$term == "Residuals"]
# get average number of children per village
k <- icc_df %>% count(psu_expanded) %>% summarize(mean(n))
k <- k[[1,1]]
icc <- (ms_between - ms_resid)/(ms_between+(k-1)*ms_resid)
print(icc)
## [1] 0.1821244
# treat the SE as just another score
scores.long2 <- scores %>%
filter(!is.na(ASER_2018) & !is.na(NAS)) %>%
mutate(IHDS = paste(ihds,ihds_se, sep ="-")) %>%
select(State, ASER_2018, NAS, IHDS) %>%
gather(Assessment, avg_score, -State) %>%
separate(avg_score, sep = "-", into = c("avg_score","se"))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 54 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# convert avg_score and se back to numeric
scores.long2 <- scores.long2 %>%
mutate(avg_score = as.numeric(avg_score), se = as.numeric(se))
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
# multiply se
scores.long2 <- scores.long2 %>%
mutate(se = 100*se)
# create y_min and y_max
m <- qnorm(1-.05/2)
scores.long2 <- scores.long2 %>%
mutate(ymin = avg_score-m*se, ymax = avg_score+m*se) %>%
mutate(ymin = ifelse(ymin <0, 0, ymin))
ggplot(data = scores.long2, aes(x= reorder(State, avg_score),y=avg_score, fill= Assessment)) +
geom_bar(stat = "identity", position = position_dodge(width=.8)) +
geom_errorbar(aes(ymin = ymin, ymax = ymax), width = .1) +
theme(axis.text.x = element_text(angle=90)) +
scale_fill_manual(values = c("lightblue", "blue", "darkblue"))+
labs(y = "Average score", x = "", title = "ASER, NAS, and IHDS", caption = "Bars on IHDS estimates show 95% confidence intervals.")
## Warning: Removed 5 rows containing missing values (geom_bar).
ggsave("aser_nas_ihds_values.png", width = 9, height = 6 , path = figures)
## Warning: Removed 5 rows containing missing values (geom_bar).