1 Summary

This notebook analyses IHDS ASER data to determine whether absent students could affect in-school assessments like the NAS. To do this we…

Run a regression of the ASER score on a variable for the number of days absent from school in the previous month, state dummies, and interactions between the state dummies and the absence variable. We then inspect the coefficients for the interaction terms and test their joint significance.
Compare estimates of state averages on the full sample with estimates of state averages where we take into account possible missingness due to absence.

Note that much of this analysis performed in this notebook is not included in the working paper but I am keeping it here since it may be useful later.

library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------ tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggrepel)
library(haven)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(broom)
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

figures <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures"

2 Read data and set the survey design

ihds_ind_dir <- "C:/Users/dougj/Documents/Data/IHDS/IHDS 2012/DS0001"
ind_file <- file.path(ihds_ind_dir, "36151-0001-Data.dta")
# read in just those variables that i need
# this is much faster than reading in everything and then selecting
ihds <- read_dta(ind_file, col_select = c(STATEID, DISTID, PSUID, URBAN2011, HHID, HHSPLITID, PERSONID, IDPSU, WT, RO3, RO7, RO5, starts_with("CS"), starts_with("TA"), starts_with("ED")) )

# create a variable for whether highest language level achieved
ihds <- ihds %>% mutate(ASER4 = (TA8B ==4)) %>% mutate(State = as_factor(STATEID), Class = as_factor(TA4))

# drop the one row with missing values for weights
ihds <- ihds %>% filter(!is.na(WT))
ihds <- ihds %>% mutate(psu_expanded = paste(STATEID, DISTID, PSUID, sep ="-"), hh_expanded = paste(STATEID, DISTID, PSUID, HHID, HHSPLITID, sep ="-"))

# Specify the survey design
ihds_svy <- svydesign(id =~ psu_expanded + hh_expanded, weights =~ WT, data = ihds)

3 Inspect data

Inspect the CS13 variable which is self-reported number of days absent from school each month. Only look at observations for which we have ASER score and the child attends a govt or govt aided school.

attributes(ihds$CS13)

## $label
## [1] "EQ4 2.13 Days/month absent"
## 
## $format.stata
## [1] "%2.0f"

# simple tab of CS13 --> note that the max value is 30.
ihds %>% filter(!is.na(TA8B)) %>%
  filter(CS4 == 2 | CS4 == 3) %>%
  group_by(CS13) %>% 
  count()

## # A tibble: 30 x 2
## # Groups:   CS13 [30]
##     CS13     n
##    <dbl> <int>
##  1     0  2147
##  2     1   382
##  3     2   983
##  4     3   688
##  5     4   675
##  6     5   667
##  7     6   288
##  8     7   182
##  9     8   260
## 10     9    64
## # ... with 20 more rows

# check that this is not missing for kids with ASER results --> looks like there are only a few NAs
ihds %>% filter(!is.na(TA8B)) %>% summarise(non_na_count = sum(!is.na(CS13)), na_count = sum(is.na(CS13)) )

## # A tibble: 1 x 2
##   non_na_count na_count
##          <int>    <int>
## 1        11163      694

4 Graphical analysis of ASER scores by absence

Compare average ASER score for different levels of absence for kids attending govt schools. First, graph average score vs absence. Note that there are relatively few kids with absence over 10 so the area of this graph with absence > 10 should probably be ignored. Second, create histograms of score for each value of absence up to 10.

temp <- ihds %>% filter(!is.na(TA8B) & !is.na(CS13)) %>% 
  filter(CS4 == 2 | CS4 == 3) %>%
  group_by(CS13) %>% 
  summarise(ASER_score = weighted.mean(TA8B, WT)) %>% 
  ungroup() %>%
  rename(days_absent = CS13)

## `summarise()` ungrouping output (override with `.groups` argument)

ggplot(temp, aes(x = days_absent, y = ASER_score)) + geom_line()

temp <- ihds %>% mutate(days_absent_capped = ifelse(CS13 > 10, 10, CS13)) %>% 
  filter(!is.na(TA8B) & !is.na(days_absent_capped))
# check that this worked
temp %>% group_by(days_absent_capped) %>% count()

## # A tibble: 11 x 2
## # Groups:   days_absent_capped [11]
##    days_absent_capped     n
##                 <dbl> <int>
##  1                  0  3574
##  2                  1   681
##  3                  2  1690
##  4                  3  1125
##  5                  4  1009
##  6                  5  1002
##  7                  6   451
##  8                  7   259
##  9                  8   385
## 10                  9    95
## 11                 10   892

# the graph below shows the relative frequency of different ASER scores by # days absent
# note that 10 could be 10 or greater as I have replaced all values of CS13>10 with 10
ggplot(temp, aes(factor(TA8B), group = factor(days_absent_capped))) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count", fill = "gray") + 
  scale_y_continuous(labels=scales::percent) +
  ylab("relative frequencies") +
  xlab("ASER reading score") +
  facet_grid(~factor(days_absent_capped))

5 Regression of ASER score on absence and state vars

Regress ASER score on age, absence, state, and state x absence. Then test the joint significance of all the state x absence terms.

# display the label for each of variables
# vars <- list(df$CS13, df$RO5)
# lapply(vars, FUN = function(x) attributes(x)$label)


# Run a regression of ASER on absence, state dummies, state and absence interactions, and class dummies
# Restrict observations to rural students (URBAN2011 ==0) and
# in govt (CS4 == 2) or private aided (CS4 ==3) schools in grades 2 to 5 (TA4 between 2 and 5)

model <- svyglm(formula = ASER4 ~ CS13 + State + CS13*State + Class, 
       design = subset(ihds_svy, !is.na(TA8B) & (CS4 == 2 | CS4 == 3) & (TA4 >= 2 & TA4 <= 5) & (URBAN2011 == 0)))

# test the joint hypothesis that the interaction terms are null
linearHypothesis(model, matchCoefs(model, "CS13:"), white.adjust = "hc1")

## Linear hypothesis test
## 
## Hypothesis:
## CS13:StateHimachal Pradesh 02 = 0
## CS13:StatePunjab 03 = 0
## CS13:StateUttarakhand 05 = 0
## CS13:StateHaryana 06 = 0
## CS13:StateRajasthan 08 = 0
## CS13:StateUttar Pradesh 09 = 0
## CS13:StateBihar 10 = 0
## CS13:StateArunachal Pradesh 12 = 0
## CS13:StateNagaland 13 = 0
## CS13:StateTripura 16 = 0
## CS13:StateMeghalaya 17 = 0
## CS13:StateAssam 18 = 0
## CS13:StateWest Bengal 19 = 0
## CS13:StateJharkhand 20 = 0
## CS13:StateOrissa 21 = 0
## CS13:StateChhattisgarh 22 = 0
## CS13:StateMadhya Pradesh 23 = 0
## CS13:StateGujarat 24 = 0
## CS13:StateDaman & Diu 25 = 0
## CS13:StateDadra  + Nagar Haveli 26 = 0
## CS13:StateMaharashtra 27 = 0
## CS13:StateAndhra Pradesh 28 = 0
## CS13:StateKarnataka 29 = 0
## CS13:StateGoa 30 = 0
## CS13:StateKerala 32 = 0
## CS13:StateTamil Nadu 33 = 0
## 
## Model 1: restricted model
## Model 2: ASER4 ~ CS13 + State + CS13 * State + Class
## 
##   Res.Df Df  Chisq Pr(>Chisq)    
## 1   1100                         
## 2   1074 26 64.486  4.065e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# load the broom package and use it to generate a nice tibble of the coefficients
coefs <- tidy(model)
coefs

## # A tibble: 61 x 5
##    term                     estimate std.error statistic p.value
##    <chr>                       <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)                0.164     0.128     1.28    0.201 
##  2 CS13                      -0.0427    0.0216   -1.98    0.0481
##  3 StateHimachal Pradesh 02   0.127     0.141     0.905   0.366 
##  4 StatePunjab 03             0.119     0.152     0.786   0.432 
##  5 StateUttarakhand 05        0.160     0.170     0.940   0.347 
##  6 StateHaryana 06            0.0638    0.148     0.431   0.666 
##  7 StateDelhi 07              0.0490    0.0447    1.10    0.273 
##  8 StateRajasthan 08          0.0120    0.139     0.0869  0.931 
##  9 StateUttar Pradesh 09      0.0246    0.137     0.180   0.857 
## 10 StateBihar 10             -0.126     0.138    -0.916   0.360 
## # ... with 51 more rows

6 Compare state estimates assuming some students more likely to be absent

We can also more directly calculate the effect of absence on state scores by comparing estimates of state averages with estimates which take into account potential absence on the day of the assessment. To do this, we create a new weight variable which takes into account the probability that the student would be present on the day of the exam. Our new WT variable is…

\[ newweight = oldweight*\frac{30-daysabsentpermonth}{30} \] As in previous analyses, we only look at students in rural govt or private aided schools.

state_averages <- ihds %>% filter(!is.na(TA8B) & !is.na(CS13)) %>% 
  filter((CS4 == 2 | CS4 == 3) & (URBAN2011 ==0)) %>%
  mutate(new.weight = WT*(30-CS13)/30) %>%
  group_by(State) %>% 
  summarise(full_sample = weighted.mean(ASER4, WT), absence_weighted = weighted.mean(ASER4, new.weight), 
            diff = full_sample-absence_weighted)

## `summarise()` ungrouping output (override with `.groups` argument)

state_averages$full_rank[order(state_averages$full_sample)] <- 1:nrow(state_averages)

## Warning: Unknown or uninitialised column: `full_rank`.

state_averages$absence_rank[order(state_averages$absence_weighted)] <- 1:nrow(state_averages)

## Warning: Unknown or uninitialised column: `absence_rank`.

state_averages

## # A tibble: 31 x 6
##    State            full_sample absence_weighted     diff full_rank absence_rank
##    <fct>                  <dbl>            <dbl>    <dbl>     <int>        <int>
##  1 Jammu & Kashmir~       0.240            0.247 -6.91e-3        17           16
##  2 Himachal Prades~       0.526            0.531 -4.95e-3        29           29
##  3 Punjab 03              0.420            0.429 -9.22e-3        27           26
##  4 Uttarakhand 05         0.409            0.459 -5.03e-2        26           27
##  5 Haryana 06             0.406            0.416 -1.08e-2        25           25
##  6 Delhi 07               0.667            0.699 -3.21e-2        30           30
##  7 Rajasthan 08           0.329            0.339 -9.96e-3        20           21
##  8 Uttar Pradesh 09       0.236            0.248 -1.21e-2        16           17
##  9 Bihar 10               0.174            0.174  1.26e-4        12           12
## 10 Sikkim 11              0                0      0.              1            1
## # ... with 21 more rows

# convert state column from factor to character
state_averages$State <- sub("...$", "", as.character(state_averages$State))

# Create scatter plot of full sample ASER4 score versus absence weighted score for each state
ggplot(state_averages, aes(x = full_sample, y = absence_weighted, label = State)) +
  geom_abline(intercept = 0, slope = 1, color="orange")  +
  geom_point() +
  geom_label_repel(size = 3) +
  labs(x = "NAS score", y ="Expected absence-weighted NAS sore", title = "NAS score vs. expected absence-weighted score")

ggsave("absence_weighted_aser.png", path= figures)

## Saving 7 x 5 in image

# Export a table with the state rankings
state_rankings <- state_averages %>% 
  mutate(State = str_replace(State, "&", "and")) %>%
  select(State, full_rank, absence_rank) %>% 
  arrange(full_rank)
stargazer(state_rankings, type = "html", covariate.labels = c("SL", "State", "Rank - unweighted", "Rank - absence weighted"),summary = FALSE, out = file.path(figures, "state_rankings.doc"))