This notebook analyses IHDS ASER data to determine whether absent students could affect in-school assessments like the NAS. To do this we…
Note that much of this analysis performed in this notebook is not included in the working paper but I am keeping it here since it may be useful later.
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggrepel)
library(haven)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(broom)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
figures <- "C:/Users/dougj/Dropbox/Education in India/Original research/aservnas/figures"
ihds_ind_dir <- "C:/Users/dougj/Documents/Data/IHDS/IHDS 2012/DS0001"
ind_file <- file.path(ihds_ind_dir, "36151-0001-Data.dta")
# read in just those variables that i need
# this is much faster than reading in everything and then selecting
ihds <- read_dta(ind_file, col_select = c(STATEID, DISTID, PSUID, URBAN2011, HHID, HHSPLITID, PERSONID, IDPSU, WT, RO3, RO7, RO5, starts_with("CS"), starts_with("TA"), starts_with("ED")) )
# create a variable for whether highest language level achieved
ihds <- ihds %>% mutate(ASER4 = (TA8B ==4)) %>% mutate(State = as_factor(STATEID), Class = as_factor(TA4))
# drop the one row with missing values for weights
ihds <- ihds %>% filter(!is.na(WT))
ihds <- ihds %>% mutate(psu_expanded = paste(STATEID, DISTID, PSUID, sep ="-"), hh_expanded = paste(STATEID, DISTID, PSUID, HHID, HHSPLITID, sep ="-"))
# Specify the survey design
ihds_svy <- svydesign(id =~ psu_expanded + hh_expanded, weights =~ WT, data = ihds)
Inspect the CS13 variable which is self-reported number of days absent from school each month. Only look at observations for which we have ASER score and the child attends a govt or govt aided school.
attributes(ihds$CS13)
## $label
## [1] "EQ4 2.13 Days/month absent"
##
## $format.stata
## [1] "%2.0f"
# simple tab of CS13 --> note that the max value is 30.
ihds %>% filter(!is.na(TA8B)) %>%
filter(CS4 == 2 | CS4 == 3) %>%
group_by(CS13) %>%
count()
## # A tibble: 30 x 2
## # Groups: CS13 [30]
## CS13 n
## <dbl> <int>
## 1 0 2147
## 2 1 382
## 3 2 983
## 4 3 688
## 5 4 675
## 6 5 667
## 7 6 288
## 8 7 182
## 9 8 260
## 10 9 64
## # ... with 20 more rows
# check that this is not missing for kids with ASER results --> looks like there are only a few NAs
ihds %>% filter(!is.na(TA8B)) %>% summarise(non_na_count = sum(!is.na(CS13)), na_count = sum(is.na(CS13)) )
## # A tibble: 1 x 2
## non_na_count na_count
## <int> <int>
## 1 11163 694
Compare average ASER score for different levels of absence for kids attending govt schools. First, graph average score vs absence. Note that there are relatively few kids with absence over 10 so the area of this graph with absence > 10 should probably be ignored. Second, create histograms of score for each value of absence up to 10.
temp <- ihds %>% filter(!is.na(TA8B) & !is.na(CS13)) %>%
filter(CS4 == 2 | CS4 == 3) %>%
group_by(CS13) %>%
summarise(ASER_score = weighted.mean(TA8B, WT)) %>%
ungroup() %>%
rename(days_absent = CS13)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(temp, aes(x = days_absent, y = ASER_score)) + geom_line()
temp <- ihds %>% mutate(days_absent_capped = ifelse(CS13 > 10, 10, CS13)) %>%
filter(!is.na(TA8B) & !is.na(days_absent_capped))
# check that this worked
temp %>% group_by(days_absent_capped) %>% count()
## # A tibble: 11 x 2
## # Groups: days_absent_capped [11]
## days_absent_capped n
## <dbl> <int>
## 1 0 3574
## 2 1 681
## 3 2 1690
## 4 3 1125
## 5 4 1009
## 6 5 1002
## 7 6 451
## 8 7 259
## 9 8 385
## 10 9 95
## 11 10 892
# the graph below shows the relative frequency of different ASER scores by # days absent
# note that 10 could be 10 or greater as I have replaced all values of CS13>10 with 10
ggplot(temp, aes(factor(TA8B), group = factor(days_absent_capped))) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count", fill = "gray") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
xlab("ASER reading score") +
facet_grid(~factor(days_absent_capped))
Regress ASER score on age, absence, state, and state x absence. Then test the joint significance of all the state x absence terms.
# display the label for each of variables
# vars <- list(df$CS13, df$RO5)
# lapply(vars, FUN = function(x) attributes(x)$label)
# Run a regression of ASER on absence, state dummies, state and absence interactions, and class dummies
# Restrict observations to rural students (URBAN2011 ==0) and
# in govt (CS4 == 2) or private aided (CS4 ==3) schools in grades 2 to 5 (TA4 between 2 and 5)
model <- svyglm(formula = ASER4 ~ CS13 + State + CS13*State + Class,
design = subset(ihds_svy, !is.na(TA8B) & (CS4 == 2 | CS4 == 3) & (TA4 >= 2 & TA4 <= 5) & (URBAN2011 == 0)))
# test the joint hypothesis that the interaction terms are null
linearHypothesis(model, matchCoefs(model, "CS13:"), white.adjust = "hc1")
## Linear hypothesis test
##
## Hypothesis:
## CS13:StateHimachal Pradesh 02 = 0
## CS13:StatePunjab 03 = 0
## CS13:StateUttarakhand 05 = 0
## CS13:StateHaryana 06 = 0
## CS13:StateRajasthan 08 = 0
## CS13:StateUttar Pradesh 09 = 0
## CS13:StateBihar 10 = 0
## CS13:StateArunachal Pradesh 12 = 0
## CS13:StateNagaland 13 = 0
## CS13:StateTripura 16 = 0
## CS13:StateMeghalaya 17 = 0
## CS13:StateAssam 18 = 0
## CS13:StateWest Bengal 19 = 0
## CS13:StateJharkhand 20 = 0
## CS13:StateOrissa 21 = 0
## CS13:StateChhattisgarh 22 = 0
## CS13:StateMadhya Pradesh 23 = 0
## CS13:StateGujarat 24 = 0
## CS13:StateDaman & Diu 25 = 0
## CS13:StateDadra + Nagar Haveli 26 = 0
## CS13:StateMaharashtra 27 = 0
## CS13:StateAndhra Pradesh 28 = 0
## CS13:StateKarnataka 29 = 0
## CS13:StateGoa 30 = 0
## CS13:StateKerala 32 = 0
## CS13:StateTamil Nadu 33 = 0
##
## Model 1: restricted model
## Model 2: ASER4 ~ CS13 + State + CS13 * State + Class
##
## Res.Df Df Chisq Pr(>Chisq)
## 1 1100
## 2 1074 26 64.486 4.065e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# load the broom package and use it to generate a nice tibble of the coefficients
coefs <- tidy(model)
coefs
## # A tibble: 61 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.164 0.128 1.28 0.201
## 2 CS13 -0.0427 0.0216 -1.98 0.0481
## 3 StateHimachal Pradesh 02 0.127 0.141 0.905 0.366
## 4 StatePunjab 03 0.119 0.152 0.786 0.432
## 5 StateUttarakhand 05 0.160 0.170 0.940 0.347
## 6 StateHaryana 06 0.0638 0.148 0.431 0.666
## 7 StateDelhi 07 0.0490 0.0447 1.10 0.273
## 8 StateRajasthan 08 0.0120 0.139 0.0869 0.931
## 9 StateUttar Pradesh 09 0.0246 0.137 0.180 0.857
## 10 StateBihar 10 -0.126 0.138 -0.916 0.360
## # ... with 51 more rows
We can also more directly calculate the effect of absence on state scores by comparing estimates of state averages with estimates which take into account potential absence on the day of the assessment. To do this, we create a new weight variable which takes into account the probability that the student would be present on the day of the exam. Our new WT variable is…
\[ newweight = oldweight*\frac{30-daysabsentpermonth}{30} \] As in previous analyses, we only look at students in rural govt or private aided schools.
state_averages <- ihds %>% filter(!is.na(TA8B) & !is.na(CS13)) %>%
filter((CS4 == 2 | CS4 == 3) & (URBAN2011 ==0)) %>%
mutate(new.weight = WT*(30-CS13)/30) %>%
group_by(State) %>%
summarise(full_sample = weighted.mean(ASER4, WT), absence_weighted = weighted.mean(ASER4, new.weight),
diff = full_sample-absence_weighted)
## `summarise()` ungrouping output (override with `.groups` argument)
state_averages$full_rank[order(state_averages$full_sample)] <- 1:nrow(state_averages)
## Warning: Unknown or uninitialised column: `full_rank`.
state_averages$absence_rank[order(state_averages$absence_weighted)] <- 1:nrow(state_averages)
## Warning: Unknown or uninitialised column: `absence_rank`.
state_averages
## # A tibble: 31 x 6
## State full_sample absence_weighted diff full_rank absence_rank
## <fct> <dbl> <dbl> <dbl> <int> <int>
## 1 Jammu & Kashmir~ 0.240 0.247 -6.91e-3 17 16
## 2 Himachal Prades~ 0.526 0.531 -4.95e-3 29 29
## 3 Punjab 03 0.420 0.429 -9.22e-3 27 26
## 4 Uttarakhand 05 0.409 0.459 -5.03e-2 26 27
## 5 Haryana 06 0.406 0.416 -1.08e-2 25 25
## 6 Delhi 07 0.667 0.699 -3.21e-2 30 30
## 7 Rajasthan 08 0.329 0.339 -9.96e-3 20 21
## 8 Uttar Pradesh 09 0.236 0.248 -1.21e-2 16 17
## 9 Bihar 10 0.174 0.174 1.26e-4 12 12
## 10 Sikkim 11 0 0 0. 1 1
## # ... with 21 more rows
# convert state column from factor to character
state_averages$State <- sub("...$", "", as.character(state_averages$State))
# Create scatter plot of full sample ASER4 score versus absence weighted score for each state
ggplot(state_averages, aes(x = full_sample, y = absence_weighted, label = State)) +
geom_abline(intercept = 0, slope = 1, color="orange") +
geom_point() +
geom_label_repel(size = 3) +
labs(x = "NAS score", y ="Expected absence-weighted NAS sore", title = "NAS score vs. expected absence-weighted score")
ggsave("absence_weighted_aser.png", path= figures)
## Saving 7 x 5 in image
# Export a table with the state rankings
state_rankings <- state_averages %>%
mutate(State = str_replace(State, "&", "and")) %>%
select(State, full_rank, absence_rank) %>%
arrange(full_rank)
stargazer(state_rankings, type = "html", covariate.labels = c("SL", "State", "Rank - unweighted", "Rank - absence weighted"),summary = FALSE, out = file.path(figures, "state_rankings.doc"))
##
## <table style="text-align:center"><tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">SL</td><td>State</td><td>Rank - unweighted</td><td>Rank - absence weighted</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">1</td><td>Sikkim</td><td>1</td><td>1</td></tr>
## <tr><td style="text-align:left">2</td><td>Manipur</td><td>2</td><td>2</td></tr>
## <tr><td style="text-align:left">3</td><td>Mizoram</td><td>3</td><td>3</td></tr>
## <tr><td style="text-align:left">4</td><td>Daman and Diu</td><td>4</td><td>6</td></tr>
## <tr><td style="text-align:left">5</td><td>Arunachal Pradesh</td><td>5</td><td>5</td></tr>
## <tr><td style="text-align:left">6</td><td>Andhra Pradesh</td><td>6</td><td>4</td></tr>
## <tr><td style="text-align:left">7</td><td>Meghalaya</td><td>7</td><td>7</td></tr>
## <tr><td style="text-align:left">8</td><td>Tripura</td><td>8</td><td>8</td></tr>
## <tr><td style="text-align:left">9</td><td>Tamil Nadu</td><td>9</td><td>9</td></tr>
## <tr><td style="text-align:left">10</td><td>Jharkhand</td><td>10</td><td>10</td></tr>
## <tr><td style="text-align:left">11</td><td>Karnataka</td><td>11</td><td>11</td></tr>
## <tr><td style="text-align:left">12</td><td>Bihar</td><td>12</td><td>12</td></tr>
## <tr><td style="text-align:left">13</td><td>Assam</td><td>13</td><td>13</td></tr>
## <tr><td style="text-align:left">14</td><td>Maharashtra</td><td>14</td><td>14</td></tr>
## <tr><td style="text-align:left">15</td><td>Gujarat</td><td>15</td><td>15</td></tr>
## <tr><td style="text-align:left">16</td><td>Uttar Pradesh</td><td>16</td><td>17</td></tr>
## <tr><td style="text-align:left">17</td><td>Jammu and Kashmir</td><td>17</td><td>16</td></tr>
## <tr><td style="text-align:left">18</td><td>Dadra+Nagar Haveli</td><td>18</td><td>18</td></tr>
## <tr><td style="text-align:left">19</td><td>Chhattisgarh</td><td>19</td><td>19</td></tr>
## <tr><td style="text-align:left">20</td><td>Rajasthan</td><td>20</td><td>21</td></tr>
## <tr><td style="text-align:left">21</td><td>West Bengal</td><td>21</td><td>20</td></tr>
## <tr><td style="text-align:left">22</td><td>Madhya Pradesh</td><td>22</td><td>23</td></tr>
## <tr><td style="text-align:left">23</td><td>Orissa</td><td>23</td><td>22</td></tr>
## <tr><td style="text-align:left">24</td><td>Kerala</td><td>24</td><td>24</td></tr>
## <tr><td style="text-align:left">25</td><td>Haryana</td><td>25</td><td>25</td></tr>
## <tr><td style="text-align:left">26</td><td>Uttarakhand</td><td>26</td><td>27</td></tr>
## <tr><td style="text-align:left">27</td><td>Punjab</td><td>27</td><td>26</td></tr>
## <tr><td style="text-align:left">28</td><td>Goa</td><td>28</td><td>28</td></tr>
## <tr><td style="text-align:left">29</td><td>Himachal Pradesh</td><td>29</td><td>29</td></tr>
## <tr><td style="text-align:left">30</td><td>Delhi</td><td>30</td><td>30</td></tr>
## <tr><td style="text-align:left">31</td><td>Nagaland</td><td>31</td><td>31</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr></table>