setwd("~/Dropbox/EQUIDE/Investigación/COVID/R/COVID19")

Why R?

I used to be a Stata user, but several features made me switch to R.

As with any other coding environment, it takes time to master the syntax. However, a constant stream of add-on packages keeps making it easier to use.

Brief example of the R syntax for data management

I will explore the ELCSA scale from ENSANUT 2018, a large survey with a complex sampling design, and walk through the whole process so you can see R in action. This is the same procedure we used for a recent article:

Research article

Load dataset and a little data management

Let's start by loading the packages we need for this session.

# Load multiple packages with pacman

library(pacman)
p_load(haven, tidyverse, knitr, skimr, janitor, devtools, survey, visdat, naniar, corrr, corrplot, GGally, lemon, psych, RM.weights, wesanderson, kableExtra, eRm)


# Load the dataset (read_dta comes from haven, already attached above)

segali_a <- read_dta("~/Dropbox/EQUIDE/Investigación/COVID/InseguridadAlimentaria/ensanut2018/segali_a.dta")

# work on a copy with a shorter name
ensanut <- segali_a

#select key variables
ensanut <- ensanut %>% 
  select("upm","est_dis","upm_dis","estrato","f_segal","edad","sexo","ent",
         "ia_preocupacion","ia_sinalimento","ia_saludable","ia_variedad","ia_desayunar","ia_menos","ia_hambre", "ia_dejocomer")

# rename ELCSA items

ensanut <- ensanut %>% 
  rename(
    worried="ia_preocupacion",
    ranout="ia_sinalimento",
    healthy="ia_saludable",
    fewfoods="ia_variedad",
    skipped="ia_desayunar",
    ateless="ia_menos",
    hungry="ia_hambre",
    whlday="ia_dejocomer")

#examine descriptives
skim(ensanut) # a nice wrapper to summarize key characteristics
Data summary
Name                    ensanut
Number of rows          44509
Number of columns       16
Column type frequency:
  numeric               16
Group variables         None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
upm 0 1 3238.10 1797.17 1 1688 3257 4830 6289 ▆▇▇▇▇
est_dis 0 1 161.44 95.90 1 77 158 250 328 ▇▇▇▇▇
upm_dis 0 1 3238.10 1797.17 1 1688 3257 4830 6289 ▆▇▇▇▇
estrato 0 1 2.15 0.83 1 2 2 3 4 ▃▇▁▃▁
f_segal 0 1 748.73 683.29 72 314 548 917 6048 ▇▁▁▁▁
edad 0 1 46.22 15.95 18 34 45 58 111 ▆▇▅▁▁
sexo 0 1 1.71 0.45 1 1 2 2 2 ▃▁▁▁▇
ent 0 1 16.67 9.22 1 9 17 25 32 ▇▆▇▇▇
worried 21 1 0.51 0.50 0 0 1 1 1 ▇▁▁▁▇
ranout 9 1 0.16 0.37 0 0 0 0 1 ▇▁▁▁▂
healthy 85 1 0.35 0.48 0 0 0 1 1 ▇▁▁▁▅
fewfoods 39 1 0.38 0.49 0 0 0 1 1 ▇▁▁▁▅
skipped 35 1 0.15 0.36 0 0 0 0 1 ▇▁▁▁▂
ateless 25 1 0.24 0.42 0 0 0 0 1 ▇▁▁▁▂
hungry 62 1 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁
whlday 27 1 0.11 0.31 0 0 0 0 1 ▇▁▁▁▁

There are a few missing values. Let's explore the patterns.

# naniar is a package dedicated to missing values

ensanut %>% 
  select(worried:whlday) %>% 
  miss_case_table()
## # A tibble: 7 x 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0   44243  99.4    
## 2              1     245   0.550  
## 3              2      15   0.0337 
## 4              3       1   0.00225
## 5              4       3   0.00674
## 6              5       1   0.00225
## 7              8       1   0.00225

Nothing worrisome, but let's look at them item by item.

ensanut %>% 
  select(worried:whlday) %>% 
  gg_miss_var()

Inter-item correlations

I will now bring in another package: psych. It lets me compute the type of correlation appropriate for dichotomous responses: a tetrachoric correlation matrix.

graphics.off()

# tetrachoric correlation matrix for the eight dichotomous items
cor_ensanut <- ensanut %>% 
  select(worried:whlday) %>%
  psych::tetrachoric()

psych::cor.plot(cor_ensanut$rho, numbers = TRUE, upper = FALSE,
                main = "Food Insecurity ENSANUT 2018")

graphics.off()

As expected: high inter-item correlations.

A nice thing about R is that you can graph most statistics, and a graph is usually much easier to read than a table.

Mean comparisons

We assume that there is a probabilistic pattern in the item means.

Following Nord (2014): "More severe items are less frequently affirmed than less severe items.

Moreover, a household that affirms an item of mid-range severity is likely to also affirm all items that are less severe.

Similarly, a household that denies an item at mid-range is likely to deny all items that are more severe.

These typical response patterns are probabilistic, not universal, but they are generally predominant in good quality data."
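
Before turning to the means, here is a rough household-level illustration of that pattern (a sketch I am adding, not part of the original analysis). It assumes, as the severity estimates below confirm, that whlday is the most severe item and worried the least severe.

# Cross-tabulate the most severe item (whlday) against the least severe one
# (worried): households that affirm whlday should almost always affirm worried.
ensanut %>% 
  filter(!is.na(whlday), !is.na(worried)) %>% 
  count(whlday, worried) %>% 
  group_by(whlday) %>% 
  mutate(pct = 100 * n / sum(n)) %>% 
  ungroup()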

A quick way to see if this is actually happening in the sample is to compare means. With graphs!

mean_ensanut2 <- ensanut %>%
  summarise(across(c(worried, ranout, healthy, fewfoods, skipped, ateless, hungry, whlday),
                   ~ weighted.mean(.x, f_segal, na.rm = TRUE) * 100))

mean_ensanut3 <- mean_ensanut2 %>%
  pivot_longer(
    worried:whlday,
    names_to = "items", 
    values_to = "means")

mean_ensanut3 <- mean_ensanut3 %>% 
  arrange(desc(means))

kable(mean_ensanut3, digits=2) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
items      means
worried    49.28
fewfoods   36.97
healthy    34.60
ateless    23.45
ranout     15.02
skipped    14.55
hungry     14.13
whlday     10.68

mean_ensanut3 <- mean_ensanut3 %>%
  mutate(items= ordered(items, levels = c("worried", "fewfoods", "healthy", "ateless", "ranout","skipped", "hungry", "whlday")))

mean_ensanut3 %>% 
  ggplot(aes(x=items, y= means))+
   geom_segment(aes(x=items, xend=items, y=0, yend=means), color="grey") +
  geom_point( color="orange", size=4)+
  labs(
    x= "Items",
    y= "Weighted Means",
    title= "Mean comparison",
    subtitle= "",
    caption= "Source: ENSANUT 2018") +
  theme_classic()

Rasch models

There are several packages to run Rasch models. It is necessary to examine which algorithm suits our needs.

Multiple options at rasch.org

Fortunately, Sara Viviani, with the assistance of Carlo Cafiero and Mark Nord, created a package specifically designed to analyze FAO's FIES scale: the RM.weights package.

Let's see how it works. The first step is to create two objects the package can identify: one indexing the items and the other specifying the sampling weights.

ensanut_rasch <- ensanut %>% 
  select(worried, fewfoods, healthy, ateless,  ranout, skipped, hungry, whlday, f_segal)

ensanut_items <- ensanut_rasch[, 1:8]      # items
ensanut_weights <- ensanut_rasch$f_segal   # weights

And now the key line, with the command “RM.w”. It fits a one-parameter logistic Rasch model using a conditional maximum likelihood approach (CML); other packages may use different approaches, such as JML or MML.

# Weighted Rasch model for ENSANUT
ensanut_rasch_mod2 <- RM.w(ensanut_items, ensanut_weights)

We now have an object with all the relevant information, both about the model and about each respondent.

The next step is to extract the pieces we need, in the form we need them.

cbind("Item sev."=ensanut_rasch_mod2$b, "St.err."=ensanut_rasch_mod2$se.b,
      "Infit"=ensanut_rasch_mod2$infit, "Outfit"=ensanut_rasch_mod2$outfit)
##            Item sev.    St.err.     Infit    Outfit
## worried  -3.31105210 0.02225378 1.1973361 2.9066404
## fewfoods -1.80131915 0.01926807 0.8461629 1.1523158
## healthy  -1.49092881 0.01915278 0.9328547 1.3251750
## ateless  -0.01809466 0.01967170 0.8256640 0.7394070
## ranout    1.35809474 0.02247298 1.2354185 1.3237823
## skipped   1.44003425 0.02272043 0.8450221 0.7685367
## hungry    1.52733932 0.02299593 0.8180176 0.6642158
## whlday    2.29592768 0.02613741 0.9726495 0.9585899

These are the key parameters we are looking for. Please note how the severity moves along the latent continuum.

A graph will probably make it easier to see the distance between items.

severity <- as_tibble(cbind("Item sev."=ensanut_rasch_mod2$b), rownames = "items")

severity <- severity %>% 
  rename(sev = "Item sev.")

severity <- severity %>%
  mutate(items= ordered(items, levels = c("worried", "fewfoods", "healthy", "ateless", "ranout","skipped", "hungry", "whlday"))) 


severity %>% 
  ggplot(aes(x=items, y= sev))+
  geom_point( color="orange", size=4)+
  ylim(-3.5, 3)+
  coord_flip()+
  labs(
    x= "Items",
    y= "Severities",
    title= "Severity comparison",
    subtitle= "",
    caption= "Source: ENSANUT 2018") +
  theme_classic()

Let's now turn to the infit values, a fit statistic that tells us whether the items discriminate equally.

We don't want them above 1.2 (“excellent”) or 1.3 (“acceptable”).
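
Before plotting them, a small programmatic check (a sketch I am adding; it assumes the infit vector follows the column order of ensanut_items, as the table above suggests):

# Flag items whose infit crosses the thresholds just mentioned.
tibble(item  = colnames(ensanut_items),
       infit = as.numeric(ensanut_rasch_mod2$infit)) %>% 
  mutate(rating = case_when(infit > 1.3 ~ "above acceptable",
                            infit > 1.2 ~ "above excellent",
                            TRUE        ~ "ok"))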

infits <- as_tibble(cbind("Infit"=ensanut_rasch_mod2$infit), rownames = "items")

infits <- infits %>%
  mutate(items= ordered(items, levels = c("worried", "fewfoods", "healthy", "ateless", "ranout","skipped", "hungry", "whlday"))) 

infits %>% 
  ggplot(aes(x=items, y= Infit))+
  geom_point( color="blue", size=4)+
  ylim(0,1.6)+
  geom_hline(yintercept = 0.7)+
  geom_hline(yintercept = 1.2, color= "orange")+
  geom_hline(yintercept = 1.3, color= "red")+ #threshold of interest
  labs(
    x= "Items",
    y= "INFIT values",
    title= "INFIT comparison",
    subtitle= "",
    caption= "Source: ENSANUT 2018") +
  theme_classic()

Some additional parameters: the person severities associated with each summative raw score (0 to 8).

#Respondent parameters 
cbind("Person par."=ensanut_rasch_mod2$a, "Error"=ensanut_rasch_mod2$se.a)
##       Person par.     Error
##  [1,]  -4.1055952 1.6013922
##  [2,]  -3.0762736 1.2519852
##  [3,]  -1.7785015 1.0578662
##  [4,]  -0.8750190 1.0009506
##  [5,]   0.1780338 0.9417313
##  [6,]   1.0255624 0.9068156
##  [7,]   1.8151694 0.9349086
##  [8,]   2.8892919 1.1440033
##  [9,]   3.7356798 1.6013922

We can also assess overall model fit with the Rasch “flat” reliability statistic: the proportion of total variation in true severity in the sample that is accounted for by the model (range 0 to 1). In this sample the overall model fit corresponds to 0.7750242.

The previous number is an example of inline code in R Markdown: if the data change, the estimate is updated automatically.
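
For reference, here is a minimal sketch of how that value can be pulled from the fitted object. I am assuming the flat reliability is stored as reliab.fl, following the RM.weights documentation; if the element has a different name in your version, str(ensanut_rasch_mod2) will show it.

# Extract the flat reliability from the fitted object (assumption: the element
# is called reliab.fl; verify with str(ensanut_rasch_mod2) if in doubt).
flat_rel <- ensanut_rasch_mod2$reliab.fl
round(flat_rel, 3)

# In the .Rmd source, the number in the sentence above comes from an inline
# chunk of roughly this form (shown here as a comment):
#   ... corresponds to `r ensanut_rasch_mod2$reliab.fl`.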

And now the residual correlations; we want to avoid high values (> 0.3).

# Residual correlations
ensanut_resid_corr2 <- ensanut_rasch_mod2$res.cor

ensanut_resid_corr2
##               worried    fewfoods     healthy     ateless       ranout
## worried   1.000000000 -0.12466902 -0.13924835 -0.07550443 -0.008410536
## fewfoods -0.124669019  1.00000000  0.34901387  0.06283178 -0.095222170
## healthy  -0.139248350  0.34901387  1.00000000 -0.02191721 -0.065394316
## ateless  -0.075504427  0.06283178 -0.02191721  1.00000000 -0.099261207
## ranout   -0.008410536 -0.09522217 -0.06539432 -0.09926121  1.000000000
## skipped  -0.084625113 -0.02496571 -0.07567743  0.20707793 -0.029388037
## hungry   -0.060075874 -0.04122727 -0.07258965  0.18482301 -0.080807961
## whlday   -0.092685491 -0.06383555 -0.09348817  0.09848346 -0.110024792
##              skipped      hungry      whlday
## worried  -0.08462511 -0.06007587 -0.09268549
## fewfoods -0.02496571 -0.04122727 -0.06383555
## healthy  -0.07567743 -0.07258965 -0.09348817
## ateless   0.20707793  0.18482301  0.09848346
## ranout   -0.02938804 -0.08080796 -0.11002479
## skipped   1.00000000  0.25061357  0.10996360
## hungry    0.25061357  1.00000000  0.26688409
## whlday    0.10996360  0.26688409  1.00000000
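
Rather than scanning the matrix by eye, a small helper (a sketch, not part of the original analysis) can pull out any pair above the 0.3 threshold:

# List item pairs whose residual correlation exceeds 0.3 (upper triangle only,
# so each pair appears once).
high_pairs <- which(abs(ensanut_resid_corr2) > 0.3 & upper.tri(ensanut_resid_corr2),
                    arr.ind = TRUE)
data.frame(item1 = rownames(ensanut_resid_corr2)[high_pairs[, 1]],
           item2 = colnames(ensanut_resid_corr2)[high_pairs[, 2]],
           r     = ensanut_resid_corr2[high_pairs])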

Alternative R package

Now I want to show you an alternative option: the eRm package (also CML). The estimates are close, but not identical; the eRm fit below is unweighted and drops incomplete cases.

ensanut_rasch2 <- ensanut %>% 
  select(worried, fewfoods, healthy, ateless,  ranout, skipped, hungry, whlday)

ensanut_rasch3 <- ensanut_rasch2 %>% drop_na()

fit_ensanut <- RM(ensanut_rasch3)

round(sort(-fit_ensanut$betapar), 3)
##  beta worried beta fewfoods  beta healthy  beta ateless   beta ranout 
##        -3.273        -1.790        -1.450         0.035         1.234 
##  beta skipped   beta hungry   beta whlday 
##         1.396         1.562         2.285
plotjointICC(fit_ensanut, xlab = "Severity parameters (ICC)",
             main = "ENSANUT 2018", legend = TRUE)

Food insecurity prevalence with sampling weights

The last step is to estimate the prevalence, taking the complex survey design into account.

ensanut <- ensanut %>% 
  mutate(ia_total = worried + fewfoods + healthy + ateless + skipped + ranout + hungry + whlday)

ensanut <- ensanut %>% 
  mutate(ia_level = case_when(
    ia_total == 0 ~ 0,
    ia_total == 1 ~ 1,
    ia_total == 2 ~ 1,
    ia_total == 3 ~ 1,
    ia_total == 4 ~ 2,
    ia_total == 5 ~ 2,
    ia_total == 6 ~ 2,
    ia_total == 7 ~ 3,
    ia_total == 8 ~ 3))

ensanut <- ensanut %>% 
  mutate(ia_level2= ordered(ia_level, levels = c("0", "1", "2", "3"), labels=c("Security", "Mild Insecurity", "Moderate Insecurity", "Severe Insecurity"))) 

prop.table(table(ensanut$ia_level2))*100  #not weighted
## 
##            Security     Mild Insecurity Moderate Insecurity   Severe Insecurity 
##           43.193274           32.506837           14.648645            9.651244

We need to set up the complex survey design.

### Complex design
complexhh_ensanut <- svydesign(id=~upm_dis, strata=~est_dis, weights=~f_segal, data=ensanut)

complexhh_ensanut
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (6268) clusters.
## svydesign(id = ~upm_dis, strata = ~est_dis, weights = ~f_segal, 
##     data = ensanut)

And finally the prevalence with confidence intervals

# Food insecurity with confidence intervals
svymean(~ia_level2, complexhh_ensanut, deff=T, na.rm = T)*100
##                                    mean         SE   DEff
## ia_level2Security            44.7500371  0.0036741 2.4188
## ia_level2Mild Insecurity     31.0884154  0.0032016 2.1196
## ia_level2Moderate Insecurity 14.9030344  0.0025330 2.2413
## ia_level2Severe Insecurity    9.2585130  0.0019535 2.0123
confint(svymean(~ia_level2, complexhh_ensanut, na.rm = T))*100
##                                  2.5 %    97.5 %
## ia_level2Security            44.029922 45.470153
## ia_level2Mild Insecurity     30.460911 31.715920
## ia_level2Moderate Insecurity 14.406577 15.399492
## ia_level2Severe Insecurity    8.875637  9.641389