Context

Understanding Society is an annual longitudinal study of households in the UK, operational since 2009. The data collection for one wave takes place over a 24 month period, but each household is surveyed every year, thus creating overlapping “waves”. The study is run by the Institute for Social and Economic Research at the University of Essex.

In this script, we focus on importing and cleaning the youth survey, a pen-and-paper questionnaire completed by 10-15 year olds in participating households. For each wave of data collection (marked “a”, “b”, … “o”) we import the youth survey from its Stata format (.dta). We limit the variables imported to those of interest (e.g. ID numbers, demographics, mental health, life satisfaction), whose variable names are all prefixed with the wave letter, e.g. “a_sex”. The Strengths and Difficulties Questionnaire (SDQ) was administered every second wave (“a”, “c”, “e”, etc.).

We also import some information about participants, their families, and their households (e.g. income) from other datasets supplied by Understanding Society.

Set up

Packages required:

library(haven)
library(readstata13)
library(sjlabelled)
library(dplyr)
library(readxl)
library(stringr)

Import select data:

# Variables of interest for this study.
# In the raw files every name is preceded by the wave letter, e.g. a_sex, b_sex.
suffix.list <- c("_hidp", "_psu", "_strata", "_dvage", "_ypdoby", "_doby_dv",
                 "_sex", "_ypsex", "_sex_dv", "_ethn_dv", "_intdatm_dv",
                 "_intdaty_dv", "_country", "_gor_dv", "_urban_dv", "_hhorig",
                 "_ypsocweb", "_yphsw", "_yphap", "_yphfm", "_yphfr", "_yphsc",
                 "_yphlf")

# All waves to import, and the subset in which the SDQ was administered
# (every second wave, starting with "a").
all.waves <- c("a", "b", "c", "d", "e", "f", "g", "h",
               "i", "j", "k", "l", "m", "n", "o")
sdq.waves <- c("a", "c", "e", "g", "i", "k", "m", "o")

# SDQ column names once the wave prefix has been removed:
# 25 items (ypsdqa ... ypsdqy) followed by 6 derived sub-scores.
sdq.vars <- c(paste0("ypsdq", letters[1:25]),
              paste0("ypsdq", c("es", "cp", "ha", "pp", "ps", "td"), "_dv"))

dbls <- 0        # number of waves in which a participant appears more than once
usoc <- c()      # dataframe to store all waves
wgts_list <- c() # not used below; kept in case other scripts reference it

# NOTE: `path` (folder holding the .dta files) is expected to be defined
# before this chunk runs.
# TODO (from original author): wave "f" may need importing from a different
# path once version 14/15 has been downloaded from the UK Data Service.
for (wave in all.waves) {

  # ---- youth data for *this wave* ------------------------------------------
  tmp <- readstata13::read.dta13(paste0(path, wave, "_youth.dta"),
                                 generate.factors = TRUE)

  has.sdq <- wave %in% sdq.waves

  # Weights: original note says ythscus_xw for wave 1, ythscub_xw for waves
  # 2 to 5, ythscui_xw for waves 6 to 13. All end in "_xw".
  varlist <- c("pidp",                                             # person ID
               paste0(wave, suffix.list),                          # wave_suffix variables
               if (has.sdq) names(tmp)[str_detect(names(tmp), "sdq")], # SDQ items + sub-scores
               names(tmp)[str_ends(names(tmp), "_xw")])            # cross-sectional weight

  # Fail with a readable message instead of "undefined columns selected"
  absent <- setdiff(varlist, names(tmp))
  if (length(absent) > 0) {
    stop("Wave ", wave, " youth file is missing: ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  # limit data to these variables only, comparable across waves
  tmp <- tmp[, varlist]

  # ---- household data ('hhresp') for this wave -----------------------------
  # fihhmnnet1_dv: total household net income, no deductions (sum of monthly
  #   total net personal income received by all household members).
  # ieqmoecd_dv:   used below as the equivalence scale for household income.
  # month:         month the household is DUE to be interviewed (not the
  #   actual interview date).
  hsd <- readstata13::read.dta13(
    paste0(path, wave, "_hhresp.dta"),
    generate.factors = TRUE,
    select.cols = paste0(wave, c("_hidp", "_fihhmnnet1_dv", "_ieqmoecd_dv", "_month"))
  )

  # merge into main dataframe, matching by household ID (hidp).
  # This is an inner join: youths whose household has no hhresp row are dropped.
  tmp <- merge(tmp, hsd, by = paste0(wave, "_hidp"))

  # ---- clean up -------------------------------------------------------------
  # add variable for wave "a", "b", etc
  tmp$wave <- wave

  # Remove the wave prefix ("a_", "b_", ...) from the START of each name only.
  # sub() leaves every other name untouched, so the set of names changed and
  # the replacement values can never get out of step.
  names(tmp) <- sub(paste0("^", wave, "_"), "", names(tmp))

  # Rename the weight to "x_wgt". Exactly one "_xw" column is expected;
  # more than one would silently create duplicated column names.
  is.weight <- str_ends(names(tmp), "_xw")
  if (sum(is.weight) != 1) {
    stop("Wave ", wave, ": expected exactly one '_xw' weight, found ",
         sum(is.weight), call. = FALSE)
  }
  names(tmp)[is.weight] <- "x_wgt"

  tmp <- relocate(tmp, c(x_wgt, wave), .after = "hidp")

  # add blank SDQ vars to waves that didn't have it, so every wave has the
  # same columns and can be stacked
  if (!has.sdq) {
    tmp[sdq.vars] <- NA
  }

  # flag any wave in which a pidp appears more than once (missing pidps are
  # ignored here; they are removed during cleaning)
  if (anyDuplicated(tmp$pidp[!is.na(tmp$pidp)]) > 0) {
    print(paste0("A participant has been counted more than once in wave ", wave))
    dbls <- dbls + 1
  }

  usoc <- rbind(usoc, tmp)
}

Dataset summary

Number of waves: 15

Number of unique individuals: 16287

Number of observations: 45867

Data cleaning

# remove rows with no personal ID
usoc <- usoc[!is.na(usoc$pidp), ]


# The following variables are numeric but were imported as factors.
# Replace the various kinds of non-response with NA, then make them numeric.
varnames <- c("ypsdqes_dv", "ypsdqcp_dv", "ypsdqha_dv", "ypsdqpp_dv",
              "ypsdqps_dv", "ypsdqtd_dv", "ypdoby")

# Stop early if an expected column is absent
absent <- setdiff(varnames, names(usoc))
if (length(absent) > 0) {
  stop("Columns not found in usoc: ", paste(absent, collapse = ", "), call. = FALSE)
}

for (var in varnames) {
  # Select the column by its exact name. A regex search over the column names
  # could also pick up any other column that merely contains `var`.
  col <- usoc[[var]]

  # replace different missing types with NA (%in% is never NA, so values that
  # are already missing need no special handling)
  col[col %in% c("don't know", "refusal", "inapplicable", "missing")] <- NA

  # make numeric: convert the labels, not the underlying factor codes
  usoc[[var]] <- as.numeric(as.character(col))
}


# for the following categorical variables, replace missing types
# (e.g. refused, don't know) with NA and drop the now-unused labels
varnames <- c("country", "dvage", "doby_dv", "sex_dv", "urban_dv", "hhorig",
              "ypsocweb", "month", "intdaty_dv", "ethn_dv", "gor_dv",
              "yphsw", "yphap", "yphfm", "yphfr", "yphsc", "yphlf")

# Stop early if an expected column is absent
absent <- setdiff(varnames, names(usoc))
if (length(absent) > 0) {
  stop("Columns not found in usoc: ", paste(absent, collapse = ", "), call. = FALSE)
}

for (var in varnames) {
  # Exact column lookup (a regex search over names could return several columns)
  col <- usoc[[var]]

  # Replace different missing types with NA. Using %in% on the column itself
  # is safe when the column already holds NAs, whereas assigning into data
  # frame rows with a logical index that contains NA is an error.
  col[col %in% c("don't know", "refusal", "inapplicable", "missing")] <- NA

  # whether or not anyone had one of those responses, remove the labels for
  # 'don't know', 'refusal', etc
  col <- droplevels(col)

  # additionally, for the happiness vars, make "completely happy" = 1 and
  # "not at all happy" = 7, i.e. all numbers.
  # (Original note: already coded this way for waves b-f, then it changes.)
  if (startsWith(var, "yph")) {
    col <- factor(col,
                  levels = c("completely happy", "2", "3", "4", "5", "6", "not at all happy"),
                  labels = c("1",                "2", "3", "4", "5", "6", "7"))
    col <- as.numeric(col)
  }

  usoc[[var]] <- col
}

# date of birth (year) of 9999 should be NA. DOB should be numeric
usoc$ypdoby[usoc$ypdoby %in% 9999] <- NA

# Use year of birth as derived by the study team.
# doby_dv is a factor whose labels are the years themselves, so convert the
# LABELS. Adding a fixed offset to the internal level codes is only correct
# when the levels are contiguous, in ascending order and start at 1993.
doby.label <- as.character(usoc$doby_dv)
doby.year <- suppressWarnings(as.numeric(doby.label))
if (any(is.na(doby.year) & !is.na(doby.label))) {
  stop("doby_dv contains non-numeric labels: ",
       paste(unique(doby.label[is.na(doby.year) & !is.na(doby.label)]), collapse = ", "),
       call. = FALSE)
}
usoc$doby_dv.n <- doby.year

# for household monthly income, change var name and check nonsensical values
usoc$hsdincome <- usoc$fihhmnnet1_dv
# Notes:
# 24 participants have negative incomes (in debt? no: self-employed)
# 45 participants have zero income

# create equivalised household income by dividing net household monthly income
# by number of adult equivalents
usoc$equivinc <- usoc$fihhmnnet1_dv / usoc$ieqmoecd_dv

# Collapse ethnicity into broader groups to avoid small cell sizes.
# Note: the ethnicity question was asked as "Do you/Does [participant] come
# from, or have parents or grandparents from any of the following ethnic
# groups?", but the derived variable "ethn_dv" also takes into account the
# question "what is your ethnic group?".
# Also note: this is derived from parent responses, not the child's.
# https://www.understandingsociety.ac.uk/documentation/mainstage/variables/ethn_dv/

# Lookup table: original ethn_dv label -> collapsed group
ethnic.groups <- c(
  "british/english/scottish/welsh/northern irish" = "British or Irish",
  "irish"                                         = "British or Irish",
  "any other white background"                    = "Any other white background",
  "white and black caribbean"                     = "Mixed background",
  "white and black african"                       = "Mixed background",
  "white and asian"                               = "Mixed background",
  "any other mixed background"                    = "Mixed background",
  "indian"                                        = "South or East Asian",
  "pakistani"                                     = "South or East Asian",
  "bangladeshi"                                   = "South or East Asian",
  "chinese"                                       = "South or East Asian",
  "caribbean"                                     = "African or Caribbean",
  "african"                                       = "African or Caribbean",
  "any other black background"                    = "Other",
  "any other asian background"                    = "Other",
  "arab"                                          = "Other",
  "any other ethnic group"                        = "Other"
)

# Anything not in the lookup (including NA) becomes NA. Level order follows the
# first appearance of each group in the lookup.
collapsed <- unname(ethnic.groups[as.character(usoc$ethn_dv)])
usoc$ethnicity <- factor(collapsed, levels = unique(unname(ethnic.groups)))

# Life-satisfaction ("happiness") items: new short name -> source column
happy.items <- c(schoolwork = "yphsw", appear = "yphap", fam = "yphfm",
                 friends = "yphfr", school = "yphsc", life = "yphlf")

# Reverse the 1-7 scale so that a lower value indicates unhappy, and rename
for (item in names(happy.items)) {
  usoc[[paste0("h.", item)]] <- 8 - usoc[[happy.items[[item]]]]
}

# Zero-centred versions, where the :-| face (scale midpoint, 4) is 0
for (item in names(happy.items)) {
  usoc[[paste0("h.", item, ".c")]] <- usoc[[paste0("h.", item)]] - 4
}

# Binary versions: 1 = below the midpoint (unhappy), 0 = otherwise.
# Output suffix -> item name (note the schoolwork flag is named "schoolwk")
unhappy.items <- c(life = "life", school = "school", schoolwk = "schoolwork",
                   fam = "fam", friends = "friends", appear = "appear")
for (flag in names(unhappy.items)) {
  centred <- usoc[[paste0("h.", unhappy.items[[flag]], ".c")]]
  usoc[[paste0("unhappy.", flag)]] <- ifelse(centred < 0, 1, 0)
}


# make a binary version of sex, where "inconsistent" -> NA and female is the
# reference category
sex.label <- as.character(usoc$sex_dv)
sex.label[!(sex.label %in% c("Male", "Female"))] <- NA
usoc$sex.bin <- relevel(as.factor(sex.label), "Female")

# Make age numeric.
# dvage is a factor whose labels are the ages, so convert the LABELS. Adding a
# fixed offset to the internal level codes is only correct when the levels are
# contiguous, in ascending order and start at 10.
age.label <- as.character(usoc$dvage)
age.num <- suppressWarnings(as.numeric(age.label))
if (any(is.na(age.num) & !is.na(age.label))) {
  stop("dvage contains non-numeric labels: ",
       paste(unique(age.label[is.na(age.num) & !is.na(age.label)]), collapse = ", "),
       call. = FALSE)
}
usoc$dvage.n <- age.num


# Update response levels for social media account ownership (separate study).
# The raw labels vary in case and padding. Any other value (e.g. "9") is left
# out of `levels` so that it becomes a true missing value. Giving it a label of
# NA would instead create a factor LEVEL called NA, which is.na() does not
# report as missing.
usoc$socialmed <- factor(usoc$ypsocweb,
                         levels = c("yes    ", "no     ", "yes", "no", "Yes", "No"),
                         labels = c("Yes",     "No",      "Yes", "No", "Yes", "No"))

# create version of "wave" variable that shows approximate years of data collection
usoc$wave.y <- factor(usoc$wave,
                      levels = c("a", "b", "c", "d", "e", "f", "g", "h",
                                 "i", "j", "k", "l", "m", "n", "o"),
                      labels = c("2009-10", "2010-11", "2011-12", "2012-13",
                                 "2013-14", "2014-15", "2015-16", "2016-17",
                                 "2017-18", "2018-19", "2019-20", "2020-21",
                                 "2021-22", "2022-23", "2023-24"))

# and create a version of "wave" which is simply a number from 1-15
usoc$wave.n <- as.numeric(usoc$wave.y)

# dichotomise age (any age outside 10-15 becomes NA)
usoc$age.group <- factor(usoc$dvage, levels = c(10, 11, 12, 13, 14, 15),
                         labels = c("10-12", "10-12", "10-12", "13-15", "13-15", "13-15"))

# Create generation groups from year of birth.
# Intervals are closed on the right: born <= 1997 -> "Millenials",
# 1998-2013 -> "Gen Z", >= 2014 -> "Gen A".
usoc$gen <- cut(usoc$doby_dv.n, breaks = c(-Inf, 1997, 2013, Inf),
                labels = c("Millenials", "Gen Z", "Gen A"), right = TRUE)


# Time
# Calendar year is based on the *scheduled* month of interview (the same every
# wave for each household) rather than the actual interview date. A household
# could be interviewed late one year and on time the next, which would put two
# data points from the same individual in the same calendar year.

# Each wave spans two calendar years: year1 ("yr1") and year2 ("yr2")
year.grid <- data.frame(wave = letters[1:16],
                        year1 = 2009:2024,
                        year2 = 2010:2025)

# Row of year.grid that corresponds to each observation's wave
grid.row <- match(usoc$wave, year.grid$wave)

# Observations whose sample month falls in the first / second year of the wave
in.yr1 <- which(str_detect(usoc$month, "yr1"))
in.yr2 <- which(str_detect(usoc$month, "yr2"))

# Scheduled interview year (stays NA where the sample month is unknown)
sched.year <- rep(NA, nrow(usoc))
sched.year[in.yr1] <- year.grid$year1[grid.row[in.yr1]]
sched.year[in.yr2] <- year.grid$year2[grid.row[in.yr2]]
usoc$int.year.sched <- sched.year

# Scheduled interview month: first three characters of the sample-month label
usoc$int.mth.sched <- substr(usoc$month, 1, 3)

# Discrepancy, in years, between actual survey year (intdaty_dv) and scheduled
# year (int.year.sched). Note this includes people whose scheduled survey in
# Dec was delayed to Jan.
actual.year <- as.numeric(as.character(usoc$intdaty_dv))
planned.year <- as.numeric(as.character(usoc$int.year.sched))
usoc$interview.delay.y <- actual.year - planned.year
# all seem to be within 4 months of scheduled month, as per the study documentation


# move these time variables to front of dataset
usoc <- relocate(usoc,
                 c(month, int.mth.sched, int.year.sched, intdatm_dv, intdaty_dv, interview.delay.y),
                 .after = "wave")

Next, import the xhhrel dataset, which provides meta-information about the households across all waves. In particular, we wanted a household ID that does not change over time: the household ID of the original survey member (osm_hh). As it happens, we use the hidp household ID in the analysis, which is updated at each wave, but this is sufficient because our analysis is cross-sectional. We also pull out information about siblings from this meta-data.

# Cross-wave household relationship file
tmp <- readstata13::read.dta13(paste0(path, "xhhrel.dta"), generate.factors = TRUE)

# keep only the people who also appear in the youth data
hsds <- tmp[tmp$pidp %in% unique(usoc$pidp), ]

# sibling variables of each pidp, identified by name prefix:
#   bsbx = biological, hsbx = half, asbx = adoptive, ssbx = step, fsbx = foster
vars <- names(hsds)[grepl("^(bsbx|hsbx|asbx|ssbx|fsbx)", names(hsds))]

# columns kept: original household, person ID, year of birth, household size,
# number of relatives, number of biological parents, number of biological
# children, plus all sibling variables
keep.cols <- c("osm_hh", "pidp", "doby_dv", "osm_hh_size", "rels_N", "bpx_N", "bcx_N", vars)
hsds <- hsds[, keep.cols]

# Number of people linked to each original household. Original note: unique(?)
# number of households = 51,508; as many as 50 people may be linked with one
# household, because it is the *original* household that people can be linked
# with by blood or relationship, or across multiple generations.
x <- as.data.frame(table(hsds$osm_hh))

# add the original-household ID and parent/sibling counts to the main dataset
# (inner join on pidp)
sib.cols <- c("pidp", "osm_hh", "bpx_N", "bsbx_N", "hsbx_N", "asbx_N", "ssbx_N", "fsbx_N")
usoc <- merge(usoc, hsds[, sib.cols], by = "pidp")


# move around variables
usoc <- usoc %>%
  relocate(c(pidp), 1) %>%
  relocate(c(hidp, osm_hh, hsdincome), .after = "pidp") %>%
  relocate(c(bsbx_N, hsbx_N, asbx_N, ssbx_N, fsbx_N), .after = "sex_dv")

Next create a dataframe pidp.list that summarises time-invariant information about each INDIVIDUAL in the study, including the wave they first took part, the number of observations they contributed to date & their demographic info. And a dataframe for time-invariant household information osm.list (original survey member).

# Create a dataset with one row per unique participant (n~16,000) holding
# time-invariant information, taken from each person's FIRST observation.

# Index of each participant's earliest-wave row. Sorting by wave first means
# "first" does not depend on the current row order of usoc.
wave.order <- order(usoc$wave)   # wave letters sort chronologically: a, b, c, ...
first.idx <- wave.order[!duplicated(usoc$pidp[wave.order])]
# keep participants in the order in which they appear in usoc
first.idx <- first.idx[match(unique(usoc$pidp), usoc$pidp[first.idx])]
first.obs <- usoc[first.idx, ]

pidp.list <- data.frame(pidp = first.obs$pidp)
pidp.list$firstwave <- first.obs$wave
# number of observations contributed by each participant
pidp.list$num.rows <- tabulate(match(usoc$pidp, pidp.list$pidp), nbins = nrow(pidp.list))
# original household ID, kept in the same type as usoc$osm_hh so that the two
# can be compared directly
pidp.list$hsd_osm <- first.obs$osm_hh
# (A count of siblings is not derived here: counting respondents who share an
#  original household at any point in the 15 years is not strictly the number
#  of siblings.)
pidp.list$dob <- first.obs$doby_dv.n
pidp.list$age_atfirst <- first.obs$dvage.n

# Sex and country are rebuilt from the factor level CODES of usoc$sex_dv and
# usoc$country, as in the original script.
# NOTE(review): this assumes the levels are ordered inconsistent/male/female
# and England/Wales/Scotland/Northern Ireland - confirm with levels().
pidp.list$sex <- factor(as.integer(first.obs$sex_dv), levels = c(1, 2, 3),
                        labels = c("inconsistent", "male", "female"))
pidp.list$country <- factor(as.integer(first.obs$country), levels = c(1, 2, 3, 4),
                            labels = c("England", "Wales", "Scotland", "Northern Ireland"))

# Ethnicity is carried over by LABEL. Relabelling the integer codes 1:6 by hand
# is wrong unless the labels are listed in exactly the level order of
# usoc$ethnicity; otherwise some groups are swapped.
pidp.list$ethnic <- factor(as.character(first.obs$ethnicity),
                           levels = c("British or Irish", "Any other white background",
                                      "South or East Asian", "African or Caribbean",
                                      "Mixed background", "Other"))

pidp.list <- pidp.list[, c("pidp", "firstwave", "num.rows", "hsd_osm", "dob",
                           "sex", "country", "age_atfirst", "ethnic")]

# Time-invariant information about each original household (osm_hh):
# number of participants and observations, and household income at the first
# row recorded for the household and averaged over all of its rows.
first.hh <- !duplicated(usoc$osm_hh)
osm.list <- data.frame(osm = usoc$osm_hh[first.hh])

# Household index of every observation. Grouping through match() keeps rows
# with a missing osm_hh in their own group instead of letting them leak into
# every household's subset, as `==` subsetting does when NAs are present.
hh.index <- factor(match(usoc$osm_hh, osm.list$osm), levels = seq_len(nrow(osm.list)))

# helper: apply `fun` to `values` within each household, in osm.list row order
per.hh <- function(values, fun) as.vector(tapply(values, hh.index, fun))

osm.list$num.partic <- per.hh(usoc$pidp, function(ids) length(unique(ids)))
osm.list$num.obs <- per.hh(usoc$pidp, length)
osm.list$hsd.inc.first <- usoc$fihhmnnet1_dv[first.hh]
osm.list$hsd.inc.ave <- per.hh(usoc$fihhmnnet1_dv, function(v) mean(v, na.rm = TRUE))
osm.list$equivinc.first <- usoc$equivinc[first.hh]
osm.list$equivinc.ave <- per.hh(usoc$equivinc, function(v) mean(v, na.rm = TRUE))

Next, create a correlation matrix that shows the extent of participant overlap across waves. We will use this later to create a variance-covariance matrix and add it to the meta-regression when assessing changes over time.

# calculate matrix R: a symmetrical matrix of correlation values between years,
# where each value is the overlap in participants between two scheduled years
years <- sort(unique(usoc$int.year.sched))

# pidps observed in each scheduled year. which() drops observations whose
# scheduled year is NA. A plain logical subset would add an NA entry to every
# year for each such row, inflating both the overlap and the sample sizes.
ids.by.year <- lapply(years, function(yr) usoc$pidp[which(usoc$int.year.sched == yr)])

# every ordered pair of years (x, y), with y varying fastest
pairs <- expand.grid(y = seq_along(years), x = seq_along(years))

overlap <- mapply(function(i, j) {
  # n = the number of participants taking part in both years
  n <- length(intersect(ids.by.year[[i]], ids.by.year[[j]]))
  # denominator = geometric mean of the two years' sample sizes; combining the
  # Ns in this way makes the resulting matrix symmetrical
  denom <- sqrt(as.numeric(length(ids.by.year[[i]])) * length(ids.by.year[[j]]))
  n / denom
}, pairs$x, pairs$y)

df <- data.frame(x = as.numeric(years[pairs$x]),
                 y = as.numeric(years[pairs$y]),
                 lag = as.numeric(years[pairs$y] - years[pairs$x]), # y minus x; negative when y < x
                 prop = overlap)

# correlation matrix R (rows = y, columns = x), rounded to 2 decimal places
library(reshape2)
R <- acast(df, y ~ x, value.var = "prop", mean) %>% round(2)

# confirm symmetrical matrix:
# (all.equal() returns TRUE when R and its transpose agree)
all.equal(R, t(R))

## [1] TRUE

# confirm all 1s on diagonal:
# (R has already been rounded to 2 decimal places, so this compares rounded values)
all(diag(R) == 1)

## [1] TRUE

# confirm eigenvalues are positive or zero:
# (i.e. the minimum in the summary below should not be negative)
summary(eigen(R, symmetric = TRUE)$values)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2500  0.3530  0.5081  1.0000  0.9966  4.0226

# display the correlation matrix
# (rows and columns are the scheduled interview years)
print(R)

##      2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 2009 1.00 0.38 0.28 0.20 0.13 0.06 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 2010 0.38 1.00 0.65 0.49 0.35 0.22 0.09 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 2011 0.28 0.65 1.00 0.66 0.50 0.35 0.21 0.09 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 2012 0.20 0.49 0.66 1.00 0.67 0.49 0.33 0.20 0.08 0.00 0.00 0.00 0.00 0.00 0.00
## 2013 0.13 0.35 0.50 0.67 1.00 0.66 0.47 0.32 0.20 0.10 0.01 0.00 0.00 0.00 0.00
## 2014 0.06 0.22 0.35 0.49 0.66 1.00 0.62 0.45 0.31 0.19 0.09 0.00 0.00 0.00 0.00
## 2015 0.00 0.09 0.21 0.33 0.47 0.62 1.00 0.62 0.45 0.31 0.20 0.09 0.01 0.00 0.00
## 2016 0.00 0.00 0.09 0.20 0.32 0.45 0.62 1.00 0.62 0.44 0.33 0.19 0.10 0.01 0.00
## 2017 0.00 0.00 0.00 0.08 0.20 0.31 0.45 0.62 1.00 0.61 0.48 0.31 0.21 0.10 0.01
## 2018 0.00 0.00 0.00 0.00 0.10 0.19 0.31 0.44 0.61 1.00 0.61 0.43 0.32 0.18 0.07
## 2019 0.00 0.00 0.00 0.00 0.01 0.09 0.20 0.33 0.48 0.61 1.00 0.56 0.45 0.27 0.15
## 2020 0.00 0.00 0.00 0.00 0.00 0.00 0.09 0.19 0.31 0.43 0.56 1.00 0.59 0.38 0.23
## 2021 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.10 0.21 0.32 0.45 0.59 1.00 0.54 0.34
## 2022 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.10 0.18 0.27 0.38 0.54 1.00 0.39
## 2023 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.07 0.15 0.23 0.34 0.39 1.00
## 2024 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 0.13 0.21 0.30 0.37
##      2024
## 2009 0.00
## 2010 0.00
## 2011 0.00
## 2012 0.00
## 2013 0.00
## 2014 0.00
## 2015 0.00
## 2016 0.00
## 2017 0.00
## 2018 0.00
## 2019 0.06
## 2020 0.13
## 2021 0.21
## 2022 0.30
## 2023 0.37
## 2024 1.00

Note that the correlation matrix shows essentially no overlap (at most 0.01) between the individuals participating in year X and year X+6, and none at all beyond that. This is because household members can only complete the youth survey between ages 10 and 15 (a maximum of 6 years of participation).

Export cleaned dataset

Save all dataframes to .RData file, for next analysis script:
- usoc: the main dataset with survey responses across all waves
- pidp.list: time-invariant information about every participant
- osm.list: time-invariant information about every household
- R: correlation between waves, in terms of overlap of same individuals

# Write the four cleaned objects to a single .RData file in `path`
save(list = c("usoc", "pidp.list", "osm.list", "R"),
     file = paste0(path, "usoc_cleaned.RData"))

Cleaning Understanding Society Youth data (2009-2024)

Niamh Dooley

2025-10-16

Context

Set up

Dataset summary

Data cleaning

Export cleaned dataset