import libraries

library(haven)
library(readstata13)
library(sjlabelled)
library(dplyr)
library(stringr)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(ggpubr)
library(sjmisc)
library(qwraps2)

import PY data

Import county-specific data
Import variable labels from stata file using read_dta
Create variables for “county” and “subject ID”

cavan_labs <- read_dta("Planet Youth/Data/cavan master file.dta")
cavan <-readstata13::read.dta13("Planet Youth/Data/cavan master file.dta")
cavan <- sjlabelled::copy_labels(cavan, cavan_labs)
cavan$county <- "cavan"
cavan$id <- paste0("w1_c", seq(from=1, to=length(cavan$county)))
cavan<- relocate(cavan, c(id, county), .before = "school_code")


mon_labs <- read_dta("Planet Youth/Data/monaghan master file.dta") 
mon <-readstata13::read.dta13("Planet Youth/Data/monaghan master file.dta")
mon <- sjlabelled::copy_labels(mon, mon_labs)
mon$county <- "monaghan"
mon$id <- paste0("w1_m", seq(from=1, to=length(mon$county)))
mon<- relocate(mon, c(id, county), .before = "school_code")


dub_labs <- read_dta("Planet Youth/Data/north dublin master data file.dta") 
dub <-readstata13::read.dta13("Planet Youth/Data/north dublin master data file.dta")

## Warning in readstata13::read.dta13("Planet Youth/Data/north dublin master data file.dta"): 
##    Duplicated factor levels for variables
## 
##    Pa_edu, par_work, smoke30days
## 
##    Unique labels for these variables have been generated.

## Warning in readstata13::read.dta13("Planet Youth/Data/north dublin master data file.dta"): 
##    Missing factor labels for variables
## 
##    bullied_online
## 
##    No labels have been assigned.
##    Set option 'generate.factors=TRUE' to generate labels.

dub <- sjlabelled::copy_labels(dub, dub_labs)
dub$county <- "dublin"
dub$id <- paste0("w1_nd", seq(from=1, to=length(dub$county)))
dub<- relocate(dub, c(id, county), .before = "gender")

rm(list=c("cavan_labs", "mon_labs", "dub_labs"))

# merge
# set variables to have same names
dub$school_code <- NA
dub$local_district <- NA
dub<- relocate(dub, school_code, .before = "gender")
dub$vapeharm1 <- NA
dub$vapeharm2 <- NA
dub<- relocate(dub, vapeharm2, .before = "vapeharm3")

dub <- dub%>%
  rename(
    local_schoolcode = NorthDublin_school
  )

mon <- mon%>%
  rename(
    local_schoolcode = Monaghan_school,
    local_district = Monaghan_district
  )

cavan <- cavan%>%
  rename(
    local_schoolcode = Cavan_school,
    local_district = Cavan_district
  )

# YEAR IN SCHOOL variable "grade". Differences in response coding across *each* county (see guide)
# set labels for each value (informed by discussions with survey administrators), save as characters, then back to factors
cavan$grade <- as.character(factor(cavan$grade, levels=c(3,4,5), labels=c("Transition year", "5th year", "Leaving Cert Applied")))
mon$grade <- as.character(factor(mon$grade, levels=c(4,5), labels=c("Transition year", "5th year")))

mon$grade <- as.factor(mon$grade)
cavan$grade <- as.factor(cavan$grade)

# replace factors with character vars so all levels are allowed
cavan$local_district <- as.character(cavan$local_district)
mon$local_district <- as.character(mon$local_district)
dub$local_district <- "NorthDub"

# make sure number & names of variabels line up
#a<- data.frame(dub=names(dub), cav=names(cavan), mon=names(mon))

py_remerged <- rbind(dub, cavan, mon)
py_remerged <- sjlabelled::copy_labels(py_remerged, cavan)

py_remerged$local_schoolcode <- set_label(py_remerged$local_schoolcode, "")
py_remerged$local_district <- set_label(py_remerged$local_district, "for monaghan and cavan only")
py_remerged$local_district <- as.factor(py_remerged$local_district)
py_remerged$county <- as.factor(py_remerged$county)
py_remerged$grade <- as.factor(py_remerged$grade)
mypy <- py_remerged

Clean specific variables

Cleaning Notes:
* For “opp_attract” & “same_attract” missing values were stored as ““– replace with NA
*”vapeharm2” – response options “agree” “strongly agree” etc stored as characters rather than levels of a factor
* Also, the questions for “vapeharm1” and “vapeharm2” were merged together in the North Dublin survey, meaning the results of these items are meaningless for this site. All North Dublin responses to these items are considered missing (done above on dub dataframe, before merge)
* “grade” years had different response levels across counties– this was rectified above before merging county data
* “bullied_online” – responses from dublin surveys were coded as 10862-10866 instead of categories of frequency “Never”, “Once”, “Twice”, etc. As the dublin survey lists response options “never, once, twice, 3-4 times and 5 times or more”, numeric response levels were assumed to match those in order. Frequencies also suggest this is the case (decreasing Ns with higher frequency of online bullying)
* In “smoke30days” and “health3” and “health4” response options were erroneously “stuck” together in one or more sites. So the response options for the other site were combined to allow cross-site comparison
* In “living”, “par_work”, “Pa_edu” and “Ma_edu”, there were more response options than were endorsed. The response options may have been binned into a smaller group of options post survey collection by Icelandic or RCSI researchers, to avoid identification of participants
* parsup1, parsup2, parsup3 are about how easy/hard it would be for respondents to get support/advice/warmth from their parents. Response options differed across sites

More detail can be found in the document “Researchers guide to PY 2021 DubMonCav”

mypy <- mypy %>%
  mutate(
     local_district = factor(local_district),
     opp_attract = factor(opp_attract, levels=c("", "1", "2", "3", "4", "5"),
                                       labels=c(NA_character_, "1", "2", "3", "4", "5" )),
     same_attract=factor(same_attract, levels=c("", "1", "2", "3", "4", "5"),
                                       labels=c(NA_character_, "1", "2", "3", "4", "5" )),
     vapeharm2=factor(vapeharm2, levels=c("Stongly Agree", "Agree", "Neither disagree nor agree",
                                          "Disagree", "Strongly Disagree")),
     # grade=factor(grade, levels=c("2nd year", "3rd year", "Transition year",
     #                                      "5th year", "Leaving Cert Applied")),
     bullied_online=factor(bullied_online, levels=c("Never", "Once", "Twice", "3-4 times", "5 times or more", 
                                                    "10862", "10863", "10864", "10865",     "10866"),
                           labels=c("Never", "Once", "Twice", "3-4 times", "5 times or more",
                                    "Never", "Once", "Twice", "3-4 times", "5 times or more")),
     bully_online=factor(bully_online, levels=c("Never", "Once", "Twice", "3-4 times", "5 times or more", 
                                                    "10862", "10863", "10864", "10865",     "10866"),
                           labels=c("Never", "Once", "Twice", "3-4 times", "5 times or more",
                                    "Never", "Once", "Twice", "3-4 times", "5 times or more")),
     sharm2 = factor(sharm2, 
              levels=c("Never", "Once", "Twice", "3-4 times", "5 times or more often", "5 times or more"), 
             labels=c("Never", "Once", "Twice", "3-4 times", "5 times or more", "5 times or more")),
     vapeharm1 = factor(vapeharm1, 
              levels=c("Strongly Agree", "Agree", "Neither disagree nor agree", "Disagree", "Strongly Disagree")),
     vapeharm2 = factor(vapeharm2, 
              levels=c("Strongly Agree", "Agree", "Neither disagree nor agree", "Disagree", "Strongly Disagree")),
     parsup1 = factor(parsup1, 
              levels=c("Very easy", "Easy", "Rather hard", "Hard", "Very hard"), 
             labels=c("Very easy", "Easy", "Hard/rather hard", "Hard/rather hard", "Very hard")),
     parsup2 = factor(parsup2, 
              levels=c("Very easy", "Easy", "Rather hard", "Hard", "Very hard"), 
             labels=c("Very easy", "Easy", "Hard/rather hard", "Hard/rather hard", "Very hard")),
     parsup3 = factor(parsup3, 
              levels=c("Very easy", "Easy", "Rather hard", "Hard", "Very hard"), 
             labels=c("Very easy", "Easy", "Hard/rather hard", "Hard/rather hard", "Very hard")))


# smoke30days 
# in N dublin two levels are merged together: 1-5 cigarettes per day & 6-10 cigarettes per day
# NB: is this a legitimate/erroneous merging of levels OR has there been a skip in levels. For instance, did "11-20 cigarettes per day" get moved up to replace the missing "6-10 cigarettes" label, and so forth?
# Below, I've merged 1-5 and 6-10 cigs for everyone
#table(mypy$smoke30days)
mypy$smoke30days <- factor(mypy$smoke30days, levels=c("None", "Less than one cigarette per week" , 
                                                      "Less than one cigarette per day" , 
                                                      "1-5 cigarettes per day 6-10 cigarettes per day", 
                                                      "11-20 cigarettes per day", 
                                                      "More than 20 cigarettes per day_(6)" , 
                                                      "More than 20 cigarettes per day_(7)" , 
                                                      "1-5 cigarettes per day"  ,         
                                                      "6-10 cigarettes per day" , 
                                                      "More than 20 cigarettes per day"),
                           labels=c("None", "Less than one cigarette per week" , 
                                    "Less than one cigarette per day" , 
                                    "1-10 cigarettes per day", 
                                    "11-20 cigarettes per day", 
                                    "More than 20 cigarettes per day" , 
                                    "More than 20 cigarettes per day" , 
                                    "1-10 cigarettes per day"  ,  
                                    "1-10 cigarettes per day" , 
                                    "More than 20 cigarettes per day"))

  
#  a similar issue with the variables health3 (physical health) and health4 (mental health)
# "good" and "okay" were merged together in both cavan & monaghan
mypy$health4 <- factor(mypy$health4, 
                levels=c("Very good",  "Good",     "Okay" ,      "Good Okay", "Bad", "Very bad"),
                labels=c("Very good",  "Good/Okay", "Good/Okay" , "Good/Okay", "Bad", "Very bad"))
mypy$health3 <- factor(mypy$health3, 
                levels=c("Very good",  "Good",     "Okay" ,      "Good Okay", "Bad", "Very bad"),
                labels=c("Very good",  "Good/Okay", "Good/Okay" , "Good/Okay", "Bad", "Very bad"))

# in mother and father education, there are many levels which nobody endorsed. They seem to be sublevels of the responses.
# perhaps they were binned into these 4: 'college', 'secondary', 'primary', 'don't know'
mypy$Pa_edu <- droplevels(mypy$Pa_edu)
mypy$Ma_edu <- droplevels(mypy$Ma_edu)

# same goes for par_work.
#table(mypy$par_work)
mypy$par_work <- droplevels(mypy$par_work)
set_label(mypy$par_work) <- "Do  you have a parent or carer that works outside the home"

# same goes for living circumstance. 
# NB: "other adults" includes the following which were probably collapsed to avoid participant identification:
# grandparents only, different arrangements (foster family, group home)
# parent+other adult includes: mother+partner, father+partner, grandparents+parent, 
# how did  "live with friends" or "i live on my own" get coded? perhaps nobody selected?
mypy$living <- droplevels(mypy$living)
mypy$living <- factor(mypy$living, levels=c("I live with both my parents", "One Parent" , "A Parent and other adult(s)", "Other adults (i e  extended family, foster family etc )","A Parent and another adult(s)"),
                      labels=c("I live with both my parents", "One Parent" , "A Parent and other adult(s)", "Other adults (i e  extended family, foster family etc )", "A Parent and other adult(s)"))


# it might make sense to merge those with non-cis genders together, given small Ns.
# those who are non-binary or trans are already merged. Also add those who may be "questioning" or who may be too concerned with the anonymity of the questionnaire to report a non-cis gender.

mypy$gender3 <- factor(mypy$gender, levels= c("Male", "Female", "Non-Binary and/or Trans", "Prefer not to say"), 
                                    labels = c("Male", "Female", "NB,trans or PNTS", "NB,trans or PNTS"))


# move basic demographic info to top of the datafile
mypy<- relocate(mypy, gender3, .after = "gender")
mypy<- relocate(mypy, local_schoolcode, .after = "school_code")
mypy<- relocate(mypy, local_district, .after = "county")

Derive extra variables

SDQ totals
APSS totals
SCL-90 anxiety and depression totals
Rosenberg self-esteem totals

# calculate SDQ total scores ###################################################
# extract SDQ and ID codes from data frame to work on them
col1 <- str_which(names(mypy), "SDQ1")[1] # isolated to '1st' because its picking up SDQ10, SDQ11, etc
col2 <- str_which(names(mypy), "SDQ25")
# note: SDQ26-32 contains APSS items

sdq <- mypy[,c(1, c(col1:col2))]

# identifify prosocial items, items that don't need to be reverse coded (where agreement reflects more prosocial behaviours) &
# items that need to be reverse coded (were DISagreement reflects problems)
prosoc = c(1, 4,  9, 17, 20 )
leave=c(2, 3, 5, 6, 8, 10, 12, 13, 15, 16, 18, 19, 22, 23, 24)
rev = c(11, 14, 21, 25, 7)
## NB: these numbers refer to ITEM numbers, not column indices

# reverse code the 5 items
for (x in rev){
  var <- sdq[,x+1] # item # plus 1 to get correct index
  var_rev <- var
  var_rev[var=="Not True"] <- "Certainly True"
  var_rev[var=="Certainly True"] <- "Not True"
  sdq[,x+1] <- var_rev
}

# make levels 'Not True'  'Somewhat True' & 'Certainly True' correspond to 0,1,2
for (x in 2:length(sdq)){
  sdq[,x] <- as.numeric(sdq[,x])-1
}


sdq$sdqtot <- rowSums(sdq[, (c(2, 3, 5, 6, 8, 10, 12, 13, 15, 16, 18, 19, 22, 23, 24,
                              11, 14, 21, 25, 7)+1)], na.rm=T)
sdq$sdqtot[rowSums(is.na(sdq[, (c(2, 3, 5, 6, 8, 10, 12, 13, 15, 16, 18, 19, 22, 23, 24,
                              11, 14, 21, 25, 7)+1)]))>1] <- NA

sdq$sdqhyp <- rowSums(sdq[,  (c(2, 10, 15, 21, 25)+1)], na.rm=F)
sdq$sdqpeer <- rowSums(sdq[, (c(6, 11, 14, 19, 23)+1)], na.rm=F)
sdq$sdqemot <- rowSums(sdq[, (c(3, 8, 13, 16, 24)+1)], na.rm=F)
sdq$sdqcond <- rowSums(sdq[, (c(5, 7, 12, 18, 22)+1)], na.rm=F)


# NB Notes: 
# I've created totals with na.rm = T, and put back the NAs when more than 1 item is missing.
# But for sub-score, any missing items, gives a subscore of NA.

mypy1 <- merge(mypy, sdq[,c("id", "sdqtot", "sdqhyp", "sdqpeer", "sdqemot", "sdqcond")], by.x = "id", by.y= "id")

# put variable labels back in
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1

rm(list=c("py_remerged", "mypy1", "sdq"))

# APSS totals ###################################################
ind1 <- which(names(mypy)=="SDQ26")
ind2 <- which(names(mypy)=="SDQ32")

# create empty data frame # of participants X # of APSS items
pe.data <- data.frame(id = mypy$id,
                      pe1=rep(NA,length(mypy$id)), 
                      pe2=rep(NA,length(mypy$id)), 
                      pe3=rep(NA,length(mypy$id)), 
                      pe4=rep(NA,length(mypy$id)), 
                      pe5=rep(NA,length(mypy$id)), 
                      pe6=rep(NA,length(mypy$id)), 
                      pe7=rep(NA,length(mypy$id)))
count=1
for (col in ind1:ind2) {
count=count+1
 # calculate score for 
 x <- as.numeric(mypy[,col])-1
 x[x==1] <- 0.5
 x[x==2] <- 1
pe.data[,count] <- x
}

# create total, such that 1 missing item is ignored
pe.data$pe.tot <- rowSums(pe.data[,2:8], na.rm=T)
pe.data$pe.tot[rowSums(is.na(pe.data[2:8]))>1] <- NA
  
mypy1 <- merge(mypy, pe.data[,c("id","pe.tot")], by.x = "id", by.y= "id")
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1

rm(list=c("mypy1"))

mypy$pe.risk <- as.factor(ifelse(mypy$pe.tot>=2, 1, 0))

# SCL-90 depression and anxiety totals ###################################################
# almost never=0
# rarely=1
# sometimes=2
# often=3

anxdep <- mypy[,c("id", "anx1", "anx2", "anx3", "dep1", "dep2", "dep3", "dep4", "dep5", "dep6", "dep7", "dep8", "dep9")]

for (col in 2:length(anxdep)) {
   anxdep[,col] <- as.numeric(anxdep[,col])-1
}

anxdep$tot.anxdep <- rowSums(anxdep[,2:length(anxdep)], na.rm=T)
anxdep$tot.anxdep[ rowSums(is.na(anxdep[,2:length(anxdep)]))>1 ] <- NA

anxdep$tot.dep <- rowSums(anxdep[,c("dep1", "dep2", "dep3", "dep4", "dep5", "dep6", "dep7", "dep8", "dep9")], na.rm=T)
anxdep$tot.dep[ rowSums(is.na(anxdep[,c("dep1", "dep2", "dep3", "dep4", "dep5", "dep6", "dep7", "dep8", "dep9")]))>1 ] <- NA

anxdep$tot.anx <- rowSums(anxdep[, c("anx1", "anx2", "anx3")])

mypy1 <- merge(mypy, anxdep[,c("id", "tot.anxdep", "tot.dep", "tot.anx")], by.x = "id", by.y= "id")

# put variable labels back in
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1


# Note: quite a lot of people didnt do the sdq/apss, 7-16% of each county missing (probably as it was toward the end of the Q'aire in 2021)


# Rosenberg self-esteem total score ###################################################
# 1worth at least as much, 
# 2number of good qualities, 
# 3feel like failure, REV**
# 4as well as most, 
# 5not much to be proud of,  REV**
# 6positive attitude toward self, 
# 7on whole satisfied, 
# 8wish had more respect,  REV**
# 9no good,  REV**
# 10useless  REV**

# strongly disagree / very poorly =1
# strongly agree / very well =4
est <- mypy[,c("id", "esteem1", "esteem2", "esteem3", "esteem4", "esteem5", "esteem6", "esteem7", "esteem8", "esteem9", "esteem10")]

# i manually went thru levels of each item to check all responses on same level

est$esteem1 <- (as.numeric(est$esteem1)-5)*-1
est$esteem2 <- (as.numeric(est$esteem2)-5)*-1
est$esteem4 <- (as.numeric(est$esteem4)-5)*-1
est$esteem6 <- (as.numeric(est$esteem6)-5)*-1
est$esteem7 <- (as.numeric(est$esteem7)-5)*-1

est$esteem3 <- as.numeric(est$esteem3)
est$esteem5 <- as.numeric(est$esteem5)
est$esteem8 <- as.numeric(est$esteem8)
est$esteem9 <- as.numeric(est$esteem9)
est$esteem10 <- as.numeric(est$esteem10)

est$esteem.tot <- rowSums(est[,2:length(est)],na.rm=T)
est$esteem.tot[rowSums(is.na(est[,2:length(est)-1]))>1] <- NA # do not calculate total for anyone with more than 1 item missing 

mypy1 <- merge(mypy, est[,c("id", "esteem.tot")], by.x = "id", by.y= "id")

# put variable labels back in
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1

mypy<- relocate(mypy, esteem.tot, .after = "esteem10")

Age of participants

# Note: there is no age var, only year of birth, use DOB and year of testing (2021) to derive rough age groups
mypy$age.group <- NA
mypy$age.group[mypy$birth=="Before 2004"] <- "18 or older"
mypy$age.group[mypy$birth=="2004"] <- "17"
mypy$age.group[mypy$birth=="2005"] <- "16"
mypy$age.group[mypy$birth=="2006"] <- "15"
mypy$age.group[mypy$birth=="2007 or later"] <- "14 or younger"

mypy$age.group <- as.factor(mypy$age.group)
  
# % of each age group
table(mypy$age.group) %>% prop.table() %>% round(digits=2)

## 
## 14 or younger            15            16            17   18 or older 
##          0.11          0.38          0.37          0.11          0.02

# move it up to front of dataset
mypy <- move_columns(mypy, age.group, .before="grade")

Variable descriptions

# fill in any missing variable descriptions
set_label(mypy$id) <- "Participant ID"
set_label(mypy$county) <- "Dublin, Monaghan or Cavan"
set_label(mypy$local_district) <- "Town/area in county"
set_label(mypy$pct.missing) <- "percent NAs for subject"
set_label(mypy$local_schoolcode) <- "School ID (short)"
set_label(mypy$school_code) <- "School ID (long; N/A for N.Dublin)"
set_label(mypy$gender3) <- "Gender (3 groups)"
set_label(mypy$age.group) <- "Approx age (2021-YOB)"
set_label(mypy$living) <- "I live with..."
set_label(mypy$Ma_edu) <- "What is the highest level of education your mother/carer completed?"
set_label(mypy$Pa_edu) <- "What is the highest level of education your father/carer completed?"
set_label(mypy$health3) <- "How would you rate your physical health?"
set_label(mypy$health4) <- "How would you rate your mental health?"
set_label(mypy$smoke30days) <- "How many cigarettes, on average, have you smoked the last 30 days?"
set_label(mypy$bullied_online) <- "You bullied someone else online yourself: How often during lifetime..."
set_label(mypy$bully_online) <- "You were bullied online by somebody: How often during lifetime..."

set_label(mypy$sharm2) <- "During your lifetime have you harmed yourself on purpose (e g , scratching, burning..."
set_label(mypy$vapeharm1) <- "Do you think e-cigarettes/vapes are addictive?"
set_label(mypy$vapeharm2) <- "Do you think vaping is generally safe?"

set_label(mypy$opp_attract) <- "Attraction to opposite sex (1=no attraction; 5=strong attraction)"
set_label(mypy$same_attract) <- "Attraction to same sex (1=no attraction; 5=strong attraction)"

set_label(mypy$parsup1) <- "Caring and warmth:How easy or difficult would it be for you to receive the follo"
set_label(mypy$parsup2) <- "Advice about personal matters :How easy or difficult would it be for you to rece"
set_label(mypy$parsup3) <- "Advice about schoolwork :How easy or difficult would it be for you to receive th"

set_label(mypy$sdqtot)<- "SDQ total difficulties score"
set_label(mypy$sdqhyp)<- "SDQ inattention/hyperactivity score"
set_label(mypy$sdqpeer)<- "SDQ peer problems score"
set_label(mypy$sdqemot)<- "SDQ emotional problems score"
set_label(mypy$sdqcond)<- "SDQ conduct problems score"
set_label(mypy$sdqcond)<- "SDQ conduct problems score"
set_label(mypy$tot.anxdep)<- "SCL90 anxiety and depressive Sx 0-36"
set_label(mypy$tot.anx)<- "SCL90 anxiety Sx 0-9"
set_label(mypy$tot.dep)<- "SCL90 depressive Sx 0-27"
set_label(mypy$esteem.tot)<- "Rosenberg self-esteem total"
set_label(mypy$pe.tot)<- "APSS total"
set_label(mypy$pe.risk)<- "APSS total scores 2 or above"

set_label(mypy$SDQ26) <- paste0("APSS1: " , get_label(mypy$SDQ26))
set_label(mypy$SDQ27) <- paste0("APSS2: " , get_label(mypy$SDQ27))
set_label(mypy$SDQ28) <- paste0("APSS3: " , get_label(mypy$SDQ28))
set_label(mypy$SDQ29) <- paste0("APSS4: " , get_label(mypy$SDQ29))
set_label(mypy$SDQ30) <- paste0("APSS5: " , get_label(mypy$SDQ30))
set_label(mypy$SDQ31) <- paste0("APSS6: " , get_label(mypy$SDQ31))
set_label(mypy$SDQ32) <- paste0("APSS7: " , get_label(mypy$SDQ32))


labels <- as.data.frame(get_label(mypy))
labels$var_name <- rownames(labels)
rownames(labels)<-c()

a=lapply(mypy, class) # extract var types (factor, numeric, etc.)
b <- t(as.data.frame(a))
rownames(b) <- c()
labels$var_type <- b

labels$resp_opts <-NA
for (x in 1:length(labels$var_name)) {
  if (labels$var_type[x]=="factor") {
    labels$resp_opts[x] <- str_flatten(levels(mypy[,x]), collapse= ", ")  
  }
}


names(labels) <- c("description", "var_name", "var_type", "resp_opts")
labels <- move_columns(labels, var_name, .before="description")

Describe extent of missing data

What schools have most missing data?
What variables are most commonly missing?

Variables with high rates of missingness:
* school_code: missing for all North Dublin schools (use ‘local_schoolcode’ instead)
* vapeharm1 and vapeharm2: an error in the way these Qs were asked required us to remove all N.Dublin responses to these Qs
* skip2 (“how many whole days were you absent from school because you skipped/cut class”) possibly not answered due to fears of being caught out

All other variables have missingness rates <15%

# Calculate the percentage of missing information for each participant (each row)
mypy$pct.missing <- round(rowSums(is.na(mypy))/length(mypy),2)
mypy <- move_columns(mypy, pct.missing, .after="county")

mypy$missingSDQ <- is.na(mypy$sdqtot) 
table(mypy$missingSDQ)%>%prop.table() # 11.58% of participants missing an SDQ total score

## 
##     FALSE      TRUE 
## 0.8841962 0.1158038

# Plot the average % total missing data, by school
ggplot(mypy)+
  stat_summary(aes(x= local_schoolcode, y= pct.missing, fill=county), fun.y = mean, geom = "bar")+
  theme(axis.text.x = element_text(angle=90, vjust=1, hjust=1))

# which variables were most often missing (limit to those with >10% (n>400) missing)
x= as.data.frame(colSums(is.na(mypy)))
x$var <- rownames(x)
names(x) <- c("num.missing", "var")

ggplot(x[x$num.missing>400,])+
  stat_summary(aes(x=var, y= num.missing), fun.y = mean, geom = "bar")+
  theme(axis.text.x = element_text(angle=90, vjust=1, hjust=1, size=10))

# for school ND12, what vars are mostly missing? Ans: The ones at the end of the survey
school_x <- mypy[mypy$local_schoolcode==" ND12",]
x= as.data.frame(colSums(is.na(school_x)))
x$var <- rownames(x)
names(x) <- c("num.missing", "var")
print(x[order(-x$num.missing)[1:10],])

##             num.missing         var
## school_code         144 school_code
## vapeharm1           144   vapeharm1
## vapeharm2           144   vapeharm2
## sdqemot              52     sdqemot
## SDQ24                51       SDQ24
## sdqhyp               51      sdqhyp
## sdqcond              51     sdqcond
## SDQ1                 50        SDQ1
## SDQ2                 50        SDQ2
## SDQ5                 50        SDQ5

rm("x")

# for school PYC09, what vars are mostly missing? 
# Ans: a few cannabis questions, bullying others, use of dating apps, vaping behaviours & perceptions 
# Note: small sample size of this school, so high % of missing data shouldnt be dwelt upon

school_x <- mypy[mypy$local_schoolcode==" PYC09",]
x= as.data.frame(colSums(is.na(school_x)))
x$var <- rownames(x)
names(x) <- c("num.missing", "var")
print(x[order(-x$num.missing)[1:10],])

##                num.missing            var
## Bgf_cannabis            12   Bgf_cannabis
## press3                  12         press3
## puse2                   12          puse2
## bully_part3             12    bully_part3
## bullied_online          12 bullied_online
## sh3                     12            sh3
## ecigons                 12        ecigons
## vapeharm2               12      vapeharm2
## vapeharm5               12      vapeharm5
## diet2                   12          diet2

rm("x")

Create cleaned data files

# (1) save as csv file
write.csv(mypy, file="combinedpy_V3.csv")

# (2) save as STATA file (not . are not allowed in STATA var names)
mypy.4stata<-mypy
names(mypy.4stata) <- gsub("\\.", "_", names(mypy.4stata))
write_dta(mypy.4stata, path="Planet Youth/Data/combinedpy_V3.dta")

# (3) save as R data filee
save(mypy, file = "combinedpy_V3.RData")

# save variable descriptions as its own csv
write.csv(labels, file="varlabels_V3.csv")

Cleaning and merging 2021 Planet Youth data from Cavan, Monaghan & North Dublin

Niamh Dooley

21/09/2023