library(haven)
library(readstata13)
library(sjlabelled)
library(dplyr)
library(stringr)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(ggpubr)
library(sjmisc)
library(qwraps2)
cavan_labs <- read_dta("Planet Youth/Data/cavan master file.dta")
cavan <-readstata13::read.dta13("Planet Youth/Data/cavan master file.dta")
cavan <- sjlabelled::copy_labels(cavan, cavan_labs)
cavan$county <- "cavan"
cavan$id <- paste0("w1_c", seq(from=1, to=length(cavan$county)))
cavan<- relocate(cavan, c(id, county), .before = "school_code")
mon_labs <- read_dta("Planet Youth/Data/monaghan master file.dta")
mon <-readstata13::read.dta13("Planet Youth/Data/monaghan master file.dta")
mon <- sjlabelled::copy_labels(mon, mon_labs)
mon$county <- "monaghan"
mon$id <- paste0("w1_m", seq(from=1, to=length(mon$county)))
mon<- relocate(mon, c(id, county), .before = "school_code")
dub_labs <- read_dta("Planet Youth/Data/north dublin master data file.dta")
dub <-readstata13::read.dta13("Planet Youth/Data/north dublin master data file.dta")
## Warning in readstata13::read.dta13("Planet Youth/Data/north dublin master data file.dta"):
## Duplicated factor levels for variables
##
## Pa_edu, par_work, smoke30days
##
## Unique labels for these variables have been generated.
## Warning in readstata13::read.dta13("Planet Youth/Data/north dublin master data file.dta"):
## Missing factor labels for variables
##
## bullied_online
##
## No labels have been assigned.
## Set option 'generate.factors=TRUE' to generate labels.
dub <- sjlabelled::copy_labels(dub, dub_labs)
dub$county <- "dublin"
dub$id <- paste0("w1_nd", seq(from=1, to=length(dub$county)))
dub<- relocate(dub, c(id, county), .before = "gender")
rm(list=c("cavan_labs", "mon_labs", "dub_labs"))
# merge
# set variables to have same names
dub$school_code <- NA
dub$local_district <- NA
dub<- relocate(dub, school_code, .before = "gender")
dub$vapeharm1 <- NA
dub$vapeharm2 <- NA
dub<- relocate(dub, vapeharm2, .before = "vapeharm3")
dub <- dub%>%
rename(
local_schoolcode = NorthDublin_school
)
mon <- mon%>%
rename(
local_schoolcode = Monaghan_school,
local_district = Monaghan_district
)
cavan <- cavan%>%
rename(
local_schoolcode = Cavan_school,
local_district = Cavan_district
)
# YEAR IN SCHOOL variable "grade". Differences in response coding across *each* county (see guide)
# set labels for each value (informed by discussions with survey administrators), save as characters, then back to factors
cavan$grade <- as.character(factor(cavan$grade, levels=c(3,4,5), labels=c("Transition year", "5th year", "Leaving Cert Applied")))
mon$grade <- as.character(factor(mon$grade, levels=c(4,5), labels=c("Transition year", "5th year")))
mon$grade <- as.factor(mon$grade)
cavan$grade <- as.factor(cavan$grade)
# replace factors with character vars so all levels are allowed
cavan$local_district <- as.character(cavan$local_district)
mon$local_district <- as.character(mon$local_district)
dub$local_district <- "NorthDub"
# make sure number & names of variabels line up
#a<- data.frame(dub=names(dub), cav=names(cavan), mon=names(mon))
py_remerged <- rbind(dub, cavan, mon)
py_remerged <- sjlabelled::copy_labels(py_remerged, cavan)
py_remerged$local_schoolcode <- set_label(py_remerged$local_schoolcode, "")
py_remerged$local_district <- set_label(py_remerged$local_district, "for monaghan and cavan only")
py_remerged$local_district <- as.factor(py_remerged$local_district)
py_remerged$county <- as.factor(py_remerged$county)
py_remerged$grade <- as.factor(py_remerged$grade)
mypy <- py_remerged
Cleaning Notes:
* For “opp_attract” & “same_attract” missing values were stored as
““– replace with NA
*”vapeharm2” – response options “agree” “strongly agree” etc stored as
characters rather than levels of a factor
* Also, the questions for “vapeharm1” and “vapeharm2” were merged
together in the North Dublin survey, meaning the results of these items
are meaningless for this site. All North Dublin responses to these items
are considered missing (done above on dub dataframe, before merge)
* “grade” years had different response levels across counties– this was
rectified above before merging county data
* “bullied_online” – responses from dublin surveys were coded as
10862-10866 instead of categories of frequency “Never”, “Once”, “Twice”,
etc. As the dublin survey lists response options “never, once, twice,
3-4 times and 5 times or more”, numeric response levels were assumed to
match those in order. Frequencies also suggest this is the case
(decreasing Ns with higher frequency of online bullying)
* In “smoke30days” and “health3” and “health4” response options were
erroneously “stuck” together in one or more sites. So the response
options for the other site were combined to allow cross-site
comparison
* In “living”, “par_work”, “Pa_edu” and “Ma_edu”, there were more
response options than were endorsed. The response options may have been
binned into a smaller group of options post survey collection by
Icelandic or RCSI researchers, to avoid identification of
participants
* parsup1, parsup2, parsup3 are about how easy/hard it would be for
respondents to get support/advice/warmth from their parents. Response
options differed across sites
More detail can be found in the document “Researchers guide to PY 2021 DubMonCav”
mypy <- mypy %>%
mutate(
local_district = factor(local_district),
opp_attract = factor(opp_attract, levels=c("", "1", "2", "3", "4", "5"),
labels=c(NA_character_, "1", "2", "3", "4", "5" )),
same_attract=factor(same_attract, levels=c("", "1", "2", "3", "4", "5"),
labels=c(NA_character_, "1", "2", "3", "4", "5" )),
vapeharm2=factor(vapeharm2, levels=c("Stongly Agree", "Agree", "Neither disagree nor agree",
"Disagree", "Strongly Disagree")),
# grade=factor(grade, levels=c("2nd year", "3rd year", "Transition year",
# "5th year", "Leaving Cert Applied")),
bullied_online=factor(bullied_online, levels=c("Never", "Once", "Twice", "3-4 times", "5 times or more",
"10862", "10863", "10864", "10865", "10866"),
labels=c("Never", "Once", "Twice", "3-4 times", "5 times or more",
"Never", "Once", "Twice", "3-4 times", "5 times or more")),
bully_online=factor(bully_online, levels=c("Never", "Once", "Twice", "3-4 times", "5 times or more",
"10862", "10863", "10864", "10865", "10866"),
labels=c("Never", "Once", "Twice", "3-4 times", "5 times or more",
"Never", "Once", "Twice", "3-4 times", "5 times or more")),
sharm2 = factor(sharm2,
levels=c("Never", "Once", "Twice", "3-4 times", "5 times or more often", "5 times or more"),
labels=c("Never", "Once", "Twice", "3-4 times", "5 times or more", "5 times or more")),
vapeharm1 = factor(vapeharm1,
levels=c("Strongly Agree", "Agree", "Neither disagree nor agree", "Disagree", "Strongly Disagree")),
vapeharm2 = factor(vapeharm2,
levels=c("Strongly Agree", "Agree", "Neither disagree nor agree", "Disagree", "Strongly Disagree")),
parsup1 = factor(parsup1,
levels=c("Very easy", "Easy", "Rather hard", "Hard", "Very hard"),
labels=c("Very easy", "Easy", "Hard/rather hard", "Hard/rather hard", "Very hard")),
parsup2 = factor(parsup2,
levels=c("Very easy", "Easy", "Rather hard", "Hard", "Very hard"),
labels=c("Very easy", "Easy", "Hard/rather hard", "Hard/rather hard", "Very hard")),
parsup3 = factor(parsup3,
levels=c("Very easy", "Easy", "Rather hard", "Hard", "Very hard"),
labels=c("Very easy", "Easy", "Hard/rather hard", "Hard/rather hard", "Very hard")))
# smoke30days
# in N dublin two levels are merged together: 1-5 cigarettes per day & 6-10 cigarettes per day
# NB: is this a legitimate/erroneous merging of levels OR has there been a skip in levels. For instance, did "11-20 cigarettes per day" get moved up to replace the missing "6-10 cigarettes" label, and so forth?
# Below, I've merged 1-5 and 6-10 cigs for everyone
#table(mypy$smoke30days)
mypy$smoke30days <- factor(mypy$smoke30days, levels=c("None", "Less than one cigarette per week" ,
"Less than one cigarette per day" ,
"1-5 cigarettes per day 6-10 cigarettes per day",
"11-20 cigarettes per day",
"More than 20 cigarettes per day_(6)" ,
"More than 20 cigarettes per day_(7)" ,
"1-5 cigarettes per day" ,
"6-10 cigarettes per day" ,
"More than 20 cigarettes per day"),
labels=c("None", "Less than one cigarette per week" ,
"Less than one cigarette per day" ,
"1-10 cigarettes per day",
"11-20 cigarettes per day",
"More than 20 cigarettes per day" ,
"More than 20 cigarettes per day" ,
"1-10 cigarettes per day" ,
"1-10 cigarettes per day" ,
"More than 20 cigarettes per day"))
# a similar issue with the variables health3 (physical health) and health4 (mental health)
# "good" and "okay" were merged together in both cavan & monaghan
mypy$health4 <- factor(mypy$health4,
levels=c("Very good", "Good", "Okay" , "Good Okay", "Bad", "Very bad"),
labels=c("Very good", "Good/Okay", "Good/Okay" , "Good/Okay", "Bad", "Very bad"))
mypy$health3 <- factor(mypy$health3,
levels=c("Very good", "Good", "Okay" , "Good Okay", "Bad", "Very bad"),
labels=c("Very good", "Good/Okay", "Good/Okay" , "Good/Okay", "Bad", "Very bad"))
# in mother and father education, there are many levels which nobody endorsed. They seem to be sublevels of the responses.
# perhaps they were binned into these 4: 'college', 'secondary', 'primary', 'don't know'
mypy$Pa_edu <- droplevels(mypy$Pa_edu)
mypy$Ma_edu <- droplevels(mypy$Ma_edu)
# same goes for par_work.
#table(mypy$par_work)
mypy$par_work <- droplevels(mypy$par_work)
set_label(mypy$par_work) <- "Do you have a parent or carer that works outside the home"
# same goes for living circumstance.
# NB: "other adults" includes the following which were probably collapsed to avoid participant identification:
# grandparents only, different arrangements (foster family, group home)
# parent+other adult includes: mother+partner, father+partner, grandparents+parent,
# how did "live with friends" or "i live on my own" get coded? perhaps nobody selected?
mypy$living <- droplevels(mypy$living)
mypy$living <- factor(mypy$living, levels=c("I live with both my parents", "One Parent" , "A Parent and other adult(s)", "Other adults (i e extended family, foster family etc )","A Parent and another adult(s)"),
labels=c("I live with both my parents", "One Parent" , "A Parent and other adult(s)", "Other adults (i e extended family, foster family etc )", "A Parent and other adult(s)"))
# it might make sense to merge those with non-cis genders together, given small Ns.
# those who are non-binary or trans are already merged. Also add those who may be "questioning" or who may be too concerned with the anonymity of the questionnaire to report a non-cis gender.
mypy$gender3 <- factor(mypy$gender, levels= c("Male", "Female", "Non-Binary and/or Trans", "Prefer not to say"),
labels = c("Male", "Female", "NB,trans or PNTS", "NB,trans or PNTS"))
# move basic demographic info to top of the datafile
mypy<- relocate(mypy, gender3, .after = "gender")
mypy<- relocate(mypy, local_schoolcode, .after = "school_code")
mypy<- relocate(mypy, local_district, .after = "county")
# calculate SDQ total scores ###################################################
# extract SDQ and ID codes from data frame to work on them
col1 <- str_which(names(mypy), "SDQ1")[1] # isolated to '1st' because its picking up SDQ10, SDQ11, etc
col2 <- str_which(names(mypy), "SDQ25")
# note: SDQ26-32 contains APSS items
sdq <- mypy[,c(1, c(col1:col2))]
# identifify prosocial items, items that don't need to be reverse coded (where agreement reflects more prosocial behaviours) &
# items that need to be reverse coded (were DISagreement reflects problems)
prosoc = c(1, 4, 9, 17, 20 )
leave=c(2, 3, 5, 6, 8, 10, 12, 13, 15, 16, 18, 19, 22, 23, 24)
rev = c(11, 14, 21, 25, 7)
## NB: these numbers refer to ITEM numbers, not column indices
# reverse code the 5 items
for (x in rev){
var <- sdq[,x+1] # item # plus 1 to get correct index
var_rev <- var
var_rev[var=="Not True"] <- "Certainly True"
var_rev[var=="Certainly True"] <- "Not True"
sdq[,x+1] <- var_rev
}
# make levels 'Not True' 'Somewhat True' & 'Certainly True' correspond to 0,1,2
for (x in 2:length(sdq)){
sdq[,x] <- as.numeric(sdq[,x])-1
}
sdq$sdqtot <- rowSums(sdq[, (c(2, 3, 5, 6, 8, 10, 12, 13, 15, 16, 18, 19, 22, 23, 24,
11, 14, 21, 25, 7)+1)], na.rm=T)
sdq$sdqtot[rowSums(is.na(sdq[, (c(2, 3, 5, 6, 8, 10, 12, 13, 15, 16, 18, 19, 22, 23, 24,
11, 14, 21, 25, 7)+1)]))>1] <- NA
sdq$sdqhyp <- rowSums(sdq[, (c(2, 10, 15, 21, 25)+1)], na.rm=F)
sdq$sdqpeer <- rowSums(sdq[, (c(6, 11, 14, 19, 23)+1)], na.rm=F)
sdq$sdqemot <- rowSums(sdq[, (c(3, 8, 13, 16, 24)+1)], na.rm=F)
sdq$sdqcond <- rowSums(sdq[, (c(5, 7, 12, 18, 22)+1)], na.rm=F)
# NB Notes:
# I've created totals with na.rm = T, and put back the NAs when more than 1 item is missing.
# But for sub-score, any missing items, gives a subscore of NA.
mypy1 <- merge(mypy, sdq[,c("id", "sdqtot", "sdqhyp", "sdqpeer", "sdqemot", "sdqcond")], by.x = "id", by.y= "id")
# put variable labels back in
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1
rm(list=c("py_remerged", "mypy1", "sdq"))
# APSS totals ###################################################
ind1 <- which(names(mypy)=="SDQ26")
ind2 <- which(names(mypy)=="SDQ32")
# create empty data frame # of participants X # of APSS items
pe.data <- data.frame(id = mypy$id,
pe1=rep(NA,length(mypy$id)),
pe2=rep(NA,length(mypy$id)),
pe3=rep(NA,length(mypy$id)),
pe4=rep(NA,length(mypy$id)),
pe5=rep(NA,length(mypy$id)),
pe6=rep(NA,length(mypy$id)),
pe7=rep(NA,length(mypy$id)))
count=1
for (col in ind1:ind2) {
count=count+1
# calculate score for
x <- as.numeric(mypy[,col])-1
x[x==1] <- 0.5
x[x==2] <- 1
pe.data[,count] <- x
}
# create total, such that 1 missing item is ignored
pe.data$pe.tot <- rowSums(pe.data[,2:8], na.rm=T)
pe.data$pe.tot[rowSums(is.na(pe.data[2:8]))>1] <- NA
mypy1 <- merge(mypy, pe.data[,c("id","pe.tot")], by.x = "id", by.y= "id")
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1
rm(list=c("mypy1"))
mypy$pe.risk <- as.factor(ifelse(mypy$pe.tot>=2, 1, 0))
# SCL-90 depression and anxiety totals ###################################################
# almost never=0
# rarely=1
# sometimes=2
# often=3
anxdep <- mypy[,c("id", "anx1", "anx2", "anx3", "dep1", "dep2", "dep3", "dep4", "dep5", "dep6", "dep7", "dep8", "dep9")]
for (col in 2:length(anxdep)) {
anxdep[,col] <- as.numeric(anxdep[,col])-1
}
anxdep$tot.anxdep <- rowSums(anxdep[,2:length(anxdep)], na.rm=T)
anxdep$tot.anxdep[ rowSums(is.na(anxdep[,2:length(anxdep)]))>1 ] <- NA
anxdep$tot.dep <- rowSums(anxdep[,c("dep1", "dep2", "dep3", "dep4", "dep5", "dep6", "dep7", "dep8", "dep9")], na.rm=T)
anxdep$tot.dep[ rowSums(is.na(anxdep[,c("dep1", "dep2", "dep3", "dep4", "dep5", "dep6", "dep7", "dep8", "dep9")]))>1 ] <- NA
anxdep$tot.anx <- rowSums(anxdep[, c("anx1", "anx2", "anx3")])
mypy1 <- merge(mypy, anxdep[,c("id", "tot.anxdep", "tot.dep", "tot.anx")], by.x = "id", by.y= "id")
# put variable labels back in
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1
# Note: quite a lot of people didnt do the sdq/apss, 7-16% of each county missing (probably as it was toward the end of the Q'aire in 2021)
# Rosenberg self-esteem total score ###################################################
# 1worth at least as much,
# 2number of good qualities,
# 3feel like failure, REV**
# 4as well as most,
# 5not much to be proud of, REV**
# 6positive attitude toward self,
# 7on whole satisfied,
# 8wish had more respect, REV**
# 9no good, REV**
# 10useless REV**
# strongly disagree / very poorly =1
# strongly agree / very well =4
est <- mypy[,c("id", "esteem1", "esteem2", "esteem3", "esteem4", "esteem5", "esteem6", "esteem7", "esteem8", "esteem9", "esteem10")]
# i manually went thru levels of each item to check all responses on same level
est$esteem1 <- (as.numeric(est$esteem1)-5)*-1
est$esteem2 <- (as.numeric(est$esteem2)-5)*-1
est$esteem4 <- (as.numeric(est$esteem4)-5)*-1
est$esteem6 <- (as.numeric(est$esteem6)-5)*-1
est$esteem7 <- (as.numeric(est$esteem7)-5)*-1
est$esteem3 <- as.numeric(est$esteem3)
est$esteem5 <- as.numeric(est$esteem5)
est$esteem8 <- as.numeric(est$esteem8)
est$esteem9 <- as.numeric(est$esteem9)
est$esteem10 <- as.numeric(est$esteem10)
est$esteem.tot <- rowSums(est[,2:length(est)],na.rm=T)
est$esteem.tot[rowSums(is.na(est[,2:length(est)-1]))>1] <- NA # do not calculate total for anyone with more than 1 item missing
mypy1 <- merge(mypy, est[,c("id", "esteem.tot")], by.x = "id", by.y= "id")
# put variable labels back in
mypy1 <- sjlabelled::copy_labels(mypy1, mypy)
mypy<- mypy1
mypy<- relocate(mypy, esteem.tot, .after = "esteem10")
Age of participants
# Note: there is no age var, only year of birth, use DOB and year of testing (2021) to derive rough age groups
mypy$age.group <- NA
mypy$age.group[mypy$birth=="Before 2004"] <- "18 or older"
mypy$age.group[mypy$birth=="2004"] <- "17"
mypy$age.group[mypy$birth=="2005"] <- "16"
mypy$age.group[mypy$birth=="2006"] <- "15"
mypy$age.group[mypy$birth=="2007 or later"] <- "14 or younger"
mypy$age.group <- as.factor(mypy$age.group)
# % of each age group
table(mypy$age.group) %>% prop.table() %>% round(digits=2)
##
## 14 or younger 15 16 17 18 or older
## 0.11 0.38 0.37 0.11 0.02
# move it up to front of dataset
mypy <- move_columns(mypy, age.group, .before="grade")
# fill in any missing variable descriptions
set_label(mypy$id) <- "Participant ID"
set_label(mypy$county) <- "Dublin, Monaghan or Cavan"
set_label(mypy$local_district) <- "Town/area in county"
set_label(mypy$pct.missing) <- "percent NAs for subject"
set_label(mypy$local_schoolcode) <- "School ID (short)"
set_label(mypy$school_code) <- "School ID (long; N/A for N.Dublin)"
set_label(mypy$gender3) <- "Gender (3 groups)"
set_label(mypy$age.group) <- "Approx age (2021-YOB)"
set_label(mypy$living) <- "I live with..."
set_label(mypy$Ma_edu) <- "What is the highest level of education your mother/carer completed?"
set_label(mypy$Pa_edu) <- "What is the highest level of education your father/carer completed?"
set_label(mypy$health3) <- "How would you rate your physical health?"
set_label(mypy$health4) <- "How would you rate your mental health?"
set_label(mypy$smoke30days) <- "How many cigarettes, on average, have you smoked the last 30 days?"
set_label(mypy$bullied_online) <- "You bullied someone else online yourself: How often during lifetime..."
set_label(mypy$bully_online) <- "You were bullied online by somebody: How often during lifetime..."
set_label(mypy$sharm2) <- "During your lifetime have you harmed yourself on purpose (e g , scratching, burning..."
set_label(mypy$vapeharm1) <- "Do you think e-cigarettes/vapes are addictive?"
set_label(mypy$vapeharm2) <- "Do you think vaping is generally safe?"
set_label(mypy$opp_attract) <- "Attraction to opposite sex (1=no attraction; 5=strong attraction)"
set_label(mypy$same_attract) <- "Attraction to same sex (1=no attraction; 5=strong attraction)"
set_label(mypy$parsup1) <- "Caring and warmth:How easy or difficult would it be for you to receive the follo"
set_label(mypy$parsup2) <- "Advice about personal matters :How easy or difficult would it be for you to rece"
set_label(mypy$parsup3) <- "Advice about schoolwork :How easy or difficult would it be for you to receive th"
set_label(mypy$sdqtot)<- "SDQ total difficulties score"
set_label(mypy$sdqhyp)<- "SDQ inattention/hyperactivity score"
set_label(mypy$sdqpeer)<- "SDQ peer problems score"
set_label(mypy$sdqemot)<- "SDQ emotional problems score"
set_label(mypy$sdqcond)<- "SDQ conduct problems score"
set_label(mypy$sdqcond)<- "SDQ conduct problems score"
set_label(mypy$tot.anxdep)<- "SCL90 anxiety and depressive Sx 0-36"
set_label(mypy$tot.anx)<- "SCL90 anxiety Sx 0-9"
set_label(mypy$tot.dep)<- "SCL90 depressive Sx 0-27"
set_label(mypy$esteem.tot)<- "Rosenberg self-esteem total"
set_label(mypy$pe.tot)<- "APSS total"
set_label(mypy$pe.risk)<- "APSS total scores 2 or above"
set_label(mypy$SDQ26) <- paste0("APSS1: " , get_label(mypy$SDQ26))
set_label(mypy$SDQ27) <- paste0("APSS2: " , get_label(mypy$SDQ27))
set_label(mypy$SDQ28) <- paste0("APSS3: " , get_label(mypy$SDQ28))
set_label(mypy$SDQ29) <- paste0("APSS4: " , get_label(mypy$SDQ29))
set_label(mypy$SDQ30) <- paste0("APSS5: " , get_label(mypy$SDQ30))
set_label(mypy$SDQ31) <- paste0("APSS6: " , get_label(mypy$SDQ31))
set_label(mypy$SDQ32) <- paste0("APSS7: " , get_label(mypy$SDQ32))
labels <- as.data.frame(get_label(mypy))
labels$var_name <- rownames(labels)
rownames(labels)<-c()
a=lapply(mypy, class) # extract var types (factor, numeric, etc.)
b <- t(as.data.frame(a))
rownames(b) <- c()
labels$var_type <- b
labels$resp_opts <-NA
for (x in 1:length(labels$var_name)) {
if (labels$var_type[x]=="factor") {
labels$resp_opts[x] <- str_flatten(levels(mypy[,x]), collapse= ", ")
}
}
names(labels) <- c("description", "var_name", "var_type", "resp_opts")
labels <- move_columns(labels, var_name, .before="description")
Variables with high rates of missingness:
* school_code: missing for all North Dublin schools (use
‘local_schoolcode’ instead)
* vapeharm1 and vapeharm2: an error in the way these Qs were asked
required us to remove all N.Dublin responses to these Qs
* skip2 (“how many whole days were you absent from school because you
skipped/cut class”) possibly not answered due to fears of being caught
out
All other variables have missingness rates <15%
# Calculate the percentage of missing information for each participant (each row)
mypy$pct.missing <- round(rowSums(is.na(mypy))/length(mypy),2)
mypy <- move_columns(mypy, pct.missing, .after="county")
mypy$missingSDQ <- is.na(mypy$sdqtot)
table(mypy$missingSDQ)%>%prop.table() # 11.58% of participants missing an SDQ total score
##
## FALSE TRUE
## 0.8841962 0.1158038
# Plot the average % total missing data, by school
ggplot(mypy)+
stat_summary(aes(x= local_schoolcode, y= pct.missing, fill=county), fun.y = mean, geom = "bar")+
theme(axis.text.x = element_text(angle=90, vjust=1, hjust=1))
# which variables were most often missing (limit to those with >10% (n>400) missing)
x= as.data.frame(colSums(is.na(mypy)))
x$var <- rownames(x)
names(x) <- c("num.missing", "var")
ggplot(x[x$num.missing>400,])+
stat_summary(aes(x=var, y= num.missing), fun.y = mean, geom = "bar")+
theme(axis.text.x = element_text(angle=90, vjust=1, hjust=1, size=10))
# for school ND12, what vars are mostly missing? Ans: The ones at the end of the survey
school_x <- mypy[mypy$local_schoolcode==" ND12",]
x= as.data.frame(colSums(is.na(school_x)))
x$var <- rownames(x)
names(x) <- c("num.missing", "var")
print(x[order(-x$num.missing)[1:10],])
## num.missing var
## school_code 144 school_code
## vapeharm1 144 vapeharm1
## vapeharm2 144 vapeharm2
## sdqemot 52 sdqemot
## SDQ24 51 SDQ24
## sdqhyp 51 sdqhyp
## sdqcond 51 sdqcond
## SDQ1 50 SDQ1
## SDQ2 50 SDQ2
## SDQ5 50 SDQ5
rm("x")
# for school PYC09, what vars are mostly missing?
# Ans: a few cannabis questions, bullying others, use of dating apps, vaping behaviours & perceptions
# Note: small sample size of this school, so high % of missing data shouldnt be dwelt upon
school_x <- mypy[mypy$local_schoolcode==" PYC09",]
x= as.data.frame(colSums(is.na(school_x)))
x$var <- rownames(x)
names(x) <- c("num.missing", "var")
print(x[order(-x$num.missing)[1:10],])
## num.missing var
## Bgf_cannabis 12 Bgf_cannabis
## press3 12 press3
## puse2 12 puse2
## bully_part3 12 bully_part3
## bullied_online 12 bullied_online
## sh3 12 sh3
## ecigons 12 ecigons
## vapeharm2 12 vapeharm2
## vapeharm5 12 vapeharm5
## diet2 12 diet2
rm("x")
# (1) save as csv file
write.csv(mypy, file="combinedpy_V3.csv")
# (2) save as STATA file (not . are not allowed in STATA var names)
mypy.4stata<-mypy
names(mypy.4stata) <- gsub("\\.", "_", names(mypy.4stata))
write_dta(mypy.4stata, path="Planet Youth/Data/combinedpy_V3.dta")
# (3) save as R data filee
save(mypy, file = "combinedpy_V3.RData")
# save variable descriptions as its own csv
write.csv(labels, file="varlabels_V3.csv")