#packages
library(flextable) #tables
library(tidyverse) #environment
library(mice) # imputation
library(janitor) #data cleaning
library(psych) #psychometrics (classical theory)
library(lavaan) #SEM
library(semPlot) #plot cfa models
library(semTools) #reliability sem models
library(DataExplorer) #missing
library(summarytools) #show data info
library(gridExtra) #arrange multiple plots in a grid
library(emmeans) #anova and posthoc
library(effsize) #Cohen's d
library(ggpubr) #publication-ready plots
library(cutpointr) #ROC
#round
options(scipen=1, digits=2)

This markdown supports two works:

- Psychometric properties and clinical utility of the executive function inventory for children and adolescents: a large multistage populational study including children with ADHD
- The clinical utility of psychometric tools: a real-data approach from a study including children with ADHD
The first is a published manuscript (DOI: https://doi.org/10.1080/21622965.2020.1726353) and the second is a book chapter.
This markdown, the data presented here, and the manuscripts are also used in my Psychometrics course at the Pontifical Catholic University of Rio de Janeiro (PUC-Rio), which partly explains how I organized this markdown document.
The first part of the code (Data handling) explains how to get the data and fix some issues. The data originally came as a messy Excel file, and several steps were needed to clean everything up. Since all of that has been done, you do not need to run these chunks again: just load the RData file and skip them. The Data handling chunks, for both Parents and Teachers, are therefore kept only to document what was done.

The second part of this markdown is Psychometrics, for parents and teachers; all tables of the first manuscript can be replicated there. Note that only the chunks with the eval = TRUE option need to be run; the others can be skipped. The ADHD group chunk contains preliminary code to build one solid dataset that includes both groups (typical and ADHD) with the same variables. I am well aware that this code could be placed at the beginning, which would make sense; however, these steps were intentionally arranged so the manuscript can be studied from different perspectives. In other words, if you want to dig into the psychometric models, the first sets of chunks are the useful ones; if you want to dig into the more specific part on clinical utility, the second part of this markdown is what matters.

The Clinical utility chunk uses the dataset composed of the two groups (typical and ADHD) and reproduces the results reported in the manuscript, as well as providing a partial replication of its figures. The ROC chunk relates to the book chapter, in which my co-authors and I introduce concepts such as the sensitivity and specificity of a psychological test. The last part of this document is Future work, which mixes temporary material produced along the way with ideas that would be suitable for future work.
Please feel free to ask for further information and/or to reach out to me at luisfac@puc-rio.br
Thanks, Luis
Please check https://osf.io/kx2ym/files/. I recommend importing and loading the R data file (Base R - EFICA (manuscript and book chapter)) instead of the raw files. However, feel free to get the raw data if you want to replicate these analyses from scratch.
#get file
load(file="C:/Users/luisf/Dropbox/Puc-Rio/Consultoria - Arruda/Base R - EFICA (manuscript and book chapter).RData")

#add sex
ds <- ds %>%
mutate(sex = if_else(sexo_certo == "1", "Female", "Male"))
#add race
ds <- ds %>%
mutate(race = if_else(cor == "1","White","non-white"))
#rename age
ds <- ds %>%
rename(age = idade)
#add guardian
ds <- ds %>%
mutate(guardian = recode(quem_preenche_certo,
"1" = "mother",
"2" = "father",
"3" = "other"))
# add schooling
ds <- ds %>%
rename(schooling = grau_instrucao)
# add socioeconomic
ds <- ds %>%
rename(socioeconomic = classe_economica)
#change values (tipico for typical)
ds <- ds %>% mutate(group = "typical")
#remove useless variables
ds <- ds %>% select(-c(pais_tct_metacog1,pais_tct_metacog2, pais_tct_regulacao, profs_tct_metacog, profs_tct_regulacao, quem_preenche))

#https://stackoverflow.com/questions/44930149/replace-a-subset-of-a-data-frame-with-dplyr-join-operations
replace_subset <- function(df, df_subset, id_col_names = c()) {
# work out which of the columns contain "new" data
new_data_col_names <- colnames(df_subset)[which(!colnames(df_subset) %in% id_col_names)]
# complete the df_subset with the extra columns from df
df_sub_to_join <- df_subset %>%
left_join(select(df, -new_data_col_names), by = c(id_col_names))
# join and bind rows
df_out <- df %>%
anti_join(df_sub_to_join, by = c(id_col_names)) %>%
bind_rows(df_sub_to_join)
return(df_out)
}
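A quick usage sketch of replace_subset() (toy data of my own, not from the study):

#toy example: overwrite the row with id = 2
df_main <- tibble(id = 1:3, score = c(10, 20, 30))
df_fix <- tibble(id = 2, score = 25)
replace_subset(df_main, df_fix, id_col_names = "id")
#rows with id 1 and 3 keep their scores; id 2 now has score = 25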
First, get a random sample by subsetting the original dataset into a smaller but representative one.
Then, I’ll create and run the model.
#create the model
mod_cfa_pais <- 'metacog1 =~ y12 + y26 + y8 + y1 + y17 + y20 + y10 + y47 + y48 + y60 + y55 + y6 + y7 + y57
regulacao =~ y21 + y23 + y19 + y52 + y28 + y3 + y4 + y50 + y45 + y24 + y25 + y22 + y18 + y54 + y59 + y14 + y42 + y16 + y11 + y53 + y58 + y32
metacog2 =~ y30 + y41 + y31 + y29 + y35 + y38 + y37 + y2 + y40 + y13 + y46 + y43 + y62 + y44 + y9 + y27 + y33 + y39 + y65 + y34 + y36 + y56 + y5'
#run the model
cfa_pais <- cfa(mod_cfa_pais, #model name
data=ife_65_650, #dataset
estimator = 'WLSMV', #method for extracting factors
orthogonal=FALSE,
ordered=names(ife_65_650))
#reliability
rel_pais <- reliability(cfa_pais)

And get the results available in Table 2 (Item analysis, descriptives, and CFA results of EFICA-P – parents).
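The resulting object is a small matrix of coefficients (rows such as alpha and omega, one column per latent factor), so it can be inspected directly. Note that newer semTools versions deprecate reliability() in favor of compRelSEM().

#inspect the reliability coefficients per factor
round(rel_pais, 2)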
## Report the model
parameterEstimates(cfa_pais, standardized=TRUE) %>%
filter(op == "=~") %>%
mutate(stars = ifelse(pvalue < .001, "***",
ifelse(pvalue < .01, "**",
ifelse(pvalue < .05, "*", "")))) %>%
select('Latent Factor'=lhs,
Indicator=rhs,
B=est,
SE=se, Z=z,
Beta=std.all,
sig=stars) %>%
arrange(`Latent Factor`,desc(Beta)) %>%
flextable() %>%
colformat_num(., digits = 3)

Latent Factor | Indicator | B | SE | Z | Beta | sig |
metacog1 | y8 | 1.082 | 0.062 | 17.322 | 0.715 | *** |
metacog1 | y20 | 1.058 | 0.071 | 14.913 | 0.699 | *** |
metacog1 | y17 | 1.011 | 0.062 | 16.321 | 0.667 | *** |
metacog1 | y12 | 1.000 | 0.000 | | 0.660 | |
metacog1 | y26 | 0.994 | 0.048 | 20.628 | 0.656 | *** |
metacog1 | y48 | 0.993 | 0.066 | 14.950 | 0.655 | *** |
metacog1 | y7 | 0.964 | 0.071 | 13.652 | 0.637 | *** |
metacog1 | y1 | 0.957 | 0.065 | 14.832 | 0.632 | *** |
metacog1 | y47 | 0.935 | 0.061 | 15.245 | 0.617 | *** |
metacog1 | y10 | 0.880 | 0.065 | 13.489 | 0.581 | *** |
metacog1 | y55 | 0.868 | 0.063 | 13.860 | 0.573 | *** |
metacog1 | y60 | 0.860 | 0.064 | 13.418 | 0.568 | *** |
metacog1 | y57 | 0.802 | 0.069 | 11.685 | 0.529 | *** |
metacog1 | y6 | 0.752 | 0.080 | 9.349 | 0.497 | *** |
metacog2 | y46 | 1.988 | 0.262 | 7.584 | 0.734 | *** |
metacog2 | y27 | 1.928 | 0.251 | 7.671 | 0.712 | *** |
metacog2 | y37 | 1.838 | 0.240 | 7.668 | 0.679 | *** |
metacog2 | y44 | 1.829 | 0.243 | 7.529 | 0.675 | *** |
metacog2 | y5 | 1.715 | 0.219 | 7.831 | 0.633 | *** |
metacog2 | y62 | 1.662 | 0.225 | 7.385 | 0.613 | *** |
metacog2 | y31 | 1.645 | 0.216 | 7.631 | 0.607 | *** |
metacog2 | y2 | 1.641 | 0.226 | 7.256 | 0.606 | *** |
metacog2 | y35 | 1.631 | 0.213 | 7.641 | 0.602 | *** |
metacog2 | y13 | 1.569 | 0.226 | 6.939 | 0.579 | *** |
metacog2 | y38 | 1.525 | 0.215 | 7.092 | 0.563 | *** |
metacog2 | y34 | 1.481 | 0.200 | 7.418 | 0.547 | *** |
metacog2 | y43 | 1.455 | 0.202 | 7.207 | 0.537 | *** |
metacog2 | y40 | 1.399 | 0.199 | 7.038 | 0.517 | *** |
metacog2 | y36 | 1.355 | 0.184 | 7.374 | 0.500 | *** |
metacog2 | y9 | 1.352 | 0.194 | 6.963 | 0.499 | *** |
metacog2 | y56 | 1.241 | 0.189 | 6.565 | 0.458 | *** |
metacog2 | y41 | 1.225 | 0.182 | 6.727 | 0.452 | *** |
metacog2 | y65 | 1.183 | 0.181 | 6.530 | 0.437 | *** |
metacog2 | y39 | 1.125 | 0.175 | 6.414 | 0.415 | *** |
metacog2 | y29 | 1.017 | 0.121 | 8.433 | 0.376 | *** |
metacog2 | y30 | 1.000 | 0.000 | | 0.369 | |
metacog2 | y33 | 0.959 | 0.153 | 6.270 | 0.354 | *** |
regulacao | y16 | 1.114 | 0.049 | 22.936 | 0.785 | *** |
regulacao | y4 | 1.016 | 0.048 | 21.356 | 0.716 | *** |
regulacao | y21 | 1.000 | 0.000 | | 0.705 | |
regulacao | y45 | 0.997 | 0.047 | 21.320 | 0.703 | *** |
regulacao | y19 | 0.989 | 0.043 | 23.012 | 0.697 | *** |
regulacao | y23 | 0.978 | 0.044 | 22.380 | 0.690 | *** |
regulacao | y3 | 0.959 | 0.047 | 20.243 | 0.676 | *** |
regulacao | y28 | 0.934 | 0.049 | 19.191 | 0.658 | *** |
regulacao | y25 | 0.929 | 0.051 | 18.108 | 0.655 | *** |
regulacao | y59 | 0.928 | 0.047 | 19.738 | 0.654 | *** |
regulacao | y52 | 0.916 | 0.043 | 21.229 | 0.646 | *** |
regulacao | y14 | 0.911 | 0.049 | 18.743 | 0.642 | *** |
regulacao | y24 | 0.878 | 0.046 | 19.047 | 0.619 | *** |
regulacao | y18 | 0.873 | 0.051 | 16.998 | 0.615 | *** |
regulacao | y50 | 0.860 | 0.048 | 17.879 | 0.606 | *** |
regulacao | y53 | 0.855 | 0.055 | 15.677 | 0.603 | *** |
regulacao | y42 | 0.806 | 0.049 | 16.379 | 0.568 | *** |
regulacao | y22 | 0.802 | 0.051 | 15.802 | 0.565 | *** |
regulacao | y32 | 0.773 | 0.055 | 14.036 | 0.545 | *** |
regulacao | y11 | 0.766 | 0.058 | 13.157 | 0.540 | *** |
regulacao | y54 | 0.753 | 0.053 | 14.254 | 0.530 | *** |
regulacao | y58 | 0.630 | 0.056 | 11.221 | 0.444 | *** |
## aic chisq df pvalue cfi tli rmsea
## NA 4607.853 1649.000 0.000 0.965 0.963 0.053
fitMeasures(cfa_pais, c("chisq.scaled", "df.scaled", "pvalue.scaled", "cfi.scaled","tli.scaled","rmsea.scaled","srmr"))
## chisq.scaled df.scaled pvalue.scaled cfi.scaled tli.scaled
## 3716.879 1649.000 0.000 0.902 0.898
## rmsea.scaled srmr
## 0.044 0.069
Now I’ll plot this solution.
semPaths(cfa_pais, title = FALSE,
label.cex = 2, #font - variables
sizeLat = 7, #font latent
sizeMan = 4, #font - variables
edge.label.cex = 1.2, #lambda font
minimum = 0.1,
sizeInt = 0.8,
mar = c(1, 1, 1, 1), residuals = FALSE,
intercepts = FALSE, thresholds = FALSE, layout = "circle",
what = "std", #plot standardized estimates
cut = 0.3,
whatLabels = "hide", #hide edge labels
posCol = c("black", "dimgray"), #color of paths
color = list(lat = rgb(173, 216, 230, maxColorValue = 255),
man = rgb(240, 248, 255, maxColorValue = 255)))

First, I’ll include an age interval index.
ds <- ds %>%
mutate(age_group = cut(age, breaks = c(5,6,8,10,12,14, 16, 18, +Inf), include.lowest = TRUE)) %>% #idade was renamed to age above
mutate_if(is.factor,fct_explicit_na)
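A quick check that the binning behaved as intended (my own verification snippet):

#inspect the age intervals and their counts
ds %>% count(age_group)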
Then, I’ll add summative scores to the dataset.

#parents
ds <- ds %>%
mutate(parents_metacog1 = rowSums(select(., y12 , y26 , y8 , y1 , y17 , y20 , y10 , y47 , y48 , y60 , y55 , y6 , y7 , y57), na.rm = T)) %>%
mutate(parents_hotindex = rowSums(select(., y21 , y23 , y19 , y52 , y28 , y3 , y4 , y50 , y45 , y24 , y25 , y22 , y18 , y54 , y59 , y14 , y42 , y16 , y11 , y53 , y58 , y32), na.rm = T)) %>%
mutate(parents_metacog2 = rowSums(select(., y30 , y41 , y31 , y29 , y35 , y38 , y37 , y2 , y40 , y13 , y46 , y43 , y62 , y44 , y9 , y27 , y33 , y39 , y65 , y34 , y36 , y56 , y5), na.rm = T)) %>%
mutate(parents_total = parents_metacog1+parents_hotindex+parents_metacog2)
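One caveat worth flagging: rowSums(..., na.rm = TRUE) scores a respondent who skipped every item as 0. A more cautious sketch (my own, not used in the manuscript) prorates the sum and returns NA when too few items were answered:

#hypothetical helper: prorated sum, NA when fewer than 80% of the items were answered
prorated_sum <- function(data, items, min_prop = 0.8) {
x <- select(data, all_of(items))
answered <- rowSums(!is.na(x))
ifelse(answered >= min_prop * length(items),
rowMeans(x, na.rm = TRUE) * length(items),
NA_real_)
}
#usage with an illustrative subset of items: ds$parents_metacog1_alt <- prorated_sum(ds, c("y12", "y26", "y8"))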
And now I’ll report the normative data for the parents’ scale. To see whether sex-specific normative tables are needed, I’ll compare the results of boys and girls.

Table 4. Normative table for EFICA-P and EFICA-T results.
# By age
ds %>%
group_by(age_group) %>% #grouping
summarise_at(vars(parents_total),
list(mean = mean,
sd = sd,
p90 = ~quantile(., .9),
p95 = ~quantile(., .95),
sample = ~sum(!is.na(.)))) %>%
flextable()

age_group | mean | sd | p90 | p95 | sample |
[5,6] | 44 | 19 | 69 | 77 | 1017 |
(6,8] | 45 | 20 | 71 | 79 | 983 |
(8,10] | 43 | 20 | 70 | 76 | 807 |
(10,12] | 43 | 20 | 71 | 77 | 273 |
(12,14] | 42 | 22 | 72 | 81 | 88 |
(14,16] | 40 | 19 | 67 | 72 | 72 |
(16,18] | 31 | 18 | 57 | 60 | 44 |
First, get a random sample by subsetting the original dataset into a smaller but representative one.
Then, I’ll create and run the model.
#create the model
mod_cfa_professores <- 'MetaCog =~ i23 + i14 + i17 + i9 + i19 + i30 + i38 + i49 + i43 + i21 + i25 + i8 + i52 + i36 + i32 + i37 + i48 + i13 + i46 + i6 + i7 + i10 + i15 + i1 + i29 + i4 + i18 + i12 + i33 + i39 + i47 + i3
Regulacao =~ i26 + i45 + i42 + i28 + i2 + i27 + i34 + i16 + i50 + i5 + i24 + i22 + i20 + i41 + i35 + i44 + i40 + i31 + i51 + i11'
#run the model
cfa_prof <- cfa(mod_cfa_professores, #model name
data=ife_52_520, #dataset
estimator = 'WLSMV', #method for extracting factors
orthogonal=FALSE,
ordered=names(ife_52_520))
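For symmetry with the parents’ model, the factor reliabilities can be obtained the same way (a sketch; this call is not echoed in the original):

rel_prof <- reliability(cfa_prof)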
And get the results, as presented in Table 3 (Item analysis and CFA results – EFICA-T, teachers).

## Report the model
parameterEstimates(cfa_prof, standardized=TRUE) %>%
filter(op == "=~") %>%
mutate(stars = ifelse(pvalue < .001, "***",
ifelse(pvalue < .01, "**",
ifelse(pvalue < .05, "*", "")))) %>%
select('Latent Factor'=lhs,
Indicator=rhs,
B=est,
SE=se, Z=z,
Beta=std.all,
sig=stars) %>%
arrange(`Latent Factor`,desc(Beta)) %>%
flextable() %>%
colformat_num(., digits = 3)

Latent Factor | Indicator | B | SE | Z | Beta | sig |
MetaCog | i8 | 1.020 | 0.013 | 78.629 | 0.922 | *** |
MetaCog | i25 | 1.005 | 0.013 | 78.514 | 0.909 | *** |
MetaCog | i23 | 1.000 | 0.000 | | 0.904 | |
MetaCog | i19 | 0.990 | 0.015 | 67.904 | 0.895 | *** |
MetaCog | i48 | 0.986 | 0.016 | 62.499 | 0.892 | *** |
MetaCog | i14 | 0.973 | 0.014 | 70.083 | 0.880 | *** |
MetaCog | i21 | 0.972 | 0.015 | 65.262 | 0.879 | *** |
MetaCog | i30 | 0.969 | 0.015 | 63.116 | 0.876 | *** |
MetaCog | i17 | 0.968 | 0.015 | 64.253 | 0.875 | *** |
MetaCog | i49 | 0.966 | 0.016 | 60.570 | 0.874 | *** |
MetaCog | i29 | 0.963 | 0.022 | 44.651 | 0.871 | *** |
MetaCog | i36 | 0.957 | 0.016 | 58.206 | 0.866 | *** |
MetaCog | i12 | 0.943 | 0.018 | 51.511 | 0.853 | *** |
MetaCog | i52 | 0.933 | 0.022 | 41.675 | 0.844 | *** |
MetaCog | i9 | 0.931 | 0.019 | 50.117 | 0.843 | *** |
MetaCog | i13 | 0.920 | 0.020 | 45.391 | 0.832 | *** |
MetaCog | i1 | 0.919 | 0.020 | 44.840 | 0.831 | *** |
MetaCog | i32 | 0.918 | 0.022 | 42.234 | 0.830 | *** |
MetaCog | i33 | 0.911 | 0.021 | 43.136 | 0.824 | *** |
MetaCog | i3 | 0.911 | 0.030 | 30.005 | 0.824 | *** |
MetaCog | i39 | 0.905 | 0.024 | 37.075 | 0.818 | *** |
MetaCog | i43 | 0.905 | 0.020 | 45.196 | 0.818 | *** |
MetaCog | i37 | 0.895 | 0.024 | 37.411 | 0.810 | *** |
MetaCog | i46 | 0.887 | 0.020 | 43.452 | 0.802 | *** |
MetaCog | i18 | 0.875 | 0.027 | 32.896 | 0.791 | *** |
MetaCog | i38 | 0.872 | 0.023 | 37.454 | 0.789 | *** |
MetaCog | i4 | 0.851 | 0.024 | 35.552 | 0.770 | *** |
MetaCog | i6 | 0.844 | 0.024 | 34.550 | 0.763 | *** |
MetaCog | i7 | 0.827 | 0.026 | 32.004 | 0.748 | *** |
MetaCog | i47 | 0.827 | 0.031 | 27.095 | 0.748 | *** |
MetaCog | i10 | 0.815 | 0.027 | 30.518 | 0.737 | *** |
MetaCog | i15 | 0.757 | 0.032 | 23.635 | 0.685 | *** |
Regulacao | i44 | 1.104 | 0.036 | 30.870 | 0.920 | *** |
Regulacao | i51 | 1.102 | 0.039 | 27.938 | 0.918 | *** |
Regulacao | i28 | 1.091 | 0.032 | 34.075 | 0.909 | *** |
Regulacao | i40 | 1.080 | 0.037 | 29.150 | 0.899 | *** |
Regulacao | i35 | 1.050 | 0.033 | 31.492 | 0.875 | *** |
Regulacao | i41 | 1.047 | 0.036 | 28.714 | 0.872 | *** |
Regulacao | i11 | 1.037 | 0.039 | 26.295 | 0.864 | *** |
Regulacao | i22 | 1.024 | 0.035 | 28.954 | 0.854 | *** |
Regulacao | i34 | 1.017 | 0.036 | 28.603 | 0.848 | *** |
Regulacao | i27 | 1.016 | 0.034 | 30.101 | 0.846 | *** |
Regulacao | i45 | 1.011 | 0.028 | 36.371 | 0.842 | *** |
Regulacao | i31 | 1.009 | 0.038 | 26.890 | 0.841 | *** |
Regulacao | i26 | 1.000 | 0.000 | | 0.833 | |
Regulacao | i2 | 0.994 | 0.037 | 26.522 | 0.828 | *** |
Regulacao | i5 | 0.993 | 0.039 | 25.241 | 0.828 | *** |
Regulacao | i16 | 0.992 | 0.038 | 26.313 | 0.827 | *** |
Regulacao | i50 | 0.985 | 0.039 | 24.950 | 0.821 | *** |
Regulacao | i42 | 0.976 | 0.038 | 25.818 | 0.813 | *** |
Regulacao | i24 | 0.968 | 0.037 | 26.302 | 0.806 | *** |
Regulacao | i20 | 0.951 | 0.036 | 26.596 | 0.792 | *** |
## aic chisq df pvalue cfi tli rmsea
## NA 5151.299 1273.000 0.000 0.991 0.991 0.077
fitMeasures(cfa_prof, c("chisq.scaled", "df.scaled", "pvalue.scaled", "cfi.scaled","tli.scaled","rmsea.scaled","srmr"))
## chisq.scaled df.scaled pvalue.scaled cfi.scaled tli.scaled
## 3678.288 1273.000 0.000 0.955 0.953
## rmsea.scaled srmr
## 0.060 0.074
Now I’ll plot this solution.
semPaths(cfa_prof, title = FALSE,
label.cex = 2, #font - variables
sizeLat = 7, #font latent
sizeMan = 4, #font - variables
edge.label.cex = 1.2, #lambda font
minimum = 0.1,
sizeInt = 0.8,
mar = c(1, 1, 1, 1), residuals = FALSE,
intercepts = FALSE, thresholds = FALSE, layout = "circle",
what = "std", #plot standardized estimates
cut = 0.3,
whatLabels = "hide", #hide edge labels
posCol = c("black", "dimgray"), #color of paths
color = list(lat = rgb(173, 216, 230, maxColorValue = 255),
man = rgb(240, 248, 255, maxColorValue = 255)))

First, I’ll add a summative score to the dataset.
ds <- ds %>%
mutate(teachers_metacog = rowSums(select(., i23 , i14 , i17 , i9 , i19 , i30 , i38 , i49 , i43 , i21 , i25 , i8 , i52 , i36 , i32 , i37 , i48 , i13 , i46 , i6 , i7 , i10 , i15 , i1 , i29 , i4 , i18 , i12 , i33 , i39 , i47 , i3), na.rm=T)) %>%
mutate(teachers_hotindex = rowSums(select(., i26 , i45 , i42 , i28 , i2 , i27 , i34 , i16 , i50 , i5 , i24 , i22 , i20 , i41 , i35 , i44 , i40 , i31 , i51 , i11), na.rm=T)) %>%
mutate(teachers_total = teachers_metacog+teachers_hotindex)

I’ll report the normative data for the teachers’ scale, as presented in Table 4 (Normative table for EFICA-P and EFICA-T results).
##
## ANOVA results using teachers_total as the dependent variable
##
## Predictor            SS         df   MS       F      p    partial_eta2 CI_90_partial_eta2
## (Intercept)          75321.98   1    75321.98 163.49 .000
## factor(sex)          205.34     1    205.34   0.45   .504 .00          [.00, .00]
## age                  203.27     1    203.27   0.44   .507 .00          [.00, .00]
## factor(sex) x age    7979.86    1    7979.86  17.32  .000 .01          [.00, .01]
## Error                1511175.67 3280 460.72
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
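The chunk that generated this ANOVA table is not echoed in this document. A minimal sketch that produces this table format, assuming the original used apaTables (the call and formula are my reconstruction):

#assumed reconstruction: sex-by-age ANOVA on the teachers' total score
library(apaTables)
apa.aov.table(lm(teachers_total ~ factor(sex) * age, data = ds))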
# By age
ds %>%
group_by(age_group, sex) %>% #grouping
summarise_at(vars(teachers_total),
list(mean = mean,
sd = sd,
p90 = ~quantile(., .9),
p95 = ~quantile(., .95),
sample = ~sum(!is.na(.)))) %>%
flextable()

age_group | sex | mean | sd | p90 | p95 | sample |
[5,6] | Female | 21 | 20 | 49 | 61 | 498 |
[5,6] | Male | 27 | 22 | 58 | 70 | 519 |
(6,8] | Female | 22 | 20 | 51 | 60 | 492 |
(6,8] | Male | 29 | 23 | 61 | 74 | 491 |
(8,10] | Female | 20 | 19 | 46 | 55 | 414 |
(8,10] | Male | 29 | 23 | 65 | 72 | 393 |
(10,12] | Female | 21 | 18 | 42 | 55 | 128 |
(10,12] | Male | 34 | 24 | 65 | 77 | 145 |
(12,14] | Female | 20 | 23 | 56 | 65 | 46 |
(12,14] | Male | 38 | 29 | 78 | 91 | 42 |
(14,16] | Female | 20 | 22 | 52 | 53 | 35 |
(14,16] | Male | 45 | 26 | 75 | 98 | 37 |
(16,18] | Female | 24 | 20 | 52 | 58 | 25 |
(16,18] | Male | 29 | 32 | 69 | 101 | 19 |
For this set of analyses, I’ll create a new dataset that contains both typical (n = 3284) and ADHD (n = 165) children. No missing-value imputation will be applied to the ADHD dataset. Parents’ and teachers’ data are both handled in the chunks below.
First, I’ll define a dataset for this group.
Then I’ll add and rename specific variables.
#add sex
ds_adhd <- ds_adhd %>%
mutate(sex = if_else(sexo == "1", "Female", "Male"))
#add race
ds_adhd <- ds_adhd %>%
mutate(race = if_else(cor == "1","White","non-white"))
#rename age
ds_adhd <- ds_adhd %>%
rename(age = idade)
#add guardian
ds_adhd <- ds_adhd %>%
mutate(guardian = recode(quem_preenche,
"1" = "mother",
"2" = "father",
"3" = "other"))
# add socioeconomic
ds_adhd <- ds_adhd %>%
rename(socioeconomic = classe_economica)
#age group
ds_adhd <- ds_adhd %>%
mutate(age_group = cut(age, breaks = c(5,6,8,10,12,14, 16, 18, +Inf), include.lowest = TRUE))

Then I’ll check if this dataset is reliable.
# Parents
ds_adhd <- ds_adhd %>%
# Copy variables
mutate_at(vars(if_epa_1:if_epa_65), list(cat = ~paste0(.))) %>% #<-- here labels are different
rename_at(vars(ends_with( "_cat")), list(~paste0("y", 1:65))) %>%
# Transform into numeric
mutate_at(vars(y1:y65), ~as.numeric(.)) %>%
# Round values if necessary - Parents
mutate_at(vars(y1:y65), ~round(.,0))
# Teachers
ds_adhd <- ds_adhd %>%
mutate_at(vars(if_epr_1:if_epr_52), list(prof = ~paste0(.))) %>% #<-- here labels are different again
rename_at(vars(ends_with( "_prof")), list(~paste0("i", 1:52))) %>%
## Transform into numeric - teachers
mutate_at(vars(i1:i52), ~as.numeric(.)) %>%
## Round values if necessary - teachers
mutate_at(vars(i1:i52), ~round(.,0))

I’ll also check the distributions (outliers).
ds_adhd %>% select(y1:y65) %>%
pivot_longer(everything()) %>%
ggplot(., aes(name, value)) + geom_boxplot()
ds_adhd %>% select(i1:i52) %>%
pivot_longer(everything()) %>%
ggplot(., aes(name, value)) + geom_boxplot()

and replace coding errors with missing values (NA) - first for parents, then teachers:
ds_adhd <- ds_adhd %>%
mutate_at(vars(y1:y65), ~replace(., . > 2 | . < 0, NA))
ds_adhd <- ds_adhd %>%
mutate_at(vars(i1:i52), ~replace(., . > 2 | . < 0, NA))
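A quick sanity check that the remaining item values fall in the expected 0-2 range (my own snippet):

#all item responses should now lie between 0 and 2
range(select(ds_adhd, y1:y65), na.rm = TRUE)
range(select(ds_adhd, i1:i52), na.rm = TRUE)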
Now check the missing values.

#parents
ds_adhd %>% select(y1:y65) %>% summarise(sum(is.na(.)))
ds_adhd %>% select(y1:y65) %>% plot_missing()
#teachers
ds_adhd %>% select(i1:i52) %>% summarise(sum(is.na(.)))
ds_adhd %>% select(i1:i52) %>% plot_missing()

Then I’ll add summative scores.
ds_adhd <- ds_adhd %>%
#parents
mutate(parents_metacog1 = rowSums(select(., y12 , y26 , y8 , y1 , y17 , y20 , y10 , y47 , y48 , y60 , y55 , y6 , y7 , y57), na.rm = T)) %>%
mutate(parents_hotindex = rowSums(select(., y21 , y23 , y19 , y52 , y28 , y3 , y4 , y50 , y45 , y24 , y25 , y22 , y18 , y54 , y59 , y14 , y42 , y16 , y11 , y53 , y58 , y32), na.rm = T)) %>%
mutate(parents_metacog2 = rowSums(select(., y30 , y41 , y31 , y29 , y35 , y38 , y37 , y2 , y40 , y13 , y46 , y43 , y62 , y44 , y9 , y27 , y33 , y39 , y65 , y34 , y36 , y56 , y5), na.rm = T)) %>%
mutate(parents_total = parents_metacog1+parents_hotindex+parents_metacog2) %>%
#teachers
mutate(teachers_metacog = rowSums(select(., i23 , i14 , i17 , i9 , i19 , i30 , i38 , i49 , i43 , i21 , i25 , i8 , i52 , i36 , i32 , i37 , i48 , i13 , i46 , i6 , i7 , i10 , i15 , i1 , i29 , i4 , i18 , i12 , i33 , i39 , i47 , i3), na.rm=T)) %>% #<- consider missing
mutate(teachers_hotindex = rowSums(select(., i26 , i45 , i42 , i28 , i2 , i27 , i34 , i16 , i50 , i5 , i24 , i22 , i20 , i41 , i35 , i44 , i40 , i31 , i51 , i11), na.rm=T)) %>% #<- consider missing
mutate(teachers_total = teachers_metacog+teachers_hotindex)

Now I’ll create a new dataset including the variables common to the two groups.
#check similarities and differences
base_1 <- ds %>%
rename_at(vars(snap_1:snap_18, -contains("int")), ~paste0("sna_ppa_",1:18)) %>%
rename_at(vars(snap_1_1:snap_18_1, -contains("int")), ~paste0("sna_ppr_",1:18))
#get names
base_1 <- base_1 %>% names %>% data.frame %>% mutate(base = "general") %>% setNames(.,c("var","base"))
base_2 <- ds_adhd %>% names %>% data.frame %>% mutate(base = "adhd")%>% setNames(.,c("var","base"))
#check names
bind_rows(base_1,base_2) %>% #get both datasets
mutate_at(vars(var),
~ tolower(.) %>% stringi::stri_trans_general(.,"Latin-ASCII")) %>% #just fix name issues
group_by(var) %>% #group by variable
mutate(n = n()) %>% #add counts
filter(n==1) %>% #filter for exclusive variables
#arrange(n) %>%
print(n=nrow(.)) #display
rm(base_1, base_2)

Now merge the datasets.
#merge datasets
base_1 <- ds %>%
rename_at(vars(snap_1:snap_18, -contains("int")), ~paste0("sna_ppa_",1:18)) %>%
rename_at(vars(snap_1_1:snap_18_1, -contains("int")), ~paste0("sna_ppr_",1:18)) %>%
mutate_at(vars(starts_with("sdq"), starts_with("sna_ppr_")), ~as.numeric(.))
base_2 <- ds_adhd %>%
mutate(group = "adhd") %>%
rename(no_x = no)
common_names <- function(base1, base2){
#create two datasets to storage the colnames
base1 <- base1 %>% names %>% data.frame %>% mutate(base = "base1") %>% setNames(.,c("var","base"))
base2 <- base2 %>% names %>% data.frame %>% mutate(base = "base2") %>% setNames(.,c("var","base"))
#merge
lista_comum <- bind_rows(base1,base2) %>% #get both datasets
mutate_at(vars(var),
~ tolower(.) %>% stringi::stri_trans_general(.,"Latin-ASCII")) %>% #just fix name issues
group_by(var) %>% #group by variable
mutate(n = n()) %>% #add counts
filter(n==2) %>% #keep variables present in both datasets
arrange(var) %>%
#distinct(var, .keep_all = TRUE) %>%
unstack(.,var~base) #unstack
var_comum <<- lista_comum %>% pull(base1)
}
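A side note on the design: common_names() writes var_comum to the global environment via <<-. An equivalent that simply returns the vector (a sketch, keeping the same name normalization) would be:

#sketch: return the common names instead of assigning globally
common_names2 <- function(base1, base2) {
norm <- function(x) stringi::stri_trans_general(tolower(x), "Latin-ASCII")
names(base1)[norm(names(base1)) %in% norm(names(base2))]
}
#usage: var_comum <- common_names2(base_1, base_2)

The original version is kept and called below.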
common_names(base1 = base_1, base2 = base_2)
#create a dataset with typical and adhd children with same variables
ds_full <- bind_rows(base_1, base_2) %>%
select(all_of(var_comum)) %>%
select(group, parents_total, teachers_total,
parents_metacog1, parents_hotindex, parents_metacog2, teachers_metacog, teachers_hotindex, everything())

Check if everything matches.
ds_full %>%
group_by(group) %>%
select(parents_total, teachers_total,
parents_metacog1, parents_hotindex, parents_metacog2, teachers_metacog, teachers_hotindex) %>%
descr()
ds_full <- ds_full %>%
mutate(group = factor(group, levels=c("typical", "adhd")))
cor(ds_full %>% filter(group == "typical") %>% select(sdq_1),ds$sdq_1, use = "complete.obs")
cor(ds_full %>% filter(group == "adhd") %>% select(sdq_1),ds_adhd$sdq_1, use = "complete.obs")

Remove unnecessary variables.
lm(parents_total ~ factor(group), data = ds_full) %>% summary()
lm(teachers_total ~ factor(group), data = ds_full) %>% summary()

ggplot(ds_full, aes(x = parents_total, group = group, fill = group)) +
geom_density() +
scale_fill_manual(values = c("#0072B250","#D55E0050"), labels = c("Typical", "ADHD")) +
geom_segment(data = ds_full %>%
group_by(group) %>%
summarize(mean = mean(parents_total), sd = sd(parents_total)), aes(x = mean, xend = mean, y = 0, yend = 0.022),
size = 1, alpha=0.5, lty="21", color = "red") +
geom_text(data = ds_full %>%
group_by(group) %>%
summarize(mean = mean(parents_total), sd = sd(parents_total)),
aes(x = mean, y = 0.005,label = paste0("μ = ",round(mean, 2))),position = position_stack(vjust = .5)) +
theme_bw() +
xlim(-10,130) +
labs(x = "Parents score", y = "", fill = "Group")

library(ggridges) #geom_density_ridges() and theme_ridges() come from ggridges
#means per group for the text labels below (this object was not defined anywhere above)
density_lines <- ds_full %>%
group_by(group) %>%
summarize(mean = mean(parents_total))
ggplot(ds_full, aes(x = parents_total, y = group, fill = group)) +
geom_density_ridges(scale = 10,
size = 0.25,
rel_min_height = 0.03,
# quantile_lines = T,
alpha = .8,
color = "black") +
scale_fill_manual(values = c("#0072B250","#D55E0050"), labels = c("Typical", "ADHD")) +
theme_ridges(center = TRUE) +
geom_text(
data = density_lines,
aes(x = mean, y = group, label = paste("mean = ",round(mean, 2))),
size = 4, nudge_x = 0.03, nudge_y = 0.5)

ggplot(ds_full, aes(x = teachers_total, group = group, fill = group)) +
geom_density() +
scale_fill_manual(values = c("#0072B250","#D55E0050"), labels = c("Typical", "ADHD")) +
geom_segment(data = ds_full %>%
group_by(group) %>%
summarize(mean = mean(teachers_total), sd = sd(teachers_total)), aes(x = mean, xend = mean, y = 0, yend = 0.025),
size = 1, alpha=0.5, lty="21", color = "red") +
geom_text(data = ds_full %>%
group_by(group) %>%
summarize(mean = mean(teachers_total), sd = sd(teachers_total)),
aes(x = mean, y = 0.005,label = paste0("μ = ",round(mean, 2))),position = position_stack(vjust = .5)) +
theme_bw() +
xlim(-10,120) +
labs(x = "Teachers score", y = "", fill = "Group")

This chunk refers to the book chapter.
The function below makes it possible to plot the score distributions and inspect the classification tables.
#get dataset
ds_full %>%
select(group, age_group,parents_total, teachers_total) %>%
mutate(diag = if_else(group == "typical",0,1)) -> dados_predict

#EFICA P
cutoff_efica_p <- cutpointr(dados_predict, parents_total, diag,
method = maximize_metric,
metric = youden)
summary(cutoff_efica_p)
## Method: maximize_metric
## Predictor: parents_total
## Outcome: diag
## Direction: >=
##
## AUC n n_pos n_neg
## 0.88 3449 165 3284
##
## optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
## 55 0.61 0.72 0.9 0.71 149 16 955 2329
##
## Predictor summary:
## Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
## overall 0 14 30 44 45 59 81 113 21 0
## 0 0 13 29 43 44 57 77 113 20 0
## 1 34 47 61 73 75 88 104 111 17 0
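As a sanity check on the summary above: the Youden index is simply J = sensitivity + specificity − 1 = 0.90 + 0.71 − 1 = 0.61, which matches the reported value.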
#plot_metric_boot(cutoff_efica_p)
#plot_sensitivity_specificity(cp)
#plot_roc(cutoff_efica_p) + ggtitle("ROC curve for EFICA-P")
#plot_cutpointr(cp, xvar = cutpoints, yvar = sum_sens_spec, conf_lvl = 0.9)
#set.seed(123)
#opt_cut <- cutpointr(dados_predict, parents_total, diag, direction = ">=",
# pos_class = 1,
# neg_class = 0,
# method = maximize_metric, metric = youden, boot_runs = 1000)
#plot_metric(opt_cut)
#summary(opt_cut)
#https://cran.r-project.org/web/packages/cutpointr/vignettes/cutpointr.html

#EFICA T
cutoff_efica_t <- cutpointr(dados_predict, teachers_total, diag,
method = maximize_metric,
metric = youden)
summary(cutoff_efica_t)
## Method: maximize_metric
## Predictor: teachers_total
## Outcome: diag
## Direction: >=
##
## AUC n n_pos n_neg
## 0.92 3449 165 3284
##
## optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
## 45 0.7 0.81 0.89 0.81 147 18 626 2658
##
## Predictor summary:
## Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
## overall 0 0 7 22 27 42 74 104 24 0
## 0 0 0 6 21 25 39 68 104 22 0
## 1 0 35 56 70 68 81 95 104 19 0
grid.arrange(plot_roc(cutoff_efica_p) + ggtitle("ROC curve for EFICA-P") + theme_bw(),
plot_roc(cutoff_efica_t) + ggtitle("ROC curve for EFICA-T") + theme_bw(),
ncol = 2, nrow = 1)

Now, I’ll present the trade-off between sensitivity and specificity. In the near future, this may become a Shiny app.
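For reference, the quantities computed inside the function below are sensitivity = TP / (TP + FN) and specificity = TN / (TN + FP). With the EFICA-P cut-off of 55 found above, that gives sensitivity = 149 / (149 + 16) ≈ 0.90 and specificity = 2329 / (2329 + 955) ≈ 0.71.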
plot_efica_roc_curve <- function(cutoff) {
#display table
tabela_roc <- dados_predict %>%
mutate(test_result = if_else(parents_total>=cutoff,"(1)Positive","(2)Negative")) %>%
mutate(diag = if_else(diag==1,"ADHD","Typical")) %>%
group_by(test_result,diag) %>%
count() %>%
pivot_wider(names_from = diag, values_from = n) %>%
ungroup() %>%
add_row(test_result = "Sens&Spec",
"ADHD" = .$ADHD[1]/(.$ADHD[1]+.$ADHD[2]),
"Typical" = .$Typical[2]/(.$Typical[2]+.$Typical[1])) %>%
add_row(test_result = "Accuracy",
"ADHD" = (.$ADHD[1]+.$Typical[2])/(.$ADHD[1]+.$ADHD[2]+.$Typical[2]+.$Typical[1]),
"Typical" = 0) %>%
print() #print and return the table being built by this pipe
#get means for plotting
mean_typical <- dados_predict %>% filter(group == "typical") %>% summarise(mean(parents_total)) %>% pull()
sd_typical <- dados_predict %>% filter(group == "typical") %>% summarise(sd(parents_total)) %>% pull()
mean_adhd <- dados_predict %>% filter(group == "adhd") %>% summarise(mean(parents_total)) %>% pull()
sd_adhd <- dados_predict %>% filter(group == "adhd") %>% summarise(sd(parents_total)) %>% pull()
#plot
gridExtra::grid.arrange(
ggplot(data = data.frame(u = c(-20, 150)),
aes(x = u)) +
geom_density(stat = "function",
alpha=0.6,
fun = dnorm,
mapping = aes(fill='Typical'),
args = list(mean = mean_typical,
sd = sd_typical)) +
geom_density(stat = "function",
alpha=0.6,
fun = dnorm,
mapping = aes(fill='Clinical'),
args = list(mean = mean_adhd,
sd = sd_adhd)) +
geom_vline(xintercept = cutoff, size=2) +
geom_text(y = 0.02, x = cutoff-15, label = " - \n (no disease)") +
geom_text(y = 0.02, x = cutoff+15, label = " + \n (disease)") +
geom_text(y = 0.011, x = 130,
label = paste("Cut-off = ",cutoff)) + #cut-off
geom_text(y = 0.009, x = 130,
label = paste("Sensitivity = ",round(tabela_roc$ADHD[3],2))) + #that's the sensitivity
geom_text(y = 0.007, x = 130, label =
paste("Specificity =",round(tabela_roc$Typical[3],2))) + #that's the specificity
geom_text(y = 0.005, x = 130, label =
paste("Accuracy =",round(tabela_roc$ADHD[4],2))) + #that's the accuracy
scale_fill_grey() +
theme_void() +
theme(legend.position = "none") ,
#ROC CURVE plot
ggplot(dados_predict, aes(d = diag, m = parents_total)) +
geom_vline(xintercept = 1-(tabela_roc$Typical[3]), color = 'grey50') + #1-specificity (False positive rate)
geom_hline(yintercept = (tabela_roc$ADHD[3]), color = 'grey50') + #Sensitivity (True positive)
plotROC::geom_roc(cutoffs.at = cutoff,size = 0.5, size.point = 0.5) +
plotROC::style_roc() + geom_abline(slope = 1, intercept = 0, color = 'grey50')
#geom_text(y = 0.1, x = 0.90, label = paste("AUC ",round(plotROC::calc_auc(groc)[3],2))) #in case I want the area under the curve
)
}

## # A tibble: 4 x 3
## test_result ADHD Typical
## <chr> <dbl> <dbl>
## 1 (1)Positive 149 955
## 2 (2)Negative 16 2329
## 3 Sens&Spec 0.903 0.709
## 4 Accuracy 0.718 0
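The table above matches the optimal EFICA-P cut-off found earlier. Assuming that value, it can be reproduced with the call below (the original call was not echoed in the document):

#assumed call, using the cut-off suggested by cutpointr above
plot_efica_roc_curve(cutoff = 55)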
#https://rviews.rstudio.com/2019/03/01/some-r-packages-for-roc-curves/
#http://www.biosoft.hacettepe.edu.tr/easyROC/
#amazing: https://kennis-research.shinyapps.io/ROC-Curves/ and http://www.navan.name/roc/
library("OptimalCutpoints")
dados_predict <- as.data.frame(dados_predict)
#compute the best cut-off
optimal.cutpoint.Youden <- optimal.cutpoints(X = "parents_total",
status = "diag",
tag.healthy = 0,
methods = "Youden",
data = dados_predict,
pop.prev = NULL,
control = control.cutpoints(),
ci.fit = FALSE,
conf.level = 0.95,
trace = FALSE)
#recompute with the generalized Youden index and confidence intervals
optimal.cutpoint.Youden <- optimal.cutpoints(X = "parents_total", status = "diag", tag.healthy = 0,
methods = "Youden", data = dados_predict, pop.prev = NULL,
control = control.cutpoints(generalized.Youden = TRUE), ci.fit = TRUE, conf.level = 0.95,
trace = FALSE)
#get summary
summary(optimal.cutpoint.Youden)
#print
print(optimal.cutpoint.Youden)
#plot
plot(optimal.cutpoint.Youden, col = "blue")
#https://rpubs.com/thisisrg/supp_code_17 -- interesting
library(pROC)
#the next call would need dados_predict$prediction, which is only created in the caret section below
#roc_obj <- roc(response=dados_predict$diag, predictor=factor(dados_predict$prediction, ordered = TRUE), plot=TRUE)
roc_obj <- roc(response=as.numeric(dados_predict$diag), predictor=dados_predict$parents_total, plot=TRUE)
plot(roc_obj, col="red", lwd=3, main="ROC curve (EFICA-P)")
auc_qda<-auc(roc_obj)
coords(roc_obj, x="best", input="threshold", best.method="youden", transpose = TRUE) #
#library(InformationValue)
#optCutOff <- optimalCutoff(logitMod$diag, predicted)[1]
#http://r-statistics.co/Logistic-Regression-With-R.html
library(ROCR)
predictions <- prediction(dados_predict$parents_total, dados_predict$diag)
predictions
perf <- performance(predictions,"tpr","fpr")
plot(perf,
avg="threshold",
spread.estimate="boxplot")
plot(perf)
perf <- performance(predictions, "tpr", "fpr")
plot(perf,
avg= "threshold",
colorize=TRUE,
lwd= 3,
main= "With ROCR you can produce standard plots\nlike ROC curves ...")
plot(perf,
lty=3,
col="grey78",
add=TRUE)
sens <- data.frame(x=unlist(performance(predictions, "sens")@x.values),
y=unlist(performance(predictions, "sens")@y.values))
spec <- data.frame(x=unlist(performance(predictions, "spec")@x.values),
y=unlist(performance(predictions, "spec")@y.values))
sens %>% ggplot(aes(x,y)) +
geom_line() +
geom_line(data=spec, aes(x,y,col="red")) +
scale_y_continuous(sec.axis = sec_axis(~., name = "Specificity")) +
labs(x='Cutoff', y="Sensitivity") +
theme(axis.title.y.right = element_text(colour = "red"), legend.position="none") +
geom_vline(xintercept = 61)
#https://stackoverflow.com/questions/23240182/deciding-threshold-for-glm-logistic-regression-model-in-r
sens = cbind(unlist(performance(predictions, "sens")@x.values), unlist(performance(predictions, "sens")@y.values))
spec = cbind(unlist(performance(predictions, "spec")@x.values), unlist(performance(predictions, "spec")@y.values))
sens[which.min(apply(sens, 1, function(x) min(colSums(abs(t(spec) - x))))), 1]

ds_full %>%
filter(age_group == "[5,6]") %>%
select(group, parents_total) %>%
mutate(diag = factor(if_else(group == "typical",0,1))) -> dados_predict
str(dados_predict)
#https://community.rstudio.com/t/how-to-choose-best-threshold-value-automatically/12910
library(caret)
controlparameter <- trainControl(method="cv",
number=10,
savePredictions = TRUE)
##Apply Logistic Regression with K fold Cross Validation
##train
modelglm <- train(diag ~ parents_total,data = dados_predict,
method="glm",
family="binomial",
trControl=controlparameter)
modelglm
## Make prediction with mymodelglm to train data
prediction <- predict(modelglm,dados_predict)
#add to dataset
dados_predict$prediction <- prediction
##Validation of Prediction
confu_mat <- table(prediction=prediction, actual=dados_predict$diag)
confu_mat
confusionMatrix(data=dados_predict$prediction, reference=dados_predict$diag, positive = "1")