#packages
library(flextable) #tables
library(tidyverse) #environment

library(mice) # imputation

library(janitor)

library(psych) #psychometrics (classical theory)
library(lavaan) #SEM
library(semPlot) #plot cfa models
library(semTools) #reliability sem models

library(DataExplorer) #missing
library(summarytools) #show data info
library(gridExtra) #add features for grid

library(emmeans) #anova and posthoc
library(effsize) #cohen D

library(ggpubr) #publication-ready plots
library(ggridges) #ridge plots (geom_density_ridges, theme_ridges)

library(cutpointr) #ROC

#round
options(scipen=1, digits=2)

1 Cautionary notes

  1. This markdown relates to two different manuscripts:

- Psychometric properties and clinical utility of the executive function inventory for children and adolescents: a large multistage populational study including children with ADHD

- The clinical utility of psychometric tools: a real-data approach from a study including children with ADHD

The first is a published manuscript (DOI: https://doi.org/10.1080/21622965.2020.1726353) and the second is a book chapter.

  2. This markdown, the data presented here, and the manuscripts are also used in my Psychometrics course at the Pontifical Catholic University of Rio de Janeiro (PUC-Rio), which partially explains the way I built this markdown document.

  3. The first part of the code (Data handling) explains how to get the data and fix some issues. Originally, the data was a messy Excel file, and several steps were needed to clean it up. Since that work is already done, you do not need to run these chunks again; just load the RData file and skip them. The Data handling chunks, for both Parents and Teachers, are therefore only useful for understanding what was done. The second part of this markdown is Psychometrics, for parents and teachers, where all tables of the first manuscript can be replicated. Note that chunks with the eval = TRUE option are meant to be run, whereas the other chunks are not. The ADHD group chunk contains preliminary code to build one consolidated dataset including both groups (typical and ADHD) with the same variables for each. I am well aware that this code could be placed at the beginning, which would make sense. However, these steps were intentionally arranged so the manuscripts can be studied from different perspectives: readers who want to dig into the psychometric models will find the first sets of chunks useful, while readers interested in the more specific topic of clinical utility will find the second part of this markdown more useful. The Clinical utility chunk uses the dataset composed of the two groups (typical and ADHD), reproduces the results reported in the manuscript, and provides a partial replication of its figures. Finally, the ROC chunk relates to the book chapter, in which my co-authors and I introduced concepts such as the sensitivity and specificity of a psychological test. The last part of this document is Future work, which mixes temporary material produced along the way with what would be suitable for future work.

Please feel free to ask for further information and/or to reach out to me.

Thanks, Luis

2 Get data

Please check https://osf.io/kx2ym/files/. I recommend importing and loading the R file (Base R - EFICA (manuscript and book chapter).RData) instead of the raw files. However, feel free to get the raw data if you want to replicate these analyses.

#get file
load(file="C:/Users/luisf/Dropbox/Puc-Rio/Consultoria - Arruda/Base R - EFICA (manuscript and book chapter).RData")
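
If you do not have a local copy of the RData file, it can be fetched directly from the OSF project with the osfr package. This is a minimal sketch, not part of the original workflow; the exact file name on OSF is an assumption, so check the output of osf_ls_files() first.

#sketch: download the project files from OSF (osfr package)
library(osfr)
efica_files <- osf_retrieve_node("kx2ym") %>% osf_ls_files() #list files in the project
osf_download(efica_files, path = ".", conflicts = "skip")    #download them locally
load("Base R - EFICA (manuscript and book chapter).RData")   #then load as above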

2.1 Data handling - Parents

ds <- backup          #`backup` is the raw object stored in the RData file loaded above
ds <- clean_names(ds) #janitor: standardize the column names

2.2 Add demographic variables to ds

#add sex
ds <- ds %>% 
  mutate(sex = if_else(sexo_certo == "1", "Female", "Male"))

#add race
ds <- ds %>% 
  mutate(race = if_else(cor == "1","White","non-white"))

#rename age
ds <- ds %>% 
  rename(age = idade)

#add guardian
ds <- ds %>% 
 mutate(guardian = recode(quem_preenche_certo, 
                          "1" = "mother",
                          "2" = "father",
                          "3" = "other"))
# add schooling
ds <- ds %>% 
  rename(schooling = grau_instrucao)

# add socioeconomic
ds <- ds %>% 
  rename(socioeconomic = classe_economica)

#change values (tipico for typical)
ds <- ds %>% mutate(group = "typical")

#remove useless variables
ds <- ds %>% select(-c(pais_tct_metacog1,pais_tct_metacog2, pais_tct_regulacao, profs_tct_metacog, profs_tct_regulacao, quem_preenche))

2.3 Copy variables - Parents

#copy the 65 parent items into working variables y1:y65
ds <- ds %>% 
  mutate_at(vars(ife_1:ife_65), list(cat = ~paste0(.))) %>%  #duplicate items as character copies
  rename_at(vars(ends_with( "_cat")), list(~paste0("y", 1:65))) #rename the copies to y1..y65

2.4 Transform into numeric - Parents

ds <- ds %>%
  mutate_at(vars(y1:y65), ~as.numeric(.)) 

2.5 Round values if necessary - Parents

ds <- ds %>% 
  mutate_at(vars(y1:y65), ~round(.,0)) 

2.6 Check distribution (outliers) - Parents

ds %>% select(y1:y65) %>% 
  pivot_longer(everything()) %>% 
  ggplot(., aes(name, value)) + geom_boxplot()

2.7 Replace coding errors with missing code - Parents

ds <- ds %>% 
  mutate_at(vars(y1:y65), ~replace(., . > 2 | . < 0, NA))

2.8 Missing cases - Parents

ds %>% select(y1:y65) %>% summarise(sum(is.na(.)))
ds %>% select(y1:y65) %>% plot_missing()
#get temp data (parents)
tempData <- ds %>% select(y1:y65) %>% mice(.,m=5,maxit=50,meth='pmm',seed=123)
ife_65_c <- complete(tempData,1) 
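
Before replacing values, it is worth inspecting the imputation itself. A minimal sketch using mice's built-in diagnostics (not part of the original pipeline):

#sketch: imputation diagnostics
plot(tempData)        #trace plots of means/SDs across iterations (convergence check)
densityplot(tempData) #observed vs. imputed distributions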

2.9 Replace from the original dataset

2.9.1 Function to update values

#https://stackoverflow.com/questions/44930149/replace-a-subset-of-a-data-frame-with-dplyr-join-operations
replace_subset <- function(df, df_subset, id_col_names = c()) {
  
  # work out which of the columns contain "new" data
  new_data_col_names <- colnames(df_subset)[which(!colnames(df_subset) %in% id_col_names)]
  
  # complete the df_subset with the extra columns from df
  df_sub_to_join <- df_subset %>%
    left_join(select(df, -all_of(new_data_col_names)), by = c(id_col_names))
  
  # join and bind rows
  df_out <- df %>%
    anti_join(df_sub_to_join, by = c(id_col_names)) %>%
    bind_rows(df_sub_to_join)
  
  return(df_out)
}
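
To see what replace_subset() does, here is a tiny toy check with hypothetical data (not part of the original analysis): rows matched by id take the corrected values, and columns absent from the subset are preserved from the original data frame.

#toy check of replace_subset()
df  <- tibble(id = 1:3, x = c(10, 20, 30), z = c("a", "b", "c"))
fix <- tibble(id = 1:2, x = c(11, 21))
replace_subset(df, fix, id_col_names = "id") #rows with id 1-2 get x = 11, 21; z is kept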
# create an index for Dataset
ds <- ds %>%  mutate(id = row_number())
#create an index for complete dataset (items-only)
ife_65_c <- ife_65_c %>%  mutate(id = row_number())

#replace 
ds <- replace_subset(df = ds , df_subset = ife_65_c, id_col_names = c("id"))

2.10 Check if the imputation worked

ds %>% select(y1:y65) %>% summarise(sum(is.na(.)))
ds %>% select(y1:y65) %>% plot_missing()

2.11 Remove unnecessary data

#rm(tempData, ife_65_c)

3 Psychometrics - Parents

3.1 Factorial models - Parents

First, get a random sample by subsetting the original dataset (into a smaller but representative dataset)

#random subsample -- Parents
set.seed(123)
ife_65_650 <- ife_65_c %>% sample_n(., 650)

Then, I’ll create and run the model

#create the model
mod_cfa_pais <- 'metacog1   =~ y12 + y26 + y8 + y1 + y17 + y20 + y10 + y47 + y48 + y60 + y55 + y6 + y7 + y57
regulacao   =~ y21 + y23 + y19 + y52 + y28 + y3 + y4 + y50 + y45 + y24 + y25 + y22 + y18 + y54 + y59 + y14 + y42 + y16 + y11 + y53 + y58 + y32
metacog2 =~ y30 + y41 + y31 + y29 + y35 + y38 + y37 + y2 + y40 + y13 + y46 + y43 + y62 + y44 + y9 + y27 + y33 + y39 + y65 + y34 + y36 + y56 + y5'

#run the model
cfa_pais <- cfa(mod_cfa_pais, #model name
                data=ife_65_650, #dataset
                estimator = 'WLSMV', #method for extracting factors
                orthogonal=FALSE,
                ordered=names(ife_65_650))
#reliability
rel_pais <- reliability(cfa_pais)
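
The reliability object is computed above but not printed. A quick way to inspect it (a sketch; assumes the default row names of semTools::reliability()):

#sketch: inspect the reliability estimates per factor
rel_pais             #full matrix (alpha, omega, etc., by factor)
rel_pais["omega", ]  #omega coefficients only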

And get the results available in Table 2. Item analysis, descriptive, and CFA results of EFICA-P (parents).

## Report the model
parameterEstimates(cfa_pais, standardized=TRUE) %>% 
  filter(op == "=~") %>% 
  mutate(stars = ifelse(pvalue < .001, "***", 
                        ifelse(pvalue < .01, "**", 
                               ifelse(pvalue < .05, "*", "")))) %>%
  select('Latent Factor'=lhs, 
         Indicator=rhs, 
         B=est, 
         SE=se, Z=z, 
         Beta=std.all, 
         sig=stars) %>% 
  arrange(`Latent Factor`,desc(Beta)) %>% 
  flextable() %>% 
  colformat_num(., digit = 3)

| Latent Factor | Indicator | B | SE | Z | Beta | sig |
|---|---|---|---|---|---|---|
| metacog1 | y8 | 1.082 | 0.062 | 17.322 | 0.715 | *** |
| metacog1 | y20 | 1.058 | 0.071 | 14.913 | 0.699 | *** |
| metacog1 | y17 | 1.011 | 0.062 | 16.321 | 0.667 | *** |
| metacog1 | y12 | 1.000 | 0.000 | | 0.660 | |
| metacog1 | y26 | 0.994 | 0.048 | 20.628 | 0.656 | *** |
| metacog1 | y48 | 0.993 | 0.066 | 14.950 | 0.655 | *** |
| metacog1 | y7 | 0.964 | 0.071 | 13.652 | 0.637 | *** |
| metacog1 | y1 | 0.957 | 0.065 | 14.832 | 0.632 | *** |
| metacog1 | y47 | 0.935 | 0.061 | 15.245 | 0.617 | *** |
| metacog1 | y10 | 0.880 | 0.065 | 13.489 | 0.581 | *** |
| metacog1 | y55 | 0.868 | 0.063 | 13.860 | 0.573 | *** |
| metacog1 | y60 | 0.860 | 0.064 | 13.418 | 0.568 | *** |
| metacog1 | y57 | 0.802 | 0.069 | 11.685 | 0.529 | *** |
| metacog1 | y6 | 0.752 | 0.080 | 9.349 | 0.497 | *** |
| metacog2 | y46 | 1.988 | 0.262 | 7.584 | 0.734 | *** |
| metacog2 | y27 | 1.928 | 0.251 | 7.671 | 0.712 | *** |
| metacog2 | y37 | 1.838 | 0.240 | 7.668 | 0.679 | *** |
| metacog2 | y44 | 1.829 | 0.243 | 7.529 | 0.675 | *** |
| metacog2 | y5 | 1.715 | 0.219 | 7.831 | 0.633 | *** |
| metacog2 | y62 | 1.662 | 0.225 | 7.385 | 0.613 | *** |
| metacog2 | y31 | 1.645 | 0.216 | 7.631 | 0.607 | *** |
| metacog2 | y2 | 1.641 | 0.226 | 7.256 | 0.606 | *** |
| metacog2 | y35 | 1.631 | 0.213 | 7.641 | 0.602 | *** |
| metacog2 | y13 | 1.569 | 0.226 | 6.939 | 0.579 | *** |
| metacog2 | y38 | 1.525 | 0.215 | 7.092 | 0.563 | *** |
| metacog2 | y34 | 1.481 | 0.200 | 7.418 | 0.547 | *** |
| metacog2 | y43 | 1.455 | 0.202 | 7.207 | 0.537 | *** |
| metacog2 | y40 | 1.399 | 0.199 | 7.038 | 0.517 | *** |
| metacog2 | y36 | 1.355 | 0.184 | 7.374 | 0.500 | *** |
| metacog2 | y9 | 1.352 | 0.194 | 6.963 | 0.499 | *** |
| metacog2 | y56 | 1.241 | 0.189 | 6.565 | 0.458 | *** |
| metacog2 | y41 | 1.225 | 0.182 | 6.727 | 0.452 | *** |
| metacog2 | y65 | 1.183 | 0.181 | 6.530 | 0.437 | *** |
| metacog2 | y39 | 1.125 | 0.175 | 6.414 | 0.415 | *** |
| metacog2 | y29 | 1.017 | 0.121 | 8.433 | 0.376 | *** |
| metacog2 | y30 | 1.000 | 0.000 | | 0.369 | |
| metacog2 | y33 | 0.959 | 0.153 | 6.270 | 0.354 | *** |
| regulacao | y16 | 1.114 | 0.049 | 22.936 | 0.785 | *** |
| regulacao | y4 | 1.016 | 0.048 | 21.356 | 0.716 | *** |
| regulacao | y21 | 1.000 | 0.000 | | 0.705 | |
| regulacao | y45 | 0.997 | 0.047 | 21.320 | 0.703 | *** |
| regulacao | y19 | 0.989 | 0.043 | 23.012 | 0.697 | *** |
| regulacao | y23 | 0.978 | 0.044 | 22.380 | 0.690 | *** |
| regulacao | y3 | 0.959 | 0.047 | 20.243 | 0.676 | *** |
| regulacao | y28 | 0.934 | 0.049 | 19.191 | 0.658 | *** |
| regulacao | y25 | 0.929 | 0.051 | 18.108 | 0.655 | *** |
| regulacao | y59 | 0.928 | 0.047 | 19.738 | 0.654 | *** |
| regulacao | y52 | 0.916 | 0.043 | 21.229 | 0.646 | *** |
| regulacao | y14 | 0.911 | 0.049 | 18.743 | 0.642 | *** |
| regulacao | y24 | 0.878 | 0.046 | 19.047 | 0.619 | *** |
| regulacao | y18 | 0.873 | 0.051 | 16.998 | 0.615 | *** |
| regulacao | y50 | 0.860 | 0.048 | 17.879 | 0.606 | *** |
| regulacao | y53 | 0.855 | 0.055 | 15.677 | 0.603 | *** |
| regulacao | y42 | 0.806 | 0.049 | 16.379 | 0.568 | *** |
| regulacao | y22 | 0.802 | 0.051 | 15.802 | 0.565 | *** |
| regulacao | y32 | 0.773 | 0.055 | 14.036 | 0.545 | *** |
| regulacao | y11 | 0.766 | 0.058 | 13.157 | 0.540 | *** |
| regulacao | y54 | 0.753 | 0.053 | 14.254 | 0.530 | *** |
| regulacao | y58 | 0.630 | 0.056 | 11.221 | 0.444 | *** |

fitMeasures(cfa_pais, c("aic","chisq", "df", "pvalue", "cfi", "tli", "rmsea"))
##      aic    chisq       df   pvalue      cfi      tli    rmsea 
##       NA 4607.853 1649.000    0.000    0.965    0.963    0.053
fitMeasures(cfa_pais, c("chisq.scaled", "df.scaled", "pvalue.scaled", "cfi.scaled","tli.scaled","rmsea.scaled","srmr"))
##  chisq.scaled     df.scaled pvalue.scaled    cfi.scaled    tli.scaled 
##      3716.879      1649.000         0.000         0.902         0.898 
##  rmsea.scaled          srmr 
##         0.044         0.069

Now I’ll plot this solution.

semPaths(cfa_pais, title = FALSE, 
         label.cex = 2, #font - variables
         sizeLat = 7,  #font latent
         sizeMan = 4, #font - variables
         edge.label.cex = 1.2, #lambda font
         minimum = 0.1, 
         sizeInt = 0.8, 
         mar = c(1, 1, 1, 1), residuals = FALSE, 
         intercepts = FALSE, thresholds = FALSE, layout = "circle", 
         "std", 
         cut = 0.3,
         "hide", #hide latent estimates
         posCol = c("black", "dimgray"), #color of paths
         color = list(lat = rgb(173, 216, 230, maxColorValue = 255), 
                      man = rgb(240, 248, 255, maxColorValue = 255)))

4 Normative data

First, I’ll include an age interval index.

ds <- ds %>% 
  mutate(age_group = cut(age, breaks = c(5,6,8,10,12,14, 16, 18, +Inf), include.lowest = TRUE)) %>% #note: `idade` was renamed to `age` above
  mutate_if(is.factor,fct_explicit_na)
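
A quick sanity check of the resulting bins (a sketch, not in the original):

#sketch: sample size per age bin
ds %>% count(age_group)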

Then, I’ll add summative scores to the dataset

#parents
ds <- ds %>% 
  mutate(parents_metacog1 = rowSums(select(., y12 , y26 , y8 , y1 , y17 , y20 , y10 , y47 , y48 , y60 , y55 , y6 , y7 , y57), na.rm = T)) %>% 
  mutate(parents_hotindex = rowSums(select(., y21 ,  y23 ,  y19 ,  y52 ,  y28 ,  y3 ,  y4 ,  y50 ,  y45 ,  y24 ,  y25 ,  y22 ,  y18 ,  y54 ,  y59 ,  y14 ,  y42 ,  y16 ,  y11 ,  y53 ,  y58 ,  y32), na.rm = T)) %>% 
  mutate(parents_metacog2 = rowSums(select(.,  y30 ,  y41 ,  y31 ,  y29 ,  y35 ,  y38 ,  y37 ,  y2 ,  y40 ,  y13 ,  y46 ,  y43 ,  y62 ,  y44 ,  y9 ,  y27 ,  y33 ,  y39 ,  y65 ,  y34 ,  y36 ,  y56 ,  y5), na.rm = T)) %>% 
  mutate(parents_total = parents_metacog1+parents_hotindex+parents_metacog2)
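
The same scores can be computed from a single named list of item sets, which keeps the scales in sync with the CFA model above. A sketch of this equivalent alternative (not run in the original):

#sketch: equivalent, more compact scoring via a named list of item sets
item_sets <- list(
  parents_metacog1 = paste0("y", c(12,26,8,1,17,20,10,47,48,60,55,6,7,57))
  #...add parents_hotindex and parents_metacog2 the same way
)
for (sc in names(item_sets)) {
  ds[[sc]] <- rowSums(select(ds, all_of(item_sets[[sc]])), na.rm = TRUE)
}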

And now I’ll report the normative data for the parents’ scale. To decide whether I need sex-specific normative tables, I’ll compare the results of boys and girls.

lm(parents_total ~  factor(sex) * age, ds) %>%
  apaTables::apa.aov.table(.)

Table 4. Normative table for EFICA-p and EFICA-t results.

# By age
ds %>%
  group_by(age_group) %>% #grouping
  summarise_at(vars(parents_total),
               list(mean = mean, 
                    sd = sd,
                    p90 = ~quantile(., .9),
                    p95 = ~quantile(., .95),
                    sample = ~sum(!is.na(.)))) %>%
  flextable()

| age_group | mean | sd | p90 | p95 | sample |
|---|---|---|---|---|---|
| [5,6] | 44 | 19 | 69 | 77 | 1017 |
| (6,8] | 45 | 20 | 71 | 79 | 983 |
| (8,10] | 43 | 20 | 70 | 76 | 807 |
| (10,12] | 43 | 20 | 71 | 77 | 273 |
| (12,14] | 42 | 22 | 72 | 81 | 88 |
| (14,16] | 40 | 19 | 67 | 72 | 72 |
| (16,18] | 31 | 18 | 57 | 60 | 44 |
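
As a usage example (a sketch, not part of the original analyses), a raw parents_total of 70 for a 9-year-old can be located against the (8,10] row above via a z-score, assuming approximate normality:

#sketch: approximate percentile of a raw score given the (8,10] norms above
z <- (70 - 43) / 20 #(score - age-group mean) / age-group SD
pnorm(z)            #~0.91, i.e., roughly the 91st percentile (consistent with p90 = 70)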

5 Data handling - Teachers

5.1 Copy variables - Teachers

#copy the 52 teacher items into working variables i1:i52
ds <- ds %>% 
  mutate_at(vars(ife_1_1:ife_52_1), list(prof = ~paste0(.))) %>%  #duplicate items as character copies
  rename_at(vars(ends_with( "_prof")), list(~paste0("i", 1:52))) #rename the copies to i1..i52

5.2 Transform into numeric - teachers

ds <- ds %>%
  mutate_at(vars(i1:i52), ~as.numeric(.)) 

5.3 Round values if necessary - teachers

ds <- ds %>% 
  mutate_at(vars(i1:i52), ~round(.,0)) 

5.4 Check distribution (outliers) - teachers

ds %>% select(i1:i52) %>% 
  pivot_longer(everything()) %>% 
  ggplot(., aes(name, value)) + geom_boxplot()

5.5 Replace coding errors with missing code - teachers

ds <- ds %>% 
  mutate_at(vars(i1:i52), ~replace(., . > 2 | . < 0, NA))

5.6 Missing cases - Teachers

ds %>% select(i1:i52) %>% summarise(sum(is.na(.)))
ds %>% select(i1:i52) %>% plot_missing()
#get temp data (teachers)
tempData <- ds %>% select(i1:i52) %>% mice(.,m=5,maxit=50,meth='pmm',seed=123)
ife_52_c <- complete(tempData,1)

5.7 Replace from the original dataset - Teachers

# create an index for Dataset
#ds <- ds %>%  mutate(id = row_number())
#create an index for complete dataset (items-only)
ife_52_c <- ife_52_c %>%  mutate(id = row_number())

#replace 
ds <- replace_subset(df = ds , df_subset = ife_52_c, id_col_names = c("id"))

5.8 Check if the imputation worked

ds %>% select(i1:i52) %>% summarise(sum(is.na(.)))
ds %>% select(i1:i52) %>% plot_missing()

6 Psychometrics - Teachers

6.1 Factorial models - Teachers

First, get a random sample by subsetting the original dataset (into a smaller but representative dataset)

#random subsample -- teachers
set.seed(2)
ife_52_520 <- ife_52_c %>% sample_n(., 520)

Then, I’ll create and run the model

#create the model
mod_cfa_professores <- 'MetaCog =~ i23 + i14 + i17 + i9 + i19 + i30 + i38 + i49 + i43 + i21 + i25 + i8 + i52 + i36 + i32 + i37 + i48 + i13 + i46 + i6 + i7 + i10 + i15 + i1 + i29 + i4 + i18 + i12 + i33 + i39 + i47 + i3
                        Regulacao =~ i26 + i45 + i42 + i28 + i2 + i27 + i34 + i16 + i50 + i5 + i24 + i22 + i20 + i41 + i35 + i44 + i40 + i31 + i51 + i11'
#run the model

cfa_prof <- cfa(mod_cfa_professores, #model name
                  data=ife_52_520, #dataset
                  estimator = 'WLSMV', #method for extracting factors
                  orthogonal=FALSE,
                  ordered=names(ife_52_520))

And get the results, as presented in Table 3. Item analysis and CFA results – EFICA-T (Teachers).

## Report the model
parameterEstimates(cfa_prof, standardized=TRUE) %>% 
  filter(op == "=~") %>% 
  mutate(stars = ifelse(pvalue < .001, "***", 
                        ifelse(pvalue < .01, "**", 
                               ifelse(pvalue < .05, "*", "")))) %>%
  select('Latent Factor'=lhs, 
         Indicator=rhs, 
         B=est, 
         SE=se, Z=z, 
         Beta=std.all, 
         sig=stars) %>% 
  arrange(`Latent Factor`,desc(Beta)) %>% 
  flextable() %>% 
  colformat_num(., digit = 3)

| Latent Factor | Indicator | B | SE | Z | Beta | sig |
|---|---|---|---|---|---|---|
| MetaCog | i8 | 1.020 | 0.013 | 78.629 | 0.922 | *** |
| MetaCog | i25 | 1.005 | 0.013 | 78.514 | 0.909 | *** |
| MetaCog | i23 | 1.000 | 0.000 | | 0.904 | |
| MetaCog | i19 | 0.990 | 0.015 | 67.904 | 0.895 | *** |
| MetaCog | i48 | 0.986 | 0.016 | 62.499 | 0.892 | *** |
| MetaCog | i14 | 0.973 | 0.014 | 70.083 | 0.880 | *** |
| MetaCog | i21 | 0.972 | 0.015 | 65.262 | 0.879 | *** |
| MetaCog | i30 | 0.969 | 0.015 | 63.116 | 0.876 | *** |
| MetaCog | i17 | 0.968 | 0.015 | 64.253 | 0.875 | *** |
| MetaCog | i49 | 0.966 | 0.016 | 60.570 | 0.874 | *** |
| MetaCog | i29 | 0.963 | 0.022 | 44.651 | 0.871 | *** |
| MetaCog | i36 | 0.957 | 0.016 | 58.206 | 0.866 | *** |
| MetaCog | i12 | 0.943 | 0.018 | 51.511 | 0.853 | *** |
| MetaCog | i52 | 0.933 | 0.022 | 41.675 | 0.844 | *** |
| MetaCog | i9 | 0.931 | 0.019 | 50.117 | 0.843 | *** |
| MetaCog | i13 | 0.920 | 0.020 | 45.391 | 0.832 | *** |
| MetaCog | i1 | 0.919 | 0.020 | 44.840 | 0.831 | *** |
| MetaCog | i32 | 0.918 | 0.022 | 42.234 | 0.830 | *** |
| MetaCog | i33 | 0.911 | 0.021 | 43.136 | 0.824 | *** |
| MetaCog | i3 | 0.911 | 0.030 | 30.005 | 0.824 | *** |
| MetaCog | i39 | 0.905 | 0.024 | 37.075 | 0.818 | *** |
| MetaCog | i43 | 0.905 | 0.020 | 45.196 | 0.818 | *** |
| MetaCog | i37 | 0.895 | 0.024 | 37.411 | 0.810 | *** |
| MetaCog | i46 | 0.887 | 0.020 | 43.452 | 0.802 | *** |
| MetaCog | i18 | 0.875 | 0.027 | 32.896 | 0.791 | *** |
| MetaCog | i38 | 0.872 | 0.023 | 37.454 | 0.789 | *** |
| MetaCog | i4 | 0.851 | 0.024 | 35.552 | 0.770 | *** |
| MetaCog | i6 | 0.844 | 0.024 | 34.550 | 0.763 | *** |
| MetaCog | i7 | 0.827 | 0.026 | 32.004 | 0.748 | *** |
| MetaCog | i47 | 0.827 | 0.031 | 27.095 | 0.748 | *** |
| MetaCog | i10 | 0.815 | 0.027 | 30.518 | 0.737 | *** |
| MetaCog | i15 | 0.757 | 0.032 | 23.635 | 0.685 | *** |
| Regulacao | i44 | 1.104 | 0.036 | 30.870 | 0.920 | *** |
| Regulacao | i51 | 1.102 | 0.039 | 27.938 | 0.918 | *** |
| Regulacao | i28 | 1.091 | 0.032 | 34.075 | 0.909 | *** |
| Regulacao | i40 | 1.080 | 0.037 | 29.150 | 0.899 | *** |
| Regulacao | i35 | 1.050 | 0.033 | 31.492 | 0.875 | *** |
| Regulacao | i41 | 1.047 | 0.036 | 28.714 | 0.872 | *** |
| Regulacao | i11 | 1.037 | 0.039 | 26.295 | 0.864 | *** |
| Regulacao | i22 | 1.024 | 0.035 | 28.954 | 0.854 | *** |
| Regulacao | i34 | 1.017 | 0.036 | 28.603 | 0.848 | *** |
| Regulacao | i27 | 1.016 | 0.034 | 30.101 | 0.846 | *** |
| Regulacao | i45 | 1.011 | 0.028 | 36.371 | 0.842 | *** |
| Regulacao | i31 | 1.009 | 0.038 | 26.890 | 0.841 | *** |
| Regulacao | i26 | 1.000 | 0.000 | | 0.833 | |
| Regulacao | i2 | 0.994 | 0.037 | 26.522 | 0.828 | *** |
| Regulacao | i5 | 0.993 | 0.039 | 25.241 | 0.828 | *** |
| Regulacao | i16 | 0.992 | 0.038 | 26.313 | 0.827 | *** |
| Regulacao | i50 | 0.985 | 0.039 | 24.950 | 0.821 | *** |
| Regulacao | i42 | 0.976 | 0.038 | 25.818 | 0.813 | *** |
| Regulacao | i24 | 0.968 | 0.037 | 26.302 | 0.806 | *** |
| Regulacao | i20 | 0.951 | 0.036 | 26.596 | 0.792 | *** |

fitMeasures(cfa_prof, c("aic","chisq", "df", "pvalue", "cfi", "tli", "rmsea"))
##      aic    chisq       df   pvalue      cfi      tli    rmsea 
##       NA 5151.299 1273.000    0.000    0.991    0.991    0.077
fitMeasures(cfa_prof, c("chisq.scaled", "df.scaled", "pvalue.scaled", "cfi.scaled","tli.scaled","rmsea.scaled","srmr"))
##  chisq.scaled     df.scaled pvalue.scaled    cfi.scaled    tli.scaled 
##      3678.288      1273.000         0.000         0.955         0.953 
##  rmsea.scaled          srmr 
##         0.060         0.074

Now I’ll plot this solution.

semPaths(cfa_prof, title = FALSE, 
         label.cex = 2, #font - variables
         sizeLat = 7,  #font latent
         sizeMan = 4, #font - variables
         edge.label.cex = 1.2, #lambda font
         minimum = 0.1, 
         sizeInt = 0.8, 
         mar = c(1, 1, 1, 1), residuals = FALSE, 
         intercepts = FALSE, thresholds = FALSE, layout = "circle", 
         "std", 
         cut = 0.3,
         "hide", #hide latent estimates
         posCol = c("black", "dimgray"), #color of paths
         color = list(lat = rgb(173, 216, 230, maxColorValue = 255), 
                      man = rgb(240, 248, 255, maxColorValue = 255)))

6.2 Remove unnecessary data

rm(tempData, ife_52_c)

7 Normative data - teachers

First, I’ll add summative scores to the dataset

ds <- ds %>% 
  mutate(teachers_metacog = rowSums(select(., i23 , i14 , i17 , i9 , i19 , i30 , i38 , i49 , i43 , i21 , i25 , i8 , i52 , i36 , i32 , i37 , i48 , i13 , i46 , i6 , i7 , i10 , i15 , i1 , i29 , i4 , i18 , i12 , i33 , i39 , i47 , i3), na.rm=T)) %>% 
  mutate(teachers_hotindex = rowSums(select(., i26 , i45 , i42 , i28 , i2 , i27 , i34 , i16 , i50 , i5 , i24 , i22 , i20 , i41 , i35 , i44 , i40 , i31 , i51 , i11), na.rm=T)) %>% 
  mutate(teachers_total = teachers_metacog+teachers_hotindex)

I’ll report the normative data for the teachers’ scale, as presented in Table 4. Normative table for EFICA-p and EFICA-t results.

lm(teachers_total ~  factor(sex) * age, ds) %>%
  apaTables::apa.aov.table(.)
## 
## 
## ANOVA results using teachers_total as the dependent variable
##  
## 
##          Predictor         SS   df       MS      F    p partial_eta2
##        (Intercept)   75321.98    1 75321.98 163.49 .000             
##        factor(sex)     205.34    1   205.34   0.45 .504          .00
##                age     203.27    1   203.27   0.44 .507          .00
##  factor(sex) x age    7979.86    1  7979.86  17.32 .000          .01
##              Error 1511175.67 3280   460.72                         
##  CI_90_partial_eta2
##                    
##          [.00, .00]
##          [.00, .00]
##          [.00, .01]
##                    
## 
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
# By age
ds %>%
  group_by(age_group, sex) %>% #grouping
  summarise_at(vars(teachers_total),
               list(mean = mean, 
                    sd = sd,
                    p90 = ~quantile(., .9),
                    p95 = ~quantile(., .95),
                    sample = ~sum(!is.na(.)))) %>%
  flextable()

| age_group | sex | mean | sd | p90 | p95 | sample |
|---|---|---|---|---|---|---|
| [5,6] | Female | 21 | 20 | 49 | 61 | 498 |
| [5,6] | Male | 27 | 22 | 58 | 70 | 519 |
| (6,8] | Female | 22 | 20 | 51 | 60 | 492 |
| (6,8] | Male | 29 | 23 | 61 | 74 | 491 |
| (8,10] | Female | 20 | 19 | 46 | 55 | 414 |
| (8,10] | Male | 29 | 23 | 65 | 72 | 393 |
| (10,12] | Female | 21 | 18 | 42 | 55 | 128 |
| (10,12] | Male | 34 | 24 | 65 | 77 | 145 |
| (12,14] | Female | 20 | 23 | 56 | 65 | 46 |
| (12,14] | Male | 38 | 29 | 78 | 91 | 42 |
| (14,16] | Female | 20 | 22 | 52 | 53 | 35 |
| (14,16] | Male | 45 | 26 | 75 | 98 | 37 |
| (16,18] | Female | 24 | 20 | 52 | 58 | 25 |
| (16,18] | Male | 29 | 32 | 69 | 101 | 19 |

8 ADHD group

For this set of analyses, I’ll create a new dataset that contains both typical (n = 3284) and ADHD children (n = 165). No missing-value imputation will be used in the ADHD dataset. Parents’ and teachers’ data are handled together within these chunks.

8.1 Data handling

First, I’ll create a dataset for this group

ds_adhd <- dados_tdah
ds_adhd <- clean_names(ds_adhd)

Then I’ll add and rename specific variables

#add sex
ds_adhd <- ds_adhd %>% 
  mutate(sex = if_else(sexo == "1", "Female", "Male"))

#add race
ds_adhd <- ds_adhd %>%
  mutate(race = if_else(cor == "1","White","non-white"))

#rename age
ds_adhd <- ds_adhd %>%
  rename(age = idade)

#add guardian
ds_adhd <- ds_adhd %>%
 mutate(guardian = recode(quem_preenche, 
                          "1" = "mother",
                          "2" = "father",
                          "3" = "other"))
# add socioeconomic
ds_adhd <- ds_adhd %>%
  rename(socioeconomic = classe_economica)

#age group
ds_adhd <- ds_adhd %>%
  mutate(age_group = cut(age, breaks = c(5,6,8,10,12,14, 16, 18, +Inf), include.lowest = TRUE))

Then I’ll prepare and check the item variables, following the same steps used for the typical group.

# Parents
ds_adhd <- ds_adhd %>% 
  # Copy variables
  mutate_at(vars(if_epa_1:if_epa_65), list(cat = ~paste0(.))) %>%  #<-- here labels are different
  rename_at(vars(ends_with( "_cat")), list(~paste0("y", 1:65))) %>% 
  # Transform into numeric
  mutate_at(vars(y1:y65), ~as.numeric(.)) %>%  
  # Round values if necessary - Parents
  mutate_at(vars(y1:y65), ~round(.,0)) 


# Teachers
ds_adhd <- ds_adhd %>% 
  mutate_at(vars(if_epr_1:if_epr_52), list(prof = ~paste0(.))) %>%  #<-- here labels are different again
  rename_at(vars(ends_with( "_prof")), list(~paste0("i", 1:52))) %>% 
  ## Transform into numeric - teachers
  mutate_at(vars(i1:i52), ~as.numeric(.)) %>% 
  ## Round values if necessary - teachers
  mutate_at(vars(i1:i52), ~round(.,0)) 

I’ll also check distribution (outliers)

ds_adhd %>% select(y1:y65) %>% 
  pivot_longer(everything()) %>% 
  ggplot(., aes(name, value)) + geom_boxplot()

ds_adhd  %>% select(i1:i52) %>% 
  pivot_longer(everything()) %>% 
  ggplot(., aes(name, value)) + geom_boxplot()

and replace coding errors with missing codes - Parents and Teachers

ds_adhd <- ds_adhd %>% 
  mutate_at(vars(y1:y65), ~replace(., . > 2 | . < 0, NA))

ds_adhd <- ds_adhd %>% 
  mutate_at(vars(i1:i52), ~replace(., . > 2 | . < 0, NA))

Now check the missing values

#parents
ds_adhd %>% select(y1:y65) %>% summarise(sum(is.na(.)))
ds_adhd %>% select(y1:y65) %>% plot_missing()

#teachers
ds_adhd %>% select(i1:i52) %>% summarise(sum(is.na(.)))
ds_adhd %>% select(i1:i52) %>% plot_missing()

Then I’ll add summative scores

ds_adhd <- ds_adhd %>% 
  #parents
  mutate(parents_metacog1 = rowSums(select(., y12 , y26 , y8 , y1 , y17 , y20 , y10 , y47 , y48 , y60 , y55 , y6 , y7 , y57), na.rm = T)) %>% 
  mutate(parents_hotindex = rowSums(select(., y21 ,  y23 ,  y19 ,  y52 ,  y28 ,  y3 ,  y4 ,  y50 ,  y45 ,  y24 ,  y25 ,  y22 ,  y18 ,  y54 ,  y59 ,  y14 ,  y42 ,  y16 ,  y11 ,  y53 ,  y58 ,  y32), na.rm = T)) %>% 
  mutate(parents_metacog2 = rowSums(select(.,  y30 ,  y41 ,  y31 ,  y29 ,  y35 ,  y38 ,  y37 ,  y2 ,  y40 ,  y13 ,  y46 ,  y43 ,  y62 ,  y44 ,  y9 ,  y27 ,  y33 ,  y39 ,  y65 ,  y34 ,  y36 ,  y56 ,  y5), na.rm = T)) %>% 
  mutate(parents_total = parents_metacog1+parents_hotindex+parents_metacog2) %>% 
  #teachers
  mutate(teachers_metacog = rowSums(select(., i23 , i14 , i17 , i9 , i19 , i30 , i38 , i49 , i43 , i21 , i25 , i8 , i52 , i36 , i32 , i37 , i48 , i13 , i46 , i6 , i7 , i10 , i15 , i1 , i29 , i4 , i18 , i12 , i33 , i39 , i47 , i3), na.rm=T)) %>% #<- consider missing
  mutate(teachers_hotindex = rowSums(select(., i26 , i45 , i42 , i28 , i2 , i27 , i34 , i16 , i50 , i5 , i24 , i22 , i20 , i41 , i35 , i44 , i40 , i31 , i51 , i11), na.rm=T)) %>%  #<- consider missing
  mutate(teachers_total = teachers_metacog+teachers_hotindex)

Now I’ll create a new dataset including the common variables between the two groups

#check similarities and differences
base_1 <- ds %>% 
  rename_at(vars(snap_1:snap_18, -contains("int")), ~paste0("sna_ppa_",1:18)) %>% 
  rename_at(vars(snap_1_1:snap_18_1, -contains("int")), ~paste0("sna_ppr_",1:18)) 
#get names
base_1 <- base_1 %>% names %>% data.frame %>% mutate(base = "general") %>% setNames(.,c("var","base"))
base_2 <- ds_adhd %>% names %>% data.frame %>% mutate(base = "adhd")%>% setNames(.,c("var","base"))
#check names
bind_rows(base_1,base_2) %>% #get both datasets
  mutate_at(vars(var),
            ~ tolower(.) %>% stringi::stri_trans_general(.,"Latin-ASCII")) %>% #just fix name issues
  group_by(var) %>% #group by variable 
  mutate(n = n()) %>%  #add counts
  filter(n==1) %>% #filter for exclusive variables
  #arrange(n) %>% 
  print(n=nrow(.)) #display
rm(base_1, base_2)

Now merge the datasets

#merge datasets
base_1 <- ds %>% 
  rename_at(vars(snap_1:snap_18, -contains("int")), ~paste0("sna_ppa_",1:18)) %>% 
  rename_at(vars(snap_1_1:snap_18_1, -contains("int")), ~paste0("sna_ppr_",1:18)) %>% 
  mutate_at(vars(starts_with("sdq"), starts_with("sna_ppr_")), ~as.numeric(.)) 

base_2 <- ds_adhd %>% 
  mutate(group = "adhd") %>% 
  rename(no_x = no)

common_names <- function(base1, base2){
  #create two datasets to storage the colnames
  base1 <- base1 %>% names %>% data.frame %>% mutate(base = "base1") %>% setNames(.,c("var","base"))
  base2 <- base2 %>% names %>% data.frame %>% mutate(base = "base2") %>% setNames(.,c("var","base"))
  
  #merge     
  lista_comum <- bind_rows(base1,base2) %>% #get both datasets
  mutate_at(vars(var),
            ~ tolower(.) %>% stringi::stri_trans_general(.,"Latin-ASCII")) %>% #just fix name issues
  group_by(var) %>% #group by variable 
  mutate(n = n()) %>%  #add counts
  filter(n==2) %>% #keep variables present in both datasets
  arrange(var) %>% 
  #distinct(var, .keep_all = TRUE)  %>% 
  unstack(.,var~base) #unstack
  
  var_comum <<- lista_comum %>% pull(base1) #note: `<<-` assigns to the global environment as a side effect
}
common_names(base1 = base_1, base2 = base_2)

#create a dataset with typical and adhd children with same variables
ds_full <- bind_rows(base_1, base_2) %>% 
  select(all_of(var_comum)) %>% 
  select(group, parents_total, teachers_total, 
         parents_metacog1, parents_hotindex, parents_metacog2, teachers_metacog, teachers_hotindex, everything())
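
As an aside, the `<<-` side effect in common_names() could be avoided by returning the vector. A minimal sketch (a hypothetical common_names2, not used below; it assumes the cleaned names match the actual column names of base_1):

#sketch: side-effect-free variant of common_names()
common_names2 <- function(base1, base2) {
  clean <- function(x) stringi::stri_trans_general(tolower(names(x)), "Latin-ASCII")
  intersect(clean(base1), clean(base2))
}
#var_comum <- common_names2(base_1, base_2)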

Check if everything matches

ds_full %>% 
  group_by(group) %>% 
  select(parents_total, teachers_total, 
         parents_metacog1, parents_hotindex, parents_metacog2, teachers_metacog, teachers_hotindex) %>% 
  descr()

ds_full <- ds_full %>% 
  mutate(group = factor(group, levels=c("typical", "adhd")))

cor(ds_full %>% filter(group == "typical") %>% select(sdq_1),ds$sdq_1, use = "complete.obs")
cor(ds_full %>% filter(group == "adhd") %>% select(sdq_1),ds_adhd$sdq_1, use = "complete.obs")

Remove unnecessary variables

rm(base_1, base_2)

9 Clinical Utility

lm(parents_total ~ factor(group), data = ds_full) %>% summary()
lm(teachers_total ~ factor(group), data = ds_full) %>% summary()

9.1 Plot (known-groups) - Parents

ggplot(ds_full, aes(x = parents_total, group =  group, fill = group)) +
  geom_density() +
  scale_fill_manual(values = c("#0072B250","#D55E0050"), labels = c("Typical", "ADHD")) +
   geom_segment(data = ds_full %>%
                  group_by(group) %>% 
                  summarize(mean = mean(parents_total), sd = sd(parents_total)), aes(x = mean, xend = mean, y = 0, yend = 0.022),
                size = 1, alpha=0.5, lty="21", color = "red") + 
  geom_text(data = ds_full %>%
              group_by(group) %>% 
              summarize(mean = mean(parents_total), sd = sd(parents_total)), 
            aes(x = mean, y = 0.005,label = paste0("μ = ",round(mean, 2))),position = position_stack(vjust = .5)) +
  theme_bw() +
  xlim(-10,130) +
  labs(x = "Parents score", y = "", fill = "Group")

#means per group for the labels (density_lines was not defined above; defined here)
density_lines <- ds_full %>%
  group_by(group) %>% 
  summarize(mean = mean(parents_total, na.rm = TRUE))

ggplot(ds_full, aes(x = parents_total, y = group, fill = group)) +
  geom_density_ridges(scale = 10, 
                      size = 0.25, 
                      rel_min_height = 0.03,
                    # quantile_lines = T, 
                      alpha = .8, 
                      color = "black") +
  scale_fill_manual(values = c("#0072B250","#D55E0050"), labels = c("Typical", "ADHD")) +
  theme_ridges(center = TRUE) +
  geom_text(
    data = density_lines,
    aes(x = mean, y = group, label = paste("mean = ",round(mean, 2))),
    size = 4, nudge_x = 0.03, nudge_y = 0.5) 

9.2 Plot (known-groups) - Teachers

ggplot(ds_full, aes(x = teachers_total, group =  group, fill = group)) +
  geom_density() +
  scale_fill_manual(values = c("#0072B250","#D55E0050"), labels = c("Typical", "ADHD")) +
  geom_segment(data = ds_full %>%
                 group_by(group) %>% 
                 summarize(mean = mean(teachers_total), sd = sd(teachers_total)), aes(x = mean, xend = mean, y = 0, yend = 0.025),
               size = 1, alpha=0.5, lty="21", color = "red") + 
  geom_text(data = ds_full %>%
              group_by(group) %>% 
              summarize(mean = mean(teachers_total), sd = sd(teachers_total)), 
            aes(x = mean, y = 0.005,label = paste0("μ = ",round(mean, 2))),position = position_stack(vjust = .5)) +
  theme_bw() +
  xlim(-10,120) +
  labs(x = "Teachers score", y = "", fill = "Group")

10 ROC curve

This chunk refers to the book chapter

The function below makes it possible to plot the curves and check the classification tables.

10.1 Getting the dataset ready

#get dataset
ds_full %>% 
  select(group, age_group,parents_total, teachers_total) %>% 
  mutate(diag = if_else(group == "typical",0,1)) -> dados_predict
#EFICA P
cutoff_efica_p <- cutpointr(dados_predict, parents_total, diag, 
                method = maximize_metric,  
                metric = youden)

summary(cutoff_efica_p)
## Method: maximize_metric 
## Predictor: parents_total 
## Outcome: diag 
## Direction: >= 
## 
##   AUC    n n_pos n_neg
##  0.88 3449   165  3284
## 
##  optimal_cutpoint youden  acc sensitivity specificity  tp fn  fp   tn
##                55   0.61 0.72         0.9        0.71 149 16 955 2329
## 
## Predictor summary: 
##         Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
## overall    0 14      30     44   45      59  81  113 21   0
## 0          0 13      29     43   44      57  77  113 20   0
## 1         34 47      61     73   75      88 104  111 17   0
plot(cutoff_efica_p)

#plot_metric_boot(cutoff_efica_p)

#plot_sensitivity_specificity(cp)
#plot_roc(cutoff_efica_p) + ggtitle("ROC curve for EFICA-P")
#plot_cutpointr(cp, xvar = cutpoints, yvar = sum_sens_spec, conf_lvl = 0.9)

#set.seed(123)
#opt_cut <- cutpointr(dados_predict, parents_total, diag, direction = ">=", 
#                     pos_class = 1,
#                     neg_class = 0, 
#                     method = maximize_metric, metric = youden, boot_runs = 1000)
#plot_metric(opt_cut)
#summary(opt_cut)
#https://cran.r-project.org/web/packages/cutpointr/vignettes/cutpointr.html
#EFICA T
cutoff_efica_t <- cutpointr(dados_predict, teachers_total, diag, 
                method = maximize_metric, 
                metric = youden)
summary(cutoff_efica_t)
## Method: maximize_metric 
## Predictor: teachers_total 
## Outcome: diag 
## Direction: >= 
## 
##   AUC    n n_pos n_neg
##  0.92 3449   165  3284
## 
##  optimal_cutpoint youden  acc sensitivity specificity  tp fn  fp   tn
##                45    0.7 0.81        0.89        0.81 147 18 626 2658
## 
## Predictor summary: 
##         Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
## overall    0  0       7     22   27      42  74  104 24   0
## 0          0  0       6     21   25      39  68  104 22   0
## 1          0 35      56     70   68      81  95  104 19   0
plot(cutoff_efica_t)

plot_sensitivity_specificity(cutoff_efica_t)

grid.arrange(plot_roc(cutoff_efica_p) + ggtitle("ROC curve for EFICA-P") + theme_bw(),
             plot_roc(cutoff_efica_t) + ggtitle("ROC curve for EFICA-T") + theme_bw(),
             ncol = 2, nrow = 1)
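
As a quick arithmetic check of the EFICA-P summary above (the counts come from the cut-off 55 output: tp = 149, fn = 16, fp = 955, tn = 2329):

#recompute sensitivity, specificity, and Youden from the reported counts
tp <- 149; fn <- 16; fp <- 955; tn <- 2329
sens <- tp / (tp + fn) #0.90
spec <- tn / (tn + fp) #0.71
sens + spec - 1        #Youden index, ~0.61 (matches the summary)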

Now, I’ll present the trade-off between sensitivity and specificity. In the near future, this will become a Shiny app.

plot_efica_roc_curve <- function(cutoff) {  
  
  #display table
  tabela_roc <- dados_predict %>% 
    mutate(test_result = if_else(parents_total>=cutoff,"(1)Positive","(2)Negative")) %>% 
    mutate(diag = if_else(diag==1,"ADHD","Typical")) %>% 
    group_by(test_result,diag) %>% 
    count() %>% 
    pivot_wider(names_from = diag, values_from = n) %>% 
    ungroup() %>% 
    add_row(test_result = "Sens&Spec", 
            "ADHD" = .$ADHD[1]/(.$ADHD[1]+.$ADHD[2]), 
            "Typical" = .$Typical[2]/(.$Typical[2]+.$Typical[1])) %>% 
    add_row(test_result = "Accuracy", 
            "ADHD" = (.$ADHD[1]+.$Typical[2])/(.$ADHD[1]+.$ADHD[2]+.$Typical[2]+.$Typical[1]), 
            "Typical" = 0) %>% 
    
    print(tabela_roc)
  
  #get means for plotting
  mean_typical <- dados_predict %>% filter(group == "typical") %>% summarise(mean(parents_total)) %>% pull()
  sd_typical <- dados_predict %>% filter(group == "typical") %>% summarise(sd(parents_total)) %>% pull()
  
  mean_adhd <- dados_predict %>% filter(group == "adhd") %>% summarise(mean(parents_total)) %>% pull()
  sd_adhd <- dados_predict %>% filter(group == "adhd") %>% summarise(sd(parents_total)) %>% pull()
  
  #plot
  gridExtra::grid.arrange(
  ggplot(data = data.frame(u = c(-20, 150)),
         aes(x = u)) +
    geom_density(stat = "function", 
                 alpha=0.6,
                 fun = dnorm, 
                 mapping = aes(fill='Typical'), 
                 args = list(mean = mean_typical,
                             sd = sd_typical)) +
    geom_density(stat = "function", 
                 alpha=0.6,
                 fun = dnorm, 
                 mapping = aes(fill='Clinical'), 
                 args = list(mean = mean_adhd,
                             sd = sd_adhd)) +
    geom_vline(xintercept = cutoff, size=2) +
    geom_text(y  = 0.02, x = cutoff-15, label = " - \n (no disease)") +
    geom_text(y  = 0.02, x = cutoff+15, label = " + \n (disease)") +
    geom_text(y  = 0.011, x = 130, 
              label = paste("Cut-off = ",cutoff)) + #cut-off
    geom_text(y  = 0.009, x = 130, 
              label = paste("Sensitivity = ",round(tabela_roc$ADHD[3],2))) + #that's the sensitivity
    geom_text(y  = 0.007, x = 130, label = 
            paste("Specificity =",round(tabela_roc$Typical[3],2))) + #that's the specificity
    geom_text(y  = 0.005, x = 130, label = 
                paste("Accuracy =",round(tabela_roc$ADHD[4],2))) + #that's the accuracy    
    scale_fill_grey() +
    theme_void() +
    theme(legend.position = "none") ,

  #ROC CURVE plot
 
  ggplot(dados_predict, aes(d = diag, m = parents_total)) + 
    geom_vline(xintercept = 1-(tabela_roc$Typical[3]), color = 'grey50') + #1-specificity (False positive rate)
    geom_hline(yintercept = (tabela_roc$ADHD[3]), color = 'grey50') + #Sensitivity (True positive)
    plotROC::geom_roc(cutoffs.at = cutoff,size = 0.5, size.point = 0.5) +
    plotROC::style_roc() + geom_abline(slope = 1, intercept = 0, color = 'grey50')
    #geom_text(y  = 0.1, x = 0.90, label = paste("AUC ",round(plotROC::calc_auc(groc)[3],2))) #in case I want the area under the curve
  )
}
plot_efica_roc_curve(cutoff = 55)
## # A tibble: 4 x 3
##   test_result    ADHD  Typical
##   <chr>         <dbl>    <dbl>
## 1 (1)Positive 149      955    
## 2 (2)Negative  16     2329    
## 3 Sens&Spec     0.903    0.709
## 4 Accuracy      0.718    0

11 Future work

#https://rviews.rstudio.com/2019/03/01/some-r-packages-for-roc-curves/ #http://www.biosoft.hacettepe.edu.tr/easyROC/ #great: https://kennis-research.shinyapps.io/ROC-Curves/ and http://www.navan.name/roc/

library("OptimalCutpoints")
dados_predict <- as.data.frame(dados_predict)

#compute the best cut-off
optimal.cutpoint.Youden <- optimal.cutpoints(X = "parents_total", 
                                             status = "diag", 
                                             tag.healthy = 0, 
                                             methods = "Youden", 
                                             data = dados_predict, 
                                             pop.prev = NULL,
                                             control = control.cutpoints(), 
                                             ci.fit = FALSE, 
                                             conf.level = 0.95, 
                                             trace = FALSE)
#variant: generalized Youden with confidence intervals
optimal.cutpoint.Youden <- optimal.cutpoints(X = "parents_total", 
                                             status = "diag", 
                                             tag.healthy = 0, 
                                             methods = "Youden", 
                                             data = dados_predict, 
                                             pop.prev = NULL, 
                                             control = control.cutpoints(generalized.Youden = TRUE), 
                                             ci.fit = TRUE, 
                                             conf.level = 0.95, 
                                             trace = FALSE)

#get summary
summary(optimal.cutpoint.Youden)
#print
print.optimal.cutpoints(optimal.cutpoint.Youden)

#plot
plot(optimal.cutpoint.Youden, col = "blue")
#https://rpubs.com/thisisrg/supp_code_17 -- interesting
library(pROC)
#roc_obj <- roc(response=dados_predict$diag, predictor=factor(dados_predict$prediction, ordered = TRUE), plot=TRUE) #needs the caret predictions computed further below
roc_obj <- roc(response=as.numeric(dados_predict$diag), predictor=dados_predict$parents_total, plot=TRUE)

plot(roc_obj, col="red", lwd=3, main="ROC curve QDA")
auc_qda<-auc(roc_obj)
coords(roc_obj, x="best", input="threshold", best.method="youden", transpose = TRUE) # 


#library(InformationValue)
#optCutOff <- optimalCutoff(logitMod$diag, predicted)[1] 
#http://r-statistics.co/Logistic-Regression-With-R.html
library(ROCR)
predictions <- prediction(dados_predict$parents_total, dados_predict$diag)
predictions
perf <- performance(predictions,"tpr","fpr")
plot(perf,
     avg="threshold",
     spread.estimate="boxplot")

plot(perf)

perf <- performance(predictions, "tpr", "fpr")
plot(perf,
     avg= "threshold",
     colorize=TRUE,
     lwd= 3,
     main= "With ROCR you can produce standard plots\nlike ROC curves ...")
plot(perf,
     lty=3,
     col="grey78",
     add=TRUE)

sens <- data.frame(x=unlist(performance(predictions, "sens")@x.values), 
                   y=unlist(performance(predictions, "sens")@y.values))
spec <- data.frame(x=unlist(performance(predictions, "spec")@x.values), 
                   y=unlist(performance(predictions, "spec")@y.values))

sens %>% ggplot(aes(x,y)) + 
  geom_line() + 
  geom_line(data=spec, aes(x,y,col="red")) +
  scale_y_continuous(sec.axis = sec_axis(~., name = "Specificity")) +
  labs(x='Cutoff', y="Sensitivity") +
  theme(axis.title.y.right = element_text(colour = "red"), legend.position="none") +
  geom_vline(xintercept = 61)

#https://stackoverflow.com/questions/23240182/deciding-threshold-for-glm-logistic-regression-model-in-r

sens = cbind(unlist(performance(predictions, "sens")@x.values), unlist(performance(predictions, "sens")@y.values))
spec = cbind(unlist(performance(predictions, "spec")@x.values), unlist(performance(predictions, "spec")@y.values))
sens[which.min(apply(sens, 1, function(x) min(colSums(abs(t(spec) - x))))), 1]
ds_full %>% 
  filter(age_group == "[5,6]") %>% 
  select(group, parents_total) %>% 
  mutate(diag = factor(if_else(group == "typical",0,1))) -> dados_predict

str(dados_predict)

#https://community.rstudio.com/t/how-to-choose-best-threshold-value-automatically/12910
library(caret)
controlparameter <- trainControl(method="cv",
                                 number=10,
                                 savePredictions = TRUE)

##Apply Logistic Regression with K fold Cross Validation
##train
modelglm <- train(diag ~ parents_total,data = dados_predict,
                  method="glm",
                  family="binomial",
                  trControl=controlparameter)
modelglm
## Make prediction with mymodelglm to train data
prediction <- predict(modelglm,dados_predict)
#add to dataset
dados_predict$prediction <- prediction
##Validation of Prediction
confu_mat <- table(prediction=prediction,actual=dados_predict$diag)
confu_mat
confusionMatrix(data=dados_predict$prediction, reference=dados_predict$diag, positive = "1")
devtools::install_github("sachsmc/plotROC")
#https://cran.r-project.org/web/packages/plotROC/vignettes/examples.html
library(plotROC)
shiny_plotROC()

ggplot(dados_predict, aes(d = diag, m = parents_total)) +  
  geom_roc(cutoffs.at = c(30, 55, 80)) + style_roc(theme = theme_grey)
#http://www.epeter-stats.de:3838/fbroc.serv/