Analysis - Escribo Panamá (Reading)

1. Loading the Dataset

We begin by importing the dataset, using the read_spss() function from the haven package to read the SPSS .sav file.

#import the dataset
dados <- read_spss("data/Reading dados.sav")
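
The chunks below assume the required packages are already attached. A minimal setup sketch (the exact list is an assumption inferred from the functions used in this report; hopkins() may come from the clustertend package or a successor, depending on the version used):

#packages assumed throughout this report
library(haven)       # read_spss()
library(tidyverse)   # filter(), select(), mutate(), gather(), ggplot()
library(rmarkdown)   # paged_table()
library(gtsummary)   # tbl_summary(), add_p()
library(rstatix)     # dunn_test()
library(factoextra)  # fviz_nbclust(), fviz_cluster(), fviz_dend()
library(cluster)     # pam(), clara()
library(clustertend) # hopkins()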

2. Data Pre-processing

We pre-process the data as follows: we keep only students in the 4-5 age range; only students from Panamá; we split the sample by whether the student played the reading games (experimental group) or not (control group); we keep only general characteristics, pre/post-test scores, and game-interaction variables; and finally we drop every student who missed one of the tests (pre or post).

#Pre-processing
dadosControle <- dados %>%
  filter(age >= 4 & age <= 5) %>% #Only students aged 4 to 5.
  filter(`Reading_games` == 0) %>% # Control group only (did not play the reading games).
  select(5:12, 26, 40) %>% #General characteristics and pre/post-test scores (selected by position).
  mutate(Gain_Read = POS_Read_SCORE - PRE_Read_Score) %>%  #Create the gain variable.
  drop_na() #Keep only students without NAs.

dados <- dados %>%
  filter(age >= 4 & age <= 5) %>% #Only students aged 4 to 5.
  filter(`Reading_games` == 1) %>% # Experimental group only (played the reading games).
  select(5:12, 26, 40, 79:154, 247:250) %>% #Characteristics, pre/post-test, and game-interaction variables (positional).
  mutate(Gain_Read = POS_Read_SCORE - PRE_Read_Score) %>%  #Create the gain variable.
  drop_na() #Keep only students without NAs.
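
As a quick sanity check on the result (an illustrative sketch; the bounds assume the scores are proportions in [0, 1]):

#sanity check: age window and the gain variable
stopifnot(all(dados$age %in% c(4, 5)))
summary(dados$Gain_Read) #gain = post - pre; should lie within [-1, 1]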

3. Descriptive Analysis

In this section we present contingency tables for the qualitative variables, along with association tests between them.

3.1 Dataset

Here we show the dataset used in the analyses, after pre-processing.

#paged_table() comes from the rmarkdown package (attached above)
paged_table(dados)

3.2 Contingency Tables

Each contingency table below crosses the qualitative variables with a single chosen stratifying variable. The reported p-values come from association tests (Pearson's chi-squared or Fisher's exact test for the categorical variables, and rank-based tests for the scores), selected automatically by add_p().

Overall

dados %>% 
  select( #select the variables shown in the table
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Read_Score,
    POS_Read_SCORE) %>% 
  tbl_summary(statistic = list(all_continuous() ~ "{mean} ({sd})"))
Characteristic N = 202¹
special
no 178 (88%)
not-informed 15 (7.4%)
yes 9 (4.5%)
child_school
centro-educativo-particular 42 (21%)
centro-educativo-publico 152 (75%)
otro 8 (4.0%)
child_school_grade
kinder 92 (46%)
otro 6 (3.0%)
pre-kinder 69 (34%)
primary-basica-general 35 (17%)
age
4 84 (42%)
5 118 (58%)
gender
f 87 (43%)
m 115 (57%)
PRE_Read_Score 0.93 (0.14)
POS_Read_SCORE 0.96 (0.10)

¹ n (%); Mean (SD)

Sex

dados %>% 
  select( #select the variables shown in the table
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Read_Score,
    POS_Read_SCORE) %>% 
  tbl_summary(
    by = gender,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #the by argument sets the stratifying variable
  add_p() #add p-values from the automatically chosen association tests
Characteristic f, N = 87¹ m, N = 115¹ p-value²
special 0.11
no 78 (90%) 100 (87%)
not-informed 8 (9.2%) 7 (6.1%)
yes 1 (1.1%) 8 (7.0%)
child_school 0.9
centro-educativo-particular 17 (20%) 25 (22%)
centro-educativo-publico 66 (76%) 86 (75%)
otro 4 (4.6%) 4 (3.5%)
child_school_grade 0.5
kinder 44 (51%) 48 (42%)
otro 2 (2.3%) 4 (3.5%)
pre-kinder 29 (33%) 40 (35%)
primary-basica-general 12 (14%) 23 (20%)
age 0.8
4 37 (43%) 47 (41%)
5 50 (57%) 68 (59%)
PRE_Read_Score 0.97 (0.06) 0.91 (0.18) 0.052
POS_Read_SCORE 0.95 (0.12) 0.97 (0.09) 0.3

¹ n (%); Mean (SD)

² Fisher's exact test; Pearson's Chi-squared test; Wilcoxon rank sum test

Special

dados %>% 
  select( #select the variables shown in the table
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Read_Score,
    POS_Read_SCORE) %>% 
  tbl_summary(
    by = special,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #the by argument sets the stratifying variable
  add_p() #add p-values from the automatically chosen association tests
Characteristic no, N = 178¹ not-informed, N = 15¹ yes, N = 9¹ p-value²
child_school 0.8
centro-educativo-particular 37 (21%) 2 (13%) 3 (33%)
centro-educativo-publico 133 (75%) 13 (87%) 6 (67%)
otro 8 (4.5%) 0 (0%) 0 (0%)
child_school_grade 0.6
kinder 84 (47%) 5 (33%) 3 (33%)
otro 5 (2.8%) 1 (6.7%) 0 (0%)
pre-kinder 58 (33%) 7 (47%) 4 (44%)
primary-basica-general 31 (17%) 2 (13%) 2 (22%)
age 0.7
4 72 (40%) 7 (47%) 5 (56%)
5 106 (60%) 8 (53%) 4 (44%)
gender 0.11
f 78 (44%) 8 (53%) 1 (11%)
m 100 (56%) 7 (47%) 8 (89%)
PRE_Read_Score 0.94 (0.13) 0.85 (0.25) 0.94 (0.07) 0.2
POS_Read_SCORE 0.96 (0.11) 0.98 (0.03) 0.99 (0.02) 0.7

¹ n (%); Mean (SD)

² Fisher's exact test; Kruskal-Wallis rank sum test

School Type

dados %>% 
  select( #select the variables shown in the table
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Read_Score,
    POS_Read_SCORE) %>% 
  tbl_summary(
    by = child_school,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #the by argument sets the stratifying variable
  add_p() #add p-values from the automatically chosen association tests
Characteristic centro-educativo-particular, N = 42¹ centro-educativo-publico, N = 152¹ otro, N = 8¹ p-value²
special 0.8
no 37 (88%) 133 (88%) 8 (100%)
not-informed 2 (4.8%) 13 (8.6%) 0 (0%)
yes 3 (7.1%) 6 (3.9%) 0 (0%)
child_school_grade 0.3
kinder 16 (38%) 72 (47%) 4 (50%)
otro 0 (0%) 5 (3.3%) 1 (12%)
pre-kinder 16 (38%) 50 (33%) 3 (38%)
primary-basica-general 10 (24%) 25 (16%) 0 (0%)
age 0.10
4 23 (55%) 57 (38%) 4 (50%)
5 19 (45%) 95 (62%) 4 (50%)
gender 0.9
f 17 (40%) 66 (43%) 4 (50%)
m 25 (60%) 86 (57%) 4 (50%)
PRE_Read_Score 0.93 (0.17) 0.93 (0.14) 0.94 (0.10) 0.8
POS_Read_SCORE 0.97 (0.05) 0.96 (0.11) 0.96 (0.05) 0.6

¹ n (%); Mean (SD)

² Fisher's exact test; Kruskal-Wallis rank sum test

Grade

dados %>% 
  select( #select the variables shown in the table
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Read_Score,
    POS_Read_SCORE) %>% 
  tbl_summary(
    by = child_school_grade,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #the by argument sets the stratifying variable
  add_p() #add p-values from the automatically chosen association tests
Characteristic kinder, N = 92¹ otro, N = 6¹ pre-kinder, N = 69¹ primary-basica-general, N = 35¹ p-value²
special 0.6
no 84 (91%) 5 (83%) 58 (84%) 31 (89%)
not-informed 5 (5.4%) 1 (17%) 7 (10%) 2 (5.7%)
yes 3 (3.3%) 0 (0%) 4 (5.8%) 2 (5.7%)
child_school 0.3
centro-educativo-particular 16 (17%) 0 (0%) 16 (23%) 10 (29%)
centro-educativo-publico 72 (78%) 5 (83%) 50 (72%) 25 (71%)
otro 4 (4.3%) 1 (17%) 3 (4.3%) 0 (0%)
age <0.001
4 16 (17%) 3 (50%) 65 (94%) 0 (0%)
5 76 (83%) 3 (50%) 4 (5.8%) 35 (100%)
gender 0.5
f 44 (48%) 2 (33%) 29 (42%) 12 (34%)
m 48 (52%) 4 (67%) 40 (58%) 23 (66%)
PRE_Read_Score 0.95 (0.13) 0.93 (0.09) 0.91 (0.17) 0.94 (0.14) 0.14
POS_Read_SCORE 0.97 (0.11) 0.97 (0.05) 0.95 (0.11) 0.97 (0.05) 0.8

¹ n (%); Mean (SD)

² Fisher's exact test; Kruskal-Wallis rank sum test

Age

dados %>% 
  select( #select the variables shown in the table
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Read_Score,
    POS_Read_SCORE) %>% 
  tbl_summary(
    by = age,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #the by argument sets the stratifying variable
  add_p() #add p-values from the automatically chosen association tests
Characteristic 4, N = 84¹ 5, N = 118¹ p-value²
special 0.7
no 72 (86%) 106 (90%)
not-informed 7 (8.3%) 8 (6.8%)
yes 5 (6.0%) 4 (3.4%)
child_school 0.10
centro-educativo-particular 23 (27%) 19 (16%)
centro-educativo-publico 57 (68%) 95 (81%)
otro 4 (4.8%) 4 (3.4%)
child_school_grade <0.001
kinder 16 (19%) 76 (64%)
otro 3 (3.6%) 3 (2.5%)
pre-kinder 65 (77%) 4 (3.4%)
primary-basica-general 0 (0%) 35 (30%)
gender 0.8
f 37 (44%) 50 (42%)
m 47 (56%) 68 (58%)
PRE_Read_Score 0.91 (0.17) 0.95 (0.12) 0.011
POS_Read_SCORE 0.96 (0.10) 0.96 (0.10) 0.6

¹ n (%); Mean (SD)

² Fisher's exact test; Pearson's Chi-squared test; Wilcoxon rank sum test

3.3 Density Analysis of the Pre- and Post-test

Graphical analysis using density plots to compare pre- and post-test scores across the characteristics: overall, age, sex, school, and grade.

Overall

Plotting the density of the pre- and post-test scores. Note that xlim(0.75, 1) drops observations below 0.75 before the density is estimated (ggplot2 removes, rather than crops, out-of-range values).

#density plot
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1)

Age

Plotting the density of the pre- and post-test scores by student age.

dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~age,1)

Sex

Plotting the density of the pre- and post-test scores by sex.

#density plot by sex
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~gender,1)

School

Plotting the density of the pre- and post-test scores by school type.

#density plot by school type
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~child_school,1)

Grade

Plotting the density of the pre- and post-test scores by student grade.

#density plot by grade
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~child_school_grade,1)

3.4 Boxplot Analysis of the Pre- and Post-test

Graphical analysis using boxplots to compare pre- and post-test scores across the characteristics: overall, age, sex, school, and grade.

Overall

Plotting boxplots of the pre- and post-test scores.

#overall boxplot
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  coord_flip() +
  xlim(0.75,1)

Age

Plotting boxplots of the pre- and post-test scores by student age.

#boxplot by age
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~age, 1)

Sex

Plotting boxplots of the pre- and post-test scores by sex.

#boxplot by sex
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~gender,1)

School

Plotting boxplots of the pre- and post-test scores by school type.

#boxplot by school type
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~child_school,1)

Grade

Plotting boxplots of the pre- and post-test scores by student grade.

#boxplot by grade
dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~child_school_grade,1)

4. Cluster Analysis

4.1 Assessing Clustering Tendency

Hopkins Statistic Method

Under the convention of clustertend-style hopkins() (an assumption consistent with the $H output below), values near 0.5 indicate spatially uniform data, while values well below 0.5 suggest clusterable data; every variable subset below scores roughly 0.09 to 0.14.

V

dadosCluster <- dados %>% 
  select(ends_with("_V")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1368253
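
As a reference point, the same statistic computed on structure-free data of the same shape should land near 0.5. A hedged baseline sketch (the uniform-noise comparison is ours, not part of the original analysis):

#baseline: Hopkins H for uniform random data with the same dimensions
set.seed(123)
ref <- matrix(runif(length(dadosCluster)), nrow = nrow(dadosCluster))
hopkins(ref, n = nrow(ref) - 1) #expect H close to 0.5 (no cluster structure)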

R

dadosCluster <- dados %>% 
  select(ends_with("_R")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1026321

W

dadosCluster <- dados %>% 
  select(ends_with("_W")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1171089

C

dadosCluster <- dados %>% 
  select(ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.08886933

V + R

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1230036

V + W

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_W")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1405325

V + C

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1183648

R + W

dadosCluster <- dados %>% 
  select(ends_with("_R"), ends_with("_W")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1176674

R + C

dadosCluster <- dados %>% 
  select(ends_with("_R"), ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.09787017

W + C

dadosCluster <- dados %>% 
  select(ends_with("_W"), ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1111217

V + R + W

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1172034

V + R + C

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1112065

V + W + C

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_W"), ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1261339

R + W + C

dadosCluster <- dados %>% 
  select(ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1114106

V + R + W + C

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale() #scale the columns
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)
## $H
## [1] 0.1230248
dadosCluster <- dados %>% 
  select(ends_with("_C")) %>% #keep only the _C variables for the clustering below
  scale() #scale the columns

4.2 K-means Clustering

4.2.1 Estimating the optimal number of clusters

WSS Method

fviz_nbclust(dadosCluster, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

Silhouette Method

fviz_nbclust(dadosCluster, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

Gap Statistic Method

fviz_nbclust(dadosCluster, kmeans, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

4.2.2 Computing and Visualizing k-means clustering

K = 2

set.seed(123)
km.res2 <- kmeans(dadosCluster, 2, nstart = 25)

fviz_cluster(km.res2, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())
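
Beyond the plot, the partition can be inspected numerically through the standard components of the kmeans object:

km.res2$size    #number of observations per cluster
km.res2$centers #cluster centroids in the scaled feature space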

K = 3

set.seed(123)
km.res3 <- kmeans(dadosCluster, 3, nstart = 25)

fviz_cluster(km.res3, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())

K = 4

set.seed(123)
km.res4 <- kmeans(dadosCluster, 4, nstart = 25)

fviz_cluster(km.res4, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())

K = 5

set.seed(123)
km.res5 <- kmeans(dadosCluster, 5, nstart = 25)

fviz_cluster(km.res5, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#0000CD"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())

4.2.3 Kruskal-Wallis Test

K = 2

dadosKruskal_k2 <- cbind(dados, Cluster = km.res2$cluster)

dadosKruskal_k2 <- dadosKruskal_k2 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 0.78432, df = 1, p-value = 0.3758
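
A quick look at the group summaries behind the test (a sketch using dplyr, which the pipelines above already rely on):

#per-cluster sample size and median gain
dadosKruskal_k2 %>% 
  group_by(Cluster) %>% 
  summarise(n = n(), median_gain = median(Gain_Read))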

K = 3

dadosKruskal_k3 <- cbind(dados, Cluster = km.res3$cluster)

dadosKruskal_k3 <- dadosKruskal_k3 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 1.1736, df = 2, p-value = 0.5561

K = 4

dadosKruskal_k4 <- cbind(dados, Cluster = km.res4$cluster)

dadosKruskal_k4 <- dadosKruskal_k4 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k4)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 4.9958, df = 3, p-value = 0.1721

K = 5

dadosKruskal_k5 <- cbind(dados, Cluster = km.res5$cluster)

dadosKruskal_k5 <- dadosKruskal_k5 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k5)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 7.5359, df = 4, p-value = 0.1101

4.2.4 Dunn's Test with Adjusted p-values

K = 2

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k2, p.adjust.method = "bonferroni")
## # A tibble: 1 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2        201     1    -0.886 0.376 0.376 ns

K = 3

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k3, p.adjust.method = "bonferroni")
## # A tibble: 3 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2         13     1    -0.694 0.488     1 ns          
## 2 Gain_Read 1      3         13   188     0.624 0.533     1 ns          
## 3 Gain_Read 2      3          1   188     0.897 0.370     1 ns

K = 4

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k4, p.adjust.method = "bonferroni")
## # A tibble: 6 × 9
##   .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2          5    34     1.78  0.0751 0.451 ns          
## 2 Gain_Read 1      3          5   162     1.24  0.217  1     ns          
## 3 Gain_Read 1      4          5     1    -0.266 0.790  1     ns          
## 4 Gain_Read 2      3         34   162    -1.55  0.122  0.733 ns          
## 5 Gain_Read 2      4         34     1    -1.13  0.260  1     ns          
## 6 Gain_Read 3      4        162     1    -0.850 0.395  1     ns

K = 5

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k5, p.adjust.method = "bonferroni")
## # A tibble: 10 × 9
##    .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
##  * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
##  1 Gain_Read 1      2        148    40     2.21  0.0274 0.274 ns          
##  2 Gain_Read 1      3        148     5    -1.15  0.249  1     ns          
##  3 Gain_Read 1      4        148     1    -0.813 0.416  1     ns          
##  4 Gain_Read 1      5        148     8     0.476 0.634  1     ns          
##  5 Gain_Read 2      3         40     5    -1.93  0.0531 0.531 ns          
##  6 Gain_Read 2      4         40     1    -1.19  0.232  1     ns          
##  7 Gain_Read 2      5         40     8    -0.569 0.569  1     ns          
##  8 Gain_Read 3      4          5     1    -0.266 0.790  1     ns          
##  9 Gain_Read 3      5          5     8     1.22  0.222  1     ns          
## 10 Gain_Read 4      5          1     8     0.932 0.351  1     ns

4.3 K-Medoids

4.3.1 Estimating the optimal number of clusters

WSS Method

fviz_nbclust(dadosCluster, pam, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

Silhouette Method

fviz_nbclust(dadosCluster, pam, method = "silhouette") +
  labs(subtitle = "Silhouette method")

Gap Statistic Method

fviz_nbclust(dadosCluster, pam, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

4.3.2 Computing and Visualizing PAM clustering

K = 2

set.seed(123)
pam.res2 <- pam(dadosCluster, 2)

fviz_cluster(pam.res2, 
             palette = c("#00AFBB", "#FC4E07"), # color palette
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())
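
Unlike k-means centroids, PAM medoids are actual observations, so the cluster prototypes can be read off directly:

pam.res2$medoids #one representative (scaled) row per cluster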

K = 3

set.seed(123)
pam.res3 <- pam(dadosCluster, 3)

fviz_cluster(pam.res3, 
             palette = c("#2E9FDF", "#00AFBB", "#FC4E07"), # color palette
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

K = 4

set.seed(123)
pam.res4 <- pam(dadosCluster, 4)

fviz_cluster(pam.res4, 
             palette = c("#2E9FDF", "#E7B800", "#00AFBB", "#FC4E07"), # color palette
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

K = 5

set.seed(123)
pam.res5 <- pam(dadosCluster, 5)


fviz_cluster(pam.res5,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#0000CD"),
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

4.3.3 Kruskal-Wallis Test

K = 2

dadosKruskal_k2 <- cbind(dados, Cluster = pam.res2$cluster)

dadosKruskal_k2 <- dadosKruskal_k2 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 2.9914, df = 1, p-value = 0.08371

K = 3

dadosKruskal_k3 <- cbind(dados, Cluster = pam.res3$cluster)

dadosKruskal_k3 <- dadosKruskal_k3 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 4.2377, df = 2, p-value = 0.1202

K = 4

dadosKruskal_k4 <- cbind(dados, Cluster = pam.res4$cluster)

dadosKruskal_k4 <- dadosKruskal_k4 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k4)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 8.1659, df = 3, p-value = 0.0427
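
Since this partition crosses the 0.05 threshold, a reasonable follow-up is an effect-size estimate. A sketch using kruskal_effsize() from rstatix (assuming the package is attached; it reports an eta-squared based on the H statistic):

kruskal_effsize(Gain_Read ~ Cluster, data = dadosKruskal_k4)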

K = 5

dadosKruskal_k5 <- cbind(dados, Cluster = pam.res5$cluster)

dadosKruskal_k5 <- dadosKruskal_k5 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k5)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 6.0918, df = 4, p-value = 0.1924

4.3.4 Dunn's Test with Adjusted p-values

K = 2

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k2, p.adjust.method = "bonferroni")
## # A tibble: 1 × 9
##   .y.       group1 group2    n1    n2 statistic      p  p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl>  <dbl> <chr>       
## 1 Gain_Read 1      2        155    47      1.73 0.0837 0.0837 ns

K = 3

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k3, p.adjust.method = "bonferroni")
## # A tibble: 3 × 9
##   .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2        155    46     1.86  0.0631 0.189 ns          
## 2 Gain_Read 1      3        155     1    -0.814 0.416  1     ns          
## 3 Gain_Read 2      3         46     1    -1.12  0.264  0.793 ns

K = 4

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k4, p.adjust.method = "bonferroni")
## # A tibble: 6 × 9
##   .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2        155    41     2.36  0.0184 0.110 ns          
## 2 Gain_Read 1      3        155     1    -0.814 0.416  1     ns          
## 3 Gain_Read 1      4        155     5    -1.16  0.248  1     ns          
## 4 Gain_Read 2      3         41     1    -1.22  0.224  1     ns          
## 5 Gain_Read 2      4         41     5    -1.98  0.0475 0.285 ns          
## 6 Gain_Read 3      4          1     5     0.266 0.790  1     ns

K = 5

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k5, p.adjust.method = "bonferroni")
## # A tibble: 10 × 9
##    .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
##  * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
##  1 Gain_Read 1      2        102    38     1.86  0.0632 0.632 ns          
##  2 Gain_Read 1      3        102    56     0.403 0.687  1     ns          
##  3 Gain_Read 1      4        102     1    -0.811 0.417  1     ns          
##  4 Gain_Read 1      5        102     5    -1.14  0.253  1     ns          
##  5 Gain_Read 2      3         38    56    -1.36  0.174  1     ns          
##  6 Gain_Read 2      4         38     1    -1.15  0.249  1     ns          
##  7 Gain_Read 2      5         38     5    -1.84  0.0653 0.653 ns          
##  8 Gain_Read 3      4         56     1    -0.875 0.382  1     ns          
##  9 Gain_Read 3      5         56     5    -1.27  0.206  1     ns          
## 10 Gain_Read 4      5          1     5     0.266 0.790  1     ns

4.4 CLARA - Clustering Large Applications

4.4.1 Estimating the optimal number of clusters

WSS Method

fviz_nbclust(dadosCluster, clara, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

Silhouette Method

fviz_nbclust(dadosCluster, clara, method = "silhouette") +
  labs(subtitle = "Silhouette method")

Gap Statistic Method

fviz_nbclust(dadosCluster, clara, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

4.4.2 Computing and Visualizing CLARA clustering

K = 2

set.seed(123)
clara.res2 <- clara(dadosCluster, 2, samples = 50, pamLike = TRUE)

fviz_cluster(clara.res2,
             palette = c("#00AFBB", "#FC4E07"), # color palette
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())
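
Per-cluster diagnostics for the CLARA solution are available in the clusinfo component of the returned object:

clara.res2$clusinfo #size, max/average dissimilarity, and isolation per cluster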

K = 3

set.seed(123)
clara.res3 <- clara(dadosCluster, 3, samples = 50, pamLike = TRUE)

fviz_cluster(clara.res3,
             palette = c("#2E9FDF", "#00AFBB", "#FC4E07"), # color palette
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())

K = 4

set.seed(123)
clara.res4 <- clara(dadosCluster, 4, samples = 50, pamLike = TRUE)

fviz_cluster(clara.res4,
             palette = c("#2E9FDF", "#E7B800", "#00AFBB", "#FC4E07"), # color palette
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())

K = 5

set.seed(123)
clara.res5 <- clara(dadosCluster, 5, samples = 50, pamLike = TRUE)


fviz_cluster(clara.res5,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#0000CD"),
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())

4.4.3 Kruskal-Wallis Test

K = 2

dadosKruskal_k2 <- cbind(dados, Cluster = clara.res2$cluster)

dadosKruskal_k2 <- dadosKruskal_k2 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 0.78432, df = 1, p-value = 0.3758

K = 3

dadosKruskal_k3 <- cbind(dados, Cluster = clara.res3$cluster)

dadosKruskal_k3 <- dadosKruskal_k3 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 4.2377, df = 2, p-value = 0.1202

K = 4

dadosKruskal_k4 <- cbind(dados, Cluster = clara.res4$cluster)

dadosKruskal_k4 <- dadosKruskal_k4 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k4)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 8.1659, df = 3, p-value = 0.0427

K = 5

dadosKruskal_k5 <- cbind(dados, Cluster = clara.res5$cluster)

dadosKruskal_k5 <- dadosKruskal_k5 %>% 
  select(Cluster, Gain_Read)

kruskal.test(Gain_Read ~ Cluster, data = dadosKruskal_k5)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Read by Cluster
## Kruskal-Wallis chi-squared = 10.235, df = 4, p-value = 0.03665

4.4.4 Dunn's Test with Adjusted p-values

K = 2

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k2, p.adjust.method = "bonferroni")
## # A tibble: 1 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2        201     1    -0.886 0.376 0.376 ns

K = 3

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k3, p.adjust.method = "bonferroni")
## # A tibble: 3 × 9
##   .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2        155    46     1.86  0.0631 0.189 ns          
## 2 Gain_Read 1      3        155     1    -0.814 0.416  1     ns          
## 3 Gain_Read 2      3         46     1    -1.12  0.264  0.793 ns

K = 4

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k4, p.adjust.method = "bonferroni")
## # A tibble: 6 × 9
##   .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
## 1 Gain_Read 1      2        155    41     2.36  0.0184 0.110 ns          
## 2 Gain_Read 1      3        155     1    -0.814 0.416  1     ns          
## 3 Gain_Read 1      4        155     5    -1.16  0.248  1     ns          
## 4 Gain_Read 2      3         41     1    -1.22  0.224  1     ns          
## 5 Gain_Read 2      4         41     5    -1.98  0.0475 0.285 ns          
## 6 Gain_Read 3      4          1     5     0.266 0.790  1     ns

K = 5

dunn_test(Gain_Read ~ Cluster, data = dadosKruskal_k5, p.adjust.method = "bonferroni")
## # A tibble: 10 × 9
##    .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
##  * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
##  1 Gain_Read 1      2        155    41    2.36   0.0184 0.184 ns          
##  2 Gain_Read 1      3        155     1   -0.814  0.416  1     ns          
##  3 Gain_Read 1      4        155     4   -1.67   0.0946 0.946 ns          
##  4 Gain_Read 1      5        155     1    0.759  0.448  1     ns          
##  5 Gain_Read 2      3         41     1   -1.22   0.224  1     ns          
##  6 Gain_Read 2      4         41     4   -2.41   0.0161 0.161 ns          
##  7 Gain_Read 2      5         41     1    0.344  0.731  1     ns          
##  8 Gain_Read 3      4          1     4   -0.0269 0.979  1     ns          
##  9 Gain_Read 3      5          1     1    1.12   0.264  1     ns          
## 10 Gain_Read 4      5          4     1    1.44   0.150  1     ns

4.5 Agglomerative Clustering

4.5.1 Dendrogram

Euclidean Distance Method

res.dist <- dist(dadosCluster, method = "euclidean")

res.hc <- hclust(d = res.dist, method = "ward.D2")

fviz_dend(res.hc, k = 3, # Cut in three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE, # Add rectangle around groups
          rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
          rect_fill = TRUE)
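
To use the dendrogram groups downstream, cut the tree at the same k with base R's cutree() (a sketch; the groups are not used elsewhere in this report):

grp <- cutree(res.hc, k = 3)
table(grp) #cluster sizes for the three-group cut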

Manhattan Distance Method

res.dist <- dist(dadosCluster, method = "manhattan")

res.hc <- hclust(d = res.dist, method = "ward.D2")

fviz_dend(res.hc, k = 3, # Cut in three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE, # Add rectangle around groups
          rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
          rect_fill = TRUE)

Maximum Distance Method

res.dist <- dist(dadosCluster, method = "maximum")

res.hc <- hclust(d = res.dist, method = "ward.D2")

fviz_dend(res.hc, k = 3, # Cut in three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE, # Add rectangle around groups
          rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
          rect_fill = TRUE)

5. Exploring the Clusters

5.1 Adding the Cluster Variable to the Dataset

##Add the cluster column to the experimental-group data
#we keep the PAM k = 4 partition (its Kruskal-Wallis test on Gain_Read gave p = 0.043)
dados <- cbind(dados, Cluster = pam.res4$clustering)
dados$Cluster <- 
  factor(dados$Cluster, labels = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4"), levels = c(1,2,3,4))
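
A quick check that the labeling worked and that the sizes match the PAM output (sketch):

table(dados$Cluster)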

5.2 Variables by Cluster

dados %>% 
  select( #select the variables shown in the table
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Read_Score,
    POS_Read_SCORE,
    Cluster) %>% 
  tbl_summary(by = Cluster,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #the by argument sets the stratifying variable
  add_p() #add p-values from the automatically chosen association tests
Characteristic Cluster 1, N = 155¹ Cluster 2, N = 41¹ Cluster 3, N = 1¹ Cluster 4, N = 5¹ p-value²
special 0.3
no 140 (90%) 32 (78%) 1 (100%) 5 (100%)
not-informed 9 (5.8%) 6 (15%) 0 (0%) 0 (0%)
yes 6 (3.9%) 3 (7.3%) 0 (0%) 0 (0%)
child_school >0.9
centro-educativo-particular 34 (22%) 7 (17%) 0 (0%) 1 (20%)
centro-educativo-publico 115 (74%) 32 (78%) 1 (100%) 4 (80%)
otro 6 (3.9%) 2 (4.9%) 0 (0%) 0 (0%)
child_school_grade 0.6
kinder 69 (45%) 20 (49%) 0 (0%) 3 (60%)
otro 6 (3.9%) 0 (0%) 0 (0%) 0 (0%)
pre-kinder 51 (33%) 16 (39%) 0 (0%) 2 (40%)
primary-basica-general 29 (19%) 5 (12%) 1 (100%) 0 (0%)
age 0.8
4 63 (41%) 18 (44%) 0 (0%) 3 (60%)
5 92 (59%) 23 (56%) 1 (100%) 2 (40%)
gender 0.4
f 70 (45%) 14 (34%) 0 (0%) 3 (60%)
m 85 (55%) 27 (66%) 1 (100%) 2 (40%)
PRE_Read_Score 0.95 (0.12) 0.87 (0.20) 1.00 (NA) 0.96 (0.04) 0.008
POS_Read_SCORE 0.96 (0.11) 0.96 (0.07) 0.99 (NA) 0.95 (0.06) 0.2

¹ n (%); Mean (SD)

² Fisher's exact test; Kruskal-Wallis rank sum test

5.3 Density Plots by Cluster

Plotting the density of the pre- and post-test scores by cluster.

dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~Cluster,1)

5.4 Boxplots by Cluster

Plotting boxplots of the pre- and post-test scores by cluster.

dados %>% 
  gather(PRE_Read_Score,
         POS_Read_SCORE,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  coord_flip() +
  facet_wrap(~Cluster,1)