Análise - Escribo Panamá (Matemática)

1. Carregamento da base de dados

Aqui iniciamos a importação da base de dados onde foi necessário utilizar o pacote read_spss para realizar a importação no formato .sav do SPSS.

#importando a base de dados
dados <- read_spss("data/Math dados.sav")

2. Pré-processamento dos dados

Realizamos um pré processamento: limitando apenas alunos em um intervalo de idade; apenas alunos do Panamá; alunos que jogaram os games de matemática (grupo experimental); Apenas características gerais, pré-pós-teste e interação com os jogos e por fim retiramos todos os alunos que deixaram de fazer um dos testes (pré ou pós)

#Pré-processamento
dados <- dados %>%
  filter(age >= 4 & age <= 7) %>% #Apenas alunos entre 4 à 7 anos.
  filter(`@1_Games` == 1) %>% # Apenas grupo experimental.
  select(3:12, 56, 74, 159:238) %>%
  mutate(Gain_Math = POS_Math_Score - PRE_Math_Score) %>%  #Criação da variável de ganho.
  drop_na() #Apenas estudantes sem NA's.

3. Análise Descritiva

Nesta seção mostraremos as tabela de contingência para as variáveis qualitativas, bem como testes de relação entre as variáveis….

3.1 Base de dados

Aqui apresentamos a base de dados utilizada para as análises após o pré-processamento dos dados

#library(rmarkdown)
paged_table(dados)

3.2 Tabela de contingência

Nesta tabela de contingência fizemos uma dupla entrada entre as variáveis qualitativas cruzando com uma única variável escolhida. É apresentado os valores de p-valor referente ao teste de relação de qui-quadrado.

Geral

dados %>% 
  select( #aqui ta selecionando todas as variáveis na tabela
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Math_Score,
    POS_Math_Score) %>% 
  tbl_summary(statistic = list(all_continuous() ~ "{mean} ({sd})"))

Characteristic	N = 692¹
special
no	623 (90%)
not-informed	50 (7.2%)
yes	19 (2.7%)
child_school
centro-educativo-particular	135 (20%)
centro-educativo-publico	544 (79%)
otro	13 (1.9%)
child_school_grade
kinder	167 (24%)
otro	44 (6.4%)
pre-kinder	72 (10%)
primary-basica-general	210 (30%)
secondary-basica-general	199 (29%)
age
4	94 (14%)
5	188 (27%)
6	208 (30%)
7	202 (29%)
gender
f	349 (50%)
m	343 (50%)
PRE_Math_Score	0.96 (0.07)
POS_Math_Score	0.98 (0.04)
¹ n (%); Mean (SD)

Sexo

dados %>% 
  select( #aqui ta selecionando todas as variáveis na tabela
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Math_Score,
    POS_Math_Score) %>% 
  tbl_summary(
    by = gender,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #o by informa a varilavel do cruzamento
  add_p() #adiciona os p-valores dos testes de qui-quadrado

Characteristic	f, N = 349¹	m, N = 343¹	p-value²
special			0.014
no	324 (93%)	299 (87%)
not-informed	21 (6.0%)	29 (8.5%)
yes	4 (1.1%)	15 (4.4%)
child_school			0.5
centro-educativo-particular	64 (18%)	71 (21%)
centro-educativo-publico	280 (80%)	264 (77%)
otro	5 (1.4%)	8 (2.3%)
child_school_grade			0.3
kinder	86 (25%)	81 (24%)
otro	23 (6.6%)	21 (6.1%)
pre-kinder	36 (10%)	36 (10%)
primary-basica-general	94 (27%)	116 (34%)
secondary-basica-general	110 (32%)	89 (26%)
age			0.077
4	49 (14%)	45 (13%)
5	90 (26%)	98 (29%)
6	94 (27%)	114 (33%)
7	116 (33%)	86 (25%)
PRE_Math_Score	0.96 (0.08)	0.96 (0.06)	0.2
POS_Math_Score	0.97 (0.05)	0.98 (0.04)	>0.9
¹ n (%); Mean (SD) ² Pearson's Chi-squared test; Wilcoxon rank sum test

Especial

dados %>% 
  select( #aqui ta selecionando todas as variáveis na tabela
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Math_Score,
    POS_Math_Score) %>% 
  tbl_summary(
    by = special,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #o by informa a varilavel do cruzamento
  add_p() #adiciona os p-valores dos testes de qui-quadrado

Characteristic	no, N = 623¹	not-informed, N = 50¹	yes, N = 19¹	p-value²
child_school				0.12
centro-educativo-particular	123 (20%)	8 (16%)	4 (21%)
centro-educativo-publico	491 (79%)	39 (78%)	14 (74%)
otro	9 (1.4%)	3 (6.0%)	1 (5.3%)
child_school_grade
kinder	148 (24%)	13 (26%)	6 (32%)
otro	41 (6.6%)	1 (2.0%)	2 (11%)
pre-kinder	61 (9.8%)	9 (18%)	2 (11%)
primary-basica-general	191 (31%)	16 (32%)	3 (16%)
secondary-basica-general	182 (29%)	11 (22%)	6 (32%)
age				0.2
4	82 (13%)	9 (18%)	3 (16%)
5	163 (26%)	16 (32%)	9 (47%)
6	192 (31%)	14 (28%)	2 (11%)
7	186 (30%)	11 (22%)	5 (26%)
gender				0.014
f	324 (52%)	21 (42%)	4 (21%)
m	299 (48%)	29 (58%)	15 (79%)
PRE_Math_Score	0.96 (0.07)	0.96 (0.06)	0.95 (0.07)	0.057
POS_Math_Score	0.98 (0.05)	0.98 (0.03)	0.98 (0.05)	>0.9
¹ n (%); Mean (SD) ² Fisher's exact test; Pearson's Chi-squared test; Kruskal-Wallis rank sum test

Tipo Escola

dados %>% 
  select( #aqui ta selecionando todas as variáveis na tabela
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Math_Score,
    POS_Math_Score) %>% 
  tbl_summary(
    by = child_school,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #o by informa a varilavel do cruzamento
  add_p() #adiciona os p-valores dos testes de qui-quadrado

Characteristic	centro-educativo-particular, N = 135¹	centro-educativo-publico, N = 544¹	otro, N = 13¹	p-value²
special				0.12
no	123 (91%)	491 (90%)	9 (69%)
not-informed	8 (5.9%)	39 (7.2%)	3 (23%)
yes	4 (3.0%)	14 (2.6%)	1 (7.7%)
child_school_grade
kinder	35 (26%)	129 (24%)	3 (23%)
otro	10 (7.4%)	32 (5.9%)	2 (15%)
pre-kinder	17 (13%)	51 (9.4%)	4 (31%)
primary-basica-general	39 (29%)	169 (31%)	2 (15%)
secondary-basica-general	34 (25%)	163 (30%)	2 (15%)
age
4	24 (18%)	64 (12%)	6 (46%)
5	39 (29%)	146 (27%)	3 (23%)
6	34 (25%)	172 (32%)	2 (15%)
7	38 (28%)	162 (30%)	2 (15%)
gender				0.5
f	64 (47%)	280 (51%)	5 (38%)
m	71 (53%)	264 (49%)	8 (62%)
PRE_Math_Score	0.96 (0.06)	0.96 (0.08)	0.93 (0.06)	0.017
POS_Math_Score	0.97 (0.05)	0.98 (0.04)	0.99 (0.02)	0.4
¹ n (%); Mean (SD) ² Fisher's exact test; Pearson's Chi-squared test; Kruskal-Wallis rank sum test

Série

dados %>% 
  select( #aqui ta selecionando todas as variáveis na tabela
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Math_Score,
    POS_Math_Score) %>% 
  tbl_summary(
    by = child_school_grade,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #o by informa a varilavel do cruzamento
  add_p() #adiciona os p-valores dos testes de qui-quadrado

Characteristic	kinder, N = 167¹	otro, N = 44¹	pre-kinder, N = 72¹	primary-basica-general, N = 210¹	secondary-basica-general, N = 199¹	p-value²
special
no	148 (89%)	41 (93%)	61 (85%)	191 (91%)	182 (91%)
not-informed	13 (7.8%)	1 (2.3%)	9 (12%)	16 (7.6%)	11 (5.5%)
yes	6 (3.6%)	2 (4.5%)	2 (2.8%)	3 (1.4%)	6 (3.0%)
child_school
centro-educativo-particular	35 (21%)	10 (23%)	17 (24%)	39 (19%)	34 (17%)
centro-educativo-publico	129 (77%)	32 (73%)	51 (71%)	169 (80%)	163 (82%)
otro	3 (1.8%)	2 (4.5%)	4 (5.6%)	2 (1.0%)	2 (1.0%)
age						<0.001
4	27 (16%)	2 (4.5%)	65 (90%)	0 (0%)	0 (0%)
5	140 (84%)	4 (9.1%)	6 (8.3%)	38 (18%)	0 (0%)
6	0 (0%)	0 (0%)	1 (1.4%)	168 (80%)	39 (20%)
7	0 (0%)	38 (86%)	0 (0%)	4 (1.9%)	160 (80%)
gender						0.3
f	86 (51%)	23 (52%)	36 (50%)	94 (45%)	110 (55%)
m	81 (49%)	21 (48%)	36 (50%)	116 (55%)	89 (45%)
PRE_Math_Score	0.95 (0.10)	0.97 (0.04)	0.94 (0.07)	0.97 (0.06)	0.97 (0.06)	0.003
POS_Math_Score	0.97 (0.04)	0.97 (0.06)	0.98 (0.04)	0.98 (0.04)	0.98 (0.05)	0.4
¹ n (%); Mean (SD) ² Pearson's Chi-squared test; Kruskal-Wallis rank sum test

Idade

dados %>% 
  select( #aqui ta selecionando todas as variáveis na tabela
    special,
    child_school,
    child_school_grade,
    age,
    gender,
    PRE_Math_Score,
    POS_Math_Score) %>% 
  tbl_summary(
    by = age,
    statistic = list(all_continuous() ~ "{mean} ({sd})")) %>% #o by informa a varilavel do cruzamento
  add_p() #adiciona os p-valores dos testes de qui-quadrado

Characteristic	4, N = 94¹	5, N = 188¹	6, N = 208¹	7, N = 202¹	p-value²
special					0.2
no	82 (87%)	163 (87%)	192 (92%)	186 (92%)
not-informed	9 (9.6%)	16 (8.5%)	14 (6.7%)	11 (5.4%)
yes	3 (3.2%)	9 (4.8%)	2 (1.0%)	5 (2.5%)
child_school
centro-educativo-particular	24 (26%)	39 (21%)	34 (16%)	38 (19%)
centro-educativo-publico	64 (68%)	146 (78%)	172 (83%)	162 (80%)
otro	6 (6.4%)	3 (1.6%)	2 (1.0%)	2 (1.0%)
child_school_grade					<0.001
kinder	27 (29%)	140 (74%)	0 (0%)	0 (0%)
otro	2 (2.1%)	4 (2.1%)	0 (0%)	38 (19%)
pre-kinder	65 (69%)	6 (3.2%)	1 (0.5%)	0 (0%)
primary-basica-general	0 (0%)	38 (20%)	168 (81%)	4 (2.0%)
secondary-basica-general	0 (0%)	0 (0%)	39 (19%)	160 (79%)
gender					0.077
f	49 (52%)	90 (48%)	94 (45%)	116 (57%)
m	45 (48%)	98 (52%)	114 (55%)	86 (43%)
PRE_Math_Score	0.94 (0.10)	0.95 (0.08)	0.97 (0.06)	0.97 (0.06)	0.001
POS_Math_Score	0.97 (0.05)	0.98 (0.04)	0.98 (0.04)	0.97 (0.05)	0.3
¹ n (%); Mean (SD) ² Fisher's exact test; Pearson's Chi-squared test; Kruskal-Wallis rank sum test

3.3 Análise de Densidade do Pré e Pós teste

Análise gráfica utilizando gráficos de densidade para verificar a diferença entre o pré e pós teste de acordo com as características: geral, idade, sexo, escola e série.

Geral

Plotando a densidade para os scores de pré e pós teste.

#gráfico de densidade
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1)

Idade

Plotando a densidade para os scores de pré e pós teste de acordo com a idade do estudante

dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~age,1)

Sexo

Plotando a densidade para os scores de pré e pós teste em relação ao sexo.

#gráfico de densidade em relação ao sexo
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~gender,1)

Escola

Plotando a densidade para os scores de pré e pós teste em relação ao tipo de escola.

#gráfico de densidade em relação a escola
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~child_school,1)

Série

Plotando a densidade para os scores de pré e pós teste em relação a série do estudante.

#gráfico de densidade em relação a escola
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_density(alpha=0.5) +
  xlim(0.75,1) + 
  facet_wrap(~child_school_grade,1)

3.4 Análise com BoxPlot do Pré e Pós teste

Análise gráfica utilizando gráficos de boxplot para verificar a diferença entre o pré e pós teste de acordo com as características: geral, idade, sexo, escola e série.

Geral

Plotando a densidade para os scores de pré e pós teste.

#gráfico de boxplot geral
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  coord_flip() +
  xlim(0.75,1)

Idade

Plotando o boxplot para os scores de pré e pós teste de acordo com a idade do estudante

#gráfico de boxplot em relação a idade
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~age, 1)

Sexo

Plotando o boxplot para os scores de pré e pós teste em relação ao sexo.

#gráfico de boxplot em relação ao sexo
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~gender,1)

Escola

Plotando o boxplot para os scores de pré e pós teste em relação ao tipo de escola.

#gráfico de boxplot em relação a escola
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~child_school,1)

Série

Plotando o boxplot para os scores de pré e pós teste em relação a série do estudante.

#gráfico de boxplot em relação a escola
dados %>% 
  gather(PRE_Math_Score,
         POS_Math_Score,
         key = test, value = scores) %>% 
  ggplot(aes(scores, fill = test)) +
  geom_boxplot(alpha=0.5) +
  xlim(0.75,1) + 
  coord_flip() +
  facet_wrap(~child_school_grade,1)

4. Cluster Analysis

4.1 Assessing Clustering Tendency

Hopkins Statistic Method

V

dadosCluster <- dados %>% 
  select(ends_with("_V")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.1132266

R

dadosCluster <- dados %>% 
  select(ends_with("_R")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.09341127

W

dadosCluster <- dados %>% 
  select(ends_with("_W")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.06698298

C

dadosCluster <- dados %>% 
  select(ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.08534153

V + R

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.1082903

V + W

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_W")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.09320807

V + C

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.1071213

R + W

dadosCluster <- dados %>% 
  select(ends_with("_R"), ends_with("_W")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.08472295

R + C

dadosCluster <- dados %>% 
  select(ends_with("_R"), ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.09169524

W + C

dadosCluster <- dados %>% 
  select(ends_with("_W"), ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.08375064

V + R + W

dadosCluster <- dados %>% 
  select(ends_with("_W"), ends_with("_R"), ends_with("_W")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.08458943

V + R + C

dadosCluster <- dados %>% 
  select(ends_with("_W"), ends_with("_R"), ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.08821345

V + W + C

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_W"), ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.09671276

R + W + C

dadosCluster <- dados %>% 
  select(ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.08819709

V + R + W + C

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale() #Escalonando os valores
set.seed(123)
hopkins(dadosCluster, n = nrow(dadosCluster)-1)

## $H
## [1] 0.09750174

4.2 K-means Clustering

4.2.1 Estimating the optimal number of clusters

WSS Method

fviz_nbclust(dadosCluster, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

Silhouette Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

Gap Statistic Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, kmeans, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

4.2.2 Computing and Visualizing k-means clustering

K = 2

set.seed(123)
km.res2 <- kmeans(dadosCluster, 2, nstart = 25)

fviz_cluster(km.res2, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())

K = 3

set.seed(123)
km.res3 <- kmeans(dadosCluster, 3, nstart = 25)

fviz_cluster(km.res3, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())

K = 4

set.seed(123)
km.res4 <- kmeans(dadosCluster, 4, nstart = 25)

fviz_cluster(km.res4, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())

K = 5

set.seed(123)
km.res5 <- kmeans(dadosCluster, 5, nstart = 25)

fviz_cluster(km.res5, data = dadosCluster,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#0000CD"),
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal())

4.2.3 Teste de Kruskall Wallis

K = 2

dadosKruskal_k2 <- cbind(dados, Cluster = km.res2$cluster)

dadosKruskal_k2 <- dadosKruskal_k2 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k2)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 0.16448, df = 1, p-value = 0.6851

K = 3

dadosKruskal_k3 <- cbind(dados, Cluster = km.res3$cluster)

dadosKruskal_k3 <- dadosKruskal_k3 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k3)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 4.6034, df = 2, p-value = 0.1001

K = 4

dadosKruskal_k4 <- cbind(dados, Cluster = km.res4$cluster)

dadosKruskal_k4 <- dadosKruskal_k4 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k4)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 2.2109, df = 3, p-value = 0.5298

K = 5

dadosKruskal_k5 <- cbind(dados, Cluster = km.res5$cluster)

dadosKruskal_k5 <- dadosKruskal_k5 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k5)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 1.7409, df = 4, p-value = 0.7833

4.2.4 Teste de Dunn com ajuste do p-valor

K = 2

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k2, p.adjust.method = "bonferroni")

## # A tibble: 1 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2         40   652    -0.406 0.685 0.685 ns

K = 3

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k3, p.adjust.method = "bonferroni")

## # A tibble: 3 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2          5   577     -1.57 0.116 0.348 ns          
## 2 Gain_Math 1      3          5   110     -1.20 0.231 0.693 ns          
## 3 Gain_Math 2      3        577   110      1.52 0.129 0.387 ns

K = 4

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k4, p.adjust.method = "bonferroni")

## # A tibble: 6 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2         36   520   -0.775  0.438     1 ns          
## 2 Gain_Math 1      3         36     2    0.242  0.808     1 ns          
## 3 Gain_Math 1      4         36   134   -0.0473 0.962     1 ns          
## 4 Gain_Math 2      3        520     2    0.437  0.662     1 ns          
## 5 Gain_Math 2      4        520   134    1.29   0.198     1 ns          
## 6 Gain_Math 3      4          2   134   -0.260  0.795     1 ns

K = 5

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k5, p.adjust.method = "bonferroni")

## # A tibble: 10 × 9
##    .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
##  * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
##  1 Gain_Math 1      2          2     7    0.0218 0.983     1 ns          
##  2 Gain_Math 1      3          2    40   -0.187  0.852     1 ns          
##  3 Gain_Math 1      4          2   479   -0.418  0.676     1 ns          
##  4 Gain_Math 1      5          2   164   -0.386  0.700     1 ns          
##  5 Gain_Math 2      3          7    40   -0.373  0.709     1 ns          
##  6 Gain_Math 2      4          7   479   -0.824  0.410     1 ns          
##  7 Gain_Math 2      5          7   164   -0.756  0.449     1 ns          
##  8 Gain_Math 3      4         40   479   -0.977  0.328     1 ns          
##  9 Gain_Math 3      5         40   164   -0.788  0.431     1 ns          
## 10 Gain_Math 4      5        479   164    0.242  0.809     1 ns

4.3 K-Medoids

4.3.1 Estimating the optimal number of clusters

WSS Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, pam, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

Silhouette Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, pam, method = "silhouette") +
  labs(subtitle = "Silhouette method")

Gap Statistic Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, pam, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

4.3.2 Computing and Visualizing PAM clustering

K = 2

set.seed(123)
pam.res2 <- pam(dadosCluster, 2)

fviz_cluster(pam.res2, 
             palette = c("#00AFBB", "#FC4E07"), # color palette
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

K = 3

set.seed(123)
pam.res3 <- pam(dadosCluster, 3)

fviz_cluster(pam.res3, 
             palette = c("#2E9FDF", "#00AFBB", "#FC4E07"), # color palette
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

K = 4

set.seed(123)
pam.res4 <- pam(dadosCluster, 4)

fviz_cluster(pam.res4, 
             palette = c("#2E9FDF", "#E7B800", "#00AFBB", "#FC4E07"), # color palette
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

K = 5

set.seed(123)
pam.res5 <- pam(dadosCluster, 5)


fviz_cluster(pam.res5,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#0000CD"),
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

4.3.3 Teste de Kruskall Wallis

K = 2

dadosKruskal_k2 <- cbind(dados, Cluster = pam.res2$cluster)

dadosKruskal_k2 <- dadosKruskal_k2 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k2)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 0.078735, df = 1, p-value = 0.779

K = 3

dadosKruskal_k3 <- cbind(dados, Cluster = pam.res3$cluster)

dadosKruskal_k3 <- dadosKruskal_k3 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k3)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 0.64694, df = 2, p-value = 0.7236

K = 4

dadosKruskal_k4 <- cbind(dados, Cluster = pam.res4$cluster)

dadosKruskal_k4 <- dadosKruskal_k4 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k4)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 2.0662, df = 3, p-value = 0.5588

K = 5

dadosKruskal_k5 <- cbind(dados, Cluster = pam.res5$cluster)

dadosKruskal_k5 <- dadosKruskal_k5 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k5)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 2.948, df = 4, p-value = 0.5666

4.3.4 Teste de Dunn com ajuste do p-valor

K = 2

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k2, p.adjust.method = "bonferroni")

## # A tibble: 1 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2        509   183     0.281 0.779 0.779 ns

K = 3

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k3, p.adjust.method = "bonferroni")

## # A tibble: 3 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2        301   307     0.406 0.685     1 ns          
## 2 Gain_Math 1      3        301    84     0.790 0.430     1 ns          
## 3 Gain_Math 2      3        307    84     0.524 0.600     1 ns

K = 4

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k4, p.adjust.method = "bonferroni")

## # A tibble: 6 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2        258   307     0.741 0.459     1 ns          
## 2 Gain_Math 1      3        258   103     1.37  0.171     1 ns          
## 3 Gain_Math 1      4        258    24     0.619 0.536     1 ns          
## 4 Gain_Math 2      3        307   103     0.852 0.394     1 ns          
## 5 Gain_Math 2      4        307    24     0.328 0.743     1 ns          
## 6 Gain_Math 3      4        103    24    -0.121 0.904     1 ns

K = 5

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k5, p.adjust.method = "bonferroni")

## # A tibble: 10 × 9
##    .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
##  * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
##  1 Gain_Math 1      2        258   307    0.741  0.459     1 ns          
##  2 Gain_Math 1      3        258   103    1.37   0.171     1 ns          
##  3 Gain_Math 1      4        258    23    0.791  0.429     1 ns          
##  4 Gain_Math 1      5        258     1   -0.786  0.432     1 ns          
##  5 Gain_Math 2      3        307   103    0.852  0.394     1 ns          
##  6 Gain_Math 2      4        307    23    0.506  0.613     1 ns          
##  7 Gain_Math 2      5        307     1   -0.848  0.396     1 ns          
##  8 Gain_Math 3      4        103    23    0.0542 0.957     1 ns          
##  9 Gain_Math 3      5        103     1   -0.942  0.346     1 ns          
## 10 Gain_Math 4      5         23     1   -0.939  0.348     1 ns

4.4 CLARA - Clustering Large Applications

4.4.1 Estimating the optimal number of clusters

WSS Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, clara, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

Silhouette Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, clara, method = "silhouette") +
  labs(subtitle = "Silhouette method")

Gap Statistic Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

fviz_nbclust(dadosCluster, clara, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

4.4.2 Computing and Visualizing CLARA clustering

K = 2

set.seed(123)
clara.res2 <- clara(dadosCluster, 2, samples = 50, pamLike = TRUE)

fviz_cluster(clara.res2,
             palette = c("#00AFBB", "#FC4E07"), # color palette
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())

K = 3

set.seed(123)
clara.res3 <- clara(dadosCluster, 3, samples = 50, pamLike = TRUE)

fviz_cluster(clara.res3,
             palette = c("#2E9FDF", "#00AFBB", "#FC4E07"), # color palette
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())

K = 4

set.seed(123)
clara.res4 <- clara(dadosCluster, 4, samples = 50, pamLike = TRUE)

fviz_cluster(clara.res4,
             palette = c("#2E9FDF", "#E7B800", "#00AFBB", "#FC4E07"), # color palette
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())

K = 5

set.seed(123)
clara.res5 <- clara(dadosCluster, 5, samples = 50, pamLike = TRUE)


fviz_cluster(clara.res5,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#0000CD"),
             geom = "point", pointsize = 1,
             ggtheme = theme_classic())

4.4.3 Teste de Kruskall Wallis

K = 2

dadosKruskal_k2 <- cbind(dados, Cluster = clara.res2$cluster)

dadosKruskal_k2 <- dadosKruskal_k2 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k2)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 0.078735, df = 1, p-value = 0.779

K = 3

dadosKruskal_k3 <- cbind(dados, Cluster = clara.res3$cluster)

dadosKruskal_k3 <- dadosKruskal_k3 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k3)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 0.50904, df = 2, p-value = 0.7753

K = 4

dadosKruskal_k4 <- cbind(dados, Cluster = clara.res4$cluster)

dadosKruskal_k4 <- dadosKruskal_k4 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k4)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 2.7583, df = 3, p-value = 0.4304

K = 5

dadosKruskal_k5 <- cbind(dados, Cluster = clara.res5$cluster)

dadosKruskal_k5 <- dadosKruskal_k5 %>% 
  select(Cluster, Gain_Math)

kruskal.test(Gain_Math ~ Cluster, data = dadosKruskal_k5)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gain_Math by Cluster
## Kruskal-Wallis chi-squared = 3.2859, df = 4, p-value = 0.5112

4.4.4 Teste de Dunn com ajuste do p-valor

K = 2

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k2, p.adjust.method = "bonferroni")

## # A tibble: 1 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2        509   183     0.281 0.779 0.779 ns

K = 3

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k3, p.adjust.method = "bonferroni")

## # A tibble: 3 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2        277   405     0.506 0.613     1 ns          
## 2 Gain_Math 1      3        277    10     0.571 0.568     1 ns          
## 3 Gain_Math 2      3        405    10     0.450 0.652     1 ns

K = 4

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k4, p.adjust.method = "bonferroni")

## # A tibble: 6 × 9
##   .y.       group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 Gain_Math 1      2        318   307     0.434 0.664 1     ns          
## 2 Gain_Math 1      3        318    62     0.622 0.534 1     ns          
## 3 Gain_Math 1      4        318     5     1.56  0.118 0.710 ns          
## 4 Gain_Math 2      3        307    62     0.371 0.711 1     ns          
## 5 Gain_Math 2      4        307     5     1.48  0.138 0.827 ns          
## 6 Gain_Math 3      4         62     5     1.33  0.184 1     ns

K = 5

dunn_test(Gain_Math ~ Cluster, data = dadosKruskal_k5, p.adjust.method = "bonferroni")

## # A tibble: 10 × 9
##    .y.       group1 group2    n1    n2 statistic      p p.adj p.adj.signif
##  * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
##  1 Gain_Math 1      2        318   307     0.434 0.664  1     ns          
##  2 Gain_Math 1      3        318    62     0.622 0.534  1     ns          
##  3 Gain_Math 1      4        318     3     1.67  0.0948 0.948 ns          
##  4 Gain_Math 1      5        318     2     0.431 0.666  1     ns          
##  5 Gain_Math 2      3        307    62     0.371 0.711  1     ns          
##  6 Gain_Math 2      4        307     3     1.61  0.107  1     ns          
##  7 Gain_Math 2      5        307     2     0.382 0.702  1     ns          
##  8 Gain_Math 3      4         62     3     1.49  0.135  1     ns          
##  9 Gain_Math 3      5         62     2     0.306 0.760  1     ns          
## 10 Gain_Math 4      5          3     2    -0.726 0.468  1     ns

4.5 Agglomerative Clustering

4.4.1 Dendrogram

Euclidean Distance Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

res.dist <- dist(dadosCluster, method = "euclidean")

res.hc <- hclust(d = res.dist, method = "ward.D2")

fviz_dend(res.hc, k = 3, # Cut in three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE, # Add rectangle around groups
          rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
          rect_fill = TRUE)

Manhattan Distance Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

res.dist <- dist(dadosCluster, method = "manhattan")

res.hc <- hclust(d = res.dist, method = "ward.D2")

fviz_dend(res.hc, k = 3, # Cut in three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE, # Add rectangle around groups
          rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
          rect_fill = TRUE)

Maximun Distance Method

dadosCluster <- dados %>% 
  select(ends_with("_V"), ends_with("_R"), ends_with("_W"), ends_with("_C")) %>% 
  scale()

res.dist <- dist(dadosCluster, method = "maximum")

res.hc <- hclust(d = res.dist, method = "ward.D2")

fviz_dend(res.hc, k = 3, # Cut in three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE, # Add rectangle around groups
          rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
          rect_fill = TRUE)