library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(readxl)
library(emmeans)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(ggpubr)

1 PERMANOVA

PERMANOVA (Permutational Multivariate Analysis of Variance) is a non-parametric method used to compare groups of multivariate samples. It is often applied in ecological studies, especially to species abundance data, but its use is not restricted to that field.

How it works: PERMANOVA tests the null hypothesis that the centroids of two or more groups are equivalent in multivariate space. It partitions a distance matrix among sources of variation and relies on permutations to determine statistical significance.

Assumptions:

- Homogeneity of group dispersions: the variability within each group should be approximately the same. This is the multivariate analogue of the homogeneity of variances (homoscedasticity) assumption in ANOVA.
- The distances among the observations are directly interpretable and meaningful.

MANOVA (Multivariate Analysis of Variance) is a parametric method that extends analysis of variance (ANOVA) to multiple dependent variables.

Assumptions of MANOVA:

- Multivariate normality: each combination of levels of the independent variables yields a multivariate normal distribution of the dependent variables.
- Homogeneity of covariance: the covariance matrices of the dependent variables are equal across groups (often tested with Box's M test).
- Linearity: linear relationships should hold between all pairs of dependent variables within each group of the independent variable.
- Homogeneity of variances: for each dependent variable, the variances should be equal across groups (homoscedasticity).

Differences in assumptions:

- Parametric vs. non-parametric: MANOVA is parametric and assumes multivariate normality, linearity, and homogeneity of variances and covariances; PERMANOVA is non-parametric and makes no such distributional assumptions.
- Data type and distance: MANOVA works directly on the raw data, whereas PERMANOVA works on a distance or dissimilarity matrix of pairwise distances between samples.
- Dispersion: both methods assume homogeneity of dispersion, but in different senses. MANOVA assumes homogeneity of variances and covariances, whereas PERMANOVA assumes homogeneity of group dispersions in multivariate space.

In essence, PERMANOVA offers a more flexible approach when the assumptions of MANOVA are not met, especially when the data are not multivariate normal or when working with complex data types such as species abundance data.
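In vegan the workflow is: build a distance matrix, check group dispersions with betadisper(), then test centroid differences with adonis2(). A minimal sketch on simulated toy data (hypothetical values, for illustration only; the actual analysis below uses the CAF measures):

# Minimal PERMANOVA sketch on simulated data (toy values, illustration only)
set.seed(1)
toy <- as.data.frame(matrix(rnorm(60 * 3), nrow = 60))  # 60 samples, 3 variables
group <- factor(rep(c("A", "B", "C"), each = 20))
d <- vegdist(toy, method = "euclidean")                 # distance matrix
anova(betadisper(d, group))   # assumption check: homogeneity of dispersions
adonis2(d ~ group, permutations = 999)  # permutation test on group centroids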

2 PCA

As a result of dimension reduction, the following variables were selected:

- Fluency: speech_rate, Mid_clause_pause_duration
- Complexity: ld (content words / total words), contents_asunit
- Accuracy: e_asunit
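One way such representative variables could be read off the loadings is to take, for each retained component, the variable with the largest absolute loading. This is a sketch of one common heuristic, not necessarily the exact selection rule applied in this report; top_loading is a hypothetical helper:

# Sketch of one heuristic (an assumption, not the documented selection rule):
# name the variable with the largest |loading| on each PC.
top_loading <- function(pca, pcs = c("PC1", "PC2")) {
  sapply(pcs, function(pc) names(which.max(abs(pca$rotation[, pc]))))
}
# e.g. top_loading(pca_result) after any of the prcomp() calls below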

library(readxl)
data <- read_xlsx("/Users/riku/Library/CloudStorage/Dropbox/zemizemi/paper/caf/all_230717_noname.xlsx")
# Particles (jyoshi) per AS-unit
data$jyoshi_as <- (data$kakari + data$fukujyoshi + data$setsuzokujyoshi + data$kakujyoshi) / data$asunit
# Exclude participants 17 and 24
data <- data %>% filter(!(trueid == "17" | trueid == "24"))

2.1 Fluency

speech_rate was selected for PC1 and Mid_clause_pause_duration for PC2.

# Standardize the fluency measures, then run PCA on the correlation matrix
data_standardized <- scale(data[, c("speech_rate", "articulation_rate", "mean_run_length", "effective_speech_rate", "effective_articulation_rate", "Mid_clause_pause_duration", "Mid_clause_pause_ratio", "DYSF_ratio")])

pca_result <- prcomp(data_standardized, center = TRUE, scale. = TRUE)

summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     2.1974 1.0752 1.0108 0.69057 0.54547 0.39570 0.2482
## Proportion of Variance 0.6036 0.1445 0.1277 0.05961 0.03719 0.01957 0.0077
## Cumulative Proportion  0.6036 0.7481 0.8758 0.93537 0.97257 0.99214 0.9998
##                            PC8
## Standard deviation     0.03604
## Proportion of Variance 0.00016
## Cumulative Proportion  1.00000
plot(pca_result, type="lines")

pca_result$rotation
##                                     PC1         PC2         PC3         PC4
## speech_rate                 -0.44167096  0.02209629 -0.05326305 -0.15611635
## articulation_rate           -0.39997066  0.28765769 -0.13422130  0.36895027
## mean_run_length             -0.37546513  0.21289090  0.07797681 -0.66668635
## effective_speech_rate       -0.43716000  0.08009078  0.11877818 -0.13213372
## effective_articulation_rate -0.37999623  0.21052445 -0.19797287  0.54675497
## Mid_clause_pause_duration    0.21986399  0.73499991 -0.22630681 -0.19686007
## Mid_clause_pause_ratio       0.34497002  0.42815175 -0.15169498  0.04361295
## DYSF_ratio                  -0.03355109 -0.31171847 -0.91953230 -0.19487534
##                                     PC5        PC6         PC7          PC8
## speech_rate                  0.28979027 -0.2720744 -0.30567383  0.725113699
## articulation_rate            0.07875903 -0.3257988  0.69931909 -0.041735558
## mean_run_length             -0.08731988  0.5088821  0.31032179 -0.016339515
## effective_speech_rate        0.28823577 -0.2796275 -0.39344999 -0.674414371
## effective_articulation_rate -0.21248046  0.5540951 -0.34762723  0.011580191
## Mid_clause_pause_duration   -0.42344449 -0.3110876 -0.21247806  0.015448660
## Mid_clause_pause_ratio       0.77075786  0.2802407  0.01268478 -0.002210384
## DYSF_ratio                   0.02435432 -0.0186156  0.01616474 -0.130341458
pca_scores <- pca_result$x
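If the component scores were to be carried forward, they could be attached to the data frame (a sketch; note that the LM/MH models below use the selected raw variables, not the scores):

# Sketch: keep the first two fluency components alongside the data
# (the PERMANOVA models below use the selected raw variables instead)
data$fluency_PC1 <- pca_scores[, "PC1"]
data$fluency_PC2 <- pca_scores[, "PC2"]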

2.2 Complexity

contents_asunit was selected for PC1 and ld (content words / total words) for PC2.

data_standardized <- scale(data[, c("clauseperasunit", "ld", "kotonarucontents_asunit", "contents_asunit", "jyoshi_as")])

pca_result <- prcomp(data_standardized, center = TRUE, scale. = TRUE)

summary(pca_result)
## Importance of components:
##                          PC1    PC2     PC3     PC4     PC5
## Standard deviation     1.883 1.0094 0.51239 0.35307 0.22143
## Proportion of Variance 0.709 0.2038 0.05251 0.02493 0.00981
## Cumulative Proportion  0.709 0.9127 0.96526 0.99019 1.00000
plot(pca_result, type="lines")

pca_result$rotation
##                                PC1        PC2        PC3         PC4        PC5
## clauseperasunit         -0.4693786  0.1552543 -0.8527489  0.16786750 -0.0148206
## ld                       0.1478575  0.9453361  0.1372005  0.22531446 -0.1220130
## kotonarucontents_asunit -0.5034067  0.1540269  0.1791096 -0.68210990 -0.4748718
## contents_asunit         -0.5100956  0.1572307  0.2824399 -0.06682073  0.7942567
## jyoshi_as               -0.4941721 -0.1838203  0.3770183  0.67179980 -0.3585335
pca_scores <- pca_result$x

2.3 Accuracy

e_asunit was selected for PC1.

data_standardized <- scale(data[, c("e_asunit", "e_nashi_as_count_wariai")])

pca_result <- prcomp(data_standardized, center = TRUE, scale. = TRUE)

summary(pca_result)
## Importance of components:
##                           PC1     PC2
## Standard deviation     1.3536 0.40955
## Proportion of Variance 0.9161 0.08386
## Cumulative Proportion  0.9161 1.00000
plot(pca_result, type="lines")

pca_result$rotation
##                                PC1       PC2
## e_asunit                 0.7071068 0.7071068
## e_nashi_as_count_wariai -0.7071068 0.7071068
pca_scores <- pca_result$x

3 LM

3.1 PERMANOVA

The interaction was not significant. Cognitive load was not significant. Proficiency was significant.

3.1.1 DATA IMPORT

data <- read_xlsx("/Users/riku/Library/CloudStorage/Dropbox/zemizemi/paper/caf/all_230717_noname_LM.xlsx")
data$load <- factor(data$load, levels = c("L", "M"))
data$load <- as.factor(data$load)
dep_vars <- data[, c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')] 
dist_matrix <- vegdist(dep_vars, method = "euclidean")

3.1.2 OUTLIERS

Participant id 1 was excluded.

# Convert the "dist" object into a full matrix
full_dist_matrix <- as.matrix(dist_matrix)

# Calculate the average distance of each sample to all other samples
avg_distances <- rowMeans(full_dist_matrix)

# Plot to visualize potential outliers
boxplot(avg_distances, main="Average Distances of Samples", ylab="Distance")

# Identify potential outliers
threshold <- quantile(avg_distances, 0.95)  # for example, the 95th percentile
potential_outliers <- which(avg_distances > threshold)
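# Map the flagged rows back to participant ids
# (assumes the row order of avg_distances matches `data`)
data$trueid[potential_outliers]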
# Remove participant id 1
data <- data %>% filter(!(trueid == "1" ))
dep_vars <- data[, c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')] 
dist_matrix <- vegdist(dep_vars, method = "euclidean")
full_dist_matrix <- as.matrix(dist_matrix)
avg_distances <- rowMeans(full_dist_matrix)
boxplot(avg_distances, main="Average Distances of Samples", ylab="Distance")

3.1.3 Result

# Subset the dependent variables
dep_vars <- data[, c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')] 

# 2. Check the assumption of homogeneity of group dispersions
# Create a distance matrix
dist_matrix <- vegdist(dep_vars, method = "euclidean")

# Test for homogeneity of dispersions
grouping <- interaction(data$load, data$proficiency)
dispersion_test <- betadisper(dist_matrix, grouping)
anova(dispersion_test)
## Analysis of Variance Table
## 
## Response: Distances
##           Df Sum Sq Mean Sq F value Pr(>F)
## Groups     5   3337  667.33  0.9462 0.4577
## Residuals 62  43726  705.26
# 3. Run the PERMANOVA
set.seed(123)  # Set a random seed for reproducibility
permanova_result <- adonis2(dist_matrix ~ data$load * data$proficiency, permutations = 999)
print(permanova_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_matrix ~ data$load * data$proficiency, permutations = 999)
##                            Df SumOfSqs      R2       F Pr(>F)    
## data$load                   1     2312 0.01560  1.5355  0.204    
## data$proficiency            2    52266 0.35252 17.3535  0.001 ***
## data$load:data$proficiency  2      319 0.00215  0.1058  0.921    
## Residual                   62    93367 0.62974                   
## Total                      67   148263 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.2 Correlation Matrix

data %>% cor_mat('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')
## # A tibble: 5 × 6
##   rowname     speech_rate Mid_clause_pause_dur…¹ contents_asunit e_asunit     ld
## * <chr>             <dbl>                  <dbl>           <dbl>    <dbl>  <dbl>
## 1 speech_rate        1                    -0.52            0.3      0.18  -0.35 
## 2 Mid_clause…       -0.52                  1               0.037   -0.089  0.064
## 3 contents_a…        0.3                   0.037           1        0.57  -0.12 
## 4 e_asunit           0.18                 -0.089           0.57     1     -0.19 
## 5 ld                -0.35                  0.064          -0.12    -0.19   1    
## # ℹ abbreviated name: ¹​Mid_clause_pause_duration
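The matrix can also be annotated with significance marks using rstatix helpers (a sketch using cor_mark_significant() with the package's default cutpoints):

# Sketch: lower-triangular correlation matrix with significance symbols
data %>%
  cor_mat('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld') %>%
  cor_mark_significant()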

3.3 Visualization

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# 1. Scale the values of each dependent variable
data_scaled <- as.data.frame(lapply(data[c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')], scale))
data_scaled$load <- data$load

# 2. Create an empty matrix and populate with average scaled values
num_vars <- length(c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld'))
num_load_levels <- length(unique(data_scaled$load))

heatmap_matrix <- matrix(0, nrow=num_vars, ncol=num_load_levels)
rownames(heatmap_matrix) <- c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')
colnames(heatmap_matrix) <- levels(data_scaled$load)  # tapply() returns means in factor-level order

for (var in rownames(heatmap_matrix)) {
  heatmap_matrix[var, ] <- with(data_scaled, tapply(get(var), load, mean, na.rm=TRUE))
}

# 3. Generate the heatmap
heatmap.2(heatmap_matrix, main="Heatmap for Scaled Dependent Variables", 
          xlab="Load", ylab="Variables", 
          col=colorRampPalette(c("blue", "white", "red"))(256), # Colors adjusted for scaled values
          trace="none", density.info="none", 
          margins=c(8,8), srtRow=0, srtCol=45, key.title="Mean Value", 
          key.xlab="Average scaled values of variables", dendrogram="row")

In separate ANOVAs (sketched below):

L > M (decrease from L to M): speech_rate, ld, contents_asunit

L < M (increase from L to M): Mid_clause_pause_duration

ns: e_asunit
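A sketch of those per-variable follow-ups, mirroring the paired rstatix procedure used for the M/H data in section 4.2 (it assumes `data` is the LM dataset after the outlier removal above):

# Sketch: paired L vs. M comparisons for each selected CAF measure,
# following the same pairwise_t_test() procedure as section 4.2
for (v in c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')) {
  print(
    data %>%
      pairwise_t_test(as.formula(paste(v, "~ load")), paired = TRUE,
                      p.adjust.method = "bonferroni")
  )
}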

library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Create individual plots
plot1 <- ggboxplot(
  data, x = "load", y = "speech_rate", add = "jitter", palette = "lancet"
)
plot2 <- ggboxplot(
  data, x = "load", y = "Mid_clause_pause_duration", add = "jitter", palette = "lancet"
)
plot3 <- ggboxplot(
  data, x = "load", y = "contents_asunit", add = "jitter", palette = "lancet"
)
plot4 <- ggboxplot(
  data, x = "load", y = "ld", add = "jitter", palette = "lancet"
)
plot5 <- ggboxplot(
  data, x = "load", y = "e_asunit", add = "jitter", palette = "lancet"
)

# Combine the plots into a single plot
grid.arrange(plot1, plot2, plot3, plot4, plot5, ncol = 2, nrow = 3)

4 MH

The interaction was not significant. Cognitive load was significant. Proficiency was significant.

4.1 PERMANOVA

4.1.1 DATA IMPORT

data <- read_xlsx("/Users/riku/Library/CloudStorage/Dropbox/zemizemi/paper/caf/all_230717_noname_MH.xlsx")
data$load <- factor(data$load, levels = c("M", "H"))
data$load <- as.factor(data$load)
dep_vars <- data[, c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')] 
dist_matrix <- vegdist(dep_vars, method = "euclidean")

4.1.2 OUTLIERS

Participant id 1 was excluded.

# Convert the "dist" object into a full matrix
full_dist_matrix <- as.matrix(dist_matrix)

# Calculate the average distance of each sample to all other samples
avg_distances <- rowMeans(full_dist_matrix)

# Plot to visualize potential outliers
boxplot(avg_distances, main="Average Distances of Samples", ylab="Distance")

# Identify potential outliers
threshold <- quantile(avg_distances, 0.95)  # for example, the 95th percentile
potential_outliers <- which(avg_distances > threshold)
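# Map the flagged rows back to participant ids
# (assumes the row order of avg_distances matches `data`)
data$trueid[potential_outliers]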
# Remove participant id 1
data <- data %>% filter(!(trueid == "1" ))
dep_vars <- data[, c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')] 
dist_matrix <- vegdist(dep_vars, method = "euclidean")
full_dist_matrix <- as.matrix(dist_matrix)
avg_distances <- rowMeans(full_dist_matrix)
boxplot(avg_distances, main="Average Distances of Samples", ylab="Distance")

4.1.3 Result

# Subset the dependent variables
dep_vars <- data[, c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')] 

# 2. Check the assumption of homogeneity of group dispersions
# Create a distance matrix
dist_matrix <- vegdist(dep_vars, method = "euclidean")

# Test for homogeneity of dispersions
grouping <- interaction(data$load, data$proficiency)
dispersion_test <- betadisper(dist_matrix, grouping)
anova(dispersion_test)
## Analysis of Variance Table
## 
## Response: Distances
##           Df Sum Sq Mean Sq F value Pr(>F)
## Groups     5   2429  485.89  0.8704 0.5062
## Residuals 62  34612  558.26
# 3. Run the PERMANOVA
set.seed(123)  # Set a random seed for reproducibility
permanova_result <- adonis2(dist_matrix ~ data$load * data$proficiency, permutations = 999)
print(permanova_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_matrix ~ data$load * data$proficiency, permutations = 999)
##                            Df SumOfSqs      R2       F Pr(>F)    
## data$load                   1     6443 0.04946  4.8098  0.029 *  
## data$proficiency            2    40190 0.30857 15.0022  0.001 ***
## data$load:data$proficiency  2      567 0.00435  0.2116  0.802    
## Residual                   62    83047 0.63761                   
## Total                      67   130247 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.2 Post-hoc

data %>%
  pairwise_t_test(
    speech_rate ~ load, paired = TRUE, 
    p.adjust.method = "bonferroni"
  )
## # A tibble: 1 × 10
##   .y.     group1 group2    n1    n2 statistic    df       p   p.adj p.adj.signif
## * <chr>   <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>   <dbl> <chr>       
## 1 speech… M      H         34    34      4.42    33 1.01e-4 1.01e-4 ***
data %>%
  pairwise_t_test(
    Mid_clause_pause_duration ~ load, paired = TRUE, 
    p.adjust.method = "bonferroni"
  )
## # A tibble: 1 × 10
##   .y.         group1 group2    n1    n2 statistic    df     p p.adj p.adj.signif
## * <chr>       <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <dbl> <chr>       
## 1 Mid_clause… M      H         34    34    -0.721    33 0.476 0.476 ns
data %>%
  pairwise_t_test(
    contents_asunit ~ load, paired = TRUE, 
    p.adjust.method = "bonferroni"
  )
## # A tibble: 1 × 10
##   .y.         group1 group2    n1    n2 statistic    df     p p.adj p.adj.signif
## * <chr>       <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <dbl> <chr>       
## 1 contents_a… M      H         34    34      2.44    33  0.02  0.02 *
data %>%
  pairwise_t_test(
    e_asunit ~ load, paired = TRUE, 
    p.adjust.method = "bonferroni"
  )
## # A tibble: 1 × 10
##   .y.      group1 group2    n1    n2 statistic    df     p p.adj p.adj.signif
## * <chr>    <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <dbl> <chr>       
## 1 e_asunit M      H         34    34      2.25    33 0.031 0.031 *
data %>%
  pairwise_t_test(
    ld ~ load, paired = TRUE, 
    p.adjust.method = "bonferroni"
  )
## # A tibble: 1 × 10
##   .y.   group1 group2    n1    n2 statistic    df     p p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <dbl> <chr>       
## 1 ld    M      H         34    34     -2.88    33 0.007 0.007 **

4.3 Correlation Matrix

data %>% cor_mat('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')
## # A tibble: 5 × 6
##   rowname      speech_rate Mid_clause_pause_dur…¹ contents_asunit e_asunit    ld
## * <chr>              <dbl>                  <dbl>           <dbl>    <dbl> <dbl>
## 1 speech_rate         1                    -0.48            0.37     0.2   -0.45
## 2 Mid_clause_…       -0.48                  1               0.051   -0.026  0.29
## 3 contents_as…        0.37                  0.051           1        0.45  -0.13
## 4 e_asunit            0.2                  -0.026           0.45     1     -0.23
## 5 ld                 -0.45                  0.29           -0.13    -0.23   1   
## # ℹ abbreviated name: ¹​Mid_clause_pause_duration

4.4 Visualization

library(gplots)
library(scales)

# 1. Scale the values of each dependent variable
data_scaled <- as.data.frame(lapply(data[c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')], scale))
data_scaled$load <- data$load

# 2. Create an empty matrix and populate with average scaled values
num_vars <- length(c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld'))
num_load_levels <- length(unique(data_scaled$load))

heatmap_matrix <- matrix(0, nrow=num_vars, ncol=num_load_levels)
rownames(heatmap_matrix) <- c('speech_rate', 'Mid_clause_pause_duration', 'contents_asunit', 'e_asunit', 'ld')
colnames(heatmap_matrix) <- levels(data_scaled$load)  # tapply() returns means in factor-level order

for (var in rownames(heatmap_matrix)) {
  heatmap_matrix[var, ] <- with(data_scaled, tapply(get(var), load, mean, na.rm=TRUE))
}

# 3. Generate the heatmap
heatmap.2(heatmap_matrix, main="Heatmap for Scaled Dependent Variables", 
          xlab="Load", ylab="Variables", 
          col=colorRampPalette(c("blue", "white", "red"))(256), # Colors adjusted for scaled values
          trace="none", density.info="none", 
          margins=c(8,8), srtRow=0, srtCol=45, key.title="Mean Value", 
          key.xlab="Average scaled values of variables", dendrogram="row")

In separate ANOVAs (see the post-hoc tests in 4.2 above):

M > H (decrease from M to H): speech_rate, contents_asunit, e_asunit

M < H (increase from M to H): ld

ns: Mid_clause_pause_duration

library(ggplot2)
library(gridExtra)

# Create individual plots
plot1 <- ggboxplot(
  data, x = "load", y = "speech_rate", add = "jitter", palette = "lancet"
)
plot2 <- ggboxplot(
  data, x = "load", y = "Mid_clause_pause_duration", add = "jitter", palette = "lancet"
)
plot3 <- ggboxplot(
  data, x = "load", y = "contents_asunit", add = "jitter", palette = "lancet"
)
plot4 <- ggboxplot(
  data, x = "load", y = "ld", add = "jitter", palette = "lancet"
)
plot5 <- ggboxplot(
  data, x = "load", y = "e_asunit", add = "jitter", palette = "lancet"
)

# Combine the plots into a single plot
grid.arrange(plot1, plot2, plot3, plot4, plot5, ncol = 2, nrow = 3)