embeddings

self_responses <- read.csv("~/Documents/projects/self-disclosure/data/all_responses_clean.csv", 
                           stringsAsFactors = FALSE)

anonymized_children <- read.csv("~/Documents/projects/self-disclosure/data/anonymized_children.csv", 
                           stringsAsFactors = FALSE)

children_ages <- anonymized_children %>%
  select(anon_id, age_at_test) %>% 
  rename(subject_id = anon_id)

Questions to ask Claire + Robert - should i get rid of all punctuation in the adult responses?

all_embeddings_raw <- textEmbed(
  texts = self_responses$response_clean,
  model = "sentence-transformers/all-distilroberta-v1",
  batch_size = 10,
  keep_token_embeddings = FALSE
) # Minutes from start:  6.989

# check if we lost any rows, if TRUE, we're good to go
nrow(self_responses) == nrow(all_embeddings_raw$texts[[1]])

# pull the vectors
self_vectors_all <- all_embeddings_raw$texts[[1]]

# write.csv(x = self_vectors_all, file = "self_vectors_all.csv", row.names = FALSE)

self_vectors_all <- read.csv("self_vectors_all.csv")

# compute similarity matrix
all_sim_matrix <- textSimilarityMatrix(self_vectors_all)
# compute typicality calculation, then add to main df
self_responses$typicality <- (rowSums(all_sim_matrix) - 1) / (ncol(all_sim_matrix) - 1)

# 2D coordinates for adult cloud
# NOTE: UMAP calculated using raw vectors (embeddings), not typicality scores
all_umap <- umap(self_vectors_all)
self_responses$umap_1 <- all_umap$layout[,1]
self_responses$umap_2 <- all_umap$layout[,2]

# add skip count to main df
self_responses <- self_responses %>% 
  mutate(is_skip = if_else(response_clean == "none", TRUE, FALSE)) %>% 
  rename(subject_id = anon_id)

# add vectors to main df
self_responses <- cbind(self_responses, self_vectors_all)

# make child response df with ages
child_responses_age <- left_join(self_responses %>% filter(age_group == "child"), 
                                 children_ages, by = join_by(subject_id)) %>% 
  select(subject_id, age_at_test, age_group, -id_texts, everything())

child_diversity_metrics <- child_responses_age %>%
  group_by(subject_id) %>%
  group_split() %>%
  map_dfr(function(df_child) {
    
    # count skips
    total_skips <- sum(df_child$is_skip)
    
    # get only real responses
    real_responses <- df_child %>% filter(!is_skip)
    
    diversity <- NA
    
    if(nrow(real_responses) >= 2) {
      
      # extract the embedding columns (ignoring our metadata columns)
      vectors <- as.matrix(real_responses[, 11:778])
      
      # compute manual cosine similarity matrix
      # normalize rows
      norm_vecs <- vectors / sqrt(rowSums(vectors^2))
      # dot product for the similarity matrix
      sim_matrix <- norm_vecs %*% t(norm_vecs)
      
      # remove self-similarity (the diagonal of 1s)
      diag(sim_matrix) <- NA
      
      # diversity = 1 - average similarity
      diversity <- 1 - mean(sim_matrix, na.rm = TRUE)
    }
    
    data.frame(
      subject_id = df_child$subject_id[1],
      age = first(df_child$age_at_test),
      skip_count = total_skips,
      semantic_diversity = diversity
    )
  })

print(child_diversity_metrics)

##    subject_id  age skip_count semantic_diversity
## 1    024a0cf3 4.07          7         0.09089179
## 2    04769bc3 4.93          3         0.10542622
## 3    25d82e80 3.93          1         0.07530919
## 4    26c6fd45 3.40          0         0.03026165
## 5    328a5988 4.84          0         0.08085186
## 6    615ba795 3.48          0         0.06215766
## 7    703b7489 4.55          0         0.07066781
## 8    73f24da8 4.02          0         0.08605593
## 9    794d7026 4.74          0         0.07088528
## 10   88ca5370 3.12          4         0.06151624
## 11   9209bd73 3.35          1         0.08018356
## 12   aa35da33 3.22         12                 NA
## 13   ad0d1d9e 4.28          0         0.06944330
## 14   b59594b6 3.21          0         0.08807305
## 15   bc136ecb 4.00          7         0.08609081
## 16   cb867663 4.88          0         0.07977969
## 17   dfdfe900 3.44          0         0.05973152
## 18   f1f018e8 3.22          1         0.08748632
## 19   f29a55ac 3.19          2         0.09651427

age_model <- lm(semantic_diversity ~ age, data = child_diversity_metrics)

summary(age_model)

## 
## Call:
## lm(formula = semantic_diversity ~ age, data = child_diversity_metrics)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043045 -0.010110 -0.001669  0.012117  0.024581 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 0.051068   0.025053   2.038   0.0584 .
## age         0.006541   0.006302   1.038   0.3147  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01692 on 16 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.06308,    Adjusted R-squared:  0.004526 
## F-statistic: 1.077 on 1 and 16 DF,  p-value: 0.3147

ggplot(child_diversity_metrics, aes(x = age, y = semantic_diversity)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", color = "blue") +
  theme_minimal() +
  labs(title = "Effect of Age on Semantic Diversity",
       x = "Age (Years)",
       y = "Semantic Diversity (Breadth)")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

interaction_model_children <- lm(semantic_diversity ~ age + skip_count, data = child_diversity_metrics)

summary(interaction_model_children)

## 
## Call:
## lm(formula = semantic_diversity ~ age + skip_count, data = child_diversity_metrics)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.038828 -0.007263 -0.001135  0.011616  0.023459 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 0.045040   0.024151   1.865   0.0819 .
## age         0.007074   0.006011   1.177   0.2577  
## skip_count  0.002725   0.001679   1.623   0.1254  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01612 on 15 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2031, Adjusted R-squared:  0.09681 
## F-statistic: 1.911 on 2 and 15 DF,  p-value: 0.1822

# pearson correlation (age vs diversity)
cor_test_children <- cor.test(child_diversity_metrics$age, child_diversity_metrics$semantic_diversity)

# does the number of skips predict how diverse their remaining thoughts are?
skip_cor_children <- cor.test(child_diversity_metrics$skip_count, child_diversity_metrics$semantic_diversity)

print(paste("Correlation (Age & Diversity):", round(cor_test_children$estimate, 3), 
            "p =", round(cor_test_children$p.value, 3))) # 0.251 p = 0.315

## [1] "Correlation (Age & Diversity): 0.251 p = 0.315"

print(paste("Correlation (Skips & Diversity):", round(skip_cor_children$estimate, 3), 
            "p =", round(skip_cor_children$p.value, 3))) # 0.36 p = 0.142

## [1] "Correlation (Skips & Diversity): 0.36 p = 0.142"

adult_responses <- self_responses %>% 
  filter(age_group == "adult") %>% 
  select(-id_texts)

adult_diversity_metrics <- adult_responses %>%
  group_by(subject_id) %>%
  group_split() %>%
  map_dfr(function(df_adult) {
    
    # count skips
    total_skips <- sum(df_adult$is_skip)
    
    # get only real responses
    real_responses <- df_adult %>% filter(!is_skip)
    
    diversity <- NA
    
    if(nrow(real_responses) >= 2) {
      
      # extract the embedding columns 
      vectors <- as.matrix(real_responses[, 9:776])
      
      # compute manual cosine similarity matrix
      # normalize rows
      norm_vecs <- vectors / sqrt(rowSums(vectors^2))
      # dot product gives the similarity matrix
      sim_matrix <- norm_vecs %*% t(norm_vecs)
      
      # remove self-similarity (the diagonal of 1s)
      diag(sim_matrix) <- NA
      
      # diversity = 1 - average similarity
      diversity <- 1 - mean(sim_matrix, na.rm = TRUE)
    }
    
    data.frame(
      subject_id = df_adult$subject_id[1],
      skip_count = total_skips,
      semantic_diversity = diversity
    )
  })

head(adult_diversity_metrics, 10L)

##    subject_id skip_count semantic_diversity
## 1    001fafba          0         0.11221275
## 2    002e49b2          0         0.11330701
## 3    00982e47          0         0.10657843
## 4    00a14d68          0         0.11744416
## 5    01b0b64a          0         0.09673105
## 6    02a8f8f6          0         0.10596563
## 7    039b6c84          0         0.11777708
## 8    03f84c9b          0         0.12554274
## 9    043f55bb          0         0.10360089
## 10   04827cc2          0         0.11642650

Combining child and adult diversity scores

diversity_comparison <- rbind(
  child_diversity_metrics %>% select(subject_id, semantic_diversity) %>% mutate(group = "child"),
  adult_diversity_metrics %>% select(subject_id, semantic_diversity) %>% mutate(group = "adult")
)

ggplot(diversity_comparison, aes(x = semantic_diversity, fill = group)) +
  geom_density(alpha = 0.5) + 
  geom_boxplot(width = 0.1, position = position_nudge(y = -0.05), alpha = 0.3) + 
  scale_fill_manual(values = c("child" = "#FF9999", "adult" = "#66B2FF")) +
  theme_minimal() +
  labs(title = "The Child's Self-Concept is Less Diverse than the Adult's",
       subtitle = "Density Plot of Within-Subject Semantic Diversity",
       x = "Semantic Diversity (1 - Mean Similarity)",
       y = "Density")

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

t_result <- t.test(semantic_diversity ~ group, data = diversity_comparison)

print(t_result)

## 
##  Welch Two Sample t-test
## 
## data:  semantic_diversity by group
## t = 6.6366, df = 19.253, p-value = 2.23e-06
## alternative hypothesis: true difference in means between group adult and group child is not equal to 0
## 95 percent confidence interval:
##  0.01874779 0.03599776
## sample estimates:
## mean in group adult mean in group child 
##          0.10411312          0.07674034

# cohen's d = 1.703564
cohensD(semantic_diversity ~ group, data = diversity_comparison)

## [1] 1.703564

# calculate word counts
self_responses$word_count <- str_count(self_responses$response_clean, "\\w+")
self_responses <- self_responses %>% 
  select(subject_id, utterance, response_clean, word_count, everything())

# aggregate to the subject level
word_stats <- self_responses %>%
  filter(!is_skip) %>%
  group_by(subject_id) %>%
  summarize(mean_word_count = mean(word_count, na.rm = TRUE))

# join with diversity metrics
diversity_comparison <- diversity_comparison %>%
  left_join(word_stats, by = "subject_id")

# verbosity correlation?
confound_test <- cor.test(diversity_comparison$mean_word_count, 
                          diversity_comparison$semantic_diversity)

print(confound_test)

## 
##  Pearson's product-moment correlation
## 
## data:  diversity_comparison$mean_word_count and diversity_comparison$semantic_diversity
## t = 4.5588, df = 265, p-value = 7.866e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1546277 0.3774931
## sample estimates:
##       cor 
## 0.2696678

# TODO---run this separately for adults and then children

# ANCOVA model
# looking for the effect of 'group' (child vs adult) 
# while 'holding constant' the mean_word_count

ancova_model <- lm(semantic_diversity ~ mean_word_count + group, data = diversity_comparison)

summary(ancova_model)

## 
## Call:
## lm(formula = semantic_diversity ~ mean_word_count + group, data = diversity_comparison)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.051385 -0.009194  0.000419  0.010121  0.045187 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.0955794  0.0020042  47.690  < 2e-16 ***
## mean_word_count  0.0010234  0.0002099   4.877 1.86e-06 ***
## groupchild      -0.0271176  0.0037637  -7.205 6.08e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01542 on 264 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2251, Adjusted R-squared:  0.2192 
## F-statistic: 38.34 on 2 and 264 DF,  p-value: 2.402e-15

# adjusted means (diversity scores AFTER controlling for word count)
adj_means <- emmeans(ancova_model, "group")
print(adj_means)

##  group emmean       SE  df lower.CL upper.CL
##  adult  0.104 0.000977 264   0.1022   0.1060
##  child  0.077 0.003630 264   0.0698   0.0841
## 
## Confidence level used: 0.95

# calculate effect size 
eta_squared(ancova_model, partial = TRUE)

## # Effect Size for ANOVA (Type I)
## 
## Parameter       | Eta2 (partial) |       95% CI
## -----------------------------------------------
## mean_word_count |           0.09 | [0.04, 1.00]
## group           |           0.16 | [0.10, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].

embeddings_clean