self_responses <- read.csv("~/Documents/projects/self-disclosure/data/all_responses_clean.csv",
stringsAsFactors = FALSE)
anonymized_children <- read.csv("~/Documents/projects/self-disclosure/data/anonymized_children.csv",
stringsAsFactors = FALSE)
children_ages <- anonymized_children %>%
select(anon_id, age_at_test) %>%
rename(subject_id = anon_id)
Questions to ask Claire + Robert - should i get rid of all punctuation in the adult responses?
all_embeddings_raw <- textEmbed(
texts = self_responses$response_clean,
model = "sentence-transformers/all-distilroberta-v1",
batch_size = 10,
keep_token_embeddings = FALSE
) # Minutes from start: 6.989
# check if we lost any rows, if TRUE, we're good to go
nrow(self_responses) == nrow(all_embeddings_raw$texts[[1]])
# pull the vectors
self_vectors_all <- all_embeddings_raw$texts[[1]]
# write.csv(x = self_vectors_all, file = "self_vectors_all.csv", row.names = FALSE)
self_vectors_all <- read.csv("self_vectors_all.csv")
# compute similarity matrix
all_sim_matrix <- textSimilarityMatrix(self_vectors_all)
# compute typicality calculation, then add to main df
self_responses$typicality <- (rowSums(all_sim_matrix) - 1) / (ncol(all_sim_matrix) - 1)
# 2D coordinates for adult cloud
# NOTE: UMAP calculated using raw vectors (embeddings), not typicality scores
all_umap <- umap(self_vectors_all)
self_responses$umap_1 <- all_umap$layout[,1]
self_responses$umap_2 <- all_umap$layout[,2]
# add skip count to main df
self_responses <- self_responses %>%
mutate(is_skip = if_else(response_clean == "none", TRUE, FALSE)) %>%
rename(subject_id = anon_id)
# add vectors to main df
self_responses <- cbind(self_responses, self_vectors_all)
# make child response df with ages
child_responses_age <- left_join(self_responses %>% filter(age_group == "child"),
children_ages, by = join_by(subject_id)) %>%
select(subject_id, age_at_test, age_group, -id_texts, everything())
child_diversity_metrics <- child_responses_age %>%
group_by(subject_id) %>%
group_split() %>%
map_dfr(function(df_child) {
# count skips
total_skips <- sum(df_child$is_skip)
# get only real responses
real_responses <- df_child %>% filter(!is_skip)
diversity <- NA
if(nrow(real_responses) >= 2) {
# extract the embedding columns (ignoring our metadata columns)
vectors <- as.matrix(real_responses[, 11:778])
# compute manual cosine similarity matrix
# normalize rows
norm_vecs <- vectors / sqrt(rowSums(vectors^2))
# dot product for the similarity matrix
sim_matrix <- norm_vecs %*% t(norm_vecs)
# remove self-similarity (the diagonal of 1s)
diag(sim_matrix) <- NA
# diversity = 1 - average similarity
diversity <- 1 - mean(sim_matrix, na.rm = TRUE)
}
data.frame(
subject_id = df_child$subject_id[1],
age = first(df_child$age_at_test),
skip_count = total_skips,
semantic_diversity = diversity
)
})
print(child_diversity_metrics)
## subject_id age skip_count semantic_diversity
## 1 024a0cf3 4.07 7 0.09089179
## 2 04769bc3 4.93 3 0.10542622
## 3 25d82e80 3.93 1 0.07530919
## 4 26c6fd45 3.40 0 0.03026165
## 5 328a5988 4.84 0 0.08085186
## 6 615ba795 3.48 0 0.06215766
## 7 703b7489 4.55 0 0.07066781
## 8 73f24da8 4.02 0 0.08605593
## 9 794d7026 4.74 0 0.07088528
## 10 88ca5370 3.12 4 0.06151624
## 11 9209bd73 3.35 1 0.08018356
## 12 aa35da33 3.22 12 NA
## 13 ad0d1d9e 4.28 0 0.06944330
## 14 b59594b6 3.21 0 0.08807305
## 15 bc136ecb 4.00 7 0.08609081
## 16 cb867663 4.88 0 0.07977969
## 17 dfdfe900 3.44 0 0.05973152
## 18 f1f018e8 3.22 1 0.08748632
## 19 f29a55ac 3.19 2 0.09651427
age_model <- lm(semantic_diversity ~ age, data = child_diversity_metrics)
summary(age_model)
##
## Call:
## lm(formula = semantic_diversity ~ age, data = child_diversity_metrics)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043045 -0.010110 -0.001669 0.012117 0.024581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.051068 0.025053 2.038 0.0584 .
## age 0.006541 0.006302 1.038 0.3147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01692 on 16 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.06308, Adjusted R-squared: 0.004526
## F-statistic: 1.077 on 1 and 16 DF, p-value: 0.3147
ggplot(child_diversity_metrics, aes(x = age, y = semantic_diversity)) +
geom_point(size = 3, alpha = 0.7) +
geom_smooth(method = "lm", color = "blue") +
theme_minimal() +
labs(title = "Effect of Age on Semantic Diversity",
x = "Age (Years)",
y = "Semantic Diversity (Breadth)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
interaction_model_children <- lm(semantic_diversity ~ age + skip_count, data = child_diversity_metrics)
summary(interaction_model_children)
##
## Call:
## lm(formula = semantic_diversity ~ age + skip_count, data = child_diversity_metrics)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.038828 -0.007263 -0.001135 0.011616 0.023459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.045040 0.024151 1.865 0.0819 .
## age 0.007074 0.006011 1.177 0.2577
## skip_count 0.002725 0.001679 1.623 0.1254
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01612 on 15 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2031, Adjusted R-squared: 0.09681
## F-statistic: 1.911 on 2 and 15 DF, p-value: 0.1822
# pearson correlation (age vs diversity)
cor_test_children <- cor.test(child_diversity_metrics$age, child_diversity_metrics$semantic_diversity)
# does the number of skips predict how diverse their remaining thoughts are?
skip_cor_children <- cor.test(child_diversity_metrics$skip_count, child_diversity_metrics$semantic_diversity)
print(paste("Correlation (Age & Diversity):", round(cor_test_children$estimate, 3),
"p =", round(cor_test_children$p.value, 3))) # 0.251 p = 0.315
## [1] "Correlation (Age & Diversity): 0.251 p = 0.315"
print(paste("Correlation (Skips & Diversity):", round(skip_cor_children$estimate, 3),
"p =", round(skip_cor_children$p.value, 3))) # 0.36 p = 0.142
## [1] "Correlation (Skips & Diversity): 0.36 p = 0.142"
adult_responses <- self_responses %>%
filter(age_group == "adult") %>%
select(-id_texts)
adult_diversity_metrics <- adult_responses %>%
group_by(subject_id) %>%
group_split() %>%
map_dfr(function(df_adult) {
# count skips
total_skips <- sum(df_adult$is_skip)
# get only real responses
real_responses <- df_adult %>% filter(!is_skip)
diversity <- NA
if(nrow(real_responses) >= 2) {
# extract the embedding columns
vectors <- as.matrix(real_responses[, 9:776])
# compute manual cosine similarity matrix
# normalize rows
norm_vecs <- vectors / sqrt(rowSums(vectors^2))
# dot product gives the similarity matrix
sim_matrix <- norm_vecs %*% t(norm_vecs)
# remove self-similarity (the diagonal of 1s)
diag(sim_matrix) <- NA
# diversity = 1 - average similarity
diversity <- 1 - mean(sim_matrix, na.rm = TRUE)
}
data.frame(
subject_id = df_adult$subject_id[1],
skip_count = total_skips,
semantic_diversity = diversity
)
})
head(adult_diversity_metrics, 10L)
## subject_id skip_count semantic_diversity
## 1 001fafba 0 0.11221275
## 2 002e49b2 0 0.11330701
## 3 00982e47 0 0.10657843
## 4 00a14d68 0 0.11744416
## 5 01b0b64a 0 0.09673105
## 6 02a8f8f6 0 0.10596563
## 7 039b6c84 0 0.11777708
## 8 03f84c9b 0 0.12554274
## 9 043f55bb 0 0.10360089
## 10 04827cc2 0 0.11642650
Combining child and adult diversity scores
diversity_comparison <- rbind(
child_diversity_metrics %>% select(subject_id, semantic_diversity) %>% mutate(group = "child"),
adult_diversity_metrics %>% select(subject_id, semantic_diversity) %>% mutate(group = "adult")
)
ggplot(diversity_comparison, aes(x = semantic_diversity, fill = group)) +
geom_density(alpha = 0.5) +
geom_boxplot(width = 0.1, position = position_nudge(y = -0.05), alpha = 0.3) +
scale_fill_manual(values = c("child" = "#FF9999", "adult" = "#66B2FF")) +
theme_minimal() +
labs(title = "The Child's Self-Concept is Less Diverse than the Adult's",
subtitle = "Density Plot of Within-Subject Semantic Diversity",
x = "Semantic Diversity (1 - Mean Similarity)",
y = "Density")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
t_result <- t.test(semantic_diversity ~ group, data = diversity_comparison)
print(t_result)
##
## Welch Two Sample t-test
##
## data: semantic_diversity by group
## t = 6.6366, df = 19.253, p-value = 2.23e-06
## alternative hypothesis: true difference in means between group adult and group child is not equal to 0
## 95 percent confidence interval:
## 0.01874779 0.03599776
## sample estimates:
## mean in group adult mean in group child
## 0.10411312 0.07674034
# cohen's d = 1.703564
cohensD(semantic_diversity ~ group, data = diversity_comparison)
## [1] 1.703564
# calculate word counts
self_responses$word_count <- str_count(self_responses$response_clean, "\\w+")
self_responses <- self_responses %>%
select(subject_id, utterance, response_clean, word_count, everything())
# aggregate to the subject level
word_stats <- self_responses %>%
filter(!is_skip) %>%
group_by(subject_id) %>%
summarize(mean_word_count = mean(word_count, na.rm = TRUE))
# join with diversity metrics
diversity_comparison <- diversity_comparison %>%
left_join(word_stats, by = "subject_id")
# verbosity correlation?
confound_test <- cor.test(diversity_comparison$mean_word_count,
diversity_comparison$semantic_diversity)
print(confound_test)
##
## Pearson's product-moment correlation
##
## data: diversity_comparison$mean_word_count and diversity_comparison$semantic_diversity
## t = 4.5588, df = 265, p-value = 7.866e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1546277 0.3774931
## sample estimates:
## cor
## 0.2696678
# TODO---run this separately for adults and then children
# ANCOVA model
# looking for the effect of 'group' (child vs adult)
# while 'holding constant' the mean_word_count
ancova_model <- lm(semantic_diversity ~ mean_word_count + group, data = diversity_comparison)
summary(ancova_model)
##
## Call:
## lm(formula = semantic_diversity ~ mean_word_count + group, data = diversity_comparison)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.051385 -0.009194 0.000419 0.010121 0.045187
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0955794 0.0020042 47.690 < 2e-16 ***
## mean_word_count 0.0010234 0.0002099 4.877 1.86e-06 ***
## groupchild -0.0271176 0.0037637 -7.205 6.08e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01542 on 264 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2251, Adjusted R-squared: 0.2192
## F-statistic: 38.34 on 2 and 264 DF, p-value: 2.402e-15
# adjusted means (diversity scores AFTER controlling for word count)
adj_means <- emmeans(ancova_model, "group")
print(adj_means)
## group emmean SE df lower.CL upper.CL
## adult 0.104 0.000977 264 0.1022 0.1060
## child 0.077 0.003630 264 0.0698 0.0841
##
## Confidence level used: 0.95
# calculate effect size
eta_squared(ancova_model, partial = TRUE)
## # Effect Size for ANOVA (Type I)
##
## Parameter | Eta2 (partial) | 95% CI
## -----------------------------------------------
## mean_word_count | 0.09 | [0.04, 1.00]
## group | 0.16 | [0.10, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].