We’ll be using the quanteda package for text processing
and quanteda.textmodels for embeddings.
library(quanteda)
library(quanteda.textmodels)
library(tidyverse)
library(ggwordcloud)
library(jsonlite)
library(httr)
library(jsonlite)
To utilize text embeddings using quanteda, our raw data
should meet the following structure:
Subject.ID) and another for text
(Words.Spoken).data <- read.csv("full_text.csv")
head(data, n = 2)
Convert data into corpus format to apply NLP transformations in
quanteda by defining ID and text columns names.
# Define id and text column to conduct a text corpus
data_corp <- corpus(data,
docid_field = "Subject.ID",
text_field = "Words.Spoken")
head(data_corp)
Corpus consisting of 6 documents.
117 :
"& – טוב, מוכן, אודי? – יאללה, שלוש, ארבע... – אוקי. דוג..."
118 :
" – אני מוכן. – ארבע, ו... – אה, לי יש "מה אתה לומד או עו..."
123 :
" & – רגע, אני לא יודעת איפה זה. אוקיי. רגע... סבבה. עכשיו ..."
124 :
" – שלוש, ארבע...? – יאללה. – אה, יש עוד דף כזה? לא, עברת..."
129 :
"129_130 – קיצור מי ש… אני מתחילה אני שואלת את השאלה ואז אני..."
130 :
" – בדיוק, יהיה בסדר יאללה. אנחנו מעבירות? -יאללה שלוש ארבע ..."
Text tokenization involves splitting each document into smaller
units, called tokens, which are typically individual words. By default,
the tokens() function splits the text at spaces, treating
each word as a separate token. In this case, we also remove punctuation
to further clean the data, ensuring only meaningful text is
analyzed.
# Text Tokenization: Split text into single words, remove punctuation
data_token <- tokens(data_corp,
remove_punct = TRUE)
head(data_token)
Tokens consisting of 6 documents.
117 :
[1] "טוב" "מוכן" "אודי" "יאללה" "שלוש" "ארבע" "אוקי" "דוגמה" "למסך" "יש" "לי" "נתקדם"
[ ... and 2,473 more ]
118 :
[1] "אני" "מוכן" "ארבע" "ו" "אה" "לי" "יש" "מה" "אתה" "לומד" "או" "עושה"
[ ... and 2,110 more ]
123 :
[1] "רגע" "אני" "לא" "יודעת" "איפה" "זה" "אוקיי" "רגע" "סבבה" "עכשיו" "עוברים" "להבא"
[ ... and 1,837 more ]
124 :
[1] "שלוש" "ארבע" "יאללה" "אה" "יש" "עוד" "דף" "כזה" "לא" "עברתי" "את" "העמוד"
[ ... and 1,741 more ]
129 :
[1] "129_130" "קיצור" "מי" "ש" "אני" "מתחילה" "אני" "שואלת" "את" "השאלה" "ואז" "אני"
[ ... and 1,671 more ]
130 :
[1] "בדיוק" "יהיה" "בסדר" "יאללה" "אנחנו" "מעבירות" "יאללה" "שלוש" "ארבע" "ו" "אוקי" "אה"
[ ... and 2,865 more ]
Lemmatization is the process of reducing words to their base or
dictionary form, known as lemmas. we utilize the
nakdan API, which provides a comprehensive Hebrew lemma
dictionary. This API allows us to process a list of Hebrew words and
return their corresponding base forms.
BaseWords <- function(words) {
headers <- c('Content-Type' = 'application/json;charset=utf-8')
# API Configurations
params <- list(
task = "nakdan",
genre = "modern",
data = paste(words, collapse = " "),
addmorph = TRUE,
matchpartial = TRUE,
apiKey = "4b65be84-35f2-443b-ab3d-b18f3b82b27d" # API key
)
response <- POST(
url = "https://nakdan-3-2.loadbalancer.dicta.org.il/addnikud",
add_headers(.headers = headers),
body = params,
encode = "json"
)
if (response$status_code != 200) {
stop("API request failed with status code: ", response$status_code)
}
response_data <- fromJSON(content(response, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)
sapply(words, function(word) {
idx <- which(sapply(response_data, function(x) x$word) == word)
if (length(idx) == 0) return(word) # Use original word if not found
base_word <- response_data[[idx[1]]]
if (!is.null(base_word$options) && length(base_word$options) > 0) {
return(base_word$options[[1]]$lex)
}
return(base_word$word)
})
}
hello_wolrd <- c("שלומי", "עולמות")
BaseWords(hello_wolrd)
שלומי עולמות
"שָׁלוֹם" "עוֹלָם"
# Step 1: Extract unique tokens
unique_tokens <- types(data_token)
# Step 2: Split tokens into batches to speed up the process
batch_size <- 250 # Adjust the batch size if necessary
token_batches <- split(unique_tokens, ceiling(seq_along(unique_tokens) / batch_size))
# Step 3: Apply BaseWords to each batch
lemmas_list <- lapply(token_batches, BaseWords)
# Combine lemmas into a single vector
lemmas <- unlist(lemmas_list, use.names = FALSE)
# Step 4: Create a token-to-lemma mapping
token_to_lemma <- setNames(lemmas, unique_tokens)
# Step 5: Replace tokens with lemmas
data_lemma <- tokens_replace(
data_token,
pattern = unique_tokens,
replacement = lemmas,
valuetype = "fixed"
)
head(data_lemma)
Tokens consisting of 6 documents.
117 :
[1] "טוֹב" "מוּכָן" "אוּדִי" "יַאלְלָה" "שְׁלוֹשָׁה" "אַרְבָּעָה" "אוֹקֵי" "דֻּגְמָה" "מָסָךְ" "יֵשׁ" "לְ" "קדם"
[ ... and 2,473 more ]
118 :
[1] "אֲנִי" "מוּכָן" "אַרְבָּעָה" "ו" "אָה" "לְ" "יֵשׁ" "מָה" "אֲנִי" "למד" "אוֹ" "עשׂי"
[ ... and 2,110 more ]
123 :
[1] "רֶגַע" "אֲנִי" "לֹא" "ידע" "אֵיפֹה" "זֶה" "אוֹקֵי" "רֶגַע" "סַבַּבָּה" "עַכְשָׁו" "עֻבָּר" "לְהַבָּא"
[ ... and 1,837 more ]
124 :
[1] "שְׁלוֹשָׁה" "אַרְבָּעָה" "יַאלְלָה" "אָה" "יֵשׁ" "עוֹד" "דַּף" "זֶה" "לֹא" "עבר" "אֶת" "עַמּוּד"
[ ... and 1,741 more ]
129 :
[1] "129_130" "קִצּוּר" "מִי" "ש" "אֲנִי" "תחל" "אֲנִי" "שׁאל" "אֶת" "שְׁאֵלָה" "אָז" "אֲנִי"
[ ... and 1,671 more ]
130 :
[1] "בְּדִיּוּק" "היי" "בְּסֵדֶר" "יַאלְלָה" "אֲנִי" "עֲבִירָה" "יַאלְלָה" "שְׁלוֹשָׁה" "אַרְבָּעָה" "ו" "אוֹקֵי" "אָה"
[ ... and 2,865 more ]
Token Frequencies Table
We create a document-feature matrix (DFM) to examine word frequency distributions across participants. Each token acts as a feature in the frequency table, with rows representing subjects.
data_dfm <- dfm(data_lemma)
head(data_dfm)
Document-feature matrix of: 6 documents, 5,247 features (91.09% sparse) and 0 docvars.
features
docs טוֹב מוּכָן אוּדִי יַאלְלָה שְׁלוֹשָׁה אַרְבָּעָה אוֹקֵי דֻּגְמָה מָסָךְ
117 39 23 2 2 8 13 16 19 3 2
118 26 14 2 1 14 8 13 12 0 2
123 14 12 0 0 0 4 5 10 0 0
124 11 5 0 0 4 10 5 5 1 0
129 6 27 2 0 0 18 18 8 0 1
130 12 15 1 0 5 13 15 8 0 1
[ reached max_nfeat ... 5,237 more features ]
The DFM includes many rare tokens, which can introduce noise into the analysis. To address this, we filter out tokens that occur in less than 1% of the documents, a common threshold for eliminating irrelevant words.
# Define Token Frequency Threshold within documents
data_trim <- data_dfm |> # Omit tokens the appear in less then 1% of documents
dfm_trim(min_docfreq = 0.01, docfreq_type = "prop")
head(data_trim)
Document-feature matrix of: 6 documents, 2,899 features (84.71% sparse) and 0 docvars.
features
docs טוֹב מוּכָן אוּדִי יַאלְלָה שְׁלוֹשָׁה אַרְבָּעָה אוֹקֵי דֻּגְמָה מָסָךְ
117 39 23 2 2 8 13 16 19 3 2
118 26 14 2 1 14 8 13 12 0 2
123 14 12 0 0 0 4 5 10 0 0
124 11 5 0 0 4 10 5 5 1 0
129 6 27 2 0 0 18 18 8 0 1
130 12 15 1 0 5 13 15 8 0 1
[ reached max_nfeat ... 2,889 more features ]
We significantly decreased the number of By trimming the DFM, we significantly reduce the number of features and focus on the most informative tokens.
Term Frequency-Inverse Document Frequency (TF-IDF) is a method to down-weight common, less informative words while emphasizing unique and meaningful tokens. This reduces the bias toward frequently occurring words and highlights important words in each document.
# TF-IDF scaled dfm
data_tfidf <- data_trim |>
dfm_tfidf(scheme_tf = "prop")
head(data_tfidf)
Document-feature matrix of: 6 documents, 2,899 features (84.71% sparse) and 0 docvars.
features
docs טוֹב מוּכָן אוּדִי יַאלְלָה שְׁלוֹשָׁה אַרְבָּעָה אוֹקֵי דֻּגְמָה מָסָךְ
117 3.553290e-04 0 2.233139e-04 0.0013104785 0.0003622030 1.387879e-04 0.0004898806 2.631113e-04 0.0003105601 0.0005572525
118 2.764612e-04 0 2.606217e-04 0.0007647064 0.0007397500 9.967657e-05 0.0004645243 1.939376e-04 0 0.0006503496
123 1.703274e-04 0 0 0 0 5.702415e-05 0.0002044235 1.849168e-04 0 0
124 1.431310e-04 0 0 0 0.0002586409 1.524696e-04 0.0002186328 9.888513e-05 0.0001478425 0
129 7.993922e-05 0 3.265565e-04 0 0 2.810110e-04 0.0008059077 1.620013e-04 0 0.0004074409
130 9.376245e-05 0 9.575632e-05 0 0.0001941394 1.190237e-04 0.0003938611 9.500741e-05 0 0.0002389482
[ reached max_nfeat ... 2,889 more features ]
Latent Semantic Analysis (LSA) is a dimensionality reduction technique that identifies patterns in the relationships between terms (words) and documents (subjects). By applying Singular Value Decomposition (SVD) to a term-document matrix, LSA captures the underlying semantic structure of the text data. This method focuses on the most relevant topics or concepts, effectively grouping words into semantic fields and revealing trends where words commonly occur together within the same contexts—in this case, the subjects’ transcripts.
By proceeding with 10 dimensions (the function’s default), we maintain a model that is both robust and interpretable. To visualize the contribution of additional dimensions, we can plot the singular values, which reveal an “elbow” point around the 10-dimensional mark, suggesting 10 dimensions are sufficient to capture the essential semantic structures in our data without introducing unnecessary complexity.
data_lsa_max <- textmodel_lsa(data_tfidf, nd = 119)
singular_values <- data_lsa_max$sk
plot(singular_values, type = "b", xlab = "Dimensions", ylab = "Singular Values", main = "Singular Value Decay")
# Apply LSA with 10
data_lsa <- textmodel_lsa(data_tfidf)
After applying LSA, the text data is embedded into a reduced-dimensional space. Each document (row) represents a specific context or subject, with the 10 dimensions (columns) capturing distinct semantic relationships. Tokens are considered similar if they frequently appear among same subjects.
GOI <- read.csv("Data.csv")
GOI <- GOI |>
arrange(iSubject) |>
select(iSubject, iPartner, iDyad, GOI)
head(GOI)
NA
sub_doc_df <- as.data.frame(data_lsa$docs)
head(sub_doc_df)
NA
We will insert the interaction quality score (GOI) to our data
GOI <- sub_doc_df |>
rownames_to_column(var = "iSubject") |> # Make ID column
arrange(iSubject) |> # Arrange to fit GOI structure
mutate(iPartner = GOI$iPartner,# Add partner column
iDyad = GOI$iDyad,
GOI = GOI$GOI) |> # Add GOI column
select(iSubject, iPartner, iDyad, GOI, everything())
head(GOI)
NA
We can now explore similarities between subjects using common distance metrics such as cosine similarity, allowing for deeper insights into shared linguistic patterns. to do so we will first need to add the partner variables to our data.
GOI_merged <- GOI %>%
mutate(iSubject = as.numeric(iSubject),
iPartner = as.numeric(iPartner)) |>
left_join(GOI, by = c("iSubject" = "iPartner"), suffix = c("", "_partner")) |>
select(-iSubject_partner)
head(GOI_merged)
We will now calculate the cosine similarities between subjects semantic vectors.
library(proxy)
Attaching package: ‘proxy’
The following object is masked from ‘package:Matrix’:
as.matrix
The following objects are masked from ‘package:stats’:
as.dist, dist
The following object is masked from ‘package:base’:
as.matrix
cosine_similarities <- numeric(nrow(GOI_merged))
for (i in 1:nrow(GOI_merged)) {
# Extract subject and partner vector:
subject_vector <- as.numeric(GOI_merged[i, 5:14])
partner_vector <- as.numeric(GOI_merged[i, 17:26])
# Bind vectors into single matrix
vectors <- rbind(subject_vector, partner_vector)
# Calculate cosine similarity
cosine_similarity <- 1 - proxy::dist(vectors, method = "cosine")
# Store result in the vector
cosine_similarities[i] <- cosine_similarity
}
GOI_merged$cosine_similarity <- cosine_similarities
GOI <- GOI_merged |>
select(iSubject, iPartner, iDyad, cosine_similarity, GOI, everything(), -ends_with("_partner"))
head(GOI)
NA
NA
NA
library(caret)
library(recipes)
library(rsample)
library(yardstick)
Predict GOI with d1 + d2 + …d10 + cosine similarity
# Predict GOI with all variables
rec <- recipe(GOI ~ ., data = GOI) |>
step_rm(iSubject, iPartner, iDyad)
rec
── Recipe ──────────────────────────────────────────────────────────────────────────────────────
── Inputs
Number of variables by role
outcome: 1
predictor: 14
── Operations
• Variables removed: iSubject, iPartner, iDyad
We will conduct leave one dyad out cross validation, testing our tuning on each dyad separtly.
# Extract iDyad for indexing
GOI_index <- GOI %>%
select(iDyad)
# Create a vector of unique dyad IDs and make folds for LODOCV
dyads <- unique(GOI_index$iDyad)
n_dyads <- length(dyads) # Number of dyads (should be 60)
folds <- lapply(dyads, function(d) which(GOI_index$iDyad != d))
names(folds) <- paste0("Fold", dyads)
# 4. Set up trainControl for Leave-One-Dyad-Out Cross-Validation
tc <- trainControl(
method = "cv", # Cross-validation
index = folds, # Custom indices for LODOCV
number = n_dyads
)
We can observe that V4 is included in all prediction combinations. Additionally, cosine similarity appears in every model except the one with a single predictor, underscoring the importance of semantic similarity for interaction quality assessment (GOI).
bestsub.GOI <- leaps::regsubsets(GOI ~ . - iSubject - iPartner -iDyad, data = GOI)
summary(bestsub.GOI)
Subset selection object
Call: regsubsets.formula(GOI ~ . - iSubject - iPartner - iDyad, data = GOI)
11 Variables (and intercept)
Forced in Forced out
cosine_similarity FALSE FALSE
V1 FALSE FALSE
V2 FALSE FALSE
V3 FALSE FALSE
V4 FALSE FALSE
V5 FALSE FALSE
V6 FALSE FALSE
V7 FALSE FALSE
V8 FALSE FALSE
V9 FALSE FALSE
V10 FALSE FALSE
1 subsets of each size up to 8
Selection Algorithm: exhaustive
cosine_similarity V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
1 ( 1 ) " " " " " " " " "*" " " " " " " " " " " " "
2 ( 1 ) "*" " " " " " " "*" " " " " " " " " " " " "
3 ( 1 ) "*" " " " " " " "*" " " " " " " " " "*" " "
4 ( 1 ) "*" " " " " " " "*" " " "*" " " " " "*" " "
5 ( 1 ) "*" " " " " "*" "*" " " "*" " " " " "*" " "
6 ( 1 ) "*" " " "*" "*" "*" " " "*" " " " " "*" " "
7 ( 1 ) "*" "*" "*" "*" "*" " " "*" " " " " "*" " "
8 ( 1 ) "*" "*" "*" "*" "*" " " "*" " " "*" "*" " "
Tune lambda over alpha = 0 vs alpha = 1
set.seed(1)
tg <- expand.grid(
alpha = c(0, 1),
lambda = 10^seq(3, -3, by = -0.1) # Range of lambda
)
l1_l2.GOI <- train(
rec,
data = GOI,
method = "glmnet",
tuneGrid = tg,
trControl = tc
)
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Loaded glmnet 4.1-8
Warning: There were missing values in resampled performance measures.
plot(l1_l2.GOI, xlim = c(0, 400))
l1_l2.GOI$bestTune
min(l1_l2.GOI$results$RMSE)
[1] 17.20466
Tune alpha skewed towards Ridge.
set.seed(1)
tg <- expand.grid(
alpha = seq(0, 0.5, by = 0.1),
lambda = 10^seq(3, -3, by = -0.5) # Range of lambda
)
elastic.GOI <- train(
rec,
data = GOI,
method = "glmnet",
tuneGrid = tg,
trControl = tc
)
Warning: There were missing values in resampled performance measures.
plot(elastic.GOI, xlim = c(0, 400))
elastic.GOI$bestTune
min(elastic.GOI$results$RMSE)
[1] 17.20466
#GOI.null <- lm(GOI ~ 1, data = GOI)
sst <- sum((GOI$GOI - mean(GOI$GOI))^2)
sse <- min(l1_l2.GOI$results$RMSE)^2 * 120
R2 <- 1 - sse / sst
sst
[1] 44130.99
sse
[1] 35520.04
R2
[1] 0.1951223
print(l1_l2.GOI$resample)
NA
Use same rec and tc from last models
rec
── Recipe ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
── Inputs
Number of variables by role
outcome: 1
predictor: 14
── Operations
• Variables removed: iSubject, iPartner, iDyad
head(tc)
$method
[1] "cv"
$number
[1] 60
$repeats
[1] NA
$search
[1] "grid"
$p
[1] 0.75
$initialWindow
NULL
tg <- expand.grid(
mtry = c(1:11) # Evaluate 1 to 11(p) predictors
)
set.seed(1234)
rf.GOI <- train(
rec,
data = GOI,
method = "rf",
tuneGrid = tg,
trControl = tc
)
Loading required namespace: randomForest
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
plot(rf.GOI)
rf.GOI$bestTune
min(rf.GOI$results$RMSE)
[1] 16.30243
sst <- sum((GOI$GOI - mean(GOI$GOI))^2)
cat("SST:", sst)
SST: 44130.99
# Min RMSE for L1 L2
min_rmse_l1l2 <- min(l1_l2.GOI$results$RMSE)
# L1 L2 Explained Variance
ssm_l1l2 <- min_rmse_l1l2^2 * nrow(GOI)
# Elastic Net Explained Variance
ssm_elastic <- min_rmse_elastic^2 * nrow(GOI)
# Min RMSE for RF
min_rmse_rf <- min(rf.GOI$results$RMSE)
# RF Explained Variance
ssm_rf <- min_rmse_rf^2 * nrow(GOI)
R2_l1l2 <- 1 - (ssm_l1l2 / sst)
cat("L1 L2 Perfomance:", "\nMin RMSE:", min_rmse_l1l2, "\nExplained Variance:", ssm_l1l2, "\nRsqrd:", R2_l1l2)
L1 L2 Perfomance:
Min RMSE: 17.20559
Explained Variance: 35523.88
Rsqrd: 0.1950354
R2_elastic <- 1 - (ssm_elastic / sst)
cat("Elastic Net Perfomance:", "\nMin RMSE:", min_rmse_elastic, "\nExplained Variance:", ssm_elastic, "\nRsqrd:", R2_elastic)
Elastic Net Perfomance:
Min RMSE: 17.20559
Explained Variance: 35523.88
Rsqrd: 0.1950354
R2_rf <- 1 - (ssm_rf / sst)
cat("Random Forest Perfomance:", "\nMin RMSE:", min_rmse_rf, "\nExplained Variance:", ssm_rf, "\nRsqrd:", R2_rf)
Random Forest Perfomance:
Min RMSE: 16.30243
Explained Variance: 31892.32
Rsqrd: 0.2773259
varImp(rf.GOI) |>
plot()
varImp(l1_l2.GOI) |>
plot()
NA
NA
We will Plot the 10 must representative words for the most informative dimensions (V9, V4)
V_matrix <- data_lsa$features # Get the word loadings
# Vector of the terms (words) in corpus
terms <- colnames(data_tfidf)
# For each dimension, extract the top contributing words
get_top_words_per_dimension <- function(V_matrix, terms, top_n = 10) {
top_words <- list()
for (dim in 1:ncol(V_matrix)) {
# Get the loadings for this dimension
loadings <- V_matrix[, dim]
# Rank the words by the absolute value of their loadings
ranked_indices <- order(abs(loadings), decreasing = TRUE)[1:top_n]
# Get the corresponding words
top_words[[paste("Dimension", dim)]] <- terms[ranked_indices]
}
return(top_words)
}
# Get the top 10 words for each dimension
top_words_per_dimension <- get_top_words_per_dimension(V_matrix, terms, top_n = 10)
# Display Best Dimensions to Predcit GOI
top_words_per_dimension$`Dimension 4`
[1] "נשׁר" "צֶמַח" "עלם" "אִצְטַדְיוֹן" "רִצְפָּה" "אוֹפַנַּיִם" "פּוּאֶנְטָה" "מִתְעַנְיֵן" "רֶכֶב"
[10] "שַׁיִט"
top_words_per_dimension$`Dimension 9`
[1] "חָחָחָ" "כְּלוֹמַר" "תַּכְשִׁיט" "עַרְבִית" "כֶּלֶב" "גּוּגְל" "מִשְׂגָּב" "3-4" "עתק" "כַּדּוּרֶגֶל"