Factor analysis / PCA
We’ll try to do PCA or FA by dummy-coding all values per participant,
weighted by the importance indicated by the participant, across
perspectives.
I’ll first try a binary and just for one perspective.
paper_bin <- df_amd %>%
filter(type == "uspaper") %>%
select(PID,value) %>%
mutate(var = 1) %>%
pivot_wider(names_from = value,
values_from = var) %>%
mutate_at(vars(achievement:kindness),function(x){replace_na(x,0)}) %>%
select(-PID)
paraellel <- fa.parallel(paper_bin,fm = 'minres', fa = 'fa')
factors <- fa(paper_bin, nfactors = 12, rotate = "oblimin", fm = "minres")
as.data.frame(factors$weights) %>%
arrange(desc(MR8))
Now i’ll try a weighted score and just for one perspective.
paper_weighted <- df_amd %>%
filter(type == "uspaper") %>%
select(PID,value,weight) %>%
pivot_wider(names_from = value,
values_from = weight) %>%
mutate_at(vars(achievement:kindness),function(x){replace_na(x,0)}) %>%
select(-PID)
paraellel <- fa.parallel(paper_weighted,fm = 'minres', fa = 'fa')
factors <- fa(paper_weighted, nfactors = 10, rotate = "oblimin", fm = "minres")
as.data.frame(factors$weights) %>%
arrange(desc(MR7))
I’ll do this for all three persepctives
paper_weighted <- df_amd %>%
filter(type == "uspaper") %>%
select(PID,value,weight) %>%
pivot_wider(names_from = value,
values_from = weight) %>%
select(-PID) %>%
mutate_all(function(x){replace_na(x,0)})
practice_weighted <- df_amd %>%
filter(type == "uspractice") %>%
select(PID,value,weight) %>%
pivot_wider(names_from = value,
values_from = weight) %>%
select(-PID) %>%
mutate_all(function(x){replace_na(x,0)})
ideal_weighted <- df_amd %>%
filter(type == "ideal") %>%
select(PID,value,weight) %>%
pivot_wider(names_from = value,
values_from = weight) %>%
select(-PID) %>%
mutate_all(function(x){replace_na(x,0)})
paraellel <- fa.parallel(paper_weighted,fm = 'minres', fa = 'fa')
factors_paper <- fa(paper_weighted, nfactors = 8, rotate = "oblimin", fm = "minres")
factors_practice <- fa(practice_weighted, nfactors = 8, rotate = "oblimin", fm = "minres")
factors_ideal <- fa(ideal_weighted, nfactors = 8, rotate = "oblimin", fm = "minres")
as.data.frame(factors_ideal$weights) %>%
arrange(desc(MR8)) %>%
select(MR8)
ok. it’s not perfect, but it’s pretty cool. I think it’ll get better
if I’m able to combine all three perspectives together. The problem with
that is that some people selected the same value for different
perspectives. So, when that happens, I’ll put all of them in.
firsts <- df_amd %>%
select(PID,value,weight) %>%
group_by(PID,value) %>%
slice(1) %>%
ungroup() %>%
pivot_wider(names_from = value,
values_from = weight) %>%
select(-PID) %>%
mutate_all(function(x){replace_na(x,0)})
seconds <- df_amd %>%
select(PID,value,weight) %>%
group_by(PID,value) %>%
slice(2) %>%
ungroup() %>%
pivot_wider(names_from = value,
values_from = weight) %>%
select(-PID) %>%
mutate_all(function(x){replace_na(x,0)})
thirds <- df_amd %>%
select(PID,value,weight) %>%
group_by(PID,value) %>%
slice(3) %>%
ungroup() %>%
pivot_wider(names_from = value,
values_from = weight) %>%
select(-PID) %>%
mutate_all(function(x){replace_na(x,0)})
items4FA <- firsts %>%
bind_rows(seconds) %>%
bind_rows(thirds) %>%
mutate_all(function(x){replace_na(x,0)})
paraellel <- fa.parallel(items4FA,fm = 'minres', fa = 'fa')
factors <- fa(paper_weighted, nfactors = 14, rotate = "oblimin", fm = "minres")
as.data.frame(factors$weights) %>%
arrange(desc(MR14)) %>%
select(MR14)
factors <- fa(paper_weighted, nfactors = 8, rotate = "oblimin", fm = "minres")
as.data.frame(factors$weights) %>%
arrange(desc(MR4)) %>%
select(MR4)
hmm, idk about this. lets try PCA.
PCA(items4FA,axes = c(4,4))
sol1 <- PCA(items4FA,ncp = 10)
as.data.frame(sol1$var$contrib) %>%
arrange(desc(Dim.10)) %>%
select(Dim.10)
not great either.
What I’ll do here is actually get the cosine similarities of all
words to each other. Then, I’ll be able to group them based on
similarity. Let’s see how this goes
vec_1 <- df_amd %>%
select(value) %>%
distinct() %>%
arrange(value) %>%
unlist() %>%
unname()
vec_2 <- df_amd %>%
select(value) %>%
distinct() %>%
arrange(value) %>%
unlist() %>%
unname()
all_df <- tibble(vec_1 = "temp",vec_2 = "temp")
count = 0
for(i in vec_1){
count = count + 1
current_df <- expand_grid(vec_1,vec_2) %>%
filter(vec_1 == i) %>%
slice(-c(1:count))
all_df <- all_df %>%
bind_rows(current_df)
}
value_combinations <- all_df %>%
filter(vec_1 != "temp") %>%
mutate(row_num = row_number())
row_num <- value_combinations %>%
select(row_num) %>%
unlist() %>%
unname()
df_valuesims = tibble(val1 = "temp",val2 = "temp",cosine = 0)
for(i in row_num){
v1 <- value_combinations %>%
filter(row_num == i) %>%
select(vec_1) %>%
unlist() %>%
unname()
v2 <- value_combinations %>%
filter(row_num == i) %>%
select(vec_2) %>%
unlist() %>%
unname()
mt_cosine <- df_amd %>%
select(value,X1:X100) %>%
filter(value == v1 | value == v2) %>%
distinct() %>%
pivot_longer(X1:X100,
names_to = "names",
values_to = "values") %>%
pivot_wider(names_from = "value",
values_from = "values") %>%
select(-names) %>%
as.matrix() %>%
cosine()
print(i)
current_cosine = mt_cosine[v1,v2]
current_scores = tibble(val1 = v1,
val2 = v2,
cosine = current_cosine)
df_valuesims <- df_valuesims %>%
bind_rows(current_scores)
}
ok, cool. for this next part, I’ll try to group them together. What
I’ll do is take the most similar words to each word and see if those
words are similar to each other as well. If yes, they should cluster
together.
top_10 <- df_valuesims %>%
filter(val1 != "temp") %>%
bind_rows(df_valuesims %>%
filter(val1 != "temp") %>%
rename(val1 = val2,
val2 = val1)) %>%
arrange(val1,desc(cosine)) %>%
group_by(val1) %>%
slice(1:10) %>%
select(-cosine)
top_5 <- df_valuesims %>%
filter(val1 != "temp") %>%
bind_rows(df_valuesims %>%
filter(val1 != "temp") %>%
rename(val1 = val2,
val2 = val1)) %>%
arrange(val1,desc(cosine)) %>%
group_by(val1) %>%
slice(1:5)
ig <- graph_from_data_frame(top_5, directed = FALSE, vertices = NULL)
plot(ig)
clus <- cluster_edge_betweenness(
ig,
weights = top_5$cosine,
directed = FALSE,
edge.betweenness = TRUE,
merges = TRUE,
bridges = TRUE,
modularity = TRUE,
membership = TRUE
)
top_5 %>%
select(-cosine) %>%
left_join(top_5 %>%
select(-cosine) %>%
rename(val2 = val1,
val_rec = val2)) %>%
group_by(val1,val2) %>%
mutate(is_recip = ifelse(val1 %in% val_rec,1,0)) %>%
filter(is_recip == 1) %>%
select(val1,val2) %>%
distinct()
fornet <- df_valuesims %>%
filter(val1 != "temp") %>%
bind_rows(df_valuesims %>%
filter(val1 != "temp") %>%
rename(val1 = val2,
val2 = val1)) %>%
arrange(val1,val2)
cor_matrix <- fornet %>%
pivot_wider(names_from = val2,
values_from = cosine) %>%
mutate_at(vars("capitalism":"achievement"),function(x){replace_na(x,1)}) %>%
select(-val1) %>%
select(achievement,everything()) %>%
as.matrix()
row.names(cor_matrix) = fornet %>%
pivot_wider(names_from = val2,
values_from = cosine) %>%
mutate_at(vars("self government":"achievement"),function(x){replace_na(x,1)}) %>%
select(val1) %>%
unname() %>%
unlist()
mat <- df_amd %>%
select(value,X1:X100) %>%
distinct() %>%
select(-value) %>%
as.matrix()
rownames(mat) = df_amd %>%
select(value) %>%
distinct() %>%
unlist() %>%
unname()
d = dist(mat)
#kmeans
plot(stats::hclust(d,method = "ward.D2"))
from chat gpt
# Load required packages
library(text2vec)
# Define a function to calculate cosine similarity between two vectors
cosine_similarity <- function(x, y) {
dot_product <- sum(x * y)
norm_x <- sqrt(sum(x^2))
norm_y <- sqrt(sum(y^2))
similarity <- dot_product / (norm_x * norm_y)
return(similarity)
}
# Define a function to calculate cosine similarity matrix between words
calculate_similarity_matrix <- function(words) {
n <- length(words)
similarity_matrix <- matrix(0, nrow = n, ncol = n)
for (i in 1:n) {
for (j in 1:n) {
similarity_matrix[i, j] <- cosine_similarity(words[i, ], words[j, ])
}
}
return(similarity_matrix)
}
# Define a function to perform PCA on the similarity matrix
perform_pca <- function(similarity_matrix) {
pca <- PCA(similarity_matrix, graph = FALSE)
return(pca$eig)
}
# Sample word vectors
word_vectors <- matrix(runif(100), nrow = 10)
# Calculate cosine similarity matrix
similarity_matrix <- calculate_similarity_matrix(word_vectors)
# Perform PCA on similarity matrix
pca_result <- perform_pca(cor_matrix)
# Print the eigenvalues
print(pca_result$eigenvalues)
pca_result
pca_solution <- PCA(cor_matrix)
pca_solution$var
pca_solution_1 <- PCA(mat,ncp = 7,axes = 2)
pca_solution_1 <- PCA(mat,ncp = 10)
#write.csv(pca_solution_1$ind$contrib,"ind_contrib_10.csv")
df_pcasol <- as.data.frame(pca_solution_1$ind$contrib)
let’s try kmeans clustering
df_valuevectors <- df_amd %>%
select(value,X1:X100) %>%
distinct()
my_rownames = df_valuevectors %>%
select(value) %>%
unlist() %>%
unname()
mt_valuevectors <- data.matrix(df_valuevectors)
rownames(mt_valuevectors) <- my_rownames
mt_valuevectors <- mt_valuevectors[,2:101]
distance <- get_dist(mt_valuevectors)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
Ten clusters
hmm, this isn’t really giving me anything meaningful. Let’s try to do
this by intuition. Broad components, the way I see them, are:
1. Freedom: freedom, liberty, freedom of speech, rights, freedom of
religion, competition, right to bear arms, self determination, religion,
democracy, individualism, self government, privacy.
2. Compassion: respect, integrity, empathy, fairness, kindness,
tolerance, compassion.
3. Capitalism: money, opportunity, capitalism, greed, success,
achievement.
4. Security: life, peace.
5. Equality: diversity, equality, equal opportunity.
6. National Pride: power, independence, strong, unity, nationalism,
strength, patriotism.
7. Progress: progress, change, hard work, education.
8. Happiness: pursuit of happiness, happiness, prosperity Uncategorized:
justice, informality, honesty.
I’ll try to verify this. Basically, I’ll get the cosine similarity of
each of our 48 values with each of these 8 overarching themes. That’ll
help me get a sense of which one is most similar to which. And which
don’t really map onto to any of these themes. Let’s see.
library(text2vec)
text8_file = "~/text8"
if (!file.exists(text8_file)) {
download.file("http://mattmahoney.net/dc/text8.zip", "~/text8.zip")
unzip ("~/text8.zip", files = "text8", exdir = "~/")
}
wiki = readLines(text8_file, n = 1, warn = FALSE)
# Create iterator over tokens
tokens <- space_tokenizer(wiki)
# Create vocabulary. Terms will be ngrams (1 to 4 tokens).
it = itoken(tokens, progressbar = FALSE)
vocab <- create_vocabulary(it,ngram = c(ngram_min = 1,ngram_max = 4))
vocab <- prune_vocabulary(vocab, term_count_min = 3L)
# Use our filtered vocabularyzoo
vectorizer <- vocab_vectorizer(vocab)
# use window of 5 for context words
tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)
glove = GlobalVectors$new(rank = 100, x_max = 10)
wv_main = glove$fit_transform(tcm, n_iter = 10, convergence_tol = 0.01, n_threads = 8)
wv_context = glove$components
word_vectors = wv_main + t(wv_context)
themes_vectors <- data.frame(word_vectors) %>%
mutate(word = rownames(word_vectors)) %>%
filter(word == "freedom" |
word == "compassion" |
word == "capitalism" |
word == "security" |
word == "equality" |
word == "national_pride" |
word == "progress" |
word == "happiness") %>%
select(word,everything())
similarity scores
row.names(themes_vectors) = NULL
themes_vectors <- themes_vectors %>%
mutate(value = paste0(word,"_theme")) %>%
select(-word)
df_themesimis = tibble(value = "random",
simi_national_pride = 0,
simi_compassion = 0,
simi_happiness = 0,
simi_equality = 0,
simi_capitalism = 0,
simi_progress = 0,
simi_freedom = 0,
simi_security = 0)
value_vectors <- df_amd %>%
select(value,X1:X100) %>%
distinct()
values = unique(df_amd$value)
for(i in values){
mt_cosine <- value_vectors %>%
filter(value == i) %>%
bind_rows(themes_vectors) %>%
pivot_longer(X1:X100,
names_to = "names",
values_to = "values") %>%
pivot_wider(names_from = "value",
values_from = "values") %>%
select(-names) %>%
as.matrix() %>%
cosine()
cosine_national_pride = mt_cosine[2,1]
cosine_compassion = mt_cosine[3,1]
cosine_happiness = mt_cosine[4,1]
cosine_equality = mt_cosine[5,1]
cosine_capitalism = mt_cosine[6,1]
cosine_progress = mt_cosine[7,1]
cosine_freedom = mt_cosine[8,1]
cosine_security = mt_cosine[9,1]
current_scores = tibble(value = i,
simi_national_pride = cosine_national_pride,
simi_compassion = cosine_compassion,
simi_happiness = cosine_happiness,
simi_equality = cosine_equality,
simi_capitalism = cosine_capitalism,
simi_progress = cosine_progress,
simi_freedom = cosine_freedom,
simi_security = cosine_security)
df_themesimis <- df_themesimis %>%
bind_rows(current_scores)
}
df_themesimis %>%
arrange(desc(simi_capitalism))