#MATH 308 Applied Project
#March 25, 2026
library(RSpectra)
library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#################################################################################
#Question 1) SVD for learning word embeddings
#Part 1: Start by importing the given datasets
dictionary <- readLines("dictionary.txt")
## Warning in readLines("dictionary.txt"): incomplete final line found on
## 'dictionary.txt'
head(dictionary)
## [1] "the" "of" "in" "and" "to" "was"
#We see that the 6 most common words are "the", "of", "in", "and", "to", and "was"
M <- read.csv("co_occur.csv", header = FALSE)
M <- as.matrix(M)
M[1,2]
## V2
## 24637000
M[2,1]
## V1
## 24637000
#There are 24,637,000 co-occurrences of the words "the" and "of". Also, the
#matrix is clearly symmetric, as the co-occurrences of "the" and "of" are the same
#as the co-occurrences of "of" and "the"
M[1, 3]
## V3
## 13531000
#There are 13,531,000 co-occurrences of the words "the" and "in". It makes sense that
#this count is lower than the one above, since "in" is, in general, less common than
#the word "of"
M[3, 5]
## V5
## 1095500
#There are 1,095,500 co-occurrences of the words "in" and "to". It makes sense that this
#is lower than both of the counts above, since the chosen words are less frequent
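#As a quick sanity check (a minimal sketch, not part of the original write-up), we can
#verify the symmetry claim for the whole matrix rather than spot-checking entries:
all(M == t(M)) #TRUE if every co-occurrence count is mirrored across the diagonal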
#Part 2: Compute the rank-100 approximation of M_norm using the top 100 singular values
M_norm <- log(1 + M)
SVD_M <- svds(M_norm, k = 100) #truncated SVD: top 100 singular values/vectors
U <- SVD_M$u
D <- SVD_M$d
Sigma <- diag(D)
V <- SVD_M$v
M_100 <- U %*% Sigma %*% t(V)  #rank-100 approximation of M_norm
plot(D, type = "b", xlab = "Index", ylab = "Singular value", main = "Top 100 singular values of M_tilde")

D_data <- data.frame(Index = 1:length(D), SingularValue = D)
ggplot(D_data) + aes(x = Index, y = SingularValue) + geom_line(alpha = 0.5) + geom_point(shape = 1, size = 2, alpha = 0.8) +
labs(title = "Top 100 singular values of M_norm") + theme_minimal()

#M_norm seems to be close to a low-rank matrix based on this graph. A low-rank
#matrix would have only a small number of large singular values, with the rest much
#smaller, and the plotted singular values appear to decay in that way, so a rank-100
#approximation should capture much of the structure of M_norm.
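#To put a number on the "close to low rank" claim, a minimal sketch (using M_norm and
#M_100 from above; it forms the full difference matrix, so it is memory-hungry) computes
#the relative Frobenius error of the rank-100 approximation:
rel_error <- norm(M_norm - M_100, type = "F") / norm(M_norm, type = "F")
rel_error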
#Part 3: ~M = U Sigma V^T; examine the top words along the first few right singular
#vectors (the columns of V)
V <- SVD_M$v
vec1 <- V[,1]
vec2 <- V[,2]
#Helper: for a given singular vector, return the n dictionary words with the largest
#positive entries and the n words with the largest negative entries
getter_largest_word_values <- function(vec, dictionary, n = 10) {
  positivei <- order(vec, decreasing = TRUE)[1:n]
  negativei <- order(vec, decreasing = FALSE)[1:n]
  list(
    positive = dictionary[positivei],
    negative = dictionary[negativei]
  )
}
vec1 <- getter_largest_word_values(V[,1], dictionary)
vec2 <- getter_largest_word_values(V[,2], dictionary)
vec3 <- getter_largest_word_values(V[,3], dictionary)
vec4 <- getter_largest_word_values(V[,4], dictionary)
vec5 <- getter_largest_word_values(V[,5], dictionary)
vec1
## $positive
## [1] "the" "and" "of" "in" "to" "for" "as" "is" "with" "was"
##
## $negative
## [1] "gmina" "insee" "householder" "increment" "peakposition"
## [6] "midst" "outskirts" "iucn" "voivodeship" "subtropical"
vec2
## $positive
## [1] "innings" "tournaments" "leagues" "constituency" "medals"
## [6] "locomotives" "households" "manga" "billboard" "theaters"
##
## $negative
## [1] "the" "and" "of" "in" "to" "for" "with" "as" "by" "is"
vec3
## $positive
## [1] "born" "john" "james" "david" "robert" "william" "jr"
## [8] "george" "thomas" "michael"
##
## $negative
## [1] "specific" "any" "data" "provide" "these" "certain"
## [7] "different" "systems" "its" "use"
vec4
## $positive
## [1] "district" "county" "council" "university" "national"
## [6] "regional" "government" "department" "northern" "municipal"
##
## $negative
## [1] "you" "album" "love" "me" "my" "song" "episode"
## [8] "your" "vocals" "baby"
vec5
## $positive
## [1] "jpg" "km" "located" "near" "road" "river"
## [7] "lake" "route" "station" "mountain"
##
## $negative
## [1] "political" "social" "religious" "law" "legal"
## [6] "policy" "rights" "government" "minister" "jewish"
#Part 4: Explore how certain directions in the embedded space correspond to
#specific syntactic or semantic concepts.
#Normalize each word embedding (row of V) to unit length, then form the gender
#direction as the difference between the embeddings of "woman" and "man"
row_norms <- sqrt(apply(V^2, 1, sum))
V_embed <- V / matrix(row_norms, nrow = nrow(V), ncol = ncol(V))
v_woman <- V_embed[which(dictionary == "woman"), ]
v_man <- V_embed[which(dictionary == "man"), ]
gender_dir <- v_woman - v_man
#Part(a) Present a plot of projections of the embeddings of these words marked on a line.
words <- c("boy","girl","brother","sister","king","queen","he","she","john","mary","wall","tree")
embeddings <- V_embed[match(words, dictionary), ]
proj <- as.vector(embeddings %*% gender_dir)
ggplot() + aes(x = proj, y = rep(0,length(proj)) ) + geom_point() +
geom_text(aes(label = words), hjust = -1, size = 1.9, angle = 90) +
geom_hline(yintercept = 0) + theme_minimal()

#We see that words more associated with men project further to the left, and words
#more associated with women project further to the right. The words in the middle,
#like tree, are gender-neutral. Notably, the separation isn't perfect: words like
#"he" and "she" still sit quite close to the middle.
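#For reference, a small sketch printing the numeric projections, which makes it easy
#to see how close "he" and "she" sit to zero compared with the other words:
round(setNames(proj, words), 3)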
#Part(b) Present a similar plot of the projections of the embeddings with new words.
words <- c("math", "matrix", "history", "nurse", "doctor", "pilot", "teacher", "engineer", "science",
"arts", "literature", "bob", "alice")
embeddings <- V_embed[match(words, dictionary), ]
proj <- as.vector(embeddings %*% gender_dir)
ggplot() + aes(x = proj, y = rep(0,length(proj)) ) + geom_point() +
geom_text(aes(label = words), hjust = -1, size = 1.9, angle = 90) +
geom_hline(yintercept = 0) + theme_minimal()

#We can see that some jobs are associated far more with men and some far more with women.
#For example, engineer, math, and science are heavily skewed towards men,
#and nurse is heavily skewed towards women. We also see that male names like Bob project
#to the male side, and female names like Alice project to the female side.
#There is a clear problem with this, as it unfairly associates certain jobs with certain
#genders. In the case of LinkedIn, an algorithm could assume that a profile more closely
#aligned with a job description is more qualified; however, Bob would be more closely
#associated with engineer than Alice, based only on the gender association of their names
#and not on their skills. This could heavily impact acceptance rates for jobs and schools
#purely on the basis of gender.
#Part(c) Propose one method of mitigating the problem discussed in the previous
#part.
#Since word embeddings are a useful tool, we want to be able to adjust our algorithms
#to "debias" them. One method is to take the gender direction in the embedding space and
#specifically neutralize certain words that shouldn't carry a bias, like doctor,
#engineer, math, and nurse, by removing their component along that direction. We can do
#the same for other words along other directions too, like race or age.
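#A minimal sketch of the neutralization idea above (one possible implementation, not part
#of the original assignment; the word list is purely illustrative): project each neutral
#word's embedding off the gender direction and renormalize, working on a copy so the rest
#of the analysis is unaffected.
g <- V_embed[which(dictionary == "woman"), ] - V_embed[which(dictionary == "man"), ]
g <- g / sqrt(sum(g^2))                                   #unit-length gender direction
neutral_words <- c("doctor", "engineer", "math", "nurse") #illustrative choices
V_debiased <- V_embed                                     #copy, so later parts are unchanged
for (w in neutral_words) {
  idx <- which(dictionary == w)
  v <- V_debiased[idx, ] - sum(V_debiased[idx, ] * g) * g #subtract projection onto g
  V_debiased[idx, ] <- v / sqrt(sum(v^2))                 #renormalize to unit length
}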
#Part 5: In this question we will explore in more depth the property that directions in
#the embedded space correspond to semantic or syntactic concepts.
#Part(a)
V_montreal <- V_embed[which(dictionary == "montreal"), ]
similarity_list <- V_embed %*% V_montreal #rows of V_embed are unit vectors, so these are cosine similarities
ordered_list <- order(similarity_list, decreasing = TRUE)
dictionary[ordered_list[1:10]]
## [1] "montreal" "vancouver" "toronto" "ottawa" "winnipeg" "calgary"
## [7] "edmonton" "detroit" "quebec" "atlanta"
#The most similar words to montreal are "vancouver", "toronto", "ottawa", "winnipeg", "calgary",
#"edmonton" "detroit", "quebec", "atlanta"
#Part (b)
analogy <- read.table("analogy_task.txt", header = FALSE)
colnames(analogy) <- c("a", "b", "c", "d")
num_correct <- 0
printed_a <- c()
wrong_count <- 0
for (i in 1:nrow(analogy)) {
  a <- analogy$a[i]
  b <- analogy$b[i]
  c <- analogy$c[i]
  #"a is to b as c is to ?": predict the word whose embedding is closest to b - a + c
  V_query <- V_embed[which(dictionary == b), ] - V_embed[which(dictionary == a), ] + V_embed[which(dictionary == c), ]
  similarity_list <- V_embed %*% V_query
  similarity_list[match(c(a, b, c), dictionary)] <- -Inf #exclude the query words themselves
  ordered_list <- order(similarity_list, decreasing = TRUE)
  top5 <- dictionary[ordered_list[1:5]]
  if (dictionary[ordered_list[1]] == analogy$d[i]) {
    num_correct <- num_correct + 1
  } else {
    wrong_count <- wrong_count + 1
    #print a sample of hard misses: true answer not in the top 5, roughly every 50th miss,
    #and at most one example per first word a
    if (!(analogy$d[i] %in% top5) && wrong_count %% 50 == 0 && !(a %in% printed_a)) {
      print(paste(a, "is to", b, "as", c, "is to", analogy$d[i], "| Top 5:", paste(top5, collapse = ", ")))
      printed_a <- c(printed_a, a)
    }
  }
}
## [1] "usa is to dollar as korea is to won | Top 5: dollars, billion, currency, taiwan, singapore"
## [1] "houston is to texas as minneapolis is to minnesota | Top 5: missouri, connecticut, illinois, kentucky, wisconsin"
## [1] "phoenix is to arizona as pittsburgh is to pennsylvania | Top 5: kansas, michigan, oklahoma, nebraska, alabama"
## [1] "indianapolis is to indiana as austin is to texas | Top 5: ohio, virginia, massachusetts, pennsylvania, illinois"
## [1] "detroit is to michigan as denver is to colorado | Top 5: iowa, nebraska, illinois, alabama, oregon"
## [1] "boston is to massachusetts as sacramento is to california | Top 5: vermont, wyoming, nevada, idaho, delaware"
## [1] "denver is to colorado as arlington is to texas | Top 5: wyoming, delaware, nevada, vermont, idaho"
## [1] "nashville is to tennessee as irving is to texas | Top 5: franklin, herbert, jefferson, theodore, sidney"
## [1] "portland is to oregon as henderson is to nevada | Top 5: carroll, greene, wheeler, dixon, baker"
## [1] "miami is to florida as minneapolis is to minnesota | Top 5: connecticut, massachusetts, wisconsin, illinois, oregon"
## [1] "arlington is to texas as madison is to wisconsin | Top 5: virginia, ohio, carolina, illinois, missouri"
## [1] "orlando is to florida as indianapolis is to indiana | Top 5: michigan, illinois, wisconsin, ohio, iowa"
## [1] "complete is to completely as immediate is to immediately | Top 5: totally, seemingly, entirely, largely, partly"
## [1] "most is to mostly as rare is to rarely | Top 5: exclusively, predominantly, mainly, primarily, chiefly"
## [1] "possible is to possibly as sudden is to suddenly | Top 5: probably, presumably, apparently, experiencing, supposedly"
## [1] "quick is to quickly as precise is to precisely | Top 5: soon, gradually, exact, eventually, finally"
## [1] "serious is to seriously as complete is to completely | Top 5: abandon, rebuild, incomplete, rid, succeed"
## [1] "typical is to typically as quick is to quickly | Top 5: normally, usually, generally, rarely, readily"
## [1] "big is to biggest as good is to best | Top 5: excellent, perfect, vital, essential, important"
## [1] "dance is to dancing as listen is to listening | Top 5: talking, thank, laugh, watching, imagine"
## [1] "increase is to increasing as read is to reading | Top 5: addresses, addressed, written, reads, translated"
## [1] "austria is to austrian as greece is to greek | Top 5: bulgarian, turkish, romanian, russian, hungarian"
## [1] "egypt is to egyptian as greece is to greek | Top 5: romanian, bulgarian, hungarian, croatian, serbian"
## [1] "moving is to moved as jumping is to jumped | Top 5: jump, freestyle, swim, skating, skiing"
## [1] "running is to ran as implementing is to implemented | Top 5: implement, adopting, implementation, guidelines, outlined"
## [1] "singing is to sang as thinking is to thought | Top 5: perspective, feels, ya, thinks, realized"
## [1] "dog is to dogs as eagle is to eagles | Top 5: hawk, deer, falcon, cats, lion"
## [1] "rat is to rats as lion is to lions | Top 5: cats, dogs, horses, wolves, deer"
## [1] "play is to plays as think is to thinks | Top 5: says, know, knows, say, maybe"
accuracy <- num_correct / nrow(analogy)
accuracy
## [1] 0.5498657
#Hard Examples:
#(1) "phoenix is to arizona as pittsburgh is to pennsylvania | Top 5: kansas, michigan, oklahoma, nebraska, alabama"
#City-to-state examples are hard because the embedding doesn't capture the relation that
#the first word is a city and the second is a state; it just returns words similar to the
#query, which are often other states
#(2) "quick is to quickly as precise is to precisely | Top 5: soon, gradually, exact, eventually, finally"
#This example is hard because, again, the embedding doesn't capture that these are the
#adverb forms of the adjectives. "Soon" is a near-synonym of "quick", so it makes sense
#that it scores as similar, but the structure of the analogy is lost.
#(3) "singing is to sang as thinking is to thought | Top 5: perspective, feels, ya, thinks, realized"
#These words are hard because "sang" and "thought" are irregular past tenses that don't
#follow a consistent pattern of endings. The embedding appears to struggle with these and
#returns largely unrelated words.
#Part(c)Your goal is to now improve the score of the word embeddings on the analogy task.
k_values <- c(50, 100, 150, 200)
results <- data.frame(k = integer(), accuracy = numeric())
analogy <- read.table("analogy_task.txt", header = FALSE)
colnames(analogy) <- c("a", "b", "c", "d")
for (k in k_values) {
  #recompute the truncated SVD and the normalized embeddings for this value of k
  SVD_M <- svds(M_norm, k = k)
  V <- SVD_M$v
  row_norms <- sqrt(apply(V^2, 1, sum))
  V_embed <- V / matrix(row_norms, nrow = nrow(V), ncol = ncol(V))
  num_correct <- 0
  for (i in 1:nrow(analogy)) {
    a <- analogy$a[i]
    b <- analogy$b[i]
    c <- analogy$c[i]
    V_query <- V_embed[which(dictionary == b), ] -
      V_embed[which(dictionary == a), ] +
      V_embed[which(dictionary == c), ]
    similarity_list <- V_embed %*% V_query
    similarity_list[match(c(a, b, c), dictionary)] <- -Inf
    ordered_list <- order(similarity_list, decreasing = TRUE)
    if (dictionary[ordered_list[1]] == analogy$d[i]) {
      num_correct <- num_correct + 1
    }
  }
  accuracy <- num_correct / nrow(analogy)
  results <- rbind(results, data.frame(k = k, accuracy = accuracy))
  print(paste("k =", k, "| accuracy =", accuracy))
}
## [1] "k = 50 | accuracy = 0.447985675917637"
## [1] "k = 100 | accuracy = 0.549865711727842"
## [1] "k = 150 | accuracy = 0.578155774395703"
## [1] "k = 200 | accuracy = 0.601253357206804"
results
## k accuracy
## 1 50 0.4479857
## 2 100 0.5498657
## 3 150 0.5781558
## 4 200 0.6012534
#We tried to improve the accuracy of our embedding by varying the dimension of the SVD,
#testing different numbers of singular directions (k = 50, 100, 150, and 200). Accuracy
#increased as we increased k, but so did the time needed to compute the SVD and evaluate
#the analogies. Using k = 200 gave the highest accuracy at 0.601, but it took considerably
#more time to run. Thus, there is a tradeoff between accuracy and computation time.
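#To make the accuracy/time tradeoff concrete, a minimal sketch (not run as part of the
#analysis above) times just the truncated SVD step for each k; the analogy evaluation
#loop adds further time on top of this.
svd_times <- sapply(k_values, function(k) as.numeric(system.time(svds(M_norm, k = k))["elapsed"]))
data.frame(k = k_values, svd_seconds = svd_times)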