#MATH 308 Applied Project
#March 25, 2026

library(RSpectra)
library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#################################################################################
#Question 1) SVD for learning word embeddings

#Part 1: Start by importing the given datasets
dictionary <- readLines("dictionary.txt")
## Warning in readLines("dictionary.txt"): incomplete final line found on
## 'dictionary.txt'
head(dictionary)
## [1] "the" "of"  "in"  "and" "to"  "was"
#We see that the 6 most common words are "the", "of", in", "and", "to", and "was"

M <- read.csv("co_occur.csv", header = FALSE)
M <- as.matrix(M)
M[1,2]
##       V2 
## 24637000
M[2,1]
##       V1 
## 24637000
#There are 24,637,000 co-occurrences of the words "the" and "of". Also, the
#matrix is clearly symmetric, as the co-occurrences of "the" and "of" are the same
#as the co-occurrences of "of" and "the"

M[1, 3]
##       V3 
## 13531000
#There are 13,531, 000 co-occurrences of the words "the" and "in". This makes sense
#that it is less than the above example as "in" is less common, in general, than
#the word "of"

M[3, 5]
##      V5 
## 1095500
#There are 1,095,500 co-occurrences of the words "in" and "to". This makes logical sense
#that it is less than both above example, as the chosen words are less frequent


#Part 2: Compute the rank-100 approximation of M_norm using the top 100 singular values
M_norm = log(1+M)

SVD_M = svds(M_norm, k = 100)
U <- SVD_M$u
D <- SVD_M$d
Sigma <- diag(D)
V <- SVD_M$v

M_100 = U %*% Sigma %*% t(V)

plot(D, type = "b", xlab = "Index", ylab = "Singular value", main = "Top 100 singular values of M_tilde")

D_data <- data.frame(Index = 1:length(D), SingularValue = D)
ggplot(D_data) + aes(x = Index, y = SingularValue) + geom_line(alpha = 0.5) + geom_point(shape = 1, size = 2, alpha = 0.8) + 
  labs(title = "Top 100 singular values of M_norm") + theme_minimal()

#M_norm seems to be close to a low rank matrix based off of this graph. A low rank
#matrix 

#part 3
#~M = U SIGMA V^T
V <- SVD_M$v

vec1 <- V[,1]
vec2 <- V[,2]

#top words for each
getter_largest_word_values <- function (vec, dictionary, n=10){
  positivei <- order(vec, decreasing = TRUE)[1:n]
  negativei = order(vec, decreasing = FALSE)[1:n]
  
  list(
    positive = dictionary[positivei],
    negative = dictionary[negativei]
    
    
  )
}

vec1 <- getter_largest_word_values(V[,1], dictionary)
vec2 <- getter_largest_word_values(V[,2], dictionary)
vec3 <- getter_largest_word_values(V[,3], dictionary)
vec4 <- getter_largest_word_values(V[,4], dictionary)
vec5 <- getter_largest_word_values(V[,5], dictionary)

vec1
## $positive
##  [1] "the"  "and"  "of"   "in"   "to"   "for"  "as"   "is"   "with" "was" 
## 
## $negative
##  [1] "gmina"        "insee"        "householder"  "increment"    "peakposition"
##  [6] "midst"        "outskirts"    "iucn"         "voivodeship"  "subtropical"
vec2
## $positive
##  [1] "innings"      "tournaments"  "leagues"      "constituency" "medals"      
##  [6] "locomotives"  "households"   "manga"        "billboard"    "theaters"    
## 
## $negative
##  [1] "the"  "and"  "of"   "in"   "to"   "for"  "with" "as"   "by"   "is"
vec3
## $positive
##  [1] "born"    "john"    "james"   "david"   "robert"  "william" "jr"     
##  [8] "george"  "thomas"  "michael"
## 
## $negative
##  [1] "specific"  "any"       "data"      "provide"   "these"     "certain"  
##  [7] "different" "systems"   "its"       "use"
vec4
## $positive
##  [1] "district"   "county"     "council"    "university" "national"  
##  [6] "regional"   "government" "department" "northern"   "municipal" 
## 
## $negative
##  [1] "you"     "album"   "love"    "me"      "my"      "song"    "episode"
##  [8] "your"    "vocals"  "baby"
vec5
## $positive
##  [1] "jpg"      "km"       "located"  "near"     "road"     "river"   
##  [7] "lake"     "route"    "station"  "mountain"
## 
## $negative
##  [1] "political"  "social"     "religious"  "law"        "legal"     
##  [6] "policy"     "rights"     "government" "minister"   "jewish"
#Part 4: Explore how certain directions in the embedded space correspond to
#specific syntactic or semantic concepts.

row_norms <- sqrt(apply(V^2, 1, sum))
V_embed <- V / matrix(row_norms, nrow = nrow(V), ncol = ncol(V))

V1 <- V_embed[which(dictionary == "woman"), ]
V2 <- V_embed[which(dictionary == "man"), ]
V <- V1 - V2

#Part(a) Present a plot of projections of the embeddings of these words marked on a line.  
words <- c("boy","girl","brother","sister","king","queen","he","she","john","mary","wall","tree")
embeddings <- V_embed[match(words, dictionary), ]
proj<- as.vector(embeddings %*% V)

ggplot() + aes(x = proj, y = rep(0,length(proj)) ) + geom_point() +
  geom_text(aes(label = words), hjust = -1, size = 1.9, angle = 90) +
  geom_hline(yintercept = 0) + theme_minimal()

#We see that the words more associated with men are further to the left, and the
#words associated with women are further to the right. The words in the middle,
#like tree, are gender neutral words. Notably, this isn't completely accurate because
#words like he and she are still quite close to the middle.

#Part(b) Present a similar plot of the projections of the embeddings with new words.
words <- c("math", "matrix", "history", "nurse", "doctor", "pilot", "teacher", "engineer", "science",
           "arts", "literature", "bob", "alice")
embeddings <- V_embed[match(words, dictionary), ]
proj<- as.vector(embeddings %*% V)

ggplot() + aes(x = proj, y = rep(0,length(proj)) ) + geom_point() +
  geom_text(aes(label = words), hjust = -1, size = 1.9, angle = 90) +
  geom_hline(yintercept = 0) + theme_minimal()

#We can see that some jobs are far associated more with men and some with women.
#For example, engineer, math, and science are very skewed towards men, 
#and nurse is very skewed towards women. I also see that male names like Bob are 
#identified to the male side, and female names like Alice are identitified to the 
#female side.

#There is a clear problem with this, as it unfairly associates certain jobs to certain
#genders. In the case of Linkedin, it could assume a profile that is more closely aligned 
#with a job description is more qualified, however, having Bob is more closely asssociated
#engineer than Alice, just based off of the gender association of their name, and not
#their skills. This could heavily impact the acceptance rate for jobs and schools just
#based off of gender.

#Part(c) Propose one method of mitigating the problem discussed in the previous
#part. 

#Since word embedding is a useful tool, we want to be able to adjust our algorithms
#to "debias" them. One method could be to go into the gender component of the embedding
#and specifically neutralize certain words that shouldn't have a bias, like doctor,
#engineer, math, nurse etc. We can do this for different words along different directions too,
#like race or age.

#Part 5: In this question we will explore in more depth the property that directions in
#the embedded space correspond to semantic or syntactic concepts.

#Part(a)
V_montreal <- V_embed[which(dictionary == "montreal"), ]
similarity_list <- V_embed %*% V_montreal
ordered_list <- order(similarity_list, decreasing = TRUE)
dictionary[ordered_list[1:10]]
##  [1] "montreal"  "vancouver" "toronto"   "ottawa"    "winnipeg"  "calgary"  
##  [7] "edmonton"  "detroit"   "quebec"    "atlanta"
#The most similar words to montreal are "vancouver", "toronto", "ottawa", "winnipeg", "calgary",
#"edmonton"  "detroit", "quebec", "atlanta"

#Part (b) 

analogy <- read.table("analogy_task.txt", header = FALSE)
colnames(analogy) <- c("a", "b", "c", "d")
num_correct <- 0
printed_a <- c()
wrong_count <- 0

for (i in 1:nrow(analogy)) {
  a <- analogy$a[i]
  b <- analogy$b[i]
  c <- analogy$c[i]
  
  V_query <- V_embed[which(dictionary == b), ] - V_embed[which(dictionary == a), ] + V_embed[which(dictionary == c), ]
  similarity_list <- V_embed %*% V_query
  similarity_list[match(c(a, b, c), dictionary)] <- -Inf
  ordered_list <- order(similarity_list, decreasing = TRUE)
  top5 <- dictionary[ordered_list[1:5]]
  
  if (dictionary[ordered_list[1]] == analogy$d[i]) {
    num_correct <- num_correct + 1
  } else {
    wrong_count <- wrong_count + 1
    if (!(analogy$d[i] %in% top5) && wrong_count %% 50 == 0 && !(a %in% printed_a)) {
    print(paste(a, "is to", b, "as", c, "is to", analogy$d[i], "| Top 5:", paste(top5, collapse = ", ")))
    printed_a <- c(printed_a, a)
    }
  }
}
## [1] "usa is to dollar as korea is to won | Top 5: dollars, billion, currency, taiwan, singapore"
## [1] "houston is to texas as minneapolis is to minnesota | Top 5: missouri, connecticut, illinois, kentucky, wisconsin"
## [1] "phoenix is to arizona as pittsburgh is to pennsylvania | Top 5: kansas, michigan, oklahoma, nebraska, alabama"
## [1] "indianapolis is to indiana as austin is to texas | Top 5: ohio, virginia, massachusetts, pennsylvania, illinois"
## [1] "detroit is to michigan as denver is to colorado | Top 5: iowa, nebraska, illinois, alabama, oregon"
## [1] "boston is to massachusetts as sacramento is to california | Top 5: vermont, wyoming, nevada, idaho, delaware"
## [1] "denver is to colorado as arlington is to texas | Top 5: wyoming, delaware, nevada, vermont, idaho"
## [1] "nashville is to tennessee as irving is to texas | Top 5: franklin, herbert, jefferson, theodore, sidney"
## [1] "portland is to oregon as henderson is to nevada | Top 5: carroll, greene, wheeler, dixon, baker"
## [1] "miami is to florida as minneapolis is to minnesota | Top 5: connecticut, massachusetts, wisconsin, illinois, oregon"
## [1] "arlington is to texas as madison is to wisconsin | Top 5: virginia, ohio, carolina, illinois, missouri"
## [1] "orlando is to florida as indianapolis is to indiana | Top 5: michigan, illinois, wisconsin, ohio, iowa"
## [1] "complete is to completely as immediate is to immediately | Top 5: totally, seemingly, entirely, largely, partly"
## [1] "most is to mostly as rare is to rarely | Top 5: exclusively, predominantly, mainly, primarily, chiefly"
## [1] "possible is to possibly as sudden is to suddenly | Top 5: probably, presumably, apparently, experiencing, supposedly"
## [1] "quick is to quickly as precise is to precisely | Top 5: soon, gradually, exact, eventually, finally"
## [1] "serious is to seriously as complete is to completely | Top 5: abandon, rebuild, incomplete, rid, succeed"
## [1] "typical is to typically as quick is to quickly | Top 5: normally, usually, generally, rarely, readily"
## [1] "big is to biggest as good is to best | Top 5: excellent, perfect, vital, essential, important"
## [1] "dance is to dancing as listen is to listening | Top 5: talking, thank, laugh, watching, imagine"
## [1] "increase is to increasing as read is to reading | Top 5: addresses, addressed, written, reads, translated"
## [1] "austria is to austrian as greece is to greek | Top 5: bulgarian, turkish, romanian, russian, hungarian"
## [1] "egypt is to egyptian as greece is to greek | Top 5: romanian, bulgarian, hungarian, croatian, serbian"
## [1] "moving is to moved as jumping is to jumped | Top 5: jump, freestyle, swim, skating, skiing"
## [1] "running is to ran as implementing is to implemented | Top 5: implement, adopting, implementation, guidelines, outlined"
## [1] "singing is to sang as thinking is to thought | Top 5: perspective, feels, ya, thinks, realized"
## [1] "dog is to dogs as eagle is to eagles | Top 5: hawk, deer, falcon, cats, lion"
## [1] "rat is to rats as lion is to lions | Top 5: cats, dogs, horses, wolves, deer"
## [1] "play is to plays as think is to thinks | Top 5: says, know, knows, say, maybe"
accuracy <- num_correct / nrow(analogy)
accuracy
## [1] 0.5498657
#Hard Examples: 

#(1)  "phoenix is to arizona as pittsburgh is to pennsylvania | Top 5: kansas, michigan, oklahoma, nebraska, alabama"
#City to state examples are hard because the embedding doesn't understand the context that
#the first words are cities and the seconda are states, it just spits out similar words, which is often
#just other states

#(2) "quick is to quickly as precise is to precisely | Top 5: soon, gradually, exact, eventually, finally"
#This example is hard because, again, the embedding doesn't get that these are the adverbs
#of the adjectives. Soon is a synoymn to quick, so it makes a lot of sense to be similar,
#but the context of the analogy is lost.

#(3) "singing is to sang as thinking is to thought | Top 5: perspective, feels, ya, thinks, realized"
#These words are hard because sang and thought don't follow a specific pattern 
#of endings. The embedding seems to find this extremely hard as it spews very random words

#Part(c)Your goal is to now improve the score of the word embeddings on the analogy task.
k_values <- c(50, 100, 150, 200)
results <- data.frame(k = integer(), accuracy = numeric())

analogy <- read.table("analogy_task.txt", header = FALSE)
colnames(analogy) <- c("a", "b", "c", "d")

for (k in k_values) {
  SVD_M <- svds(M_norm, k = k)
  V <- SVD_M$v
  
  row_norms <- sqrt(apply(V^2, 1, sum))
  V_embed <- V / matrix(row_norms, nrow = nrow(V), ncol = ncol(V))
  
  num_correct <- 0
  
  for (i in 1:nrow(analogy)) {
    a <- analogy$a[i]
    b <- analogy$b[i]
    c <- analogy$c[i]
    
    V_query <- V_embed[which(dictionary == b), ] - 
      V_embed[which(dictionary == a), ] + 
      V_embed[which(dictionary == c), ]
    
    similarity_list <- V_embed %*% V_query
    similarity_list[match(c(a, b, c), dictionary)] <- -Inf
    ordered_list <- order(similarity_list, decreasing = TRUE)
    
    if (dictionary[ordered_list[1]] == analogy$d[i]) {
      num_correct <- num_correct + 1
    }
  }
  
  accuracy <- num_correct / nrow(analogy)
  results <- rbind(results, data.frame(k = k, accuracy = accuracy))
  
  print(paste("k =", k, "| accuracy =", accuracy))
}
## [1] "k = 50 | accuracy = 0.447985675917637"
## [1] "k = 100 | accuracy = 0.549865711727842"
## [1] "k = 150 | accuracy = 0.578155774395703"
## [1] "k = 200 | accuracy = 0.601253357206804"
results
##     k  accuracy
## 1  50 0.4479857
## 2 100 0.5498657
## 3 150 0.5781558
## 4 200 0.6012534
#We tried improving the accuracy of our embedding by varying the dimension of the SVD
#to test different amounts of singular directions (k = 50, 100, 150, and 200). Our accuracy increased
#as we increased k, however, so did the time to compute the analogies. Using 200 as our 
#dimension for the SVD got us a higher accuracy at 0.601, but it took a considerable amount
#more time to process. Thus, there is a tradeoff in the case to improve our accuracy.