url_book = "https://www.gutenberg.org/files/2701/2701-h/2701-h.htm#link2HCH0004"
html <- read_html(url_book, encoding = "UTF-8")
text <- html_text2(html)
print(str_extract(text, "(\\w[^\\n]*)")) #printing the first sentence
## [1] "The Project Gutenberg eBook of Moby-Dick; or The Whale, by Herman Melville"
We used the strsplit function with a regular
expression, and then cleaned the result with
sapply (removing empty strings). We then report the number of
words in the text.
book_split_pattern <- "(\\s)|(\\r)|(\\n)|(\\,)|(\\.){1,}"
words_vec <- strsplit(text ,split = book_split_pattern ) #splitting the text
words_vec <- sapply(words_vec, function(z){ z[!is.na(z) & z != ""]}) #cleaning result
word.count <- length(words_vec) #Counting words
There are 217099 non-unique words in the text.
We now plot a histogram of the distribution of word lengths. We used
the str_length function to get the vector of lengths,
calculated key descriptive statistics, and then plotted the
distribution.
lengths_vec <- str_length(words_vec) #lengths of all the words
#Plotting distribution
lengths.df <- data.frame(lengths_vec)
lengths.df %>%
ggplot(aes(lengths_vec) )+
geom_bar(aes(y=..prop.., group = 1), color = "grey",show.legend = TRUE) +
labs(x="Word Length",
y="Probability") +
ggtitle("Distribution of Length of Words")
#key descriptive statistics:
mean_lengths <- mean(lengths_vec)
max_lengths <- max(lengths_vec)
median_lengths <- median(lengths_vec)
mode <- function(x) { #Calculating mode with a little helper function
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]}
mode_lengths <- mode(lengths_vec) #applying Mode function
We see that the distribution has a right tail and its center is around 3-4 letters per word. The median is 4, the mean is 4.5597723, the longest word has 32 characters, and the most common word length is 3.
We calculated the word frequencies in the text using the
table function.
freq_tb = table(words_vec)
freq_df = as.data.frame(freq_tb)
freq_df %>% arrange(desc(Freq)) %>% head(10)
The top 10 words are not surprising, since they are all among the most common words in the English language.
We split the book into chapters with the strsplit function
and a regular expression. We then decided to exclude everything that is
not part of the literary work itself, so we used additional regular
expressions to keep only the Etymology, Chapters 1 to 135, and the
Epilogue.
pattern.chapters = "(\\r\\n\\n\\r\\n\\n\\r\\s\\r\\s\\r\\n\\n\\r\\n\\r\\n\\n\\n\\n\\n\\r\\n\\r\\s){1,}"
chapters <- strsplit(text, split = pattern.chapters)
chapters <- unlist(chapters)
pattern.etymology = "(\\r\\n\\n\\r\\n\\n\\r\\n\\r\\r\\sH2 anchor\\r\\n\\r\\n\\n\\n\\n\\n\\r\\n\\r\\s)"
chapters[1] = unlist(strsplit(chapters[1] ,split = pattern.etymology))[2]
pattern.epilogue = "(\\r\\n\\n\\r\\s\\r\\n)"
chapters[137] = unlist(strsplit(chapters[137] ,split = pattern.epilogue))[1]
We calculated the length of each chapter in two main steps: we first separated each chapter into words using the method from Question 1, then we took the length of the resulting vector. We then plotted the word count per chapter as follows. Note that the Etymology and the Epilogue are presented in the graph as values 0 and 136, respectively.
pattern.words = "(\\s)|(\\r)|(\\n)|(\\,)|(\\.)" #for splitting the words
chaps_count <- sapply(chapters,function(c){ #splitting each chapter to words
length( sapply(strsplit(c ,split = pattern.words),function(w){ w[!is.na(w) & w != ""]}))})
names(chaps_count) = c(0:136)
names(chaps_count)[1] = "etymology" # including the etymology and extracts
names(chaps_count)[137] = "epilouge"
chaps_data = as.data.frame(chaps_count)
chaps_data$names_chaps = rownames(chaps_data)
rownames(chaps_data) = c(1:137)
chaps_data %>% ggplot(aes(x=names_chaps, y=chaps_count)) + geom_point() +
geom_segment( aes(x=names_chaps, xend = names_chaps, y=0, yend=chaps_count))+
labs(x="Chapter Number",y= "Words Count Per Chapter",title = "Words Count Per Chapter") + scale_x_discrete(limits = c("etymology", seq(1,135), "epilouge"), breaks = c("etymology", seq(10,135,10), "epilouge")) + theme(axis.text.x = element_text(angle = 90))
Chapter lengths are far from uniform, and there is no clear trend to the eye.
We wrote a function that receives as input a word, and an array of
strings representing the chapters. We used str_count for
each chapter to count the frequency of the word and then calculated the
relative frequencies.
names(chapters) = c("etymology", 1:135, "epilouge")
relative_freq <- function (word, strings_arr){ #Defining the function
sapply(strings_arr, function(c){ #iterate over the chapters passed in
word_freq <- str_count(c, pattern = as.character(word)) #counting the frequency of the word
tmp_words_ls <- strsplit(c, split = pattern.words) #splitting the chapter into a word vector
chapter_length <- length(sapply(tmp_words_ls, function(w){w[!is.na(w) & w != ""]})) #calculating chapter length
return(word_freq / chapter_length) #returning the relative frequency
})}
We display the three plots in a grid layout using ggplot2 and grid.arrange as follows:
ahab = as.data.frame(relative_freq("Ahab", chapters))
names(ahab)[1] = "freq_Ahab"
p1 = ahab %>% ggplot(aes(x=row.names(ahab), y=freq_Ahab)) +
geom_point() +
geom_segment( aes(x=row.names(ahab),xend=row.names(ahab), y=0, yend=freq_Ahab))+
labs(x="Chapter Number",y= "Relative Frequency",title = "Ahab") + scale_x_discrete(limits = c("etymology", seq(1,135), "epilouge"), breaks = c("etymology", seq(10,135,10), "epilouge")) + theme(axis.text.x = element_text(angle = 45))
moby = as.data.frame(relative_freq("Moby", chapters))
names(moby)[1] = "freq_Moby"
p2 = moby %>% ggplot(aes(x=row.names(moby), y=freq_Moby)) +
geom_point() +
geom_segment( aes(x=row.names(moby),xend=row.names(moby), y=0, yend=freq_Moby))+
labs(x="Chapter Number",y= "Relative Frequency",title = "Moby") + scale_x_discrete(limits = c("etymology", seq(1,135), "epilouge"), breaks = c("etymology", seq(10,135,10), "epilouge")) + theme(axis.text.x = element_text(angle = 45))
sea = as.data.frame(relative_freq("sea", chapters))
names(sea)[1] = "freq_sea"
p3 = sea %>% ggplot(aes(x=row.names(sea), y=freq_sea)) +
geom_point() +
geom_segment( aes(x=row.names(sea),xend=row.names(sea), y=0, yend=freq_sea))+
labs(x="Chapter Number",y= "Relative Frequency",title = "sea") + scale_x_discrete(limits = c("etymology", seq(1,135), "epilouge"), breaks = c("etymology", seq(10,135,10), "epilouge")) + theme(axis.text.x = element_text(angle = 45))
grid.arrange(p1, p2, p3, heights = c(7,7,7),
top = textGrob("Relative Frequencies of Specific Words", gp=gpar(fontface = "bold", fontsize=20)))
Clearly different behavior can be seen for the different words. The word “Ahab” is especially common in chapters 25-50 and from chapter 100 onwards. The word “Moby” is quite rare and appears in only a few chapters. The word “sea”, on the other hand, is quite frequent throughout the whole book.
Suppose that Alice and Bob each choose, independently and uniformly at random, a single word from the book. What is the probability that they will pick the same word? We will answer this in several ways.
Suppose \(\{\omega_1, \dots , \omega_n\}\) are the distinct words in the book. We want to calculate the probability that Alice and Bob choose the same word. We denote Alice’s choice by \(X_A\) and Bob’s choice by \(X_B\), that is:
\[P(X_A=X_B)= P\Big(\bigcup_{i=1}^n \{X_A=\omega_i,\, X_B=\omega_i\}\Big)\] The events “Alice and Bob both choose word \(i\)” and “Alice and Bob both choose word \(j\)” (for \(i \neq j\)) are disjoint, since Alice cannot choose two different words at once. Recall the union formula \(P(A\cup B)=P(A)+P(B)-P(A\cap B)\); for disjoint events the intersection has probability 0, so the probability of the union equals the sum of the probabilities. Hence: \[P(X_A=X_B)=\sum_{i=1}^nP(X_A=\omega_i,\, X_B=\omega_i)\]
\[\Longrightarrow P(X_A=X_B)=\sum_{i=1}^nP(X_A=\omega_i)P(X_B=\omega_i),\] where the last step uses the independence of Alice’s and Bob’s choices. Let us illustrate this formula with an example:
Assume the vector
('To', 'be', 'or', 'not', 'to', 'be', 'this', 'is', 'the', 'question').
The relative frequency of every word apart from be is 0.1, and the
frequency of be is 0.2. The probability that Alice and Bob
both choose the word To is \(0.1\cdot0.1=0.01\), for the word
be it is \(0.2\cdot0.2=0.04\), and so on. To find the probability that Alice and Bob choose the
same word, we apply the equation above: \[P(X_A=X_B) =
\frac{1}{100}+\frac{4}{100}+\frac{1}{100}+\frac{1}{100}+\frac{1}{100}+\frac{1}{100}+\frac{1}{100}+\frac{1}{100}+\frac{1}{100}=\frac{12}{100}\]
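This toy calculation can be checked directly in R (a minimal sketch; toy_words is just an illustrative name, not an object used elsewhere):
toy_words <- c("To", "be", "or", "not", "to", "be", "this", "is", "the", "question")
toy_probs <- table(toy_words) / length(toy_words) #relative frequency of each distinct word
sum(toy_probs^2) #probability of a match; equals 12/100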
Now we calculate the probability that both Alice and Bob will choose the
same word in the book. We used the frequencies of words
in the book as probabilities and applied the formula as follows:
probs_to_choose_a_word = table(words_vec) / length(words_vec) #creating the probabilities vector
expected_prob_for_both = sum(probs_to_choose_a_word*probs_to_choose_a_word)
The probability that Alice and Bob choose the same word is 0.0085396.
We simulated this using the sample function with the
word frequencies from the book as probabilities. We sampled their choices \(B=100{,}000\) times and
compared them, using the mean function to get the fraction
of tries that resulted in the same choice.
probs = table(words_vec) / length(words_vec) #creating probabilities vector
B <- 100000 #number of simulations
alice = sample(names(probs), B, prob = probs, replace = TRUE)
bob = sample(names(probs), B, prob = probs, replace = TRUE)
same_choices_probability <- mean(alice == bob) #fraction of simulations with the same choice
The relative frequency of Alice and Bob choosing the same word is 0.00828, which is very close to the probability we calculated, 0.0085396. Not surprisingly, the chance of a matching choice is small.
We will now explain why, if Alice and Bob each choose a single word uniformly from the list of unique words, the probability that they choose the same word is lower (or equal, in the case where the original list already consists of unique words).
Let there be \(n\) distinct words, and let \(p_i\) be the probability of word \(i\) when sampling uniformly from the full (non-unique) list, i.e. \(p_i = \frac{b_i}{N}\), where \(b_i \geq 1\) is the number of occurrences of word \(i\) and \(N=\sum_{i=1}^n b_i\) is the total word count. The match probability is \(\sum_{i=1}^n p_i^2\). Since \(\sum_{i=1}^n p_i = 1\), convexity (or the Cauchy-Schwarz inequality) gives \(\sum_{i=1}^n p_i^2 \geq \frac{1}{n}\), with equality exactly when every \(p_i = \frac{1}{n}\), i.e. when every word appears the same number of times. Sampling from the unique list gives a match probability of exactly \(\sum_{i=1}^n \left(\frac{1}{n}\right)^2 = \frac{1}{n}\), so it is always lower than (or equal to) the match probability with the original list.
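As a quick numeric sanity check of this inequality on the book itself (a small sketch reusing the probs_to_choose_a_word table from above):
n_unique <- length(unique(words_vec)) #number of distinct words in the book
sum(probs_to_choose_a_word^2) #match probability when sampling from the full list
1 / n_unique #match probability when sampling from the unique list (always <= the value above)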
We repeat the example from Question 3a, but this time with the list of
unique words: (To, be, or, not, to, this, is, the, question).
Each of the \(n=9\) unique words now has probability \(\frac{1}{9}\), so the probability that Alice and Bob choose the same word is:
\[P(X_A=X_B) = \sum_{i=1}^{9}\frac{1}{9}\cdot\frac{1}{9}=9\cdot\frac{1}{81}=\frac{1}{9}\approx 0.111<\frac{12}{100}\] We indeed get a smaller probability than the result from Question 3a.
We verified the same effect with a similar simulation, this time sampling uniformly from the book's unique words.
un_words = unique(words_vec) # This time only unique
B <- 100000
alice = sample(un_words, B, replace = TRUE)
bob = sample(un_words, B, replace = TRUE)
unique_mean = mean(alice == bob) #fraction of matching choices
We get an empirical probability of 0.00004, which is lower than the previous value of 0.00828, as expected.
We filtered the text to keep only words that consist of English letters
and are exactly 5 characters long. First we replaced every
non-letter character in the text string with a space " ". Then we split the resulting string into
a vector via strsplit by spaces, filtered for the
5-letter words, and converted them to lower case.
replaced = str_replace_all(text, "[^a-zA-Z]", " ") #replace every non-letter character with a space
words_chars_only <- strsplit(replaced ,split = "(\\s)")
words_chars_only <- sapply(words_chars_only, function(z){ z[!is.na(z) & z != ""]})
fivers <- str_to_lower(words_chars_only[str_length(words_chars_only) == 5])
book.fivers.count <- length(fivers)
book.fivers.count.uniqe <- length(unique(fivers))
After cleaning and filtering the text, we are left with 26964 five-letter words, 2013 of them unique.
We simply used the table function and arrange(desc())
to present the top 10.
freq_fivers = table(fivers)
as.data.frame(freq_fivers) %>% arrange(desc(Freq)) %>% head(10)
The results make sense; the book is indeed about whales.
We created a helper function called get_char_locs_freqs
that returns a matrix of letter frequencies, with a row for each
letter from a to z and a column for each location from 1 to 5.
#Helper function: receives a character matrix (one row per 5-letter word, one column per location) and returns a matrix of letter frequencies for each of the five locations
get_char_locs_freqs <- function(letter_frequencies_mat){
freqs_mat = matrix(nrow = 26, ncol = 5) # create a matrix 26-by-5
freqs_mat[is.na(freqs_mat)] <- 0 # define NA = 0
colnames(freqs_mat) <- c(1:5)
rownames(freqs_mat) <- letters[seq(from = 1, to = 26 )] # define row names as a-z
for (i in 1:5){
for (j in rownames(freqs_mat)){
freqs_mat[j,i] = sum(str_count(letter_frequencies_mat[,i], j)) # count the number of observation of specific letter
}
}
return(freqs_mat)
}
We used str_split_fixed to build a matrix of
characters with one row per word and one column per location. Then we applied our helper function
get_char_locs_freqs. The results are shown in a heat map as
follows:
book.fivers.matrix <- str_split_fixed(fivers, "", n=5)
book.fivers.chars.freqs = get_char_locs_freqs(book.fivers.matrix)
heatmap(book.fivers.chars.freqs, Colv = NA, Rowv = NA, scale = "column", xlab = "location", ylab = "letter", main = "Letters Heatmap")
common_letters <- c()
for (i in 1:5) {
max_ind = which.max(book.fivers.chars.freqs[,i])
common_letters[i] = names(max_ind)
}
The most common letters for locations 1 through 5, respectively, are: w, h, a, e, e. We see that each location has a different distribution of letter frequencies.
The function get.words.likelihoods builds a vector of
likelihoods from a matrix of character probabilities and an array of words. It
starts with a vector Like_vec of ones, one entry per word; then,
for each location from 1 to 5, it looks up for each
word the probability of the character at that location and
multiplies it into Like_vec, which is exactly how the
likelihood of each word is computed. The output is the
Like_vec that holds the likelihood of every word, in the same
order.
get.words.likelihoods <- function(char_probs, words_arr){
Like_vec = rep(1, length(words_arr)) #result vector, all values are 1
names(Like_vec) = words_arr
for (i in 1:5){
char_vec <- str_sub(words_arr, i,i) #extracts all chars in i index
probs_col = sapply(char_vec, FUN = function(c){char_probs[c,i]}) #drawing probability of character from every word in location i
Like_vec = Like_vec * probs_col #calculates product
}
return(Like_vec)
}
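As a quick sanity check (a minimal sketch; the word "whale" and the local probs_mat object here are only illustrative), the likelihood of a single word should equal the product of its per-location letter probabilities:
probs_mat <- book.fivers.chars.freqs / sum(book.fivers.chars.freqs[, 1]) #convert counts to probabilities per location
get.words.likelihoods(probs_mat, "whale") #likelihood of "whale" via the function
prod(sapply(1:5, function(i){ probs_mat[substr("whale", i, i), i] })) #same value, computed directly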
Now we run the function to compute the likelihood of all unique five-letter words from the book. The top 10 words with the highest likelihood are presented.
Note that we decided to use the above function inside a helper function for the next questions.
get.words.likelihoods.df.sorted <- function(chars_locations.freqs.mat, words_arr){
probs_mat <- chars_locations.freqs.mat/sum(chars_locations.freqs.mat[,1])
likelihoods <- get.words.likelihoods(probs_mat, unique(words_arr))
likelihoods <- as.data.frame(likelihoods)
likelihoods <- cbind(words = rownames(likelihoods), likelihoods)
rownames(likelihoods) <- 1:nrow(likelihoods)
return(likelihoods %>% arrange(desc(likelihoods)))
}
book.likelihoods = get.words.likelihoods.df.sorted(book.fivers.chars.freqs, fivers)
book.likelihoods %>% head(10)
We use the following wordle_match helper function:
# Helper function:
wordle_match <- function(guess, word) # 1: correct location, -1: wrong location, 0: missing
{ L <- nchar(guess)
match <- rep(0, L)
for(i in 1:L)
{
if(grepl(substr(guess, i, i), word, fixed=TRUE))
{match[i] = -1}
if(substr(guess, i, i) == substr(word, i, i))
{ match[i] = 1}
}
return(match)
}
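As a quick illustration (treating "mouse" as a hypothetical unknown word), guessing "south" yields the match array that we will reuse below:
wordle_match("south", "mouse") #returns -1 1 1 0 0: 's' appears elsewhere, 'o' and 'u' are in place, 't' and 'h' are absent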
In what follows, book.fivers.chars.freqs holds the letter
frequencies from the book, while chars_locations.freqs.mat holds the letter
frequencies from the new list of words.
We loaded a new list of words from the GitHub repository rather than from the original link.
url_words_2 = "https://raw.githubusercontent.com/charlesreid1/five-letter-words/master/sgb-words.txt"
html_words <- read_html(url_words_2, encoding = "UTF-8")
words <- html_text2(html_words)
We then assigned a few useful objects: a helper
function split_txt_to_words creates a vector of the
new words, called wordle.words; a matrix of the words split into
characters by location is called wordle.words.chars_mat; and in
addition, we calculated a matrix of the character frequencies called
chars_locations.freqs.mat.
# Helper function that receives a "clean" text of only words and outputs a vector of the words
split_txt_to_words <- function(text){
text_split <- strsplit(text ,split = "(\\s)")
text_split <- sapply(text_split, function(z){ z[!is.na(z) & z != ""]})
return(text_split)
}
#Assigning useful variables
wordle.words = split_txt_to_words(words) #list of new words
wordle.words.chars_mat = str_split_fixed(wordle.words, "", n=5) # matrix of letters from the list
chars_locations.freqs.mat = get_char_locs_freqs(wordle.words.chars_mat) #matrix of frequncies of the letters
Afterwards we show the distribution of letters in locations with a heat map, just like we did in Question 4 b.
len_new_words = length(wordle.words) #number of words in the Stanford list
heatmap(chars_locations.freqs.mat, Colv = NA, Rowv = NA, scale = "column", xlab = "location", ylab = "letter", main = "Letters Heatmap of new dictionary")
We learn from the heat map that some letters are frequent in locations 2-4 while others are frequent in locations 1 and 5. Compared to the previous heat map, the new one shows more visual variance, which may suggest some dependency between letters and their locations in the book's words. Some of the differences can be explained by the fact that the book tells a story set in a particular environment, the sea, which biases the letter-location frequencies; for example, the letter w appears more often in position 1 in the book because the word whale is mentioned so frequently.
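To illustrate this point, a small sketch comparing the two frequency matrices computed above; if the claim holds, the first value should be clearly larger:
book.fivers.chars.freqs["w", 1] / sum(book.fivers.chars.freqs[, 1]) #share of the book's 5-letter words starting with w
chars_locations.freqs.mat["w", 1] / sum(chars_locations.freqs.mat[, 1]) #share of the Stanford list words starting with w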
The following function receives:
1. An array of guess words.
2. A matrix of their corresponding match results against the unknown word.
3. A dictionary, i.e. the list of legal English words.
The function outputs all the words from the dictionary that are consistent with the results of the previous guesses.
We used a for loop over the guesses, nested with a for loop over the characters and their corresponding results (0, 1 or -1); the function filters the dictionary accordingly.
filter_words_by_match <- function(guesses, matches, dictionary){
for (w in 1:length(guesses)){ #iterates the guess's
for (c in 1:str_length(guesses[w])){ #iterates the characters of the word
if (matches[w,c] == -1){ #if -1: keep only words that contain the letter, but not at location c
dictionary = dictionary[substr(dictionary, start = c, stop = c) != substr(guesses[w], start = c, stop = c)]
dictionary = dictionary[(dictionary %in% str_subset(dictionary, substr(guesses[w], start = c, stop = c)))]
}
if (matches[w,c] == 0){ #if 0: remove all words that contain the letter
dictionary = dictionary[!(dictionary %in% str_subset(dictionary, substr(guesses[w], start = c, stop = c)))]
}
if (matches[w,c] == 1){ #if 1: keep only words with the letter at location c
dictionary = dictionary[substr(dictionary, start = c, stop = c) == substr(guesses[w], start = c, stop = c)]
}
}
}
return(dictionary)
}
Now we run the function with the guesses c(“south”, “north”) and their corresponding matches.
guesses = c("south", "north")
matches = rbind(c(-1, 1, 1, 0, 0),c(0, 1, 0, 0, 0))
filter_words_by_match(guesses, matches, split_txt_to_words(words))
## [1] "mouse" "lousy" "louse" "fouls" "douse" "louis" "coups" "mousy" "poufs"
## [10] "youse"
We managed to narrow the dictionary down to 10 candidate words for the next guess.
First we created wordle.words.likelihoods, a new
dictionary (data frame) of words and their likelihoods, for use in
Strategy 2:
wordle.words.likelihoods = get.words.likelihoods.df.sorted(chars_locations.freqs.mat, wordle.words) # creating a data frame with the function from Q4c arranged from highest to lowest likelihood
Afterwards we created a function that receives as input an unknown
word, a dictionary of words and likelihoods, and a Boolean flag
print_guesses, defaulting to FALSE, that controls whether
we print the results. The function returns the number of
guesses made by Strategy 2 and, if requested, also prints the guesses
themselves.
In the function we use the likelihoods dictionary to choose, in every
iteration, the most likely word; we then use wordle_match
to retrieve the array of match results. We filter the dictionary of
likelihoods according to the match results and
repeat the process in a loop until the right word is found.
strategy2 <- function(unknown,dictionary ,print_guesses = FALSE){
dict = dictionary
guess_word = ""
counter = 0 # this will count the tries
while(guess_word != unknown){
counter = counter + 1 # count grow by 1 each iteration
guess_word = dict$words[1]
result = rbind(wordle_match(guess_word, unknown)) #matching array
if(print_guesses == TRUE) { # optional printing
cat("Guess number: ",counter,"\n")
print(rbind(str_split_fixed(guess_word, "", n=5), result))
cat("-------------------------------","\n")
}
d = as.data.frame(filter_words_by_match(guess_word, result, dict$words[-1])) # apply filter_words_by_match function from Q5 on the new dict without the first guess and create new data frame
colnames(d) = "words"
dict = inner_join(x = d ,y = dict, by = "words") # inner_join the 2 dictionaries by the new d
dict = dict %>% arrange(desc(dict$likelihoods))
}#end of While loop
return(counter)
}
Preparing wordle.words.df, which we will use for
Strategy 3:
wordle.words.df = as.data.frame(wordle.words)
colnames(wordle.words.df) = "words"
We created a strategy3 function that receives as input
an unknown word, a data frame of words we call
dictionary, and a Boolean flag that controls whether
we print the results.
We run a while loop as in Question 6, but this time we sample a whole word at random rather than individual characters.
The function returns the number of guesses and, if requested, also prints the guesses themselves.
strategy3 <- function(unknown, dictionary, print_guesses = FALSE){
dict = dictionary #words and their likelihoods
guess_word = ""
counter = 0 # this will count the tries
vec_of_guess = c() # creating new empty vector that contains the guess words
while (guess_word != unknown){ # until guess == unknown
counter = counter + 1
guess_word = dictionary[sample(nrow(dictionary), 1, replace = TRUE), ] # sample a word from the given dictionary
vec_of_guess = append(vec_of_guess, guess_word)
result = rbind(wordle_match(guess_word, unknown)) # apply given helper function
if(print_guesses == TRUE) { # optional printing
cat("Guess number: ",counter,"\n")
print(rbind(str_split_fixed(guess_word, "", n=5), result))
cat("-------------------------------","\n")
}
d = as.data.frame(filter_words_by_match(guess_word, result, dictionary$words[!(dictionary$words %in% vec_of_guess)])) # apply function from Q5 on the guess, vector matches and dictionary without the word and create a data frame
colnames(d) = "words"
dictionary = inner_join(x = d ,y = dictionary, by = "words") # inner_join the two data frame
}
return(counter) # return the number of guesses
}
We first demonstrate each strategy once on the word "mouse", printing the guesses, and then run a simulation similar to Section 6c, this time with B = 100, presenting the average number of guesses for each strategy.
paste("Total tries count in Stategy 2: ",strategy2("mouse", wordle.words.likelihoods, print_guesses = TRUE))
## Guess number: 1
## [,1] [,2] [,3] [,4] [,5]
## [1,] "s" "o" "r" "e" "s"
## [2,] "-1" "1" "0" "-1" "-1"
## -------------------------------
## Guess number: 2
## [,1] [,2] [,3] [,4] [,5]
## [1,] "p" "o" "i" "s" "e"
## [2,] "0" "1" "0" "1" "1"
## -------------------------------
## Guess number: 3
## [,1] [,2] [,3] [,4] [,5]
## [1,] "m" "o" "o" "s" "e"
## [2,] "1" "1" "-1" "1" "1"
## -------------------------------
## Guess number: 4
## [,1] [,2] [,3] [,4] [,5]
## [1,] "m" "o" "u" "s" "e"
## [2,] "1" "1" "1" "1" "1"
## -------------------------------
## [1] "Total tries count in Stategy 2: 4"
paste("Total tries count in Stategy 3: ",strategy3("mouse", wordle.words.df, print_guesses = TRUE))
## Guess number: 1
## [,1] [,2] [,3] [,4] [,5]
## [1,] "s" "h" "i" "r" "e"
## [2,] "-1" "0" "0" "0" "1"
## -------------------------------
## Guess number: 2
## [,1] [,2] [,3] [,4] [,5]
## [1,] "l" "o" "u" "s" "e"
## [2,] "0" "1" "1" "1" "1"
## -------------------------------
## Guess number: 3
## [,1] [,2] [,3] [,4] [,5]
## [1,] "d" "o" "u" "s" "e"
## [2,] "0" "1" "1" "1" "1"
## -------------------------------
## Guess number: 4
## [,1] [,2] [,3] [,4] [,5]
## [1,] "y" "o" "u" "s" "e"
## [2,] "0" "1" "1" "1" "1"
## -------------------------------
## Guess number: 5
## [,1] [,2] [,3] [,4] [,5]
## [1,] "m" "o" "u" "s" "e"
## [2,] "1" "1" "1" "1" "1"
## -------------------------------
## [1] "Total tries count in Stategy 3: 5"
As we can see, Strategy 2 needed 4 guesses here; Strategy 3's count is
not stable from run to run because its guesses are random.
B = 100 # number of simulations
unknown_vec = sample(wordle.words, B ,replace = FALSE)
res_strategy3 = sapply(unknown_vec, function(w){strategy3(w,wordle.words.df)})
res_strategy2 = sapply(unknown_vec, function(w){strategy2(w,wordle.words.likelihoods)})
df_strategies = as.data.frame(cbind(index = c(1:length(res_strategy3)),res_strategy2, res_strategy3))
We plotted the ECDF of the number of guesses for each strategy as follows:
df_reshaped <- data.frame(x = df_strategies$index ,strategy = c(df_strategies$res_strategy2, df_strategies$res_strategy3),group = c(rep("Strategy2", nrow(df_strategies)),
rep("Strategy3", nrow(df_strategies))))
ggplot(df_reshaped, aes(strategy, x, col = group)) + stat_ecdf() + ggtitle("Guesses of Strategies") + xlab("Number of Guesses") + ylab("Relative Frequency of Number of Guesses")
The curves of the two strategies are somewhat similar but clearly different. The ECDF of Strategy 2 rises sharply up to 5 attempts and then increases more moderately up to 10 attempts. The ECDF of Strategy 3 rises less sharply than Strategy 2 up to the 5th attempt and then more sharply, which implies a greater variance. Both ECDFs differ markedly from that of Strategy 1, whose x-axis range is much larger; that is, the variance of Strategy 1 is considerably higher, so these are different cumulative distribution functions.
Finally, we printed the empirical means:
print(colMeans(df_strategies[,c(2,3)]))
## res_strategy2 res_strategy3
## 4.64 4.63
We see that the means are close; the strategies differ mainly in their variance.
We wrote our own strategy and managed to lower the mean. We use the
seed word stone as the first guess in all cases; we found
that choosing the highest-likelihood guesses after obtaining the match array
for that word gives better results.
strategy_extra <- function(unknown, cha_matrix, words_arr, print_guesses = FALSE){
word_arr = words_arr #this list will change in the function
letter_matrix = cha_matrix #this matrix will change in the function
guess_word = "stone"
# guess_word = "salet"
counter = 1
result = rbind(wordle_match(guess_word, unknown))
if(print_guesses == TRUE) { # print option
cat("Guess number: ",counter,"\n")
print(rbind(str_split_fixed(guess_word, "", n=5), result))
cat("-------------------------------","\n")
}
dict = get.words.likelihoods.df.sorted(letter_matrix, word_arr)
d = as.data.frame(filter_words_by_match(guess_word, result, dict$words[dict$words!=guess_word]))
colnames(d) = "words"
dict = inner_join(x = d ,y = dict, by = "words")
word_arr = array(dict$words)
word_mat = str_split_fixed(word_arr, "", n=5)
letter_matrix = get_char_locs_freqs(word_mat)
while(guess_word != unknown){
counter = counter + 1
dict = get.words.likelihoods.df.sorted(letter_matrix, word_arr)
if (sum(result)<4){
guess_word = dict$words[1] # choose the most likely
}
else{ # if sum = 4 we need to guess one last word
place = which(result==0) # find the index where we have 0 in wordle
ind = (which(as.matrix(letter_matrix[,place]>0))) # find the indexes that the value in the matrix is more than zero
last_letters = rownames(letter_matrix)[ind] # names of the letters
mati = as.data.frame(cha_matrix)
mati = mati %>% filter(rownames(mati) %in% c(last_letters)) %>% select(all_of(place))
max_ind = which.max(mati[,1]) # find the letter with the most value
guess_letter = rownames(mati)[max_ind]
guess_split = str_split_fixed(guess_word, "", n=5)
guess_split[place] = guess_letter # change the letter in the 0 place
guess_word = str_c(guess_split, collapse = "")
}
result = rbind(wordle_match(guess_word, unknown))
if(print_guesses == TRUE) { # print option
cat("Guess number: ",counter,"\n")
print(rbind(str_split_fixed(guess_word, "", n=5), result))
cat("-------------------------------","\n")
}
d = as.data.frame(filter_words_by_match(guess_word, result, dict$words[dict$words!=guess_word]))
colnames(d) = "words"
dict = inner_join(x = d ,y = dict, by = "words")
word_arr = as.matrix(dict$words)
word_mat = str_split_fixed(word_arr, "", n=5)
letter_matrix = get_char_locs_freqs(word_mat)
} #end of while loop
return(counter)
}
Running it once:
paste("Total tries count in Stategy Extra: ",strategy_extra("mouse",chars_locations.freqs.mat, wordle.words, print_guesses = TRUE))
## Guess number: 1
## [,1] [,2] [,3] [,4] [,5]
## [1,] "s" "t" "o" "n" "e"
## [2,] "-1" "0" "-1" "0" "1"
## -------------------------------
## Guess number: 2
## [,1] [,2] [,3] [,4] [,5]
## [1,] "d" "o" "u" "s" "e"
## [2,] "0" "1" "1" "1" "1"
## -------------------------------
## Guess number: 3
## [,1] [,2] [,3] [,4] [,5]
## [1,] "m" "o" "u" "s" "e"
## [2,] "1" "1" "1" "1" "1"
## -------------------------------
## [1] "Total tries count in Stategy Extra: 3"
Running it 100 times and showing the mean:
strategy_extra.results = sapply(unknown_vec,function(w){
strategy_extra(w, chars_locations.freqs.mat, wordle.words)})
mean(strategy_extra.results)
## [1] 4.29
df_strategies$strategy_extra = strategy_extra.results
df_reshaped <- data.frame(x = df_strategies$index ,strategy = c(df_strategies$res_strategy2, df_strategies$res_strategy3, df_strategies$strategy_extra),group = c(rep("Strategy2", nrow(df_strategies)),
rep("Strategy3", nrow(df_strategies)),
rep("Strategy Extra", nrow(df_strategies))))
ggplot(df_reshaped, aes(strategy, x, col = group)) + stat_ecdf() + ggtitle("Guesses of Strategies") + xlab("Number of Guesses") + ylab("Relative Frequency of Number of Guesses")
The improvement is meaningful: many more runs succeed by the
third, fourth, or fifth guess.
THE END