Data Sets

# Load the candidate first-guess words (columns Input1 and Classif1) as a
# tibble, then print a preview.
uniqueInputs <- read.csv('uniqueAnswersDF.csv') %>%
  as_tibble()
uniqueInputs
## # A tibble: 8,322 × 2
##    Input1 Classif1
##    <chr>  <chr>   
##  1 abers  Guess   
##  2 abets  Guess   
##  3 abies  Guess   
##  4 abler  Guess   
##  5 ables  Guess   
##  6 ablet  Guess   
##  7 ablow  Guess   
##  8 abmho  Guess   
##  9 abohm  Guess   
## 10 aboil  Guess   
## # … with 8,312 more rows
# Load the valid solution words and give each one a blank five-character
# feedback pattern ('-----') that the scoring code below fills in.
solutions <- read.csv('valid_solutions.csv') %>%
  as_tibble() %>%
  mutate(Result = '-----')
solutions
## # A tibble: 2,315 × 2
##    word  Result
##    <chr> <chr> 
##  1 aback ----- 
##  2 abase ----- 
##  3 abate ----- 
##  4 abbey ----- 
##  5 abbot ----- 
##  6 abhor ----- 
##  7 abide ----- 
##  8 abled ----- 
##  9 abode ----- 
## 10 abort ----- 
## # … with 2,305 more rows

Demo

Using the test word faith, we’ll match it against every possible Wordle solution. This gives us every outcome (feedback pattern) that the word faith can produce. We’ll then tabulate the frequency of each outcome: the frequency tells us how many words remain in the set of possible Wordle solutions when that outcome occurs. With these frequencies, we can find the best starting word based on the expected reduction in the size of the solution set.

# Score a single guess (testingInput) against every word in `solutions`.
# The feedback pattern is written to solutions$Result, one character per
# position: 'G' = same letter in the same position, 'Y' = letter occurs
# somewhere in the solution but not at this position, '-' = no match.
testingInput <- 'faith'

# Start from a blank pattern so that re-running this chunk with a different
# testingInput does not inherit marks left behind by a previous run.
solutions$Result <- '-----'

# Each position is scored for all solution words at once; this avoids the
# per-word `solutions$word == input` lookup, which made the pass quadratic.
for (pos in 1:5) {
  guessLetter <- substr(testingInput, pos, pos)
  # Green: the solution has the same letter at this position.
  isGreen <- substr(solutions$word, pos, pos) == guessLetter
  # Yellow: not green, but the letter appears somewhere in the solution.
  # fixed = TRUE matches the letter literally rather than as a regex.
  # NOTE(review): every occurrence of a repeated guess letter is marked 'Y'
  # regardless of how many times the solution contains it; official Wordle
  # feedback appears to limit this by letter count - confirm whether the
  # simplification is intended.
  isYellow <- !isGreen & grepl(guessLetter, solutions$word, fixed = TRUE)
  substr(solutions$Result[isGreen], pos, pos) <- 'G'
  substr(solutions$Result[isYellow], pos, pos) <- 'Y'
}

# Tabulate how many solution words produce each feedback pattern.
#   Prob            - share of all solutions that yield the pattern
#   Removed         - negative count of solutions eliminated by the pattern
#   Percent_removed - Removed as a fraction of all solutions
# The total is taken from `solutions` instead of a hard-coded 2315 so the
# table stays correct if the solution list changes.
totalSolutions <- nrow(solutions)
testDf <- as.data.frame(table(solutions$Result)) %>%
  mutate(Prob = Freq / totalSolutions,
         Removed = -(totalSolutions - Freq),
         Percent_removed = Removed / totalSolutions)
testDf

# confidence interval
# Expand the pattern table to one value per solution word: a pattern shared
# by Freq words contributes Freq copies of its Percent_removed, so the mean
# and quantiles below are weighted by how many solutions produce each pattern.
# rep() with a vector `times` replaces growing the vector row by row.
possiblepercentremoved <- rep(testDf$Percent_removed, times = testDf$Freq)

hist(possiblepercentremoved)

# Convert the (negative) fraction removed back into a count of remaining
# solutions: total + fraction * total. The total comes from `solutions`
# rather than a hard-coded 2315.
totalSolutions <- nrow(solutions)
# Expected number of solutions remaining after the guess.
totalSolutions + mean(possiblepercentremoved * totalSolutions)
# 2.5th and 97.5th percentiles of the number of solutions remaining.
totalSolutions +
  (as.vector(quantile(possiblepercentremoved, c(0.025, 0.975))) * totalSolutions)

If we entered audio as the first word, we would expect, on average, to reduce the number of possible Wordle solutions to 183.7 words. 95% of the time, audio will reduce the number of possible Wordle solutions to between 4 and 435 words, inclusive.

If we entered faith as the first word, we would expect, on average, to reduce the number of possible Wordle solutions to 180.6 words. 95% of the time, faith will reduce the number of possible Wordle solutions to between 3 and 472 words, inclusive.


Best Word Reduction Analysis

# For every candidate first guess, score it against all solutions and
# summarise the number of solutions that would remain afterwards:
# mean, median, and the 2.5th / 97.5th percentiles.
inputWords <- uniqueInputs$Input1
solutionWords <- solutions$word
totalSolutions <- nrow(solutions)

# Preallocate the result vectors instead of growing them with c() on every
# iteration.
means <- numeric(length(inputWords))
lowerBounds <- numeric(length(inputWords))
upperBounds <- numeric(length(inputWords))
medians <- numeric(length(inputWords))

for (i in seq_along(inputWords)) {
  inputWord <- inputWords[i]

  # Always start from a blank pattern: `solutions$Result` may already hold
  # marks from the demo above, and positions with no match must stay '-'.
  resultPattern <- rep('-----', totalSolutions)
  # Score each position for all solution words at once (no per-word lookup).
  for (pos in 1:5) {
    guessLetter <- substr(inputWord, pos, pos)
    # Green
    isGreen <- substr(solutionWords, pos, pos) == guessLetter
    # Yellow: not green, but the letter appears somewhere in the solution.
    # NOTE(review): repeated guess letters are all marked 'Y' regardless of
    # how many times the solution contains the letter - confirm intended.
    isYellow <- !isGreen & grepl(guessLetter, solutionWords, fixed = TRUE)
    substr(resultPattern[isGreen], pos, pos) <- 'G'
    substr(resultPattern[isYellow], pos, pos) <- 'Y'
  }
  solutionsCopy <- solutions
  solutionsCopy$Result <- resultPattern

  # Frequency of each feedback pattern and the (negative) fraction of
  # solutions it removes.
  testDF <- as.data.frame(table(solutionsCopy$Result)) %>%
    mutate(percentRemoved = -(totalSolutions - Freq) / totalSolutions)
  # Mean & 95% CI
  # One value per solution word, so patterns are weighted by their frequency.
  possiblePercentRemoved <- rep(testDF$percentRemoved, times = testDF$Freq)
  # Compute both percentiles once and reuse them for the two bounds.
  bounds <- as.vector(quantile(possiblePercentRemoved, c(0.025, 0.975))) *
    totalSolutions

  means[i] <- totalSolutions + mean(possiblePercentRemoved * totalSolutions)
  medians[i] <- totalSolutions + median(possiblePercentRemoved * totalSolutions)
  lowerBounds[i] <- totalSolutions + bounds[1]
  upperBounds[i] <- totalSolutions + bounds[2]
}

# Attach the per-word summary statistics to the candidate list and rank the
# words by SumScore (median remaining solutions plus the width of the
# 2.5%-97.5% range); lower scores sort first.
uniqueInputs <- uniqueInputs %>%
  mutate(LowerBound = lowerBounds, UpperBound = upperBounds) %>%
  mutate(BoundRange = UpperBound - LowerBound) %>%
  mutate(Mean = means, Median = medians) %>%
  mutate(SumScore = Median + BoundRange) %>%
  arrange(SumScore)
uniqueInputs
## # A tibble: 8,322 × 8
##    Input1 Classif1 LowerBound UpperBound BoundRange  Mean Median SumScore
##    <chr>  <chr>         <dbl>      <dbl>      <dbl> <dbl>  <dbl>    <dbl>
##  1 raise  Solution          3        168        165  61.0     43      208
##  2 arise  Solution          4        168        164  63.7     51      215
##  3 soare  Guess             3        183        180  62.3     42      222
##  4 raile  Guess             3        173        170  61.3     54      224
##  5 reais  Guess             3        168        165  71.6     60      225
##  6 ariel  Guess             3        173        170  65.3     56      226
##  7 serai  Guess             3        168        165  72.9     62      227
##  8 aesir  Guess             4        168        164  69.9     64      228
##  9 aloes  Guess             3        174        171  77.4     58      229
## 10 arose  Solution          3        183        180  66.0     49      229
## # … with 8,312 more rows