# Make sure librarian is installed, then attach it.
# requireNamespace() only checks availability; library() errors loudly if the
# install did not succeed (require() would just return FALSE).
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
library(librarian)

# Load the packages used throughout this script via librarian::shelf().
librarian::shelf(
  tidyverse,
  rtweet,
  wordcloud,
  RColorBrewer,
  here,
  scales
)
Wordle data visualization
1 Objective
Visualize Wordle data in fun ways! If you don’t know, Wordle is a web-based word game created and developed by Welsh software engineer Josh Wardle, and owned and published by The New York Times Company since 2022.
2 Steps
Web scrape past Wordle words and word rankings (i.e., word commonness) using `rvest`.
Use Twitter’s API and `rtweet` to read in tweets from a bot (@WordleStats) that summarizes all Wordle score distributions posted on Twitter.
Visualize different aspects of the data using a word cloud, stacked bar plot, scatterplot, and lollipop plot.
3 About the data
Data used in this script are from a list of previous Wordle words from FresherLive, a measure of word commonness from datayze, and Wordle score distributions from the Twitter bot @WordleStats.
4 Scraping and formatting 🧹
4.1 Load libraries and read in data
4.2 Scrape and format previous Wordle words
This website includes a table with the previous Wordle words and associated date.
Scrape data from the website using `rvest`.
# Download the page that lists past Wordle answers.
scrape_words <- rvest::read_html("https://latestnews.fresherslive.com/articles/past-wordle-words-list-of-all-past-wordle-answers-can-i-play-past-wordles-354332")

# Pull the HTML table(s) out of the page, flatten them into one data frame,
# drop the first row, promote the next row to column names, and convert the
# result to a plain data.frame.
words_table <- rvest::html_table(scrape_words) %>%
  purrr::flatten_df() %>%
  dplyr::slice(-1) %>%
  janitor::row_to_names(1) %>%
  as.data.frame()

#view data
head(words_table)
Date Answers
1 July 30, 2022 BLUFF
2 July 29, 2022 UPSET
3 July 28, 2022 STOMP
4 July 27, 2022 MOTTO
5 July 26, 2022 CINCH
6 July 25, 2022 ELOPE
This website includes the date of each Wordle, but it presents dates in multiple formats. From January 1 through March 1, dates are reported as day-month-year. From March 2 - present, dates are reported as month-day-year. We need to convert the date column into the correct date format.
# Find the row number of the last row (in table order) whose date is in
# month-day-year format; this corresponds to word = NASTY.
row <- row.names(words_table[which(words_table$Answers == "NASTY"), ]) %>%
  as.numeric()

# The split below only makes sense if exactly one pivot row was found.
stopifnot(length(row) == 1L)

# Split into two data frames, parse each with the matching date format
# (day-month-year vs month-day-year), then row-bind them back together.
# Rows after the pivot row use day-month-year.
df1 <- words_table %>%
  filter(row.names(words_table) %in% (row + 1):nrow(words_table)) %>%
  mutate(Date = lubridate::dmy(Date))

# Rows up to and including the pivot row use month-day-year.
words_table_2 <- words_table %>%
  filter(row.names(words_table) %in% 1:row) %>%
  mutate(Date = lubridate::mdy(Date)) %>%
  rbind(df1) %>%
  rename(word = Answers, date = Date)
Convert words from uppercase to lowercase using `tolower`.
# Lowercase the answers in place.
words_table_2$word <- tolower(words_table_2$word)

#view data
head(words_table_2)
date word
1 2022-07-30 bluff
2 2022-07-29 upset
3 2022-07-28 stomp
4 2022-07-27 motto
5 2022-07-26 cinch
6 2022-07-25 elope
4.3 Scrape and format word rank (i.e., commonness)
This website includes a table with various metrics associated with words, including word rank.
What is word rank?
From the website: “Determining Word Rank: The word rank metric is a measure of word frequency, with frequent words corresponding to higher ranks. In order to get an accurate frequency count of each word, we utilize a stemmer to identify the morphological root form of a word. This allows us to group slight variations of the same word. For example, ‘cats’ and ‘cat’ both have the same stem, as do ‘readability’ and ‘readable.’ For most words, familiarity with said word is independent of count (e.g. the singular form vs the plural form) or part of speech (e.g. adjective form vs the noun form). In some rare cases, however, a common word may have multiple meanings including a meaning so infrequent it is not well known. We then calculate word frequency using the data from Project Gutenberg which is a large collection of freely available English documents and summing the counts for all variations of the word corresponding to the same stem.”
A higher rank means a number closer to 1 (first rank), not a larger number (e.g., rank 1 is higher than rank 100, and is therefore associated with a more common word).
Create vector of previous Wordle words to run through the word rank website.
# Character vector of past Wordle answers to look up on the word rank website.
word_list <- words_table_2$word
Run through a for loop that takes each word, looks up that word on the datayze website, reads into R the table with word rank, formats the data, and binds all words/ranks into one df.
#create empty dataframe to store output
rank_table <- data.frame()

#for loop: look up each word, keep its "Word Rank:" row, and stack the results
for (word in word_list) {
  # Build the lookup URL for this word and download the page.
  url <- sprintf('https://datayze.com/word-analyzer?word=%s', word)
  url_scrape <- rvest::read_html(url)

  # Flatten the page's tables, keep only the word rank row, tag it with the
  # word, and drop the label column so only `rank` and `word` remain.
  new_rank_table <- rvest::html_table(url_scrape) %>%
    purrr::flatten_df() %>%
    dplyr::filter(X1 == "Word Rank:") %>%
    mutate(word = word) %>%
    rename(rank = X2) %>%
    select(-1) %>%
    as.data.frame()

  # Strip the ordinal suffixes from the rank text.
  # vectorize_all = FALSE applies every pattern in turn to each string; the
  # argument is spelled out in full rather than relying on partial matching.
  new_rank_table$rank <- stringi::stri_replace_all_regex(
    new_rank_table$rank,
    pattern = c("st", "rd", "th", "nd"),
    replacement = c("", "", "", ""),
    vectorize_all = FALSE
  )

  # New rows go on top, so the final table is in reverse order of word_list.
  rank_table <- rbind(new_rank_table, rank_table)
}

#view data
head(rank_table)
rank word
1 12266 rebus
2 15248 boost
3 12756 truss
4 3139 siege
5 4740 tiger
6 13777 banal
Remove missing ranks, which appear as “-”. They occur because not all words have a word rank on the website.
# Drop the rows whose rank is the "-" placeholder.
rank_table_no_NA <- rank_table %>%
  filter(!(rank == "-"))
Left join the word ranks dataframe that we just created with the dataframe of past Wordle words and dates we previously created. A left join will only keep the rows for which we have word rank.
# Attach each ranked word's date; the rank table is the left-hand side, so
# only words that have a rank are kept.
words_dates_ranks <- left_join(rank_table_no_NA, words_table_2, by = "word")

#view data
head(words_dates_ranks)
rank word date
1 12266 rebus 2022-01-01
2 15248 boost 2022-01-02
3 12756 truss 2022-01-03
4 3139 siege 2022-01-04
5 4740 tiger 2022-01-05
6 13777 banal 2022-01-06
4.4 Use Twitter 🐦 API to read in Wordle scores
There is a bot on Twitter that posts daily Wordle score distributions @WordleStats. We can interact with the Twitter API to download the timeline (all tweets) from that bot.
Getting authorized to use the Twitter API is typically fairly simple. Here is a great tutorial about setting up a Twitter development account and connecting to the Twitter API in RStudio.
I had a difficult time getting R to interact with the Twitter API and ended up having to apply for elevated access in my Twitter development account before it worked.
Set up authorization to work with the Twitter API and save credentials so that they can be read in each time you re-open the script. This is all outlined in this (tutorial).
# One-time setup (run once, then leave commented out):
# enter the bearer token from your Twitter app's developer page when prompted,
# then save the credentials under the name "wordle-auth".
# auth <- rtweet_app() #only do this once
# auth_save(auth, "wordle-auth") #only do this once
# Every session: load the saved "wordle-auth" credentials.
auth_as("wordle-auth")
Read in all tweets from @Wordlestats.
# Download the full timeline of the @WordleStats bot (n = Inf: no cap on the
# number of tweets requested).
tweets <- get_timeline("WordleStats", n = Inf)

#view data
head(tweets$text)
[1] "#Wordle 409 2022-08-02\n34,909 results found on Twitter.\n3,380 hard mode players.\n\n1: 0%\n2: 0%\n3: 🟩 4%\n4: 🟩🟩🟩🟩 17%\n5: 🟩🟩🟩🟩🟩🟩🟩 28%\n6: 🟩🟩🟩🟩🟩🟩🟩🟩 35%\nX: 🟩🟩🟩🟩 15%\n\n#Wordle409"
[2] "#Wordle 408 2022-08-01\n36,662 results found on Twitter.\n3,303 hard mode players.\n\n1: 0%\n2: 🟩 5%\n3: 🟩🟩🟩🟩🟩 20%\n4: 🟩🟩🟩🟩🟩🟩🟩🟩 33%\n5: 🟩🟩🟩🟩🟩🟩 27%\n6: 🟩🟩🟩 13%\nX: 🟩 2%\n\n#Wordle408"
[3] "#Wordle 407 2022-07-31\n39,250 results found on Twitter.\n3,369 hard mode players.\n\n1: 1%\n2: 🟩🟩 8%\n3: 🟩🟩🟩🟩🟩🟩 26%\n4: 🟩🟩🟩🟩🟩🟩🟩🟩 33%\n5: 🟩🟩🟩🟩 19%\n6: 🟩🟩 10%\nX: 🟩 2%\n\n#Wordle407"
[4] "#Wordle 406 2022-07-30\n37,353 results found on Twitter.\n3,171 hard mode players.\n\n1: 0%\n2: 2%\n3: 🟩🟩🟩 14%\n4: 🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩 42%\n5: 🟩🟩🟩🟩🟩🟩🟩 31%\n6: 🟩🟩 10%\nX: 1%\n\n#Wordle406"
[5] "#Wordle 405 2022-07-29\n37,791 results found on Twitter.\n3,213 hard mode players.\n\n1: 0%\n2: 🟩 5%\n3: 🟩🟩🟩🟩🟩🟩🟩 30%\n4: 🟩🟩🟩🟩🟩🟩🟩🟩🟩 38%\n5: 🟩🟩🟩🟩🟩 20%\n6: 🟩 6%\nX: 1%\n\n#Wordle405"
[6] "#Wordle 404 2022-07-28\n40,650 results found on Twitter.\n3,490 hard mode players.\n\n1: 0%\n2: 🟩 7%\n3: 🟩🟩🟩🟩🟩🟩 26%\n4: 🟩🟩🟩🟩🟩🟩🟩🟩 32%\n5: 🟩🟩🟩🟩🟩 21%\n6: 🟩🟩 11%\nX: 🟩 2%\n\n#Wordle404"
Format the data to separate the components of interest (date, number of people posting results on twitter, percent people that guessed correctly after each number of guesses).
# Split each tweet on newlines into: date, number of results found on Twitter,
# number of hard mode players, an empty line, and the % of people for each
# number of guesses, including those who never got the answer right.
tweets_separated <- tweets %>%
  select(text) %>%
  separate(
    col = text,
    sep = '\n',
    into = c('date', 'number_results', 'hard_mode', 'empty', 'one_guess',
             'two_guess', 'three_guess', 'four_guess', 'five_guess',
             'six_guess', 'failed'),
    remove = TRUE
  )

#view data
head(tweets_separated)
# A tibble: 6 × 11
date number_results hard_mode empty one_guess two_guess three_guess
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 #Wordle 409 20… 34,909 result… 3,380 ha… "" 1: 0% 2: 0% 3: 🟩 4%
2 #Wordle 408 20… 36,662 result… 3,303 ha… "" 1: 0% 2: 🟩 5% 3: 🟩🟩🟩…
3 #Wordle 407 20… 39,250 result… 3,369 ha… "" 1: 1% 2: 🟩🟩 … 3: 🟩🟩🟩…
4 #Wordle 406 20… 37,353 result… 3,171 ha… "" 1: 0% 2: 2% 3: 🟩🟩🟩 …
5 #Wordle 405 20… 37,791 result… 3,213 ha… "" 1: 0% 2: 🟩 5% 3: 🟩🟩🟩…
6 #Wordle 404 20… 40,650 result… 3,490 ha… "" 1: 0% 2: 🟩 7% 3: 🟩🟩🟩…
# … with 4 more variables: four_guess <chr>, five_guess <chr>, six_guess <chr>,
# failed <chr>
# Use str_extract to keep only the percentage of respondents for each number
# of guesses (columns 5 to 11).
# (\\d+) matches one or more digits; (%) matches the trailing percent sign.
tweets_separated[, 5:11] <- lapply(tweets_separated[, 5:11], str_extract, pattern = '(\\d+)(%)')

# Remove the percentage signs.
tweets_separated[, 5:11] <- lapply(tweets_separated[, 5:11], gsub, pattern = '%', replacement = "")

# Remove additional characters that we don't want and convert the date column
# to Date format.
tweets_final <- tweets_separated %>%
  select(-empty) %>%
  # drop the leading "#Wordle <number> " so only the date text remains
  mutate(date = gsub('#Wordle (\\d+) ', "", date)) %>%
  # strip the trailing label and thousands separators from the counts
  mutate(number_results = gsub(' results found on Twitter.', "", number_results)) %>%
  mutate(number_results = gsub(',', "", number_results)) %>%
  mutate(hard_mode = gsub(' hard mode players.', "", hard_mode)) %>%
  mutate(hard_mode = gsub(',', "", hard_mode)) %>%
  mutate(date = lubridate::ymd(date))
Warning: 8 failed to parse.
#you get a warning that a few rows failed to parse. These are rows where the tweet did not follow the typical format.
#view data
head(tweets_final)
# A tibble: 6 × 10
date number_results hard_mode one_guess two_guess three_guess four_guess
<date> <chr> <chr> <chr> <chr> <chr> <chr>
1 2022-08-02 34909 3380 0 0 4 17
2 2022-08-01 36662 3303 0 5 20 33
3 2022-07-31 39250 3369 1 8 26 33
4 2022-07-30 37353 3171 0 2 14 42
5 2022-07-29 37791 3213 0 5 30 38
6 2022-07-28 40650 3490 0 7 26 32
# … with 3 more variables: five_guess <chr>, six_guess <chr>, failed <chr>
# Remove rows with NA in any column (these are the ones that failed to parse).
tweets_final <- na.omit(tweets_final)
Merge dataframe with word, rank, and date with the newly created score distribution dataframe by date. First visualize the two dataframes that we’re combining.
head(words_dates_ranks)
rank word date
1 12266 rebus 2022-01-01
2 15248 boost 2022-01-02
3 12756 truss 2022-01-03
4 3139 siege 2022-01-04
5 4740 tiger 2022-01-05
6 13777 banal 2022-01-06
head(tweets_final)
# A tibble: 6 × 10
date number_results hard_mode one_guess two_guess three_guess four_guess
<date> <chr> <chr> <chr> <chr> <chr> <chr>
1 2022-08-02 34909 3380 0 0 4 17
2 2022-08-01 36662 3303 0 5 20 33
3 2022-07-31 39250 3369 1 8 26 33
4 2022-07-30 37353 3171 0 2 14 42
5 2022-07-29 37791 3213 0 5 30 38
6 2022-07-28 40650 3490 0 7 26 32
# … with 3 more variables: five_guess <chr>, six_guess <chr>, failed <chr>
# Join the score distributions onto the word/rank/date table by date, then
# drop any row that has an NA in any column.
wordle_data <- left_join(words_dates_ranks, tweets_final, by = "date") %>%
  na.omit()

#view data
head(wordle_data)
rank word date number_results hard_mode one_guess two_guess
8 7758 crank 2022-01-08 101503 1763 1 5
9 5764 gorge 2022-01-09 91477 1913 1 3
10 8221 query 2022-01-10 107134 2242 1 4
11 853 drink 2022-01-11 153880 3017 1 9
12 1338 favor 2022-01-12 137586 3073 1 4
13 3647 abbey 2022-01-13 132726 3345 1 2
three_guess four_guess five_guess six_guess failed
8 23 31 24 14 2
9 13 27 30 22 4
10 16 30 30 17 2
11 35 34 16 5 1
12 15 26 29 21 4
13 13 29 31 20 3
If one of the previous websites we’ve used to scrape data stops working, and you still want to proceed with the visualizations, you can read in the csv saved in the github repo.
# Fallback: read the saved snapshot from the project directory (resolved with here()).
# NOTE(review): the path has no file extension — confirm the file name in the repo.
# NOTE(review): the code below reads `wordle_data`, not `wordle_data1`; assign to
# `wordle_data` if you rely on this fallback.
wordle_data1 <- read_csv(here("wordle_data_August_2_2022"))
This dataframe has data last downloaded on August 2, 2022.
5 Data visualization 📊
5.1 “Letter” cloud
For our first data visualization, let’s make a word cloud. Instead of words though, let’s show letters in our cloud to see which letters are most commonly used.
Separate each word into its five letters.
# Split each five-letter word into one column per letter position.
# NOTE: `letters` masks the built-in base::letters constant for the rest of
# the session; the name is kept because later chunks refer to it.
letters <- wordle_data %>%
  select(word) %>%
  extract(
    word,
    into = c('one', 'two', 'three', 'four', 'five'),
    regex = "([a-z])([a-z])([a-z])([a-z])([a-z])"
  )

#view data
head(letters)
one two three four five
8 c r a n k
9 g o r g e
10 q u e r y
11 d r i n k
12 f a v o r
13 a b b e y
Change the format from wide to long using `pivot_longer`.
# Stack the five position columns into a single `letter` column; the position
# name column is dropped.
letters_col <- letters %>%
  pivot_longer(c(1:5)) %>%
  select(-name) %>%
  rename(letter = value)

#view data
head(letters_col)
# A tibble: 6 × 1
letter
<chr>
1 c
2 r
3 a
4 n
5 k
6 g
Make a frequency table for how many times each letter appears in a Wordle word.
# Count how many times each letter occurs (columns: letter, freq).
letters_freq <- plyr::count(letters_col, 'letter')

#view data
letters_freq
letter freq
1 a 78
2 b 16
3 c 35
4 d 25
5 e 86
6 f 21
7 g 24
8 h 37
9 i 49
10 k 17
11 l 59
12 m 30
13 n 43
14 o 78
15 p 28
16 q 1
17 r 74
18 s 47
19 t 68
20 u 28
21 v 12
22 w 19
23 x 1
24 y 24
Create the word (letter) cloud.
# Draw the letter cloud: each letter is sized by how often it occurs, with no
# frequency cut-off, no rotated letters, and the "Dark2" palette.
wordcloud(
  words = letters_freq$letter,
  freq = letters_freq$freq,
  min.freq = 0,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(8, "Dark2")
)
5.2 Stacked barplot
Now let’s find out whether less common words are harder to guess. We’ll do this by plotting the average number of guesses by word rank (i.e., word commonness).
Convert variables from character to numeric.
# Convert columns 1, 4 and 6 to 12 from character to numeric.
wordle_data[, c(1, 4, 6:12)] <- lapply(wordle_data[, c(1, 4, 6:12)], as.numeric)
Create three groups for word rank from most to least common. We do this by grouping rows into the bottom, middle, and top 1/3 of all data by word rank. We also use `case_when` here, which is very similar to `ifelse`.
# Bucket words into thirds by rank: a lower rank number means a more common word.
# `probs` is spelled out in full rather than relying on partial matching.
wordle_data_grp_rank <- wordle_data %>%
  mutate(rank_group = case_when(
    rank < quantile(rank, probs = .333) ~ "more common",
    rank > quantile(rank, probs = .666) ~ "less common",
    TRUE ~ "medium"
  ))
Relevel the word rank groups from more to less common. This is important for plotting in the correct order.
# Turn rank_group into a factor, then order its levels from more to less common.
wordle_data_grp_rank$rank_group <- as.factor(wordle_data_grp_rank$rank_group)
wordle_data_grp_rank$rank_group <- forcats::fct_relevel(
  wordle_data_grp_rank$rank_group,
  "more common", "medium", "less common"
)
Change the format from wide to long for percent of people in each number of guesses category.
# Stack columns 6 to 12 (the per-guess percentages) into name/value pairs:
# guess_number holds the original column name, percent holds the value.
wordle_data_long_guess <- wordle_data_grp_rank %>%
  pivot_longer(cols = c(6:12), names_to = "guess_number", values_to = "percent")
Relevel the guess number categories (e.g., 1 guess, 2 guesses, etc.) from most to fewest guesses. This is important for plotting in the correct order.
# Turn guess_number into a factor, then order its levels from "failed" down to
# "one_guess".
wordle_data_long_guess$guess_number <- as.factor(wordle_data_long_guess$guess_number)
wordle_data_long_guess$guess_number <- forcats::fct_relevel(
  wordle_data_long_guess$guess_number,
  "failed", "six_guess", "five_guess", "four_guess",
  "three_guess", "two_guess", "one_guess"
)
To calculate the average number of guesses for each word rank group, we have to sum the percent of respondents for each number of guesses, and divide by the sum of word guess percentages for all words.
# ((length(wordle_data_long_guess$rank_group)/3)/7) calculates the number of
# words per word rank group: total long-format rows / 3 groups / 7 guess
# categories. Multiply that value by 100 to get the sum of word guess
# percentages for all words in a group.
# NOTE(review): this assumes the three rank groups hold exactly the same
# number of words; the quantile cut-offs only make that approximately true.
x <- ((length(wordle_data_long_guess$rank_group) / 3) / 7) * 100

# Calculate the average percentage of people for each number of guesses within
# each word rank group. mutate() keeps every row, so the group average is
# repeated on each row of its group.
wordle_data_avg_percent <- wordle_data_long_guess %>%
  group_by(rank_group, guess_number) %>%
  mutate(avg_percent = 100 * (sum(percent)) / x)
Subset the dataframe to unique combinations of rank_group and guess_number; this makes it easier to add labels to the stacked bar plot later. Create `labely`, which holds the y-axis values where the labels should be added to the stacked barplot.
# Keep one row per rank_group / guess_number combination, then compute the
# running total of avg_percent within each rank group; labely is the y
# position used for that segment's label.
wordle_data_avg_percent_unique <- wordle_data_avg_percent %>%
  distinct(rank_group, guess_number, .keep_all = TRUE) %>%
  group_by(rank_group) %>%
  mutate(labely = cumsum(avg_percent))
Plot the data in a stacked barplot.
# Stacked bars: one bar per word-commonness group, segments filled by number
# of guesses, each segment labelled with its average percentage.
ggplot(
  wordle_data_avg_percent_unique,
  aes(x = rank_group, y = avg_percent, fill = guess_number)
) +
  geom_col() +
  geom_text(
    aes(
      y = labely,
      label = paste(format(round(avg_percent, 1), nsmall = 1), "%")
    ),
    vjust = 1.0,
    colour = "black",
    size = 2.5
  ) +
  # legend labels follow the factor level order of guess_number
  scale_fill_discrete(
    labels = c("failed", "6 guesses", "5 guesses", "4 guesses",
               "3 guesses", "2 guesses", "1 guess")
  ) +
  labs(
    x = "Word commonness",
    y = "Percent of players",
    title = "Less common words require more guesses in Wordle",
    fill = "Number of guesses"
  ) +
  theme_classic() +
  theme(
    legend.title = element_text(face = "italic", family = "Times", size = 10),
    title = element_text(face = "italic", family = "Times", size = 12)
  )
5.3 Scatterplot
Another way to look at these data is to see if the number of people posting results on twitter has changed throughout the year. We can do this with a scatterplot and fitted line.
# Number of results shared on Twitter per day, with a smoothed trend line.
ggplot(data = wordle_data, aes(x = date, y = number_results)) +
  geom_point(shape = 18) +
  labs(
    x = "Date",
    y = "Number of people",
    title = "How many people share their Wordle scores on Twitter?",
    caption = "Data from Twitter bot @WordleStats"
  ) +
  geom_smooth(span = 0.3, lwd = 0.8) +
  scale_x_date(breaks = scales::breaks_pretty(10)) +
  scale_y_continuous(labels = comma) +
  # theme_classic() is a complete theme and replaces any earlier theme()
  # settings, so the axis-text tweak is applied once, after it.
  theme_classic() +
  theme(axis.text.y = element_text(angle = 45))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
5.4 Lollipop chart
For our last visualization, let’s see if people score better on certain days of the week.
Calculate the average number of guesses it takes to guess each Wordle (exclude those who failed).
# Average number of guesses per Wordle among players who solved it: the
# guess-count-weighted sum of percentages divided by the percentage of players
# who did not fail.
wordle_data_avg <- wordle_data %>%
  mutate(average = (((1 * one_guess) + (2 * two_guess) + (3 * three_guess) +
                       (4 * four_guess) + (5 * five_guess) + (6 * six_guess)) /
                      (100 - failed)))

#overall average and round to 2 decimals - this will be used in plot
# NOTE: the variable `mean` shares its name with base::mean(); calls such as
# mean(x) still resolve to the function. The name is kept because the plot
# code refers to it.
mean <- mean(wordle_data_avg$average)
mean <- round(mean, 2)
Add weekday as a variable using `wday` and calculate the average number of guesses it takes to guess the Wordle on each weekday.
# Label each date with its weekday, then attach the per-weekday mean of
# `average` to every row (mutate() keeps one row per Wordle).
wordle_data_day <- wordle_data_avg %>%
  mutate(day = lubridate::wday(date, label = TRUE)) %>%
  group_by(day) %>%
  mutate(avg_per_day = mean(average)) %>%
  ungroup()
Plot the data in a lollipop plot with the average number of guesses as a middle horizontal line.
# Lollipop chart: one stick per weekday running from the overall average to
# that weekday's average, with a dashed reference line at the overall average.
ggplot(wordle_data_day, aes(x = day, y = avg_per_day)) +
  geom_segment(
    aes(x = day, xend = day, y = mean(average), yend = avg_per_day),
    color = "skyblue"
  ) +
  geom_point(color = "blue", size = 4, alpha = 0.6) +
  scale_x_discrete(limits = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")) +
  geom_hline(
    yintercept = mean(wordle_data_day$average),
    linetype = "dashed",
    color = "black",
    size = 1
  ) +
  labs(
    x = "",
    y = "Average number of guesses",
    title = "Do people play Wordle better on certain days of the week?",
    subtitle = "A look at how many guesses it takes to solve the Wordle, on average, each day of the week",
    caption = "Data from Twitter bot @WordleStats"
  ) +
  theme_light() +
  # flip so that weekdays run down the vertical axis
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  # `mean` here is the rounded overall average computed earlier in the script
  annotate(
    geom = "text",
    x = 3.5,
    y = 4.1,
    size = 3,
    color = "black",
    lineheight = 0.9,
    label = paste0("Overall average = ", mean)
  )