tidyTuesday: Passwords

This week’s tidyTuesday data set is a database of passwords. How do letter frequencies in passwords match up with letter frequencies in the English language?

# Reference table of letter frequencies in English. The values are entered as
# percentages, then rescaled to proportions, and the letters are lower-cased.
Letter.Freq <- data.frame(
  Letter = c(
    "E", "T", "A", "O", "I", "N", "S", "R", "H", "D", "L", "U", "C",
    "M", "F", "Y", "W", "G", "P", "B", "V", "K", "X", "Q", "J", "Z"
  ),
  Frequency = c(
    12.02, 9.1, 8.12, 7.68, 7.31, 6.95, 6.28, 6.02, 5.92, 4.32, 3.98, 2.88,
    2.71, 2.61, 2.3, 2.11, 2.09, 2.03, 1.82, 1.49, 1.11, 0.69, 0.17, 0.11,
    0.1, 0.07
  ),
  stringsAsFactors = FALSE
) %>%
  mutate(
    Frequency = Frequency / 100,
    Letter = tolower(Letter)
  )
# Download the passwords CSV from the tidytuesday GitHub repository.
passwords <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-14/passwords.csv"
)
## Parsed with column specification:
## cols(
##   rank = col_double(),
##   password = col_character(),
##   category = col_character(),
##   value = col_double(),
##   time_unit = col_character(),
##   offline_crack_sec = col_double(),
##   rank_alt = col_double(),
##   strength = col_double(),
##   font_size = col_double()
## )
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Add `PasswordNN`: each password passed through removeNumbers().
passwords <- mutate(
  passwords,
  PasswordNN = removeNumbers(password)
)
# Count how often the character `y` occurs in the strings of `x`.
#
# Every element of `x` is split into single characters and lower-cased before
# counting, so upper- and lower-case occurrences are tallied together when `y`
# is given in lower case.
#
# x: character vector of strings to scan.
# y: the character to count.
#
# Returns a one-dimensional table of length one, named `y`, holding the count.
# A character that never occurs yields a count of 0 instead of an empty table,
# so callers (e.g. sapply()/vapply() over an alphabet) always get one value.
freq <- function(x, y) {
  chars <- tolower(unlist(strsplit(x, "")))
  # Putting `y` among the factor levels guarantees it a cell even when unseen.
  char_counts <- table(factor(chars, levels = union(chars, y)), dnn = "word")
  char_counts[names(char_counts) == y]
}
# Count every letter a-z across the digit-stripped passwords. vapply() pins the
# result to exactly one integer per letter, so an unexpected value from freq()
# fails right here instead of silently turning the result into a list. The
# result is named by `letters` because vapply() uses a character input as names.
Letter.FreqPW <- vapply(
  letters,
  function(x) as.integer(freq(passwords$PasswordNN, x)),
  integer(1)
)
# One row per letter; the raw count is kept under both `Letter.FreqPW` and
# `Freq` so the resulting columns match what this script has always produced.
LFPW <- data.frame(
  Letter.FreqPW = Letter.FreqPW,
  Letter = names(Letter.FreqPW),
  Freq = Letter.FreqPW,
  stringsAsFactors = FALSE
)
# Attach the password counts to the English reference table (explicit join
# key), then convert the counts to shares of all counted letters.
Res <- Letter.Freq %>%
  left_join(LFPW, by = "Letter") %>%
  mutate(LF = Freq / sum(Freq))
library(ggrepel)
# Plot each letter's English-language frequency (x) against its share among
# password letters (y), labelled by letter. The slope-1 line through the origin
# marks where both frequencies would be equal.
gg1 <- ggplot(Res) +
  aes(x = Frequency, y = LF, label = Letter) +
  geom_label_repel(fill = "white") +
  labs(
    x = "Language Frequency",
    y = "Frequency in Password",
    title = "Password Letters vs. English Letter Frequency"
  ) +
  coord_equal() +
  geom_abline(slope = 1, intercept = 0) +
  ggthemes::theme_economist()
gg1