This week’s TidyTuesday dataset is a database of passwords. How do letter frequencies in passwords compare with letter frequencies in the English language?
# Reference table of English letter frequencies: one row per letter, listed
# in descending order of frequency. The figures are typed in as percentages
# and stored as proportions (divided by 100); letters are stored in lower
# case. Built in a single base-R step so the table does not depend on any
# package being attached first.
Letter.Freq <- data.frame(
  Letter = tolower(c(
    "E", "T", "A", "O", "I", "N", "S", "R", "H", "D", "L", "U",
    "C", "M", "F", "Y", "W", "G", "P", "B", "V",
    "K", "X", "Q", "J", "Z"
  )),
  Frequency = c(
    12.02, 9.1, 8.12, 7.68, 7.31, 6.95, 6.28, 6.02, 5.92, 4.32,
    3.98, 2.88, 2.71, 2.61, 2.3, 2.11, 2.09, 2.03,
    1.82, 1.49, 1.11, 0.69, 0.17, 0.11, 0.1, 0.07
  ) / 100,
  stringsAsFactors = FALSE
)
# Load the TidyTuesday passwords data set (2020-01-14) straight from GitHub.
passwords <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-14/passwords.csv"
)
## Parsed with column specification:
## cols(
## rank = col_double(),
## password = col_character(),
## category = col_character(),
## value = col_double(),
## time_unit = col_character(),
## offline_crack_sec = col_double(),
## rank_alt = col_double(),
## strength = col_double(),
## font_size = col_double()
## )
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Add a PasswordNN column holding each password with its digits removed
# (tm::removeNumbers); the original `password` column is left untouched.
passwords <- mutate(
  passwords,
  PasswordNN = removeNumbers(password)
)
# Count how often one letter occurs across a set of strings.
#
# x: character vector of strings to scan. Every character is lower-cased
#    before counting, so the match is case-insensitive on the data side.
#    NA strings contribute nothing to the count.
# y: the letter to count, given in lower case.
#
# Returns a one-element table named after `y` that holds the count. A letter
# that never occurs yields a count of 0 rather than a zero-length table, so
# callers that collect one result per letter always get exactly one value.
freq <- function(x, y) {
  chars <- tolower(unlist(strsplit(x, "")))
  # Restricting the factor to the requested level keeps a slot for `y` even
  # when it is absent and drops every other character from the tally.
  table(word = factor(chars, levels = y))
}
# Tally each letter a-z across the digit-stripped passwords. vapply() pins
# the result to exactly one integer per letter (named by the letter), and
# sum() collapses whatever freq() returns to a single count, so a letter that
# never occurs is recorded as 0 instead of changing the shape of the result.
Letter.FreqPW <- vapply(
  letters,
  function(x) as.integer(sum(freq(passwords$PasswordNN, x))),
  integer(1)
)

# One row per letter: the raw count (kept under both `Letter.FreqPW` and
# `Freq`) plus the letter itself as the join key.
LFPW <- data.frame(
  Letter.FreqPW = Letter.FreqPW,
  Letter = names(Letter.FreqPW),
  Freq = unname(Letter.FreqPW),
  stringsAsFactors = FALSE
)

# Attach the password counts to the English reference table. The key is
# spelled out so the join does not guess (and does not emit a "Joining, by"
# message). LF is each letter's share of all letters counted in the passwords.
Res <- left_join(Letter.Freq, LFPW, by = "Letter") %>%
  mutate(LF = Freq / sum(Freq))
library(ggrepel)
# Plot English-language letter frequency (x) against the letter's share in
# passwords (y), with one repelled label per letter. The y = x reference line
# marks where the two frequencies agree; coord_equal() puts both axes on the
# same scale so that line sits at 45 degrees.
gg1 <- ggplot(Res, aes(x = Frequency, y = LF, label = Letter)) +
  geom_label_repel(fill = "white") +
  labs(
    x = "Language Frequency",
    y = "Frequency in Password",
    title = "Password Letters vs. English Letter Frequency"
  ) +
  coord_equal() +
  geom_abline(slope = 1, intercept = 0) +
  ggthemes::theme_economist()
gg1