Load the dataset
# Location of the text file, read below as whitespace-separated tokens.
file <- "https://people.bu.edu/kalathur/datasets/mlk.txt"
# `what = character(0)` makes scan() return every token as a string.
words <- scan(file = file, what = character(0))
a) Words with punctuation
# Flag the raw tokens that contain at least one punctuation character,
# then keep just those tokens.
has_punct <- str_detect(words, "[[:punct:]]")
punct_words <- words[has_punct]
punct_words
## [1] "today," "friends," "moment," "dream."
## [5] "dream." "creed:" "self-evident:" "equal."
## [9] "slave-owners" "brotherhood." "Mississippi," "state,"
## [13] "oppression," "justice." "character." "today."
## [17] "Alabama," "governor’s" "nullification," "brothers."
## [21] "today." "exalted," "low," "plain,"
## [25] "straight," "revealed," "together."
b) Cleaned & lowercase words
# Strip every punctuation character, then fold to lower case so that the
# same word is counted once regardless of capitalisation or trailing marks.
new_words <- str_replace_all(words, "[[:punct:]]", "")
new_words <- str_to_lower(new_words)
# A token made up only of punctuation (e.g. a stand-alone dash) is now "";
# drop those so they are not counted as zero-length words in later steps.
new_words <- new_words[nzchar(new_words)]
c) Top 5 frequent words
# Count how often each cleaned word occurs, order the counts from most to
# least frequent, and show the five most common words.
word_counts <- table(new_words)
top_words <- sort(word_counts, decreasing = TRUE)
head(top_words, n = 5)
## new_words
## the of a and be
## 17 15 14 14 11
d) Word length frequency and plot
# Character count of every cleaned word.
word_lengths <- str_length(new_words)
# Tally how many words there are of each length; the dimension label is
# spelled out so the printed header stays "word_lengths".
length_freq <- table(word_lengths, dnn = "word_lengths")
length_freq
## word_lengths
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 24 46 58 57 49 16 11 10 5 2 6 2 2
# Bar chart of how many words there are of each length.
# The plotting data is built as a named object so the axis breaks can reuse it.
length_df <- data.frame(
  length = as.numeric(names(length_freq)),
  freq = as.vector(length_freq)
)

ggplot(length_df, aes(x = length, y = freq)) +
  # geom_col() takes bar heights directly from `freq`; it is the shorthand
  # for geom_bar(stat = "identity").
  geom_col(fill = "steelblue") +
  # Word lengths are whole numbers, so place one tick under every bar rather
  # than relying on the default continuous breaks.
  scale_x_continuous(breaks = length_df$length) +
  labs(
    title = "Distribution of Word Lengths",
    x = "Word Length",
    y = "Frequency"
  )

e) Longest words
# Find the greatest word length, mark the words that reach it, and list
# each such word once.
max_length <- max(word_lengths)
is_longest <- word_lengths == max_length
longest_words <- unique(new_words[is_longest])
longest_words
## [1] "interposition" "nullification"
f) Words starting with ‘c’
# Distinct cleaned words that begin with "c".
c_initial <- str_starts(new_words, "c")
starts_with_c <- unique(new_words[c_initial])
starts_with_c
## [1] "creed" "created" "children" "color" "content" "character"
## [7] "crooked"
g) Words ending with ‘r’
# Distinct cleaned words that finish with "r".
r_final <- str_ends(new_words, "r")
ends_with_r <- unique(new_words[r_final])
ends_with_r
## [1] "former" "together" "four" "color" "their" "character"
h) Words starting with ‘c’ and ending with ‘r’
# Distinct cleaned words that both begin with "c" and finish with "r";
# the two conditions are combined element-wise.
begins_c <- str_starts(new_words, "c")
finishes_r <- str_ends(new_words, "r")
c_and_r <- unique(new_words[begins_c & finishes_r])
c_and_r
## [1] "color" "character"