cs544_quizzes

Load the dataset

# Location of the text file to analyse.
file <- "https://people.bu.edu/kalathur/datasets/mlk.txt"
# Read the file as whitespace-separated tokens into a character vector.
words <- scan(file = file, what = "", sep = "")

a) Words with punctuation

# Flag every token that contains at least one punctuation character, then
# keep only the flagged tokens (original order and duplicates preserved).
# NOTE(review): str_detect() is a stringr function; no library(stringr) call
# is visible in this section -- confirm it is loaded beforehand.
has_punct <- str_detect(string = words, pattern = "[[:punct:]]")
punct_words <- words[has_punct]
punct_words

##  [1] "today,"         "friends,"       "moment,"        "dream."        
##  [5] "dream."         "creed:"         "self-evident:"  "equal."        
##  [9] "slave-owners"   "brotherhood."   "Mississippi,"   "state,"        
## [13] "oppression,"    "justice."       "character."     "today."        
## [17] "Alabama,"       "governor’s"     "nullification," "brothers."     
## [21] "today."         "exalted,"       "low,"           "plain,"        
## [25] "straight,"      "revealed,"      "together."

b) Cleaned & lowercase words

# Strip every punctuation character from each token, then fold the result to
# lower case so that later counts treat e.g. "Dream." and "dream" alike.
# The result has one entry per element of `words`, in the same order.
new_words <- str_to_lower(str_remove_all(words, "[[:punct:]]"))

c) Top 5 frequent words

# Count how often each cleaned word occurs.
word_counts <- table(new_words)
# Order the counts from most to least frequent.
top_words <- sort(word_counts, decreasing = TRUE)
# Show the five most frequent words (fewer if there are not five).
head(top_words, n = 5)

## new_words
## the  of   a and  be 
##  17  15  14  14  11

d) Word length frequency and plot

# Number of characters in each cleaned word.
word_lengths <- str_length(string = new_words)
# Tally how many words there are of each length.
length_freq <- table(word_lengths, dnn = "word_lengths")
length_freq

## word_lengths
##  1  2  3  4  5  6  7  8  9 10 11 12 13 
## 24 46 58 57 49 16 11 10  5  2  6  2  2

# Bar chart of how many words there are of each length.
# The table names are the distinct word lengths (stored as strings), so they
# are converted back to numbers for the x axis; the counts become bar heights.
length_df <- data.frame(
  length = as.numeric(names(length_freq)),
  freq = as.vector(length_freq)
)
ggplot(length_df, aes(x = length, y = freq)) +
  # geom_col() draws bars whose heights are the supplied y values; it is the
  # direct equivalent of geom_bar(stat = "identity").
  geom_col(fill = "steelblue") +
  # Word lengths are whole numbers: put one tick on each observed length
  # instead of relying on the default continuous breaks, which can land on
  # fractional values.
  scale_x_continuous(breaks = length_df$length) +
  labs(
    title = "Distribution of Word Lengths",
    x = "Word Length",
    y = "Frequency"
  )

e) Longest words

# Length of the longest cleaned word.
max_length <- max(word_lengths)
# Mark the words that reach that length, then list each such word once.
is_longest <- word_lengths == max_length
longest_words <- unique(new_words[is_longest])
longest_words

## [1] "interposition" "nullification"

f) Words starting with ‘c’

# Distinct cleaned words whose first character is "c" (the regex is anchored
# at the start of the string).
begins_with_c <- str_detect(new_words, "^c")
starts_with_c <- unique(new_words[begins_with_c])
starts_with_c

## [1] "creed"     "created"   "children"  "color"     "content"   "character"
## [7] "crooked"

g) Words ending with ‘r’

# Distinct cleaned words whose last character is "r" (the regex is anchored
# at the end of the string).
finishes_with_r <- str_detect(new_words, "r$")
ends_with_r <- unique(new_words[finishes_with_r])
ends_with_r

## [1] "former"    "together"  "four"      "color"     "their"     "character"

h) Words starting with ‘c’ and ending with ‘r’

# Distinct cleaned words that both start with "c" and end with "r".
# Each condition is computed as its own logical mask and the two are combined
# element-wise.
first_is_c <- str_detect(new_words, "^c")
last_is_r <- str_detect(new_words, "r$")
c_and_r <- unique(new_words[first_is_c & last_is_r])
c_and_r

## [1] "color"     "character"

cs544_quizzes

Zulal Akarsu

Load the dataset

a) Words with punctuation

b) Cleaned & lowercase words

c) Top 5 frequent words

d) Word length frequency and plot

e) Longest words

f) Words starting with ‘c’

g) Words ending with ‘r’

h) Words starting with ‘c’ and ending with ‘r’