Task 0: Understanding the Problem

For Task 0, I read the three English text corpora (blogs, news, Twitter) and drew a random sample of 50,000 lines from each to keep processing manageable. Each line in the raw files represents a sentence or paragraph.

set.seed(123)
blogs <- sample(readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8"), 50000)
news <- sample(readLines("final/en_US/en_US.news.txt", encoding = "UTF-8"), 50000)
twitter <- sample(readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8"), 50000)
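
For context on how much data the 50,000-line samples leave out, the full file sizes and line counts can be checked directly. A quick sketch using the same paths (re-reading the full files takes a while, especially for the Twitter corpus):

files <- c(blogs   = "final/en_US/en_US.blogs.txt",
           news    = "final/en_US/en_US.news.txt",
           twitter = "final/en_US/en_US.twitter.txt")
round(file.size(files) / 1024^2, 1)                                   # file sizes in MB
sapply(files, function(f) length(readLines(f, encoding = "UTF-8")))   # total line counts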

Task 1: Getting and Cleaning Data

In Task 1, I prepared the text for analysis. Since the model works at the word level rather than on whole lines, I tokenized each corpus into words and removed profanity using the profanity_alvarez list from the lexicon package. I then built a summary data frame with line and word counts for each corpus.

library(tokenizers)
library(lexicon)
data("profanity_alvarez")

clean_tokens <- function(text) {
  tokens <- unlist(tokenize_words(text))    # lowercase word tokens, punctuation stripped
  tokens[!(tokens %in% profanity_alvarez)]  # drop profane terms
}

blogs_tokens <- clean_tokens(blogs)
news_tokens <- clean_tokens(news)
twitter_tokens <- clean_tokens(twitter)
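
As a quick sanity check on the cleaning function, tokenize_words lowercases and strips punctuation by default, so a toy input (my own example, not part of the corpora) should come back as plain lowercase words:

clean_tokens(c("Hello, world!", "This is a TEST."))
# should return the lowercase tokens "hello" "world" "this" "is" "a" "test"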

summary_df <- data.frame(
  Corpus = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(length(blogs_tokens), length(news_tokens), length(twitter_tokens))
)
summary_df
##    Corpus Lines   Words
## 1   Blogs 50000 2079665
## 2    News 50000 1714237
## 3 Twitter 50000  633072
saveRDS(blogs_tokens, "blogs_tokens.rds")
saveRDS(news_tokens, "news_tokens.rds")
saveRDS(twitter_tokens, "twitter_tokens.rds")

Task 2: Exploratory Data Analysis

In Task 2, I performed exploratory data analysis to understand word frequencies and common word pairs. I plotted the top 20 words in each corpus as bar charts.

library(tidytext)
library(dplyr)
library(tidyr)
library(ggplot2)


unigram_freq <- function(tokens) sort(table(tokens), decreasing = TRUE)

blogs_unigram <- unigram_freq(blogs_tokens)
news_unigram <- unigram_freq(news_tokens)
twitter_unigram <- unigram_freq(twitter_tokens)
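
The frequency tables also make it easy to quantify how skewed the distributions are, i.e. how few unique words account for most of the text. A small coverage helper, my own sketch using the blogs_unigram table from above:

coverage <- function(freq_table, target) {
  # number of most-frequent unique words needed to reach the target share of all word instances
  cum_share <- cumsum(as.numeric(freq_table)) / sum(freq_table)
  which(cum_share >= target)[1]
}
coverage(blogs_unigram, 0.5)   # unique words needed to cover 50% of the blogs sample
coverage(blogs_unigram, 0.9)   # unique words needed to cover 90%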

plot_top_unigrams <- function(freq_table, title) {
  df <- head(freq_table, 20)
  df <- data.frame(Word = names(df), Frequency = as.numeric(df))
  ggplot(df, aes(x = reorder(Word, Frequency), y = Frequency)) +
    geom_bar(stat="identity", fill="steelblue") +
    coord_flip() +
    labs(title=title, x="Word", y="Frequency")
}

plot_top_unigrams(blogs_unigram, "Top 20 Words in Blogs")

plot_top_unigrams(news_unigram, "Top 20 Words in News")

plot_top_unigrams(twitter_unigram, "Top 20 Words in Twitter")

bigram_freq <- function(tokens) {
  df <- tibble(text = paste(tokens, collapse = " "))   # collapse the cleaned tokens back into one string
  df %>%
    unnest_tokens(bigram, text, token="ngrams", n=2) %>%
    count(bigram, sort=TRUE)
}

blogs_bigram_freq <- bigram_freq(blogs_tokens)
news_bigram_freq <- bigram_freq(news_tokens)
twitter_bigram_freq <- bigram_freq(twitter_tokens)
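
The same bar-chart approach used for the unigrams can be reused for the most frequent word pairs, for example (a sketch using blogs_bigram_freq from above):

blogs_bigram_freq %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Bigrams in Blogs", x = "Bigram", y = "Frequency")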

Task 3: Modeling

In Task 3, I built a simple bigram model that backs off to the most common unigram when the preceding word has not been seen.

prepare_lookup <- function(bigram_df) {
  bigram_df %>%
    separate(bigram, into=c("word1","word2"), sep=" ") %>%
    group_by(word1) %>%
    slice_max(n, n=1, with_ties=FALSE) %>%  # keep exactly one most frequent follower per word
    ungroup()
}

blogs_lookup <- prepare_lookup(blogs_bigram_freq)
news_lookup <- prepare_lookup(news_bigram_freq)
twitter_lookup <- prepare_lookup(twitter_bigram_freq)

predict_unigram <- function(freq_table) names(freq_table)[1]

predict_bigram <- function(prev_word, lookup, unigram_freq) {
  pred <- lookup %>% filter(word1 == prev_word)  # lookups store lowercase tokens, so pass prev_word in lowercase
  if(nrow(pred) == 0) predict_unigram(unigram_freq) else pred$word2
}
predict_bigram("I", blogs_lookup, blogs_unigram)
## [1] "the"
predict_bigram("the", news_lookup, news_unigram)
## [1] "first"
predict_bigram("thank", twitter_lookup, twitter_unigram)
## [1] "you"
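
For the eventual prediction app, a thin wrapper can take a whole phrase, tokenize it the same way as the training data, and feed its last word to predict_bigram. A sketch (predict_next_word is my own name for this helper, not part of the tasks above):

predict_next_word <- function(phrase, lookup, unigram_freq) {
  words <- unlist(tokenize_words(phrase))  # same lowercasing and cleaning as the training tokens
  if (length(words) == 0) return(predict_unigram(unigram_freq))
  predict_bigram(tail(words, 1), lookup, unigram_freq)
}
predict_next_word("Thanks for the", blogs_lookup, blogs_unigram)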