Introduction

This project uses a diverse text dataset (blogs, news, and Twitter) to build a predictive text application. It proceeds in four steps:

Step 1. Explore and summarize the text data (the focus of this Milestone Report).
Step 2. Build n-gram models.
Step 3. Create a prediction algorithm.
Step 4. Deploy the model as a Shiny app.

STEP 1: Exploratory Analysis and Cleaning of Corpus Dataset

1. Loading Data

file_blogs <- 'en_US.blogs.txt'
file_news <-  'en_US.news.txt' 
file_twitter <- 'en_US.twitter.txt'

blogs <- readLines(file_blogs, warn = FALSE)
news <- readLines(file_news, warn = FALSE)
twitter <- readLines(file_twitter, warn = FALSE)
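
On some platforms readLines() warns about, or stops at, the embedded NUL characters that en_US.news.txt is known to contain. If that happens, a defensive alternative is to open each file in binary mode and skip the NULs; the helper name read_corpus below is mine:

read_corpus <- function(path) {
  # Open in binary mode so embedded NULs do not truncate the read
  con <- file(path, open = "rb")
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}
# news <- read_corpus(file_news)   # drop-in replacement if needed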

2. Basic Summary Statistics

word_count <- function(lines) {
  # Split each line on runs of whitespace and count the resulting tokens
  splits <- strsplit(lines, "\\s+")
  sum(sapply(splits, length))
}
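
A quick sanity check of word_count() on a made-up vector (the expected value is in the comment):

word_count(c("hello world", "one two  three"))   # should return 5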

summary_state <- data.frame(
  File = c('blogs', 'news', 'twitter'),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(word_count(blogs), word_count(news), word_count(twitter)),
  Characters = c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter)))
)

print(summary_state)
##      File   Lines   Words Characters
## 1   blogs  899288  899288  206824509
## 2    news 1010206 1010206  203214543
## 3 twitter 2360148 2360151  162122651

3. Word Frequency Analysis

3.1 Unigrams

options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install any required packages that are not already present
pkgs <- c("tidytext", "dplyr", "ggplot2", "stringr", "tm")
install.packages(setdiff(pkgs, rownames(installed.packages())))
library(tidytext)
library(dplyr)
library(ggplot2)
library(stringr)
library(tm)

text_df <- data.frame(
  text = c(blogs, news, twitter), 
  source = c(
    rep("blogs", length(blogs)),
    rep("news", length(news)),
    rep("twitter", length(twitter))
  ), 
  stringsAsFactors = FALSE
)
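
Tokenizing all 4.3 million lines at once is memory-hungry. If it becomes a problem, one option (not used in the counts below) is to work with a random sample of lines per source; the 5% fraction here is an arbitrary choice:

set.seed(1234)
text_sample <- text_df %>%
  group_by(source) %>%
  slice_sample(prop = 0.05) %>%
  ungroup()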

word_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  count(source, word, sort = TRUE)

word_counts %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(word, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Words", x = "Words", y = "Frequency")

3.2 Bigrams

bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(source, bigram, sort = TRUE)

bigrams %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(bigram, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Bigrams", x = "Bigrams", y = "Frequency")

4. Cleaning the Text

clean_text <- text_df %>%
  mutate(text = tolower(text)) %>%                           # lower-case everything
  mutate(text = str_replace_all(text, "http\\S+", "")) %>%   # drop URLs
  mutate(text = str_replace_all(text, "@\\w+", "")) %>%      # drop @mentions
  mutate(text = str_replace_all(text, "#\\w+", "")) %>%      # drop hashtags
  mutate(text = str_replace_all(text, "[^a-z\\s]", "")) %>%  # keep letters and spaces (also strips apostrophes: "don't" -> "dont")
  mutate(text = str_squish(text))                            # collapse repeated whitespace
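
To make the effect of the pipeline concrete, here is the same chain of replacements applied to a single made-up line (the example string is hypothetical):

sample_line <- "Check out http://example.com @friend #Fun!! It's 100% great :)"
sample_line %>%
  tolower() %>%
  str_replace_all("http\\S+", "") %>%
  str_replace_all("@\\w+", "") %>%
  str_replace_all("#\\w+", "") %>%
  str_replace_all("[^a-z\\s]", "") %>%
  str_squish()
## [1] "check out its great"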

5. Removing Profane Words (Luis von Ahn’s List)

url <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
profanity_file <- "profanity_list.txt"
download.file(url, destfile = profanity_file, method = "auto")
profanity <- readLines(profanity_file, warn = FALSE)

data("stop_words")

clean_words <- clean_text %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%   # drop common English stop words
  filter(!word %in% profanity)             # drop profane words

clean_words %>%
  count(word, sort = TRUE) %>%
  head(20)
##      word      n
## 1      im 246449
## 2    time 213953
## 3    dont 176062
## 4     day 169245
## 5    love 159576
## 6  people 158260
## 7      rt  88749
## 8    life  87816
## 9    home  80563
## 10   week  76100
## 11  night  75370
## 12 school  72223
## 13   game  70733
## 14    lol  69335
## 15  didnt  68481
## 16  happy  66468
## 17    ive  66156
## 18  youre  64513
## 19  world  64304
## 20   feel  58132
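
Looking ahead to Steps 2 and 3, the bigram counts from section 3.2 already contain what a first-pass next-word lookup needs. Below is a minimal sketch assuming a simple most-frequent-continuation rule; the function name predict_next is mine, and a real model would add trigrams, smoothing, and backoff:

library(tidyr)

# Pool the three sources and split each bigram into its two words
bigram_freq <- bigrams %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2, wt = n, sort = TRUE)

# Return the k most frequent continuations of a given word
predict_next <- function(prev_word, k = 3) {
  bigram_freq %>%
    filter(word1 == tolower(prev_word)) %>%
    slice_max(n, n = k, with_ties = FALSE) %>%
    pull(word2)
}

predict_next("thank")   # likely continuations such as "you"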