Introduction

This project leverages a diverse text dataset (blogs, news, and Twitter posts) to build a predictive text application. The project has the following steps:

Step 1. Explore and summarize the text data (the focus of this Milestone Report).
Step 2. Build n-gram models.
Step 3. Create a prediction algorithm (a rough sketch of the intended lookup is shown below).
Step 4. Deploy it as a Shiny app.
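
To make Steps 2 and 3 concrete, the prediction stage will essentially look up the most frequent continuations of the user's last word(s) in an n-gram frequency table. The function below is only an illustrative sketch: it assumes a hypothetical bigram table bigram_counts with columns word1, word2, and n, which is not built until later in the project.

# Illustrative sketch only: predict the next word from a hypothetical
# bigram frequency table 'bigram_counts' with columns word1, word2, n
library(dplyr)

predict_next_word <- function(last_word, bigram_counts, k = 3) {
  bigram_counts %>%
    filter(word1 == last_word) %>%   # keep bigrams starting with the last word
    arrange(desc(n)) %>%             # most frequent continuations first
    head(k) %>%                      # take the top k candidates
    pull(word2)
}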

STEP 1: Exploratory analysis and cleaning of the corpus data set

1. Loading data

# File path 
file_blogs <- 'en_US.blogs.txt'
file_news <-  'en_US.news.txt' 
file_twitter <- 'en_US.twitter.txt'

# Read the files
blogs <- readLines(file_blogs, warn = FALSE)
news <- readLines(file_news, warn = FALSE)
twitter <- readLines(file_twitter, warn = FALSE)

2. Basic Summary Statistics

# Number of Lines 
num_lines <- c( blogs = length(blogs), 
                news = length(news), 
                twitter = length(twitter)
                )
# Number of words 
word_count <- function(lines) 
  sum(sapply(strsplit(lines, "\\s+"), length))
num_words <- c( blogs = word_count(blogs), 
                news = word_count(news), 
                twitter = word_count(twitter)
                )
# Number of characters 
num_char <- c( blogs = sum(nchar(blogs)), 
                news = sum(nchar(news)), 
                twitter = sum(nchar(twitter))
)
# Combine all to make summary 
summary_stats <- data.frame(File = c("blogs", "news", "twitter"),
                            Lines = num_lines,
                            Words = num_words,
                            Characters = num_char)
print(summary_stats)
##            File   Lines    Words Characters
## blogs     blogs  899288 37334131  206824509
## news       news 1010206 34371031  203214543
## twitter twitter  899288 37334131  206824509

3. Word Frequency Analysis

3.1 Unigrams

# Set CRAN mirror explicitly (e.g., cloud mirror)
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install required packages only if they are not already present
pkgs <- c("tidytext", "dplyr", "ggplot2", "stringr", "tm")
install.packages(setdiff(pkgs, rownames(installed.packages())))
library(tidytext)
library(dplyr)
library(ggplot2)
library(stringr)
library(tm)
# Combine all files into a single data frame 
text_df <- data.frame(text = c(blogs, news, twitter),
                      source = c(rep("blogs", length(blogs)),
                                 rep("news", length(news)),
                                 rep("twitter", length(twitter))),
                      stringsAsFactors = FALSE)
# Tokenize into words (unigrams)
word_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  count(source, word, sort = TRUE)
# Visualize Top Words By Source
word_counts %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(word, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Words", x = "Words", y = "Frequency")

3.2 Bigrams

# Tokenize into word pairs (bigrams)
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(source, bigram, sort = TRUE)
# Plot top 10 bigrams per source
bigrams %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(bigram, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Bigrams", x = "Bigrams", y = "Frequency")

4. Cleaning the text

clean_text <- text_df %>%
  mutate(text = tolower(text)) %>%                            # Lowercase
  mutate(text = str_replace_all(text, "http\\S+", "")) %>%    # Remove URLs
  mutate(text = str_replace_all(text, "@\\w+", "")) %>%       # Remove mentions
  mutate(text = str_replace_all(text, "#\\w+", "")) %>%       # Remove hashtags
  mutate(text = str_replace_all(text, "[^a-z\\s]", "")) %>%   # Remove punctuation/numbers
  mutate(text = str_squish(text))                             # Remove extra spaces
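
As a quick sanity check, the same substitutions can be applied to a single invented line to see their combined effect (the example string below is made up for illustration):

# Illustrative check of the cleaning rules on an invented example line
example <- "Check out http://example.com @user #fun - 100% GREAT!!"
example %>%
  tolower() %>%
  str_replace_all("http\\S+", "") %>%
  str_replace_all("@\\w+", "") %>%
  str_replace_all("#\\w+", "") %>%
  str_replace_all("[^a-z\\s]", "") %>%
  str_squish()
# Expected output: "check out great"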

5. Removing profane words using Luis von Ahn’s Profanity List

# Define the URL for the profanity list
url <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"

# Download the list
profanity_file <- "profanity_list.txt"
download.file(url, destfile = profanity_file, method = "auto")

# Load the words into R
profanity <- readLines(profanity_file, warn = FALSE)
# Tokenize and filter both stopwords and profanity
library(tidytext)
library(dplyr)

data("stop_words")  # from tidytext
clean_words <- clean_text %>%
  unnest_tokens(word, text) %>%       # This creates a 'word' column
  filter(!word %in% stop_words$word) %>%
  filter(!word %in% profanity)        # Remove Profane words

# View the top 20 cleaned words
clean_words %>%
  count(word, sort = TRUE) %>%
  head(20)
##      word      n
## 1    time 227418
## 2  people 165996
## 3      im 154846
## 4    dont 142223
## 5     day 129712
## 6    love  99049
## 7    life  94574
## 8    home  83708
## 9     ive  75985
## 10  didnt  75299
## 11   week  73401
## 12  world  72310
## 13 school  69087
## 14    lot  61458
## 15   book  61182
## 16   city  58797
## 17   feel  58027
## 18 family  57997
## 19  house  56072
## 20   days  55613
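
With cleaned and filtered text in hand, the natural next step (Step 2) is to build n-gram frequency tables that the prediction algorithm can query. A minimal sketch, reusing the clean_text object defined above (in practice a sample of the corpus may be used to keep memory manageable):

# Sketch of Step 2: bigram and trigram frequency tables from the cleaned text
bigram_freq <- clean_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)

trigram_freq <- clean_text %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  count(trigram, sort = TRUE)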