Introduction

This project uses a diverse text dataset (blogs, news, and Twitter) to build a predictive text application. It proceeds in four steps:

Step 1. Explore and summarize the text data (the focus of this Milestone Report).
Step 2. Build n-gram models.
Step 3. Create a prediction algorithm.
Step 4. Deploy the model as a Shiny app.

STEP 1: Exploratory Analysis and Cleaning of Corpus Dataset

1. Loading Data

file_blogs <- 'en_US.blogs.txt'
file_news <-  'en_US.news.txt' 
file_twitter <- 'en_US.twitter.txt'

blogs <- readLines(file_blogs, warn = FALSE)
news <- readLines(file_news, warn = FALSE)
twitter <- readLines(file_twitter, warn = FALSE)
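
On some platforms readLines() warns about, or stops at, the embedded NUL characters that en_US.news.txt is known to contain. If that happens, a defensive alternative is to open each file in binary mode and skip the NULs; the helper name read_corpus below is mine:

read_corpus <- function(path) {
  # Open in binary mode so embedded NULs do not truncate the read
  con <- file(path, open = "rb")
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}
# news <- read_corpus(file_news)   # drop-in replacement if needed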

2. Basic Summary Statistics

word_count <- function(lines) {
  # Split each line on runs of whitespace and count the resulting tokens
  splits <- strsplit(lines, "\\s+")
  sum(sapply(splits, length))
}
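
A quick sanity check of word_count() on a made-up vector (the expected value is in the comment):

word_count(c("hello world", "one two  three"))   # should return 5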

summary_state <- data.frame(
  File = c('blogs', 'news', 'twitter'),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(word_count(blogs), word_count(news), word_count(twitter)),
  Characters = c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter)))
)

print(summary_state)
##      File   Lines   Words Characters
## 1   blogs  899288  899288  206824509
## 2    news 1010206 1010206  203214543
## 3 twitter 2360148 2360151  162122651

3. Word Frequency Analysis

3.1 Unigrams

options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install any required packages that are not already present
pkgs <- c("tidytext", "dplyr", "ggplot2", "stringr", "tm")
install.packages(setdiff(pkgs, rownames(installed.packages())))
library(tidytext)
library(dplyr)
library(ggplot2)
library(stringr)
library(tm)

text_df <- data.frame(
  text = c(blogs, news, twitter), 
  source = c(
    rep("blogs", length(blogs)),
    rep("news", length(news)),
    rep("twitter", length(twitter))
  ), 
  stringsAsFactors = FALSE
)
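
Tokenizing all 4.3 million lines at once is memory-hungry. If it becomes a problem, one option (not used in the counts below) is to work with a random sample of lines per source; the 5% fraction here is an arbitrary choice:

set.seed(1234)
text_sample <- text_df %>%
  group_by(source) %>%
  slice_sample(prop = 0.05) %>%
  ungroup()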

word_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  count(source, word, sort = TRUE)

word_counts %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(word, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Words", x = "Words", y = "Frequency")

3.2 Bigrams

bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(source, bigram, sort = TRUE)

bigrams %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(bigram, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Bigrams", x = "Bigrams", y = "Frequency")

4. Cleaning the Text

clean_text <- text_df %>%
  mutate(text = tolower(text)) %>%                           # lower-case everything
  mutate(text = str_replace_all(text, "http\\S+", "")) %>%   # drop URLs
  mutate(text = str_replace_all(text, "@\\w+", "")) %>%      # drop @mentions
  mutate(text = str_replace_all(text, "#\\w+", "")) %>%      # drop hashtags
  mutate(text = str_replace_all(text, "[^a-z\\s]", "")) %>%  # keep letters and spaces (also strips apostrophes: "don't" -> "dont")
  mutate(text = str_squish(text))                            # collapse repeated whitespace
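
To make the effect of the pipeline concrete, here is the same chain of replacements applied to a single made-up line (the example string is hypothetical):

sample_line <- "Check out http://example.com @friend #Fun!! It's 100% great :)"
sample_line %>%
  tolower() %>%
  str_replace_all("http\\S+", "") %>%
  str_replace_all("@\\w+", "") %>%
  str_replace_all("#\\w+", "") %>%
  str_replace_all("[^a-z\\s]", "") %>%
  str_squish()
## [1] "check out its great"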

5. Removing Profane Words (Luis von Ahn’s List)

url <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
profanity_file <- "profanity_list.txt"
download.file(url, destfile = profanity_file, method = "auto")
profanity <- readLines(profanity_file, warn = FALSE)

data("stop_words")

clean_words <- clean_text %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%   # drop common English stop words
  filter(!word %in% profanity)             # drop profane words

clean_words %>%
  count(word, sort = TRUE) %>%
  head(20)
##      word      n
## 1      im 246449
## 2    time 213953
## 3    dont 176062
## 4     day 169245
## 5    love 159576
## 6  people 158260
## 7      rt  88749
## 8    life  87816
## 9    home  80563
## 10   week  76100
## 11  night  75370
## 12 school  72223
## 13   game  70733
## 14    lol  69335
## 15  didnt  68481
## 16  happy  66468
## 17    ive  66156
## 18  youre  64513
## 19  world  64304
## 20   feel  58132
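
Looking ahead to Steps 2 and 3, the bigram counts from section 3.2 already contain what a first-pass next-word lookup needs. Below is a minimal sketch assuming a simple most-frequent-continuation rule; the function name predict_next is mine, and a real model would add trigrams, smoothing, and backoff:

library(tidyr)

# Pool the three sources and split each bigram into its two words
bigram_freq <- bigrams %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2, wt = n, sort = TRUE)

# Return the k most frequent continuations of a given word
predict_next <- function(prev_word, k = 3) {
  bigram_freq %>%
    filter(word1 == tolower(prev_word)) %>%
    slice_max(n, n = k, with_ties = FALSE) %>%
    pull(word2)
}

predict_next("thank")   # likely continuations such as "you"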