This project uses a diverse text dataset (blogs, news, and Twitter) to build a predictive text application. The project has the following steps:
Step 1. Explore and summarize the text data (the focus of this milestone report).
Step 2. Build n-gram models.
Step 3. Create a prediction algorithm.
Step 4. Deploy the model as a Shiny app.
# File paths
file_blogs <- 'en_US.blogs.txt'
file_news <- 'en_US.news.txt'
file_twitter <- 'en_US.twitter.txt'
# Read the files
blogs <- readLines(file_blogs, warn = FALSE)
news <- readLines(file_news, warn = FALSE)
twitter <- readLines(file_twitter, warn = FALSE)
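If readLines() warns about embedded nul characters or stops early (reported for the news file on some systems), re-reading with skipNul = TRUE is an optional variant:
news <- readLines(file_news, warn = FALSE, skipNul = TRUE)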
# Number of Lines
num_lines <- c( blogs = length(blogs),
news = length(news),
twitter = length(twitter)
)
# Number of words
word_count <- function(lines) {
  sum(sapply(strsplit(lines, "\\s+"), length))
}
num_words <- c( blogs = word_count(blogs),
news = word_count(news),
twitter = word_count(twitter)
)
# Number of characters
num_char <- c( blogs = sum(nchar(blogs)),
news = sum(nchar(news)),
twitter = sum(nchar(twitter))
)
# Combine into a summary table
summary_stats <- data.frame(File = c("blogs", "news", "twitter"),
                            Lines = num_lines,
                            Words = num_words,
                            Characters = num_char)
print(summary_stats)
## File Lines Words Characters
## blogs blogs 899288 37334131 206824509
## news news 1010206 34371031 203214543
## twitter twitter 899288 37334131 206824509
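Because the full files are large, downstream exploration and modelling are often run on a random subsample; a minimal sketch, where the 1% rate and the seed are illustrative choices (the analysis below still uses the full data):
set.seed(1234)                               # illustrative seed for reproducibility
sample_frac <- 0.01                          # keep roughly 1% of each source (assumed rate)
blogs_sample   <- sample(blogs,   round(length(blogs)   * sample_frac))
news_sample    <- sample(news,    round(length(news)    * sample_frac))
twitter_sample <- sample(twitter, round(length(twitter) * sample_frac))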
# Set CRAN mirror explicitly (e.g., cloud mirror)
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install and load the required packages
install.packages(c("tidytext", "dplyr", "ggplot2", "stringr", "tm"))
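A minimal sketch of a guarded alternative that installs only the packages that are missing (same package list as above), avoiding a full re-install on every knit:
pkgs <- c("tidytext", "dplyr", "ggplot2", "stringr", "tm")
missing_pkgs <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)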
library(tidytext)
library(dplyr)
library(ggplot2)
library(stringr)
library(tm)
# Combine all files into a single data frame
text_df <- data.frame(text = c(blogs, news, twitter),
                      source = c(rep("blogs", length(blogs)),
                                 rep("news", length(news)),
                                 rep("twitter", length(twitter))),
                      stringsAsFactors = FALSE)
# Tokenize into words (unigrams)
word_counts <- text_df %>%
unnest_tokens(word, text) %>%
count(source, word, sort = TRUE)
# Visualize Top Words By Source
word_counts %>%
group_by(source) %>%
top_n(10, n) %>%
ungroup() %>%
ggplot(aes(reorder(word, n), n, fill = source)) +
geom_col(show.legend = FALSE) +
facet_wrap(~source, scales = "free") +
coord_flip() +
labs(title = "Top 10 Most Frequent Words", x = "Words", y = "Frequency")
## 3.2 Word Frequency Analysis (bigrams)
# Tokenize into word pairs (bigrams)
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%   # lines with fewer than two words yield NA bigrams
  count(source, bigram, sort = TRUE)
# Plot top 10 bigrams per source
bigrams %>%
group_by(source) %>%
top_n(10, n) %>%
ungroup() %>%
ggplot(aes(reorder(bigram, n), n, fill = source)) +
geom_col(show.legend = FALSE) +
facet_wrap(~source, scales = "free") +
coord_flip() +
labs(title = "Top 10 Most Frequent Bigrams", x = "Bigrams", y = "Frequency")
## 4. Cleaning the text
clean_text <- text_df %>%
  mutate(text = tolower(text)) %>%                           # Lowercase
  mutate(text = str_replace_all(text, "http\\S+", "")) %>%   # Remove URLs
  mutate(text = str_replace_all(text, "@\\w+", "")) %>%      # Remove mentions
  mutate(text = str_replace_all(text, "#\\w+", "")) %>%      # Remove hashtags
  mutate(text = str_replace_all(text, "[^a-z\\s]", "")) %>%  # Keep only letters and spaces (drops punctuation, digits, apostrophes)
  mutate(text = str_squish(text))                            # Collapse extra whitespace
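As a quick illustration, applying the same rules to one made-up tweet (the example string is hypothetical) should leave only the plain lowercase words:
example <- "Check this out: http://x.co #wow @you!!"
example %>%
  tolower() %>%
  str_replace_all("http\\S+", "") %>%
  str_replace_all("@\\w+", "") %>%
  str_replace_all("#\\w+", "") %>%
  str_replace_all("[^a-z\\s]", "") %>%
  str_squish()
# expected result: "check this out"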
# Define the URL for the profanity list
url <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
# Download the list
profanity_file <- "profanity_list.txt"
download.file(url, destfile = profanity_file, method = "auto")
# Load the words into R
profanity <- readLines(profanity_file, warn = FALSE)
# Tokenize and filter both stopwords and profanity
library(tidytext)
library(dplyr)
data("stop_words") # from tidytext
clean_words <- clean_text %>%
unnest_tokens(word, text) %>% # This creates a 'word' column
filter(!word %in% stop_words$word) %>%
filter(!word %in% profanity)     # Remove profane words
# View the top 20 cleaned words
clean_words %>%
count(word, sort = TRUE) %>%
head(20)
## word n
## 1 time 227418
## 2 people 165996
## 3 im 154846
## 4 dont 142223
## 5 day 129712
## 6 love 99049
## 7 life 94574
## 8 home 83708
## 9 ive 75985
## 10 didnt 75299
## 11 week 73401
## 12 world 72310
## 13 school 69087
## 14 lot 61458
## 15 book 61182
## 16 city 58797
## 17 feel 58027
## 18 family 57997
## 19 house 56072
## 20 days 55613
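Looking ahead to Steps 2 and 3 from the introduction, the same tokenization can feed a simple next-word lookup. A minimal sketch of a bigram-based predictor follows; the predict_next() helper is illustrative, not the final algorithm, and a real model would add smoothing and back-off:
# Build a bigram frequency table from the cleaned text
library(tidyr)
bigram_freq <- clean_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)

# Hypothetical helper: return the most frequent follower of a given word
predict_next <- function(prev_word, freq = bigram_freq) {
  candidates <- filter(freq, word1 == prev_word)
  if (nrow(candidates) == 0) return(NA_character_)
  candidates$word2[1]   # freq is sorted by n, so the first row is the top candidate
}

predict_next("happy")   # returns the single most likely next word, or NA if unseen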