This project uses a diverse text dataset (blogs, news, and Twitter) to build a predictive text application. It proceeds in four steps:
Step 1. Explore and summarize the text data (the focus of this milestone report).
Step 2. Build n-gram models.
Step 3. Create a prediction algorithm (previewed in the sketch below).
Step 4. Deploy the algorithm as a Shiny app.
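As a preview of Step 3, here is a minimal sketch of the backoff lookup the finished app might perform. The table layout (a bigram_freq data frame with columns first, second, and n, plus a unigram_freq with word and n) is an assumption for illustration, not the report's actual data structure.

# Hypothetical backoff predictor: try the bigram table first, then fall
# back to the single most frequent word overall.
predict_next <- function(phrase, bigram_freq, unigram_freq) {
  last_word <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 1)
  hits <- bigram_freq[bigram_freq$first == last_word, ]
  if (nrow(hits) > 0) {
    hits$second[which.max(hits$n)]                # most frequent continuation
  } else {
    unigram_freq$word[which.max(unigram_freq$n)]  # back off to the top unigram
  }
}
# e.g. predict_next("I am so happy", bigram_freq, unigram_freq)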
# Read the three English corpora line by line
file_blogs <- 'en_US.blogs.txt'
file_news <- 'en_US.news.txt'
file_twitter <- 'en_US.twitter.txt'
blogs <- readLines(file_blogs, warn = FALSE)
news <- readLines(file_news, warn = FALSE)
twitter <- readLines(file_twitter, warn = FALSE)
word_count <- function(lines) {
  # Count whitespace-delimited words across a character vector
  splits <- strsplit(lines, "\\s+")  # split each line on runs of whitespace
  sum(sapply(splits, length))
}
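A quick sanity check of the helper on a toy vector (an illustrative example, not from the report):

word_count(c("hello world", "one  two   three"))  # 2 + 3 = 5
## [1] 5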
summary_stats <- data.frame(
  File = c('blogs', 'news', 'twitter'),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(word_count(blogs), word_count(news), word_count(twitter)),
  Characters = c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter)))
)
print(summary_stats)
## File Lines Words Characters
## 1 blogs 899288 899288 206824509
## 2 news 1010206 1010206 203214543
## 3 twitter 2360148 2360151 162122651
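Together the three files hold over half a billion characters, so iterating on the full corpora is slow. The exploration below uses the full data, but a random sample of lines gives a faster, qualitatively similar picture; a minimal sketch (the 5% fraction and the seed are arbitrary choices, not taken from this analysis):

set.seed(2024)       # arbitrary seed for reproducibility
sample_frac <- 0.05  # illustrative 5% of lines
blogs_s   <- sample(blogs,   floor(length(blogs)   * sample_frac))
news_s    <- sample(news,    floor(length(news)    * sample_frac))
twitter_s <- sample(twitter, floor(length(twitter) * sample_frac))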
# Install any required packages that are missing
options(repos = c(CRAN = "https://cloud.r-project.org"))
pkgs <- c("tidytext", "dplyr", "ggplot2", "stringr", "tm")
new_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(new_pkgs) > 0) install.packages(new_pkgs)
library(tidytext)
library(dplyr)
library(ggplot2)
library(stringr)
library(tm)
# Combine the three corpora into one data frame, tagged by source
text_df <- data.frame(
  text = c(blogs, news, twitter),
  source = c(
    rep("blogs", length(blogs)),
    rep("news", length(news)),
    rep("twitter", length(twitter))
  ),
  stringsAsFactors = FALSE
)
word_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  count(source, word, sort = TRUE)

word_counts %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(word, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Words", x = "Words", y = "Frequency")
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%  # lines with fewer than two words yield NA tokens
  count(source, bigram, sort = TRUE)
bigrams %>%
  group_by(source) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(reorder(bigram, n), n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Bigrams", x = "Bigrams", y = "Frequency")
# Normalize the text: lowercase, then strip URLs, mentions, hashtags,
# and any remaining non-letter characters
clean_text <- text_df %>%
  mutate(text = tolower(text)) %>%
  mutate(text = str_replace_all(text, "http\\S+", "")) %>%   # URLs
  mutate(text = str_replace_all(text, "@\\w+", "")) %>%      # @mentions
  mutate(text = str_replace_all(text, "#\\w+", "")) %>%      # #hashtags
  mutate(text = str_replace_all(text, "[^a-z\\s]", "")) %>%  # digits and punctuation
  mutate(text = str_squish(text))                            # collapse extra whitespace
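Note that the [^a-z\s] pattern also deletes apostrophes, which is why contractions surface as im and dont in the frequency table at the end of this section. If contractions should survive intact, a variant that whitelists the apostrophe (a suggested alternative, not the pipeline used here):

clean_text_apos <- text_df %>%
  mutate(text = tolower(text)) %>%
  mutate(text = str_replace_all(text, "http\\S+|@\\w+|#\\w+", "")) %>%
  mutate(text = str_replace_all(text, "[^a-z'\\s]", "")) %>%  # keep apostrophes
  mutate(text = str_squish(text))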
url <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
profanity_file <- "profanity_list.txt"
download.file(url, destfile = profanity_file, method = "auto")
profanity <- readLines(profanity_file, warn = FALSE)
data("stop_words")
clean_words <- clean_text %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
filter(!word %in% profanity)
clean_words %>%
  count(word, sort = TRUE) %>%
  head(20)
## word n
## 1 im 246449
## 2 time 213953
## 3 dont 176062
## 4 day 169245
## 5 love 159576
## 6 people 158260
## 7 rt 88749
## 8 life 87816
## 9 home 80563
## 10 week 76100
## 11 night 75370
## 12 school 72223
## 13 game 70733
## 14 lol 69335
## 15 didnt 68481
## 16 happy 66468
## 17 ive 66156
## 18 youre 64513
## 19 world 64304
## 20 feel 58132
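A natural follow-up summary, sketched here rather than taken from the report, is dictionary coverage: how many unique words account for 50% and 90% of all word instances. That number bounds the vocabulary the Step 2 n-gram models must store.

coverage <- clean_words %>%
  count(word, sort = TRUE) %>%
  mutate(cum_frac = cumsum(n) / sum(n))  # cumulative share of all word instances
which(coverage$cum_frac >= 0.5)[1]       # unique words needed for 50% coverage
which(coverage$cum_frac >= 0.9)[1]       # unique words needed for 90% coverage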