The goal of the capstone project is to create a predictive text model using a large text corpus of documents as training data. Natural language processing techniques will be used to perform the analysis.
The goal of this milestone report is to demonstrate familiarity with the data and to show that work toward the prediction algorithm is on track. The report, published on RPubs, explains the exploratory analysis and the goals for the eventual app and algorithm. It is kept concise: it covers only the major features identified in the data so far and briefly summarizes the plan for the prediction algorithm and Shiny app in a way that a non-data-scientist manager can understand, using tables and plots to illustrate important summaries of the data set.
The motivation for this project is to demonstrate that the data has been downloaded and successfully loaded, to produce basic summary statistics about the data sets, to report any interesting findings identified so far, and to gather feedback on the plans for the prediction algorithm and Shiny app.
Loading the required libraries and creating support functions.
## Loading the package 'BBmisc'
if(suppressMessages(!require('BBmisc'))) install.packages('BBmisc')
suppressMessages(library('BBmisc'))
pkgs <- c('tufte', 'knitr', 'rmarkdown', 'lubridate', 'plyr', 'dplyr', 'magrittr', 'purrr', 'stringr', 'stringi', 'wordcloud', 'slam', 'tm', 'igraph', 'NLP', 'xtable', 'SnowballC', 'rpart', 'RWeka', 'RColorBrewer', 'rvest', 'parallel', 'doParallel', 'ggplot2', 'googleVis', 'htmltools', 'rCharts', 'janeaustenr', 'syuzhet', 'viridis')
# Set CRAN mirror to avoid 'trying to use CRAN without setting a mirror' error
options(repos = c(CRAN = "https://cran.rstudio.com/"))
suppressAll(lib(pkgs))
## fallback in case BBmisc::lib() doesn't work
suppressAll(plyr::l_ply(pkgs, require, quietly = TRUE))
rm(pkgs)
The dataset is downloadable as a zipped file.
# Download the file and put the file in the "data" folder
if (!file.exists("./data")){
dir.create("./data")
}
URL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "./data/Coursera-SwiftKey.zip"
# Download and unzip only if not already done
if (!file.exists(zipFile)) download.file(URL, destfile = zipFile, method = "auto")
if (!dir.exists("./data/final")) unzip(zipfile = zipFile, exdir = "./data")
From the information above we know the contents of the zip archive; we now list the documents used for this milestone report together with a summary of the files.
## Load plyr and dplyr packages again
suppressAll(library('plyr'))
suppressAll(library('dplyr'))
## files for this milestone report
lsfiles <- list.files('data/final/en_US')
lsfiles
## [1] "en_US.blogs.txt" "en_US.corpus.rds" "en_US.corpus.txt"
## [4] "en_US.news.txt" "en_US.twitter.txt"
## summary of files
datafiles <- paste0('data/final/en_US/', lsfiles)
rm(lsfiles)
rbind_all <- llply(as.list(datafiles), file.info)
rbind_all
## [[1]]
## size isdir mode mtime
## data/final/en_US/en_US.blogs.txt 210160014 FALSE 666 2025-11-16 15:52:46
## ctime atime exe
## data/final/en_US/en_US.blogs.txt 2025-11-16 15:20:34 2025-11-16 15:52:46 no
## uname udomain
## data/final/en_US/en_US.blogs.txt Admin DESKTOP-C97DV80
##
## [[2]]
## size isdir mode mtime
## data/final/en_US/en_US.corpus.rds 2814359 FALSE 666 2025-11-16 15:44:30
## ctime atime exe
## data/final/en_US/en_US.corpus.rds 2025-11-16 15:34:39 2025-11-16 15:44:30 no
## uname udomain
## data/final/en_US/en_US.corpus.rds Admin DESKTOP-C97DV80
##
## [[3]]
## size isdir mode mtime
## data/final/en_US/en_US.corpus.txt 4322111 FALSE 666 2025-11-16 15:44:31
## ctime atime exe
## data/final/en_US/en_US.corpus.txt 2025-11-16 15:34:40 2025-11-16 15:44:31 no
## uname udomain
## data/final/en_US/en_US.corpus.txt Admin DESKTOP-C97DV80
##
## [[4]]
## size isdir mode mtime
## data/final/en_US/en_US.news.txt 205811889 FALSE 666 2025-11-16 15:52:45
## ctime atime exe
## data/final/en_US/en_US.news.txt 2025-11-16 15:20:32 2025-11-16 15:52:45 no
## uname udomain
## data/final/en_US/en_US.news.txt Admin DESKTOP-C97DV80
##
## [[5]]
## size isdir mode mtime
## data/final/en_US/en_US.twitter.txt 167105338 FALSE 666 2025-11-16 15:52:43
## ctime atime exe
## data/final/en_US/en_US.twitter.txt 2025-11-16 15:20:31 2025-11-16 15:52:43 no
## uname udomain
## data/final/en_US/en_US.twitter.txt Admin DESKTOP-C97DV80
# Set working directory to where the text files are
# Note: setwd() in Rmd is complex. It's often better to use relative paths.
# We will assume the data is in 'data/final/en_US/' relative to the .Rmd file
blogs <- readLines("data/final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("data/final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("data/final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
library(stringi) # string statistics (line, character and word counts)
# Size of Files file.size
size_blogs <- file.info("data/final/en_US/en_US.blogs.txt")$size / 1024^2 # Megabytes
size_news <- file.info("data/final/en_US/en_US.news.txt")$size / 1024^2 # Megabytes
size_twitter <- file.info("data/final/en_US/en_US.twitter.txt")$size / 1024^2 # Megabytes
# Number of Lines num.lines
len_blogs <- length(blogs)
len_news <- length(news)
len_twitter <- length(twitter)
# Number of characters
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))
# Counting the Words (num.words)
nword_blogs <- sum(stri_count_words(blogs))
nword_news <- sum(stri_count_words(news))
nword_twitter <-sum(stri_count_words(twitter))
# create table
data.frame(file.name = c("blogs", "news", "twitter"),
files.size.MB = c(size_blogs,size_news,size_twitter),
num.lines = c(len_blogs,len_news,len_twitter),
num.character = c(nchar_blogs,nchar_news,nchar_twitter),
num.words = c(nword_blogs,nword_news,nword_twitter))
## file.name files.size.MB num.lines num.character num.words
## 1 blogs 200.4242 899288 206824505 37546806
## 2 news 196.2775 1010206 203214543 34761151
## 3 twitter 159.3641 2360148 162096031 30096649
Remove all non-English characters and then compile a sample data set composed of 1% of each of the three original data sets.
# assign sample size
sampleSize = 0.01
set.seed(12345)
blogs1 <-iconv(blogs,"latin1","ASCII",sub="")
news1 <-iconv(news,"latin1","ASCII",sub="")
twitter1 <-iconv(twitter,"latin1","ASCII",sub="")
# sample data set only 1% of each file
sample_data <-c(sample(blogs1,length(blogs1)*sampleSize),
sample(news1,length(news1)*sampleSize),
sample(twitter1,length(twitter1)*sampleSize))
Since the full data sets are too large to process efficiently, the sample() function is used to draw 1% of each file.
The next step is to create a corpus from the sampled data set and apply the following transformations to each document: remove punctuation, strip extra whitespace, convert to lowercase, remove numbers, remove English stop words (a, as, at, so, etc.), remove profanity using a published bad-words list, and replace URLs, Twitter handles, and email addresses with spaces.
library(stringi)
library(kableExtra)
library(tm) # Text mining
library(NLP)
# download bad words file
badWordsURL <- "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
badWordsFile <- "data/badwords_en.txt"
if (!file.exists('data')) {
dir.create('data')
}
if (!file.exists(badWordsFile)) {
download.file(badWordsURL, destfile = badWordsFile, method = "auto")
}
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- VCorpus(VectorSource(sample_data))
corpus1 <- tm_map(corpus,removePunctuation)
corpus2 <- tm_map(corpus1,stripWhitespace)
corpus3 <- tm_map(corpus2, content_transformer(tolower)) # convert to lowercase
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- corpus4 # documents are already PlainTextDocument objects, no conversion needed
#removing stop words in English (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5,removeWords,stopwords("english"))
# remove profane words from the sample data set
con <- file(badWordsFile, open = "r")
profanity <- readLines(con, encoding = "UTF-8")
close(con)
profanity <- iconv(profanity, "latin1", "ASCII", sub = "")
corpus7 <- tm_map(corpus6, removeWords, profanity)
# remove URL, Twitter handle and email patterns
# (note: these patterns work best when applied before punctuation is removed)
corpus8 <- tm_map(corpus7, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus9 <- tm_map(corpus8, toSpace, "@[^\\s]+")
corpus10 <- tm_map(corpus9, toSpace, "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b")
# build the corpus and write to disk (RDS)
saveRDS(corpus10, file = "data/final/en_US/en_US.corpus.rds")
# convert corpus to a dataframe and write lines/words to disk (text)
corpusText <- data.frame(text = unlist(sapply(corpus10, '[', "content")), stringsAsFactors = FALSE)
con <- file("data/final/en_US/en_US.corpus.txt", open = "w")
writeLines(corpusText$text, con)
close(con)
kable(head(corpusText$text, 10),
row.names = FALSE,
col.names = NULL,
align = c("l"),
caption = "First 10 Documents") %>% kable_styling(position = "left")
| put another way spirit sites mission |
| regrets youth |
| tom see |
| see fault evolution |
| seriously wells youngs bull crap selling uk next year get sorted want drinking christmas |
| answer pretty straightforward need muscle biopsy painful muscles yes know something looking forward however place can find virus will happy help find virus causing severe disabling disease mitochondria |
| found ad lucky one left |
| teaspoon grated lemon zest okay didnt lemon used clementine zest instead |
| occasionally wonderful talking heads lifetime momentyou know beautiful house beautiful wife get moment |
| curious started looking information surrounding brownsville revival like toronto blessing brownsville revival similar manifestations holy spirit following lakeland revival |
# remove variables no longer needed to free up memory
rm(corpus, corpus1, corpus2, corpus3, corpus4, corpus5, corpus6, corpus7, corpus8, corpus9)
rm(blogs, blogs1, news, news1, twitter, twitter1, sample_data)
In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are single words, bigrams are two-word combinations, and trigrams are three-word combinations. For example, the phrase "thanks for the follow" yields the bigrams "thanks for", "for the", and "the follow".
The following functions are used to extract unigrams, bigrams, and trigrams from the text corpus using RWeka.
library(RWeka)
# tokenizer - create unigrams, bigrams, trigrams
# I use the RWeka package to construct functions that tokenize the sample
# and construct matrices of uniqrams, bigrams, and trigrams.
one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
one_table<-TermDocumentMatrix(corpus10,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus10,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus10,control=list(tokenize=thr))
# Then I find the frequency of terms in each of these 3 matrices
# and construct dataframes of these frequencies.
one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
## Word frequency
## will will 3183
## just just 3017
## said said 2970
## one one 2799
## like like 2698
## can can 2426
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
## Word frequency
## right now right now 245
## cant wait cant wait 213
## dont know dont know 210
## new york new york 180
## last year last year 164
## im going im going 147
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
## Word frequency
## cant wait see cant wait see 51
## happy mothers day happy mothers day 33
## call call call call call call 23
## happy new year happy new year 21
## let us know let us know 19
## italy lakes holidays italy lakes holidays 18
A word cloud gives a quick visual overview of the most frequent words in the sample corpus.
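As a minimal sketch, the code below draws such a word cloud from the unigram frequency table one_corpus_sort built above, using the wordcloud and RColorBrewer packages loaded earlier; the 100-word limit and the colour palette are arbitrary choices.

```r
# word cloud of the most frequent unigrams (sketch; word limit and palette are arbitrary)
library(wordcloud)
library(RColorBrewer)
set.seed(12345)
wordcloud(words = as.character(one_corpus_sort$Word),
          freq = one_corpus_sort$frequency,
          max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
```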
The frequency distribution of each n-gram category is visualized in three bar plots.
library(ggplot2) #visualization
one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity") + scale_fill_gradient(low = "yellow", high = "red", na.value = NA)
one_g<-one_g+labs(title="Unigrams",x="Words",y="Frequency")
one_g<-one_g+theme(axis.text.x=element_text(angle=90))
one_g
two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity") + scale_fill_gradient(low = "yellow", high = "red", na.value = NA)
two_g<-two_g+labs(title="Bigrams",x="Words",y="Frequency")
two_g<-two_g+theme(axis.text.x=element_text(angle=90))
two_g
thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
thr_g<-thr_g+geom_bar(stat="identity") + scale_fill_gradient(low = "yellow", high = "red", na.value = NA)
thr_g<-thr_g+labs(title="Trigrams",x="Words",y="Frequency")
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90))
thr_g
Ideas: use a text input box as the user interface of the Shiny app, with the predicted next word displayed beneath it (a minimal sketch follows).
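As a rough illustration of this idea, and not the final app, the sketch below wires a text input box to a text output; it assumes a predict_next_word() helper, a naive version of which is sketched under Next Steps below.

```r
# minimal Shiny sketch: phrase in via a text box, predicted next word out
# (predict_next_word() is assumed to exist; see the naive sketch under Next Steps)
library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:", value = ""),
  h4("Predicted next word:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(trimws(input$phrase)) == 0) return("")
    predict_next_word(input$phrase)
  })
}

shinyApp(ui = ui, server = server)
```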
Next Steps: build the n-gram prediction model from the tokenized data, using a back-off strategy so that shorter n-grams are consulted when no longer match is found; evaluate the model's accuracy and memory footprint; and wrap it in the Shiny app described above. A naive sketch of the intended back-off lookup is shown below.
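As a minimal sketch only, not the final algorithm, the function below performs a naive back-off lookup against the trigram, bigram, and unigram frequency tables built earlier (thr_corpus_sort, two_corpus_sort, one_corpus_sort); the production model will need smoothing and a more compact data structure.

```r
# naive back-off lookup sketch (not the final model):
# try trigrams first, then bigrams, then fall back to the most frequent unigram
predict_next_word <- function(phrase) {
  words <- unlist(strsplit(trimws(tolower(phrase)), "\\s+"))
  n <- length(words)
  if (n >= 2) {
    # trigrams whose first two words match the last two words of the phrase
    prefix <- paste(words[n - 1], words[n])
    hits <- thr_corpus_sort[startsWith(as.character(thr_corpus_sort$Word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Word[1])))
  }
  if (n >= 1) {
    # back off to bigrams whose first word matches the last word of the phrase
    hits <- two_corpus_sort[startsWith(as.character(two_corpus_sort$Word), paste0(words[n], " ")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Word[1])))
  }
  as.character(one_corpus_sort$Word[1])  # most frequent single word as a last resort
}

predict_next_word("I cant wait")  # e.g. returns "see" with the sampled data above
```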