Introduction
The goal of the capstone project is to create a predictive text model using a large text corpus of documents as training data. Natural language processing techniques will be used to perform the analysis.
Install packages and libraries
install.packages("ggplot2", repos = "https://cloud.r-project.org")
##
## There is a binary version available but the source version is later:
## binary source needs_compilation
## ggplot2 3.5.1 4.0.3 FALSE
install.packages("leaflet", repos = "https://cloud.r-project.org")
##
## There is a binary version available but the source version is later:
## binary source needs_compilation
## leaflet 2.2.2 2.2.3 FALSE
install.packages("sf", repos = "https://cloud.r-project.org")
##
## There is a binary version available but the source version is later:
## binary source needs_compilation
## sf 1.0-20 1.1-1 TRUE
install.packages("stringi", repos = "https://cloud.r-project.org")
## package 'stringi' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rmendozam\AppData\Local\Temp\Rtmpu41DjP\downloaded_packages
install.packages("tm", repos = "https://cloud.r-project.org")
##
## There is a binary version available but the source version is later:
## binary source needs_compilation
## tm 0.7-16 0.7-18 TRUE
install.packages("NLP", repos = "https://cloud.r-project.org")
## package 'NLP' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rmendozam\AppData\Local\Temp\Rtmpu41DjP\downloaded_packages
install.packages("RWeka", repos = "https://cloud.r-project.org")
##
## There is a binary version available but the source version is later:
## binary source needs_compilation
## RWeka 0.4-46 0.4-48 FALSE
# Cargar las librerías para empezar a trabajar
library(ggplot2)
library(leaflet)
library(sf)
library(stringi)
library(tm)
library(NLP)
library(RWeka)
Loading Blogs, news, twitter
setwd("~/Coursera/2025/Modulo 10 DS Capstone/Milestone-Report/Data/en_US")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
Sampling
#Size of Files file.size
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2 # Megabytes
size_news <- file.info("en_US.news.txt")$size / 1024^2 # Megabytes
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2 # Megabytes
# Number of Lines num.lines
len_blogs <- length(blogs) # 899,288 lines
len_news <- length(news) # 1,010,242 lines
len_twitter <- length(twitter) # 2,360,148
# Number of characters
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))
# Counting the Words (num.words)
nword_blogs <- sum(stri_count_words(blogs)) # words at blogs = 37,546,246
nword_news <- sum(stri_count_words(news)) # words at news = 34,762,395
nword_twitter <-sum(stri_count_words(twitter)) # words at twitter = 30,093,410
# create table
data.frame(file.name = c("blogs", "news", "twitter"),
files.size.MB = c(size_blogs,size_news,size_twitter),
num.lines = c(len_blogs,len_news,len_twitter),
num.character = c(nchar_blogs,nchar_news,nchar_twitter),
num.words = c(nword_blogs,nword_news,nword_twitter))
## file.name files.size.MB num.lines num.character num.words
## 1 blogs NA 899288 206824505 37546806
## 2 news NA 77259 15639408 2674561
## 3 twitter NA 2360148 162096031 30096649
Sampling
Remove all non-English characters and then compile a sample dataset that is composed of 1% of each of the 3 original datasets.
set.seed(12345)
blogs1 <-iconv(blogs,"latin1","ASCII",sub="")
news1 <-iconv(news,"latin1","ASCII",sub="")
twitter1 <-iconv(twitter,"latin1","ASCII",sub="")
# sample data set only 1% of each file
sample_data <-c(sample(blogs1,length(blogs1)*0.01),
sample(news1,length(news1)*0.01),
sample(twitter1,length(twitter1)*0.01))
Since Data sets is too big for processing, so using sample() function, I sample 1% of each file.
Clean and build corpus
corpus <- VCorpus(VectorSource(sample_data))
corpus1 <- tm_map(corpus,removePunctuation)
corpus2 <- tm_map(corpus1,stripWhitespace)
corpus3 <- tm_map(corpus2,tolower) # Convert to lowercase
corpus4 <- tm_map(corpus3,removeNumbers)
corpus5 <- tm_map(corpus4,PlainTextDocument)
#removing stop words in English (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5,removeWords,stopwords("english"))
Build N-Grams
In Natural Language Processing (NLP), n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are single words. Bigrams are two words combinations. Trigrams are three-word combinations.
The following function is used to extract 1-grams, 2-grams, 3-grams from the text Corpus using RWeka.
#Using RWeka package to construct functions that tokenize the sample and construct matrices of uniqrams, bigrams, and trigrams.
one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
#Then the frequency of terms in each of these 3 matrices and construct dataframes of these frequencies.
one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
## Word frequency
## just just 2576
## like like 2218
## will will 2211
## one one 2049
## get get 1869
## can can 1866
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
## Word frequency
## cant wait cant wait 208
## right now right now 206
## dont know dont know 164
## last night last night 148
## im going im going 130
## feel like feel like 125
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
## Word frequency
## cant wait see cant wait see 45
## happy mothers day happy mothers day 36
## happy new year happy new year 24
## im pretty sure im pretty sure 18
## italy lakes holidays italy lakes holidays 18
## little italy boston little italy boston 17
Exploratory analysis: Graphs and visualizations
The frequency distribution of each n-grams category were visualized into 3 different bar plots.
#Unigram
one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Unigrams",x="Words",y="Frequency")
one_g<-one_g+theme(axis.text.x=element_text(angle=90, color = "red"))
one_g
#Biogram
two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Bigrams",x="Words",y="Frequency")
two_g<-two_g+theme(axis.text.x=element_text(angle=90, color ="orange"))
two_g
#Trigrams
thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
thr_g<-thr_g+geom_bar(stat="identity")
thr_g<-thr_g+labs(title="Trigrams",x="Words",y="Frequency")
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90, color ="green"))
thr_g
Conclusion & Next Steps
Ideas: Use a text input box as the user interface of the Shiny app.
Next Steps:
1.Build a predictive algorithm
2.Build a a Shiny app, that suggest the most likely next word after a phrase is typed
3.Prepare a pitch about the app and publish it at “shinyapps.io” server.