library(knitr)
library(stringi) # string statistics (word and character counts)
library(tm) # Text mining
library(NLP)
library(rJava) #tokenizer prerequisite
library(RWeka)# tokenizer - create unigrams, bigrams, trigrams
# Java must be installed for this package to run. Check your R version with R.version and install the matching version of Java. No other setup is necessary.
library(ggplot2) #visualization
The ultimate goal of the capstone project is to demonstrate the skills developed during the Data Science specialization. To do this, we will create an interface for a text prediction algorithm: a Shiny app, accessible to others, that takes an input (a word or phrase) and predicts the following word.
The goal of this assignment is not only to demonstrate progress toward that final product but also to provide a brief summary of the data being used for the predictive text algorithm.
setwd("C:/Users/sgras/OneDrive/Documents/School/WNTR 2024/Capstone Project/Coursera-SwiftKey/final/en_US")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
Now that we have the data in hand, we can take a step back and look at some characteristics of our dataset. The following code reports the file size, the number of lines, the total number of characters, and the number of words in each file.
# Size of files (files.size.MB)
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2 # Megabytes
size_news <- file.info("en_US.news.txt")$size / 1024^2 # Megabytes
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2 # Megabytes
# Number of Lines num.lines
len_blogs <- length(blogs)
# 899,288 lines
len_news <- length(news)
# 1,010,242 lines
len_twitter <- length(twitter)
# 2,360,148 lines
# Number of characters
nchar_blogs <- sum(nchar(blogs))
#206824505
nchar_news <- sum(nchar(news))
#15639408
nchar_twitter <- sum(nchar(twitter))
#162096031
# Counting the Words (num.words)
nword_blogs <- sum(stri_count_words(blogs))
# blogs: 37,546,246 words
nword_news <- sum(stri_count_words(news))
# news: 34,762,395 words
nword_twitter <- sum(stri_count_words(twitter))
# twitter: 30,093,410 words
# create table
data.frame(file.name = c("blogs", "news", "twitter"),
files.size.MB = c(size_blogs,size_news,size_twitter),
num.lines = c(len_blogs,len_news,len_twitter),
num.character = c(nchar_blogs,nchar_news,nchar_twitter),
num.words = c(nword_blogs,nword_news,nword_twitter))
## file.name files.size.MB num.lines num.character num.words
## 1 blogs NA 899288 206824505 37546806
## 2 news NA 77259 15639408 2674561
## 3 twitter NA 2360148 162096031 30096649
# Here we're simply creating a table for a summary overview of the data involved (see above for specifics)
Remove all non-English characters and then compile a sample dataset that is composed of 5% of each of the 3 original datasets.
set.seed(12345)
blogs1 <-iconv(blogs,"latin1","ASCII",sub="")
news1 <-iconv(news,"latin1","ASCII",sub="")
twitter1 <-iconv(twitter,"latin1","ASCII",sub="")
# sample data set: 5% of each file
sample_data <-c(sample(blogs1,length(blogs1)*0.05),
sample(news1,length(news1)*0.05),
sample(twitter1,length(twitter1)*0.05))
Since the raw datasets are simply too big for processing, I will sample 5% of the data from each file, which is still more than sufficient for our needs.
corpus <- VCorpus(VectorSource(sample_data))
# Remove punctuation
corpus1 <- tm_map(corpus,removePunctuation)
# Remove extra white space
corpus2 <- tm_map(corpus1,stripWhitespace)
# Convert to lowercase
corpus3 <- tm_map(corpus2,tolower)
# Remove numbers
corpus4 <- tm_map(corpus3,removeNumbers)
# Convert back to plain text documents
corpus5 <- tm_map(corpus4,PlainTextDocument)
# Remove English stop words (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5,removeWords,stopwords("english"))
In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are effectively single words, bigrams are two-word combinations, and trigrams are three-word combinations.
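For example, here is a minimal illustration of how the three tokenizations differ, using RWeka's NGramTokenizer on a toy sentence (the sentence is made up for illustration and is not drawn from the corpus):
sentence <- "the quick brown fox jumps"
NGramTokenizer(sentence, Weka_control(min = 1, max = 1)) # "the" "quick" "brown" "fox" "jumps"
NGramTokenizer(sentence, Weka_control(min = 2, max = 2)) # "the quick" "quick brown" "brown fox" "fox jumps"
NGramTokenizer(sentence, Weka_control(min = 3, max = 3)) # "the quick brown" "quick brown fox" "brown fox jumps"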
The following functions are used to extract 1-grams, 2-grams, and 3-grams from the finalized text corpus (corpus6) using RWeka.
# The RWeka package is used to build tokenizer functions and construct term-document matrices of unigrams, bigrams, and trigrams.
one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
#Then I find the frequency of terms in each of these 3 matrices and construct dataframes of these frequencies.
one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
## Word frequency
## just just 12839
## like like 11203
## will will 10928
## one one 10447
## can can 9586
## get get 9326
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
## Word frequency
## right now right now 1115
## cant wait cant wait 958
## dont know dont know 821
## last night last night 741
## im going im going 615
## feel like feel like 577
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
## Word frequency
## cant wait see cant wait see 170
## happy mothers day happy mothers day 152
## let us know let us know 117
## happy new year happy new year 96
## im pretty sure im pretty sure 74
## feel like im feel like im 63
The frequency distributions of the three n-gram categories are visualized below in three bar plots, with a color gradient supplementing the frequency.
one_g<-ggplot(one_corpus_sort[1:30,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Top 30 Unigrams",x="Words",y="Frequency")
one_g<-one_g+coord_flip()
one_g<-one_g+ scale_fill_gradient(low = "blue", high = "red")
one_g
two_g<-ggplot(two_corpus_sort[1:20,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Top 20 Bigrams",x="Words",y="Frequency")
two_g<-two_g+coord_flip()
two_g<-two_g+ scale_fill_gradient(low = "blue", high = "red")
two_g
three_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
three_g<-three_g+geom_bar(stat="identity")
three_g<-three_g+labs(title="Top 10 Trigrams",x="Words",y="Frequency")
three_g<-three_g+coord_flip()
three_g<-three_g+ scale_fill_gradient(low = "blue", high = "red")
three_g
Ideas: Use a text input box as the user interface of the Shiny app (a rough sketch is given below).
Next Steps: 1. Build a predictive algorithm. 2. Build a Shiny app that suggests the most likely next word after a phrase is typed. 3. Prepare a pitch for the app and publish it on the shinyapps.io server.
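As a rough sketch of that interface (this is only an assumption about the eventual design, and the predict_next() helper below is a hypothetical placeholder that will eventually look up the typed phrase in the n-gram frequency tables built above):
library(shiny)
# Hypothetical placeholder: the real version will search the bigram/trigram
# frequency tables for the typed phrase and return the most frequent continuation.
predict_next <- function(phrase) {
  if (nchar(trimws(phrase)) == 0) return("")
  "the" # placeholder prediction
}
ui <- fluidPage(
  titlePanel("Next-Word Prediction (sketch)"),
  textInput("phrase", "Type a word or phrase:"),
  h4("Predicted next word:"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText(predict_next(input$phrase))
}
shinyApp(ui = ui, server = server)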