library(knitr)
library(stringi) # string statistics (word and character counts)
library(tm) # Text mining
library(NLP)
library(rJava) #tokenizer prerequisite
library(RWeka)# tokenizer - create unigrams, bigrams, trigrams
# Java must be installed for this package to run. Check your R version with R.version and install the matching version of Java. No other setup is necessary.
library(ggplot2) #visualization
The ultimate goal of the capstone project is to demonstrate the skills developed during the Data Science specialization. To do this, we will create an interface for a text prediction algorithm: a Shiny app, accessible to others, that takes an input (a word or phrase) and predicts the following word.
The goal of this assignment is not only to demonstrate progress toward that final product but also to provide a brief summary of the data being used for the predictive text algorithm.
setwd("C:/Users/sgras/OneDrive/Documents/School/WNTR 2024/Capstone Project/Coursera-SwiftKey/final/en_US")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
Now that we have the data in hand, we can take a step back and look at some characteristics of our dataset. The following code reports the file size, the number of lines, the total number of characters, and the number of words in each file.
# Size of files (files.size.MB)
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2 # Megabytes
size_news <- file.info("en_US.news.txt")$size / 1024^2 # Megabytes
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2 # Megabytes
# Number of Lines num.lines
len_blogs <- length(blogs)
# 899,288 lines
len_news <- length(news)
# 1,010,242 lines
len_twitter <- length(twitter)
# 2,360,148 lines
# Number of characters
nchar_blogs <- sum(nchar(blogs))
#206824505
nchar_news <- sum(nchar(news))
#15639408
nchar_twitter <- sum(nchar(twitter))
#162096031
# Counting the Words (num.words)
nword_blogs <- sum(stri_count_words(blogs))
# blogs: 37,546,246 words
nword_news <- sum(stri_count_words(news))
# news: 34,762,395 words
nword_twitter <- sum(stri_count_words(twitter))
# twitter: 30,093,410 words
# create table
data.frame(file.name = c("blogs", "news", "twitter"),
files.size.MB = c(size_blogs,size_news,size_twitter),
num.lines = c(len_blogs,len_news,len_twitter),
num.character = c(nchar_blogs,nchar_news,nchar_twitter),
num.words = c(nword_blogs,nword_news,nword_twitter))
## file.name files.size.MB num.lines num.character num.words
## 1 blogs NA 899288 206824505 37546806
## 2 news NA 77259 15639408 2674561
## 3 twitter NA 2360148 162096031 30096649
# Here we're simply creating a table for a summary overview of the data involved (see above for specifics)
Remove all non-English characters and then compile a sample dataset that is composed of 5% of each of the 3 original datasets.
set.seed(12345)
blogs1 <-iconv(blogs,"latin1","ASCII",sub="")
news1 <-iconv(news,"latin1","ASCII",sub="")
twitter1 <-iconv(twitter,"latin1","ASCII",sub="")
# sample data set: 5% of each file
sample_data <-c(sample(blogs1,length(blogs1)*0.05),
sample(news1,length(news1)*0.05),
sample(twitter1,length(twitter1)*0.05))
Since the raw datasets are simply too big for processing, I will sample 5% of the data from each file, which is still more than sufficient for our needs.
corpus <- VCorpus(VectorSource(sample_data))
# Remove punctuation
corpus1 <- tm_map(corpus,removePunctuation)
# Remove extra white space
corpus2 <- tm_map(corpus1,stripWhitespace)
# Convert to lowercase
corpus3 <- tm_map(corpus2,tolower)
# Remove numbers
corpus4 <- tm_map(corpus3,removeNumbers)
# Convert back to plain text documents
corpus5 <- tm_map(corpus4,PlainTextDocument)
# Remove English stop words (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5,removeWords,stopwords("english"))
In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are effectively single words, bigrams are two-word combinations, and trigrams are three-word combinations.
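For example, here is a minimal illustration of how the three tokenizations differ, using RWeka's NGramTokenizer on a toy sentence (the sentence is made up for illustration and is not drawn from the corpus):
sentence <- "the quick brown fox jumps"
NGramTokenizer(sentence, Weka_control(min = 1, max = 1)) # "the" "quick" "brown" "fox" "jumps"
NGramTokenizer(sentence, Weka_control(min = 2, max = 2)) # "the quick" "quick brown" "brown fox" "fox jumps"
NGramTokenizer(sentence, Weka_control(min = 3, max = 3)) # "the quick brown" "quick brown fox" "brown fox jumps"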
The following functions are used to extract 1-grams, 2-grams, and 3-grams from the finalized text corpus (corpus6) using RWeka.
# The RWeka package is used to build tokenizer functions and construct term-document matrices of unigrams, bigrams, and trigrams.
one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
#Then I find the frequency of terms in each of these 3 matrices and construct dataframes of these frequencies.
one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
## Word frequency
## just just 12839
## like like 11203
## will will 10928
## one one 10447
## can can 9586
## get get 9326
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
## Word frequency
## right now right now 1115
## cant wait cant wait 958
## dont know dont know 821
## last night last night 741
## im going im going 615
## feel like feel like 577
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
## Word frequency
## cant wait see cant wait see 170
## happy mothers day happy mothers day 152
## let us know let us know 117
## happy new year happy new year 96
## im pretty sure im pretty sure 74
## feel like im feel like im 63
The frequency distributions of the three n-gram categories are visualized below in three bar plots, with a color gradient supplementing the frequency.
one_g<-ggplot(one_corpus_sort[1:30,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Top 30 Unigrams",x="Words",y="Frequency")
one_g<-one_g+coord_flip()
one_g<-one_g+ scale_fill_gradient(low = "blue", high = "red")
one_g
two_g<-ggplot(two_corpus_sort[1:20,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Top 20 Bigrams",x="Words",y="Frequency")
two_g<-two_g+coord_flip()
two_g<-two_g+ scale_fill_gradient(low = "blue", high = "red")
two_g
three_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
three_g<-three_g+geom_bar(stat="identity")
three_g<-three_g+labs(title="Top 10 Trigrams",x="Words",y="Frequency")
three_g<-three_g+coord_flip()
three_g<-three_g+ scale_fill_gradient(low = "blue", high = "red")
three_g
Ideas: Use a text input box as the user interface of the Shiny app (a rough sketch is given below).
Next Steps: 1. Build a predictive algorithm. 2. Build a Shiny app that suggests the most likely next word after a phrase is typed. 3. Prepare a pitch for the app and publish it on the shinyapps.io server.
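As a rough sketch of that interface (this is only an assumption about the eventual design, and the predict_next() helper below is a hypothetical placeholder that will eventually look up the typed phrase in the n-gram frequency tables built above):
library(shiny)
# Hypothetical placeholder: the real version will search the bigram/trigram
# frequency tables for the typed phrase and return the most frequent continuation.
predict_next <- function(phrase) {
  if (nchar(trimws(phrase)) == 0) return("")
  "the" # placeholder prediction
}
ui <- fluidPage(
  titlePanel("Next-Word Prediction (sketch)"),
  textInput("phrase", "Type a word or phrase:"),
  h4("Predicted next word:"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText(predict_next(input$phrase))
}
shinyApp(ui = ui, server = server)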