Introduction

The motivation for this project is to:

1. Demonstrate that the data have been downloaded and loaded successfully.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on the plans for the prediction algorithm and Shiny app.

Loading (Blogs, News, Twitter)

setwd("/Users/cuongpham/library/CloudStorage/Dropbox/Data work/Data Science/Course 10-Capstone/Capstone project/data/en_US")

blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")

Summary Statistics

library(stringi) 
# Get the size of each file in MB
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2 
size_news <- file.info("en_US.news.txt")$size  / 1024^2 
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2 

# Get the number of lines in each file
len_blogs <- length(blogs) 
len_news <- length(news)  
len_twitter <- length(twitter) 

# Get the number of characters
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))

# Count the words in each file
nword_blogs <- sum(stri_count_words(blogs)) 
nword_news <- sum(stri_count_words(news))  
nword_twitter <-sum(stri_count_words(twitter)) 

# Create a summary table of the three files
data.frame(file.name = c("blogs", "news", "twitter"),
           files.size.MB = c(size_blogs, size_news, size_twitter),
           num.lines = c(len_blogs, len_news, len_twitter),
           num.character = c(nchar_blogs, nchar_news, nchar_twitter),
           num.words = c(nword_blogs, nword_news, nword_twitter))
##   file.name files.size.MB num.lines num.character num.words
## 1     blogs            NA    899288     206824505  37546250
## 2      news            NA   1010242     203223159  34762395
## 3   twitter            NA   2360148     162096031  30093372

Sampling

Remove all non-ASCII characters and then compile a sample data set composed of 5% of each of the three original data sets.

set.seed(12345)
blogs1 <- iconv(blogs, "latin1", "ASCII", sub = "")
news1 <- iconv(news, "latin1", "ASCII", sub = "")
twitter1 <- iconv(twitter, "latin1", "ASCII", sub = "")

# Sample 5% of each file
sample_data <- c(sample(blogs1, length(blogs1) * 0.05),
                 sample(news1, length(news1) * 0.05),
                 sample(twitter1, length(twitter1) * 0.05))

A sample of 5% of each file will be used for analysis.
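
Before building the corpus, a quick sanity check (using the objects defined above) shows how much text the 5% sample retains:

# Number of lines and total words in the combined sample
length(sample_data)
sum(stri_count_words(sample_data))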

Clean and Build Corpus

library(tm) # Text mining
## Loading required package: NLP
library(NLP)

corpus <- VCorpus(VectorSource(sample_data))
corpus1 <- tm_map(corpus, removePunctuation)   # remove punctuation
corpus2 <- tm_map(corpus1, stripWhitespace)    # collapse extra whitespace
corpus3 <- tm_map(corpus2, tolower)            # convert to lower case
corpus4 <- tm_map(corpus3, removeNumbers)      # remove digits
corpus5 <- tm_map(corpus4, PlainTextDocument)  # coerce back to plain text documents
# Remove English stop words (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5, removeWords, stopwords("english"))
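
To spot-check the cleaning, a few documents of the resulting corpus can be printed (a minimal check, assuming corpus6 as built above):

# Show the cleaned text of the first three sample documents
lapply(corpus6[1:3], as.character)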

Build N-Grams

In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are single words, bigrams are two-word combinations, and trigrams are three-word combinations.
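
For example, the sentence "thanks for the follow" yields the bigrams "thanks for", "for the", and "the follow". A toy base-R illustration (not part of the pipeline):

# Build bigrams of a short sentence by pasting each word with its successor
toks <- c("thanks", "for", "the", "follow")
paste(toks[-length(toks)], toks[-1])  # "thanks for" "for the" "the follow"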

The following functions are used to extract 1-grams, 2-grams, and 3-grams from the text corpus using RWeka.

library(RWeka)

# Construct functions that tokenize the sample and build matrices of unigrams, bigrams, and trigrams.

one <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
two <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
thr <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

one_table <- TermDocumentMatrix(corpus6, control = list(tokenize = one))
two_table <- TermDocumentMatrix(corpus6, control = list(tokenize = two))
thr_table <- TermDocumentMatrix(corpus6, control = list(tokenize = thr))

# Get the frequency of terms in each of these three matrices and construct data frames of these frequencies.

one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)

one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
##      Word frequency
## will will     15816
## said said     15369
## just just     15118
## one   one     14265
## like like     13556
## can   can     12163
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
##                  Word frequency
## right now   right now      1278
## cant wait   cant wait       969
## dont know   dont know       969
## last year   last year       967
## new york     new york       947
## last night last night       769
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
##                                            Word frequency
## cant wait see                     cant wait see       175
## happy mothers day             happy mothers day       151
## let us know                         let us know       127
## township board district township board district       126
## new york city                     new york city       107
## happy new year                   happy new year       100
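
The three blocks above repeat the same steps; a small helper function like the following (a sketch with a hypothetical name, get_freq_df) could replace them:

# Sketch: filter a TermDocumentMatrix by a minimum term frequency and
# return a data frame sorted by descending frequency
get_freq_df <- function(tdm, lowfreq) {
  terms <- findFreqTerms(tdm, lowfreq = lowfreq)
  counts <- rowSums(as.matrix(tdm[terms, ]))
  freq <- data.frame(Word = names(counts), frequency = counts)
  freq[order(-freq$frequency), ]
}
# e.g. one_corpus_sort <- get_freq_df(one_table, 1000)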

Exploratory Analysis (Graphs & Visualizations)

The frequency distributions of the unigrams, bigrams, and trigrams are visualized in three bar plots.

library(ggplot2) #visualization
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Unigrams",x="Words",y="Frequency")
one_g<-one_g+theme(axis.text.x=element_text(angle=90))
one_g

two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Bigrams",x="Words",y="Frequency")
two_g<-two_g+theme(axis.text.x=element_text(angle=90))
two_g

thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
thr_g<-thr_g+geom_bar(stat="identity")
thr_g<-thr_g+labs(title="Trigrams",x="Words",y="Frequency")
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90))
thr_g

Next Steps

My next steps involve developing a predictive algorithm capable of suggesting the most likely next word after a given phrase is typed. Once the algorithm is in place, I will build a Shiny app that integrates the prediction functionality and publish it on the shinyapps.io server, making it readily accessible to the public.
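
As a rough sketch of the core idea (not the final implementation), a simple lookup over the bigram table built above could already suggest a next word; the eventual algorithm will need back-off to lower-order n-grams and smoothing for unseen phrases:

# Minimal sketch; assumes two_corpus_sort from above, with a hypothetical name predict_next
predict_next <- function(word, bigrams = two_corpus_sort, n = 3) {
  pattern <- paste0("^", tolower(word), " ")
  matches <- bigrams[grepl(pattern, bigrams$Word), ]
  if (nrow(matches) == 0) return(character(0))
  # bigrams are already sorted by frequency, so take the top n and keep the second word
  sub(pattern, "", head(as.character(matches$Word), n))
}
# e.g. predict_next("right") should return "now" first, given the frequencies above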