This report is part of the Data Science Capstone project and covers the exploratory data analysis of the three documents provided (blogs, news, twitter). A corpus is created and pre-processed, then word and phrase frequencies are determined and examined before a prediction model is built.
#load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringi)
library(corpus)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.1.3
## java.home option:
## JAVA_HOME environment variable: .;C:\Program Files\Java\jre1.8.0_311\bin;
## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.
The data files were downloaded and unzipped into the working directory. In this project we work with the three English text files, namely the ‘blogs’, ‘news’ and ‘twitter’ files.
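For completeness, the download and unzip step can be scripted as below. This is a sketch rather than the code used for the report, and the dataset URL shown is an assumption (the standard Coursera-SwiftKey link), not taken from the original.
#sketch of the one-off download/unzip step; the URL is an assumption, not from the original report
zip_url<- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file<- "Coursera-SwiftKey.zip"
if(!file.exists(zip_file)){
    download.file(zip_url, destfile= zip_file, mode= "wb")
    unzip(zip_file)
}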
folder<- "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US"
filelist<- list.files(path=folder, pattern="*.txt")
paste(folder, "\\", filelist, sep= "")
## [1] "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt"
## [2] "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US\\en_US.news.txt"
## [3] "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt"
files<- lapply(filepaths, FUN=readLines)
## Warning in FUN(X[[i]], ...): incomplete final line found on 'en_US.news.txt'
## Warning in FUN(X[[i]], ...): line 167155 appears to contain an embedded nul
## Warning in FUN(X[[i]], ...): line 268547 appears to contain an embedded nul
## Warning in FUN(X[[i]], ...): line 1274086 appears to contain an embedded nul
## Warning in FUN(X[[i]], ...): line 1759032 appears to contain an embedded nul
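The warnings above come from the news file, which contains embedded nul characters and lacks a final newline. If they need to be avoided, readLines() accepts skipNul = TRUE and warn = FALSE; a sketch follows (the counts below were produced without it, and files_clean is a hypothetical name):
#optional re-read that drops embedded nuls and silences the warnings (files_clean is a hypothetical name)
files_clean<- lapply(filepaths, FUN= function(f) readLines(f, skipNul= TRUE, warn= FALSE))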
A basic summary is generated by computing the total line count, total word count, and the average and maximum word count per entry for each of the three files. The summary is shown below the code:
#compute line and word count statistics for each file
summary<- data.frame()
for(file in files){
    linecount<- length(file)
    wordcount<- stri_count_words(file)
    tot_wordcount<- sum(wordcount)
    avg_wordcount<- mean(wordcount)
    max_wordcount<- max(wordcount)
    output<- c(linecount, tot_wordcount, avg_wordcount, max_wordcount)
    summary<- rbind(summary, output)
}
colnames(summary)<- c('linecount', 'total wordcount', 'average wordcount', 'max wordcount')
rownames(summary)<- c('Blog', 'News', 'Twitter')
summary
## linecount total wordcount average wordcount max wordcount
## Blog 899288 38154238 42.42716 6726
## News 77259 2693898 34.86840 1123
## Twitter 2360148 30218125 12.80349 60
From the summary above, the twitter file has the highest line count, followed by the blog file. However, the blog file has the highest total word count as well as the highest average and maximum word count per entry, while the twitter file has the lowest average and maximum word count per entry despite having the most lines.
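To back up the file-size point made next, the raw sizes on disk can be checked directly; a minimal sketch, assuming the filepaths vector built earlier:
#raw file sizes in MB (a quick check; uses the filepaths vector from above)
round(file.size(filepaths)/1024^2, 1)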
As each file is quite large, we proceed with only 3% of the lines from each file, sampled at random, and then combine the samples into a corpus for further processing and analysis.
library(tm)
## Loading required package: NLP
set.seed(1)
#randomly sample 3% of the lines from each file and collapse them into one document
corpus_samples<- list()
for (file in files){
    sampled<- sample(file, size= length(file)*0.03)
    sampled<- paste(sampled, collapse= " ")
    corpus_samples<- append(corpus_samples, sampled)
}
#create corpus
corpus_text<- VCorpus(VectorSource(corpus_samples))
The text is cleaned by replacing problem characters with spaces, converting all letters to lower case, and removing punctuation, numbers, English stop words, stray single characters and extra whitespace.
library(gsubfn)
## Loading required package: proto
toSpace <- content_transformer(function(x, pattern){
    return (gsub(pattern, " ", x))})
#replace hyphens, apostrophes and mojibake sequences (mis-encoded quotes/symbols) with spaces
corpus_text<- tm_map(corpus_text, toSpace, "-")
corpus_text<- tm_map(corpus_text, toSpace, "'")
corpus_text<- tm_map(corpus_text, toSpace, "â€")
corpus_text<- tm_map(corpus_text, toSpace, "Å“")
corpus_text<- tm_map(corpus_text, toSpace, "â„¢")
corpus_text<- tm_map(corpus_text, toSpace, "¦ ¦ ¦")
#change to lower case first so that capitalised stop words are also removed
corpus_text<- tm_map(corpus_text, content_transformer(tolower))
#remove punctuation
corpus_text<- tm_map(corpus_text, removePunctuation)
#remove digits
corpus_text<- tm_map(corpus_text, removeNumbers)
#remove stopwords
corpus_text<- tm_map(corpus_text, removeWords, stopwords("english"))
#remove stray single characters left behind
corpus_text<- tm_map(corpus_text, toSpace, "\\b[a-z]\\b")
#remove extra white space
corpus_text<- tm_map(corpus_text, stripWhitespace)
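To verify that the cleaning worked as intended, a short excerpt of the first (blog) document can be printed; a minimal sketch using tm's accessor:
#quick check of the cleaning: first 300 characters of the blog sample
substr(as.character(corpus_text[[1]]), 1, 300)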
According to Wikipedia, an n-gram is ‘a contiguous sequence of n items from a given sample of text or speech’. In the following, the frequencies of unigrams (single words), 2-grams (two-word phrases) and 3-grams (three-word phrases) are analyzed.
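As a small illustration of what the tokenizer used below produces, RWeka's NGramTokenizer can be applied to a made-up sentence (the sentence is only an example):
#toy example: the 2-grams of a short made-up sentence
NGramTokenizer("thanks for the follow", Weka_control(min=2, max=2))
#expected: "thanks for" "for the" "the follow"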
We first generate a term-document matrix using the relevant tokenizer and apply a sparsity threshold. The frequency of each term is then collected in a data frame, and bar charts of the top 15 unigrams, 2-gram phrases and 3-gram phrases are plotted for easier visualization.
#set functions for the 1-gram, 2-gram and 3-gram tokenizers
UniGramTokenizer<- function(x) NGramTokenizer(x, control= Weka_control(min=1, max=1))
TwoGramTokenizer<- function(x) NGramTokenizer(x, control= Weka_control(min=2, max=2))
ThreeGramTokenizer<- function(x) NGramTokenizer(x, control= Weka_control(min=3, max=3))
#function to drop sparse terms, compute term frequencies and return them as a data frame sorted in descending order
Ngram_freq<- function(tdm, sparsity){
    m<- as.matrix(removeSparseTerms(tdm, sparsity))
    freq<- sort(rowSums(m), decreasing=TRUE)
    return(data.frame(word= names(freq), freq=freq))
}
#1-gram
OneGram_tdm<- TermDocumentMatrix(corpus_text,
control=list(tokenize=UniGramTokenizer))
OneGram_freq<- Ngram_freq(OneGram_tdm, 0.99)
#2-gram
TwoGram_tdm<- TermDocumentMatrix(corpus_text,
control= list(tokenize= TwoGramTokenizer))
TwoGram_freq<- Ngram_freq(TwoGram_tdm, 0.99)
#3-gram
ThreeGram_tdm<- TermDocumentMatrix(corpus_text,
control= list(tokenize= ThreeGramTokenizer))
ThreeGram_freq<- Ngram_freq(ThreeGram_tdm, 0.99)
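As an additional way to look at the unigram frequencies beyond the top-15 ranking plotted next, the sketch below estimates how many distinct words are needed to cover 50% and 90% of all word instances in the sample; it only uses the OneGram_freq data frame built above.
#sketch: distinct words needed to cover 50% and 90% of all word instances in the sample
coverage<- cumsum(OneGram_freq$freq)/sum(OneGram_freq$freq)
c(words_50pct= which(coverage >= 0.5)[1], words_90pct= which(coverage >= 0.9)[1])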
The top 15 unigrams, 2-gram phrases and 3-gram phrases are as follows:
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(viridis)
## Loading required package: viridisLite
#bar plot to visualize the top 15 unigram, 2-gram and 3-gram phrases
ngram_barplot<- function(data, title){
    ggplot(head(data, 15), aes(x=reorder(word, -freq), y=freq))+
        geom_bar(stat='identity', aes(fill= freq))+
        scale_fill_viridis(option = "D")+
        labs(x=' ', y='frequency', title=title)+
        theme(axis.text.x= element_text(angle=45, hjust=1))
}
OneGram_plot<- ngram_barplot(OneGram_freq, 'Top 15 Single Words')
TwoGram_plot<- ngram_barplot(TwoGram_freq, 'Top 15 2-gram Phrases')
ThreeGram_plot<- ngram_barplot(ThreeGram_freq, 'Top 15 3-gram Phrases')
OneGram_plot
TwoGram_plot
ThreeGram_plot
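The wordcloud package loaded at the beginning can also be used to visualize the most frequent single words; a minimal sketch based on OneGram_freq:
#word cloud of the 100 most frequent single words
set.seed(1)
wordcloud(words= OneGram_freq$word, freq= OneGram_freq$freq,
          max.words= 100, random.order= FALSE, colors= brewer.pal(8, "Dark2"))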
The next step is to build a word prediction model based on these n-gram frequencies.