This report is an exploratory analysis of a set of training data. Its purpose is to support building a text-prediction Shiny app.
The goal of the Shiny app is to predict the next word quickly and with minimal memory use.
Link to the data: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
library(NLP)
library(tm)
library(ngram)
# Start from an empty workspace: every object, including hidden ones
# (names starting with a dot), is removed.
rm(list = ls(all.names = TRUE))
# Directory that holds the English (en_US) source texts.
filepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US"
# Full paths of all files in that directory whose names end in ".txt".
filelist <- list.files(
  path = filepath,
  pattern = "\\.txt$",
  full.names = TRUE
)
print(filelist)
## [1] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.blogs.txt"
## [2] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.news.txt"
## [3] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.twitter.txt"
# sample_text: draw a reproducible random sample of lines from a text file.
#
# Arguments:
#   filepath  path of the text file; read as UTF-8 with embedded NULs skipped
#   percent   chance, in percent (0 to 100), that any single line is kept
#
# Returns:
#   A character vector holding the kept lines in their original file order;
#   character(0) when the file is empty or no line is selected.
sample_text <- function(filepath, percent) {
  stopifnot(
    is.numeric(percent), length(percent) == 1L,
    !is.na(percent), percent >= 0, percent <= 100
  )
  file.lines <- readLines(filepath, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
  # Fixed seed: the same file and percent always produce the same sample.
  # Note that this resets the global RNG state on every call.
  set.seed(1234)
  # One Bernoulli trial per line (size = 1), so each line is kept with
  # probability percent / 100. Drawing with size = 2 and keeping the 1s
  # selects lines with probability 2p(1 - p) and yields nothing at 100%.
  keep <- rbinom(length(file.lines), 1, percent / 100) == 1
  # Logical subsetting replaces the element-by-element c() loop and also
  # handles an empty file (zero lines) without error.
  file.lines[keep]
}
The sample_text function draws a random sample of lines so that the corpus is small enough to handle.
# Per-file summary statistics plus a sample of every source file.
# For each path in filelist this records the number of lines, the number of
# words and the in-memory size of the text, and writes a sampled copy of the
# file to samplefilepath for the exploration steps that follow.
samplefilepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\SampleData"
# writeCorpus() needs an existing target directory.
if (!dir.exists(samplefilepath)) {
  dir.create(samplefilepath, recursive = TRUE)
}
# One slot per input file; slots of skipped files stay NULL.
FileStatistics.Rows <- vector("list", length(filelist))
# seq_along() does not iterate at all when filelist is empty, whereas
# 1:length(filelist) would run with i = 1 and i = 0.
for (i in seq_along(filelist)) {
  scan.file <- scan(filelist[i], what = "char", sep = "\n", quiet = TRUE)
  NumLines <- format(length(scan.file), big.mark = ",")
  NumWords <- format(
    wordcount(scan.file, sep = " ", count.function = sum),
    big.mark = ","
  )
  # Size of the character vector held in memory, not the size on disk.
  FileSize <- format(object.size(scan.file), units = "Mb", standard = "legacy")
  # Skip files whose content rounds to "0 Mb".
  if (FileSize == "0 Mb") {
    next
  }
  # Prepare the sample for exploration: request a 1 percent sample of the
  # lines, merge it into one string and store it as a single-document corpus.
  samplefile <- sample_text(filelist[i], 1)
  samplefile.constring <- concatenate(samplefile)
  samplefile.corpus <- Corpus(VectorSource(samplefile.constring))
  writeCorpus(
    samplefile.corpus, samplefilepath,
    filenames = paste0(basename(filelist[i]), ".txt")
  )
  # Record the information of this document.
  FileStatistics.Rows[[i]] <- c(basename(filelist[i]), NumLines, NumWords, FileSize)
}
# Build the summary once, after the loop, instead of on every iteration.
FileStatistics <- as.character(unlist(FileStatistics.Rows))
# as.table() cannot coerce a zero-row matrix, so the table is only created
# when at least one file contributed statistics.
if (length(FileStatistics) > 0) {
  FileStatistics.Matrix <- matrix(FileStatistics, ncol = 4, byrow = TRUE)
  colnames(FileStatistics.Matrix) <- c("FileName", "Num of Lines", "Num of Words", "FileSize")
  FileStatistics.Table <- as.table(FileStatistics.Matrix)
}
## Warning in scan(filelist[i], what = "char", sep = "\n", quiet = TRUE): embedded
## nul(s) found in input
FileStatistics.Table
## FileName Num of Lines Num of Words FileSize
## A en_US.blogs.txt 899,288 37,334,131 255.4 Mb
## B en_US.news.txt 77,259 2,643,969 19.8 Mb
## C en_US.twitter.txt 2,360,148 30,373,543 319 Mb
As the final goal is to build an app that predicts text, I combine the three documents to perform the data exploration.
# Load every sampled file in the sample directory into one corpus.
sample.corpus <- Corpus(DirSource(samplefilepath, encoding = "UTF-8"))
# Clean up the corpus: the transformations run in list order
# (numbers, then punctuation, then surplus whitespace).
sample.corpus.clean <- Reduce(
  function(corpus, cleaner) tm_map(corpus, cleaner),
  list(removeNumbers, removePunctuation, stripWhitespace),
  sample.corpus
)
# Document-term matrix of the cleaned corpus, passed through
# removeSparseTerms() with a sparsity threshold of 0.999.
# NOTE(review): DocumentTermMatrix() is called with tm's default control
# options; these appear to drop words shorter than three characters, which
# would explain why "to" and "of" are missing from the list - confirm
# whether that is intended for a prediction model.
sample.dtm <- removeSparseTerms(DocumentTermMatrix(sample.corpus.clean), 0.999)
sample.dtm.matrix <- as.matrix(sample.dtm)
# Column sums give the total count of each term over all documents;
# sorted so the most frequent term comes first.
FreqList.Freq <- sort(colSums(sample.dtm.matrix), decreasing = TRUE)
FreqList.Terms <- names(FreqList.Freq)
FreqList.DF <- data.frame(
  word = FreqList.Terms,
  Freq = FreqList.Freq,
  row.names = NULL
)
head(FreqList.DF)
## word Freq
## 1 the 59294
## 2 and 32151
## 3 you 16774
## 4 for 15663
## 5 that 14520
## 6 with 9668
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Word cloud of the most frequent terms: at most 250 words, each used at
# least 1000 times, most frequent words placed first.
wordcloud(
  words = FreqList.DF$word, freq = FreqList.DF$Freq, scale = c(5, 0.5),
  min.freq = 1000, max.words = 250, random.order = FALSE, rot.per = 0.35,
  use.r.layout = FALSE, colors = brewer.pal(8, "Dark2")
)
# Bar plot of the 25 most frequent words. head() simply returns fewer rows
# when the list is shorter, where [1:25] would pad with NA.
top.words <- head(FreqList.DF, 25)
barplot(top.words$Freq, names.arg = top.words$word, las = 2, col = "lightgreen")
# Histogram of the frequencies of the (up to) 1000 most frequent words.
hist(
  head(FreqList.DF$Freq, 1000),
  xlab = "FrequencyBin", las = 2,
  main = "Top 1000 Words Use Distribution", col = "lightblue"
)
# Frequency curve over all terms, with the 8 most frequent words labelled.
plot(FreqList.DF$Freq, type = "l", col = "red")
# Pass matching x, y and labels for the labelled points only. Handing the
# whole frequency vector to text() together with 8 labels does not pin the
# labels to the first 8 points of the curve.
label.idx <- seq_len(min(8, nrow(FreqList.DF)))
text(
  x = label.idx, y = FreqList.DF$Freq[label.idx],
  labels = FreqList.DF$word[label.idx], cex = 0.7, col = "blue"
)
# n-gram models
# ngram() works with a single string, so the sampled documents are merged
# into one string before the models are built.
# Remove every object from the workspace before this step
# (presumably to release the memory held by the earlier corpus and
# matrix objects - confirm).
rm(list = ls(all.names = TRUE))
samplefilepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\SampleData"
samplefiles.corpus <- Corpus(DirSource(samplefilepath))
# Fail with a clear message instead of an out-of-bounds subscript when the
# sample directory contains no documents.
if (length(samplefiles.corpus) == 0) {
  stop("No sample files found in ", samplefilepath, call. = FALSE)
}
# First content element of every document.
# NOTE(review): this assumes each sample file holds its whole text on one
# line, as written by the sampling step - confirm.
doc.texts <- vapply(
  seq_along(samplefiles.corpus),
  function(i) as.character(samplefiles.corpus[[i]])[1],
  character(1)
)
# Join all documents, separated by single spaces, in one call.
sample.text <- concatenate(doc.texts)
sample.text.clean <- preprocess(
  sample.text,
  case = "lower", remove.punct = TRUE, remove.numbers = TRUE,
  fix.spacing = TRUE
)
# Print the most frequent 2-, 3- and 4-word phrases with their counts and
# proportions.
for (n in 2:4) {
  text.ngram <- ngram(sample.text.clean, n = n, sep = " ")
  text.ngram.freqlist <- get.phrasetable(text.ngram)
  # Label the phrase column by model order, e.g. "2gram".
  names(text.ngram.freqlist)[1] <- paste0(n, "gram")
  print(head(text.ngram.freqlist))
}
## 2gram freq prop
## 1 of the 5258 0.003781001
## 2 in the 5018 0.003608418
## 3 for the 2754 0.001980387
## 4 to the 2716 0.001953062
## 5 on the 2576 0.001852389
## 6 to be 2377 0.001709289
## 3gram freq prop
## 1 thanks for the 481 0.0003458849
## 2 one of the 400 0.0002876382
## 3 a lot of 378 0.0002718181
## 4 to be a 263 0.0001891221
## 5 i want to 247 0.0001776166
## 6 going to be 240 0.0001725829
## 4gram freq prop
## 1 thanks for the follow 132 9.492067e-05
## 2 at the end of 101 7.262869e-05
## 3 the end of the 101 7.262869e-05
## 4 the rest of the 89 6.399954e-05
## 5 at the same time 81 5.824677e-05
## 6 cant wait to see 70 5.033672e-05
Most words have a low frequency. The highest-frequency terms are words that English grammar requires; they would be treated as stopwords if the goal were to study the content of the documents.
These words usually appear in stopword lists. They should not be removed here, because this is an exercise in text prediction.
To reduce the size, we can: - remove currency symbols, numbers, punctuation, and extra whitespace; - remove the words with high sparsity.
The most frequent terms appear in the top phrases of the n-gram models. They are not necessarily the leading word of a phrase; “the” is an example.
In the n-gram models, the higher n is, the lower the probability of each phrase.
Prediction will be effective if the phrases are first filtered by the input word; the predicted next word is then the word that is most frequent in both the word-frequency list and the phrase-frequency list, and so on.