This report is an exploratory analysis of a set of training data. Its purpose is to support building a text prediction Shiny app.

The Shiny app should predict the next word quickly and with minimal memory use.

Link to the data: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

Features of the Data

Data Download and Import

library(NLP)
library(tm)
library(ngram)
rm(list=ls(all=TRUE))

# Folder that holds the English (en_US) text files of the data set.
filepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US"
# Full paths of every file in that folder whose name ends in ".txt",
# printed so the report shows which files are analysed.
filelist <- list.files(filepath, "\\.txt$", full.names = TRUE)
print(filelist)
## [1] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.blogs.txt"  
## [2] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.news.txt"   
## [3] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.twitter.txt"
# user defined function sample_text to get random sample of text
#
# Reads the text file at `filepath` and keeps each line independently with
# probability `percent / 100`.
#
# Args:
#   filepath: path of a UTF-8 text file.
#   percent:  expected share of lines to keep, on a 0-100 scale.
#
# Returns:
#   A character vector holding the selected lines in file order;
#   character(0) when no line is selected or the file is empty.
#
# Note: the RNG seed is set to 1234 on every call, so the same file and
# percent always give the same sample. This also resets the caller's random
# number stream.
sample_text <- function(filepath, percent) {
  stopifnot(
    is.numeric(percent), length(percent) == 1,
    percent >= 0, percent <= 100
  )

  # Read through a binary-mode connection: a text-mode read on Windows can
  # stop early at a Ctrl-Z (0x1A) byte, silently truncating the file.
  con <- file(filepath, open = "rb")
  on.exit(close(con), add = TRUE)
  file.lines <- readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)

  set.seed(1234)
  # One Bernoulli trial per line. (Two trials with an "== 1" test would keep
  # a line with probability 2p(1 - p), and nothing at all for percent = 100.)
  keep <- rbinom(length(file.lines), 1, percent / 100) == 1

  # Logical indexing replaces growing the result with c() inside a loop and
  # also copes with a file that has no lines.
  file.lines[keep]
}

The sample_text function draws a random sample of lines so that the corpus is small enough to handle.

Statistics on the Data Sets

Table of the summary of the data sets:

# For every source file: count its lines and words, record the in-memory size
# of its text, and write a 1% random sample of it to samplefilepath for the
# exploration that follows. The per-file figures are collected into
# FileStatistics.Table.
FileStatistics <- character()
samplefilepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\SampleData"
# Make sure the target folder for the sample files exists before writing.
dir.create(samplefilepath, showWarnings = FALSE, recursive = TRUE)

for (i in seq_along(filelist)) {
  # Read through a binary-mode connection and skip embedded nuls: a text-mode
  # read warns about the nuls and, on Windows, can stop early at a Ctrl-Z
  # (0x1A) byte, which under-reports the line and word counts.
  con <- file(filelist[i], open = "rb")
  scan.file <- tryCatch(
    readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE),
    finally = close(con)
  )

  NumLines <- format(length(scan.file), big.mark = ",")
  NumWords <- format(
    wordcount(scan.file, sep = " ", count.function = sum),
    big.mark = ","
  )
  # Size of the text once loaded in memory (object.size), not the size on disk.
  FileSize <- format(object.size(scan.file), units = "Mb", standard = "legacy")
  if (FileSize == "0 Mb") {
    next
  }

  # prepare the sample for exploration: 1% of the lines, joined into a single
  # string and stored as a one-document corpus on disk
  samplefile <- sample_text(filelist[i], 1)
  samplefile.constring <- concatenate(samplefile)
  samplefile.corpus <- Corpus(VectorSource(samplefile.constring))
  writeCorpus(samplefile.corpus, samplefilepath, filenames = paste0(basename(filelist[i]), ".txt"))

  # Output the information of the documents: four fields per file
  FileStatistics <- c(FileStatistics, basename(filelist[i]), NumLines, NumWords, FileSize)
}

if (length(FileStatistics) == 0) {
  stop("no non-empty text files were found in 'filepath'", call. = FALSE)
}

# Build the summary table once, after all files have been processed.
FileStatistics.Matrix <- matrix(FileStatistics, ncol = 4, byrow = TRUE)
colnames(FileStatistics.Matrix) <- c("FileName", "Num of Lines", "Num of Words", "FileSize")
FileStatistics.Table <- as.table(FileStatistics.Matrix)
## Warning in scan(filelist[i], what = "char", sep = "\n", quiet = TRUE): embedded
## nul(s) found in input
FileStatistics.Table
##   FileName          Num of Lines Num of Words FileSize
## A en_US.blogs.txt   899,288      37,334,131   255.4 Mb
## B en_US.news.txt    77,259       2,643,969    19.8 Mb 
## C en_US.twitter.txt 2,360,148    30,373,543   319 Mb

Make the Term Frequency List

As the final goal is to make an app that predicts text, I combine the 3 documents to perform the data exploration.

# Load every sample file found in samplefilepath into one corpus.
sample.corpus <- Corpus(DirSource(samplefilepath, encoding = "UTF-8"))

# clean up the file: apply the tm transformations one after another
# (drop numbers, drop punctuation, collapse repeated whitespace)
sample.corpus.clean <- Reduce(
  function(corpus, cleaner) tm_map(corpus, cleaner),
  list(removeNumbers, removePunctuation, stripWhitespace),
  sample.corpus
)

# Document-term matrix, with terms above the 0.999 sparsity threshold removed.
sample.dtm <- removeSparseTerms(DocumentTermMatrix(sample.corpus.clean), 0.999)
sample.dtm.matrix <- as.matrix(sample.dtm)

# Total count of each term over all documents, most frequent first.
FreqList.Freq <- sort(colSums(sample.dtm.matrix), decreasing = TRUE)
FreqList.Terms <- names(FreqList.Freq)
FreqList.DF <- data.frame(
  word = FreqList.Terms,
  Freq = FreqList.Freq,
  row.names = NULL
)
head(FreqList.DF)
##   word  Freq
## 1  the 59294
## 2  and 32151
## 3  you 16774
## 4  for 15663
## 5 that 14520
## 6 with  9668

Plots of the summary of the data sets

library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

# WordCloud
# Terms with a frequency of at least 1000, at most 250 of them, drawn in
# decreasing order of frequency (random.order = FALSE).
with(
  FreqList.DF,
  wordcloud(
    words = word, freq = Freq, scale = c(5, 0.5),
    min.freq = 1000, max.words = 250, random.order = FALSE,
    rot.per = 0.35, use.r.layout = FALSE,
    colors = brewer.pal(8, "Dark2")
  )
)

# Bar plot
# Frequencies of the first 25 rows of the frequency list, labelled by word.
with(
  FreqList.DF[1:25, ],
  barplot(Freq, names.arg = word, las = 2, col = "lightgreen")
)

# Histogram
# Distribution of the frequencies of the first 1000 rows of the list.
with(
  FreqList.DF,
  hist(
    Freq[1:1000],
    xlab = "FrequencyBin", las = 2,
    main = "Top 1000 Words Use Distribution", col = "lightblue"
  )
)

# Plot with Top 10 Most Frequent Word
# Frequency curve over the whole list (x = rank in the list, y = frequency).
plot(FreqList.DF$Freq, type = "l", col = "red")
# Label only the leading words, each at its own (rank, frequency) point.
# Passing every frequency as coordinates with a shorter label vector would
# recycle the labels over all points of the curve.
top.rank <- seq_len(min(10, nrow(FreqList.DF)))
text(
  top.rank, FreqList.DF$Freq[top.rank],
  labels = FreqList.DF$word[top.rank], cex = 0.7, col = "blue"
)

#n-gram Models

#ngram() works with a single string

# Clear the workspace objects left over from the exploration above before
# building the n-gram tables (attached packages stay loaded).
rm(list = ls(all.names = TRUE))
samplefilepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\SampleData"
# Read the sample files as UTF-8, the same encoding used when the term
# frequency corpus is loaded from this folder.
samplefiles.corpus <- Corpus(DirSource(samplefilepath, encoding = "UTF-8"))

# ngram() works with a single string, so join the full content of every
# document (all of its elements, not only the first) into one string.
doc.texts <- vapply(
  seq_along(samplefiles.corpus),
  function(i) concatenate(as.character(samplefiles.corpus[[i]])),
  character(1)
)
sample.text <- concatenate(doc.texts)
sample.text.clean <- preprocess(sample.text, case = "lower", remove.punct = TRUE, remove.numbers = TRUE, fix.spacing = TRUE)

# Build the 2-, 3- and 4-gram phrase tables and print the first rows of each.
for (i in 2:4) {
  text.ngram <- ngram(sample.text.clean, n = i, sep = " ")
  text.ngram.freqlist <- get.phrasetable(text.ngram)
  # Rename the phrase column after the n-gram order, e.g. "2gram".
  names(text.ngram.freqlist)[1] <- paste0(i, "gram")
  print(head(text.ngram.freqlist))
}
##      2gram freq        prop
## 1  of the  5258 0.003781001
## 2  in the  5018 0.003608418
## 3 for the  2754 0.001980387
## 4  to the  2716 0.001953062
## 5  on the  2576 0.001852389
## 6   to be  2377 0.001709289
##             3gram freq         prop
## 1 thanks for the   481 0.0003458849
## 2     one of the   400 0.0002876382
## 3       a lot of   378 0.0002718181
## 4        to be a   263 0.0001891221
## 5      i want to   247 0.0001776166
## 6    going to be   240 0.0001725829
##                    4gram freq         prop
## 1 thanks for the follow   132 9.492067e-05
## 2         at the end of   101 7.262869e-05
## 3        the end of the   101 7.262869e-05
## 4       the rest of the    89 6.399954e-05
## 5      at the same time    81 5.824677e-05
## 6      cant wait to see    70 5.033672e-05

Findings of the Data sets

  1. Most words have a low frequency. The highest-frequency terms are the function words that English grammar requires; they would be treated as stopwords in a study of the documents' content.

  2. These words usually appear in stopword lists. They should not be removed here, because this is an exercise in text prediction.

  3. To reduce the size, we can (a) remove currency symbols, numbers, punctuation and extra whitespace, and (b) remove the words with high sparsity.

  4. The most frequent terms appear in the top phrases of the n-gram models. They are not necessarily the leading word of the phrase; “the” is an example.

  5. In the ngram model, the higher the n is, the lower the probability is for the phrase.

  6. Prediction will be effective if the phrases are first filtered by the input word; the predicted next word is then the one that ranks highest in both the word frequency list and the phrase frequency list, and so on.

Plans for creating the prediction algorithm and Shiny App

  1. Find out how to efficiently store the ngram models and how to take advantage of the frequent list.
  2. Study the models that can make text prediction and apply the most efficient one.
  3. Build the prediction model.
  4. Create the Shiny App to allow text prediction according to the input text.
  5. Create a slide presentation as a pitch for the Shiny App.