This report is an exploratory analysis of a set of training data. Its purpose is to support building a text prediction Shiny app.

The Shiny app is intended to predict the next word quickly and with minimal memory use.

Features of the Data

Data Download and Import

library(NLP)
library(tm)
library(ngram)
# Start from an empty workspace, hidden objects included.
rm(list = ls(all.names = TRUE))

# Full paths of every .txt file found directly under the en_US folder.
filepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US"
filelist <- list.files(filepath, pattern = "\\.txt$", full.names = TRUE)
print(filelist)

## [1] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.blogs.txt"  
## [2] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.news.txt"   
## [3] "C:\\Users\\seein\\Documents\\R\\Course10\\Coursera-SwiftKey\\final\\en_US/en_US.twitter.txt"

# sample_text: draw a reproducible random sample of lines from a text file.
#
# Arguments:
#   filepath  path of the text file; read as UTF-8 with embedded nuls skipped.
#   percent   chance, in percent (0-100), that any given line is kept.
#
# Returns a character vector holding the kept lines in file order
# (character(0) when no line is kept or the file is empty).
sample_text <- function(filepath, percent) {
        file.lines <- readLines(filepath, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
        # Fixed seed so that repeated runs pick the same lines.
        set.seed(1234)
        # One Bernoulli draw per line: with size = 1 a line is kept with
        # probability percent/100. A size = 2 draw compared against 1 would
        # keep lines with probability 2p(1-p) and keep nothing at 100 percent.
        keep <- rbinom(length(file.lines), 1, percent / 100) == 1
        # Logical subsetting copes with an empty file and avoids growing the
        # result one element at a time.
        file.lines[keep]
}

The sample_text function draws a random sample of lines so that the corpus is small enough to handle.

Statistics on the Data Sets

Table of the summary of the data sets:

# Per-file summary: number of lines, number of words and in-memory size.
# Every file that is not skipped also gets a sample (sample_text() with
# percent = 1) written to samplefilepath for the later exploration steps.
samplefilepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\SampleData"

# One slot per input file; slots of skipped files stay NULL and drop out
# when the rows are flattened below.
FileStatistics.Rows <- vector("list", length(filelist))

for (i in seq_along(filelist)) {
        # skipNul = TRUE matches the readLines() call inside sample_text(), so
        # the statistics and the sample are based on the same text.
        scan.file <- scan(filelist[i], what = "char", sep = "\n", quiet = TRUE, skipNul = TRUE)
        NumLines <- format(length(scan.file), big.mark = ",")
        NumWords <- format(wordcount(scan.file, sep = " ", count.function = sum), big.mark = ",")
        # object.size() measures the text as held in memory, not the file on disk.
        FileSize <- format(object.size(scan.file), units = "Mb", standard = "legacy")
        if (FileSize == "0 Mb") {
                next
        }

        # Prepare the sample for exploration: join the sampled lines into one
        # string, wrap it in a corpus and write it to samplefilepath.
        samplefile <- sample_text(filelist[i], 1)
        samplefile.constring <- concatenate(samplefile)
        samplefile.corpus <- Corpus(VectorSource(samplefile.constring))
        writeCorpus(samplefile.corpus, samplefilepath, filenames = paste0(basename(filelist[i]), ".txt"))

        # Record the information of this document.
        FileStatistics.Rows[[i]] <- c(basename(filelist[i]), NumLines, NumWords, FileSize)
}

# Build the table once, after the loop, instead of on every iteration.
FileStatistics <- as.character(unlist(FileStatistics.Rows))
if (length(FileStatistics) == 0) {
        stop("No non-empty text files were found to summarise", call. = FALSE)
}
FileStatistics.Matrix <- matrix(FileStatistics, ncol = 4, byrow = TRUE)
colnames(FileStatistics.Matrix) <- c("FileName", "Num of Lines", "Num of Words", "FileSize")
FileStatistics.Table <- as.table(FileStatistics.Matrix)

## Warning in scan(filelist[i], what = "char", sep = "\n", quiet = TRUE): embedded
## nul(s) found in input

FileStatistics.Table

##   FileName          Num of Lines Num of Words FileSize
## A en_US.blogs.txt   899,288      37,334,131   255.4 Mb
## B en_US.news.txt    77,259       2,643,969    19.8 Mb 
## C en_US.twitter.txt 2,360,148    30,373,543   319 Mb

Make the Term Frequency List

As the final goal is to make an app that predicts text, I combine the 3 documents to perform the data exploration.

# Load every sampled file found in samplefilepath into one corpus.
sample.corpus <- Corpus(DirSource(samplefilepath, encoding = "UTF-8"))

# Clean up: drop digits, then punctuation, then collapse runs of whitespace.
sample.corpus.clean <- Reduce(
        function(corpus, cleaner) tm_map(corpus, cleaner),
        list(removeNumbers, removePunctuation, stripWhitespace),
        sample.corpus
)

# Document-term matrix, pruned with a 0.999 sparsity cut-off, as a plain matrix.
sample.dtm <- removeSparseTerms(DocumentTermMatrix(sample.corpus.clean), 0.999)
sample.dtm.matrix <- as.matrix(sample.dtm)

# Total count of each term over all documents, most frequent first.
FreqList.Freq <- sort(colSums(sample.dtm.matrix), decreasing = TRUE)
FreqList.Terms <- names(FreqList.Freq)
FreqList.DF <- data.frame(word = FreqList.Terms, Freq = FreqList.Freq, row.names = NULL)
head(FreqList.DF)

##   word  Freq
## 1  the 59294
## 2  and 32151
## 3  you 16774
## 4  for 15663
## 5 that 14520
## 6 with  9668

Plots Summarizing the Datasets

library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)

# Word cloud: at most 250 words, each occurring at least 1000 times.
wordcloud(
        words = FreqList.DF$word,
        freq = FreqList.DF$Freq,
        scale = c(5, 0.5),
        min.freq = 1000,
        max.words = 250,
        random.order = FALSE,
        rot.per = 0.35,
        use.r.layout = FALSE,
        colors = brewer.pal(8, "Dark2")
)

# Bar plot of the first 25 rows of the frequency table.
with(
        FreqList.DF,
        barplot(Freq[1:25], names.arg = word[1:25], las = 2, col = "lightgreen")
)

# Histogram of the counts held in the first 1000 rows.
with(
        FreqList.DF,
        hist(
                Freq[1:1000],
                xlab = "FrequencyBin",
                las = 2,
                main = "Top 1000 Words Use Distribution",
                col = "lightblue"
        )
)

# Frequency curve over all terms (x = rank in the table, y = count), with the
# 8 most frequent words written next to their own points.
plot(FreqList.DF$Freq, type = "l", col = "red")
# Give text() explicit x and y restricted to the labelled rows: when it is
# handed the full Freq vector, the 8 labels are recycled over every point.
label.idx <- seq_len(min(8, nrow(FreqList.DF)))
text(label.idx, FreqList.DF$Freq[label.idx], labels = FreqList.DF$word[label.idx], cex = 0.7, col = "blue")

#n-gram Models

#ngram() works with a single string

# Clear the workspace (hidden objects included) before the n-gram analysis.
rm(list = ls(all.names = TRUE))
samplefilepath <- "C:\\Users\\seein\\Documents\\R\\Course10\\SampleData"
# Declare the files as UTF-8 so this read agrees with the one used for the
# term-frequency list.
samplefiles.corpus <- Corpus(DirSource(samplefilepath, encoding = "UTF-8"))
sample.text <- as.character()

# ngram() needs a single string, so append the text of each document in turn.
# seq_len() keeps the loop from running when the corpus is empty.
for (i in seq_len(length(samplefiles.corpus))) {
        # Only the first element of each document's text is taken.
        # NOTE(review): assumes every sample file holds a single line - confirm.
        text <- as.character(samplefiles.corpus[[i]])[1]
        sample.text <- concatenate(sample.text, text)
}

# Lower-case the text, strip punctuation and numbers, and normalise spacing.
sample.text.clean <- preprocess(sample.text, case = "lower", remove.punct = TRUE, remove.numbers = TRUE, fix.spacing = TRUE)

# Print the head of the phrase table for the 2-, 3- and 4-grams of the
# cleaned sample text.
for (gram.size in 2:4) {
        text.ngram <- ngram(sample.text.clean, n = gram.size, sep = " ")
        text.ngram.freqlist <- get.phrasetable(text.ngram)
        # Rename the first column after the n-gram order: "2gram", "3gram", "4gram".
        colnames(text.ngram.freqlist)[1] <- paste0(gram.size, "gram")
        print(head(text.ngram.freqlist))
}

##      2gram freq        prop
## 1  of the  5258 0.003781001
## 2  in the  5018 0.003608418
## 3 for the  2754 0.001980387
## 4  to the  2716 0.001953062
## 5  on the  2576 0.001852389
## 6   to be  2377 0.001709289
##             3gram freq         prop
## 1 thanks for the   481 0.0003458849
## 2     one of the   400 0.0002876382
## 3       a lot of   378 0.0002718181
## 4        to be a   263 0.0001891221
## 5      i want to   247 0.0001776166
## 6    going to be   240 0.0001725829
##                    4gram freq         prop
## 1 thanks for the follow   132 9.492067e-05
## 2         at the end of   101 7.262869e-05
## 3        the end of the   101 7.262869e-05
## 4       the rest of the    89 6.399954e-05
## 5      at the same time    81 5.824677e-05
## 6      cant wait to see    70 5.033672e-05

Data Science Capstone 2 Project Proposal

Siying R

8/30/2020