Data Import
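
The package-loading chunk is not shown in this extract; judging from the functions used below, the report relies on packages along these lines (a reconstruction of the setup, not the original chunk):

library(stringi)       # stri_stats_general, stri_stats_latex, stri_count_words
library(tm)            # corpus construction, cleaning, term-document matrices
library(RWeka)         # NGramTokenizer for n-gram tokenization
library(dplyr)         # arrange()/desc() for sorting frequency tables
library(wordcloud)     # word cloud plots
library(RColorBrewer)  # brewer.pal colour palettes
library(knitr)         # kable tables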

blogsData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newsData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/
## en_US/en_US.news.txt", ...): incomplete final line found on 'D:/Cousera R
## HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt'
twitterData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
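
The warning for en_US.news.txt means readLines() stopped at an incomplete final line, so the news counts reported below reflect a truncated read. A workaround often used with this file (not applied here) is to open it in binary mode, which avoids the early termination reportedly caused by an embedded control character:

con <- file("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt", open = "rb")
newsData <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)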

Text Statistics and Summary

# Words-per-line (WPL) summary statistics for each file
WPL <- sapply(list(blogsData, newsData, twitterData),
              function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(WPL) <- c('WPL_Min', 'WPL_Mean', 'WPL_Max')
# Combine line, character and word counts with the WPL statistics
stats <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  t(rbind(
    sapply(list(blogsData, newsData, twitterData), stri_stats_general)[c('Lines', 'Chars'), ],
    Words = sapply(list(blogsData, newsData, twitterData), stri_stats_latex)['Words', ],
    WPL
  ))
)
head(stats)
##        FileName   Lines     Chars    Words WPL_Min WPL_Mean WPL_Max
## 1   en_US.blogs  899288 206824382 37570839       0 41.75107    6726
## 2    en_US.news   77259  15639408  2651432       1 34.61779    1123
## 3 en_US.twitter 2360148 162096241 30451170       1 12.75065      47

Overview of the Data and Sampled Data

# Take a 0.2% random sample of each file
blogsSample <- sample(blogsData, length(blogsData) * 0.002)
newsSample <- sample(newsData, length(newsData) * 0.002)
twitterSample <- sample(twitterData, length(twitterData) * 0.002)

# Draw a separate, combined 0.2% sample across all three files
allDataSample <- c(sample(blogsData, length(blogsData) * 0.002),
                   sample(newsData, length(newsData) * 0.002),
                   sample(twitterData, length(twitterData) * 0.002))
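
The samples above are drawn without a fixed seed, so they change on every run, and allDataSample is a fresh draw rather than the union of the three samples. A reproducible variant could look like the following sketch (sampleLines is a hypothetical helper, not used in this report):

set.seed(1001)
sampleLines <- function(x, frac = 0.002) sample(x, round(length(x) * frac))
blogsSample   <- sampleLines(blogsData)
newsSample    <- sampleLines(newsData)
twitterSample <- sampleLines(twitterData)
allDataSample <- c(blogsSample, newsSample, twitterSample)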

dataList <- list(blogsData, newsData, twitterData,
                 blogsSample, newsSample, twitterSample, allDataSample)
Overview.after.subset <- data.frame(
  'File'            = c("blogsData", "newsData", "twitterData",
                        "blogsSample", "newsSample", "twitterSample", "allDataSample"),
  'FileSize'        = sapply(dataList, function(x) format(object.size(x), "MB")),
  'nEntries'        = sapply(dataList, length),
  'TotalCharacters' = sapply(dataList, function(x) sum(nchar(x))),
  'MaxCharacters'   = sapply(dataList, function(x) max(nchar(x)))
)
kable(Overview.after.subset, caption = "7 datasets")
7 datasets

File            FileSize   nEntries   TotalCharacters   MaxCharacters
blogsData       255.4 Mb     899288         206824505           40833
newsData         19.8 Mb      77259          15639408            5760
twitterData       319 Mb    2360148         162096241             140
blogsSample       0.5 Mb       1798            446275           19795
newsSample          0 Mb        154             30556             560
twitterSample     0.6 Mb       4720            322166             140
allDataSample     1.2 Mb       6672            753709            3201

Build Corpus

blogsSample <- iconv(blogsSample, "UTF-8", "ASCII", sub = "") ## remove non-ASCII characters (incl. emojis)
newsSample <- iconv(newsSample, "UTF-8", "ASCII", sub = "") ## remove non-ASCII characters (incl. emojis)
twitterSample <- iconv(twitterSample, "UTF-8", "ASCII", sub = "") ## remove non-ASCII characters (incl. emojis)
allDataSample <- iconv(allDataSample, "UTF-8", "ASCII", sub = "") ## remove non-ASCII characters (incl. emojis)

build_corpus <- function(x = allDataSample) {
  samp_corp <- VCorpus(VectorSource(x))                         # create corpus from the character vector
  samp_corp <- tm_map(samp_corp, content_transformer(tolower))  # convert to lowercase
  samp_corp <- tm_map(samp_corp, removePunctuation)             # remove punctuation
  samp_corp <- tm_map(samp_corp, removeNumbers)                 # remove numbers
  #samp_corp <- tm_map(samp_corp, removeWords, stopwords("english")) # optionally remove common English stop words
  samp_corp <- tm_map(samp_corp, stripWhitespace)               # collapse extra whitespace
  samp_corp
}
corpusData <- build_corpus(allDataSample)
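
To verify that the cleaning steps behaved as intended, individual documents can be inspected, for example:

writeLines(as.character(corpusData[[1]]))  # cleaned text of the first sampled document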

Tokenize and Calculate Frequencies of N-Grams

getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
  # create a term-document matrix tokenized on n-grams
  tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))
  tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))
  # keep the terms that occur at least `lowfreq` times in the corpus
  top_terms <- findFreqTerms(tdm, lowfreq)
  top_terms_freq <- rowSums(as.matrix(tdm[top_terms, ]))
  top_terms_freq <- data.frame(word = names(top_terms_freq), frequency = top_terms_freq)
  arrange(top_terms_freq, desc(frequency))
}

# Build term-frequency tables for unigrams, bigrams and trigrams
# (keeping terms that occur at least 10 times in the sample corpus)
tt.Data <- vector("list", 3)
for (i in 1:3) {
  tt.Data[[i]] <- getTermTable(corpusData, ngrams = i, lowfreq = 10)
}
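
A quick sanity check of the resulting tables, for example the ten most frequent bigrams:

head(tt.Data[[2]], 10)  # ten most frequent bigrams and their counts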

Wordcloud Graph

# Set random seed
set.seed(1001)
# Plot the three word clouds in 1 row and 3 columns
par(mfrow = c(1, 3))
for (i in 1:3) {
  wordcloud(tt.Data[[i]]$word, tt.Data[[i]]$frequency,
            scale = c(2, .5), max.words = 30, random.order = FALSE,
            rot.per = 0, fixed.asp = TRUE, use.r.layout = FALSE,
            colors = brewer.pal(8, "Dark2"))
}

Graph Frequencies of N-Grams
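
The frequency bar charts for the top unigrams, bigrams and trigrams are not included in this extract; they can be produced along the following lines (a sketch using ggplot2; plot_ngram is a hypothetical helper):

library(ggplot2)
plot_ngram <- function(tt, title, top_n = 20) {
  df <- head(tt, top_n)  # the term tables are already sorted by frequency
  ggplot(df, aes(x = reorder(word, frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +       # horizontal bars keep long n-grams readable
    labs(title = title, x = "", y = "Frequency")
}
plot_ngram(tt.Data[[1]], "Top 20 Unigrams")
plot_ngram(tt.Data[[2]], "Top 20 Bigrams")
plot_ngram(tt.Data[[3]], "Top 20 Trigrams")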

Next steps

This concludes the initial exploratory analysis. The next steps include:

1. Build a predictive algorithm that uses an n-gram model with a frequency lookup.
2. Deploy a Shiny app that uses this algorithm to suggest the most likely next word as a phrase is typed.
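
To illustrate the planned frequency lookup, a minimal sketch using the term tables built above (predict_next is hypothetical; the final algorithm will need proper backoff and smoothing):

predict_next <- function(phrase) {
  # take the last one or two typed words, lower-cased to match the cleaned corpus
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  # try trigrams first, then back off to bigrams
  for (n in c(3, 2)) {
    if (length(words) < n - 1) next
    prefix <- paste(tail(words, n - 1), collapse = " ")
    hits <- tt.Data[[n]][grepl(paste0("^", prefix, " "), tt.Data[[n]]$word), ]
    if (nrow(hits) > 0) {
      # tables are sorted by frequency, so the first hit is the best candidate
      return(sub(paste0("^", prefix, " "), "", hits$word[1]))
    }
  }
  NA_character_
}
predict_next("one of")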