Data Import
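The chunks below call functions from several packages that are not loaded within this section. A minimal setup sketch is shown here; the original report presumably loads these in an earlier chunk that is not reproduced in this section.

# Packages assumed by the chunks that follow (loading shown as a sketch).
library(stringi)       # stri_count_words, stri_stats_general, stri_stats_latex
library(tm)            # VCorpus, tm_map, TermDocumentMatrix
library(RWeka)         # NGramTokenizer, Weka_control
library(dplyr)         # arrange, desc
library(wordcloud)     # wordcloud
library(RColorBrewer)  # brewer.pal
library(knitr)         # kable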
blogsData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newsData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/
## en_US/en_US.news.txt", : incomplete final line found on 'D:/Cousera R HW/
## Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt'
twitterData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
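The warning indicates that the last line of en_US.news.txt is not newline-terminated. On Windows the file can also be truncated at an embedded EOF (Ctrl-Z, 0x1A) character when read in text mode, which would explain the comparatively low line count reported for en_US.news below. A sketch of a workaround, using the same path as above, reads the file through a binary connection:

# Sketch: read the news file through a binary connection so an embedded
# 0x1A (Ctrl-Z) byte cannot terminate the read early on Windows.
con <- file("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt", open = "rb")
newsData <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)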
Text Statistics and Summary
WPL <- sapply(list(blogsData,newsData,twitterData),function(x) summary(stri_count_words(x))[c('Min.','Mean','Max.')])
rownames(WPL) <- c('WPL_Min','WPL_Mean','WPL_Max')
stats <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  t(rbind(
    sapply(list(blogsData, newsData, twitterData), stri_stats_general)[c('Lines', 'Chars'), ],
    Words = sapply(list(blogsData, newsData, twitterData), stri_stats_latex)['Words', ],
    WPL)
  ))
head(stats)
## FileName Lines Chars Words WPL_Min WPL_Mean WPL_Max
## 1 en_US.blogs 899288 206824382 37570839 0 41.75107 6726
## 2 en_US.news 77259 15639408 2651432 1 34.61779 1123
## 3 en_US.twitter 2360148 162096241 30451170 1 12.75065 47
Overview of the Data and Sample Data
blogsSample   <- sample(blogsData, length(blogsData) * 0.002)
newsSample    <- sample(newsData, length(newsData) * 0.002)
twitterSample <- sample(twitterData, length(twitterData) * 0.002)
allDataSample <- c(sample(blogsData, length(blogsData) * 0.002),
                   sample(newsData, length(newsData) * 0.002),
                   sample(twitterData, length(twitterData) * 0.002))
Overview.after.subset <- data.frame(
  'File' = c("blogsData", "newsData", "twitterData", "blogsSample", "newsSample", "twitterSample", "allDataSample"),
  'FileSize' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                      function(x) {format(object.size(x), "MB")}),
  'nEntries' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                      function(x) {length(x)}),
  'TotalCharacters' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                             function(x) {sum(nchar(x))}),
  'MaxCharacters' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                           function(x) {max(nchar(x))})
)
kable(Overview.after.subset,caption = "7 datasets")
7 datasets

|File          |FileSize |nEntries |TotalCharacters |MaxCharacters |
|:-------------|:--------|--------:|---------------:|-------------:|
|blogsData     |255.4 Mb |   899288|       206824505|         40833|
|newsData      |19.8 Mb  |    77259|        15639408|          5760|
|twitterData   |319 Mb   |  2360148|       162096241|           140|
|blogsSample   |0.5 Mb   |     1798|          446275|         19795|
|newsSample    |0 Mb     |      154|           30556|           560|
|twitterSample |0.6 Mb   |     4720|          322166|           140|
|allDataSample |1.2 Mb   |     6672|          753709|          3201|
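Note that sample() is called without a fixed seed, so the sizes above will vary between runs, and the combined sample is drawn independently of the three per-source samples. A reproducible variant (the seed value and helper name are illustrative) could look like this:

set.seed(2023)                                   # illustrative seed for reproducible sampling
sample_lines <- function(x, frac = 0.002) sample(x, floor(length(x) * frac))
blogsSample   <- sample_lines(blogsData)
newsSample    <- sample_lines(newsData)
twitterSample <- sample_lines(twitterData)
allDataSample <- c(blogsSample, newsSample, twitterSample)  # reuse the subsamples so the per-source and combined tables stay consistent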
Build Corpus
blogsSample   <- iconv(blogsSample, "UTF-8", "ASCII", sub = "")   ## drop non-ASCII characters (emojis, accents, etc.)
newsSample    <- iconv(newsSample, "UTF-8", "ASCII", sub = "")    ## drop non-ASCII characters (emojis, accents, etc.)
twitterSample <- iconv(twitterSample, "UTF-8", "ASCII", sub = "") ## drop non-ASCII characters (emojis, accents, etc.)
allDataSample <- iconv(allDataSample, "UTF-8", "ASCII", sub = "") ## drop non-ASCII characters (emojis, accents, etc.)
build_corpus <- function(x = allDataSample) {
  samp_corp <- VCorpus(VectorSource(x))                         # create corpus dataset
  #samp_corp <- tm_map(samp_corp, Textprocessing)
  samp_corp <- tm_map(samp_corp, content_transformer(tolower))  # convert to lowercase
  samp_corp <- tm_map(samp_corp, PlainTextDocument)
  samp_corp <- tm_map(samp_corp, removePunctuation)             # eliminate punctuation
  samp_corp <- tm_map(samp_corp, removeNumbers)                 # eliminate numbers
  #samp_corp <- tm_map(samp_corp, removeWords, stopwords("english"))  # remove common English stopwords
  samp_corp <- tm_map(samp_corp, stripWhitespace)               # collapse extra whitespace
  samp_corp                                                     # return the cleaned corpus
}
corpusData <- build_corpus(allDataSample)
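The commented-out Textprocessing step is never defined in this report. One plausible sketch of such a transformer, removing URLs and Twitter handles before the other cleaning steps (the name and regular expressions are assumptions), is:

# Hypothetical sketch of the Textprocessing transformer referenced (but not
# defined) above: strip URLs and Twitter handles before the other cleaning steps.
Textprocessing <- content_transformer(function(x) {
  x <- gsub("http\\S+|www\\.\\S+", " ", x)  # remove URLs
  x <- gsub("@\\w+", " ", x)                # remove Twitter handles
  x
})
# Usage inside build_corpus (uncommenting the corresponding line):
# samp_corp <- tm_map(samp_corp, Textprocessing)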
Tokenize and Calculate Frequencies of N-Grams
getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
  # create a term-document matrix tokenized on n-grams
  tokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))}
  tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))
  # keep the terms that occur at least lowfreq times in the corpus
  top_terms <- findFreqTerms(tdm, lowfreq)
  top_terms_freq <- rowSums(as.matrix(tdm[top_terms, ]))
  top_terms_freq <- data.frame(word = names(top_terms_freq), frequency = top_terms_freq)
  arrange(top_terms_freq, desc(frequency))  # return terms sorted by descending frequency
}
tt.Data <- vector("list", 3)  # one term table per n-gram order (1, 2, 3)
for (i in 1:3) {
  tt.Data[[i]] <- getTermTable(corpusData, ngrams = i, lowfreq = 10)
}
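As a quick sanity check (output not shown; the labels are illustrative), the three term tables can be named and previewed:

names(tt.Data) <- c("unigrams", "bigrams", "trigrams")  # illustrative labels
lapply(tt.Data, head, n = 5)                            # top five terms per n-gram order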
Wordcloud Graph
# Set random seed for reproducible word placement
set.seed(1001)
# Plot in 1 row, 3 columns (unigrams, bigrams, trigrams)
par(mfrow = c(1, 3))
for (i in 1:3) {
  wordcloud(tt.Data[[i]]$word, tt.Data[[i]]$frequency,
            scale = c(2, .5), max.words = 30, random.order = FALSE,
            rot.per = 0, fixed.asp = TRUE, use.r.layout = FALSE,
            colors = brewer.pal(8, "Dark2"))
}

Graph Frequencies of N-Grams
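The code that produced the frequency plots is not included in this section. A minimal sketch of one way to chart the top terms in tt.Data (the use of ggplot2 and the helper name plot_ngrams are assumptions) is:

library(ggplot2)
plot_ngrams <- function(df, title, top_n = 20) {
  # keep the top_n most frequent terms and draw a horizontal bar chart
  df <- head(df, top_n)
  ggplot(df, aes(x = reorder(word, frequency), y = frequency)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(title = title, x = NULL, y = "Frequency")
}
plot_ngrams(tt.Data[[1]], "Top Unigrams")
plot_ngrams(tt.Data[[2]], "Top Bigrams")
plot_ngrams(tt.Data[[3]], "Top Trigrams")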

Next steps
This concludes the initial exploratory analysis. The next steps are to:
1. Build a predictive algorithm that uses an n-gram model with a frequency lookup.
2. Deploy a Shiny app that uses this algorithm to suggest the most likely next word after a phrase is typed.
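As a preview of the frequency-lookup idea, a deliberately naive sketch built on the bigram table above (the function name and matching rule are assumptions; there is no smoothing or backoff) might look like this:

# Naive next-word lookup: find bigrams whose first word matches the last word
# typed and return the highest-frequency completions. Purely illustrative.
predict_next <- function(phrase, bigrams = tt.Data[[2]], n = 3) {
  last_word <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 1)
  hits <- bigrams[grepl(paste0("^", last_word, " "), bigrams$word), ]
  if (nrow(hits) == 0) return(character(0))
  sub(paste0("^", last_word, " "), "", head(as.character(hits$word), n))
}
predict_next("thanks for the")   # returns up to 3 candidate next words, if any bigrams match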