Data Import
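The chunks below call functions from several packages that are not loaded within this section. A minimal setup sketch is shown here; the original report presumably loads these in an earlier chunk that is not reproduced in this section.

# Packages assumed by the chunks that follow (loading shown as a sketch).
library(stringi)       # stri_count_words, stri_stats_general, stri_stats_latex
library(tm)            # VCorpus, tm_map, TermDocumentMatrix
library(RWeka)         # NGramTokenizer, Weka_control
library(dplyr)         # arrange, desc
library(wordcloud)     # wordcloud
library(RColorBrewer)  # brewer.pal
library(knitr)         # kable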
blogsData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newsData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/
## en_US/en_US.news.txt", : incomplete final line found on 'D:/Cousera R HW/
## Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt'
twitterData <- readLines("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
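The warning indicates that the last line of en_US.news.txt is not newline-terminated. On Windows the file can also be truncated at an embedded EOF (Ctrl-Z, 0x1A) character when read in text mode, which would explain the comparatively low line count reported for en_US.news below. A sketch of a workaround, using the same path as above, reads the file through a binary connection:

# Sketch: read the news file through a binary connection so an embedded
# 0x1A (Ctrl-Z) byte cannot terminate the read early on Windows.
con <- file("D:/Cousera R HW/Cousera Big_Project_SwiftKey/final/en_US/en_US.news.txt", open = "rb")
newsData <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)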
Text Statistics and Summary
WPL <- sapply(list(blogsData,newsData,twitterData),function(x) summary(stri_count_words(x))[c('Min.','Mean','Max.')])
rownames(WPL) <- c('WPL_Min','WPL_Mean','WPL_Max')
stats <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  t(rbind(
    sapply(list(blogsData, newsData, twitterData), stri_stats_general)[c('Lines', 'Chars'), ],
    Words = sapply(list(blogsData, newsData, twitterData), stri_stats_latex)['Words', ],
    WPL)
  ))
head(stats)
## FileName Lines Chars Words WPL_Min WPL_Mean WPL_Max
## 1 en_US.blogs 899288 206824382 37570839 0 41.75107 6726
## 2 en_US.news 77259 15639408 2651432 1 34.61779 1123
## 3 en_US.twitter 2360148 162096241 30451170 1 12.75065 47
Overview of the Data and Sample Data
blogsSample   <- sample(blogsData, length(blogsData) * 0.002)
newsSample    <- sample(newsData, length(newsData) * 0.002)
twitterSample <- sample(twitterData, length(twitterData) * 0.002)
allDataSample <- c(sample(blogsData, length(blogsData) * 0.002),
                   sample(newsData, length(newsData) * 0.002),
                   sample(twitterData, length(twitterData) * 0.002))
Overview.after.subset <- data.frame(
  'File' = c("blogsData", "newsData", "twitterData", "blogsSample", "newsSample", "twitterSample", "allDataSample"),
  'FileSize' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                      function(x) {format(object.size(x), "MB")}),
  'nEntries' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                      function(x) {length(x)}),
  'TotalCharacters' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                             function(x) {sum(nchar(x))}),
  'MaxCharacters' = sapply(list(blogsData, newsData, twitterData, blogsSample, newsSample, twitterSample, allDataSample),
                           function(x) {max(nchar(x))})
)
kable(Overview.after.subset,caption = "7 datasets")
7 datasets

|File          |FileSize |nEntries |TotalCharacters |MaxCharacters |
|:-------------|:--------|--------:|---------------:|-------------:|
|blogsData     |255.4 Mb |   899288|       206824505|         40833|
|newsData      |19.8 Mb  |    77259|        15639408|          5760|
|twitterData   |319 Mb   |  2360148|       162096241|           140|
|blogsSample   |0.5 Mb   |     1798|          446275|         19795|
|newsSample    |0 Mb     |      154|           30556|           560|
|twitterSample |0.6 Mb   |     4720|          322166|           140|
|allDataSample |1.2 Mb   |     6672|          753709|          3201|
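Note that sample() is called without a fixed seed, so the sizes above will vary between runs, and the combined sample is drawn independently of the three per-source samples. A reproducible variant (the seed value and helper name are illustrative) could look like this:

set.seed(2023)                                   # illustrative seed for reproducible sampling
sample_lines <- function(x, frac = 0.002) sample(x, floor(length(x) * frac))
blogsSample   <- sample_lines(blogsData)
newsSample    <- sample_lines(newsData)
twitterSample <- sample_lines(twitterData)
allDataSample <- c(blogsSample, newsSample, twitterSample)  # reuse the subsamples so the per-source and combined tables stay consistent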
Build Corpus
blogsSample   <- iconv(blogsSample, "UTF-8", "ASCII", sub = "")   ## drop non-ASCII characters (emojis, accents, etc.)
newsSample    <- iconv(newsSample, "UTF-8", "ASCII", sub = "")    ## drop non-ASCII characters (emojis, accents, etc.)
twitterSample <- iconv(twitterSample, "UTF-8", "ASCII", sub = "") ## drop non-ASCII characters (emojis, accents, etc.)
allDataSample <- iconv(allDataSample, "UTF-8", "ASCII", sub = "") ## drop non-ASCII characters (emojis, accents, etc.)
build_corpus <- function(x = allDataSample) {
  samp_corp <- VCorpus(VectorSource(x))                         # create corpus dataset
  #samp_corp <- tm_map(samp_corp, Textprocessing)
  samp_corp <- tm_map(samp_corp, content_transformer(tolower))  # convert to lowercase
  samp_corp <- tm_map(samp_corp, PlainTextDocument)
  samp_corp <- tm_map(samp_corp, removePunctuation)             # eliminate punctuation
  samp_corp <- tm_map(samp_corp, removeNumbers)                 # eliminate numbers
  #samp_corp <- tm_map(samp_corp, removeWords, stopwords("english"))  # remove common English stopwords
  samp_corp <- tm_map(samp_corp, stripWhitespace)               # collapse extra whitespace
  samp_corp                                                     # return the cleaned corpus
}
corpusData <- build_corpus(allDataSample)
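The commented-out Textprocessing step is never defined in this report. One plausible sketch of such a transformer, removing URLs and Twitter handles before the other cleaning steps (the name and regular expressions are assumptions), is:

# Hypothetical sketch of the Textprocessing transformer referenced (but not
# defined) above: strip URLs and Twitter handles before the other cleaning steps.
Textprocessing <- content_transformer(function(x) {
  x <- gsub("http\\S+|www\\.\\S+", " ", x)  # remove URLs
  x <- gsub("@\\w+", " ", x)                # remove Twitter handles
  x
})
# Usage inside build_corpus (uncommenting the corresponding line):
# samp_corp <- tm_map(samp_corp, Textprocessing)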
Tokenize and Calculate Frequencies of N-Grams
getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
  # create a term-document matrix tokenized on n-grams
  tokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))}
  tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))
  # keep the terms that occur at least lowfreq times in the corpus
  top_terms <- findFreqTerms(tdm, lowfreq)
  top_terms_freq <- rowSums(as.matrix(tdm[top_terms, ]))
  top_terms_freq <- data.frame(word = names(top_terms_freq), frequency = top_terms_freq)
  arrange(top_terms_freq, desc(frequency))  # return terms sorted by descending frequency
}
tt.Data <- vector("list", 3)  # one term table per n-gram order (1, 2, 3)
for (i in 1:3) {
  tt.Data[[i]] <- getTermTable(corpusData, ngrams = i, lowfreq = 10)
}
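As a quick sanity check (output not shown; the labels are illustrative), the three term tables can be named and previewed:

names(tt.Data) <- c("unigrams", "bigrams", "trigrams")  # illustrative labels
lapply(tt.Data, head, n = 5)                            # top five terms per n-gram order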
Wordcloud Graph
# Set random seed for reproducible word placement
set.seed(1001)
# Plot in 1 row, 3 columns (unigrams, bigrams, trigrams)
par(mfrow = c(1, 3))
for (i in 1:3) {
  wordcloud(tt.Data[[i]]$word, tt.Data[[i]]$frequency,
            scale = c(2, .5), max.words = 30, random.order = FALSE,
            rot.per = 0, fixed.asp = TRUE, use.r.layout = FALSE,
            colors = brewer.pal(8, "Dark2"))
}

Graph Frequencies of N-Grams
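The code that produced the frequency plots is not included in this section. A minimal sketch of one way to chart the top terms in tt.Data (the use of ggplot2 and the helper name plot_ngrams are assumptions) is:

library(ggplot2)
plot_ngrams <- function(df, title, top_n = 20) {
  # keep the top_n most frequent terms and draw a horizontal bar chart
  df <- head(df, top_n)
  ggplot(df, aes(x = reorder(word, frequency), y = frequency)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(title = title, x = NULL, y = "Frequency")
}
plot_ngrams(tt.Data[[1]], "Top Unigrams")
plot_ngrams(tt.Data[[2]], "Top Bigrams")
plot_ngrams(tt.Data[[3]], "Top Trigrams")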

Next steps
This concludes the initial exploratory analysis. The next steps are to:
1. Build a predictive algorithm that uses an n-gram model with a frequency lookup.
2. Deploy a Shiny app that uses this algorithm to suggest the most likely next word after a phrase is typed.
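As a preview of the frequency-lookup idea, a deliberately naive sketch built on the bigram table above (the function name and matching rule are assumptions; there is no smoothing or backoff) might look like this:

# Naive next-word lookup: find bigrams whose first word matches the last word
# typed and return the highest-frequency completions. Purely illustrative.
predict_next <- function(phrase, bigrams = tt.Data[[2]], n = 3) {
  last_word <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 1)
  hits <- bigrams[grepl(paste0("^", last_word, " "), bigrams$word), ]
  if (nrow(hits) == 0) return(character(0))
  sub(paste0("^", last_word, " "), "", head(as.character(hits$word), n))
}
predict_next("thanks for the")   # returns up to 3 candidate next words, if any bigrams match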