# Packages used in this section
library(tm)           # Corpus, tm_map, DocumentTermMatrix
library(caTools)      # sample.split
library(stylo)        # txt.to.words, make.ngrams
library(dplyr)        # tbl_df, %>%
library(ggplot2)      # histograms
library(wordcloud)    # word clouds
library(RColorBrewer) # brewer.pal
cname <- file.path("E:", "CourseraR", "Coursera-Swiftkey", "final", "en_US")
docs <- Corpus(DirSource(cname, encoding = "UTF-8"))
blogs_pre <- docs[[1]]$content # Pre-Cleaning
news_pre <- docs[[2]]$content # Pre-Cleaning
twitter_pre <- docs[[3]]$content # Pre-Cleaning
summary(docs)
## Length Class Mode
## en_US.blogs.txt 2 PlainTextDocument list
## en_US.news.txt 2 PlainTextDocument list
## en_US.twitter.txt 2 PlainTextDocument list
length(blogs_pre); length(news_pre); length(twitter_pre)
## [1] 899288
## [1] 77259
## [1] 2360148
max(nchar(blogs_pre)); max(nchar(news_pre)); max(nchar(twitter_pre))
## [1] 40833
## [1] 5760
## [1] 140
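As a side note, the same line counts and maximum line lengths can be checked straight from the files with base R, without building the full Corpus. This is only a sketch (not part of the run above) and assumes the standard en_US file names:
files <- file.path(cname, c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))
for (f in files) {
  lines <- readLines(f, encoding = "UTF-8", skipNul = TRUE)
  n <- nchar(lines, allowNA = TRUE)
  cat(basename(f), ":", length(lines), "lines, longest line =", max(n, na.rm = TRUE), "characters\n")
}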
At this point I will clean up the Corpus: convert the text to ASCII (dropping characters that cannot be converted), turn slashes, at-signs, and pipes into spaces, convert everything to lower case, and remove numbers and punctuation:
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = ""))) # previously: iconv(enc2utf8(x), sub = "byte")
docs <- tm_map(docs, toSpace, "/|@|\\|")
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
blogs <- docs[[1]]$content # Post-Cleaning
news <- docs[[2]]$content # Post-Cleaning
twitter <- docs[[3]]$content # Post-Cleaning
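To make the effect of these cleaning steps concrete, here is a made-up example line pushed through roughly equivalent base-R calls (tm's removeNumbers and removePunctuation boil down to the gsub calls shown):
x <- "Check out http://example.com/@user | 42 things I LOVED..."
x <- iconv(x, "latin1", "ASCII", sub = "")  # drop characters with no ASCII equivalent
x <- gsub("/|@|\\|", " ", x)                # slashes, at-signs and pipes become spaces
x <- tolower(x)                             # lower case
x <- gsub("[[:digit:]]+", "", x)            # roughly what removeNumbers does
x <- gsub("[[:punct:]]+", "", x)            # roughly what removePunctuation does
x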
The Corpus is huge. That is both a good thing and a bad thing: there is plenty of data to learn from, but I have limited resources (time and processing power), so I will take a sample of the Corpus. At this point I chose 4% as my training sample.
train_porcentage <- 4/100
blogs_index <- sample.split(blogs, train_porcentage)
news_index <- sample.split(news, train_porcentage)
twitter_index <- sample.split(twitter, train_porcentage)
blogs_train <- blogs[blogs_index]
news_train <- news[news_index]
twitter_train <- twitter[twitter_index]
train <- c(blogs_train, news_train, twitter_train) # combine the three samples into a single character vector
write(train, file = "E:/CourseraR/Coursera-Swiftkey/final/subset/train.txt")
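One caveat, since sample.split draws random indices: fixing the RNG seed before sampling makes the 4% sample reproducible. This was not done in the run above, and the seed value here is arbitrary:
set.seed(1234) # arbitrary seed, only for reproducibility
blogs_index <- sample.split(blogs, train_porcentage)
news_index <- sample.split(news, train_porcentage)
twitter_index <- sample.split(twitter, train_porcentage)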
I then reloaded the sampled corpus. This is just a way to save time: it is faster to save the sample as a txt file and reload it than to load the whole Corpus and re-sample every time.
cname2 <- file.path("E:", "CourseraR", "Coursera-Swiftkey", "final", "subset") # folder containing train.txt
train_corpus <- Corpus(DirSource(cname2, encoding = "UTF-8"))
train <- train_corpus[[1]]$content
train <- txt.to.words(train) # split the sample into word tokens (stylo)
I used the “make.ngrams” function from the stylo package. I doubt that RWeka will work well in a Shiny app because of its Java dependency, and make.ngrams was fast enough.
# These "tokenizers" ignore their argument and always return the n-grams of the
# pre-processed `train` tokens; this works because the corpus holds a single document.
n2 <- function(x) make.ngrams(train, ngram.size = 2)
n3 <- function(x) make.ngrams(train, ngram.size = 3)
n4 <- function(x) make.ngrams(train, ngram.size = 4)
dtm <- DocumentTermMatrix(train_corpus) # unigrams, default tokenizer
dtm2 <- DocumentTermMatrix(train_corpus, control = list(tokenize = n2))
dtm3 <- DocumentTermMatrix(train_corpus, control = list(tokenize = n3))
dtm4 <- DocumentTermMatrix(train_corpus, control = list(tokenize = n4))
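As a quick sanity check on what make.ngrams produces, here is a toy example that is not taken from the corpus; the expected output is the space-joined n-grams:
toy <- txt.to.words("to be or not to be") # c("to", "be", "or", "not", "to", "be")
make.ngrams(toy, ngram.size = 2)
## [1] "to be"  "be or"  "or not" "not to" "to be"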
After building the n-gram Document Term Matrices, it is very useful to sort the terms by frequency. I stored the results as tbl_df data frames for performance reasons.
freq1 <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
freq2 <- sort(colSums(as.matrix(dtm2)), decreasing = TRUE)
freq3 <- sort(colSums(as.matrix(dtm3)), decreasing = TRUE)
freq4 <- sort(colSums(as.matrix(dtm4)), decreasing = TRUE)
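A note on memory: as.matrix() turns the whole document-term matrix dense, which was fine for a 4% sample but will not scale well. The slam package (the sparse-matrix backend that tm uses) can compute the same column sums without densifying; a sketch:
library(slam)
freq2_sparse <- sort(col_sums(dtm2), decreasing = TRUE) # same frequencies, no dense matrix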
wf1 <- tbl_df(data.frame(word = names(freq1), freq = freq1))
wf2 <- tbl_df(data.frame(word = names(freq2), freq = freq2))
wf3 <- tbl_df(data.frame(word = names(freq3), freq = freq3))
wf4 <- tbl_df(data.frame(word = names(freq4), freq = freq4))
Here are histograms of the 15 most frequent phrases for each n-gram size.
freq1_top <- wf1$freq[15] # frequency of the 15th most frequent unigram (plot threshold)
freq2_top <- wf2$freq[15]
freq3_top <- wf3$freq[15]
freq4_top <- wf4$freq[15]
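Since the wf* tables are already sorted by decreasing frequency, an equivalent way to pick the terms to plot is simply to take the first rows, for example:
head(wf2, 15) # the 15 most frequent bi-grams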
g1 <- subset(wf1, freq >= freq1_top) %>% ggplot(aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") + labs(x = "Word", y= "Frequency", title = "Single Word Histogram") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g1
g2 <- subset(wf2, freq >= freq2_top) %>% ggplot(aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") + labs(x = "Bi-Gram", y= "Frequency", title = "Bi-Gram Histogram") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g2
g3 <- subset(wf3, freq >= freq3_top) %>% ggplot(aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") + labs(x = "Tri-Gram", y= "Frequency", title = "Tri-Gram Histogram") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g3
g4 <- subset(wf4, freq >= freq4_top) %>% ggplot(aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") + labs(x = "Four-Gram", y= "Frequency", title = "Four-Gram Histogram") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g4
It is also possible to make word clouds. They are fun and nicer looking, but they show basically the same information as the histograms.
wordcloud(names(freq1), freq1, min.freq = 1000, max.words = 140, colors = brewer.pal(6, "Dark2"))
wordcloud(names(freq2), freq2, min.freq = 500, max.words = 80, colors = brewer.pal(6, "Dark2"))
wordcloud(names(freq3), freq3, min.freq = 100, max.words = 60, colors = brewer.pal(6, "Dark2"))
wordcloud(names(freq4), freq4, min.freq = 10, max.words= 30, colors = brewer.pal(6, "Dark2"))
So far I haven’t had any “important findings”. I’ve been fighting with the coding and the handling of text mining and n-grams. I’ve learned a lot about NLP and the models, but not so much about the information processed for this report.