The first step in building a predictive model for text is understanding the distribution of, and the relationships among, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building the first linguistic models.
knitr::opts_chunk$set(echo = TRUE)
# Good-Turing style adjusted count for the n-gram whose left context is `text`
smooth_count <- function(df, text) {
  fq0 <- df[df$left == text, ]$freq[1]   # observed frequency of the n-gram
  fq1 <- fq0 + 1                         # next-higher frequency class
  nfq1 <- nrow(df[df$freq == fq1, ])     # number of n-grams seen fq0 + 1 times
  nfq0 <- nrow(df[df$freq == fq0, ])     # number of n-grams seen fq0 times
  count <- (fq0 + 1) * nfq1 / nfq0       # adjusted count c* = (c + 1) * N(c+1) / N(c)
  if (is.na(count) || count == 0) {      # fall back to the raw count when the
    count <- fq0                         # frequency-of-frequency counts are missing
  }
  return(count)
}
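The helper above computes a Good-Turing style adjusted count, c* = (c + 1) * N(c+1) / N(c), where N(c) is the number of n-grams observed exactly c times. A minimal sketch of its behaviour on a toy frequency table (the column names left and freq match the ones created later in this report):
#Toy table: three left contexts with frequencies 2, 2 and 3
toy <- data.frame(left = c("of the", "in the", "to be"),
                  freq = c(2, 2, 3))
#"of the" was seen 2 times; N(3) = 1 and N(2) = 2, so the adjusted
#count is (2 + 1) * 1 / 2 = 1.5
smooth_count(toy, "of the")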
#Load libraries
library(wordcloud)
library(tm)
library(ngram)
library(RWeka)
library(ggplot2)
library(dplyr)
memory.limit(size = 50000000)
## [1] 5e+07
Loading the three files in full consumes a lot of processing power and memory. To make the analysis manageable, a small random sample (roughly 0.2% of the lines, drawn below with rbinom and p = 0.002) is read from each file, and the final corpus combines data from the three sources (Blogs, News and Twitter).
#Definition of base files we are going to work with
file_blogs <- file("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
file_news <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt")
file_twitter <- file("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
#Here we load the files using the readLines command and
#then we close the connection
temp_blogs <- readLines(file_blogs)
close(file_blogs)
temp_news <- readLines(file_news)
close(file_news)
temp_twitter <- readLines(file_twitter)
close(file_twitter)
#Here a 0/1 indicator is drawn for every line (Bernoulli with p = 0.002).
#Only the rows flagged with 1 are kept, giving a small temporary
#data frame that is easier to work with.
temp_blogs_rbinom <- rbinom(length(temp_blogs), 1, 0.002)
temp_news_rbinom <- rbinom(length(temp_news), 1, 0.002)
temp_twitter_rbinom <- rbinom(length(temp_twitter), 1, 0.002)
#Dataframe creation
df_blogs <- data.frame(temp_blogs, temp_blogs_rbinom)
df_news <- data.frame(temp_news, temp_news_rbinom)
df_twitter <- data.frame(temp_twitter, temp_twitter_rbinom)
#Dataframe filter to create filtered DF to work with
df_blogs_train <- df_blogs[df_blogs$temp_blogs_rbinom==1, ]
df_news_train <- df_news[df_news$temp_news_rbinom==1, ]
df_twitter_train <- df_twitter[df_twitter$temp_twitter_rbinom==1, ]
Counting how many rows each sampled data frame contributes to the final training file:
count(df_blogs_train)
## n
## 1 1822
count(df_news_train)
## n
## 1 155
count(df_twitter_train)
## n
## 1 4698
#Definition of the output file for the selected lines. The connection is opened
#explicitly in write mode so that the three writeLines calls below append to the
#same file instead of each call reopening (and truncating) it.
file_train <- file("Coursera-SwiftKey/final/en_US/en_US_train.txt", open = "wt")
writeLines(df_blogs_train$temp_blogs, file_train)
writeLines(df_news_train$temp_news, file_train)
writeLines(df_twitter_train$temp_twitter, file_train)
close(file_train)
Now it is necessary to clean up the corpus using the tm_map functions. Below, the text is converted to lower case, and numbers, English stopwords, punctuation and extra whitespace are removed.
#Read the prepared train file. This file was prepared with the capStoneNLP_loadTemp.R script
file_train <- file("Coursera-SwiftKey/final/en_US/en_US_train.txt")
temp_train <- readLines(file_train)
close(file_train)
#Create corpus for each of the text files to be analyzed
corpus_train <- VCorpus(VectorSource(temp_train))
# Perform some transformations to the corpus.
corpus_train <- tm_map(corpus_train, content_transformer(tolower))
corpus_train <- tm_map(corpus_train, removeNumbers)
corpus_train <- tm_map(corpus_train, removeWords, stopwords("english"))
corpus_train <- tm_map(corpus_train, removePunctuation)
corpus_train <- tm_map(corpus_train, stripWhitespace)
Now the term-document matrices are generated, producing four data frames that contain the unigram, bigram, trigram and four-gram frequencies. These four data frames are used to show the distribution of the data.
#Text matrix and generation tf1 - 1 gram
tdm1 <- TermDocumentMatrix(corpus_train,
control = list(tokenize = function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))))
resultFreq <- findFreqTerms(tdm1, lowfreq = 10)
tf1 <- rowSums(as.matrix(tdm1[resultFreq,]))
tf1 <- data.frame(unigram=names(tf1), freq=tf1)
tf1 <- tf1[order(-tf1$freq), ]
#Text matrix and generation tf2 - 2 gram
tdm2 <- TermDocumentMatrix(corpus_train,
control = list(tokenize = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))))
resultFreq <- findFreqTerms(tdm2, lowfreq = 20)
tf2 <- rowSums(as.matrix(tdm2[resultFreq,]))
tf2 <- data.frame(bigram=names(tf2), freq=tf2)
tf2 <- tf2 %>% dplyr::rowwise() %>% mutate(left = trimws(paste(strsplit(bigram, " ")[[1]][1:1], "", collapse="")))
tf2 <- tf2 %>% dplyr::rowwise() %>% mutate(right = trimws(paste(strsplit(bigram, " ")[[1]][2:2], "", collapse="")))
tf2 <- tf2 %>% dplyr::rowwise() %>% mutate(adjusted_count = smooth_count(tf2, left))
tf2 <- tf2[order(-tf2$freq), ]
#Text matrix and generation tf3 - 3 gram
tdm3 <- TermDocumentMatrix(corpus_train,
control = list(tokenize = function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))))
resultFreq <- findFreqTerms(tdm3, lowfreq = 5)
tf3 <- rowSums(as.matrix(tdm3[resultFreq,]))
tf3 <- data.frame(trigram=names(tf3), freq=tf3)
tf3 <- tf3 %>% dplyr::rowwise() %>% mutate(left = trimws(paste(strsplit(trigram, " ")[[1]][1:2], "", collapse="")))
tf3 <- tf3 %>% dplyr::rowwise() %>% mutate(right = trimws(paste(strsplit(trigram, " ")[[1]][3:3], "", collapse="")))
tf3 <- tf3 %>% dplyr::rowwise() %>% mutate(adjusted_count = smooth_count(tf3, left))
tf3 <- tf3[order(-tf3$freq), ]
#Text matrix and generation tf4 - 4 gram
tdm4 <- TermDocumentMatrix(corpus_train,
control = list(tokenize = function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))))
resultFreq <- findFreqTerms(tdm4, lowfreq = 2)
tf4 <- rowSums(as.matrix(tdm4[resultFreq,]))
tf4 <- data.frame(cuatrigram=names(tf4), freq=tf4)
tf4 <- tf4 %>% dplyr::rowwise() %>% mutate(left = trimws(paste(strsplit(cuatrigram, " ")[[1]][1:3], "", collapse="")))
tf4 <- tf4 %>% dplyr::rowwise() %>% mutate(right = trimws(paste(strsplit(cuatrigram, " ")[[1]][4:4], "", collapse="")))
tf4 <- tf4 %>% dplyr::rowwise() %>% mutate(adjusted_count = smooth_count(tf4, left))
tf4 <- tf4[order(-tf4$freq), ]
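Before plotting, it can be useful to take a quick look at the top of each table; for example (a sketch, simply inspecting the columns created above for the bigrams):
#Top bigrams together with their left/right split and adjusted count
head(tf2[, c("bigram", "left", "right", "freq", "adjusted_count")])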
Below are the frequency plots for the most common unigrams, bigrams, trigrams and four-grams.
# Distributions - Frequency bar plots
ggplot(tf1[1:30,], aes(x = reorder(unigram,freq), y = freq)) + coord_flip() +
geom_bar(stat = "identity", fill = "purple") + theme_bw() +
ggtitle("Most Frequent Unigrams") +labs(x = "", y = "")
ggplot(tf2[1:30,], aes(x = reorder(bigram,freq), y = freq)) + coord_flip() +
geom_bar(stat = "identity", fill = "green") + theme_bw() +
ggtitle("Most Frequent Bigrams") +labs(x = "", y = "")
ggplot(tf3[1:30,], aes(x = reorder(trigram,freq), y = freq)) + coord_flip() +
geom_bar(stat = "identity", fill = "green") + theme_bw() +
ggtitle("Most Frequent Trigrams") +labs(x = "", y = "")
ggplot(tf4[1:30,], aes(x = reorder(cuatrigram,freq), y = freq)) + coord_flip() +
geom_bar(stat = "identity", fill = "green") + theme_bw() +
ggtitle("Most Frequent Quatrigrams") +labs(x = "", y = "")
To check whether words come from a language other than English, the following function can be used; it simply looks each word up in an English dictionary (GradyAugmented). This was not necessary for the exercise, and it also adds computing time to the process.
is_english_word <- function(x) x %in% GradyAugmented
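Had the filter been applied, it would need the qdapDictionaries package, which provides the GradyAugmented word list. A minimal usage sketch (not run as part of this analysis) would keep only the dictionary words in the unigram table:
#Keep only the unigrams that appear in the English dictionary
library(qdapDictionaries)   #provides GradyAugmented
tf1_en <- tf1[is_english_word(as.character(tf1$unigram)), ]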
How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?
#Walk down the frequency-sorted unigram table, accumulating counts until
#half of all word instances are covered
i <- 1
total <- 0
while (total < sum(tf1$freq)/2) {
  total <- total + tf1[i, ]$freq
  i <- i + 1
}
i
## [1] 99
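The same question for 90% (or any other threshold) can be answered with a small helper based on cumulative sums; a sketch, reusing the already frequency-sorted tf1 table:
#Number of unique words needed to cover a given share of all word instances;
#assumes freqs is sorted in decreasing order, as tf1 is above
words_for_coverage <- function(freqs, coverage) {
  which(cumsum(freqs) >= coverage * sum(freqs))[1]
}
words_for_coverage(tf1$freq, 0.5)
words_for_coverage(tf1$freq, 0.9)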
Increasing the coverage further involves applying a discounting algorithm to the probabilities, so that some probability mass is left for words that may not appear in the observed n-grams. Some additional clean-up may also help, like removing repeated words within an n-gram.
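One common option is absolute discounting; the sketch below uses a hypothetical discount d = 0.75 (not a value used earlier in this report). Every observed bigram gives up part of its count, and the mass collected per left word is what would be redistributed to unseen continuations:
#Absolute-discounting sketch on the bigram table (d is a hypothetical value)
d <- 0.75
bigram_disc <- tf2 %>%
  ungroup() %>%
  group_by(left) %>%
  mutate(p_disc = (freq - d) / sum(freq)) %>%   #discounted P(right | left)
  ungroup()
leftover <- bigram_disc %>%
  group_by(left) %>%
  summarise(leftover_mass = 1 - sum(p_disc))    #mass reserved for unseen words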