The first step is to download the three files, place them in the working directory, and read them in. The following R code reads each source file, tokenizes each line, and counts the total number of words.
library(dplyr)
library(tm)
library(NLP)
library(ggplot2)
library(ngram)
library(tokenizers)
#Get total number of lines and total number of words in the file "en_US.news.txt"
con <- file("en_US.news.txt", "r")
lineAll <- readLines(con)
close(con)
tokenAll <- tokenizers::tokenize_words(lineAll, lowercase=TRUE)
wordCount <- 0
totalLines <- length(lineAll)
for (i in 1:totalLines) {
  wordCount <- wordCount + length(tokenAll[[i]])
}
news <- lineAll
newsLines <- totalLines
newsWords <- wordCount
#Get total number of lines and total number of words in the file "en_US.blogs.txt"
con <- file("en_US.blogs.txt", "r")
lineAll <- readLines(con)
close(con)
tokenAll <- tokenizers::tokenize_words(lineAll, lowercase=TRUE)
wordCount <- 0
totalLines <- length(lineAll)
for (i in 1:totalLines) {
  wordCount <- wordCount + length(tokenAll[[i]])
}
blogs <- lineAll
blogsLines <- totalLines
blogsWords <- wordCount
#Get total number of lines and total number of words in the file "en_US.twitter.txt"
con <- file("en_US.twitter.txt", "r")
lineAll <- readLines(con)
close(con)
tokenAll <- tokenizers::tokenize_words(lineAll, lowercase=TRUE)
wordCount <- 0
totalLines <- length(lineAll)
for (i in 1:totalLines) {
  wordCount <- wordCount + length(tokenAll[[i]])
}
twitter <- lineAll
twitterLines <- totalLines
twitterWords <- wordCount
summaryMatrix <- matrix( c(newsLines, newsWords, blogsLines, blogsWords, twitterLines, twitterWords),
nrow=3, ncol=2, byrow = TRUE)
dimnames(summaryMatrix) = list(
c("en_US.news", "en_US.blogs", "en_US.Twitter"), # row names
c("Number of Lines", "Number of Words")) # column names
summaryMatrix
## Number of Lines Number of Words
## en_US.news 1010242 34762395
## en_US.blogs 899288 37546246
## en_US.Twitter 2360148 30093369
# Clean a corpus: remove punctuation, lowercase, drop English stopwords, collapse whitespace
clean_and_preprocess <- function(text) {
  text <- tm_map(text, removePunctuation) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("en")) %>%
    tm_map(stripWhitespace)
  text
}
newsCorpus <- Corpus(VectorSource(news))
newsCorpus <- clean_and_preprocess(newsCorpus)
blogsCorpus <- Corpus(VectorSource(blogs))
blogsCorpus <- clean_and_preprocess(blogsCorpus)
twitterCorpus <- Corpus(VectorSource(twitter))
twitterCorpus <- clean_and_preprocess(twitterCorpus)
newsTDM <- TermDocumentMatrix(newsCorpus)
blogsTDM <- TermDocumentMatrix(blogsCorpus)
twitterTDM <- TermDocumentMatrix(twitterCorpus)
newsFrequentTerms <- findFreqTerms(newsTDM, lowfreq=60000, highfreq=Inf)
blogsFrequentTerms <- findFreqTerms(blogsTDM, lowfreq=60000, highfreq=Inf)
twitterFrequentTerms <- findFreqTerms(twitterTDM, lowfreq=60000, highfreq=Inf)
# Sum the frequency of each given term across all documents in the TDM
count_frequency <- function(terms, tdm) {
  termTDM <- tdm[terms, ]
  frequency <- apply(termTDM, MARGIN = 1, FUN = sum)
  data.frame(terms, frequency)
}
dfNews <- count_frequency(newsFrequentTerms, newsTDM)
dfBlogs <- count_frequency(blogsFrequentTerms, blogsTDM)
dfTwitter <- count_frequency(twitterFrequentTerms, twitterTDM)
dfNews$terms <- factor(dfNews$terms, levels=dfNews$terms[order(dfNews$frequency)])
ggplot(dfNews, aes(x=terms, y=frequency)) + theme_bw() + geom_bar(stat="identity") + labs(title='en_US.news.txt')
dfBlogs$terms <- factor(dfBlogs$terms, levels=dfBlogs$terms[order(dfBlogs$frequency)])
ggplot(dfBlogs, aes(x=terms, y=frequency)) + theme_bw() + geom_bar(stat="identity") + labs(title='en_US.blogs.txt')
dfTwitter$terms <- factor(dfTwitter$terms, levels=dfTwitter$terms[order(dfTwitter$frequency)])
ggplot(dfTwitter, aes(x=terms, y=frequency)) + theme_bw() + geom_bar(stat="identity") + labs(title='en_US.twitter.txt')
The Twitter and blogs corpora show that the word “you” was used more frequently than any other word. The news corpus also appears to contain a smaller variety of frequent terms than the other two, while the Twitter corpus contains a larger variety than the blogs corpus. This seems reasonable, since news articles tend to reuse the same words.
We plan to keep refining how we handle stopwords to make the prediction more accurate. We would also like to include a back-off model in our application, as sketched below.
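As a first pass at that back-off idea, the sketch below builds unigram, bigram, and trigram counts from a small sample of the blogs lines (with stopwords kept, since they are needed for natural next-word suggestions) and backs off from trigrams to bigrams to the most frequent unigram when a prefix is unseen. The sample size, the helper names buildNgramCounts and predictNextWord, and the reliance on tokenizers::tokenize_ngrams are illustrative assumptions rather than the final design.
# Build a sorted n-gram count table from a character vector of lines
# (illustrative helper; the final model may use different tooling)
buildNgramCounts <- function(lines, n) {
  if (n == 1) {
    tokens <- unlist(tokenizers::tokenize_words(lines, lowercase=TRUE))
  } else {
    tokens <- unlist(tokenizers::tokenize_ngrams(lines, lowercase=TRUE, n=n))
  }
  tokens <- tokens[!is.na(tokens)]
  sort(table(tokens), decreasing=TRUE)
}
# Use a small sample of the blogs lines to keep the count tables manageable
set.seed(123)
sampleLines <- sample(blogs, 10000)
unigramCounts <- buildNgramCounts(sampleLines, 1)
bigramCounts <- buildNgramCounts(sampleLines, 2)
trigramCounts <- buildNgramCounts(sampleLines, 3)
# Stupid-backoff style prediction: try trigrams, then bigrams, then the top unigram
predictNextWord <- function(phrase) {
  words <- unlist(tokenizers::tokenize_words(phrase, lowercase=TRUE))
  lastTwo <- paste(tail(words, 2), collapse=" ")
  lastOne <- tail(words, 1)
  hits <- trigramCounts[startsWith(names(trigramCounts), paste0(lastTwo, " "))]
  if (length(hits) == 0) {
    hits <- bigramCounts[startsWith(names(bigramCounts), paste0(lastOne, " "))]
  }
  if (length(hits) == 0) {
    return(names(unigramCounts)[1])
  }
  # The suggestion is the last word of the highest-count matching n-gram
  tail(unlist(strsplit(names(hits)[1], " ")), 1)
}
predictNextWord("thanks for the")
A smoothed model such as Katz back-off would replace these raw counts with discounted probabilities, but the overall fall-back logic would stay the same.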