Data set

The data consist of text from three different sources: blogs, news, and Twitter feeds, and are provided in four different languages: German, English (US), Finnish, and Russian. For the remainder of this project, we will use only the English (US) data sets.

Summary of the English (US) data

library(stringi)

file.list <- c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")

# File size (Mb), line count, and word count for each source
data.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"), c("file size, Mb", "lines", "words")))
for (i in 1:3) {
  con <- file(file.list[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  data.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  data.summary[i, 2] <- length(text[[i]])
  data.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}

The data are summarized in the table below.

          file size, Mb     lines      words
blogs            200.42    899288   37546239
news             196.28   1010242   34762395
twitter          159.36   2360148   30093413

These data sets are rather large, and since the goal is to provide a proof of concept for the data analysis, for the remainder of the report we will sample a smaller fraction of the data (1%) to perform the analysis. The three parts will be combined into a single data set and used to generate the corpus.

set.seed(123)
# Sample 1% of the lines from each source and combine them
blogs_sample <- sample(text$blogs, 0.01 * length(text$blogs))
news_sample <- sample(text$news, 0.01 * length(text$news))
twitter_sample <- sample(text$twitter, 0.01 * length(text$twitter))
sampled_data <- c(blogs_sample, news_sample, twitter_sample)
sample.word.count <- sum(stri_count_words(sampled_data))

The new data set consists of 1,023,563 words.

Build the corpus

library(tm)
library(magrittr)

# Remove emoticons and other non-ASCII characters
# (sub = "" avoids introducing NAs for lines that cannot be converted)
sampled_data <- iconv(sampled_data, 'UTF-8', 'ASCII', sub = "")
corpus <- Corpus(VectorSource(sampled_data))
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Build a term-document matrix and derive word frequencies
term.doc.matrix <- TermDocumentMatrix(corpus)
term.doc.matrix <- as.matrix(term.doc.matrix)
word.freqs <- sort(rowSums(term.doc.matrix), decreasing = TRUE)
dm <- data.frame(word = names(word.freqs), freq = word.freqs)

Word cloud plot of the most common words in the corpus

library(wordcloud)
library(RColorBrewer)

wordcloud(dm$word, dm$freq, min.freq = 500, random.order = TRUE, rot.per = 0.25,
          colors = brewer.pal(8, "Dark2"))

Build n-gram models

library(quanteda)

# Tokenize, remove English stopwords, and build unigram, bigram, and trigram dfms
toks0 <- tokens(char_tolower(sampled_data), remove_punct = TRUE)
toks1 <- tokens_remove(toks0, stopwords("english"))
toks2 <- tokens_ngrams(toks1, 2)
toks3 <- tokens_ngrams(toks1, 3)
myDfm_ng1 <- dfm(toks1)
myDfm_ng2 <- dfm(toks2)
myDfm_ng3 <- dfm(toks3)

# To access a list of the most frequently occurring features
myDfm_features_ng1 <- topfeatures(myDfm_ng1, 20)
myDfm_features_ng2 <- topfeatures(myDfm_ng2, 20)
myDfm_features_ng3 <- topfeatures(myDfm_ng3, 20)

# Create data frames for ggplot
topDf_ng1 <- data.frame(term = names(myDfm_features_ng1), frequency = unname(myDfm_features_ng1))
topDf_ng2 <- data.frame(term = names(myDfm_features_ng2), frequency = unname(myDfm_features_ng2))
topDf_ng3 <- data.frame(term = names(myDfm_features_ng3), frequency = unname(myDfm_features_ng3))

Visualize top 20 features

# Sort by reverse frequency order
topDf_ng1$term <- with(topDf_ng1, reorder(term, -frequency))
topDf_ng2$term <- with(topDf_ng2, reorder(term, -frequency))
topDf_ng3$term <- with(topDf_ng3, reorder(term, -frequency))

ggplot(topDf_ng1) + geom_col(aes(x=term, y=frequency), color="steelblue", fill="steelblue") +
    theme(axis.text.x=element_text(angle=90, hjust=1))

ggplot(topDf_ng2) + geom_col(aes(x=term, y=frequency), color="steelblue", fill="steelblue") +
    theme(axis.text.x=element_text(angle=90, hjust=1))

ggplot(topDf_ng3) + geom_col(aes(x=term, y=frequency), color="steelblue", fill="steelblue") +
    theme(axis.text.x=element_text(angle=90, hjust=1))

Plans for creating the prediction algorithm

We plan to build a predictive model using an n-gram (Markov chain) approach, in which the next word is predicted from the word or words that precede it. Details about this approach are available at https://sookocheff.com/post/nlp/ngram-modeling-with-markov-chains/
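As an illustration only, the sketch below shows one way such a predictor could be built from the bigram document-feature matrix already computed above (myDfm_ng2), falling back to the unigram matrix (myDfm_ng1) when a word has not been seen. The helper name predict_next and the fallback strategy are assumptions for this sketch, not part of the analysis above; a real model would also keep stop words (removed here) and would use higher-order n-grams with a proper backoff or smoothing scheme.

library(quanteda)
library(stringi)

# Illustrative sketch only: split each bigram feature ("word1_word2") into a
# lookup table mapping a previous word to candidate next words with counts.
bigram_freqs <- colSums(myDfm_ng2)
bigram_parts <- stri_split_fixed(names(bigram_freqs), "_", simplify = TRUE)
bigram_table <- data.frame(
    prev = bigram_parts[, 1],
    nxt  = bigram_parts[, 2],
    freq = unname(bigram_freqs),
    stringsAsFactors = FALSE
)

# Hypothetical helper: return the n most frequent continuations of `word`,
# falling back to the overall most frequent unigrams when `word` is unseen.
predict_next <- function(word, n = 3) {
    hits <- bigram_table[bigram_table$prev == char_tolower(word), ]
    if (nrow(hits) == 0) {
        return(names(topfeatures(myDfm_ng1, n)))
    }
    hits <- hits[order(-hits$freq), ]
    head(hits$nxt, n)
}

predict_next("happy")

In practice the frequency tables would be precomputed and stored, and prediction would consult the trigram table first before backing off to bigrams and unigrams.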