The data consist of text from three sources: blogs, news articles, and Twitter feeds, and are provided in four languages: German, English (US), Finnish, and Russian. For the remainder of this project we will use only the English (US) data sets.
file.list = c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
data.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"),c("file size, Mb", "lines", "words")))
for (i in 1:3) {
con <- file(file.list[i], "rb")
text[[i]] <- readLines(con, encoding = "UTF-8",skipNul = TRUE)
close(con)
data.summary[i,1] <- round(file.info(file.list[i])$size / 1024^2, 2)
data.summary[i,2] <- length(text[[i]])
data.summary[i,3] <- sum(stri_count_words(text[[i]]))
}
The data are summarized in the table below.
| | file size, MB | lines | words |
|---|---|---|---|
| blogs | 200.42 | 899288 | 37546239 |
| news | 196.28 | 1010242 | 34762395 |
| twitter | 159.36 | 2360148 | 30093413 |
These data sets are rather large, and since the goal is to provide a proof of concept for the data analysis, for the remainder of the report we will work with a smaller sample of the data (1%). The three parts will be combined into a single data set and used to generate the corpus.
set.seed(123)   # for reproducible sampling

# Draw a 1% sample from each source and combine them
blogs_sample   <- sample(text$blogs,   0.01 * length(text$blogs))
news_sample    <- sample(text$news,    0.01 * length(text$news))
twitter_sample <- sample(text$twitter, 0.01 * length(text$twitter))
sampled_data   <- c(blogs_sample, news_sample, twitter_sample)

# Total word count of the sample (avoid masking base::sum)
word_count <- sum(stri_count_words(sampled_data))
The new data set consists of 1,023,563 words.
library(tm)
library(magrittr)   # for the %>% pipe

# Remove emoticons and other non-ASCII characters
# (sub = "" drops the offending characters instead of turning whole lines into NA)
sampled_data <- iconv(sampled_data, "UTF-8", "ASCII", sub = "")

# Build the corpus and apply basic cleaning transformations
corpus <- Corpus(VectorSource(sampled_data))
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
library(wordcloud)      # for wordcloud()
library(RColorBrewer)   # for brewer.pal()

# Build a term-document matrix and compute overall word frequencies
term.doc.matrix <- TermDocumentMatrix(corpus)
term.doc.matrix <- as.matrix(term.doc.matrix)
word.freqs <- sort(rowSums(term.doc.matrix), decreasing = TRUE)
dm <- data.frame(word = names(word.freqs), freq = word.freqs)

wordcloud(dm$word, dm$freq, min.freq = 500, random.order = TRUE, rot.per = 0.25, colors = brewer.pal(8, "Dark2"))
library(quanteda)

# Tokenise the sample, remove English stopwords, and build
# unigram, bigram, and trigram document-feature matrices
toks0 <- tokens(char_tolower(sampled_data), remove_punct = TRUE)
toks1 <- tokens_remove(toks0, stopwords("english"))
toks2 <- tokens_ngrams(toks1, 2)
toks3 <- tokens_ngrams(toks1, 3)
myDfm_ng1 <- dfm(toks1)
myDfm_ng2 <- dfm(toks2)
myDfm_ng3 <- dfm(toks3)
# To access a list of the most frequently occurring features
myDfm_features_ng1 <- topfeatures(myDfm_ng1, 20)
myDfm_features_ng2 <- topfeatures(myDfm_ng2, 20)
myDfm_features_ng3 <- topfeatures(myDfm_ng3, 20)
# Create data.frames for ggplot
topDf_ng1 <- data.frame(term = names(myDfm_features_ng1), frequency = unname(myDfm_features_ng1))
topDf_ng2 <- data.frame(term = names(myDfm_features_ng2), frequency = unname(myDfm_features_ng2))
topDf_ng3 <- data.frame(term = names(myDfm_features_ng3), frequency = unname(myDfm_features_ng3))

# Sort terms by decreasing frequency
topDf_ng1$term <- with(topDf_ng1, reorder(term, -frequency))
topDf_ng2$term <- with(topDf_ng2, reorder(term, -frequency))
topDf_ng3$term <- with(topDf_ng3, reorder(term, -frequency))
library(ggplot2)

# Bar charts of the 20 most frequent unigrams, bigrams, and trigrams
ggplot(topDf_ng1) + geom_col(aes(x = term, y = frequency), color = "steelblue", fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(topDf_ng2) + geom_col(aes(x = term, y = frequency), color = "steelblue", fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(topDf_ng3) + geom_col(aes(x = term, y = frequency), color = "steelblue", fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
We can build a predictive model with an n-gram (Markov-chain) algorithm, in which the next word is predicted from the preceding word or words. Details about this approach are available here: https://sookocheff.com/post/nlp/ngram-modeling-with-markov-chains/
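As a minimal sketch of this idea, the bigram counts from the sample can be turned into a lookup table: for a given word, return the words that most frequently follow it. The helper predict_next below is purely illustrative (not part of any package); it reuses the toks0 tokens object created above (with stopwords kept, so that common sequences such as "of the" remain), and a real model would also need smoothing and a back-off strategy for unseen words.

# A minimal bigram (Markov-chain) predictor built on 'toks0' from above.
# 'predict_next' is an illustrative helper, not an established API.
bigrams <- tokens_ngrams(toks0, n = 2, concatenator = " ")
bigram_freq <- colSums(dfm(bigrams))

bigram_df <- data.frame(
  first  = sub(" .*$", "", names(bigram_freq)),   # first word of the bigram
  second = sub("^.* ", "", names(bigram_freq)),   # word that follows it
  freq   = as.numeric(bigram_freq),
  stringsAsFactors = FALSE
)

predict_next <- function(word, n = 3) {
  matches <- bigram_df[bigram_df$first == tolower(word), ]
  matches <- matches[order(-matches$freq), ]
  head(matches$second, n)
}

predict_next("in")   # returns the three words most often seen after "in" in the sample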