Loading libraries

library("knitr")
library("stringi")
library("ggplot2")
library("tm")
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library("wordcloud")
## Loading required package: RColorBrewer
library("quanteda")
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
## Package version: 3.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
## 
##     stopwords
## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-

Loading the data

blogs <- readLines("C:/Users/dmitr/Downloads/ru_RU.blogs.txt")
news <- readLines("C:/Users/dmitr/Downloads/ru_RU.news.txt")
twitter <- readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt")
## Warning in readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt"): line
## 191902 appears to contain an embedded nul
## Warning in readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt"): line
## 309777 appears to contain an embedded nul
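
The warnings above report embedded nul characters in the Twitter file. If they become a problem, one possible workaround (a sketch, not the call used above, and assuming the files are UTF-8 encoded) is to re-read the file with skipNul = TRUE:

# Optional: drop embedded nuls while reading the Twitter file
twitter <- readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE)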

Short summary of the data

# Mb size of each file
blogs_size <- file.info("C:/Users/dmitr/Downloads/ru_RU.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("C:/Users/dmitr/Downloads/ru_RU.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("C:/Users/dmitr/Downloads/ru_RU.twitter.txt")$size / 1024 ^ 2

# Number of words in each file
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)

# Creation of a table
summary <- data.frame(files_names = c("blogs", "news", "twitter"),
           Size_MB = c(blogs_size, news_size, twitter_size),
           lines = c(length(blogs), length(news), length(twitter)),
           words = c(sum(blogs_words), sum(news_words), sum(twitter_words)))
kable(summary, caption = "Russian text files")
Russian text files

files_names    Size_MB     lines      words
blogs         111.4424    337100    9388810
news          113.4838    196360    9057279
twitter       100.3097    881414    9234641

Cleaning the Data and Taking a Sample

Because of the large size of each file, it will be more practical to take only a small sample of the data (0.8% of the lines from each file) and to explore what that subset shows.

set.seed(123)

sample_blogs <- sample(blogs, length(blogs) * 0.008)
sample_news <- sample(news, length(news) * 0.008)
sample_twitter <- sample(twitter, length(twitter) * 0.008)

sample_df <- c(sample_blogs, sample_news, sample_twitter)

corpus <- VCorpus(VectorSource(sample_df))
print(corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 11317

Combining the samples into a ‘corpus’ object makes it easier to work with the text dataset; the next step is to clean it with a set of preprocessing functions.

# Functions for preprocessing

clean_corp <- function(corp_data){
  corp_data <- tm_map(corp_data, removeNumbers)                 # drop digits
  corp_data <- tm_map(corp_data, content_transformer(tolower))  # lower-case all text
  corp_data <- tm_map(corp_data, removeWords, stopwords("ru"))  # remove Russian stop words
  corp_data <- tm_map(corp_data, removePunctuation)             # strip punctuation
  corp_data <- tm_map(corp_data, stripWhitespace)               # collapse extra whitespace
  return(corp_data)
}

corpus <- clean_corp(corpus)

Exploratory Analysis

Let’s build a plot of the most frequent words in the sample.

options(mc.cores=1)

getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
makeHistogramPlot <- function(data, label, color) {
  ggplot(data[1:10,], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I(color))
}

corpus_dtm <- TermDocumentMatrix(corpus)

corpus_dtm
## <<TermDocumentMatrix (terms: 52576, documents: 11317)>>
## Non-/sparse entries: 135888/594866704
## Sparsity           : 100%
## Maximal term length: 56
## Weighting          : term frequency (tf)
freq1 <- getFreq(removeSparseTerms(corpus_dtm, 0.9999))
makeHistogramPlot(freq1, "10 Most Common Unigrams", color = 3)
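
The wordcloud package is loaded but not used yet; the same unigram frequencies can also be shown as a word cloud. A minimal sketch (not run above):

# Word cloud of the 50 most frequent unigrams in the sample
wordcloud(words = freq1$word, freq = freq1$freq, max.words = 50,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))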

Plans for creating an algorithm

Honestly, I don’t have a detailed design yet, but it should be possible to predict the next word from the previous 2-3 words, i.e. with an n-gram model.
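
As a rough illustration of that idea, bigram and trigram frequencies could be collected from the same sample with the already loaded quanteda package. This is only a sketch of a possible starting point, not the final algorithm:

# Sketch: bigram and trigram counts from the raw sample with quanteda
toks <- tokens(sample_df, remove_punct = TRUE, remove_numbers = TRUE)
toks <- tokens_tolower(toks)
bigrams  <- dfm(tokens_ngrams(toks, n = 2))
trigrams <- dfm(tokens_ngrams(toks, n = 3))
topfeatures(bigrams, 10)   # 10 most frequent bigrams in the sample
topfeatures(trigrams, 10)  # 10 most frequent trigrams in the sample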