library("knitr")
library("stringi")
library("ggplot2")
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
##     annotate
library("wordcloud")
## Loading required package: RColorBrewer
library("quanteda")
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
## Package version: 3.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
##     stopwords
## The following objects are masked from 'package:NLP':
##
##     meta, meta<-
blogs <- readLines("C:/Users/dmitr/Downloads/ru_RU.blogs.txt")
news <- readLines("C:/Users/dmitr/Downloads/ru_RU.news.txt")
twitter <- readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt")
## Warning in readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt"): line
## 191902 appears to contain an embedded nul
## Warning in readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt"): line
## 309777 appears to contain an embedded nul
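These warnings only mean that two lines of the Twitter file contain embedded nul characters; readLines() still returns the rest of the file. If they become a problem, the file could be re-read with the nuls skipped (a minimal alternative, assuming the nul bytes can simply be dropped):
# Optional alternative read that drops embedded nul characters
twitter <- readLines("C:/Users/dmitr/Downloads/ru_RU.twitter.txt", skipNul = TRUE)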
# Mb size of each file
blogs_size <- file.info("C:/Users/dmitr/Downloads/ru_RU.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("C:/Users/dmitr/Downloads/ru_RU.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("C:/Users/dmitr/Downloads/ru_RU.twitter.txt")$size / 1024 ^ 2
# Number of words in each file
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
# Create a summary table
summary <- data.frame(files_names = c("blogs", "news", "twitter"),
                      Size_MB = c(blogs_size, news_size, twitter_size),
                      lines = c(length(blogs), length(news), length(twitter)),
                      words = c(sum(blogs_words), sum(news_words), sum(twitter_words)))
kable(summary, caption = "Russian text files")
| files_names | Size_MB  | lines  | words   |
|-------------|----------|--------|---------|
| blogs       | 111.4424 | 337100 | 9388810 |
| news        | 113.4838 | 196360 | 9057279 |
| twitter     | 100.3097 | 881414 | 9234641 |
Because of the large size of each file, it is more practical to take only a small sample of the data, about 0.8% of the lines from each file, and look for interesting patterns in that sample.
set.seed(123)
sample_blogs <- sample(blogs, length(blogs) * 0.008)
sample_news <- sample(news, length(news) * 0.008)
sample_twitter <- sample(twitter, length(twitter) * 0.008)
sample_df <- c(sample_blogs, sample_news, sample_twitter)
corpus <- VCorpus(VectorSource(sample_df))
print(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 11317
The sampled text is combined into a 'corpus', which makes it easier to operate on the text datasets. Next, I define a function to preprocess it.
# Function for preprocessing the corpus
clean_corp <- function(corp_data) {
  corp_data <- tm_map(corp_data, removeNumbers)
  corp_data <- tm_map(corp_data, content_transformer(tolower))
  corp_data <- tm_map(corp_data, removeWords, stopwords("ru"))
  corp_data <- tm_map(corp_data, removePunctuation)
  corp_data <- tm_map(corp_data, stripWhitespace)
  return(corp_data)
}
corpus <- clean_corp(corpus)
Let’s build some plots.
options(mc.cores=1)
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
makeHistogramPlot <- function(data, label, color) {
  ggplot(data[1:10, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I(color))
}
corpus_dtm <- TermDocumentMatrix(corpus)
corpus_dtm
## <<TermDocumentMatrix (terms: 52576, documents: 11317)>>
## Non-/sparse entries: 135888/594866704
## Sparsity : 100%
## Maximal term length: 56
## Weighting : term frequency (tf)
freq1 <- getFreq(removeSparseTerms(corpus_dtm, 0.9999))
makeHistogramPlot(freq1, "10 Most Common Unigrams", color = 3)
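The wordcloud and RColorBrewer packages loaded above can also be used to visualise the same unigram frequencies; a small sketch based on the freq1 table computed above:
# Word cloud of the 100 most frequent unigrams in the cleaned sample
wordcloud(words = freq1$word, freq = freq1$freq,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))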
I do not have a final design for the prediction algorithm yet, but I think it should be possible to predict the next word from the previous 2-3 words, i.e. with an n-gram model.
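As a rough illustration, trigram counts could be collected from the sample with quanteda (loaded above); this is only a sketch that reuses the sample_df vector created earlier, not the final prediction model:
# Tokenize the sample, dropping punctuation, numbers, and Russian stopwords
toks <- tokens(sample_df, remove_punct = TRUE, remove_numbers = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("ru"))
# Build trigrams and inspect the ten most frequent ones
trigram_dfm <- dfm(tokens_ngrams(toks, n = 3))
topfeatures(trigram_dfm, 10)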