rm(list = ls())
date()
## [1] "Sat May 16 20:33:02 2020"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.1 magrittr_1.5 tools_3.6.1 htmltools_0.4.0
## [5] yaml_2.2.0 Rcpp_1.0.2 stringi_1.4.3 rmarkdown_1.16
## [9] knitr_1.25 stringr_1.4.0 xfun_0.10 digest_0.6.21
## [13] rlang_0.4.2 evaluate_0.14
##Библиотеки
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# library(ggplot2)
library(magrittr)
# library(tibble)
#
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following object is masked from 'package:quanteda':
##
## as.igraph
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(Matrix)
load(file = "Tokens_S2.RData")
load(file = "Tokens_S.RData")
load(file = "Corpus_S2.RData")
Corpus_tokens <- Tokens_S2 %>%
tokens_keep(pattern = c("*_S", "*_V", "*_A")
, padding = TRUE) # оставляю пустоты вместо удаленных слов
Sotu_collocations <- Corpus_tokens %>%
textstat_collocations(size = 2, min_count = 10)
head(Sotu_collocations, 25)
## collocation count count_nested length lambda z
## 1 окружающий_a среда_s 293 0 2 7.549318 58.24696
## 2 мочь_v позволять_v 227 0 2 4.931737 51.69799
## 3 природный_a ресурс_s 124 0 2 4.984881 42.66494
## 4 живой_a существо_s 71 0 2 6.547115 37.58478
## 5 вырубать_v лес_s 77 0 2 6.377009 34.11494
## 6 огромный_a количество_s 51 0 2 6.032517 32.60221
## 7 среда_s обитание_s 60 0 2 6.794779 30.94705
## 8 данный_a момент_s 40 0 2 6.712628 30.00179
## 9 вид_s животное_s 64 0 2 4.285341 29.34183
## 10 современный_a общество_s 39 0 2 5.796897 28.71699
## 11 окружающий_a мир_s 66 0 2 4.034064 28.60588
## 12 иметь_v право_s 36 0 2 6.618411 28.37995
## 13 вид_s растение_s 47 0 2 4.866328 28.22453
## 14 человек_s должный_a 147 0 2 2.679561 27.48300
## 15 единый_a целое_s 32 0 2 7.797010 27.10787
## 16 чистый_a воздух_s 34 0 2 5.783335 26.82597
## 17 живой_a организм_s 33 0 2 6.018694 26.82414
## 18 природный_a богатство_s 41 0 2 5.578288 26.18299
## 19 будущий_a поколение_s 52 0 2 9.300184 25.90481
## 20 вырубка_s лес_s 65 0 2 7.378873 25.71725
## 21 настоящий_a время_s 39 0 2 6.619773 25.68577
## 22 большой_a количество_s 33 0 2 5.236368 25.51468
## 23 красный_a книга_s 57 0 2 10.915033 25.49663
## 24 человек_s мочь_v 180 0 2 2.102636 25.23730
## 25 нести_v ответственность_s 25 0 2 8.101375 25.23263
binDTM <- Tokens_S2 %>%
tokens_keep(pattern = c("*_S", "*_V", "*_A")) %>%
dfm() %>%
dfm_trim(min_docfreq = 10, max_docfreq = Inf) %>%
dfm_weight("boolean")
#Перемножаем матрицы
coocCounts <- t(binDTM) %*% binDTM
as.matrix(binDTM[202:205, 202:205])
## features
## docs десяток_s дождь_s электростанция_s завод_s
## Bio5.48 0 0 0 0
## Bio5.49 0 0 0 0
## Bio5.50 0 0 0 0
## Bio5.51 0 0 0 0
as.matrix(coocCounts[202:205, 202:205])
## десяток_s дождь_s электростанция_s завод_s
## десяток_s 18 0 0 1
## дождь_s 0 32 1 2
## электростанция_s 0 1 25 15
## завод_s 1 2 15 132
In order to not only count joint occurrence we have to determine their significance. Different significance-measures can be used. We need also various counts to calculate the significance of the joint occurrence of a term i (coocTerm) with any other term j: * k - Number of all context units in the corpus * ki - Number of occurrences of coocTerm * kj - Number of occurrences of comparison term j * kij - Number of joint occurrences of coocTerm and j
These quantities can be calculated for any term coocTerm as follows:
coocTerm <- "природа_s"
k <- nrow(binDTM)
ki <- sum(binDTM[, coocTerm])
kj <- colSums(binDTM)
names(kj) <- colnames(binDTM)
kij <- coocCounts[coocTerm, ]
########## MI: log(k*kij / (ki * kj) ########
mutualInformationSig <- log(k * kij / (ki * kj))
mutualInformationSig <- mutualInformationSig[order(mutualInformationSig, decreasing = TRUE)]
########## DICE: 2 X&Y / X + Y ##############
dicesig <- 2 * kij / (ki + kj)
dicesig <- dicesig[order(dicesig, decreasing=TRUE)]
########## Log Likelihood ###################
logsig <- 2 * ((k * log(k)) - (ki * log(ki)) - (kj * log(kj)) + (kij * log(kij))
+ (k - ki - kj + kij) * log(k - ki - kj + kij)
+ (ki - kij) * log(ki - kij) + (kj - kij) * log(kj - kij)
- (k - ki) * log(k - ki) - (k - kj) * log(k - kj))
logsig <- logsig[order(logsig, decreasing=T)]
# Put all significance statistics in one Data-Frame
resultOverView <- data.frame(
names(sort(kij, decreasing=T)[1:10]), sort(kij, decreasing=T)[1:10],
names(mutualInformationSig[1:10]), mutualInformationSig[1:10],
names(dicesig[1:10]), dicesig[1:10],
names(logsig[1:10]), logsig[1:10],
row.names = NULL)
colnames(resultOverView) <- c("Freq-terms", "Freq", "MI-terms", "MI", "Dice-Terms", "Dice", "LL-Terms", "LL")
print(resultOverView)
## Freq-terms Freq MI-terms MI Dice-Terms Dice
## 1 природа_s 3793 природа_s 0.9730214 природа_s 1.00000000
## 2 человек_s 2122 усложнение_s 0.9730214 человек_s 0.56631972
## 3 отношение_s 767 самоорганизующийся_a 0.9730214 отношение_s 0.32240437
## 4 мочь_v 534 отдельность_s 0.9730214 мочь_v 0.22125544
## 5 быть_v 515 благодарить_v 0.9730214 быть_v 0.20526106
## 6 должный_a 321 храм_s 0.9730214 должный_a 0.14874884
## 7 позволять_v 310 матушка_s 0.9730214 позволять_v 0.14509712
## 8 жизнь_s 280 покорять_v 0.9265014 жизнь_s 0.12817578
## 9 мир_s 209 давний_a 0.9040285 ресурс_s 0.09653873
## 10 ресурс_s 205 подстраивать_v 0.8989134 мир_s 0.09653580
## LL-Terms LL
## 1 человек_s 947.78545
## 2 отношение_s 778.31567
## 3 позволять_v 148.08440
## 4 должный_a 125.58213
## 5 гармония_s 93.33184
## 6 мочь_v 91.34075
## 7 общество_s 90.89679
## 8 относиться_v 86.39546
## 9 часть_s 83.35449
## 10 животное_s 83.27076
calculateCoocStatistics <- function(coocTerm, binDTM, measure = "DICE") {
# Ensure Matrix (SparseM} or matrix {base} format
require(Matrix)
# Ensure binary DTM
if (any(binDTM > 1)) {
binDTM[binDTM > 1] <- 1
}
# calculate cooccurrence counts
coocCounts <- t(binDTM) %*% binDTM
# retrieve numbers for statistic calculation
k <- nrow(binDTM)
ki <- sum(binDTM[, coocTerm])
kj <- colSums(binDTM)
names(kj) <- colnames(binDTM)
kij <- coocCounts[coocTerm, ]
# calculate statistics
switch(measure,
DICE = {
dicesig <- 2 * kij / (ki + kj)
dicesig <- dicesig[order(dicesig, decreasing=TRUE)]
sig <- dicesig
},
LOGLIK = {
logsig <- 2 * ((k * log(k)) - (ki * log(ki)) - (kj * log(kj)) + (kij * log(kij))
+ (k - ki - kj + kij) * log(k - ki - kj + kij)
+ (ki - kij) * log(ki - kij) + (kj - kij) * log(kj - kij)
- (k - ki) * log(k - ki) - (k - kj) * log(k - kj))
logsig <- logsig[order(logsig, decreasing=T)]
sig <- logsig
},
MI = {
mutualInformationSig <- log(k * kij / (ki * kj))
mutualInformationSig <- mutualInformationSig[order(mutualInformationSig, decreasing = TRUE)]
sig <- mutualInformationSig
},
{
sig <- sort(kij, decreasing = TRUE)
}
)
sig <- sig[-match(coocTerm, names(sig))]
return(sig)
}
numberOfCoocs <- 15
# Determination of the term of which co-competitors are to be measured.
coocTerm <- "человек_s"
coocs <- calculateCoocStatistics(coocTerm, binDTM, measure = "LOGLIK")
# Display the numberOfCoocs main terms
print(coocs[1:numberOfCoocs])
## природа_s отношение_s позволять_v мочь_v
## 947.78545 319.64677 270.75216 174.16391
## должный_a деятельность_s относиться_v венец_s
## 121.64568 59.31559 57.88473 54.02838
## потребность_s влияние_s ребенок_s неразрывный_a
## 48.16447 46.53840 44.90108 40.73344
## царь_s взаимоотношение_s человечество_s
## 39.16774 38.99199 38.76586
resultGraph <- data.frame(from = character(), to = character(), sig = numeric(0))
# The structure of the temporary graph object is equal to that of the resultGraph
tmpGraph <- data.frame(from = character(), to = character(), sig = numeric(0))
# Fill the data.frame to produce the correct number of lines
tmpGraph[1:numberOfCoocs, 3] <- coocs[1:numberOfCoocs]
# Entry of the search word into the first column in all lines
tmpGraph[, 1] <- coocTerm
# Entry of the co-occurrences into the second column of the respective line
tmpGraph[, 2] <- names(coocs)[1:numberOfCoocs]
# Set the significances
tmpGraph[, 3] <- coocs[1:numberOfCoocs]
# Attach the triples to resultGraph
resultGraph <- rbind(resultGraph, tmpGraph)
# Iteration over the most significant numberOfCoocs co-occurrences of the search term
for (i in 1:numberOfCoocs){
# Calling up the co-occurrence calculation for term i from the search words co-occurrences
newCoocTerm <- names(coocs)[i]
coocs2 <- calculateCoocStatistics(newCoocTerm, binDTM, measure="LOGLIK")
#print the co-occurrences
coocs2[1:10]
# Structure of the temporary graph object
tmpGraph <- data.frame(from = character(), to = character(), sig = numeric(0))
tmpGraph[1:numberOfCoocs, 3] <- coocs2[1:numberOfCoocs]
tmpGraph[, 1] <- newCoocTerm
tmpGraph[, 2] <- names(coocs2)[1:numberOfCoocs]
tmpGraph[, 3] <- coocs2[1:numberOfCoocs]
#Append the result to the result graph
resultGraph <- rbind(resultGraph, tmpGraph[2:length(tmpGraph[, 1]), ])
}
# Sample of some examples from resultGraph
resultGraph[sample(nrow(resultGraph), 6), ]
## from to sig
## 133 позволять_v симбиоз_s 21.87953
## 94 мочь_v научать_v 27.52414
## 115 должный_a благодарный_a 34.81168
## 62 отношение_s потребительский_a 120.59892
## 37 относиться_v природа_s 86.39546
## 73 позволять_v иллюзия_s 33.87950
require(igraph)
# Set the graph and type
graphNetwork <- graph.data.frame(resultGraph, directed = F)
# Identification of all nodes with less than 2 edges
graphVs <- V(graphNetwork)[degree(graphNetwork) < 2]
# These edges are removed from the graph
graphNetwork <- delete.vertices(graphNetwork, graphVs)
# Assign colors to edges and nodes (searchterm blue, rest orange)
V(graphNetwork)$color <- ifelse(V(graphNetwork)$name == coocTerm, 'cornflowerblue', 'orange')
# Edges with a significance of at least 50% of the maximum sig- nificance in the graph are drawn in orange
halfMaxSig <- max(E(graphNetwork)$sig) * 0.5
E(graphNetwork)$color <- ifelse(E(graphNetwork)$sig > halfMaxSig, "coral", "azure3")
# Disable edges with radius
E(graphNetwork)$curved <- 0
# Size the nodes by their degree of networking
V(graphNetwork)$size <- log(degree(graphNetwork)) * 5
# All nodes must be assigned a standard minimum-size
V(graphNetwork)$size[V(graphNetwork)$size < 5] <- 3
# edge thickness
E(graphNetwork)$width <- 2
# Define the frame and spacing for the plot
par(mai=c(0,0,1,0))
# Finaler Plot
plot(graphNetwork,
layout = layout.fruchterman.reingold, # Force Directed Layout
main = paste(coocTerm, ' Graph'),
vertex.label.family = "sans",
vertex.label.cex = 0.8,
vertex.shape = "circle",
vertex.label.dist = 0.5, # Labels of the nodes moved slightly
vertex.frame.color = 'darkolivegreen',
vertex.label.color = 'black', # Color of node names
vertex.label.font = 2, # Font of node names
vertex.label = V(graphNetwork)$name, # node names
vertex.label.cex = 1 # font size of node names
)
require(igraph)
coocTerm <- "природа_s"
# Set the graph and type
graphNetwork <- graph.data.frame(resultGraph, directed = F)
# Identification of all nodes with less than 2 edges
graphVs <- V(graphNetwork)[degree(graphNetwork) < 2]
# These edges are removed from the graph
graphNetwork <- delete.vertices(graphNetwork, graphVs)
# Assign colors to edges and nodes (searchterm blue, rest orange)
V(graphNetwork)$color <- ifelse(V(graphNetwork)$name == coocTerm, 'cornflowerblue', 'orange')
# Edges with a significance of at least 50% of the maximum sig- nificance in the graph are drawn in orange
halfMaxSig <- max(E(graphNetwork)$sig) * 0.5
E(graphNetwork)$color <- ifelse(E(graphNetwork)$sig > halfMaxSig, "coral", "azure3")
# Disable edges with radius
E(graphNetwork)$curved <- 0
# Size the nodes by their degree of networking
V(graphNetwork)$size <- log(degree(graphNetwork)) * 5
# All nodes must be assigned a standard minimum-size
V(graphNetwork)$size[V(graphNetwork)$size < 5] <- 3
# edge thickness
E(graphNetwork)$width <- 2
# Define the frame and spacing for the plot
par(mai=c(0,0,1,0))
# Finaler Plot
plot(graphNetwork,
layout = layout.fruchterman.reingold, # Force Directed Layout
main = paste(coocTerm, ' Graph'),
vertex.label.family = "sans",
vertex.label.cex = 0.8,
vertex.shape = "circle",
vertex.label.dist = 0.5, # Labels of the nodes moved slightly
vertex.frame.color = 'darkolivegreen',
vertex.label.color = 'black', # Color of node names
vertex.label.font = 2, # Font of node names
vertex.label = V(graphNetwork)$name, # node names
vertex.label.cex = 1 # font size of node names
)