tm4ss.github.io

rm(list = ls())
date()

## [1] "Sat May 16 20:33:02 2020"

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.1  magrittr_1.5    tools_3.6.1     htmltools_0.4.0
##  [5] yaml_2.2.0      Rcpp_1.0.2      stringi_1.4.3   rmarkdown_1.16 
##  [9] knitr_1.25      stringr_1.4.0   xfun_0.10       digest_0.6.21  
## [13] rlang_0.4.2     evaluate_0.14

##Библиотеки

library(quanteda)

## Package version: 1.5.1

## Parallel computing: 2 of 4 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
## 
##     View

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
# library(ggplot2)
library(magrittr)
# library(tibble)
# 
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following object is masked from 'package:quanteda':
## 
##     as.igraph

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(Matrix)

Импорт данных

load(file = "Tokens_S2.RData")
load(file = "Tokens_S.RData")
load(file = "Corpus_S2.RData")

Словосочетания

Corpus_tokens <- Tokens_S2 %>% 
        tokens_keep(pattern = c("*_S", "*_V", "*_A")
                    , padding = TRUE) # оставляю пустоты вместо удаленных слов

Sotu_collocations <- Corpus_tokens %>% 
        textstat_collocations(size = 2, min_count = 10)

head(Sotu_collocations, 25)

##                  collocation count count_nested length    lambda        z
## 1       окружающий_a среда_s   293            0      2  7.549318 58.24696
## 2         мочь_v позволять_v   227            0      2  4.931737 51.69799
## 3       природный_a ресурс_s   124            0      2  4.984881 42.66494
## 4         живой_a существо_s    71            0      2  6.547115 37.58478
## 5           вырубать_v лес_s    77            0      2  6.377009 34.11494
## 6    огромный_a количество_s    51            0      2  6.032517 32.60221
## 7         среда_s обитание_s    60            0      2  6.794779 30.94705
## 8          данный_a момент_s    40            0      2  6.712628 30.00179
## 9           вид_s животное_s    64            0      2  4.285341 29.34183
## 10  современный_a общество_s    39            0      2  5.796897 28.71699
## 11        окружающий_a мир_s    66            0      2  4.034064 28.60588
## 12           иметь_v право_s    36            0      2  6.618411 28.37995
## 13          вид_s растение_s    47            0      2  4.866328 28.22453
## 14       человек_s должный_a   147            0      2  2.679561 27.48300
## 15          единый_a целое_s    32            0      2  7.797010 27.10787
## 16         чистый_a воздух_s    34            0      2  5.783335 26.82597
## 17        живой_a организм_s    33            0      2  6.018694 26.82414
## 18   природный_a богатство_s    41            0      2  5.578288 26.18299
## 19     будущий_a поколение_s    52            0      2  9.300184 25.90481
## 20           вырубка_s лес_s    65            0      2  7.378873 25.71725
## 21       настоящий_a время_s    39            0      2  6.619773 25.68577
## 22    большой_a количество_s    33            0      2  5.236368 25.51468
## 23         красный_a книга_s    57            0      2 10.915033 25.49663
## 24          человек_s мочь_v   180            0      2  2.102636 25.23730
## 25 нести_v ответственность_s    25            0      2  8.101375 25.23263

Совместная встречаемость

binDTM <- Tokens_S2 %>% 
        tokens_keep(pattern = c("*_S", "*_V", "*_A")) %>% 
        dfm() %>% 
        dfm_trim(min_docfreq = 10, max_docfreq = Inf) %>% 
        dfm_weight("boolean")

#Перемножаем матрицы
coocCounts <- t(binDTM) %*% binDTM

as.matrix(binDTM[202:205, 202:205])

##          features
## docs      десяток_s дождь_s электростанция_s завод_s
##   Bio5.48         0       0                0       0
##   Bio5.49         0       0                0       0
##   Bio5.50         0       0                0       0
##   Bio5.51         0       0                0       0

as.matrix(coocCounts[202:205, 202:205])

##                  десяток_s дождь_s электростанция_s завод_s
## десяток_s               18       0                0       1
## дождь_s                  0      32                1       2
## электростанция_s         0       1               25      15
## завод_s                  1       2               15     132

In order to not only count joint occurrence we have to determine their significance. Different significance-measures can be used. We need also various counts to calculate the significance of the joint occurrence of a term i (coocTerm) with any other term j: * k - Number of all context units in the corpus * ki - Number of occurrences of coocTerm * kj - Number of occurrences of comparison term j * kij - Number of joint occurrences of coocTerm and j

These quantities can be calculated for any term coocTerm as follows:

coocTerm <- "природа_s"
k <- nrow(binDTM)
ki <- sum(binDTM[, coocTerm])
kj <- colSums(binDTM)
names(kj) <- colnames(binDTM)
kij <- coocCounts[coocTerm, ]

########## MI: log(k*kij / (ki * kj) ########
mutualInformationSig <- log(k * kij / (ki * kj))
mutualInformationSig <- mutualInformationSig[order(mutualInformationSig, decreasing = TRUE)]

########## DICE: 2 X&Y / X + Y ##############
dicesig <- 2 * kij / (ki + kj)
dicesig <- dicesig[order(dicesig, decreasing=TRUE)]
  
########## Log Likelihood ###################
logsig <- 2 * ((k * log(k)) - (ki * log(ki)) - (kj * log(kj)) + (kij * log(kij)) 
          + (k - ki - kj + kij) * log(k - ki - kj + kij) 
          + (ki - kij) * log(ki - kij) + (kj - kij) * log(kj - kij) 
          - (k - ki) * log(k - ki) - (k - kj) * log(k - kj))
logsig <- logsig[order(logsig, decreasing=T)]

# Put all significance statistics in one Data-Frame
resultOverView <- data.frame(
  names(sort(kij, decreasing=T)[1:10]), sort(kij, decreasing=T)[1:10],
  names(mutualInformationSig[1:10]), mutualInformationSig[1:10], 
  names(dicesig[1:10]), dicesig[1:10], 
  names(logsig[1:10]), logsig[1:10],
  row.names = NULL)
colnames(resultOverView) <- c("Freq-terms", "Freq", "MI-terms", "MI", "Dice-Terms", "Dice", "LL-Terms", "LL")
print(resultOverView)

##     Freq-terms Freq             MI-terms        MI  Dice-Terms       Dice
## 1    природа_s 3793            природа_s 0.9730214   природа_s 1.00000000
## 2    человек_s 2122         усложнение_s 0.9730214   человек_s 0.56631972
## 3  отношение_s  767 самоорганизующийся_a 0.9730214 отношение_s 0.32240437
## 4       мочь_v  534        отдельность_s 0.9730214      мочь_v 0.22125544
## 5       быть_v  515        благодарить_v 0.9730214      быть_v 0.20526106
## 6    должный_a  321               храм_s 0.9730214   должный_a 0.14874884
## 7  позволять_v  310            матушка_s 0.9730214 позволять_v 0.14509712
## 8      жизнь_s  280           покорять_v 0.9265014     жизнь_s 0.12817578
## 9        мир_s  209             давний_a 0.9040285    ресурс_s 0.09653873
## 10    ресурс_s  205       подстраивать_v 0.8989134       мир_s 0.09653580
##        LL-Terms        LL
## 1     человек_s 947.78545
## 2   отношение_s 778.31567
## 3   позволять_v 148.08440
## 4     должный_a 125.58213
## 5    гармония_s  93.33184
## 6        мочь_v  91.34075
## 7    общество_s  90.89679
## 8  относиться_v  86.39546
## 9       часть_s  83.35449
## 10   животное_s  83.27076

calculateCoocStatistics <- function(coocTerm, binDTM, measure = "DICE") {
  
  # Ensure Matrix (SparseM} or matrix {base} format
  require(Matrix)
 
  # Ensure binary DTM
  if (any(binDTM > 1)) {
    binDTM[binDTM > 1] <- 1
  }
  
  # calculate cooccurrence counts
  coocCounts <- t(binDTM) %*% binDTM
  
  # retrieve numbers for statistic calculation
  k <- nrow(binDTM)
  ki <- sum(binDTM[, coocTerm])
  kj <- colSums(binDTM)
  names(kj) <- colnames(binDTM)
  kij <- coocCounts[coocTerm, ]
  
  # calculate statistics
  switch(measure, 
         DICE = {
           dicesig <- 2 * kij / (ki + kj)
           dicesig <- dicesig[order(dicesig, decreasing=TRUE)]
           sig <- dicesig
         },
         LOGLIK = {
           logsig <- 2 * ((k * log(k)) - (ki * log(ki)) - (kj * log(kj)) + (kij * log(kij)) 
                          + (k - ki - kj + kij) * log(k - ki - kj + kij) 
                          + (ki - kij) * log(ki - kij) + (kj - kij) * log(kj - kij) 
                          - (k - ki) * log(k - ki) - (k - kj) * log(k - kj))
           logsig <- logsig[order(logsig, decreasing=T)]
           sig <- logsig    
         },
         MI = {
           mutualInformationSig <- log(k * kij / (ki * kj))
           mutualInformationSig <- mutualInformationSig[order(mutualInformationSig, decreasing = TRUE)]
           sig <- mutualInformationSig    
         },
         {
           sig <- sort(kij, decreasing = TRUE)
         }
        )
  sig <- sig[-match(coocTerm, names(sig))]
  return(sig)
}

numberOfCoocs <- 15
# Determination of the term of which co-competitors are to be measured.
coocTerm <- "человек_s"

coocs <- calculateCoocStatistics(coocTerm, binDTM, measure = "LOGLIK")
# Display the numberOfCoocs main terms
print(coocs[1:numberOfCoocs])

##         природа_s       отношение_s       позволять_v            мочь_v 
##         947.78545         319.64677         270.75216         174.16391 
##         должный_a    деятельность_s      относиться_v           венец_s 
##         121.64568          59.31559          57.88473          54.02838 
##     потребность_s         влияние_s         ребенок_s     неразрывный_a 
##          48.16447          46.53840          44.90108          40.73344 
##            царь_s взаимоотношение_s    человечество_s 
##          39.16774          38.99199          38.76586

resultGraph <- data.frame(from = character(), to = character(), sig = numeric(0))

# The structure of the temporary graph object is equal to that of the resultGraph
tmpGraph <- data.frame(from = character(), to = character(), sig = numeric(0))

# Fill the data.frame to produce the correct number of lines
tmpGraph[1:numberOfCoocs, 3] <- coocs[1:numberOfCoocs]
# Entry of the search word into the first column in all lines
tmpGraph[, 1] <- coocTerm
# Entry of the co-occurrences into the second column of the respective line
tmpGraph[, 2] <- names(coocs)[1:numberOfCoocs]
# Set the significances
tmpGraph[, 3] <- coocs[1:numberOfCoocs]

# Attach the triples to resultGraph
resultGraph <- rbind(resultGraph, tmpGraph)

# Iteration over the most significant numberOfCoocs co-occurrences of the search term
for (i in 1:numberOfCoocs){
  
  # Calling up the co-occurrence calculation for term i from the search words co-occurrences
  newCoocTerm <- names(coocs)[i]
  coocs2 <- calculateCoocStatistics(newCoocTerm, binDTM, measure="LOGLIK")
  
  #print the co-occurrences
  coocs2[1:10]
  
  # Structure of the temporary graph object
  tmpGraph <- data.frame(from = character(), to = character(), sig = numeric(0))
  tmpGraph[1:numberOfCoocs, 3] <- coocs2[1:numberOfCoocs]
  tmpGraph[, 1] <- newCoocTerm
  tmpGraph[, 2] <- names(coocs2)[1:numberOfCoocs]
  tmpGraph[, 3] <- coocs2[1:numberOfCoocs]
  
  #Append the result to the result graph
  resultGraph <- rbind(resultGraph, tmpGraph[2:length(tmpGraph[, 1]), ])
}

# Sample of some examples from resultGraph
resultGraph[sample(nrow(resultGraph), 6), ]

##             from                to       sig
## 133  позволять_v         симбиоз_s  21.87953
## 94        мочь_v         научать_v  27.52414
## 115    должный_a     благодарный_a  34.81168
## 62   отношение_s потребительский_a 120.59892
## 37  относиться_v         природа_s  86.39546
## 73   позволять_v         иллюзия_s  33.87950

require(igraph)

# Set the graph and type
graphNetwork <- graph.data.frame(resultGraph, directed = F)

# Identification of all nodes with less than 2 edges
graphVs <- V(graphNetwork)[degree(graphNetwork) < 2]
# These edges are removed from the graph
graphNetwork <- delete.vertices(graphNetwork, graphVs) 

# Assign colors to edges and nodes (searchterm blue, rest orange)
V(graphNetwork)$color <- ifelse(V(graphNetwork)$name == coocTerm, 'cornflowerblue', 'orange') 

# Edges with a significance of at least 50% of the maximum sig- nificance in the graph are drawn in orange
halfMaxSig <- max(E(graphNetwork)$sig) * 0.5
E(graphNetwork)$color <- ifelse(E(graphNetwork)$sig > halfMaxSig, "coral", "azure3")

# Disable edges with radius
E(graphNetwork)$curved <- 0 
# Size the nodes by their degree of networking
V(graphNetwork)$size <- log(degree(graphNetwork)) * 5

# All nodes must be assigned a standard minimum-size
V(graphNetwork)$size[V(graphNetwork)$size < 5] <- 3 

# edge thickness
E(graphNetwork)$width <- 2

# Define the frame and spacing for the plot
par(mai=c(0,0,1,0)) 

# Finaler Plot
plot(graphNetwork,              
     layout = layout.fruchterman.reingold,  # Force Directed Layout 
     main = paste(coocTerm, ' Graph'),
     vertex.label.family = "sans",
     vertex.label.cex = 0.8,
     vertex.shape = "circle",
     vertex.label.dist = 0.5,           # Labels of the nodes moved slightly
     vertex.frame.color = 'darkolivegreen',
     vertex.label.color = 'black',      # Color of node names
     vertex.label.font = 2,         # Font of node names
     vertex.label = V(graphNetwork)$name,       # node names
     vertex.label.cex = 1 # font size of node names 
)

require(igraph)
coocTerm <- "природа_s"


# Set the graph and type
graphNetwork <- graph.data.frame(resultGraph, directed = F)

# Identification of all nodes with less than 2 edges
graphVs <- V(graphNetwork)[degree(graphNetwork) < 2]
# These edges are removed from the graph
graphNetwork <- delete.vertices(graphNetwork, graphVs) 

# Assign colors to edges and nodes (searchterm blue, rest orange)
V(graphNetwork)$color <- ifelse(V(graphNetwork)$name == coocTerm, 'cornflowerblue', 'orange') 

# Edges with a significance of at least 50% of the maximum sig- nificance in the graph are drawn in orange
halfMaxSig <- max(E(graphNetwork)$sig) * 0.5
E(graphNetwork)$color <- ifelse(E(graphNetwork)$sig > halfMaxSig, "coral", "azure3")

# Disable edges with radius
E(graphNetwork)$curved <- 0 
# Size the nodes by their degree of networking
V(graphNetwork)$size <- log(degree(graphNetwork)) * 5

# All nodes must be assigned a standard minimum-size
V(graphNetwork)$size[V(graphNetwork)$size < 5] <- 3 

# edge thickness
E(graphNetwork)$width <- 2

# Define the frame and spacing for the plot
par(mai=c(0,0,1,0)) 

# Finaler Plot
plot(graphNetwork,              
     layout = layout.fruchterman.reingold,  # Force Directed Layout 
     main = paste(coocTerm, ' Graph'),
     vertex.label.family = "sans",
     vertex.label.cex = 0.8,
     vertex.shape = "circle",
     vertex.label.dist = 0.5,           # Labels of the nodes moved slightly
     vertex.frame.color = 'darkolivegreen',
     vertex.label.color = 'black',      # Color of node names
     vertex.label.font = 2,         # Font of node names
     vertex.label = V(graphNetwork)$name,       # node names
     vertex.label.cex = 1 # font size of node names 
)

tm4ss.github.io

Petr Ya

5/15/2020

Импорт данных

Словосочетания

Совместная встречаемость