Exploratory Data Analysis. This script will perform some analysis of corpora resulting from unsupervised clustering. The results will be presented as graphs and tables. All of the code used to conduct the analysis are included in this script.
knitr::opts_chunk$set(echo = TRUE, cache= TRUE)
require(quanteda)
require(readtext)
base <- "/home/larsbun/Dropbox-exclude/old-experiments/20170210/d5bed410-c1e9-4bd4-b657-458e86cddff6/ward-partitioning-0/"
mystopwords <-c("<", ">", "/", "s", ":", ";")
plotdfm<-function(mydfm,order){
freq <- slam::col_sums(mydfm)
freq <- as.data.frame(freq)
freq$names <- rownames(freq)
rownames(freq) <- NULL
colnames(freq) <- c("freq","word")
freq <- freq[order(freq$freq,decreasing = TRUE),]
freq_top25 <- head(freq, n = 25)
ggplot(freq_top25, aes(x=reorder(word, freq), y=freq)) +
geom_bar(stat = "identity") +
coord_flip() +
xlab(paste(order, "-gram words")) +
ylab("Frequency") +
geom_text(aes(label=freq), hjust=-0.20) +
ggtitle(paste("Most Common", order, "-grams"))
}
createwordcloud<-function(mydfm){
textplot_wordcloud(mydfm, min.freq = 30, random.order = FALSE,
rot.per = .25, max.words=100,
colors = RColorBrewer::brewer.pal(8,"Dark2"))
}
library(ggplot2)
createtopmod<-function(mydfm){
if (require(topicmodels)) {
myLDAfit20 <- LDA(convert(mydfm, to = "topicmodels"), k = 20)
# get_terms(myLDAfit20, 5)
# topics(myLDAfit20, 3)
}
}
Iterate over the corpora in the partitioning given by base, save as a list of lists of corpus objects.
library(quanteda)
dfmarray_tig<-list()
for (i in 1:2){
dfmlist<-list()
for (j in 1:3){
dfmlist<- c(dfmlist, dfm(clist_tig[[i]], ngrams=j, removepunct=TRUE, remove = mystopwords))
}
dfmarray_tig<-c(dfmarray_tig, list(dfmlist))
}
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
For each cluster, the top 25 n-grams up to order 3 will be displayed. Next, the 3-grams will be presente as wordclouds. Finally, the first 5 words of each topic of a Latend Dirichlet Allocation (LDA) topic model is presented for each cluster.
require(ggplot2)
plotdfm(dfmarray_tig[[1]][[1]],1)
plotdfm(dfmarray_tig[[1]][[2]],2)
plotdfm(dfmarray_tig[[1]][[3]],3)
createwordcloud(dfmarray_tig[[1]][[3]])
topmod1 <- createtopmod(dfmarray_tig[[1]][[3]])
## Loading required package: topicmodels
t(as.data.frame(terms(topmod1,5)))
## [,1] [,2] [,3] [,4]
## Topic 1 "'_ዩ_።" "ናይ_የሆዋ_መሰኻኽር" "._._." "ቤተ_-_ክርስትያን"
## Topic 2 "'_ዩ_።" "._._." "እዩ_።_እዚ" "ቅድስተ_ቅዱሳን_ድንግል"
## Topic 3 "'_ዩ_።" "*_*_*" "እዩ_።_እቲ" "._._."
## Topic 4 "'_ዩ_።" "ማለት_ኢዩ_።" "._._." "ኣብ_'_ዚ"
## Topic 5 "'_ዩ_።" "ከምኡ_'_ውን" "*_*_*" "ዝናን_-_ባህታን"
## Topic 6 "እዩ_ነይሩ_።" "'_ዩ_።" "እዩ_።_እዚ" "ኣብ_'_ዚ"
## Topic 7 "'_ዩ_።" "*_*_*" "ቅዱሳን_ድንግል_ማርያም" "ቅድስተ_ቅዱሳን_ድንግል"
## Topic 8 "'_ዩ_።" "እዩ_።_እዚ" "እዩ_።_እቲ" "እዩ_።_ኣብ"
## Topic 9 "*_*_*" "._._." "መጽናዕቲ_መጽሓፍ_ቅዱስ" "ይኽእል_እዩ_።"
## Topic 10 "እዩ_።_እዚ" "መድኃኒና_ኢየሱስ_ክርስቶስ" "።_>_>" "ይኽእል_እዩ_።"
## Topic 11 "'_ዩ_።" "*_*_*" "እዩ_።_እዚ" "።_*_*"
## Topic 12 "-_-_-" "'_ዩ_።" "._._." "ዕደጋ_ፈፃሚ_ኣካል"
## Topic 13 "'_ዩ_።" "።_>_>" "እዩ_።_እዚ" "እዩ_።_እቲ"
## Topic 14 "'_ዩ_።" "._1_፡" "፡_1_-" "._._."
## Topic 15 "'_ዩ_።" "እዩ_:_:" "'_ዚ_፡" "ከምኡ_'_ውን"
## Topic 16 "'_ዩ_።" "ኢዩ_ነይሩ_።" "\"_ጄሪ_\"" "እዩ_።_እቲ"
## Topic 17 "'_ዩ_።" "ቤተ_-_ክርስትያን" "እዩ_።_እዚ" "እዩ_።_ኣብ"
## Topic 18 "._._." "ህዝባዊ_ግንባር_ሓርነት" "ግንባር_ሓርነት_ኤርትራ" "'_ዩ_።"
## Topic 19 "'_ዩ_።" "ናይ_የሆዋ_መሰኻኽር" "._._." "*_*_*"
## Topic 20 "እዩ_።_ኣብ" "እዩ_።_እዚ" "ይኽእል_እዩ_።" "ማለት_እዩ_።"
## [,5]
## Topic 1 "ኣብ_መጽሓፍ_ቅዱስ"
## Topic 2 "ቅዱሳን_ድንግል_ማርያም"
## Topic 3 "እዩ_።_ኣብ"
## Topic 4 "።_ኣብ_'"
## Topic 5 "ብዘይካ_'_ዚ"
## Topic 6 "._._."
## Topic 7 "ኣዴና_ቅድስተ_ቅዱሳን"
## Topic 8 "ይብል_።_("
## Topic 9 "'_ዩ_።"
## Topic 10 "ማሕበር_ፍቕሪ_ሃገር"
## Topic 11 "።_ይኹን_እምበር"
## Topic 12 "እዩ_።_ኣብ"
## Topic 13 "መድኃኒና_ኢየሱስ_ክርስቶስ"
## Topic 14 "._2_፡"
## Topic 15 "፣_ቅ_."
## Topic 16 "ኢዩ_።_እቲ"
## Topic 17 "._._."
## Topic 18 "እዩ_ነይሩ_።"
## Topic 19 "እዩ_።_እዚ"
## Topic 20 "እዩ_።_\""
plotdfm(dfmarray_tig[[2]][[1]],1)
plotdfm(dfmarray_tig[[2]][[2]],2)
plotdfm(dfmarray_tig[[2]][[3]],3)
createwordcloud(dfmarray_tig[[2]][[3]])
topmod2 <- createtopmod(dfmarray_tig[[2]][[3]])
t(as.data.frame(terms(topmod2,5)))
## [,1] [,2] [,3]
## Topic 1 "._._." "እዩ_።_(" "እዩ_ነይሩ_።"
## Topic 2 "እዩ_።_እዚ" "-_-_-" "ማለት_እዩ_።"
## Topic 3 "._._." "እዩ_።_\"" "ማለት_እዩ_።"
## Topic 4 "ከኣ_፡_\"" "ድማ_፡_\"" "\"_በሎም_።"
## Topic 5 "._._." "።_ይኹን_እምበር" "ይኹን_እምበር_:"
## Topic 6 "ቅድስት_ድንግል_ማርያም" "እዩ_።_(" "እዩ_።_እዚ"
## Topic 7 "'_ዩ_።" "ቅድስት_ድንግል_ማርያም" "ቤተ_-_ክርስቲያን"
## Topic 8 "እዩ_።_እዚ" "መድኃኒናን_ኢየሱስ_ክርስቶስ" "ጐይታናን_መድኃኒናን_ኢየሱስ"
## Topic 9 "._._." "እዩ_።_ኣብ" "እዩ_።_("
## Topic 10 "እዩ_።_(" "._._." "እዩ_ነይሩ_።"
## Topic 11 "እዩ_ነይሩ_።" "እዩ_።_(" "እዩ_።_እዚ"
## Topic 12 "\"_በሎም_።" "\"_በሎ_።" "ድማ_፡_\""
## Topic 13 "እዩ_እሞ_፡" "፡_በሎ_።" "፡_በሎም_።"
## Topic 14 "._._." "ማለት_እዩ_።" "እዩ_።_("
## Topic 15 "እዩ_።_(" ":_1_-" "።_(_1"
## Topic 16 "ቅድስት_ድንግል_ማርያም" "…_…_…" "ኣዴና_ቅድስት_ድንግል"
## Topic 17 "እዩ_።_(" "ይኽእል_እዩ_።" "._._."
## Topic 18 "ቅድስት_ድንግል_ማርያም" "ኣዴና_ቅድስት_ድንግል" "እዩ_።_እዚ"
## Topic 19 "._._." "*_*_*" "…_…_…"
## Topic 20 "ቅድስት_ድንግል_ማርያም" "._._." "መድኃኒናን_ኢየሱስ_ክርስቶስ"
## [,4] [,5]
## Topic 1 "እዩ_።_እዚ" "እዩ_።_እቲ"
## Topic 2 "እዩ_።_ኣብ" "ተዋህዶ_ቤተ_ክርስቲያን"
## Topic 3 "'_ዩ_።" "እዩ_።_ስለዚ"
## Topic 4 "\"_በሎ_።" "እዩ_።_("
## Topic 5 "እዩ_።_(" "ቅድስት_ድንግል_ማርያም"
## Topic 6 "ይኽእል_እዩ_።" "።_\"_("
## Topic 7 "ኣዴና_ቅድስት_ድንግል" "እዩ_።_("
## Topic 8 "ቅድስት_ቤተ_ክርስቲያን" "እዩ_።_("
## Topic 9 "እዩ_ነይሩ_።" "እዩ_።_እዚ"
## Topic 10 "እዩ_።_እዚ" "እዩ_።_\""
## Topic 11 "እዩ_።_እቲ" "እዩ_።_ኣብ"
## Topic 12 "+_+_+" "ቅድስት_ድንግል_ማርያም"
## Topic 13 "ንሱ_ኸኣ_፡" "፡_ጐይታይ_፡"
## Topic 14 "።_\"_(" "እዩ_።_እቲ"
## Topic 15 "፣_1_ይ" "(_1_ይ"
## Topic 16 "ቤተ_-_ክርስቲያን" "._._."
## Topic 17 "እዩ_።_ኣብ" "።_(_1"
## Topic 18 "'_ዩ_።" "እዩ_፣_፣"
## Topic 19 "ቅድስት_ድንግል_ማርያም" ">_>_>"
## Topic 20 "እዩ_።_\"" "እዩ_።_እዚ"