Exploratory Data Analysis. This script performs an analysis of the corpora resulting from unsupervised clustering. The results are presented as graphs and tables. All of the code used to conduct the analysis is included in this script.
knitr::opts_chunk$set(echo = TRUE, cache= FALSE)
require(quanteda)
## Loading required package: quanteda
## quanteda version 0.9.9.23
## Using 31 of 32 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
## The following object is masked from 'package:base':
##
## sample
require(readtext)
## Loading required package: readtext
# Directory holding the partitioning whose corpora are analysed below.
base <- "/home/larsbun/Dropbox-exclude/old-experiments/20170219/066bf000-c5ed-4716-b571-c2d7a14bdfe3/ward-partitioning-0/"

# Extra tokens passed as `remove =` when the document-feature matrices are built.
mystopwords <- c(
  "<", ">", "/",
  "s", ":", ";"
)
# Draw a horizontal bar chart of the 25 most frequent features of a
# document-feature matrix.
#
# mydfm: document-feature matrix whose column sums are the feature counts.
# order: n-gram order of the features; used only in the axis label and title.
#
# Returns the ggplot object (auto-printed when called at top level).
plotdfm <- function(mydfm, order) {
  # Total count of every feature across all documents.
  freq <- slam::col_sums(mydfm)
  freq <- as.data.frame(freq)
  freq$names <- rownames(freq)
  rownames(freq) <- NULL
  colnames(freq) <- c("freq", "word")

  # Keep the 25 most frequent features. `order` here resolves to
  # base::order() even though a parameter has the same name, because R skips
  # non-function bindings when looking up a name in call position.
  freq <- freq[order(freq$freq, decreasing = TRUE), ]
  freq_top25 <- head(freq, n = 25)

  # paste0() instead of paste(): paste() inserted a space and produced labels
  # such as "1 -gram words" and "Most Common 1 -grams".
  ggplot2::ggplot(
    freq_top25,
    ggplot2::aes(x = reorder(word, freq), y = freq)
  ) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::coord_flip() +
    ggplot2::xlab(paste0(order, "-gram words")) +
    ggplot2::ylab("Frequency") +
    ggplot2::geom_text(ggplot2::aes(label = freq), hjust = -0.20) +
    ggplot2::ggtitle(paste0("Most Common ", order, "-grams"))
}
# Render a word cloud of the features in a document-feature matrix.
#
# mydfm: document-feature matrix handed to textplot_wordcloud().
#
# At most 100 features with a frequency of at least 30 are requested, placed
# in non-random order, a quarter of them rotated, coloured with the 8-colour
# "Dark2" Brewer palette.
createwordcloud <- function(mydfm) {
  cloud_palette <- RColorBrewer::brewer.pal(8, "Dark2")
  textplot_wordcloud(
    mydfm,
    min.freq = 30,
    random.order = FALSE,
    rot.per = .25,
    max.words = 100,
    colors = cloud_palette
  )
}
library(ggplot2)
# Fit a Latent Dirichlet Allocation topic model to a document-feature matrix.
#
# mydfm: document-feature matrix; converted with convert(to = "topicmodels").
# k:     number of topics (defaults to 20, the value previously hard-coded).
# ...:   further arguments forwarded to topicmodels::LDA(), e.g.
#        control = list(seed = 1) for a reproducible fit.
#
# Returns the fitted model. Stops with an error when the topicmodels package
# is not installed; previously this case silently produced NULL, which only
# surfaced later as an obscure failure in terms().
createtopmod <- function(mydfm, k = 20, ...) {
  # require() rather than requireNamespace(): the package must be attached so
  # that a later top-level terms(<fit>, n) call finds the topicmodels method.
  if (!require(topicmodels)) {
    stop("Package 'topicmodels' is required to fit the topic model.",
         call. = FALSE)
  }
  fit <- LDA(convert(mydfm, to = "topicmodels"), k = k, ...)
  # get_terms(fit, 5)
  # topics(fit, 3)
  fit
}
Iterate over the corpora in the partitioning given by base, save as a list of lists of corpus objects.
library(quanteda)
# Build a list of lists of document-feature matrices:
# dfmarray_som[[i]][[j]] holds the j-gram dfm (j = 1..3) of clist_som[[i]]
# (i = 1..2).
#
# `remove_punct` is the spelling that tokens() recognises. The former
# `removepunct` was ignored with the warning "Argument removepunct not used",
# so punctuation was never stripped from the features.
#
# lapply() builds each list at its final size instead of growing it with c().
dfmarray_som <- lapply(1:2, function(i) {
  lapply(1:3, function(j) {
    dfm(clist_som[[i]], ngrams = j, remove_punct = TRUE, remove = mystopwords)
  })
})
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
For each cluster, the top 25 n-grams up to order 3 will be displayed. Next, the 3-grams will be presented as wordclouds. Finally, the first 5 words of each topic of a Latent Dirichlet Allocation (LDA) topic model are presented for each cluster.
# First entry of dfmarray_som (built from clist_som[[1]]).
# plotdfm() calls ggplot() unqualified, so ggplot2 has to be attached first.
require(ggplot2)
# Top-25 frequency bar charts; the second index is the n-gram order (1, 2, 3).
plotdfm(dfmarray_som[[1]][[1]],1)
plotdfm(dfmarray_som[[1]][[2]],2)
plotdfm(dfmarray_som[[1]][[3]],3)
# Word cloud of the 3-gram dfm.
createwordcloud(dfmarray_som[[1]][[3]])
# Topic model fitted on the unigram dfm; kept for the terms() call below.
topmod1 <- createtopmod(dfmarray_som[[1]][[1]])
## Loading required package: topicmodels
# Table of the first five terms of every topic, transposed so that each row is
# one topic.
t(as.data.frame(terms(topmod1,5)))
## [,1] [,2] [,3] [,4] [,5]
## Topic 1 "," "ka" "oo" "ku" "iyo"
## Topic 2 "," "ka" "oo" "ku" "iyo"
## Topic 3 "," "ka" "iyo" "oo" "ku"
## Topic 4 "." "ka" "oo" "," "ee"
## Topic 5 "," "oo" "ka" "ay" "ku"
## Topic 6 "oo" "," "ay" "-" "iyo"
## Topic 7 "," "ay" "oo" "-" "ah"
## Topic 8 "ka" "," "oo" "ay" "iyo"
## Topic 9 "oo" "ka" "," "ku" "iyo"
## Topic 10 "," "-" "ka" "ee" "oo"
## Topic 11 "oo" "u" "\"" "m" ","
## Topic 12 "." "oo" "," "ku" "ka"
## Topic 13 "," "iyo" "oo" "u" "ku"
## Topic 14 "." "oo" "ka" "ay" ","
## Topic 15 "oo" "ka" "," "ku" "ee"
## Topic 16 "," "oo" "iyo" "ka" "ay"
## Topic 17 "," "ka" "uu" "ku" "oo"
## Topic 18 "," "oo" "ay" "ku" "ka"
## Topic 19 "," "\"" "oo" "ku" "ka"
## Topic 20 "," "oo" "ee" "ka" "ayaa"
# Second entry of dfmarray_som: same sequence of views as for the first one.
# Top-25 frequency bar charts; the second index is the n-gram order (1, 2, 3).
plotdfm(dfmarray_som[[2]][[1]], 1)
plotdfm(dfmarray_som[[2]][[2]], 2)
plotdfm(dfmarray_som[[2]][[3]], 3)
# Word cloud of the 3-gram dfm (index 3), as the accompanying text states and
# as is done for dfmarray_som[[1]]. This call previously used the unigram dfm.
createwordcloud(dfmarray_som[[2]][[3]])
# Topic model on the unigram dfm (index 1), matching dfmarray_som[[1]]. It was
# previously fitted on the 3-gram dfm, so any recorded output from that run
# must be regenerated.
topmod2 <- createtopmod(dfmarray_som[[2]][[1]])
# Table of the first five terms of every topic, one row per topic.
t(as.data.frame(terms(topmod2, 5)))
## [,1] [,2] [,3]
## Topic 1 "._._." "?_?_?" "ka_mid_ah"
## Topic 2 "reer_binu_israa'iil" ",_oo_waxay" "ku_yidhi_,"
## Topic 3 "._._." "ka_mid_ah" "more_._."
## Topic 4 "._._." "(_hol_)" "hol_)_-"
## Topic 5 "._._." "._._(" ",_waayo_,"
## Topic 6 "._._." "ka_mid_ah" "ayaa_sheegay_in"
## Topic 7 "._._." "read_more_…" "ka_mid_ah"
## Topic 8 "._._." "ka_mid_ah" "ayaa_sheegay_in"
## Topic 9 "._._." "ayaa_sheegay_in" "ka_mid_ah"
## Topic 10 "._._." "ka_mid_ah" "]_-_waa"
## Topic 11 "xeer_madaxweynaha_jds" "madaxweynaha_jds_l" "jds_l_."
## Topic 12 "ka_mid_ah" "ayaa_sheegay_in" ",_2015_("
## Topic 13 "ka_mid_ah" ",_2015_(" "2015_(_hol"
## Topic 14 "._._." "ka_mid_ah" "oo_ka_mid"
## Topic 15 "ruuxa_quduuska_ah" "al_-_shabaab" "saddex_-_m"
## Topic 16 "._._." ",_2015_(" "2015_(_hol"
## Topic 17 "._._." "more_._." "[_bmc_]"
## Topic 18 "._._." "ka_mid_ah" "wuxuu_yiri_:"
## Topic 19 "ka_mid_ah" ",_waayo_," "aadan_iyo_xaawa"
## Topic 20 "._._." "._._madaxweynaha" "]_-_magacaygu"
## [,4] [,5]
## Topic 1 "!_!_!" "ku_meel_gaarka"
## Topic 2 ",_oo_wuxuu" "wuxuu_ku_yidhi"
## Topic 3 "[_…_]" "-_(_warsoor"
## Topic 4 ",_2015_(" "2015_(_hol"
## Topic 5 "ka_mid_ah" "jan_2016_-"
## Topic 6 ",_2015_(" "[_…_]"
## Topic 7 "._._admin" "yiri_:_\""
## Topic 8 "[_warsaxaafadeed_]" "ee_qaramada_midoobay"
## Topic 9 "19_,_2015" ",_2015_("
## Topic 10 "[_…_]" "(_scw_)"
## Topic 11 "*_xeer_madaxweynaha" "ka_mid_ah"
## Topic 12 "oo_ay_ka" "ay_ka_mid"
## Topic 13 "(_hol_)" "hol_)_-"
## Topic 14 "!_!_!" "ayaa_sheegay_in"
## Topic 15 "ka_mid_ah" "banii_-_aadanka"
## Topic 16 "(_hol_)" "hol_)_-"
## Topic 17 "bmc_]_-" "._._daawo"
## Topic 18 "(_bnn_)" "2016_(_bnn"
## Topic 19 "-_aad_ee" "ayaa_sheegay_in"
## Topic 20 "-_magacaygu_waa" "(_sh_m"