Exploratory Data Analysis. This script performs some analysis of corpora resulting from unsupervised clustering. The results are presented as graphs and tables. All of the code used to conduct the analysis is included in this script.
# Chunk defaults for the rendered report: echo the code and do not cache
# chunk results.
knitr::opts_chunk$set(echo = TRUE, cache=FALSE)
require(quanteda)
## Loading required package: quanteda
## quanteda version 0.9.9.23
## Using 31 of 32 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
## The following object is masked from 'package:base':
##
## sample
require(readtext)
## Loading required package: readtext
# Directory of the partitioning whose corpora are analysed in this script.
base <- "/home/larsbun/Dropbox-exclude/old-experiments/20170215/12675ef0-0c75-4f8a-aad3-4472cb60d5b9/ward-partitioning-0/"

# Extra features dropped when the document-feature matrices are built.
mystopwords <- c(
  "<", ">", "/",
  "s",
  ":", ";"
)
# Plot the 25 most frequent features of a document-feature matrix as a
# horizontal bar chart.
#
# Args:
#   mydfm: Document-feature matrix; the column sums are used as the feature
#          frequencies and the column names as the feature labels.
#   order: N-gram order of the features. Only used in the axis label and the
#          plot title.
#
# Returns:
#   A ggplot object.
plotdfm <- function(mydfm, order) {
  # One row per feature: total count plus the feature name.
  freq <- as.data.frame(slam::col_sums(mydfm))
  freq$word <- rownames(freq)
  rownames(freq) <- NULL
  colnames(freq) <- c("freq", "word")

  # The parameter `order` shadows base::order() inside this function, so the
  # sorting function is referenced explicitly.
  by_count <- base::order(freq$freq, decreasing = TRUE)
  freq_top25 <- head(freq[by_count, ], n = 25)

  # paste0() avoids the stray space paste() would insert ("1 -grams").
  ggplot2::ggplot(
    freq_top25,
    ggplot2::aes(x = stats::reorder(word, freq), y = freq)
  ) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::coord_flip() +
    ggplot2::xlab(paste0(order, "-gram words")) +
    ggplot2::ylab("Frequency") +
    ggplot2::geom_text(ggplot2::aes(label = freq), hjust = -0.20) +
    ggplot2::ggtitle(paste0("Most Common ", order, "-grams"))
}
# Draw a word cloud for a document-feature matrix.
#
# Args:
#   mydfm: Document-feature matrix handed to textplot_wordcloud().
#
# Returns:
#   Whatever textplot_wordcloud() returns; the function is called for the plot
#   it draws.
createwordcloud <- function(mydfm) {
  # Eight colours from the "Dark2" ColorBrewer palette.
  cloud_colors <- RColorBrewer::brewer.pal(8, "Dark2")

  textplot_wordcloud(
    mydfm,
    min.freq = 30,
    random.order = FALSE,
    rot.per = 0.25,
    max.words = 100,
    colors = cloud_colors
  )
}
library(ggplot2)
# Fit an LDA topic model to a document-feature matrix.
#
# Args:
#   mydfm: Document-feature matrix; it is converted to the "topicmodels"
#          format before fitting.
#   k:     Number of topics to fit. Defaults to 20.
#
# Returns:
#   The fitted model, invisibly. If the topicmodels package cannot be loaded,
#   require() warns and NULL is returned invisibly.
#
# The fitted model can be inspected with e.g. terms(fit, 5) or topics(fit, 3).
createtopmod <- function(mydfm, k = 20) {
  # require() also attaches topicmodels, which callers use to inspect the fit.
  if (!require(topicmodels)) {
    return(invisible(NULL))
  }
  lda_fit <- LDA(convert(mydfm, to = "topicmodels"), k = k)
  invisible(lda_fit)
}
Iterate over the corpora in the partitioning given by base, and save the results as a list of lists of corpus objects.
library(quanteda)
# Build one document-feature matrix per corpus and n-gram order:
# dfmarray_oro[[i]][[j]] is the j-gram matrix of clist_oro[[i]].
dfmarray_oro <- vector("list", 2)
for (i in 1:2) {
  # Preallocate one slot per n-gram order instead of growing the list.
  dfmlist <- vector("list", 3)
  for (j in 1:3) {
    # The argument must be spelled remove_punct; the spelling `removepunct`
    # is not recognised and is dropped with an "Argument removepunct not
    # used" warning, leaving punctuation in the n-grams.
    dfmlist[[j]] <- dfm(
      clist_oro[[i]],
      ngrams = j,
      remove_punct = TRUE,
      remove = mystopwords
    )
  }
  dfmarray_oro[[i]] <- dfmlist
}
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
## Warning in tokens.character(x, ...): Argument removepunct not used.
For each cluster, the top 25 n-grams up to order 3 will be displayed. Next, the 3-grams will be presented as wordclouds. Finally, the first 5 words of each topic of a Latent Dirichlet Allocation (LDA) topic model are presented for each cluster.
require(ggplot2)
# First entry of dfmarray_oro: bar charts of the most frequent 1-, 2- and
# 3-grams (the second index is the n-gram order).
plotdfm(dfmarray_oro[[1]][[1]],1)
plotdfm(dfmarray_oro[[1]][[2]],2)
plotdfm(dfmarray_oro[[1]][[3]],3)
# Word cloud of the 3-gram matrix.
createwordcloud(dfmarray_oro[[1]][[3]])
# Topic model fitted on the 3-gram matrix.
topmod1 <- createtopmod(dfmarray_oro[[1]][[3]])
## Loading required package: topicmodels
# Five terms per topic, transposed so that each row is one topic.
t(as.data.frame(terms(topmod1,5)))
## [,1] [,2]
## Topic 1 "gaa_ʼ_ela" "oromummaa_,_yaa"
## Topic 2 "!_!_!" "waltajjii_marii_oromoo"
## Topic 3 "qofa_miti_," "obbo_baaroo_tumsaa"
## Topic 4 "!_!_!" "?_?_?"
## Topic 5 "sadarkaa_2_ffaa" "m_/_b"
## Topic 6 "ta_ʼ_e" "yoo_ta_ʼ"
## Topic 7 "odoo_hin_taanee" "hin_taanee_,"
## Topic 8 "barattootaa_fi_hawaasa" "kana_dura_dhaabbachuu"
## Topic 9 "ummata_oromoo_irratti" "/_15_,"
## Topic 10 "osoo_hin_tahini" "obbo_odaa_xasee"
## Topic 11 "!_!_!" "magaala_las_vegas"
## Topic 12 "'_u_." "ilaa_fi_ilaamee"
## Topic 13 "qee_fi_qabeenya" "du_`_a"
## Topic 14 "'_a_." "!_!_!"
## Topic 15 "!_!_!" "osoo_hin_taanee"
## Topic 16 "yeroo_ammaa_kana" "mirga_abbaa_biyyummaa"
## Topic 17 "guidelines_of_qeerroo" "of_qeerroo_bilisummaa"
## Topic 18 "._._." "._watch_video"
## Topic 19 "\"_dubbadhu_\"" "macaafa_qulqulluu_keessatti"
## Topic 20 "'_a_." "tamsaasa_kana_keessatti"
## [,3] [,4]
## Topic 1 ",_yaa_oromummaa" "ta_ʼ_e"
## Topic 2 "…_…_…" "(_2_)"
## Topic 3 "danda_ʼ_a" "!_!_!"
## Topic 4 "…_…_…" "ta_ʼ_e"
## Topic 5 "akka_ta_ʼ" "qabsoo_bilisummaa_oromoo"
## Topic 6 "keessatti_baay_ʼ" "'_e_."
## Topic 7 "'_a_." "hordofaa_watch_video"
## Topic 8 "'_a_." "hanga_seeratti_nuuf"
## Topic 9 "15_,_ful" "bara_1990_′"
## Topic 10 "ta_ʼ_e" "qofa_osoo_hin"
## Topic 11 "oromoo_magaala_las" "hawaasa_oromoo_magaala"
## Topic 12 "'_e_." "qabsoo_oromoo_keessatti"
## Topic 13 "ummata_oromoo_fi" "mootummaan_abbaa_irree"
## Topic 14 "?_?_?" "warra_wangeelaa_oromoo"
## Topic 15 "/_06_/" "says_:_30"
## Topic 16 "fincilli_ummataa_fi" "seenaa_ummata_oromoo"
## Topic 17 "waraanni_mootummaa_wayyaanee" "godina_wallaggaa_lixaa"
## Topic 18 "._._watch" "danda_ʼ_a"
## Topic 19 "guyyaa_gootota_oromoo" "qeerroo_idil_-"
## Topic 20 "gaaffii_fi_deebii" "h_/_w"
## [,5]
## Topic 1 "gaa_ʼ_elaa"
## Topic 2 "dhiisanii_wal_irratti"
## Topic 3 "miti_,_bakka"
## Topic 4 "osoo_hin_taane"
## Topic 5 "\"_(_fakkeenya"
## Topic 6 "._._."
## Topic 7 "biyya_keessaa_fi"
## Topic 8 "adda_bilisummaa_oromoo"
## Topic 9 "1990_′_oota"
## Topic 10 "hin_qabnee_fi"
## Topic 11 "wal_-_gahii"
## Topic 12 "hawaasa_oromoo_gara"
## Topic 13 "goototni_barattootni_oromoo"
## Topic 14 "addi_bilisummaa_oromoo"
## Topic 15 ":_30_/"
## Topic 16 "ta_ʼ_e"
## Topic 17 "bakkoota_adda_addaatti"
## Topic 18 "ta_ʼ_e"
## Topic 19 "._._\""
## Topic 20 "kana_keessatti_:"
# Second entry of dfmarray_oro: bar charts of the most frequent 1-, 2- and
# 3-grams (the second index is the n-gram order).
plotdfm(dfmarray_oro[[2]][[1]],1)
plotdfm(dfmarray_oro[[2]][[2]],2)
plotdfm(dfmarray_oro[[2]][[3]],3)
# Word cloud of the 3-gram matrix.
createwordcloud(dfmarray_oro[[2]][[3]])
# Topic model fitted on the 3-gram matrix.
topmod2 <- createtopmod(dfmarray_oro[[2]][[3]])
# Five terms per topic, transposed so that each row is one topic.
t(as.data.frame(terms(topmod2,5)))
## [,1] [,2]
## Topic 1 "ta_ʼ_e" "ta_ʼ_u"
## Topic 2 "'_a_." "danda_'_a"
## Topic 3 "ta_ʼ_e" "akka_ta_ʼ"
## Topic 4 "'_a_." "._._."
## Topic 5 "walga_ʼ_iiwwan" "'_a_."
## Topic 6 "qajeelchaa_ta_ʼ" "utuu_hin_ta'in"
## Topic 7 "ta_ʼ_e" "qeentee_ta_ʼ"
## Topic 8 "'_u_." "'_a_."
## Topic 9 "ta_ʼ_e" "ta_ʼ_uusaa"
## Topic 10 "'_a_." "warra_kaaniif_hojii"
## Topic 11 "._._." "danda_ʼ_a"
## Topic 12 "fedhii_ofii_aarsaa" "kana_malees_,"
## Topic 13 "'_a_." "danda_'_a"
## Topic 14 "'_a_." "akka_ta_ʼ"
## Topic 15 "ta_ʼ_e" "utuu_hin_tahin"
## Topic 16 "ta_ʼ_e" "ta_ʼ_us"
## Topic 17 "'_a_." "ta_ʼ_e"
## Topic 18 "'_a_." "ta_'_a"
## Topic 19 "ta_ʼ_e" "akka_ta_ʼ"
## Topic 20 "nu_keessa_jiru" "ta_ʼ_e"
## [,3] [,4]
## Topic 1 "ʼ_u_malee" "danda_ʼ_a"
## Topic 2 "aboo_qabaniif_ulfina" "ulfina_akka_kenninu"
## Topic 3 "(_a_)" "(_b_)"
## Topic 4 "ta_'_a" "'_e_."
## Topic 5 "akka_ta_ʼ" "danda_'_a"
## Topic 6 ":_1_-" "hin_ta'in_,"
## Topic 7 "ta_ʼ_u" "gaa_ʼ_ela"
## Topic 8 "danda_'_u" "haata'u_malee_,"
## Topic 9 "baay_ʼ_ee" "danda_ʼ_a"
## Topic 10 "danda_'_a" "ta_'_a"
## Topic 11 "waa_ʼ_ee" "haata'u_malee_,"
## Topic 12 "ofii_aarsaa_gochuu" "saa_ʼ_ol"
## Topic 13 "ta_'_a" "haata'u_malee_,"
## Topic 14 "ta_ʼ_e" "danda_ʼ_a"
## Topic 15 "ʼ_e_maaliifi" "e_maaliifi_?"
## Topic 16 "ʼ_us_," "ta_ʼ_u"
## Topic 17 "kan_dandeenyu_akkamitti" "ta_'_a"
## Topic 18 "'_u_." "hanga_deebii_kennutti"
## Topic 19 "baay_ʼ_ee" "ta_ʼ_us"
## Topic 20 "'_a_." "'_e_."
## [,5]
## Topic 1 "akka_ta_ʼ"
## Topic 2 ":_1_-"
## Topic 3 "baay_ʼ_ee"
## Topic 4 "yommuu_ta'u_,"
## Topic 5 "ta_ʼ_e"
## Topic 6 "'_a_."
## Topic 7 "ta_ʼ_us"
## Topic 8 "ta_ʼ_e"
## Topic 9 "ta_ʼ_uu"
## Topic 10 "ta_ʼ_e"
## Topic 11 "mat_4_:"
## Topic 12 "amala_fedhii_ofii"
## Topic 13 "qayyabannaa_macaafa_qulqulluu"
## Topic 14 "ta_'_a"
## Topic 15 "hin_tahin_,"
## Topic 16 "ʼ_u_,"
## Topic 17 "dandeenyu_akkamitti_?"
## Topic 18 "deebii_kennutti_eegi"
## Topic 19 "bu_ʼ_uuraa"
## Topic 20 "utuu_hin_ta'in"