Exploratory Data Analysis - HaBiT Project - Somali

Exploratory Data Analysis. This script performs an analysis of the corpora resulting from unsupervised clustering. The results are presented as graphs and tables. All of the code used to conduct the analysis is included in this script.

# Chunk defaults for the report: echo is switched on and caching is switched off.
knitr::opts_chunk$set(echo = TRUE, cache= FALSE)

Preparatory steps

Loading libraries

require(quanteda)

## Loading required package: quanteda

## quanteda version 0.9.9.23

## Using 31 of 32 cores for parallel computing

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
## 
##     View

## The following object is masked from 'package:base':
## 
##     sample

require(readtext)

## Loading required package: readtext

# Directory of the partitioning whose corpora are analysed in this report.
base <- "/home/larsbun/Dropbox-exclude/old-experiments/20170219/066bf000-c5ed-4716-b571-c2d7a14bdfe3/ward-partitioning-0/"

# Tokens handed to dfm() through its `remove` argument further down.
mystopwords <- c("<", ">", "/", "s", ":", ";")

Supporting functions

#' Plot the most frequent features of a document-feature matrix.
#'
#' Sums every column of `mydfm`, keeps the `top_n` largest totals and draws
#' them as a horizontal bar chart with the total printed next to each bar.
#'
#' @param mydfm A document-feature matrix that `slam::col_sums()` accepts.
#'   The names of the resulting sums are used as bar labels.
#' @param order N-gram order of the features; only used to build the axis
#'   label and the plot title.
#' @param top_n Number of most frequent features to display. Defaults to 25,
#'   the value that used to be hard-coded.
#' @return The ggplot object describing the chart.
plotdfm <- function(mydfm, order, top_n = 25) {
  totals <- as.data.frame(slam::col_sums(mydfm))
  totals$word <- rownames(totals)
  rownames(totals) <- NULL
  colnames(totals) <- c("freq", "word")

  # `order` is also a parameter of this function, so the sorting function is
  # called through its namespace to keep the two visibly apart.
  totals <- totals[base::order(totals$freq, decreasing = TRUE), ]
  top_features <- head(totals, n = top_n)

  # paste0() is used for the labels: paste() inserted a blank before
  # "-gram", producing e.g. "1 -gram words" and "Most Common 1 -grams".
  ggplot2::ggplot(
    top_features,
    ggplot2::aes(x = reorder(word, freq), y = freq)
  ) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::coord_flip() +
    ggplot2::xlab(paste0(order, "-gram words")) +
    ggplot2::ylab("Frequency") +
    ggplot2::geom_text(ggplot2::aes(label = freq), hjust = -0.20) +
    ggplot2::ggtitle(paste0("Most Common ", order, "-grams"))
}

# Draw a word cloud for `mydfm` by passing it to textplot_wordcloud() with
# min.freq = 30, max.words = 100, random.order = FALSE, rot.per = .25 and the
# eight-colour "Dark2" palette from RColorBrewer.
createwordcloud <- function(mydfm) {
  cloud_colours <- RColorBrewer::brewer.pal(8, "Dark2")
  textplot_wordcloud(
    mydfm,
    min.freq = 30,
    max.words = 100,
    random.order = FALSE,
    rot.per = .25,
    colors = cloud_colours
  )
}

library(ggplot2)
#' Fit an LDA topic model to a document-feature matrix.
#'
#' @param mydfm A document-feature matrix that
#'   `convert(mydfm, to = "topicmodels")` can handle.
#' @param k Number of topics passed to `LDA()`. Defaults to 20, the value
#'   that used to be hard-coded.
#' @return The fitted model returned by `LDA()`.
createtopmod <- function(mydfm, k = 20) {
  # library() instead of `if (require(...))`: when topicmodels is not
  # installed the function now stops right here with an error, where it
  # used to return NULL silently and fail later in the caller.
  # The package is still attached (not merely loaded) because code further
  # down in this file calls terms() on the result without a namespace prefix.
  library(topicmodels)
  LDA(convert(mydfm, to = "topicmodels"), k = k)
}

Create array of corpus objects

Iterate over the corpora in the partitioning given by base, save as a list of lists of corpus objects.

Generate a list of lists, containing the document-feature matrices for the mx3 corpora and 1:3-order n-grams.

library(quanteda)
# Build one document-feature matrix per cluster (outer list, i = 1..2) and per
# n-gram order (inner list, j = 1..3), so that dfmarray_som[[i]][[j]] holds
# the j-grams of cluster i.
#
# The punctuation switch is spelled `remove_punct`. The previous spelling
# `removepunct` was not recognised ("Argument removepunct not used"), so
# punctuation was never removed from the features.
#
# NOTE(review): clist_som is not created in the visible part of this file;
# it is assumed to be the list of corpus objects, one per cluster - confirm.
dfmarray_som <- lapply(1:2, function(i) {
  lapply(1:3, function(j) {
    dfm(clist_som[[i]], ngrams = j, remove_punct = TRUE,
        remove = mystopwords)
  })
})

## Warning in tokens.character(x, ...): Argument removepunct not used.

## Warning in tokens.character(x, ...): Argument removepunct not used.

## Warning in tokens.character(x, ...): Argument removepunct not used.

## Warning in tokens.character(x, ...): Argument removepunct not used.

## Warning in tokens.character(x, ...): Argument removepunct not used.

## Warning in tokens.character(x, ...): Argument removepunct not used.

Presentation of Exploratory Data Analysis

For each cluster, the top 25 n-grams up to order 3 will be displayed. Next, the 3-grams will be presented as wordclouds. Finally, the first 5 words of each topic of a Latent Dirichlet Allocation (LDA) topic model are presented for each cluster.

Top 25 N-grams of cluster 1

require(ggplot2)
# Cluster 1: bar chart of the most frequent 1-grams.
plotdfm(dfmarray_som[[1]][[1]],1)

plot of chunk unnamed-chunk-5

# Cluster 1: bar chart of the most frequent 2-grams.
plotdfm(dfmarray_som[[1]][[2]],2)

plot of chunk unnamed-chunk-5

# Cluster 1: bar chart of the most frequent 3-grams.
plotdfm(dfmarray_som[[1]][[3]],3)

plot of chunk unnamed-chunk-5

# Cluster 1: word cloud built from the 3-gram matrix.
createwordcloud(dfmarray_som[[1]][[3]])

plot of chunk unnamed-chunk-5

# Cluster 1: fit the topic model on the 1-gram matrix.
topmod1 <- createtopmod(dfmarray_som[[1]][[1]])

## Loading required package: topicmodels

# First five terms of every topic, transposed so that each topic is one row.
t(as.data.frame(terms(topmod1,5)))

##          [,1] [,2]  [,3]  [,4] [,5]  
## Topic 1  ","  "ka"  "oo"  "ku" "iyo" 
## Topic 2  ","  "ka"  "oo"  "ku" "iyo" 
## Topic 3  ","  "ka"  "iyo" "oo" "ku"  
## Topic 4  "."  "ka"  "oo"  ","  "ee"  
## Topic 5  ","  "oo"  "ka"  "ay" "ku"  
## Topic 6  "oo" ","   "ay"  "-"  "iyo" 
## Topic 7  ","  "ay"  "oo"  "-"  "ah"  
## Topic 8  "ka" ","   "oo"  "ay" "iyo" 
## Topic 9  "oo" "ka"  ","   "ku" "iyo" 
## Topic 10 ","  "-"   "ka"  "ee" "oo"  
## Topic 11 "oo" "u"   "\""  "m"  ","   
## Topic 12 "."  "oo"  ","   "ku" "ka"  
## Topic 13 ","  "iyo" "oo"  "u"  "ku"  
## Topic 14 "."  "oo"  "ka"  "ay" ","   
## Topic 15 "oo" "ka"  ","   "ku" "ee"  
## Topic 16 ","  "oo"  "iyo" "ka" "ay"  
## Topic 17 ","  "ka"  "uu"  "ku" "oo"  
## Topic 18 ","  "oo"  "ay"  "ku" "ka"  
## Topic 19 ","  "\""  "oo"  "ku" "ka"  
## Topic 20 ","  "oo"  "ee"  "ka" "ayaa"

Top 25 N-grams of cluster 2

# Cluster 2: bar chart of the most frequent 1-grams.
plotdfm(dfmarray_som[[2]][[1]],1)

plot of chunk unnamed-chunk-6

# Cluster 2: bar chart of the most frequent 2-grams.
plotdfm(dfmarray_som[[2]][[2]],2)

plot of chunk unnamed-chunk-6

# Cluster 2: bar chart of the most frequent 3-grams.
plotdfm(dfmarray_som[[2]][[3]],3)

plot of chunk unnamed-chunk-6

# Cluster 2: word cloud built from the 3-gram matrix. This call used the
# 1-gram matrix, unlike the cluster 1 section and the introduction, which
# both present the 3-grams as word clouds.
createwordcloud(dfmarray_som[[2]][[3]])

plot of chunk unnamed-chunk-6

# Cluster 2: fit the topic model on the 1-gram matrix, as is done for
# cluster 1 (this call used the 3-gram matrix), then list the first five
# terms of every topic with one topic per row.
topmod2 <- createtopmod(dfmarray_som[[2]][[1]])
t(as.data.frame(terms(topmod2, 5)))

##          [,1]                    [,2]                 [,3]             
## Topic 1  "._._."                 "?_?_?"              "ka_mid_ah"      
## Topic 2  "reer_binu_israa'iil"   ",_oo_waxay"         "ku_yidhi_,"     
## Topic 3  "._._."                 "ka_mid_ah"          "more_._."       
## Topic 4  "._._."                 "(_hol_)"            "hol_)_-"        
## Topic 5  "._._."                 "._._("              ",_waayo_,"      
## Topic 6  "._._."                 "ka_mid_ah"          "ayaa_sheegay_in"
## Topic 7  "._._."                 "read_more_…"        "ka_mid_ah"      
## Topic 8  "._._."                 "ka_mid_ah"          "ayaa_sheegay_in"
## Topic 9  "._._."                 "ayaa_sheegay_in"    "ka_mid_ah"      
## Topic 10 "._._."                 "ka_mid_ah"          "]_-_waa"        
## Topic 11 "xeer_madaxweynaha_jds" "madaxweynaha_jds_l" "jds_l_."        
## Topic 12 "ka_mid_ah"             "ayaa_sheegay_in"    ",_2015_("       
## Topic 13 "ka_mid_ah"             ",_2015_("           "2015_(_hol"     
## Topic 14 "._._."                 "ka_mid_ah"          "oo_ka_mid"      
## Topic 15 "ruuxa_quduuska_ah"     "al_-_shabaab"       "saddex_-_m"     
## Topic 16 "._._."                 ",_2015_("           "2015_(_hol"     
## Topic 17 "._._."                 "more_._."           "[_bmc_]"        
## Topic 18 "._._."                 "ka_mid_ah"          "wuxuu_yiri_:"   
## Topic 19 "ka_mid_ah"             ",_waayo_,"          "aadan_iyo_xaawa"
## Topic 20 "._._."                 "._._madaxweynaha"   "]_-_magacaygu"  
##          [,4]                  [,5]                  
## Topic 1  "!_!_!"               "ku_meel_gaarka"      
## Topic 2  ",_oo_wuxuu"          "wuxuu_ku_yidhi"      
## Topic 3  "[_…_]"               "-_(_warsoor"         
## Topic 4  ",_2015_("            "2015_(_hol"          
## Topic 5  "ka_mid_ah"           "jan_2016_-"          
## Topic 6  ",_2015_("            "[_…_]"               
## Topic 7  "._._admin"           "yiri_:_\""           
## Topic 8  "[_warsaxaafadeed_]"  "ee_qaramada_midoobay"
## Topic 9  "19_,_2015"           ",_2015_("            
## Topic 10 "[_…_]"               "(_scw_)"             
## Topic 11 "*_xeer_madaxweynaha" "ka_mid_ah"           
## Topic 12 "oo_ay_ka"            "ay_ka_mid"           
## Topic 13 "(_hol_)"             "hol_)_-"             
## Topic 14 "!_!_!"               "ayaa_sheegay_in"     
## Topic 15 "ka_mid_ah"           "banii_-_aadanka"     
## Topic 16 "(_hol_)"             "hol_)_-"             
## Topic 17 "bmc_]_-"             "._._daawo"           
## Topic 18 "(_bnn_)"             "2016_(_bnn"          
## Topic 19 "-_aad_ee"            "ayaa_sheegay_in"     
## Topic 20 "-_magacaygu_waa"     "(_sh_m"