Topic Modelling of English Quran Translation

Load libraries

In this mini project I will use the quRan, quanteda.corpora, seededlda, quanteda, and lubridate packages. These packages are helpful for text mining tasks.

library(quRan)
library(quanteda.corpora)
library(seededlda)
library(quanteda)
library(lubridate)

Select Quran Data

I select Yusuf Ali’s Quran translation

# Start from the Yusuf Ali translation and keep only the text column together
# with the surah, juz and revelation-type columns.
df <- quran_en_yusufali[
  c("text", "surah_title_en", "surah", "juz", "revelation_type")
]

Building Corpus

# Build a quanteda corpus: the "text" column supplies the documents and the
# remaining columns are carried along as document variables.
corpus_quran_en <- corpus(x = df, text_field = "text")
head(corpus_quran_en, n = 6)
Corpus consisting of 6 documents and 4 docvars.
text1 :
"In the name of Allah, Most Gracious, Most Merciful."

text2 :
"Praise be to Allah, the Cherisher and Sustainer of the world..."

text3 :
"Most Gracious, Most Merciful;"

text4 :
"Master of the Day of Judgment."

text5 :
"Thee do we worship, and Thine aid we seek."

text6 :
"Show us the straight way,"

Cleaning and Tokenizing

Remove Symbols and Punctuations

# Tokenise the corpus, dropping punctuation, numbers and symbols.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
tokens_quran <- tokens(
  corpus_quran_en,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
head(tokens_quran)
Tokens consisting of 6 documents and 4 docvars.
text1 :
[1] "In"       "the"      "name"     "of"       "Allah"    "Most"     "Gracious" "Most"     "Merciful"

text2 :
 [1] "Praise"    "be"        "to"        "Allah"     "the"       "Cherisher" "and"       "Sustainer" "of"        "the"       "worlds"   

text3 :
[1] "Most"     "Gracious" "Most"     "Merciful"

text4 :
[1] "Master"   "of"       "the"      "Day"      "of"       "Judgment"

text5 :
[1] "Thee"    "do"      "we"      "worship" "and"     "Thine"   "aid"     "we"      "seek"   

text6 :
[1] "Show"     "us"       "the"      "straight" "way"     

Remove Stopwords

I will remove stopwords, because they are not useful for analysis

# Drop English stopwords from every document.
tokens_quran <- tokens_select(
  tokens_quran,
  pattern = stopwords("en"),
  selection = "remove"
)
head(tokens_quran)
Tokens consisting of 6 documents and 4 docvars.
text1 :
[1] "name"     "Allah"    "Gracious" "Merciful"

text2 :
[1] "Praise"    "Allah"     "Cherisher" "Sustainer" "worlds"   

text3 :
[1] "Gracious" "Merciful"

text4 :
[1] "Master"   "Day"      "Judgment"

text5 :
[1] "Thee"    "worship" "Thine"   "aid"     "seek"   

text6 :
[1] "Show"     "us"       "straight" "way"     

Case Transformation

Any uppercase will be transformed into lowercase

# Lower-case every token; keep_acronyms = FALSE means all-caps tokens are
# lower-cased as well. FALSE is spelled out because F can be reassigned.
tokens_quran <- tokens_tolower(tokens_quran, keep_acronyms = FALSE)
head(tokens_quran)
Tokens consisting of 6 documents and 4 docvars.
text1 :
[1] "name"     "allah"    "gracious" "merciful"

text2 :
[1] "praise"    "allah"     "cherisher" "sustainer" "worlds"   

text3 :
[1] "gracious" "merciful"

text4 :
[1] "master"   "day"      "judgment"

text5 :
[1] "thee"    "worship" "thine"   "aid"     "seek"   

text6 :
[1] "show"     "us"       "straight" "way"     

Document Feature Matrix (DFM)

A document-feature matrix will be constructed as the input for LDA.

# Build the document-feature matrix and trim it in one step:
#   - keep features whose total frequency is at or above the 0.8 quantile
#   - drop features that occur in more than 10% of the documents
# Written as a plain nested call so the chunk does not rely on `%>%`, which is
# only available when an attached package happens to export it.
dfm_quran <- dfm_trim(
  dfm(tokens_quran),
  min_termfreq = 0.8, termfreq_type = "quantile",
  max_docfreq = 0.1, docfreq_type = "prop"
)
head(dfm_quran)
Document-feature matrix of: 6 documents, 1,304 features (99.76% sparse) and 4 docvars.
       features
docs    name gracious merciful praise cherisher worlds day judgment thee worship
  text1    1        1        1      0         0      0   0        0    0       0
  text2    0        0        0      1         1      1   0        0    0       0
  text3    0        1        1      0         0      0   0        0    0       0
  text4    0        0        0      0         0      0   1        1    0       0
  text5    0        0        0      0         0      0   0        0    1       1
  text6    0        0        0      0         0      0   0        0    0       0
[ reached max_nfeat ... 1,294 more features ]

Topic Modelling

I choose only ten topics for this topic model.

# Fit an unseeded LDA model with ten topics.
# The sampler is stochastic, so fix the RNG seed first to make the fitted
# topics reproducible between runs.
set.seed(1234)
text_model_lda <- textmodel_lda(dfm_quran, k = 10)
# Show the ten highest-ranked terms of each topic.
terms(text_model_lda, 10)
      topic1    topic2      topic3       topic4     topic5     topic6    topic7      topic8 topic9      topic10
 [1,] "earth"   "believe"   "shall"      "day"      "people"   "way"     "know"      "thou" "sent"      "men"  
 [2,] "heavens" "good"      "penalty"    "shall"    "o"        "call"    "things"    "thee" "signs"     "one"  
 [3,] "things"  "reward"    "fire"       "judgment" "moses"    "one"     "full"      "thy"  "thee"      "man"  
 [4,] "created" "give"      "therein"    "every"    "behold"   "follow"  "knowledge" "us"   "book"      "let"  
 [5,] "made"    "deeds"     "evil"       "soul"     "pharaoh"  "can"     "verily"    "turn" "hath"      "two"  
 [6,] "verily"  "shall"     "reject"     "can"      "among"    "even"    "hearts"    "away" "people"    "make" 
 [7,] "night"   "life"      "faith"      "us"       "gave"     "besides" "hath"      "see"  "truth"     "women"
 [8,] "see"     "hereafter" "grievous"   "one"      "came"     "worship" "well"      "art"  "clear"     "among"
 [9,] "power"   "evil"      "hell"       "together" "indeed"   "upon"    "merciful"  "o"    "unto"      "fear" 
[10,] "forth"   "messenger" "companions" "come"     "remember" "men"     "thy"       "put"  "messenger" "o"    

Topic Modelling: Seeded LDA

With Seeded LDA we can choose our own topics. Let’s say we only want topics about war, politics, economy, law, and eschatology, so we create a small dictionary that contains related keywords for each topic.

# Read the seed-word dictionary (one key per topic) from the YAML file.
dictionary_lda <- quanteda::dictionary(file = "topics.yml")
Warning in readLines(con, warn = readLines.warn) :
  incomplete final line found on 'topics.yml'
# Print the dictionary to inspect its keys and their seed patterns.
dictionary_lda
Dictionary object with 5 key entries.
- [war]:
  - war, sword, kill*, conflict, battle, fight*, attack*, crusade
- [politics]:
  - parliament*, congress*, white house, party leader*, party member*, voter*, lawmaker*, politician*
- [economy]:
  - money, trade*, market, wage, wealth
- [law]:
  - rule, justice, court, regulation, concept, legal, right*
- [eschatology]:
  - afterlife, destiny, hell, heaven, end, punish*, fire

So let’s see the result

# Fit a seeded LDA model whose topics are defined by the dictionary keys.
# The sampler is stochastic, so fix the RNG seed first to make the fitted
# topics reproducible between runs.
set.seed(1234)
text_model_slda <- textmodel_seededlda(dfm_quran, dictionary = dictionary_lda)
# Show the ten highest-ranked terms of each seeded topic.
terms(text_model_slda, 10)
      war        politics economy   law             eschatology 
 [1,] "fight"    "may"    "wealth"  "thou"          "fire"      
 [2,] "war"      "men"    "people"  "right"         "punishment"
 [3,] "fighting" "made"   "sent"    "righteous"     "hell"      
 [4,] "earth"    "one"    "thee"    "righteousness" "end"       
 [5,] "things"   "fear"   "signs"   "justice"       "heaven"    
 [6,] "heavens"  "two"    "book"    "thy"           "punish"    
 [7,] "power"    "good"   "o"       "thee"          "punished"  
 [8,] "verily"   "night"  "among"   "us"            "shall"     
 [9,] "created"  "let"    "moses"   "believe"       "day"       
[10,] "know"     "women"  "believe" "o"             "penalty"   
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6IGRlZmF1bHQNCiAgaHRtbF9kb2N1bWVudDoNCiAgICBkZl9wcmludDogcGFnZWQNCiAgd29yZF9kb2N1bWVudDogZGVmYXVsdA0KLS0tDQoNCiMgVG9waWMgTW9kZWxsaW5nIG9mIEVuZ2xpc2ggUXVyYW4gVHJhbnNsYXRpb24NCg0KIyMgTG9hZCBsaWJyYXJpZXMNCg0KaW4gdGhpcyBtaW5pIHByb2plY3QgSSB3aWxsIHVzZSBxdVJhbiwgcXVhbnRlZGEuY29ycG9yYSwgc2VlZGVkbGRhLCBxdWFudGVkYSwgYW5kIGx1YnJpZGF0ZSBwYWNrYWdlcy4gVGhvc2UgcGFja2FnZXMgYXJlIGhlbHBmdWwgZm9yIHRleHQgbWluaW5nIHRhc2tzDQoNCmBgYHtyfQ0KbGlicmFyeShxdVJhbikNCmxpYnJhcnkocXVhbnRlZGEuY29ycG9yYSkNCmxpYnJhcnkoc2VlZGVkbGRhKQ0KbGlicmFyeShxdWFudGVkYSkNCmxpYnJhcnkobHVicmlkYXRlKQ0KYGBgDQoNCiMjIFNlbGVjdCBRdXJhbiBEYXRhDQoNCkkgc2VsZWN0IFl1c3VmIEFsaSdzIFF1cmFuIHRyYW5zbGF0aW9uDQoNCmBgYHtyfQ0KZGYgPC0gcXVyYW5fZW5feXVzdWZhbGkNCmRmIDwtIHN1YnNldChkZiwgc2VsZWN0ID0gYyh0ZXh0LCBzdXJhaF90aXRsZV9lbiwgc3VyYWgsIGp1eiwgcmV2ZWxhdGlvbl90eXBlKSkNCmBgYA0KDQojIyBCdWlsZGluZyBDb3JwdXMNCg0KYGBge3J9DQpjb3JwdXNfcXVyYW5fZW4gPC0gY29ycHVzKGRmLCB0ZXh0X2ZpZWxkID0gInRleHQiKQ0KaGVhZChjb3JwdXNfcXVyYW5fZW4pDQpgYGANCg0KIyMgQ2xlYW5pbmcgYW5kIFRva2VuaXppbmcNCg0KIyMjIFJlbW92ZSBTeW1ib2xzIGFuZCBQdW5jdHVhdGlvbnMNCg0KYGBge3J9DQp0b2tlbnNfcXVyYW4gPC0gdG9rZW5zKGNvcnB1c19xdXJhbl9lbiwgcmVtb3ZlX3B1bmN0ID0gVCwNCiAgICAgICAgICAgICAgICAgICAgICAgcmVtb3ZlX251bWJlcnMgPSBULCByZW1vdmVfc3ltYm9scyA9IFQpDQpoZWFkKHRva2Vuc19xdXJhbikNCmBgYA0KDQojIyMgUmVtb3ZlIFN0b3B3b3Jkcw0KDQpJIHdpbGwgcmVtb3ZlIHN0b3B3b3JkcywgYmVjYXVzZSB0aGV5IGFyZSBub3QgdXNlZnVsIGZvciBhbmFseXNpcw0KDQpgYGB7cn0NCnRva2Vuc19xdXJhbiA8LSB0b2tlbnNfcmVtb3ZlKHRva2Vuc19xdXJhbiwgcGF0dGVybiA9IGMoc3RvcHdvcmRzKCJlbiIpKSkNCmhlYWQodG9rZW5zX3F1cmFuKQ0KYGBgDQoNCiMjIyBDYXNlIFRyYW5zZm9ybWF0aW9uDQoNCkFueSB1cHBlcmNhc2Ugd2lsbCBiZSB0cmFuc2Zvcm1lZCBpbnRvIGxvd2VyY2FzZQ0KDQpgYGB7cn0NCnRva2Vuc19xdXJhbiA8LSB0b2tlbnNfdG9sb3dlcih0b2tlbnNfcXVyYW4sIGtlZXBfYWNyb255bXMgPSBGKQ0KaGVhZCh0b2tlbnNfcXVyYW4pDQpgYGANCg0KIyMgRG9jdW1lbnQgRmVhdHVyZSBNYXRyaXggKERGTSkNCg0KZG9jdW1lbnQgZmVhdHVyZSBtYXRyaXggd2lsbCBiZSBjb25zdHJ1Y3RlZCBmb3IgTERBDQoNCmBgYHtyfQ0KZGZtX3F1cmFuIDwtIGRmbSh0
b2tlbnNfcXVyYW4pICU+JQ0KICBkZm1fdHJpbShtaW5fdGVybWZyZXEgPSAwLjgsIHRlcm1mcmVxX3R5cGUgPSAicXVhbnRpbGUiLA0KICAgICAgICAgICBtYXhfZG9jZnJlcSA9IDAuMSwgZG9jZnJlcV90eXBlID0gInByb3AiKQ0KaGVhZChkZm1fcXVyYW4pDQpgYGANCg0KIyMgVG9waWMgTW9kZWxsaW5nDQoNCkkgb25seSBjaG9vc2UgdGVuIHRvcGljcyBpbiB0aGlzIHRvcGljIG1vZGVsbGluZw0KDQpgYGB7cn0NCiMgc3RhcnQgdGV4dCBtb2RlbGxpbmcNCnRleHRfbW9kZWxfbGRhIDwtIHRleHRtb2RlbF9sZGEoZGZtX3F1cmFuLCBrID0gMTApDQp0ZXJtcyh0ZXh0X21vZGVsX2xkYSwgMTApDQpgYGANCg0KIyMgVG9waWMgTW9kZWxsaW5nOiBTZWVkZWQgTERBDQoNCldpdGggU2VlZGVkIExEQSB3ZSBjYW4gY2hvb3NlIG91ciBvd24gdG9waWMsIGxldCdzIHNheSB3ZSB3YW50IHRvIG9ubHkgd2FudCB0b3BpY3MgYWJvdXQgd2FyLCBwb2xpdGljcywgYW5kIGVjb25vbXkuIFNvIHdlIGNyZWF0ZSB0aGUgc21hbGwgZGljdGlvbmFyeSBjb250YWlucyByZWxhdGVkIGtleXdvcmRzLg0KDQpgYGB7cn0NCmRpY3Rpb25hcnlfbGRhIDwtIGRpY3Rpb25hcnkoZmlsZSA9ICJ0b3BpY3MueW1sIikNCmRpY3Rpb25hcnlfbGRhDQpgYGANCg0KU28gbGV0J3Mgc2VlIHRoZSByZXN1bHQNCg0KYGBge3J9DQp0ZXh0X21vZGVsX3NsZGEgPC0gdGV4dG1vZGVsX3NlZWRlZGxkYShkZm1fcXVyYW4sIGRpY3Rpb25hcnkgPSBkaWN0aW9uYXJ5X2xkYSkNCnRlcm1zKHRleHRfbW9kZWxfc2xkYSwgMTApDQpgYGANCg==