library(tm)
## Loading required package: NLP
# Download the plain-text source (one character element per line) and turn
# it into a tm corpus in which every line is its own document.
quran <- Corpus(
  VectorSource(
    readLines(
      "http://mantascode.com/wp-content/uploads/2011/02/Koran.txt",
      encoding = "UTF-8"
    )
  )
)
# Look at the first five documents to confirm the text loaded as expected.
inspect(quran[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] The Opening
## [2]
## [3] In the name of Allah, the Beneficent, the Merciful.
## [4] [1.1] All praise is due to Allah, the Lord of the Worlds.
## [5] [1.2] The Beneficent, the Merciful.
# Normalise the corpus text one step at a time, inspecting the first five
# documents after each step. The order matters: punctuation is stripped
# before digits, so verse markers such as "[1.1]" first collapse to "11" and
# are then deleted by removeNumbers.
#
# NOTE(review): in the recorded run every tm_map() call below emitted a
# "transformation drops documents" warning from tm_map.SimpleCorpus() --
# confirm whether the installed tm version still does so.

# Step 1: lower-case everything. tolower() is a base function rather than a
# tm transformation, so it is wrapped in content_transformer(); this keeps
# the step valid if the corpus is a VCorpus instead of a SimpleCorpus.
quran <- tm_map(quran, content_transformer(tolower))
inspect(quran[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] the opening
## [2]
## [3] in the name of allah, the beneficent, the merciful.
## [4] [1.1] all praise is due to allah, the lord of the worlds.
## [5] [1.2] the beneficent, the merciful.

# Step 2: drop punctuation (this also removes the brackets and dots of the
# verse markers, leaving only their digits).
quran <- tm_map(quran, removePunctuation)
inspect(quran[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] the opening
## [2]
## [3] in the name of allah the beneficent the merciful
## [4] 11 all praise is due to allah the lord of the worlds
## [5] 12 the beneficent the merciful

# Step 3: drop the digits left over from the verse markers.
quran <- tm_map(quran, removeNumbers)
inspect(quran[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] the opening
## [2]
## [3] in the name of allah the beneficent the merciful
## [4] all praise is due to allah the lord of the worlds
## [5] the beneficent the merciful

# Step 4: drop English stop words.
# NOTE(review): punctuation is already gone at this point, so any stop word
# that is spelled with an apostrophe would no longer match its stripped form
# in the text -- verify against stopwords("en") if that matters.
quran <- tm_map(quran, removeWords, stopwords("en"))
inspect(quran[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] opening
## [3] name allah beneficent merciful praise due allah lord worlds
## [5] beneficent merciful
# Count how often each term occurs in each document. Despite its name, `dtm`
# holds a term-document matrix: terms in rows, documents in columns.
dtm <- TermDocumentMatrix(quran)
# Dense copy so that ordinary matrix indexing and rowSums() can be used.
m <- as.matrix(dtm)
# Top-left 10 x 10 corner as a quick sanity check.
m[seq_len(10), seq_len(10)]
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10
## opening 1 0 0 0 0 0 0 0 0 0
## allah 0 0 1 1 0 0 0 0 0 0
## beneficent 0 0 1 0 1 0 0 0 0 0
## merciful 0 0 1 0 1 0 0 0 0 0
## name 0 0 1 0 0 0 0 0 0 0
## due 0 0 0 1 0 0 0 0 0 0
## lord 0 0 0 1 0 0 0 0 0 0
## praise 0 0 0 1 0 0 0 0 0 0
## worlds 0 0 0 1 0 0 0 0 0 0
## day 0 0 0 0 0 1 0 0 0 0
# Total occurrences of every term across all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
# Same information as a word/frequency table, the shape the plots below use.
d <- data.frame(word = names(v), freq = v)
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of at most 200 terms (every term qualifies, since min.freq = 1).
# random.order = FALSE plots words in decreasing frequency; rot.per = 0.35
# draws about 35% of the words rotated by 90 degrees.
# Word placement is randomised, so the seed is fixed to make the figure
# reproducible from run to run.
set.seed(1234)
wordcloud(
  words = d$word, freq = d$freq, min.freq = 1,
  max.words = 200, random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Terms that occur at least 100 times in the corpus (no upper bound).
findFreqTerms(x = dtm, lowfreq = 100, highfreq = Inf)
## [1] "allah" "beneficent" "merciful" "name"
## [5] "lord" "day" "serve" "right"
## [9] "brought" "thou" "upon" "book"
## [13] "evil" "guard" "believe" "given"
## [17] "hereafter" "revealed" "shall" "disbelieve"
## [21] "surely" "will" "great" "hearts"
## [25] "punishment" "believers" "people" "say"
## [29] "chastisement" "land" "make" "said"
## [33] "know" "back" "bring" "away"
## [37] "fire" "like" "one" "see"
## [41] "turn" "fear" "unbelievers" "certainly"
## [45] "things" "created" "may" "men"
## [49] "earth" "forth" "made" "therefore"
## [53] "besides" "call" "good" "except"
## [57] "truth" "gave" "life" "heavens"
## [61] "knows" "knowledge" "unjust" "others"
## [65] "mercy" "come" "whoever" "communications"
## [69] "take" "another" "women" "god"
## [73] "musa" "give" "best" "sent"
## [77] "enter" "allahs" "ask" "better"
## [81] "reward" "among" "came" "man"
## [85] "thus" "indeed" "every" "apostle"
## [89] "apostles" "clear" "nay" "pleases"
## [93] "servants" "way" "two" "mighty"
## [97] "follow" "witness" "night" "therein"
## [101] "let" "lie"
# Terms whose correlation with "allah" across documents is at least 0.1.
findAssocs(x = dtm, terms = "allah", corlimit = 0.1)
## $allah
## apostle careful duty merciful surely say
## 0.22 0.19 0.18 0.17 0.17 0.16
## knows forgiving believe knowing things besides
## 0.16 0.16 0.15 0.15 0.14 0.14
## whoever pleases wise whatever name know
## 0.14 0.14 0.13 0.13 0.12 0.12
## way mighty guide will believers men
## 0.12 0.12 0.11 0.11 0.11 0.11
## mercifully allahs ask limits grace aware
## 0.11 0.11 0.11 0.11 0.11 0.11
## latter upon hearts others among witness
## 0.11 0.10 0.10 0.10 0.10 0.10
## sufficient twothirds
## 0.10 0.10
# Bar chart of the ten most frequent words. `d` is sorted by decreasing
# frequency, so its leading rows are the top terms. head() is used instead
# of d[1:10, ] so that a table with fewer than ten rows yields fewer bars
# rather than NA-filled ones.
top_words <- head(d, 10)
barplot(
  top_words$freq,
  las = 2, # draw axis labels perpendicular to the axes
  names.arg = top_words$word,
  col = "lightblue", main = "Most frequent words",
  ylab = "Word frequencies"
)
