library(tm)
## Loading required package: NLP
# Download the text line by line and wrap it in a tm corpus in which every
# line of the file (blank lines included) becomes its own document.
quran <- Corpus(VectorSource(readLines(
  "http://mantascode.com/wp-content/uploads/2011/02/Koran.txt",
  encoding = "UTF-8"
)))
# Show the first five documents as a sanity check on the download.
inspect(quran[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] The Opening                                              
## [2]                                                          
## [3] In the name of Allah, the Beneficent, the Merciful.      
## [4] [1.1] All praise is due to Allah, the Lord of the Worlds.
## [5] [1.2] The Beneficent, the Merciful.
# Lower-case every document so that term counts are case-insensitive.
# tolower() is a plain base-R function, not a tm transformation, so it is
# wrapped in content_transformer() as the tm documentation prescribes. The
# bare form only works because Corpus() returned a SimpleCorpus here; on a
# list-based corpus it would replace the documents with plain strings.
quran <- tm_map(quran, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(quran, tolower): transformation drops
## documents
# Preview the first five documents after lower-casing.
inspect(quran[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] the opening                                              
## [2]                                                          
## [3] in the name of allah, the beneficent, the merciful.      
## [4] [1.1] all praise is due to allah, the lord of the worlds.
## [5] [1.2] the beneficent, the merciful.
# Strip punctuation from every document. Bracketed verse markers such as
# "[1.1]" are reduced to their bare digits by this step.
quran <- tm_map(quran, FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(quran, removePunctuation): transformation
## drops documents
# Preview the first five documents after punctuation removal.
inspect(quran[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] the opening                                         
## [2]                                                     
## [3] in the name of allah the beneficent the merciful    
## [4] 11 all praise is due to allah the lord of the worlds
## [5] 12 the beneficent the merciful
# Remove all digits, which clears what is left of the verse markers.
quran <- tm_map(quran, FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(quran, removeNumbers): transformation drops
## documents
# Preview the first five documents after digit removal.
inspect(quran[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] the opening                                       
## [2]                                                   
## [3] in the name of allah the beneficent the merciful  
## [4]  all praise is due to allah the lord of the worlds
## [5]  the beneficent the merciful
# Drop common English stop words (tm's built-in "en" list) from every
# document; the removed words leave extra whitespace behind.
quran <- tm_map(quran, FUN = removeWords, words = stopwords(kind = "en"))
## Warning in tm_map.SimpleCorpus(quran, removeWords, stopwords("en")):
## transformation drops documents
# Preview the first five documents after stop-word removal.
inspect(quran[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1]  opening                                                               
## [3]   name  allah  beneficent  merciful   praise  due  allah  lord   worlds
## [5]   beneficent  merciful
# Count term occurrences per document. Despite the name `dtm`, this is a
# term-document matrix: terms are the rows and documents are the columns.
dtm <- TermDocumentMatrix(quran)

# Expand it into an ordinary dense matrix for use with base matrix functions.
m <- as.matrix(dtm)
# Print the top-left 10 x 10 corner as a quick look at the counts.
m[seq_len(10), seq_len(10)]
##             Docs
## Terms        1 2 3 4 5 6 7 8 9 10
##   opening    1 0 0 0 0 0 0 0 0  0
##   allah      0 0 1 1 0 0 0 0 0  0
##   beneficent 0 0 1 0 1 0 0 0 0  0
##   merciful   0 0 1 0 1 0 0 0 0  0
##   name       0 0 1 0 0 0 0 0 0  0
##   due        0 0 0 1 0 0 0 0 0  0
##   lord       0 0 0 1 0 0 0 0 0  0
##   praise     0 0 0 1 0 0 0 0 0  0
##   worlds     0 0 0 1 0 0 0 0 0  0
##   day        0 0 0 0 0 1 0 0 0  0
# Total occurrences of each term across all documents, most frequent first.
v <- rowSums(m)
v <- sort(v, decreasing = TRUE)

# Frequency table with one row per term: the term itself and its total count.
d <- data.frame(word = names(v), freq = v)
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of at most 200 terms, placed in decreasing order of frequency
# (random.order = FALSE), with 35% of the words rotated and coloured from the
# 8-colour "Dark2" palette.
# wordcloud() uses random numbers when laying out the words, so the seed is
# fixed first; without it the figure changes on every run.
set.seed(1234)
wordcloud(
  words = d$word, freq = d$freq, min.freq = 1,
  max.words = 200, random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# List every term that occurs at least 100 times across the whole corpus.
findFreqTerms(x = dtm, lowfreq = 100, highfreq = Inf)
##   [1] "allah"          "beneficent"     "merciful"       "name"          
##   [5] "lord"           "day"            "serve"          "right"         
##   [9] "brought"        "thou"           "upon"           "book"          
##  [13] "evil"           "guard"          "believe"        "given"         
##  [17] "hereafter"      "revealed"       "shall"          "disbelieve"    
##  [21] "surely"         "will"           "great"          "hearts"        
##  [25] "punishment"     "believers"      "people"         "say"           
##  [29] "chastisement"   "land"           "make"           "said"          
##  [33] "know"           "back"           "bring"          "away"          
##  [37] "fire"           "like"           "one"            "see"           
##  [41] "turn"           "fear"           "unbelievers"    "certainly"     
##  [45] "things"         "created"        "may"            "men"           
##  [49] "earth"          "forth"          "made"           "therefore"     
##  [53] "besides"        "call"           "good"           "except"        
##  [57] "truth"          "gave"           "life"           "heavens"       
##  [61] "knows"          "knowledge"      "unjust"         "others"        
##  [65] "mercy"          "come"           "whoever"        "communications"
##  [69] "take"           "another"        "women"          "god"           
##  [73] "musa"           "give"           "best"           "sent"          
##  [77] "enter"          "allahs"         "ask"            "better"        
##  [81] "reward"         "among"          "came"           "man"           
##  [85] "thus"           "indeed"         "every"          "apostle"       
##  [89] "apostles"       "clear"          "nay"            "pleases"       
##  [93] "servants"       "way"            "two"            "mighty"        
##  [97] "follow"         "witness"        "night"          "therein"       
## [101] "let"            "lie"
# Terms whose per-document counts correlate with those of "allah" at 0.1 or
# above.
findAssocs(x = dtm, terms = "allah", corlimit = 0.1)
## $allah
##    apostle    careful       duty   merciful     surely        say 
##       0.22       0.19       0.18       0.17       0.17       0.16 
##      knows  forgiving    believe    knowing     things    besides 
##       0.16       0.16       0.15       0.15       0.14       0.14 
##    whoever    pleases       wise   whatever       name       know 
##       0.14       0.14       0.13       0.13       0.12       0.12 
##        way     mighty      guide       will  believers        men 
##       0.12       0.12       0.11       0.11       0.11       0.11 
## mercifully     allahs        ask     limits      grace      aware 
##       0.11       0.11       0.11       0.11       0.11       0.11 
##     latter       upon     hearts     others      among    witness 
##       0.11       0.10       0.10       0.10       0.10       0.10 
## sufficient  twothirds 
##       0.10       0.10
# Bar chart of the ten most frequent terms. `d` is sorted by decreasing
# frequency, so its first rows are the top terms. head() is used instead of
# d[1:10, ] so that a table with fewer than ten rows is plotted as-is rather
# than being padded with NA rows.
top_terms <- head(d, 10)
barplot(
  top_terms$freq,
  las = 2, # draw the axis labels perpendicular to the axis
  names.arg = top_terms$word,
  col = "lightblue",
  main = "Most frequent words",
  ylab = "Word frequencies"
)