library(tm)

# Location of the plain-text lyrics file that is analysed below.
TEXTFILE <- "/home/ricardo/Documents/recerca/songs/BlackMetal_inmortal.text"

# Read the file once; each element of `inmortal` is one line of the file
# (blank lines are kept as empty strings, as the previews below show).
inmortal <- readLines(TEXTFILE)

# Number of lines read.
length(inmortal)
## [1] 1996

# First and last few lines, as a sanity check that the file loaded correctly.
head(inmortal)
## [1] "Storming the borders of chaos"        
## [2] "For the cause of a battle once strong"
## [3] "Faster than the wind we rode"         
## [4] "To where our blackened hordes dawned" 
## [5] ""                                     
## [6] "Armed in the fires of combat"
tail(inmortal)
## [1] "Rule the power of the Vrilla"       
## [2] "Above the storms on Baeskades"      
## [3] "As the holocaust sky draw near"     
## [4] "The mighty sound of damnation calls"
## [5] ""                                   
## [6] "Pure holocaust"
# Turn the character vector into a tm source and build a corpus from it.
# Every element of `inmortal` (one line of the file) becomes one document,
# which is why the corpus size below equals the number of lines read.
vec <- VectorSource(x = inmortal)
corpus <- Corpus(x = vec)

# Corpus-level overview.
summary(object = corpus)
## A corpus with 1996 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID

# Show the first seven documents.
inspect(corpus[seq_len(7L)])
## A corpus with 7 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## Storming the borders of chaos
## 
## [[2]]
## For the cause of a battle once strong
## 
## [[3]]
## Faster than the wind we rode
## 
## [[4]]
## To where our blackened hordes dawned
## 
## [[5]]
## 
## 
## [[6]]
## Armed in the fires of combat
## 
## [[7]]
## The end will come fast on this day
# Normalise the raw text in place: lower-case, then drop punctuation and
# digits.
#
# base::tolower is not a tm transformation. From tm 0.6 onwards such functions
# must be wrapped in content_transformer(), otherwise the documents can lose
# their TextDocument class and later steps (e.g. TermDocumentMatrix) fail.
# Older tm releases do not ship the wrapper, so it is used only when present.
corpus <- tm_map(
  corpus,
  if (exists("content_transformer", mode = "function")) {
    content_transformer(tolower)
  } else {
    tolower
  }
)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# `doc.corpus` is the normalised corpus with English stop words removed;
# `corpus` itself keeps its stop words.
doc.corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Spot-check the first two filtered documents.
inspect(doc.corpus[1:2])
## A corpus with 2 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## storming  borders  chaos
## 
## [[2]]
##   cause   battle  strong
library(SnowballC)

# Continue the pipeline on `doc.corpus` (the stop-word-filtered corpus).
# These two steps used to run on `corpus` and then overwrite `doc.corpus`,
# which silently threw away the removeWords() result, so stop words such as
# "the" and "of" ended up in the term-document matrix.
doc.corpus <- tm_map(doc.corpus, stemDocument)
doc.corpus <- tm_map(doc.corpus, stripWhitespace)

# `corpus` is still stemmed, exactly as before, in case later code reads it.
corpus <- tm_map(corpus, stemDocument)

# NOTE(review): the `##` transcript following this call, and the matrix
# summary further down, were captured before this fix and still show stop
# words; re-run the script to refresh them.
inspect(doc.corpus[1:8])
## A corpus with 8 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## storm the border of chao
## 
## [[2]]
## for the caus of a battl onc strong
## 
## [[3]]
## faster than the wind we rode
## 
## [[4]]
## to where our blacken hord dawn
## 
## [[5]]
## 
## 
## [[6]]
## arm in the fire of combat
## 
## [[7]]
## the end will come fast on this day
## 
## [[8]]
## stronger than the god we fought
# Term-document matrix of the processed corpus: one row per term, one column
# per document, with the default term-frequency weighting.
TDM <- TermDocumentMatrix(x = doc.corpus)

# Auto-print the matrix summary (dimensions, sparsity, weighting).
TDM
## A term-document matrix (1149 terms, 1996 documents)
## 
## Non-/sparse entries: 6983/2286421
## Sparsity           : 100%
## Maximal term length: 14 
## Weighting          : term frequency (tf)