library(tm)
# Path to the lyrics file; the recorded output below shows one lyric line
# per text line.
TEXTFILE <- "/home/ricardo/Documents/recerca/songs/BlackMetal_inmortal.text"
# Read the file once. Each element of `inmortal` is one line of the file;
# blank separator lines come through as empty strings (see element 5 below).
inmortal <- readLines(TEXTFILE)
# Number of lines read.
length(inmortal)
## [1] 1996
# First and last few lines, as a sanity check on what was loaded.
head(inmortal)
## [1] "Storming the borders of chaos"
## [2] "For the cause of a battle once strong"
## [3] "Faster than the wind we rode"
## [4] "To where our blackened hordes dawned"
## [5] ""
## [6] "Armed in the fires of combat"
tail(inmortal)
## [1] "Rule the power of the Vrilla"
## [2] "Above the storms on Baeskades"
## [3] "As the holocaust sky draw near"
## [4] "The mighty sound of damnation calls"
## [5] ""
## [6] "Pure holocaust"
# Wrap the character vector as a tm source: every element of `inmortal`
# (i.e. every line of the file) becomes its own document.
vec <- VectorSource(inmortal)
# Build the corpus from that source.
corpus <- Corpus(vec)
# Overview of the corpus and its metadata.
summary(corpus)
## A corpus with 1996 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
# Print the first seven documents to check they match the raw lines.
inspect(corpus[seq_len(7)])
## A corpus with 7 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## Storming the borders of chaos
##
## [[2]]
## For the cause of a battle once strong
##
## [[3]]
## Faster than the wind we rode
##
## [[4]]
## To where our blackened hordes dawned
##
## [[5]]
##
##
## [[6]]
## Armed in the fires of combat
##
## [[7]]
## The end will come fast on this day
# Normalise the text in place: lower-case it, then strip punctuation and
# digits. The transforms are applied left to right, each one to the result
# of the previous step.
# NOTE(review): newer tm releases document wrapping base functions such as
# tolower in content_transformer(); confirm against the installed version.
corpus <- Reduce(
  function(acc, step) tm_map(acc, step),
  list(tolower, removePunctuation, removeNumbers),
  corpus
)
# Remove English stop words into a separate object, so `corpus` itself
# still contains them.
doc.corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Spot-check the first two cleaned documents.
inspect(doc.corpus[seq_len(2)])
## A corpus with 2 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## storming borders chaos
##
## [[2]]
## cause battle strong
# Loaded here, immediately before stemming (presumably the stemmer backend
# for stemDocument -- confirm against the tm documentation).
library(SnowballC)
# Stem the stop-word-free corpus (`doc.corpus`). Stemming `corpus` here
# instead would silently discard the stop-word removal, because `doc.corpus`
# is overwritten on the next line.
corpus <- tm_map(doc.corpus, stemDocument)
# Collapse the runs of blanks left behind by the removed words.
doc.corpus <- tm_map(corpus, stripWhitespace)
inspect(doc.corpus[1:8])
## NOTE(review): the output below was recorded before the stop-word fix
## above, so it still shows stop words ("the", "of", ...). Re-run to refresh.
## A corpus with 8 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## storm the border of chao
##
## [[2]]
## for the caus of a battl onc strong
##
## [[3]]
## faster than the wind we rode
##
## [[4]]
## to where our blacken hord dawn
##
## [[5]]
##
##
## [[6]]
## arm in the fire of combat
##
## [[7]]
## the end will come fast on this day
##
## [[8]]
## stronger than the god we fought
# Term-document matrix: one row per term, one column per document (line).
TDM <- TermDocumentMatrix(doc.corpus)
TDM
## NOTE(review): these figures were also recorded before the stop-word fix;
## expect different term and entry counts after re-running.
## A term-document matrix (1149 terms, 1996 documents)
##
## Non-/sparse entries: 6983/2286421
## Sparsity : 100%
## Maximal term length: 14
## Weighting : term frequency (tf)