library(readr)
# Example text used by the rest of the script: a Portuguese message with
# accented characters, two emoji code points written as \U escapes, and
# two URLs.
exemplo <- "Hoje tem filezin de frango com pure de batata. Amanhã tem carne ensopada ou almondega de soja. Quarta linguiça e macarrão, e quinta coxa com sobrecoxa e FAROFA NATALINA. \U0001f385\U0001f3fbhttps://t.co/GhD1E1dzVm https://t.co/yKcdUj4v1Ā"

# Echo the raw text so it can be inspected before any cleaning.
exemplo
## [1] "Hoje tem filezin de frango com pure de batata. Amanhã tem carne ensopada ou almondega de soja. Quarta linguiça e macarrão, e quinta coxa com sobrecoxa e FAROFA NATALINA. <U+0001F385><U+0001F3FB>https://t.co/GhD1E1dzVm https://t.co/yKcdUj4v1A"
## [1] "Hoje tem filezin de frango com pure de batata. Amanhã tem carne ensopada ou almondega de soja. Quarta linguiça e macarrão, e quinta coxa com sobrecoxa e FAROFA NATALINA. <U+0001F385><U+0001F3FB>https://t.co/GhD1E1dzVm https://t.co/yKcdUj4v1<U+5FC2><U+1F09><U+B190><U+AFC6><U+A1B3><U+275F><U+C38B><U+AA94><U+E587><U+CC6A><U+271A><U+7112><U+E129><U+A2B9><U+D454><U+7D95><U+8D1A><U+154E><U+A7AE><ed><U+00AB><U+00A1><U+7DAC><U+F9A5><U+BC02><U+A6A9><U+52F0><U+245E><U+0A50><U+E577><U+EB7D><U+5946><U+D7D7><U+2B98><U+5744><U+B30A><U+61AC><U+9D9A><U+8FD1><U+99BC><U+AEB2><U+4A88><U+A25E><U+5538><U+77D6><U+2934><U+CF8C><U+4E4B><U+9749><U+5570><U+55B5><U+024D><U+1A2F><U+8B7D><U+E8D2><U+4872><U+0897><ed><U+00A0><U+0098><U+EE2C><U+2E70><U+FA20><U+A166><U+49AE><U+624D><U+9B61><U+AFD4><U+78EE><U+059A><U+BCCE><U+1881><U+5C78><U+698C><ed><U+00A3><U+00B2><U+3115><U+C6FF><U+4948><U+36C4><U+FD8F><U+1F29><U+313D><U+F16F><U+2AB8><U+F662><U+3712><U+B1F0><U+6CB0><U+5DEC><U+A268><U+3A93><U+8AEA><U+0BFF><U+79C1><U+F70B><U+3C7A><U+E40E><U+A01C><U+CE63><U+6EB4><U+26C9><U+EE57><U+6E77><U+8777><U+599D><U+9037><U+0FCB><U+CDE8><U+3598><U+D609><ed><U+00BB><U+0088><U+7A8F><U+86F4><U+7FCC><ed><U+00AF><U+00B4><U+2E98><U+4DE5><U+5BBB><U+6E9A><U+374C><U+C9F4><U+BA72><U+573C><U+1C59><U+C3BA><U+9AE3><U+CB88><U+FA38><U+5638><U+FAA4><U+447C><U+EB1D><U+A018><U+5609><U+17C7><U+F5AC><U+92E3><U+773A><U+FCC8><U+88A1><U+A2DC>n<U+CE76><ed><U+00B4><U+0097><U+5C56><U+C405><U+F5B6><U+5E82><U+D45D><U+9555><U+5E72><U+E6DC><U+37F5><U+F381><U+F00F><U+F814><U+2A12><U+102B><U+A848><U+41AD><U+B103><U+E12F><U+0560><U+129A><U+350E><U+2A68><U+5E38><U+E6C4><U+0310>"
# Strip diacritics from the example text: every accented letter listed in
# `old` is replaced by the character at the same position in `new`.
# Upper-case accented letters are mapped to lower-case ASCII letters.
exemplo2 <- chartr(
  old = "áéíóúÁÉÍÓÚýÝàèìòùÀÈÌÒÙâêîôûÂÊÎÔÛãõÃÕñÑäëïöüÄËÏÖÜÿçÇ",
  new = "aeiouaeiouyyaeiouaeiouaeiouaeiouaoaonnaeiouaeiouycc",
  x = exemplo
)

# Echo the accent-free version for inspection.
exemplo2
## [1] "Hoje tem filezin de frango com pure de batata. Amanha tem carne ensopada ou almondega de soja. Quarta linguica e macarrao, e quinta coxa com sobrecoxa e FAROFA NATALINA. <U+0001F385><U+0001F3FB>https://t.co/GhD1E1dzVm https://t.co/yKcdUj4v1<U+5FC2><U+1F09><U+B190><U+AFC6><U+A1B3><U+275F><U+C38B><U+AA94><U+E587><U+CC6A><U+271A><U+7112><U+E129><U+A2B9><U+D454><U+7D95><U+8D1A><U+154E><U+A7AE><ed><U+00AB><U+00A1><U+7DAC><U+F9A5><U+BC02><U+A6A9><U+52F0><U+245E><U+0A50><U+E577><U+EB7D><U+5946><U+D7D7><U+2B98><U+5744><U+B30A><U+61AC><U+9D9A><U+8FD1><U+99BC><U+AEB2><U+4A88><U+A25E><U+5538><U+77D6><U+2934><U+CF8C><U+4E4B><U+9749><U+5570><U+55B5><U+024D><U+1A2F><U+8B7D><U+E8D2><U+4872><U+0897><ed><U+00A0><U+0098><U+EE2C><U+2E70><U+FA20><U+A166><U+49AE><U+624D><U+9B61><U+AFD4><U+78EE><U+059A><U+BCCE><U+1881><U+5C78><U+698C><ed><U+00A3><U+00B2><U+3115><U+C6FF><U+4948><U+36C4><U+FD8F><U+1F29><U+313D><U+F16F><U+2AB8><U+F662><U+3712><U+B1F0><U+6CB0><U+5DEC><U+A268><U+3A93><U+8AEA><U+0BFF><U+79C1><U+F70B><U+3C7A><U+E40E><U+A01C><U+CE63><U+6EB4><U+26C9><U+EE57><U+6E77><U+8777><U+599D><U+9037><U+0FCB><U+CDE8><U+3598><U+D609><ed><U+00BB><U+0088><U+7A8F><U+86F4><U+7FCC><ed><U+00AF><U+00B4><U+2E98><U+4DE5><U+5BBB><U+6E9A><U+374C><U+C9F4><U+BA72><U+573C><U+1C59><U+C3BA><U+9AE3><U+CB88><U+FA38><U+5638><U+FAA4><U+447C><U+EB1D><U+A018><U+5609><U+17C7><U+F5AC><U+92E3><U+773A><U+FCC8><U+88A1><U+A2DC>n<U+CE76><ed><U+00B4><U+0097><U+5C56><U+C405><U+F5B6><U+5E82><U+D45D><U+9555><U+5E72><U+E6DC><U+37F5><U+F381><U+F00F><U+F814><U+2A12><U+102B><U+A848><U+41AD><U+B103><U+E12F><U+0560><U+129A><U+350E><U+2A68><U+5E38><U+E6C4><U+0310>"
## Loading required package: NLP
## Loading required package: stylo
##
## ### stylo version: 0.7.1 ###
##
## If you plan to cite this software (please do!), use the following reference:
## Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
## a package for computational text analysis. R Journal 8(1): 107-121.
## <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>
##
## To get full BibTeX entry, type: citation("stylo")
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'textmining'
## The following object is masked from 'package:tm':
##
## getMeta
## The following object is masked from 'package:stats':
##
## terms
## The following object is masked from 'package:base':
##
## parse
# Text-cleaning pipeline for the example text. It ends with the vocabulary
# (row names of the term-document matrix) stored in `palavras`.
# NOTE(review): the corpus is built from `exemplo`, not from the
# accent-stripped `exemplo2` created earlier in the file; confirm that this
# is intended.
# NOTE(review): `tmCorpus` is presumably provided by the 'textmining'
# package attached above; verify.
exemplo_corpus <- tmCorpus(exemplo)
# Lower-case the text; `tolower` is wrapped with content_transformer() so it
# can be passed to tm_map().
exemplo_corpus <- tm_map(exemplo_corpus, content_transformer(tolower))
# Apply the punctuation and number removal transformations. This runs over
# the whole text, URLs included.
exemplo_corpus <- tm_map(exemplo_corpus, removePunctuation)
exemplo_corpus <- tm_map(exemplo_corpus, removeNumbers)
# Remove the words returned by stopwords("portuguese").
exemplo_corpus <- tm_map(exemplo_corpus, removeWords, stopwords("portuguese"))
# Apply stripWhitespace after the removals above.
exemplo_corpus <- tm_map(exemplo_corpus, stripWhitespace)
# From here on `exemplo_corpus` no longer holds the corpus: it is overwritten
# with the result of TermDocumentMatrix().
exemplo_corpus <- TermDocumentMatrix(exemplo_corpus)
exemplo_corpus_matrix <- as.matrix(exemplo_corpus)
# The row names of the matrix are kept as the list of words to process.
palavras<-row.names(exemplo_corpus_matrix)
É preciso proceder com a lematização do texto, isto é, identificar as formas flexionadas das palavras e convertê-las para as suas versões dicionarizadas. Por exemplo, é preciso tomar as palavras "comi", "comemos", "comeríamos" e "comia" e transformá-las todas em "comer".
O blog do Marcus Nunes fez isso para a legenda de seriados. Funcionou muito bem.
# Lemmatization
# Source: https://marcusnunes.me/posts/analise-de-sentimentos-com-r-bojack-horseman-vs-brooklyn-99/
# Note: put the data into tidy format before running.
#
# Download the Portuguese lemmatization list (tab-delimited, no header row)
# and label its two columns "stem" and "term", in file order. The lookup
# further down matches words against `term` and substitutes `stem`.
lemma_dic <- stats::setNames(
  read.delim(
    file = "https://raw.githubusercontent.com/michmech/lemmatization-lists/master/lemmatization-pt.txt",
    header = FALSE,
    stringsAsFactors = FALSE
  ),
  c("stem", "term")
)
# Lemmatize the vocabulary: each entry of `palavras` is replaced by the
# `stem` of the dictionary row whose `term` equals it, but only when the
# dictionary has exactly one such row. Words that are missing from the
# dictionary, or listed under more than one row, are left unchanged.
#
# Vectorized replacement for the original word-by-word loop:
#   * an empty `palavras` is a no-op (`1:length(palavras)` would have
#     iterated over c(1, 0) and failed inside `if`);
#   * NA values in `palavras` or `lemma_dic$term` no longer abort the run;
#   * the dictionary is scanned once instead of once per word.
if (length(palavras) > 0) {
  # Row of the first dictionary entry for each word (NA when absent).
  posicao <- match(palavras, lemma_dic$term)

  # Terms that occur on more than one dictionary row are ambiguous.
  termos_ambiguos <- unique(lemma_dic$term[duplicated(lemma_dic$term)])

  # Replace only words that are present in the dictionary exactly once.
  substituir <- !is.na(palavras) &
    !is.na(posicao) &
    !(palavras %in% termos_ambiguos)

  palavras[substituir] <- as.character(lemma_dic$stem[posicao[substituir]])
}