Sys.setenv(LANG = "en")
#install.packages("tinytex")
# A few examples of how to install packages
#install.packages("tm") # for text mining
#install.packages("SnowballC") # for text manipulation
#install.packages("wordcloud") # word-cloud generator
library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(udpipe)
library(lattice)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
The udpipe library provides pretrained models for a number of languages; a model can be downloaded to your machine with the udpipe_download_model() function and then loaded into your R session with udpipe_load_model().
model <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /private/var/folders/0m/n2h82y8x40l03_rndfw0bt640000gn/T/net.whatsapp.WhatsApp/documents/1E619178-64A6-4B87-A5DF-318C1A7CF99E/english-ewt-ud-2.5-191206.udpipe
## - This model has been trained on version 2.5 of data from https://universaldependencies.org
## - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
## - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
## - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
## Downloading finished, model stored at '/private/var/folders/0m/n2h82y8x40l03_rndfw0bt640000gn/T/net.whatsapp.WhatsApp/documents/1E619178-64A6-4B87-A5DF-318C1A7CF99E/english-ewt-ud-2.5-191206.udpipe'
# Load the model from the path returned by udpipe_download_model()
# (the download log above shows it was stored in a temporary directory,
# so loading by bare filename would fail outside that working directory)
udmodel_english <- udpipe_load_model(file = model$file_model)
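Since udpipe_download_model() writes to the working directory by default (the temporary path in the log above comes from the environment this report was knitted in), a common pattern is to download the model once into a fixed folder and reuse it on later runs. A minimal sketch, where the "models" folder name is an illustrative assumption, not part of the original script:
# Sketch: cache the model in a fixed folder and reuse it across runs
# ("models" is an arbitrary folder name chosen for this example)
model_path <- file.path("models", "english-ewt-ud-2.5-191206.udpipe")
if (!file.exists(model_path)) {
  dir.create("models", showWarnings = FALSE)
  m <- udpipe_download_model(language = "english", model_dir = "models")
  model_path <- m$file_model
}
udmodel_english <- udpipe_load_model(file = model_path)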
Dataset source: https://www.kaggle.com/datasets/meetnagadia/amazon-kindle-book-review-for-sentiment-analysis
bd <- read.csv("/Users/hugoenrique/Desktop/Universidad/8vo Semestre/Planeación Estratégica/M2/R Scripts/all_kindle_review.csv", header = TRUE, stringsAsFactors = FALSE)
glimpse(bd)
## Rows: 12,000
## Columns: 11
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ Unnamed..0 <int> 11539, 5957, 9146, 7038, 1776, 3744, 13641, 4448, 2797,…
## $ asin <chr> "B0033UV8HI", "B002HJV4DE", "B002ZG96I4", "B002QHWOEU",…
## $ helpful <chr> "[8, 10]", "[1, 1]", "[0, 0]", "[1, 3]", "[0, 1]", "[6,…
## $ rating <int> 3, 5, 3, 3, 4, 5, 2, 4, 5, 4, 1, 4, 1, 4, 5, 2, 4, 1, 5…
## $ reviewText <chr> "Jace Rankin may be short, but he's nothing to mess wit…
## $ reviewTime <chr> "09 2, 2010", "10 8, 2013", "04 11, 2014", "07 5, 2014"…
## $ reviewerID <chr> "A3HHXRELK8BHQG", "A2RGNZ0TRF578I", "A3S0H2HV6U1I7F", "…
## $ reviewerName <chr> "Ridley", "Holly Butler", "Merissa", "Cleargrace", "Rjo…
## $ summary <chr> "Entertaining But Average", "Terrific menage scenes!", …
## $ unixReviewTime <int> 1283385600, 1381190400, 1397174400, 1404518400, 1356912…
This is a subset of the product reviews from the Amazon Kindle Store category, covering May 1996 through July 2014.
Contents:
The full dataset contains a total of 982,619 reviews, where each user has written at least 5 reviews and each product has at least 5 reviews; the CSV loaded here is a 12,000-review sample, as the glimpse() output shows.
Dataset columns: see the glimpse() output above.
# Parse the review date and extract year, month, and day as strings
bd_2 <- bd %>%
  mutate(
    review_date = mdy(reviewTime),           # "09 2, 2010" -> Date
    year  = str_sub(review_date, 1, 4),
    month = str_sub(review_date, 6, 7),
    date  = str_sub(review_date, 9, 10))
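Since review_date is already a Date, the year, month, and day could also be taken directly with lubridate's accessors instead of substring positions; a minimal alternative sketch (it yields numeric columns, so the as.numeric(month) conversion used further below would become unnecessary):
# Alternative: numeric date parts via lubridate accessors
bd_2 <- bd %>%
  mutate(
    review_date = mdy(reviewTime),
    year  = year(review_date),
    month = month(review_date),
    day   = mday(review_date))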
bd_2 %>% group_by(year) %>% count() %>%
  ggplot() +
  geom_bar(aes(x = year, y = n, group = 1), stat = "identity", fill = "steelblue") +
  labs(title = "Number of reviews per year", x = "Year", y = "Number of reviews") +
  theme_minimal()
bd_2 %>% group_by(month) %>% count() %>%
  ggplot() +
  geom_bar(aes(x = month, y = n), stat = "identity", fill = "lightgreen") +
  labs(title = "Number of reviews per month", x = "Month", y = "Number of reviews") +
  theme_minimal()
# Convert from text or factor to numeric
bd_2$rating <- as.numeric(as.character(bd_2$rating))
bd_2 %>% group_by(rating) %>% count() %>%
  ggplot() +
  geom_bar(aes(x = rating, y = n), stat = "identity", fill = "lightpink") +
  labs(title = "Distribution of product ratings (1 to 5)", x = "Rating", y = "Number of ratings") +
  theme_minimal()
A filter was applied to focus on reviews from 2013, January through June.
bd_2 <- bd_2 %>% mutate(month = as.numeric(month))
filtro_1 <- bd_2 %>% filter(year == 2013 & month >= 1 & month <= 6)
filtro_1 %>% group_by(month) %>% count() %>%
  ggplot() +
  geom_bar(aes(x = month, y = n), stat = "identity", fill = "steelblue") +
  labs(title = "Number of reviews, 2013 (January to June)", x = "Month", y = "Number of reviews") +
  theme_minimal()
# Create a corpus from the reviewText column of the filtro_1 data frame
corpus1 <- VCorpus(VectorSource(filtro_1$reviewText))
# Convert to lowercase
corpus1 <- tm_map(corpus1, content_transformer(tolower))
# Remove numbers
corpus1 <- tm_map(corpus1, removeNumbers)
# Remove punctuation
corpus1 <- tm_map(corpus1, removePunctuation)
# Remove extra whitespace
corpus1 <- tm_map(corpus1, stripWhitespace)
To find the most frequent words we use the term-document matrix: essentially a table that records how often each word appears in the documents under analysis.
dtm <- TermDocumentMatrix(corpus1)          # terms as rows, documents as columns
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)    # total frequency of each term
d <- data.frame(word = names(v), freq = v)
head(d, 50)
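To make the structure concrete, here is a toy term-document matrix built from two invented one-line documents (purely illustrative, not part of the review data):
# Toy example: rows are terms, columns are documents
toy <- VCorpus(VectorSource(c("good book good story", "bad book")))
inspect(TermDocumentMatrix(toy))
# e.g. "good" appears twice in document 1 and zero times in document 2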
# Remove English stopwords
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
dtm1 <- TermDocumentMatrix(corpus1)
m1 <- as.matrix(dtm1)
v1 <- sort(rowSums(m1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
head(d1, 50)
findFreqTerms(dtm1, lowfreq = 20)
## [1] "able" "absolutely" "across" "action"
## [5] "actually" "add" "added" "admit"
## [9] "adult" "adults" "adventure" "age"
## [13] "ago" "almost" "alone" "along"
## [17] "alpha" "already" "also" "although"
## [21] "always" "amazing" "amazon" "amount"
## [25] "annoying" "another" "anyone" "anything"
## [29] "around" "attention" "attraction" "author"
## [33] "authors" "available" "away" "awesome"
## [37] "back" "background" "bad" "based"
## [41] "beautiful" "became" "become" "becomes"
## [45] "bed" "begin" "beginning" "believable"
## [49] "believe" "best" "better" "beyond"
## [53] "big" "bit" "black" "book"
## [57] "books" "boring" "bother" "bought"
## [61] "boy" "break" "bring" "brother"
## [65] "brothers" "brought" "buy" "call"
## [69] "called" "came" "can" "cant"
## [73] "care" "case" "caught" "certain"
## [77] "certainly" "chance" "change" "chapter"
## [81] "chapters" "character" "characters" "check"
## [85] "children" "christian" "classic" "clear"
## [89] "close" "collection" "come" "comes"
## [93] "coming" "complete" "completely" "concept"
## [97] "confused" "confusing" "connection" "consider"
## [101] "continue" "continues" "couldnt" "couple"
## [105] "course" "cover" "crazy" "create"
## [109] "cup" "cute" "dark" "day"
## [113] "days" "deal" "death" "decent"
## [117] "decided" "definitely" "depth" "description"
## [121] "descriptions" "despite" "detail" "details"
## [125] "developed" "development" "dialogue" "didnt"
## [129] "different" "difficult" "disappointed" "doesnt"
## [133] "done" "dont" "downloaded" "dragon"
## [137] "dragons" "drama" "drawn" "due"
## [141] "early" "earth" "easily" "easy"
## [145] "ebook" "editing" "editor" "either"
## [149] "else" "emotions" "end" "ended"
## [153] "ending" "ends" "enjoy" "enjoyable"
## [157] "enjoyed" "enough" "entertaining" "entire"
## [161] "erotic" "erotica" "errors" "especially"
## [165] "etc" "even" "ever" "every"
## [169] "everyone" "everything" "evil" "exactly"
## [173] "example" "excellent" "exciting" "expect"
## [177] "expected" "expecting" "experience" "extremely"
## [181] "eyes" "face" "fact" "fairly"
## [185] "fairy" "fall" "familiar" "family"
## [189] "fan" "fantasy" "far" "fast"
## [193] "father" "favorite" "feel" "feeling"
## [197] "feelings" "feels" "fell" "felt"
## [201] "female" "fiction" "figure" "filled"
## [205] "finally" "find" "finding" "finds"
## [209] "finish" "finished" "first" "five"
## [213] "follow" "forced" "form" "forward"
## [217] "found" "four" "free" "friend"
## [221] "friends" "full" "fun" "funny"
## [225] "future" "gave" "gay" "genre"
## [229] "get" "gets" "getting" "girl"
## [233] "give" "given" "gives" "giving"
## [237] "glad" "god" "goes" "going"
## [241] "gone" "good" "got" "grammar"
## [245] "grammatical" "graphic" "great" "group"
## [249] "guess" "guy" "guys" "hadnt"
## [253] "half" "handsome" "happen" "happened"
## [257] "happens" "happy" "hard" "hate"
## [261] "havent" "head" "heart" "heat"
## [265] "held" "hell" "help" "hero"
## [269] "heroine" "hes" "high" "highly"
## [273] "history" "hold" "home" "honestly"
## [277] "hooked" "hope" "hoping" "horror"
## [281] "hot" "hours" "house" "however"
## [285] "huge" "human" "humans" "humor"
## [289] "husband" "idea" "ideas" "ill"
## [293] "immediately" "important" "information" "instead"
## [297] "interest" "interested" "interesting" "introduced"
## [301] "involved" "isnt" "issue" "issues"
## [305] "ive" "jack" "jake" "james"
## [309] "job" "just" "keep" "keeps"
## [313] "kept" "kids" "kill" "killer"
## [317] "kind" "kinda" "kindle" "king"
## [321] "knew" "know" "known" "knows"
## [325] "lack" "language" "last" "later"
## [329] "lead" "learn" "least" "leave"
## [333] "left" "length" "less" "let"
## [337] "lets" "level" "life" "light"
## [341] "like" "liked" "line" "lines"
## [345] "list" "literally" "little" "live"
## [349] "lives" "living" "long" "longer"
## [353] "look" "looking" "lost" "lot"
## [357] "lots" "love" "loved" "loves"
## [361] "loving" "made" "magic" "main"
## [365] "make" "makes" "making" "male"
## [369] "man" "many" "marry" "mate"
## [373] "may" "maybe" "mean" "means"
## [377] "meet" "meets" "men" "mention"
## [381] "mentioned" "met" "might" "mind"
## [385] "missing" "money" "months" "mother"
## [389] "move" "moves" "movie" "much"
## [393] "must" "mysteries" "mystery" "name"
## [397] "need" "needed" "needs" "never"
## [401] "new" "next" "nice" "night"
## [405] "none" "note" "nothing" "novel"
## [409] "novella" "novels" "now" "number"
## [413] "often" "okay" "old" "older"
## [417] "one" "ones" "opinion" "order"
## [421] "original" "others" "otherwise" "outside"
## [425] "overall" "pace" "page" "pages"
## [429] "paid" "paranormal" "part" "particularly"
## [433] "parts" "pass" "past" "pay"
## [437] "people" "perfect" "perhaps" "person"
## [441] "pick" "picked" "place" "play"
## [445] "please" "plot" "plots" "plus"
## [449] "point" "poor" "potential" "predictable"
## [453] "premise" "pretty" "previous" "price"
## [457] "probably" "problem" "problems" "purchase"
## [461] "purchased" "purpose" "put" "quick"
## [465] "quickly" "quite" "rather" "rating"
## [469] "read" "reader" "readers" "reading"
## [473] "reads" "ready" "real" "realistic"
## [477] "realize" "really" "reason" "recommend"
## [481] "recommended" "relationship" "remember" "research"
## [485] "rest" "return" "review" "reviewers"
## [489] "reviews" "right" "romance" "romantic"
## [493] "run" "rushed" "sad" "said"
## [497] "save" "saw" "say" "saying"
## [501] "says" "scene" "scenes" "school"
## [505] "scifi" "second" "see" "seeing"
## [509] "seem" "seemed" "seems" "seen"
## [513] "sense" "sequel" "serial" "series"
## [517] "serious" "seriously" "set" "setting"
## [521] "several" "sex" "sexual" "sexy"
## [525] "shes" "shifter" "short" "show"
## [529] "shows" "side" "silly" "simple"
## [533] "simply" "since" "sister" "situation"
## [537] "slow" "small" "solid" "someone"
## [541] "something" "sometimes" "somewhat" "soon"
## [545] "sorry" "sort" "space" "spent"
## [549] "stand" "star" "stars" "start"
## [553] "started" "starts" "stay" "steamy"
## [557] "still" "stop" "stories" "story"
## [561] "storyline" "straight" "strange" "strong"
## [565] "stuff" "stupid" "style" "super"
## [569] "supposed" "sure" "surprised" "suspense"
## [573] "sweet" "take" "taken" "takes"
## [577] "taking" "tale" "tales" "talk"
## [581] "talking" "tell" "telling" "tells"
## [585] "terrible" "thank" "thats" "theres"
## [589] "theyre" "thing" "things" "think"
## [593] "thinking" "third" "thoroughly" "though"
## [597] "thought" "three" "throughout" "thrown"
## [601] "time" "times" "title" "together"
## [605] "told" "took" "top" "totally"
## [609] "tried" "trouble" "true" "truly"
## [613] "try" "trying" "turn" "turned"
## [617] "turns" "twist" "twists" "two"
## [621] "type" "typos" "understand" "unfortunately"
## [625] "unique" "unless" "use" "used"
## [629] "uses" "using" "usual" "usually"
## [633] "vampire" "vampires" "verne" "version"
## [637] "wait" "waiting" "want" "wanted"
## [641] "wanting" "wants" "war" "wasnt"
## [645] "waste" "way" "weird" "well"
## [649] "went" "whole" "wife" "will"
## [653] "willing" "wish" "without" "wolf"
## [657] "woman" "women" "wonder" "wonderful"
## [661] "wont" "word" "words" "work"
## [665] "worked" "working" "works" "world"
## [669] "worst" "worth" "wouldnt" "write"
## [673] "writer" "writers" "writes" "writing"
## [677] "written" "wrong" "wrote" "year"
## [681] "years" "yes" "yet" "youll"
## [685] "young" "youre" "zombies"
findAssocs(dtm1, terms = "book", corlimit = 0.5)
## $book
## numeric(0)
barplot(d1[1:20,]$freq, las = 2, names.arg = d1[1:20,]$word,
        col = "lightblue", main = "Most Frequent Words, F1",
        ylab = "Frequency")
The dominant word is “book”, the most relevant and frequent term; note that findAssocs() above returned no terms correlated with it at the 0.5 level, which is consistent with “book” appearing in nearly every review. The top of the list groups the core concepts (“book”, “story”, “read”), and frequencies fall off steadily from there. This shows that, beyond the key concepts, a more varied vocabulary appears, used to give opinions and describe the elements of the stories.
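Because “book” dominates without adding much signal in a corpus of book reviews, one option (not applied in this analysis) is to extend the stopword list with domain-specific terms and rebuild the matrix; a minimal sketch, where the extra words are illustrative choices:
# Sketch: drop corpus-specific high-frequency, low-information terms
# (the added words below are illustrative, not from the original analysis)
domain_stops <- c(stopwords("english"), "book", "books", "read", "reading")
corpus1_ds <- tm_map(corpus1, removeWords, domain_stops)
dtm_ds <- TermDocumentMatrix(corpus1_ds)
head(sort(rowSums(as.matrix(dtm_ds)), decreasing = TRUE), 20)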
# Apply linguistic annotation with udpipe to the reviewText column
s1 <- udpipe_annotate(udmodel_english, x = filtro_1$reviewText)
# Convert to a data frame
x1 <- as.data.frame(s1)
stats1 <- txt_freq(x1$upos)
stats1$key <- factor(stats1$key, levels = rev(stats1$key))
barchart(key ~ freq, data = stats1, col = "orange",
         main = "UPOS (Universal Parts of Speech)\n Frequency of Occurrence, F1",
         xlab = "Frequency")
In the morphosyntactic analysis, the dominant category is NOUN: it is the most relevant and frequent part of speech, indicating that the text centers on naming people, places, and things. At the top of the list are the fundamental parts that build the core of sentences, and category frequencies decline from there. This reflects a typical linguistic structure: the text is built on a base of nouns and verbs.
stats1 <- subset(x1, upos %in% c("NOUN"))
stats1 <- txt_freq(stats1$token)
stats1$key <- factor(stats1$key, levels = rev(stats1$key))
barchart(key ~ freq, data = head(stats1, 20), col = "cadetblue",
         main = "Most Common Nouns, F1", xlab = "Frequency")
The most frequent nouns, such as book, story, characters, series, and author, show that user reviews revolve mainly around the narrative content rather than the device or the platform.
stats1 <- subset(x1, upos %in% c("ADJ"))
stats1 <- txt_freq(stats1$token)
stats1$key <- factor(stats1$key, levels = rev(stats1$key))
barchart(key ~ freq, data = head(stats1, 20), col = "purple",
         main = "Most Common Adjectives, F1", xlab = "Frequency")
The dominant adjectives, such as good, great, interesting, and better, show a generally positive tendency in the reviews, indicating high satisfaction with the books analyzed. The presence of bad and different, although less frequent, points to a critical minority, likely associated with negative reviews.
stats1 <- subset(x1, upos %in% c("VERB"))
stats1 <- txt_freq(stats1$token)
stats1$key <- factor(stats1$key, levels = rev(stats1$key))
barchart(key ~ freq, data = head(stats1, 20), col = "gold",
         main = "Most Common Verbs, F1", xlab = "Frequency")
The most frequent verbs, led by read, have, get, like, love, and enjoyed, show that the reviews center on the personal reading experience and the emotional appraisal of the book.
Automatic keyword extraction using a pretrained algorithm: RAKE (Rapid Automatic Keyword Extraction). In essence, the algorithm identifies key phrases in a text by analyzing how often certain words appear together within it.
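For intuition before applying udpipe's keywords_rake() below: RAKE scores each word as its co-occurrence degree divided by its frequency, and a candidate phrase scores the sum of its member words. A minimal toy sketch, where the candidate phrases are invented for illustration:
# Toy RAKE scoring (candidate phrases here are invented, not from the corpus)
phrases <- list(c("great", "story"), c("story", "line"), "book", "book")
words <- unique(unlist(phrases))
freq <- sapply(words, function(w) sum(unlist(phrases) == w))
# degree: for each phrase containing w, add that phrase's length
deg <- sapply(words, function(w)
  sum(sapply(phrases, function(p) if (w %in% p) length(p) else 0)))
word_score <- deg / freq                        # "story" scores 2, lone "book" scores 1
sapply(phrases, function(p) sum(word_score[p])) # multi-word phrases score highest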
stats1 <- keywords_rake(x = x1, term = "lemma", group = "doc_id",
                        relevant = x1$upos %in% c("NOUN", "ADJ"))
stats1$key <- factor(stats1$keyword, levels = rev(stats1$keyword))
barchart(key ~ rake, data = head(subset(stats1, freq > 3), 20), col = "red",
         main = "Keywords identified by RAKE, F1",
         xlab = "RAKE score")
The most prominent phrases identified by RAKE, such as “grammatical error”, “panic attack”, “female lead”, and “historical romance”, show a mix of narrative themes and critical perceptions.
# Recode UPOS tags to one-letter codes for phrase matching
x1$phrase_tag <- as_phrasemachine(x1$upos, type = "upos")
# Simple noun-phrase pattern: A = adjective, N = noun, P = adposition, D = determiner
stats1 <- keywords_phrases(x = x1$phrase_tag, term = tolower(x1$token),
                           pattern = "(A|N)*N(P+D*(A|N)*N)*",
                           is_regex = TRUE, detailed = FALSE)
stats1 <- subset(stats1, ngram > 1 & freq > 3)
stats1$key <- factor(stats1$keyword, levels = rev(stats1$keyword))
barchart(key ~ freq, data = head(stats1, 20), col = "magenta",
         main = "Keywords - Noun Phrases, F1", xlab = "Frequency")
The most frequent phrases, such as “that I”, “story line”, “short story”, “sex scenes”, “main character”, and “good book”, show that users focus mainly on narrative and content aspects, as well as on other readers' opinions.
Word cloud with a minimum word frequency of 50.
set.seed(20250402)
wordcloud(words = d1$word, freq = d1$freq, min.freq = 50,
          max.words = Inf, random.order = TRUE, rot.per = 0.5,
          colors = brewer.pal(8, "Dark2"))
A wide lexical variety is observed, with words such as book, story, characters, series, good, reading, love, and just standing out. This shows that the review corpus is broad and covers multiple topics related to reading, plot, characters, and emotions.
Word cloud with a minimum word frequency of 150.
set.seed(20250402)
wordcloud(words = d1$word, freq = d1$freq, min.freq = 150,
          max.words = Inf, random.order = TRUE, rot.per = 0.5,
          colors = brewer.pal(8, "Dark2"))
This cloud concentrates on words such as book, story, read, characters, and good, suggesting that most comments revolve around the overall reading experience and narrative quality.
A filter was applied to focus on reviews from 2013, July through December.
filtro_2 <- bd_2 %>% filter(year == 2013 & month >= 7 & month <= 12)
filtro_2 %>% group_by(month) %>% count() %>%
  ggplot() +
  geom_bar(aes(x = month, y = n), stat = "identity", fill = "steelblue") +
  labs(title = "Number of reviews, 2013 (July to December)", x = "Month", y = "Number of reviews") +
  theme_minimal()
# Create a corpus from the reviewText column of the filtro_2 data frame
corpus2 <- VCorpus(VectorSource(filtro_2$reviewText))
# Convert to lowercase
corpus2 <- tm_map(corpus2, content_transformer(tolower))
# Remove numbers
corpus2 <- tm_map(corpus2, removeNumbers)
# Remove punctuation
corpus2 <- tm_map(corpus2, removePunctuation)
# Remove extra whitespace
corpus2 <- tm_map(corpus2, stripWhitespace)
As before, the term-document matrix is used to find the most frequent words in this second filter.
dtm2 <- TermDocumentMatrix(corpus2)
m2 <- as.matrix(dtm2)
v2 <- sort(rowSums(m2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
head(d2, 50)
# Remove English stopwords
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
dtm3 <- TermDocumentMatrix(corpus2)
m3 <- as.matrix(dtm3)
v3 <- sort(rowSums(m3),decreasing=TRUE)
d3 <- data.frame(word = names(v3),freq=v3)
head(d3, 50)
findFreqTerms(dtm3, lowfreq = 20)
## [1] "able" "absolutely" "across" "action" "actually"
## [6] "add" "admit" "age" "ago" "almost"
## [11] "along" "alpha" "already" "also" "although"
## [16] "always" "amazing" "amazon" "another" "anyone"
## [21] "anything" "around" "attention" "author" "authors"
## [26] "away" "back" "bad" "based" "bdsm"
## [31] "beautiful" "become" "becomes" "bed" "beginning"
## [36] "believable" "believe" "best" "better" "beyond"
## [41] "big" "bit" "blake" "book" "books"
## [46] "boring" "bought" "brett" "bring" "brother"
## [51] "buy" "call" "came" "can" "cant"
## [56] "care" "case" "certainly" "chance" "change"
## [61] "chapter" "chapters" "character" "characters" "check"
## [66] "chemistry" "child" "children" "christmas" "classic"
## [71] "collection" "come" "comes" "coming" "complete"
## [76] "completely" "continue" "copy" "couldnt" "couple"
## [81] "course" "cover" "cute" "dark" "daughter"
## [86] "day" "days" "decent" "decided" "definitely"
## [91] "demons" "depth" "described" "description" "descriptions"
## [96] "details" "developed" "development" "dialogue" "didnt"
## [101] "different" "difficult" "disappointed" "doesnt" "done"
## [106] "dont" "due" "earth" "easy" "edge"
## [111] "editing" "either" "else" "emotional" "end"
## [116] "ended" "ending" "ends" "enjoy" "enjoyable"
## [121] "enjoyed" "enough" "entertaining" "entire" "erotic"
## [126] "erotica" "errors" "especially" "even" "ever"
## [131] "every" "everyone" "everything" "evil" "excellent"
## [136] "exciting" "expect" "expected" "expecting" "eyes"
## [141] "fact" "fairly" "family" "fan" "fantasy"
## [146] "far" "fast" "father" "favorite" "feel"
## [151] "feeling" "feelings" "felt" "female" "fiction"
## [156] "fighting" "figure" "finally" "find" "finding"
## [161] "finds" "finish" "finished" "first" "five"
## [166] "follow" "forward" "found" "four" "free"
## [171] "friend" "friends" "full" "fun" "funny"
## [176] "future" "gave" "gay" "get" "gets"
## [181] "getting" "girl" "give" "given" "gives"
## [186] "glad" "goes" "going" "good" "got"
## [191] "grammar" "great" "group" "guess" "guy"
## [196] "guys" "half" "happen" "happened" "happens"
## [201] "happy" "hard" "hate" "havent" "heart"
## [206] "hell" "help" "hero" "heroine" "hes"
## [211] "high" "highly" "history" "hold" "home"
## [216] "honest" "hope" "hoping" "hot" "house"
## [221] "however" "human" "humans" "hunter" "husband"
## [226] "idea" "ill" "immediately" "instead" "interest"
## [231] "interested" "interesting" "involved" "isnt" "issues"
## [236] "ive" "job" "just" "kallysten" "kate"
## [241] "keep" "keeps" "kept" "kind" "kindle"
## [246] "knew" "know" "known" "knows" "language"
## [251] "last" "later" "lead" "learn" "least"
## [256] "leave" "left" "less" "let" "life"
## [261] "light" "like" "liked" "line" "lisa"
## [266] "little" "live" "lives" "living" "long"
## [271] "longer" "look" "looked" "looking" "lost"
## [276] "lot" "lots" "love" "loved" "loves"
## [281] "made" "magic" "main" "make" "makes"
## [286] "making" "male" "man" "many" "marc"
## [291] "marriage" "married" "mate" "mates" "matter"
## [296] "may" "maybe" "mean" "meet" "men"
## [301] "met" "might" "mind" "mistress" "money"
## [306] "mother" "move" "movie" "moving" "much"
## [311] "must" "mystery" "name" "need" "needed"
## [316] "needs" "never" "new" "next" "nice"
## [321] "night" "nothing" "novel" "novella" "novels"
## [326] "now" "number" "okay" "old" "older"
## [331] "one" "ones" "order" "others" "overall"
## [336] "pack" "page" "pages" "pain" "part"
## [341] "parts" "past" "pay" "people" "perfect"
## [346] "perhaps" "person" "pick" "place" "play"
## [351] "plot" "point" "points" "poor" "potential"
## [356] "predictable" "premise" "pretty" "price" "probably"
## [361] "problem" "problems" "put" "quick" "quickly"
## [366] "quite" "rather" "rating" "read" "reader"
## [371] "readers" "reading" "real" "really" "reason"
## [376] "recommend" "relationship" "remember" "rest" "review"
## [381] "reviews" "ridiculous" "right" "romance" "rushed"
## [386] "said" "save" "say" "scene" "scenes"
## [391] "school" "second" "secret" "see" "seem"
## [396] "seemed" "seems" "seen" "sense" "series"
## [401] "seriously" "set" "several" "sex" "sexual"
## [406] "sexy" "shes" "short" "show" "shows"
## [411] "side" "sin" "since" "sister" "slow"
## [416] "small" "someone" "something" "sometimes" "soon"
## [421] "sorry" "sort" "soul" "star" "stars"
## [426] "start" "started" "starts" "stay" "steamy"
## [431] "still" "stop" "stories" "story" "storyline"
## [436] "strange" "strong" "stuff" "style" "supposed"
## [441] "sure" "surprised" "suspense" "sweet" "take"
## [446] "takes" "tale" "tales" "talk" "tell"
## [451] "thats" "theres" "theyre" "thing" "things"
## [456] "think" "thinking" "third" "though" "thought"
## [461] "three" "throughout" "time" "times" "together"
## [466] "told" "took" "top" "totally" "towards"
## [471] "tried" "true" "truly" "try" "trying"
## [476] "turn" "turns" "twist" "twists" "two"
## [481] "type" "understand" "unique" "use" "used"
## [486] "usually" "vampire" "vampires" "version" "wait"
## [491] "want" "wanted" "wanting" "wants" "war"
## [496] "wasnt" "waste" "way" "well" "went"
## [501] "west" "whats" "whole" "will" "wish"
## [506] "within" "without" "woman" "women" "wonderful"
## [511] "wont" "word" "words" "work" "works"
## [516] "world" "worst" "worth" "wouldnt" "write"
## [521] "writer" "writing" "written" "wrong" "year"
## [526] "years" "yes" "yet" "youll" "young"
## [531] "youre"
findAssocs(dtm3, terms = "book", corlimit = 0.2)
## $book
## like first also author hint contained even much
## 0.27 0.24 0.23 0.23 0.22 0.21 0.21 0.21
barplot(d3[1:20,]$freq, las = 2, names.arg = d3[1:20,]$word,
        col = "lightblue", main = "Most Frequent Words, F2",
        ylab = "Frequency")
In the second half of 2013, the most frequent words follow a trend very similar to the first filter, with “book”, “story”, “read”, and “one” as the dominant terms. This confirms that users continue to center their reviews on the reading experience and on narrative quality.
# Apply linguistic annotation with udpipe to the reviewText column
s2 <- udpipe_annotate(udmodel_english, x = filtro_2$reviewText)
# Convert to a data frame
x2 <- as.data.frame(s2)
stats2 <- txt_freq(x2$upos)
stats2$key <- factor(stats2$key, levels = rev(stats2$key))
barchart(key ~ freq, data = stats2, col = "orange",
         main = "UPOS (Universal Parts of Speech)\n Frequency of Occurrence, F2",
         xlab = "Frequency")
The morphosyntactic analysis shows that the most frequent categories in the reviews are nouns (NOUN), followed by verbs (VERB) and pronouns (PRON). This pattern confirms that users structure their reviews around concrete topics (books, stories, characters) accompanied by actions and personal impressions. The remaining categories are indicators of natural language.
stats2 <- subset(x2, upos %in% c("NOUN"))
stats2 <- txt_freq(stats2$token)
stats2$key <- factor(stats2$key, levels = rev(stats2$key))
barchart(key ~ freq, data = head(stats2, 20), col = "cadetblue",
         main = "Most Common Nouns, F2", xlab = "Frequency")
This again confirms the continuity of the narrative focus already observed in the first filter. Readers keep centering their reviews on the fundamental elements of the literary experience: the book itself, the story, the characters, and the author.
stats2 <- subset(x2, upos %in% c("ADJ"))
stats2 <- txt_freq(stats2$token)
stats2$key <- factor(stats2$key, levels = rev(stats2$key))
barchart(key ~ freq, data = head(stats2, 20), col = "purple",
         main = "Most Common Adjectives, F2", xlab = "Frequency")
The predominant adjectives, such as good, great, interesting, and better, show that the general perception remains positive, highlighting the quality and enjoyment of the works. The high frequency of good and great reaffirms reader satisfaction, while interesting suggests sustained interest in the plot or subject matter.
stats2 <- subset(x2, upos %in% c("VERB"))
stats2 <- txt_freq(stats2$token)
stats2$key <- factor(stats2$key, levels = rev(stats2$key))
barchart(key ~ freq, data = head(stats2, 20), col = "gold",
         main = "Most Common Verbs, F2", xlab = "Frequency")
Much like in Filter 1, the most frequent verb, read, dominates the distribution by a wide margin, followed by have, get, had, and reading, confirming that the reviews revolve around the act of reading and the reading experience. This reinforces the idea that users do not just evaluate the product; they recount their personal experience with the book.
As before, keywords are extracted automatically with the pretrained RAKE algorithm, which identifies key phrases by how often words appear together in the text.
stats2 <- keywords_rake(x = x2, term = "lemma", group = "doc_id",
                        relevant = x2$upos %in% c("NOUN", "ADJ"))
stats2$key <- factor(stats2$keyword, levels = rev(stats2$keyword))
barchart(key ~ rake, data = head(subset(stats2, freq > 3), 20), col = "red",
         main = "Keywords identified by RAKE, F2",
         xlab = "RAKE score")
The phrases extracted by RAKE reveal the most specific, recurring themes in the second-half reviews. Among the most prominent are “high school”, “fairy tale”, “write style”, and “old fashioned”, indicating a strong presence of works set in young-adult contexts.
# Recode UPOS tags to one-letter codes for phrase matching
x2$phrase_tag <- as_phrasemachine(x2$upos, type = "upos")
stats2 <- keywords_phrases(x = x2$phrase_tag, term = tolower(x2$token),
                           pattern = "(A|N)*N(P+D*(A|N)*N)*",
                           is_regex = TRUE, detailed = FALSE)
stats2 <- subset(stats2, ngram > 1 & freq > 3)
stats2$key <- factor(stats2$keyword, levels = rev(stats2$keyword))
barchart(key ~ freq, data = head(stats2, 20), col = "magenta",
         main = "Keywords - Noun Phrases, F2", xlab = "Frequency")
The most frequent noun phrases, such as “short story”, “good read”, “sex scenes”, “great story”, and “story line”, show that the second-half reviews keep a strong focus on narrative, entertainment, and romantic or erotic themes.
Word cloud with a minimum word frequency of 50.
set.seed(20250402)
wordcloud(words = d3$word, freq = d3$freq, min.freq = 50,
          max.words = Inf, random.order = TRUE, rot.per = 0.5,
          colors = brewer.pal(8, "Dark2"))
A wide variety of terms is observed, such as book, story, love, character, great, good, like, author, and series, reflecting the thematic and emotional diversity of the reviews. Readers discuss the story and its characters as much as the narrative quality.
Word cloud with a minimum word frequency of 100.
set.seed(20250402)
wordcloud(words = d3$word, freq = d3$freq, min.freq = 100,
          max.words = Inf, random.order = TRUE, rot.per = 0.5,
          colors = brewer.pal(8, "Dark2"))
The cloud concentrates on the most representative terms: book, story, good, like, love, and characters, showing the main axes of user discourse. This filter reduces noise and makes it clear that the reviews revolve mainly around the quality of the book, the plot, and the reader's emotional satisfaction.
Filter 1 (January - June 2013)
Focus on story, characters, and author; less attention to technical issues.
Positive, descriptive language, with adjectives such as good, great, and interesting.
Reviews centered on literary quality and narrative structure.
Criticism of formatting and grammatical errors appears, reflecting a period of adaptation to the digital environment.
A rational, analytical tone predominates, focused more on evaluating content than on emotion.
Filter 2 (July - December 2013)
More emotional, expressive language, centered on romance and young-adult genres.
Keywords: love, romance, quick read, fairy tale, and high school.
Greater emphasis on the emotional connection with the characters and the plot.
Technical criticism declines; readers focus on enjoying and recommending.
A warm, enthusiastic tone, reflecting a reading community that is mature and engaged with the narrative experience.