Sys.setenv(LANG = "en")
# Instalar paquetes
# install.packages("tm")
# install.packages("SnowballC")
# install.packages("wordcloud")
# install.packages("RColorBrewer")
# install.packages("pdftools")
library("tm")
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
library("SnowballC")
library("wordcloud")
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library("RColorBrewer")
library("pdftools")
## Warning: package 'pdftools' was built under R version 4.3.3
## Using poppler version 23.08.0
# Plain-text corpus built from every file in the ALADDIN folder.
# NOTE(review): `text_corpus` is never used again in this script; every later
# step works on `corp1` (the PDFs found in the working directory). The written
# answers at the end talk about Aladdin -- confirm which corpus was meant to be
# analysed.
text_corpus <- Corpus(DirSource("C:\\Users\\kathi\\OneDrive\\Escritorio\\M4\\ALADDIN"))
# The working directory holds the PDF files. The pattern is anchored on the
# ".pdf" extension so that names merely ending in the letters "pdf" are not
# picked up; the match is case-insensitive so ".PDF" is accepted too.
files <- list.files(pattern = "\\.pdf$", ignore.case = TRUE)
# Fail early with a clear message instead of building an empty corpus.
if (length(files) == 0) {
  stop("No PDF files found in the working directory: ", getwd(), call. = FALSE)
}
files
## [1] "groosham-grange.pdf"
## [2] "Harry Potter and the Sorcerers Stone.pdf"
# URI stands for Uniform Resource Identifier: URISource() hands Corpus() the
# vector of file names, and readerControl selects the reader to use for them
# (the PDF reader in this case).
corp1 <- Corpus(
  URISource(files),
  readerControl = list(reader = readPDF)
)
# Text cleaning. The order of the steps matters:
#   1. lower-case everything so stop-word matching is case-insensitive;
#   2. turn typographic apostrophes (U+2018 / U+2019) into ASCII "'" so that
#      contractions such as "don't" match the entries of stopwords("english");
#   3. drop digits;
#   4. drop stop words (the standard English list plus the custom ones) while
#      the apostrophes are still in place;
#   5. drop punctuation with ucp = TRUE, so Unicode punctuation such as curly
#      quotes is removed as well (otherwise terms like "’ll" or "david”"
#      survive into the term-document matrix);
#   6. collapse the blanks left behind by the previous steps, last of all.
corp1 <- tm_map(corp1, content_transformer(tolower))
corp1 <- tm_map(
  corp1,
  content_transformer(function(x) gsub("[\u2018\u2019]", "'", x))
)
corp1 <- tm_map(corp1, removeNumbers)
corp1 <- tm_map(
  corp1, removeWords,
  c(
    stopwords("english"),
    "the", "and", "that", "his", "but", "you", "your", "like"
  )
)
corp1 <- tm_map(corp1, removePunctuation, ucp = TRUE)
corp1 <- tm_map(corp1, stripWhitespace)
# Term-document matrix: one row per term, one column per document, each cell
# holding how often the term occurs in that document.
dtm <- TermDocumentMatrix(corp1)
m <- as.matrix(dtm)
# Total count of every term over all documents, most frequent first.
v <- m |>
  rowSums() |>
  sort(decreasing = TRUE)
# The named vector becomes a two-column data frame (term and total count).
d <- data.frame(word = names(v), freq = v)
head(d, n = 100)
## word freq
## harry harry 1214
## said said 886
## ron ron 410
## one one 367
## david david 339
## hagrid hagrid 336
## back back 329
## hermione hermione 257
## got got 252
## just just 250
## now now 234
## know know 233
## get get 224
## looked looked 219
## see see 217
## time time 183
## professor professor 180
## going going 174
## something something 172
## never never 167
## think think 159
## around around 158
## first first 157
## door door 150
## even even 150
## two two 148
## eyes eyes 146
## right right 146
## school school 145
## snape snape 145
## look look 144
## dumbledore dumbledore 143
## way way 143
## still still 139
## come come 131
## next next 130
## thought thought 126
## jill jill 125
## well well 124
## room room 122
## went went 122
## head head 121
## uncle uncle 121
## behind behind 118
## will will 118
## last last 117
## can can 116
## dudley dudley 116
## looking looking 114
## harrys harrys 113
## told told 113
## away away 111
## malfoy malfoy 109
## face face 108
## asked asked 107
## seemed seemed 107
## turned turned 107
## mrs mrs 106
## people people 106
## vernon vernon 105
## want want 105
## much much 102
## neville neville 102
## hand hand 101
## day day 100
## made made 100
## really really 100
## yeh yeh 100
## good good 99
## left left 99
## anything anything 95
## boy boy 95
## long long 95
## mcgonagall mcgonagall 95
## potter potter 95
## three three 95
## found found 94
## knew knew 93
## say say 92
## came came 91
## heard heard 90
## quirrell quirrell 90
## though though 89
## took took 89
## little little 86
## gryffindor gryffindor 85
## saw saw 85
## tell tell 85
## black black 84
## stone stone 84
## voice voice 84
## ever ever 81
## find find 81
## house house 81
## seen seen 81
## bit bit 80
## gone gone 80
## great great 80
## must must 80
## suddenly suddenly 80
# Fix the RNG seed so the random placement of the words is reproducible.
set.seed(1234)
# Word cloud of every term that occurs at least 10 times; half of the words
# are drawn rotated and colours come from the 8-colour "Dark2" palette.
# wordcloud() draws the plot as a side effect, so nothing has to be printed
# afterwards: print(wordcloud) would only dump the source of the function.
wordcloud(
  words = d$word, freq = d$freq, min.freq = 10,
  max.words = Inf, random.order = TRUE, rot.per = 0.5,
  colors = brewer.pal(8, "Dark2")
)
## function (words, freq, scale = c(4, 0.5), min.freq = 3, max.words = Inf,
## random.order = TRUE, random.color = FALSE, rot.per = 0.1,
## colors = "black", ordered.colors = FALSE, use.r.layout = FALSE,
## fixed.asp = TRUE, ...)
## {
## if (!fixed.asp && rot.per > 0)
## stop("Variable aspect ratio not supported for rotated words. Set rot.per=0.")
## tails <- "g|j|p|q|y"
## last <- 1
## nc <- length(colors)
## if (missing(freq)) {
## requireNamespace("tm")
## requireNamespace("slam")
## if (is.character(words) || is.factor(words)) {
## corpus <- tm::Corpus(tm::VectorSource(words))
## corpus <- tm::tm_map(corpus, tm::removePunctuation)
## corpus <- tm::tm_map(corpus, function(x) tm::removeWords(x,
## tm::stopwords()))
## }
## else corpus <- words
## tdm <- tm::TermDocumentMatrix(corpus)
## freq <- slam::row_sums(tdm)
## words <- names(freq)
## }
## if (ordered.colors) {
## if (length(colors) != 1 && length(colors) != length(words)) {
## stop(paste("Length of colors does not match length of words",
## "vector"))
## }
## }
## if (min.freq > max(freq))
## min.freq <- 0
## overlap <- function(x1, y1, sw1, sh1) {
## if (!use.r.layout)
## return(is_overlap(x1, y1, sw1, sh1, boxes))
## s <- 0
## if (length(boxes) == 0)
## return(FALSE)
## for (i in c(last, 1:length(boxes))) {
## bnds <- boxes[[i]]
## x2 <- bnds[1]
## y2 <- bnds[2]
## sw2 <- bnds[3]
## sh2 <- bnds[4]
## if (x1 < x2)
## overlap <- x1 + sw1 > x2 - s
## else overlap <- x2 + sw2 > x1 - s
## if (y1 < y2)
## overlap <- overlap && (y1 + sh1 > y2 - s)
## else overlap <- overlap && (y2 + sh2 > y1 - s)
## if (overlap) {
## last <<- i
## return(TRUE)
## }
## }
## FALSE
## }
## ord <- rank(-freq, ties.method = "random")
## words <- words[ord <= max.words]
## freq <- freq[ord <= max.words]
## if (ordered.colors) {
## colors <- colors[ord <= max.words]
## }
## if (random.order)
## ord <- sample.int(length(words))
## else ord <- order(freq, decreasing = TRUE)
## words <- words[ord]
## freq <- freq[ord]
## words <- words[freq >= min.freq]
## freq <- freq[freq >= min.freq]
## if (ordered.colors) {
## colors <- colors[ord][freq >= min.freq]
## }
## thetaStep <- 0.1
## rStep <- 0.05
## plot.new()
## op <- par("mar")
## par(mar = c(0, 0, 0, 0))
## if (fixed.asp)
## plot.window(c(0, 1), c(0, 1), asp = 1)
## else plot.window(c(0, 1), c(0, 1))
## normedFreq <- freq/max(freq)
## size <- (scale[1] - scale[2]) * normedFreq + scale[2]
## boxes <- list()
## for (i in 1:length(words)) {
## rotWord <- runif(1) < rot.per
## r <- 0
## theta <- runif(1, 0, 2 * pi)
## x1 <- 0.5
## y1 <- 0.5
## wid <- strwidth(words[i], cex = size[i], ...)
## ht <- strheight(words[i], cex = size[i], ...)
## if (grepl(tails, words[i]))
## ht <- ht + ht * 0.2
## if (rotWord) {
## tmp <- ht
## ht <- wid
## wid <- tmp
## }
## isOverlaped <- TRUE
## while (isOverlaped) {
## if (!overlap(x1 - 0.5 * wid, y1 - 0.5 * ht, wid,
## ht) && x1 - 0.5 * wid > 0 && y1 - 0.5 * ht >
## 0 && x1 + 0.5 * wid < 1 && y1 + 0.5 * ht < 1) {
## if (!random.color) {
## if (ordered.colors) {
## cc <- colors[i]
## }
## else {
## cc <- ceiling(nc * normedFreq[i])
## cc <- colors[cc]
## }
## }
## else {
## cc <- colors[sample(1:nc, 1)]
## }
## text(x1, y1, words[i], cex = size[i], offset = 0,
## srt = rotWord * 90, col = cc, ...)
## boxes[[length(boxes) + 1]] <- c(x1 - 0.5 * wid,
## y1 - 0.5 * ht, wid, ht)
## isOverlaped <- FALSE
## }
## else {
## if (r > sqrt(0.5)) {
## warning(paste(words[i], "could not be fit on page. It will not be plotted."))
## isOverlaped <- FALSE
## }
## theta <- theta + thetaStep
## r <- r + rStep * thetaStep/(2 * pi)
## x1 <- 0.5 + r * cos(theta)
## y1 <- 0.5 + r * sin(theta)
## }
## }
## }
## par(mar = op)
## invisible()
## }
## <bytecode: 0x0000029b849128f8>
## <environment: namespace:wordcloud>
# List the terms of the term-document matrix that occur at least 20 times.
findFreqTerms(dtm, lowfreq = 20)
## [1] "’ll" "’re" "’ve" "“’m" "“’s"
## [6] "able" "across" "afternoon" "ahead" "air"
## [11] "almost" "alone" "along" "already" "also"
## [16] "although" "always" "angry" "another" "answer"
## [21] "anyone" "anything" "anyway" "arm" "arms"
## [26] "around" "arrived" "ask" "asked" "asleep"
## [31] "aunt" "away" "back" "bad" "bed"
## [36] "began" "behind" "believe" "best" "better"
## [41] "big" "birthday" "bit" "black" "blood"
## [46] "bloodbath" "boat" "body" "book" "books"
## [51] "boy" "boys" "break" "breath" "bright"
## [56] "broom" "broomstick" "brought" "call" "called"
## [61] "came" "can" "captain" "car" "castle"
## [66] "cat" "catch" "caught" "ceiling" "christmas"
## [71] "class" "cloak" "close" "cold" "come"
## [76] "coming" "common" "corner" "corridor" "course"
## [81] "crabbe" "cup" "cupboard" "dark" "david"
## [86] "david”" "day" "days" "dead" "dear"
## [91] "decided" "deep" "desk" "didn’t" "died"
## [96] "different" "disappeared" "dog" "don" "don’t"
## [101] "done" "door" "doors" "dormitory" "dragon"
## [106] "dream" "drive" "dropped" "dudley" "dudleys"
## [111] "dumbledore" "dursley" "dursleys" "edge" "either"
## [116] "eliot" "else" "empty" "end" "enough"
## [121] "even" "evening" "ever" "every" "everyone"
## [126] "everything" "exactly" "except" "expelled" "eye"
## [131] "eyes" "face" "fact" "family" "famous"
## [136] "far" "fast" "fat" "father" "feel"
## [141] "feeling" "feet" "fell" "felt" "fer"
## [146] "field" "filch" "finally" "find" "fingers"
## [151] "fire" "first" "five" "flamel" "flitwick"
## [156] "floor" "fluffy" "follow" "followed" "forest"
## [161] "forget" "forgotten" "forward" "found" "four"
## [166] "fred" "free" "front" "full" "funny"
## [171] "game" "gasped" "gave" "george" "get"
## [176] "getting" "ghost" "giant" "give" "given"
## [181] "glanced" "glass" "glasses" "going" "gold"
## [186] "gone" "good" "got" "gotten" "goyle"
## [191] "grange" "granger" "great" "green" "gregor"
## [196] "gringotts" "groosham" "ground" "gryffindor" "hadn’t"
## [201] "hagrid" "hagrids" "hair" "half" "hall"
## [206] "hand" "hands" "happened" "happy" "hard"
## [211] "hardly" "harry" "harrys" "hat" "head"
## [216] "heads" "hear" "heard" "heart" "held"
## [221] "help" "hermione" "high" "hogwarts" "holding"
## [226] "home" "hope" "horrible" "hour" "house"
## [231] "however" "huge" "hundred" "idea" "inside"
## [236] "instead" "island" "jeffrey" "jill" "just"
## [241] "keep" "kept" "kilgraw" "kill" "knew"
## [246] "knocked" "know" "knows" "large" "last"
## [251] "late" "later" "lay" "leapt" "least"
## [256] "leave" "left" "legs" "let" "letter"
## [261] "letters" "library" "life" "light" "liked"
## [266] "little" "london" "long" "look" "looked"
## [271] "looking" "lost" "lot" "mad" "madam"
## [276] "made" "magic" "make" "making" "malfoy"
## [281] "man" "managed" "many" "match" "maybe"
## [286] "mcgonagall" "mean" "meant" "met" "middle"
## [291] "might" "mind" "minutes" "mirror" "miss"
## [296] "moment" "morning" "mother" "mouth" "move"
## [301] "moved" "mrs" "much" "muggle" "must"
## [306] "muttered" "name" "names" "near" "nearly"
## [311] "neck" "need" "never" "neville" "new"
## [316] "next" "nice" "night" "nobody" "noise"
## [321] "norbert" "nose" "note" "nothing" "noticed"
## [326] "now" "number" "old" "one" "onto"
## [331] "open" "opened" "outside" "owl" "owls"
## [336] "pain" "pale" "parents" "passed" "past"
## [341] "peeves" "people" "percy" "perhaps" "petunia"
## [346] "picked" "place" "platform" "please" "pleased"
## [351] "pocket" "point" "pointed" "points" "potter"
## [356] "probably" "professor" "pulled" "put" "quickly"
## [361] "quidditch" "quirrell" "quirrells" "quite" "raised"
## [366] "ran" "rather" "reached" "read" "real"
## [371] "realized" "really" "reason" "red" "remember"
## [376] "remembered" "rest" "right" "ring" "robes"
## [381] "ron" "ronan" "room" "round" "run"
## [386] "said" "sat" "saw" "say" "saying"
## [391] "school" "screamed" "sea" "seat" "second"
## [396] "secret" "see" "seem" "seemed" "seen"
## [401] "send" "sent" "set" "seven" "shook"
## [406] "shoulder" "shouted" "show" "shut" "side"
## [411] "sight" "sign" "silence" "silver" "since"
## [416] "sir" "sit" "sitting" "six" "sleep"
## [421] "slowly" "slytherin" "small" "smiled" "smiling"
## [426] "snape" "snapes" "snapped" "somehow" "someone"
## [431] "something" "somewhere" "son" "soon" "sorry"
## [436] "sort" "sound" "speak" "spoke" "stand"
## [441] "standing" "stared" "staring" "start" "started"
## [446] "stay" "stepped" "still" "stone" "stood"
## [451] "stop" "stopped" "straight" "strange" "street"
## [456] "students" "study" "stupid" "suddenly" "suppose"
## [461] "sure" "swung" "table" "take" "taken"
## [466] "taking" "talk" "talking" "teacher" "teachers"
## [471] "team" "tell" "telling" "ten" "ter"
## [476] "thick" "thin" "thing" "things" "think"
## [481] "thinking" "third" "though" "thought" "thousand"
## [486] "three" "threw" "time" "times" "together"
## [491] "told" "tonight" "took" "top" "toward"
## [496] "towards" "train" "tried" "troll" "trouble"
## [501] "true" "try" "trying" "turn" "turned"
## [506] "twelve" "twins" "two" "uncle" "understand"
## [511] "unicorn" "use" "used" "usual" "vernon"
## [516] "voice" "voldemort" "wait" "waiting" "walked"
## [521] "walking" "wall" "walls" "wand" "want"
## [526] "wanted" "wasn’t" "watch" "watched" "watching"
## [531] "water" "way" "wearing" "weasley" "week"
## [536] "well" "went" "whispered" "white" "whole"
## [541] "will" "windergast" "window" "without" "wizard"
## [546] "wizards" "woman" "won" "wondering" "wood"
## [551] "word" "words" "work" "world" "worse"
## [556] "wrong" "yeah" "year" "years" "yeh"
## [561] "yer" "yes" "yet" "young"
# Terms whose correlation with "genie" across documents is at least 0.2.
# NOTE(review): the recorded result is numeric(0), and "genie" does not appear
# among the frequent terms listed above -- the analysed PDFs do not seem to
# contain it. Confirm the corpus (or the term) before interpreting this.
findAssocs(dtm, terms = "genie", corlimit = 0.2)
## $genie
## numeric(0)
# Bar chart of the ten most frequent terms (`d` is already sorted by
# decreasing frequency); las = 2 draws the axis labels perpendicular to the
# axis so the words stay readable.
with(
  d[1:10, ],
  barplot(
    freq,
    names.arg = word,
    las = 2,
    col = "lightblue",
    main = "Palabras más frecuentes",
    ylab = "Frecuencias de palabras"
  )
)
# install.packages ("readtext")
library(readtext)
## Warning: package 'readtext' was built under R version 4.3.3
# install.packages("syuzhet")
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.3.3
# Sentiment analysis with the NRC lexicon.
# get_tokens() expects a text string, not the data frame `d`: handing it `d`
# tokenises the deparsed columns, so every distinct term counts once (however
# often it occurs) and the frequency digits become tokens too. The vocabulary
# in `d$word` is already tokenised, so it is scored directly.
texto_palabras <- as.character(d$word)
emociones_df <- get_nrc_sentiment(texto_palabras, language = "english")
# One row per term: weight each row by the term's frequency and add the rows
# up, so that a term occurring 1000 times weighs 1000 times more than a term
# occurring once. Columns 1:10 are the first ten score columns returned by
# get_nrc_sentiment().
emociones_pond <- colSums(as.matrix(emociones_df[, 1:10]) * d$freq)
# Bar chart of the share of each score in the total.
barplot(prop.table(emociones_pond))
Preguntas
1. ¿Se tomará y analizará todo lo que aparece en la página?
¿Qué sí y qué no? ¿Por qué?
No, solamente se considerarán aquellas palabras que mencionen personajes
importantes, situaciones o puntos clave en la historia de Aladdín. Esto
permite hacer una comparación acertada de los factores clave que se
mencionaron más en una película o en la otra, así como ver qué
factores clave fueron omitidos o agregados en la versión de 1992 y en la
de 2019.
2. Ahora procederás a realizar un análisis como el visto en
clase; considera agregar stopwords, hacer una comparación de frecuencia
y un análisis de asociación muy básico. Describe lo realizado al final
del html.
Se agregaron como stopwords las palabras “the”, “and”, “that”, “his”,
“but”, “you”, “your”, “like”, ya que no eran relevantes para el
análisis, al ser en su mayoría conectores o pronombres que no tienen
gran impacto en la historia.
Para la comparación de frecuencia, se usó la función “findFreqTerms” para buscar las palabras frecuentes, y “lowfreq = 20” se utilizó para indicar que se busquen las palabras que aparezcan al menos 20 veces.
Para hacer el análisis de asociación se usó la función “findAssocs”; se utilizó “genie” como la palabra para la cual se buscan otras palabras asociadas, y “corlimit = 0.2” especifica el límite mínimo de correlación.
3. Busca alguna función asociada al análisis de texto que no
hayamos visto en clase, úsala y escribe lo que lograste.
Realicé un análisis de sentimientos para examinar las principales
emociones descritas en la historia de Aladdín. Primero obtuve los tokens
(palabras individuales) del texto para preparar la información a
analizar, y luego usé la función “get_nrc_sentiment” para hacer el
análisis de sentimientos. Lo que hace este análisis es asignar un valor
numérico a cada palabra basado en su sentimiento, ya sea positivo o
negativo.
Para graficar usé la función “colSums” para calcular la suma de frecuencias de cada emoción, y luego “prop.table()” para calcular la proporción de cada emoción.
Bibliografía
Isasi, J. (2021, 23 marzo). Análisis de sentimientos en R con «syuzhet». Programming Historian. https://programminghistorian.org/es/lecciones/analisis-de-sentimientos-r