Abraham Castañon - A01747966 Angie Zerón - A00834060
En este Markdown se realiza un análisis de texto del guion de Star Wars: Episodio IV.
La base de datos utilizada se obtuvo de Kaggle y todos los derechos pertenecen a su propietario. Enlace: https://www.kaggle.com/datasets/xvivancos/star-wars-movie-scripts
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
# ---- Load the script and build the corpus ----
# Set the LANG environment variable to English for this session.
Sys.setenv(LANG = "en")

# Read the raw file, one character string per line of text.
# NOTE(review): every field on each line is kept, so if the file also stores
# speaker labels they will be counted as words further down (the top terms
# include "voice", "intercom", "trooper") -- confirm this is intended.
sw <- readLines("SW_EpisodeIV.txt", warn = FALSE)

# Re-encode from Windows-1252 to UTF-8 before handing the text to tm.
sw <- iconv(sw, from = "WINDOWS-1252", to = "UTF-8")

# One corpus document per line of the file.
docs <- Corpus(VectorSource(sw))
# ---- Text cleaning pipeline ----
# Each step replaces `docs` with a transformed copy of the corpus.
# The "## Warning" lines are the console output recorded when the script ran.

# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove English common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove additional, user-defined stopwords (given as a character vector).
# The corpus is already lower-cased at this point and removeWords() matches
# case-sensitively, so every entry must be lower case; the capitalised entries
# the list used to carry ("The", "I", "We", "She", "He", "A") could never match.
docs <- tm_map(docs, removeWords, c("the", "a", "i", "we", "she", "he"))
# (The recorded run showed the same "transformation drops documents" warning
# for this step as for the other tm_map() calls.)
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation drops
## documents
# Collapse runs of white space left behind by the removals above
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
# ---- Word frequency table ----
# Term-document matrix: one row per term, one column per corpus document.
dtm <- TermDocumentMatrix(docs)

# Dense copy of the matrix so that base rowSums() can be applied to it.
m <- as.matrix(dtm)

# Total count of each term across all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)

# Two-column table (word, freq); the row names are the terms as well.
d <- data.frame(word = names(v), freq = v)

# Show the 100 most frequent terms.
head(d, n = 100)
## word freq
## luke luke 317
## han han 163
## threepio threepio 132
## ben ben 93
## red red 75
## going going 71
## leader leader 63
## get get 60
## leia leia 60
## can can 54
## vader vader 52
## right right 50
## come come 47
## sir sir 46
## will will 45
## biggs biggs 44
## now now 43
## know know 40
## one one 39
## got got 39
## just just 38
## see see 37
## think think 37
## well well 36
## owen owen 35
## back back 33
## ship ship 32
## like like 32
## tarkin tarkin 31
## gold gold 30
## trooper trooper 29
## take take 29
## time time 26
## look look 26
## must must 24
## force force 22
## want want 21
## help help 21
## old old 21
## voice voice 21
## officer officer 20
## alderaan alderaan 20
## station station 20
## artoo artoo 19
## rebel rebel 19
## good good 19
## kenobi kenobi 19
## obiwan obiwan 19
## five five 19
## wedge wedge 19
## wait wait 18
## hold hold 18
## little little 18
## something something 18
## sure sure 17
## imperial imperial 17
## star star 17
## way way 17
## long long 16
## power power 16
## make make 16
## two two 16
## intercom intercom 16
## blast blast 15
## much much 15
## planet planet 15
## base base 15
## find find 15
## enough enough 15
## uncle uncle 15
## kid kid 15
## droids droids 15
## coming coming 15
## first first 14
## yes yes 14
## let let 14
## father father 14
## stay stay 14
## princess princess 13
## better better 13
## death death 13
## found found 13
## empire empire 13
## side side 13
## around around 13
## droid droid 13
## okay okay 13
## hey hey 12
## talking talking 12
## thought thought 12
## say say 12
## battle battle 12
## lot lot 12
## rebellion rebellion 12
## tell tell 12
## never never 12
## unit unit 12
## thing thing 12
## ten ten 12
## jabba jabba 12
# ---- Word cloud ----
# Fix the RNG seed so the random word placement is reproducible.
set.seed(1234)

# Plot every word that occurs at least 5 times, with no cap on the number of
# words. `random.order` is spelled TRUE rather than T because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved constant.
wordcloud(
  words = d$word, freq = d$freq, min.freq = 5,
  max.words = Inf, random.order = TRUE, rot.per = 0.5,
  colors = brewer.pal(8, "Dark2")
)
# ---- Frequent terms and associations ----
# Terms that occur at least 20 times in the corpus.
findFreqTerms(dtm, lowfreq = 20)
## [1] "threepio" "time" "come" "get" "luke" "going"
## [7] "officer" "vader" "ship" "want" "trooper" "right"
## [13] "now" "one" "must" "look" "just" "biggs"
## [19] "back" "know" "got" "leia" "will" "alderaan"
## [25] "take" "station" "see" "sir" "well" "think"
## [31] "help" "owen" "like" "can" "red" "old"
## [37] "ben" "force" "tarkin" "han" "voice" "gold"
## [43] "leader"
# Terms whose correlation with "luke" is at least 0.2.
# The corpus was lower-cased before the matrix was built, so the term has to
# be queried in lower case: the earlier call with "Luke" matched nothing and
# returned numeric(0).
findAssocs(dtm, terms = "luke", corlimit = 0.2)
# (Output not recorded for the corrected call; re-run to regenerate it.)
# ---- Bar plot of the ten most frequent words ----
# `d` is already sorted by decreasing frequency, so its first ten rows are the
# top ten words. las = 2 draws the axis labels perpendicular to the axis.
barplot(
  d$freq[1:10],
  names.arg = d$word[1:10],
  las = 2,
  col = "lightblue",
  main = "Most frequent words",
  ylab = "Word frequencies"
)