Tema 7: Análisis semántico - Artículo PL

El texto utilizado para este análisis habla de la situación en la Premier League por violaciones a las reglas financieras y de la preocupación de que tanto el Manchester City como el Chelsea podrían correr el riesgo de descenso, ya que a ambos se les señala por violaciones de dichas reglas.

library("tm") 
## Warning: package 'tm' was built under R version 4.3.2
## Loading required package: NLP
library("SnowballC")
library("wordcloud")
## Warning: package 'wordcloud' was built under R version 4.3.2
## Loading required package: RColorBrewer
library("RColorBrewer")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("rvest")
library("tm")
library("tidyr")
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.3.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library("stringr")
library("readr")
## 
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
## 
##     guess_encoding
library("janeaustenr")
## Warning: package 'janeaustenr' was built under R version 4.3.2
# choose.files()  # interactive file picker, left disabled
# Read the article from disk into a character vector (one element per line).
text <- readLines(
  con = "C:\\Users\\danbr\\OneDrive\\Desktop\\Mineria de Datos\\Actividad Constante - El Diario\\pl.txt"
)

# Show the structure of what was read.
str(text)
##  chr [1:12] "Manchester City and Chelsea run the risk of relegation from the Premier League after Everton's 10-point deducti"| __truncated__ ...
# Re-encode the lines from Windows-1252 to UTF-8 before text mining.
pltext <- iconv(x = text, from = "WINDOWS-1252", to = "UTF-8")

I.- Limpieza de Texto

# Build the corpus with one document per element of `pltext`.
# VCorpus() is used instead of Corpus(): Corpus(VectorSource(...)) creates a
# SimpleCorpus, and every tm_map() call on it emitted the warning
# "transformation drops documents".
pltext <- VCorpus(VectorSource(pltext))
# Convert the text to lower case (tolower is a base function, so it must be
# wrapped with content_transformer() to keep the corpus structure).
pl <- tm_map(pltext, content_transformer(tolower))
# Remove numbers
pl <- tm_map(pl, removeNumbers)
# Remove common English stopwords. This runs before punctuation removal
# because the stopword list includes contractions written with apostrophes.
pl <- tm_map(pl, removeWords, stopwords("english"))
# Remove punctuation
pl <- tm_map(pl, removePunctuation)
# Collapse the extra whitespace left behind by the previous removals
pl <- tm_map(pl, stripWhitespace)

II.- Análisis semántico

# Term-document matrix of the cleaned corpus (terms in rows, documents in
# columns).
dtm <- TermDocumentMatrix(pl)
m <- as.matrix(dtm)
# Total count of each term across all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# View() opens a data viewer window, so only call it in an interactive
# session; this keeps the script usable when rendered or run in batch mode.
if (interactive()) {
  View(d)
}
# Fix the random seed so the word cloud layout is reproducible.
set.seed(1234)
# Word cloud of terms that appear at least 3 times. Uses FALSE rather than
# the reassignable shorthand F.
wordcloud(
  words = d$word, freq = d$freq, min.freq = 3,
  max.words = Inf, random.order = FALSE, rot.per = 0.5,
  colors = brewer.pal(5, "Dark2")
)

III.- Conclusiones

# Terms associated with "city", using 0.2 as the lower correlation limit.
findAssocs(dtm, terms = "city", corlimit = 0.2)
## $city
##      former  accounting     adviser allegations        also      banker 
##        0.77        0.67        0.67        0.67        0.67        0.67 
##         ceo     counsel     current     dealing         fan     general 
##        0.67        0.67        0.67        0.67        0.67        0.67 
##      harris    historic      issues  journalist        mail         man 
##        0.67        0.67        0.67        0.67        0.67        0.67 
##        nick         plc   responded       sport        view        well 
##        0.67        0.67        0.67        0.67        0.67        0.67 
##         ffp      lawyer      stefan      across interesting   financial 
##        0.64        0.64        0.64        0.64        0.64        0.39 
##     chelsea    evertons     leagues    payments   sanctions 
##        0.38        0.29        0.29        0.29        0.29
# Bar chart of the most frequent terms (up to ten). `d` is sorted by
# decreasing frequency where it is built, so the first rows are the top terms.
# head() returns fewer rows when `d` has fewer than 10 terms, whereas
# d[1:10, ] would pad the result with NA rows.
top_words <- head(d, 10)
barplot(top_words$freq, las = 2, names.arg = top_words$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")