Tema 7: Análisis semántico Artículo PL

El texto utilizado para este análisis habla de la situación en la Premier League por violaciones a las reglas financieras y de la preocupación de que tanto el Manchester City como el Chelsea podrían enfrentar el riesgo de descenso, ya que ambos clubes están acusados de infringir dichas reglas.

library("tm")

## Warning: package 'tm' was built under R version 4.3.2

## Loading required package: NLP

library("SnowballC")
library("wordcloud")

## Warning: package 'wordcloud' was built under R version 4.3.2

## Loading required package: RColorBrewer

library("RColorBrewer")
library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("rvest")
library("tm")
library("tidyr")
library("ggplot2")

## Warning: package 'ggplot2' was built under R version 4.3.2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library("stringr")
library("readr")

## 
## Attaching package: 'readr'

## The following object is masked from 'package:rvest':
## 
##     guess_encoding

library("janeaustenr")

## Warning: package 'janeaustenr' was built under R version 4.3.2

# choose.files()  # interactive helper for locating the file by hand
# Load the article from its local copy, one vector element per line.
text <- readLines(
  con = "C:\\Users\\danbr\\OneDrive\\Desktop\\Mineria de Datos\\Actividad Constante - El Diario\\pl.txt"
)

# Print a compact summary of what was read.
str(object = text)

##  chr [1:12] "Manchester City and Chelsea run the risk of relegation from the Premier League after Everton's 10-point deducti"| __truncated__ ...

# Re-encode every line from Windows-1252 to UTF-8 before text mining.
pltext <- iconv(x = text, from = "WINDOWS-1252", to = "UTF-8")

I.- Limpieza de Texto

# Wrap the re-encoded lines in a tm corpus; VectorSource() makes each element
# of the character vector its own document. Note that `pltext` is rebound
# here from a character vector to a corpus object.
pltext <- Corpus(VectorSource(pltext))

# Convert the text to lower case. tolower() is a plain base function, so it
# is wrapped in content_transformer() before being handed to tm_map().
# From here on the cleaned corpus lives in `pl`; `pltext` keeps the raw one.
pl <- tm_map(pltext, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(pltext, content_transformer(tolower)):
## transformation drops documents

# NOTE(review): every tm_map() step in this section logged the same
# "transformation drops documents" warning in the recorded run. Confirm the
# corpus still holds all 12 lines afterwards (e.g. length(pl)).

# Remove numbers
pl <- tm_map(pl, removeNumbers)

## Warning in tm_map.SimpleCorpus(pl, removeNumbers): transformation drops
## documents

# Remove english common stopwords
# This runs before punctuation removal -- presumably so stop words that
# contain an apostrophe still match; confirm before reordering the steps.
pl <- tm_map(pl, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(pl, removeWords, stopwords("english")):
## transformation drops documents

# Remove punctuations
pl <- tm_map(pl, removePunctuation)

## Warning in tm_map.SimpleCorpus(pl, removePunctuation): transformation drops
## documents

# Eliminate extra white spaces
# Done last, after the removals above have run.
pl <- tm_map(pl, stripWhitespace)

## Warning in tm_map.SimpleCorpus(pl, stripWhitespace): transformation drops
## documents

II.- Análisis semántico

# Build the term-document matrix from the cleaned corpus. Despite its name,
# `dtm` holds a TermDocumentMatrix: terms are the rows, documents the columns.
dtm <- TermDocumentMatrix(pl)

# Densify the matrix, total each term's row to get its frequency over the
# whole article, and order the terms from most to least frequent.
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the script is knitted or run through Rscript.
if (interactive()) {
  View(d)
}

# Fix the seed so the word cloud layout is reproducible between runs.
set.seed(1234)
# Only terms seen at least 3 times are drawn; random.order = FALSE plots them
# in decreasing frequency. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
wordcloud(
  words = d$word, freq = d$freq, min.freq = 3,
  max.words = Inf, random.order = FALSE, rot.per = 0.5,
  colors = brewer.pal(5, "Dark2")
)

III.- Conclusiones

# List the terms whose per-document counts correlate with "city" at 0.2 or
# above; the result is auto-printed.
findAssocs(
  x = dtm,
  terms = "city",
  corlimit = 0.2
)

## $city
##      former  accounting     adviser allegations        also      banker 
##        0.77        0.67        0.67        0.67        0.67        0.67 
##         ceo     counsel     current     dealing         fan     general 
##        0.67        0.67        0.67        0.67        0.67        0.67 
##      harris    historic      issues  journalist        mail         man 
##        0.67        0.67        0.67        0.67        0.67        0.67 
##        nick         plc   responded       sport        view        well 
##        0.67        0.67        0.67        0.67        0.67        0.67 
##         ffp      lawyer      stefan      across interesting   financial 
##        0.64        0.64        0.64        0.64        0.64        0.39 
##     chelsea    evertons     leagues    payments   sanctions 
##        0.38        0.29        0.29        0.29        0.29

# Bar chart of the ten most frequent terms (`d` is already sorted by
# decreasing frequency). head() replaces d[1:10, ] so that a vocabulary with
# fewer than ten terms is plotted as-is instead of being padded with NA rows.
# with() exposes the `freq` and `word` columns without creating new globals.
with(
  head(d, 10),
  barplot(freq,
    las = 2, names.arg = word,
    col = "lightblue", main = "Most frequent words",
    ylab = "Word frequencies"
  )
)

Tema 7 - Análisis semántico

Daniel Bravo_A01708675 y Daniel Najera_A01709578

2023-11-20

Tema 7: Análisis semántico Artículo PL

I.- Limpieza de Texto

II.- Análisis semántico

III.- Conclusiones