# # Instalacja pakietów
# install.packages("tm") # text mining
# install.packages("SnowballC") # stemming
# install.packages("wordcloud") # chmury słów
# install.packages("digest") ## zaawansowane chmura słów
# install.packages("wordcloud2")
# install.packages("RColorBrewer") # paleta kolorów
# install.packages("syuzhet") # analiza sentymentu
# install.packages("ggplot2") # wizualizacja
# install.packages("tidytext") # manipulacje danymi
# install.packages("dplyr")
# załadowanie pakietów
library("tm")
library("SnowballC")
library("RColorBrewer")
library("wordcloud")
library("digest")
library("wordcloud2")
library("syuzhet")
library("ggplot2")
library("tidytext")
library("dplyr")
library(gridExtra)
text <- readLines(file.choose())#### przemówienia inauguracyjne prezydentów USA
#text[2]
TextDoc <- Corpus(VectorSource(text)) #budowa corpusu
TextDoc
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 57
#wyświetlanie danego dokumentu z corpusu (zbioru dokumentów)
#writeLines(as.character(TextDoc[[2]]))
###Funkcja content_transformer może służyć do transformacji funkcji do obiektów R'owych
toSpace <-content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(tolower)):
## transformation drops documents
stopwords("english")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(TextDoc, removeWords, stopwords("english")):
## transformation drops documents
Na przykłąd usuwanie słów: “company” i “team”
TextDoc <- tm_map(TextDoc, removeWords, c( "company","team"))
## Warning in tm_map.SimpleCorpus(TextDoc, removeWords, c("company", "team")):
## transformation drops documents
TextDoc <- tm_map(TextDoc, removePunctuation)
## Warning in tm_map.SimpleCorpus(TextDoc, removePunctuation): transformation
## drops documents
TextDoc <- tm_map(TextDoc, stripWhitespace)
## Warning in tm_map.SimpleCorpus(TextDoc, stripWhitespace): transformation
## drops documents
TextDoc <- tm_map(TextDoc, stemDocument)
## Warning in tm_map.SimpleCorpus(TextDoc, stemDocument): transformation drops
## documents
TextDoc_tdm <- TermDocumentMatrix(TextDoc)
TextDoc_tdm
## <<TermDocumentMatrix (terms: 5232, documents: 57)>>
## Non-/sparse entries: 32638/265586
## Sparsity : 89%
## Maximal term length: 20
## Weighting : term frequency (tf)
tdm_m <- as.matrix(TextDoc_tdm)
head(tdm_m,10)
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## yrprezspeech 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14th 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1789washingtonfellow 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## accomplish 0 1 0 0 0 0 0 0 2 1 1 1 0 0 3 3 0 1 3 0
## accord 0 1 0 1 1 1 0 0 1 3 0 0 0 1 2 2 0 3 0 3
## acknowledg 0 1 0 1 1 1 1 0 0 1 2 1 0 0 3 1 1 3 0 0
## acquit 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## act 0 1 1 0 0 4 0 0 1 3 0 1 1 1 12 3 0 5 2 4
## actual 0 1 0 0 0 0 0 0 0 0 0 0 0 6 3 0 0 1 2 1
## actuat 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## Docs
## Terms 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## yrprezspeech 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14th 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1789washingtonfellow 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## accomplish 0 1 1 2 2 1 1 1 0 1 0 3 0 0 0 0 2
## accord 0 1 0 2 1 1 0 0 2 0 0 3 0 0 3 0 0
## acknowledg 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0
## acquit 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## act 0 0 2 2 1 1 2 0 0 4 2 2 0 0 0 2 1
## actual 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 2 0
## actuat 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## Docs
## Terms 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## yrprezspeech 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14th 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1789washingtonfellow 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## accomplish 3 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0
## accord 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## acknowledg 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## acquit 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## act 3 0 2 0 0 2 1 0 0 0 5 0 5 2 3 2 0
## actual 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## actuat 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Docs
## Terms 55 56 57
## yrprezspeech 0 0 0
## 14th 0 0 0
## 1789washingtonfellow 0 0 0
## accomplish 1 0 0
## accord 0 0 0
## acknowledg 0 0 0
## acquit 0 0 0
## act 2 5 1
## actual 0 0 0
## actuat 0 0 0
tdm_v <-sort(rowSums(tdm_m),decreasing=TRUE)
head(tdm_v,10)
## will govern nation peopl can state upon great power must
## 871 679 655 602 465 446 369 364 362 346
tdm_d <- data.frame(word = names(tdm_v),freq=tdm_v)
head(tdm_d,5)
barplot(tdm_d[1:5,]$freq, las = 1, names.arg = tdm_d[1:5,]$word,
col ="lightgreen", main ="5 najczesciej wystepujących słów",
ylab = "czestość słów")
wordcloud(words = tdm_d$word, freq = tdm_d$freq, min.freq = 5,
max.words=50, random.order=FALSE, rot.per=0.70,
colors=brewer.pal(8, "Dark2"))
##w kształcie diamentu
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'diamond',size = 0.5)
## w kształcie centroidu
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'centroid',size = 0.5)
## w kształcie trójkąta
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'triangle-forward',size = 0.5)
## w kształcie pięciokąta
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'pentagon',size = 0.5)
## w kształcie gwiazdy
wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'star',size = 0.5)
## CHMURA SŁÓW W PODZIALE NA DOKUMENTY
PZYKŁADOWO WZIĘTO 4 OSTATNIE PRZEMÓWIENIA
a<-as.matrix(tdm_m[,54:57],) # 4 ostatnie dokumenty (przemówienia)
head(a,10)
## Docs
## Terms 54 55 56 57
## yrprezspeech 0 0 0 0
## 14th 0 0 0 0
## 1789washingtonfellow 0 0 0 0
## accomplish 0 1 0 0
## accord 0 0 0 0
## acknowledg 0 0 0 0
## acquit 0 0 0 0
## act 0 2 5 1
## actual 0 0 0 0
## actuat 0 0 0 0
colnames(a)<-c("1997-Clinton","2001-Bush","2005-Bush","2009-Obama")
head(a,10)
## Docs
## Terms 1997-Clinton 2001-Bush 2005-Bush 2009-Obama
## yrprezspeech 0 0 0 0
## 14th 0 0 0 0
## 1789washingtonfellow 0 0 0 0
## accomplish 0 1 0 0
## accord 0 0 0 0
## acknowledg 0 0 0 0
## acquit 0 0 0 0
## act 0 2 5 1
## actual 0 0 0 0
## actuat 0 0 0 0
CHMURA SŁÓW Z 4 DOKUMENTÓW
comparison.cloud(a,colors = brewer.pal(4, "Paired"),title.colors = "darkgrey",title.bg.colors = "white")
tdm_d$word<-as.character(tdm_d$word) # typ zmiennej word jako charakter
df<-tdm_d
colnames(df) <-c("word","n") # nazwanie kolumn
df[1:10,]
BIBLIOTEKA BING
Umozliwia klasyfikację słów na pozytywne i negatywne
df %>%
inner_join(get_sentiments("bing"), by = "word")
WIZUALIZACJA SENTYMENTU BING
df[1:500,] %>%
inner_join(get_sentiments("bing"), by = "word") %>%
group_by(sentiment) %>%
ungroup() %>%
mutate(word = reorder(word, n))%>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("red2", "green3")) +
facet_wrap(~sentiment, scales = "free_y") + # rozdzielenie na pozytywne, negatywne
#ylim(0, 20) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
BIBLIOTEKA NRC
NRC sentyment umożliwia klasyfikować słowa na 8 emocji
Wywołanie tablicy NCR
head(get_sentiments("nrc"),10)
df2<-df[c(1:500),] # ograniczenie liczby słów do 500 najcześciej występujących
df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="anger") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("black")) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres1<- df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="positive") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("green3")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres2<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="negative") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("red")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres3<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="joy") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("pink")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres4<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="trust") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("orange")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres5<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="anticipation") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("gold")) +
facet_wrap(~sentiment, scales = "free_y") +
# ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres6<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="fear") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("black")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 500) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
grid.arrange(wykres1, wykres2, wykres3, wykres4, wykres5, wykres6)
Made by:
Majkowska Agata
agata.majkowska@phdstud.ug.edu.pl