PREPROCESING
ZAMIANA NA MAŁE LITERY
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(tolower)):
## transformation drops documents
USUNIĘCIE STOPWORDS
stopwords("english")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(TextDoc, removeWords, stopwords("english")):
## transformation drops documents
USUNIĘCIE ZNAKÓW INTERPUNKCYJNYCH
TextDoc <- tm_map(TextDoc, removePunctuation)
## Warning in tm_map.SimpleCorpus(TextDoc, removePunctuation): transformation
## drops documents
USUNIĘCIE ZDEFINIOWANYCH SYMBOLI
toSpace <- content_transformer(function(x, pattern) { return (gsub(pattern, " ", x))})
TextDoc <- tm_map(TextDoc, toSpace, "–")
## Warning in tm_map.SimpleCorpus(TextDoc, toSpace, "–"): transformation drops
## documents
TextDoc <- tm_map(TextDoc, toSpace, "’")
## Warning in tm_map.SimpleCorpus(TextDoc, toSpace, "’"): transformation drops
## documents
TextDoc <- tm_map(TextDoc, toSpace, "‘")
## Warning in tm_map.SimpleCorpus(TextDoc, toSpace, "‘"): transformation drops
## documents
TextDoc <- tm_map(TextDoc, toSpace, "•")
## Warning in tm_map.SimpleCorpus(TextDoc, toSpace, "•"): transformation drops
## documents
TextDoc <- tm_map(TextDoc, toSpace, "“")
## Warning in tm_map.SimpleCorpus(TextDoc, toSpace, "“"): transformation drops
## documents
TextDoc <- tm_map(TextDoc, toSpace, "”")
## Warning in tm_map.SimpleCorpus(TextDoc, toSpace, "”"): transformation drops
## documents
ZAMIANA KONRETNYCH SŁÓW
TextDoc <- tm_map(TextDoc, content_transformer(gsub),
pattern = "organiz", replacement = "organ")
## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(gsub), pattern =
## "organiz", : transformation drops documents
TextDoc <- tm_map(TextDoc, content_transformer(gsub),
pattern = "organis", replacement = "organ")
## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(gsub), pattern =
## "organis", : transformation drops documents
TextDoc <- tm_map(TextDoc, content_transformer(gsub),
pattern = "andgovern", replacement = "govern")
## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(gsub), pattern =
## "andgovern", : transformation drops documents
TextDoc <- tm_map(TextDoc, content_transformer(gsub),
pattern = "inenterpris", replacement = "enterpris")
## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(gsub), pattern =
## "inenterpris", : transformation drops documents
TextDoc <- tm_map(TextDoc, content_transformer(gsub),
pattern = "team-", replacement = "team")
## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(gsub), pattern =
## "team-", : transformation drops documents
DEFINIOWANIE WYRAZÓW I ICH USUNIĘCIE
myStopwords <- c("can", "say","one","way","use",
"also","howev","tell","will",
"much","need","take","tend","even",
"like","particular","rather","said",
"get","well","make","ask","come","end",
"first","two","help","often","may",
"might","see","someth","thing","point",
"post","look","right","now","think","‘ve ",
"‘re ","anoth","put","set","new","good",
"want","sure","kind","larg","yes,","day","etc",
"quit","sinc","attempt","lack","seen","awar",
"littl","ever","moreov","though","found","abl",
"enough","far","earli","away","achiev","draw",
"last","never","brief","bit","entir","brief",
"great","lot","t","s","don","isn","paul","didn","are","n","won","let",
"doesn","go","know","yes","lou","couldn")
TextDoc <- tm_map(TextDoc, removeWords, myStopwords)
## Warning in tm_map.SimpleCorpus(TextDoc, removeWords, myStopwords):
## transformation drops documents
STEAMING - USUNIĘCIE KOŃCÓWEK FLEKSYJNYCH
TextDoc <- tm_map(TextDoc, stemDocument, "english")
## Warning in tm_map.SimpleCorpus(TextDoc, stemDocument, "english"):
## transformation drops documents
USUNIĘCIE BIAŁYCH SPACJI
TextDoc <- tm_map(TextDoc, stripWhitespace)
## Warning in tm_map.SimpleCorpus(TextDoc, stripWhitespace): transformation drops
## documents
BUDOWA MACIERZY TDM
TextDoc_tdm<-TermDocumentMatrix(TextDoc)
TextDoc_tdm
## <<TermDocumentMatrix (terms: 5205, documents: 57)>>
## Non-/sparse entries: 31266/265419
## Sparsity : 89%
## Maximal term length: 20
## Weighting : term frequency (tf)
tdm_m <- as.matrix(TextDoc_tdm)
head(tdm_m,10)
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
## yrprezspeech 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14th 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1789washingtonfellow 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## accomplish 0 1 0 0 0 0 0 0 2 1 1 1 0 0 3 3 0 1 3 0 0 1
## accord 0 1 0 1 1 1 0 0 1 3 0 0 0 1 2 2 0 3 0 3 0 1
## acknowledg 0 1 0 1 1 1 1 0 0 1 2 1 0 0 3 1 1 3 0 0 0 0
## acquit 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## act 0 1 1 0 0 4 0 0 1 3 0 1 1 1 12 3 0 5 2 4 0 0
## actual 0 1 0 0 0 0 0 0 0 0 0 0 0 6 3 0 0 1 2 1 0 0
## actuat 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## Docs
## Terms 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## yrprezspeech 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14th 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1789washingtonfellow 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## accomplish 1 2 2 1 1 1 0 1 0 3 0 0 0 0 2 3 0 0 0
## accord 0 2 1 1 0 0 2 0 0 3 0 0 3 0 0 0 0 0 0
## acknowledg 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## acquit 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## act 2 2 1 1 2 0 0 4 2 2 0 0 0 2 1 3 0 2 0
## actual 0 0 0 1 0 0 0 0 0 0 0 1 0 2 0 0 0 0 0
## actuat 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Docs
## Terms 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
## yrprezspeech 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14th 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1789washingtonfellow 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## accomplish 1 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0
## accord 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## acknowledg 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## acquit 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## act 0 2 1 0 0 0 5 0 5 2 3 2 0 2 5 1
## actual 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## actuat 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
tdm_v<-sort(rowSums(tdm_m), decreasing=TRUE)
head(tdm_v, 10)
## govern nation peopl state upon power must countri world shall
## 679 655 602 446 369 362 346 336 333 314
tdm_d <- data.frame(word = names(tdm_v),freq=tdm_v)
head(tdm_d,5)
## word freq
## govern govern 679
## nation nation 655
## peopl peopl 602
## state state 446
## upon upon 369
CHMURA SŁÓW
wordcloud(words = tdm_d$word, freq = tdm_d$freq, min.freq = 5,
max.words=50, random.order=FALSE, rot.per=0.70,
colors=brewer.pal(8, "Dark2"))

KSZTAŁTY CHMUR SŁÓW
##w kształcie diamentu
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'diamond',size = 0.5)
## w kształcie centroidu
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'centroid',size = 0.5)
## w kształcie trójkąta
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'triangle-forward',size = 0.5)
## w kształcie pięciokąta
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'pentagon',size = 0.5)
## w kształcie gwiazdy
wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'star',size = 0.5)
OBAMA
TextDoc_tdm_o<-TermDocumentMatrix(TextDoc[[57]])
TextDoc_tdm_o
## <<TermDocumentMatrix (terms: 676, documents: 1)>>
## Non-/sparse entries: 676/0
## Sparsity : 0%
## Maximal term length: 11
## Weighting : term frequency (tf)
tdm_m_o <- as.matrix(TextDoc_tdm_o)
head(tdm_m_o,10)
## Docs
## Terms 1
## 2009obama 1
## abandon 1
## abil 1
## accept 1
## account 1
## achiev 1
## across 3
## act 1
## action 1
## advanc 2
tdm_v_o<-sort(rowSums(tdm_m_o), decreasing=TRUE)
head(tdm_v_o, 10)
## nation america generat everi must peopl work world less common
## 15 10 9 8 8 8 8 8 7 6
tdm_d_o <- data.frame(word = names(tdm_v_o),freq=tdm_v_o)
head(tdm_d_o,5)
## word freq
## nation nation 15
## america america 10
## generat generat 9
## everi everi 8
## must must 8
wordcloud(words = tdm_d_o$word, freq = tdm_d_o$freq, min.freq = 5,
max.words=50, random.order=FALSE, rot.per=0.70,
colors=brewer.pal(8, "Dark2"))

WYSZUKIWANIE POSZCZEGÓLNYCH SŁÓW
tdm_d_1 <- data.frame(tdm_m)
tdm_d_1%>%
mutate(words=row.names(tdm_d_1))%>%
filter(words=='power')
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
## power 0 2 0 6 5 5 2 3 7 24 10 8 7 10 67 27 3 14 12 4
## X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38
## power 1 0 3 3 11 4 10 7 7 4 5 6 1 7 0 5 4 3
## X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56
## power 10 0 0 2 4 5 9 0 1 2 2 2 3 3 5 5 4 3
## X57 words
## power 5 power
CHMURA SŁÓW W PODZIALE NA DOKUMENTY
a<-as.matrix(tdm_m[,54:57],) # 4 ostatnie dokumenty (przemówienia)
head(a,10)
## Docs
## Terms 54 55 56 57
## yrprezspeech 0 0 0 0
## 14th 0 0 0 0
## 1789washingtonfellow 0 0 0 0
## accomplish 0 1 0 0
## accord 0 0 0 0
## acknowledg 0 0 0 0
## acquit 0 0 0 0
## act 0 2 5 1
## actual 0 0 0 0
## actuat 0 0 0 0
colnames(a)<-c("1997-Clinton","2001-Bush","2005-Bush","2009-Obama")
head(a,10)
## Docs
## Terms 1997-Clinton 2001-Bush 2005-Bush 2009-Obama
## yrprezspeech 0 0 0 0
## 14th 0 0 0 0
## 1789washingtonfellow 0 0 0 0
## accomplish 0 1 0 0
## accord 0 0 0 0
## acknowledg 0 0 0 0
## acquit 0 0 0 0
## act 0 2 5 1
## actual 0 0 0 0
## actuat 0 0 0 0
comparison.cloud(a,colors = brewer.pal(4, "Paired"),title.colors = "darkgrey",title.bg.colors = "white")

ANALIZA SENTYMENTU
PRZYGOTOWANIE DANYCH
tdm_d$word<-as.character(tdm_d$word) # typ zmiennej word jako charakter
df<-tdm_d
colnames(df) <-c("word","n") # nazwanie kolumn
df[1:10,]
## word n
## govern govern 679
## nation nation 655
## peopl peopl 602
## state state 446
## upon upon 369
## power power 362
## must must 346
## countri countri 336
## world world 333
## shall shall 314
BIBLIOTEKA BING
df_bing<-df %>%
inner_join(get_sentiments("bing"), by = "word")
head(df_bing, 10)
## word n sentiment
## 1 freedom 183 positive
## 2 free 180 positive
## 3 right 155 positive
## 4 faith 141 positive
## 5 work 137 positive
## 6 protect 130 positive
## 7 best 120 positive
## 8 respect 118 positive
## 9 support 114 positive
## 10 honor 105 positive
df[1:500,] %>%
inner_join(get_sentiments("bing"), by = "word") %>%
group_by(sentiment) %>%
ungroup() %>%
mutate(word = reorder(word, n))%>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("red2", "green3")) +
facet_wrap(~sentiment, scales = "free_y") + # rozdzielenie na pozytywne, negatywne
#ylim(0, 20) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()

sum(df_bing$sentiment=='positive')
## [1] 243
sum(df_bing$sentiment=='negative')
## [1] 433
table(df_bing$sentiment)
##
## negative positive
## 433 243
summary(as.factor(df_bing$sentiment))
## negative positive
## 433 243
BIBLIOTEKA NRC
NRC sentyment umożliwia klasyfikować słowa na 8 emocji
Wywołanie tablicy NRC
head(get_sentiments("nrc"),10)
## # A tibble: 10 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
df2<-df[c(1:500),] # ograniczenie liczby słów do 500 najcześciej występujących
df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="anger") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("black")) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()

wykres1<- df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="positive") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("green3")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres2<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="negative") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("red")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres3<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="joy") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("pink")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres4<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="trust") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("orange")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres5<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="anticipation") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("gold")) +
facet_wrap(~sentiment, scales = "free_y") +
# ylim(0, 200) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
wykres6<-df2 %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment=="fear") %>%
ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("black")) +
facet_wrap(~sentiment, scales = "free_y") +
#ylim(0, 500) +
labs(y = NULL, x = NULL) +
coord_flip() +
theme_minimal()
grid.arrange(wykres1, wykres2, wykres3, wykres4, wykres5, wykres6)

BIGRAMY
# UTWORZENIE RAMIKI DANYCH Z LISTY
df2<-data.frame(Reduce(rbind, TextDoc))
colnames(df2)<-c( "text")
BUDOWA BIGRAMÓW
df2_bigrams<- df2 %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
SORTOWANIE BIGRAMÓW
df2_bigrams<-df2_bigrams %>%
count(bigram, sort = TRUE)
WIZUALIZACJA BIGRAMÓW
ggplot(df2_bigrams[1:10,], aes(fct_reorder(bigram, n), n)) +
geom_col() +
coord_flip()

ROZDZIELENIE BIGRAMÓW
library(dplyr)
library(stringr)
bigrams_separated <- df2_bigrams %>%
mutate(split = str_split_fixed(bigram, " ", 2)) %>%
mutate(
word1 = split[,1],
word2 = split[,2]
) %>%
select(-split)
stop_words <- data.frame(word = stopwords("en"))
bigrams_filtered <- bigrams_separated[
!(bigrams_separated$word1 %in% stop_words$word) &
!(bigrams_separated$word2 %in% stop_words$word),
]
bigram_counts <- bigrams_filtered %>%
select(word1, word2, n)
BUDOWA GRAFÓW/SIECI BIGRAMÓW
library(igraph)
bigram_graph <- bigram_counts %>%
filter(n > 10) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH c8eb6fb DN-- 61 53 --
## + attr: name (v/c), n (e/n)
## + edges from c8eb6fb (vertex names):
## [1] unit ->state fellow ->citizen american ->peopl feder ->govern
## [5] self ->govern four ->year year ->ago general ->govern
## [9] men ->women upon ->us constitut->law constitut->unit
## [13] everi ->citizen govern ->peopl foreign ->nation form ->govern
## [17] free ->peopl among ->nation govern ->must nation ->world
## [21] state ->govern among ->peopl free ->govern nation ->govern
## [25] polit ->parti rest ->upon almighti ->god peac ->world
## [29] peopl ->unit peopl ->world branch ->govern depend ->upon
## + ... omitted several edges
GRAF BIGRAMÓW
plot(
bigram_graph,
vertex.color = "lightblue",
vertex.size = 20,
vertex.label.cex = 0.8,
edge.arrow.size = 0.5,
layout = layout_with_fr
)
