Text_mining

PAKIETY

# # Instalacja pakietów
# install.packages("tm")  # text mining
# install.packages("SnowballC") #  stemming
# install.packages("wordcloud") # chmury słów 
# install.packages("digest") ## zaawansowane chmura słów
# install.packages("wordcloud2")
# install.packages("RColorBrewer") # paleta kolorów
# install.packages("syuzhet") # analiza sentymentu
# install.packages("ggplot2") # wizualizacja
# install.packages("tidytext") # manipulacje danymi
# install.packages("dplyr")

# załadowanie pakietów
library("tm")
library("SnowballC")
library("RColorBrewer")
library("wordcloud")
library("digest")
library("wordcloud2")
library("syuzhet")
library("ggplot2")
library("tidytext")
library("dplyr")
library(gridExtra)

ZAŁADOWANIE PLIKU Z DANYMI

text <- readLines(file.choose())#### przemówienia inauguracyjne prezydentów USA
#text[2]

BUDOWA KORPUSU - ZBIORU DOKUMENTÓW

TextDoc <- Corpus(VectorSource(text)) #budowa corpusu
TextDoc

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 57

#wyświetlanie danego dokumentu z corpusu (zbioru dokumentów)
#writeLines(as.character(TextDoc[[2]]))

PRZYGOTOWANIE DANYCH

ZMIANA NA MAŁE LITERY

###Funkcja content_transformer może służyć do transformacji funkcji do obiektów R'owych
toSpace <-content_transformer(function (x , pattern ) gsub(pattern, " ", x))

TextDoc <- tm_map(TextDoc, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(TextDoc, content_transformer(tolower)):
## transformation drops documents

USUWANIE STOPWORDS

stopwords("english")

##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"

TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(TextDoc, removeWords, stopwords("english")):
## transformation drops documents

USUWANIE WŁASNYCH SŁÓW

Na przykłąd usuwanie słów: “company” i “team”

TextDoc <- tm_map(TextDoc, removeWords, c( "company","team"))

## Warning in tm_map.SimpleCorpus(TextDoc, removeWords, c("company", "team")):
## transformation drops documents

USUWANIE ZNAKÓW INTERPUNKCYJNYCH

TextDoc <- tm_map(TextDoc, removePunctuation)

## Warning in tm_map.SimpleCorpus(TextDoc, removePunctuation): transformation
## drops documents

USUWANIE BIAŁYCH SPACJI

TextDoc <- tm_map(TextDoc, stripWhitespace)

## Warning in tm_map.SimpleCorpus(TextDoc, stripWhitespace): transformation
## drops documents

STEMMING

TextDoc <- tm_map(TextDoc, stemDocument)

## Warning in tm_map.SimpleCorpus(TextDoc, stemDocument): transformation drops
## documents

BUDOWA MACIERZY TDM

TextDoc_tdm <- TermDocumentMatrix(TextDoc)
TextDoc_tdm

## <<TermDocumentMatrix (terms: 5232, documents: 57)>>
## Non-/sparse entries: 32638/265586
## Sparsity           : 89%
## Maximal term length: 20
## Weighting          : term frequency (tf)

tdm_m <- as.matrix(TextDoc_tdm)
head(tdm_m,10)

##                       Docs
## Terms                  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
##   yrprezspeech         1 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   14th                 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   1789washingtonfellow 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   accomplish           0 1 0 0 0 0 0 0 2  1  1  1  0  0  3  3  0  1  3  0
##   accord               0 1 0 1 1 1 0 0 1  3  0  0  0  1  2  2  0  3  0  3
##   acknowledg           0 1 0 1 1 1 1 0 0  1  2  1  0  0  3  1  1  3  0  0
##   acquit               0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   act                  0 1 1 0 0 4 0 0 1  3  0  1  1  1 12  3  0  5  2  4
##   actual               0 1 0 0 0 0 0 0 0  0  0  0  0  6  3  0  0  1  2  1
##   actuat               0 1 0 0 0 0 0 0 0  0  0  0  0  1  0  0  0  0  0  0
##                       Docs
## Terms                  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
##   yrprezspeech          0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   14th                  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   1789washingtonfellow  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   accomplish            0  1  1  2  2  1  1  1  0  1  0  3  0  0  0  0  2
##   accord                0  1  0  2  1  1  0  0  2  0  0  3  0  0  3  0  0
##   acknowledg            0  0  1  0  0  1  0  0  0  0  1  0  0  0  0  0  0
##   acquit                0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   act                   0  0  2  2  1  1  2  0  0  4  2  2  0  0  0  2  1
##   actual                0  0  0  0  0  1  0  0  0  0  0  0  0  1  0  2  0
##   actuat                0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##                       Docs
## Terms                  38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
##   yrprezspeech          0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   14th                  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   1789washingtonfellow  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   accomplish            3  0  0  0  1  1  0  0  0  0  0  1  0  1  0  0  0
##   accord                0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   acknowledg            0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   acquit                0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   act                   3  0  2  0  0  2  1  0  0  0  5  0  5  2  3  2  0
##   actual                0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   actuat                0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##                       Docs
## Terms                  55 56 57
##   yrprezspeech          0  0  0
##   14th                  0  0  0
##   1789washingtonfellow  0  0  0
##   accomplish            1  0  0
##   accord                0  0  0
##   acknowledg            0  0  0
##   acquit                0  0  0
##   act                   2  5  1
##   actual                0  0  0
##   actuat                0  0  0

MACIERZ CZESTOŚCI SŁÓW

tdm_v <-sort(rowSums(tdm_m),decreasing=TRUE)
head(tdm_v,10)

##   will govern nation  peopl    can  state   upon  great  power   must 
##    871    679    655    602    465    446    369    364    362    346

tdm_d <- data.frame(word = names(tdm_v),freq=tdm_v)
head(tdm_d,5)

WYKRES 5 NAJCZEŚCIEJ WYSTEPUJĄCYCH SŁÓW

barplot(tdm_d[1:5,]$freq, las = 1, names.arg = tdm_d[1:5,]$word,
        col ="lightgreen", main ="5 najczesciej wystepujących słów",
        ylab = "czestość słów")

CHMURY SŁÓW

PROSTA CHMURA SŁÓW

wordcloud(words = tdm_d$word, freq = tdm_d$freq, min.freq = 5,
          max.words=50, random.order=FALSE, rot.per=0.70,
          colors=brewer.pal(8, "Dark2"))

KSZTAŁTY CHMURY SŁÓW

##w kształcie diamentu
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'diamond',size = 0.5)

## w kształcie centroidu
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'centroid',size = 0.5)

## w kształcie trójkąta
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'triangle-forward',size = 0.5)


## w kształcie pięciokąta
# wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'pentagon',size = 0.5)

## w kształcie gwiazdy
wordcloud2( tdm_d,color = brewer.pal(10,"Set3"), backgroundColor = "grey", shape = 'star',size = 0.5)

## CHMURA SŁÓW W PODZIALE NA DOKUMENTY

PZYKŁADOWO WZIĘTO 4 OSTATNIE PRZEMÓWIENIA

a<-as.matrix(tdm_m[,54:57],) # 4 ostatnie dokumenty (przemówienia)
head(a,10)

##                       Docs
## Terms                  54 55 56 57
##   yrprezspeech          0  0  0  0
##   14th                  0  0  0  0
##   1789washingtonfellow  0  0  0  0
##   accomplish            0  1  0  0
##   accord                0  0  0  0
##   acknowledg            0  0  0  0
##   acquit                0  0  0  0
##   act                   0  2  5  1
##   actual                0  0  0  0
##   actuat                0  0  0  0

colnames(a)<-c("1997-Clinton","2001-Bush","2005-Bush","2009-Obama")
head(a,10)

##                       Docs
## Terms                  1997-Clinton 2001-Bush 2005-Bush 2009-Obama
##   yrprezspeech                    0         0         0          0
##   14th                            0         0         0          0
##   1789washingtonfellow            0         0         0          0
##   accomplish                      0         1         0          0
##   accord                          0         0         0          0
##   acknowledg                      0         0         0          0
##   acquit                          0         0         0          0
##   act                             0         2         5          1
##   actual                          0         0         0          0
##   actuat                          0         0         0          0

CHMURA SŁÓW Z 4 DOKUMENTÓW

comparison.cloud(a,colors = brewer.pal(4, "Paired"),title.colors = "darkgrey",title.bg.colors = "white")

PRZYGOTOWANIE DANYCH DO ANALIZY SENTYMENTU

tdm_d$word<-as.character(tdm_d$word) # typ zmiennej word jako charakter
df<-tdm_d
colnames(df) <-c("word","n") # nazwanie kolumn
df[1:10,]

ANALIZA SENTYMENTU

BIBLIOTEKA BING

Umozliwia klasyfikację słów na pozytywne i negatywne

df %>%  
  inner_join(get_sentiments("bing"), by = "word")

WIZUALIZACJA SENTYMENTU BING

df[1:500,] %>%  
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(sentiment) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))%>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("red2", "green3")) +
  facet_wrap(~sentiment, scales = "free_y") +   # rozdzielenie na pozytywne, negatywne 
  #ylim(0, 20) +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()

BIBLIOTEKA NRC

NRC sentyment umożliwia klasyfikować słowa na 8 emocji

Wywołanie tablicy NCR

head(get_sentiments("nrc"),10)

WYKRES DLA ZŁOŚCI

df2<-df[c(1:500),] # ograniczenie liczby słów do 500 najcześciej występujących

df2 %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment=="anger") %>%
  ggplot(aes(x=reorder(word,n),y=n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("black")) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()

WYKRES ŁĄCZNY DLA WIELU EMOCJI

wykres1<- df2 %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment=="positive") %>%
  ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("green3")) +
  facet_wrap(~sentiment, scales = "free_y") +
  #ylim(0, 200) +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()
wykres2<-df2 %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment=="negative") %>%
  ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("red")) +
  facet_wrap(~sentiment, scales = "free_y") +
  #ylim(0, 200) +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()
wykres3<-df2 %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment=="joy") %>%
  ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("pink")) +
  facet_wrap(~sentiment, scales = "free_y") +
  #ylim(0, 200) +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()
wykres4<-df2 %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment=="trust") %>%
  ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("orange")) +
  facet_wrap(~sentiment, scales = "free_y") +
  #ylim(0, 200) +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()
wykres5<-df2 %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment=="anticipation") %>%
  ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("gold")) +
  facet_wrap(~sentiment, scales = "free_y") +
  # ylim(0, 200) +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()
wykres6<-df2 %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment=="fear") %>%
  ggplot(aes(x=reorder(word,n),y=n, fill = sentiment))+
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("black")) +
  facet_wrap(~sentiment, scales = "free_y") +
  #ylim(0, 500) +
  labs(y = NULL, x = NULL) +
  coord_flip() +
  theme_minimal()

grid.arrange(wykres1, wykres2, wykres3, wykres4, wykres5, wykres6)

                                                                   Made by: 
                                                                  Majkowska Agata
                                                                  agata.majkowska@phdstud.ug.edu.pl

Text_mining_R