PAKIETY
library("tm")
library("SnowballC")
library("RColorBrewer")
library("wordcloud")
library("digest")
library("wordcloud2")
library("syuzhet")
library("ggplot2")
library("tidytext")
library("dplyr")
library(gridExtra)
library(readxl)
library(igraph)
library(RColorBrewer)
library(ggplot2)
library(ggrepel)
WCZYTYWANIE DANYCH
UWAGA!!!!!
Prosze podać swoją ścieżkę do pliku!
apple <- read.csv(file.choose(), header = T)
BUDOWA CORPUSU
corpus <- iconv(apple$text)
corpus <- Corpus(VectorSource(corpus))
CZYSZCZENIE DANYCH
corpus <- tm_map(corpus, tolower)
## Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
cleanset <- tm_map(corpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(cleanset, content_transformer(removeURL)):
## transformation drops documents
cleanset <- tm_map(cleanset, removeWords, c('aapl', 'apple'))
## Warning in tm_map.SimpleCorpus(cleanset, removeWords, c("aapl", "apple")):
## transformation drops documents
cleanset <- tm_map(cleanset, gsub,
pattern = 'stocks',
replacement = 'stock')
## Warning in tm_map.SimpleCorpus(cleanset, gsub, pattern = "stocks", replacement
## = "stock"): transformation drops documents
cleanset <- tm_map(cleanset, stripWhitespace)
## Warning in tm_map.SimpleCorpus(cleanset, stripWhitespace): transformation drops
## documents
PRZYGOTOWANIE DANYCH
# Term document matrix
tdm <- TermDocumentMatrix(cleanset)
tdm <- as.matrix(tdm)
tdm <- tdm[rowSums(tdm)>60,]
tdm[1:10,1:10]
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10
## eps 1 1 0 0 0 1 1 1 0 2
## est 1 1 0 0 0 2 2 0 0 0
## almost 0 0 0 1 0 0 0 0 0 0
## delay 0 0 0 1 0 0 0 0 0 0
## fang 0 0 0 1 0 0 0 0 0 0
## get 0 0 0 1 0 0 0 0 0 0
## iphone 0 0 0 1 0 0 0 0 0 0
## means 0 0 0 1 0 0 0 0 0 0
## might 0 0 0 1 0 0 0 0 0 0
## pos… 0 0 0 1 0 0 0 0 0 0
tdm[tdm>1] <- 1
termM <- tdm %*% t(tdm)
termM[1:10,1:10]
## Terms
## Terms eps est almost delay fang get iphone means might pos…
## eps 92 69 0 0 0 3 1 0 0 0
## est 69 73 0 0 0 0 2 0 0 0
## almost 0 0 88 86 86 86 86 86 86 85
## delay 0 0 86 86 86 86 86 86 86 85
## fang 0 0 86 86 87 86 86 86 86 85
## get 3 0 86 86 86 135 88 86 89 85
## iphone 1 2 86 86 86 88 137 86 86 85
## means 0 0 86 86 86 86 86 86 86 85
## might 0 0 86 86 86 89 86 86 95 85
## pos… 0 0 85 85 85 85 85 85 85 85
BUDOWA SIECI
g <- graph.adjacency(termM, weighted = T, mode = 'undirected')
g
## IGRAPH bc5d76c UNW- 27 208 --
## + attr: name (v/c), weight (e/n)
## + edges from bc5d76c (vertex names):
## [1] eps --eps eps --est eps --get eps --iphone
## [5] eps --earnings eps --reports eps --today eps --close
## [9] est --est est --iphone est --earnings est --reports
## [13] almost--almost almost--delay almost--fang almost--get
## [17] almost--iphone almost--means almost--might almost--pos…
## [21] almost--stock almost--sylvacap almost--things almost--ugly
## [25] almost--earnings delay --delay delay --fang delay --get
## [29] delay --iphone delay --means delay --might delay --pos…
## + ... omitted several edges
g <- simplify(g) # USUNIĘCIE PĘTLI
OPIS SIECI
g
## IGRAPH bc625d8 UNW- 27 181 --
## + attr: name (v/c), weight (e/n)
## + edges from bc625d8 (vertex names):
## [1] eps --est eps --get eps --iphone eps --earnings
## [5] eps --reports eps --today eps --close est --iphone
## [9] est --earnings est --reports almost--delay almost--fang
## [13] almost--get almost--iphone almost--means almost--might
## [17] almost--pos… almost--stock almost--sylvacap almost--things
## [21] almost--ugly almost--earnings delay --fang delay --get
## [25] delay --iphone delay --means delay --might delay --pos…
## [29] delay --stock delay --sylvacap delay --things delay --ugly
## + ... omitted several edges
WIZUALIZACJA PROSTA SIECI
plot(g)
WIZUALIZACJA
plot(g,
vertex.color=brewer.pal(8, "Pastel2"),
edge.arrow.size=0.1,
vertex.size=5,
layout=layout.star)
WIZUALIZACJA STOPNI WIERZCHOŁKÓW
PRZYGOTOWANIE DANYCH
deg<-degree(g, mode="all") # zapisanie do zmiennej degree
deg_dataframe <- as.data.frame(deg) # zapisujemy degree do dataframe
deg_dataframe$node <-row.names(deg_dataframe) # tworzenie kolumny o nazwie node i przypisanie nazwy wiesza
row.names(deg_dataframe) <- NULL # usuwanie nazwy wierszy z dataframe
nodes<-deg_dataframe %>%
select(node,deg) %>% # wybieranie dwóch kolumn
arrange(desc(deg)) # sortowanie malejące
nodes
## node deg
## 1 sylvacap 23
## 2 earnings 19
## 3 get 17
## 4 iphone 16
## 5 reports 16
## 6 close 16
## 7 today 15
## 8 puts 15
## 9 stock 14
## 10 dont 14
## 11 moving 14
## 12 might 13
## 13 things 13
## 14 calls 13
## 15 anything 13
## 16 expect 13
## 17 almost 12
## 18 delay 12
## 19 fang 12
## 20 means 12
## 21 ugly 12
## 22 rather 12
## 23 earth 12
## 24 head… 12
## 25 pos… 11
## 26 eps 7
## 27 est 4
HISTOGRAM STOPNI WIERZCHOŁKÓW
ggplot(data=nodes, aes(x=reorder(node,deg), y=deg))+
geom_histogram(stat='identity') +
xlab(" ") +
ylab("Liczebność") +
ggtitle("Histogram stopni wierzchołków")+
coord_flip()
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
WIZUALIZACJA WEDŁUG STOPNIA WIERZCHOŁKÓW
plot(g,
vertex.color=brewer.pal(8, "Pastel2"),
edge.arrow.size=1,
edge.color="black", ### KOLOR KRAWĘDZI
vertex.size=degree(g), ### WIELKOŚĆ WĘZŁA (AKTORA)
vertex.label.cex=1, ### WIELKOŚĆ ETYKIETY WĘZŁA
layout=layout.svd)
## Warning in v(graph): SVD layout was removed, we use Fruchterman-Reingold
## instead.
WIZUALIZACJA WEDŁUG MIARY POSREDNICTWA
plot(g,
vertex.color=brewer.pal(8, "Pastel2"),
edge.arrow.size=1,
edge.color="black", ### KOLOR KRAWĘDZI
vertex.size=betweenness(g)*0.2, ### WIELKOŚĆ WĘZŁA (AKTORA)
vertex.label.cex=1, ### WIELKOŚĆ ETYKIETY WĘZŁA
layout=layout.graphopt)
WIZUALIZACJA - MODYFIKACJA KRAWĘDZI
E(g)$width <- (log(E(g)$weight)+1)/2
edge_attr(g)
## $weight
## [1] 69 3 1 70 9 3 1 2 54 10 86 86 86 86 86 86 85 86 85 86 86 2 86 86 86
## [26] 86 86 85 86 85 86 86 1 86 86 86 86 85 86 85 86 86 1 88 86 89 85 97 85 87
## [51] 86 9 4 1 3 1 86 86 85 87 85 86 86 18 6 5 86 85 86 85 86 86 1 85 86
## [76] 85 86 86 3 3 85 85 85 85 85 86 86 13 1 1 85 85 85 85 85 85 85 85 85 85
## [101] 85 85 85 85 86 2 2 1 6 3 11 3 9 2 1 85 85 85 85 85 85 85 85 85 85
## [126] 85 86 85 86 85 87 85 85 85 85 87 85 85 85 85 85 85 85 85 85 85 85 86 85 85
## [151] 85 85 85 85 85 85 86 85 85 86 85 85 85 85 85 85 85 85 85 85 86 85 85 85 85
## [176] 85 85 85 85 85 85
##
## $width
## [1] 2.6170533 1.0493061 0.5000000 2.6242476 1.5986123 1.0493061 0.5000000
## [8] 0.8465736 2.4944920 1.6512925 2.7271736 2.7271736 2.7271736 2.7271736
## [15] 2.7271736 2.7271736 2.7213256 2.7271736 2.7213256 2.7271736 2.7271736
## [22] 0.8465736 2.7271736 2.7271736 2.7271736 2.7271736 2.7271736 2.7213256
## [29] 2.7271736 2.7213256 2.7271736 2.7271736 0.5000000 2.7271736 2.7271736
## [36] 2.7271736 2.7271736 2.7213256 2.7271736 2.7213256 2.7271736 2.7271736
## [43] 0.5000000 2.7386684 2.7271736 2.7443182 2.7213256 2.7873555 2.7213256
## [50] 2.7329541 2.7271736 1.5986123 1.1931472 0.5000000 1.0493061 0.5000000
## [57] 2.7271736 2.7271736 2.7213256 2.7329541 2.7213256 2.7271736 2.7271736
## [64] 1.9451859 1.3958797 1.3047190 2.7271736 2.7213256 2.7271736 2.7213256
## [71] 2.7271736 2.7271736 0.5000000 2.7213256 2.7271736 2.7213256 2.7271736
## [78] 2.7271736 1.0493061 1.0493061 2.7213256 2.7213256 2.7213256 2.7213256
## [85] 2.7213256 2.7271736 2.7271736 1.7824747 0.5000000 0.5000000 2.7213256
## [92] 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256
## [99] 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7271736
## [106] 0.8465736 0.8465736 0.5000000 1.3958797 1.0493061 1.6989476 1.0493061
## [113] 1.5986123 0.8465736 0.5000000 2.7213256 2.7213256 2.7213256 2.7213256
## [120] 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256
## [127] 2.7271736 2.7213256 2.7271736 2.7213256 2.7329541 2.7213256 2.7213256
## [134] 2.7213256 2.7213256 2.7329541 2.7213256 2.7213256 2.7213256 2.7213256
## [141] 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256
## [148] 2.7271736 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256
## [155] 2.7213256 2.7213256 2.7271736 2.7213256 2.7213256 2.7271736 2.7213256
## [162] 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256
## [169] 2.7213256 2.7213256 2.7271736 2.7213256 2.7213256 2.7213256 2.7213256
## [176] 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256 2.7213256
plot(g)
plot(g,
vertex.color=brewer.pal(8, "Pastel2"),
edge.arrow.size=1,
edge.color="black", ### KOLOR KRAWĘDZI
vertex.size=degree(g), ### WIELKOŚĆ WĘZŁA (AKTORA)
vertex.label.cex=1, ### WIELKOŚĆ ETYKIETY WĘZŁA
layout=layout.graphopt)
WYKRYWANIE WSPÓLNOTOWE
prop <- cluster_label_prop(g)
plot(prop, g)
WYKRYWANIE SPOŁECZNOŚCI
comm <- cluster_edge_betweenness(g)
## Warning in cluster_edge_betweenness(g): At
## core/community/edge_betweenness.c:493 : Membership vector will be selected
## based on the highest modularity score.
plot(comm, g)
WIZUALIZACJA KLIKI
kliki = cliques(g)
a<-largest_cliques(g)
clique1<-a[[1]]
g2<-induced.subgraph(graph=g,vids=clique1)
plot(g2)
Made by:
Majkowska Agata
agata.majkowska@phdstud.ug.edu.pl