2023/01/31 (updated: 2023-12-09)
library(tm)
library(SnowballC)
library(stringr)
library(tidytext)
library(tidyverse)
We use DirSource() to specify the directory and the file-name pattern of the files that will be passed to VCorpus(), which creates the collection of texts, or text corpus. Both VCorpus() and DirSource() are from the tm package.

DIR_SOURCE <- "D:/Roberto/Cursos/Analise de Dados 2023/Curso 2024/Unidade 5/federalist"
corpus_raw <- VCorpus(DirSource(directory = DIR_SOURCE, pattern = "fp"))
corpus_raw
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 85
When applied to a corpus, tidy() returns a tibble that also carries document metadata, for example the author of each document. These metadata are blank in our example, so we keep only the id and text columns.

corpus_tidy <- tidy(corpus_raw) %>%
  select(id, text)
head(corpus_tidy)
## # A tibble: 6 × 2
##   id       text
##   <chr>    <chr>
## 1 fp01.txt "AFTER an unequivocal experience of the inefficiency of the subsisti…
## 2 fp02.txt "WHEN the people of America reflect that they are now called upon to…
## 3 fp03.txt "IT IS not a new observation that the people of any country (if, lik…
## 4 fp04.txt "MY LAST paper assigned several reasons why the safety of the people…
## 5 fp05.txt "QUEEN ANNE, in her letter of the 1st July, 1706, to the Scotch Parl…
## 6 fp06.txt "THE three last numbers of this paper have been dedicated to an enum…
The str_sub() function from the stringr package extracts or replaces the characters at given positions in a string. We use it to turn each file name into an integer id.

corpus_tidy <- corpus_tidy %>%
  mutate(new_id = as.integer(str_sub(id, start = 3, end = 4)))
glimpse(corpus_tidy)
## Rows: 85
## Columns: 3
## $ id     <chr> "fp01.txt", "fp02.txt", "fp03.txt", "fp04.txt", "fp05.txt", "fp…
## $ text   <chr> "AFTER an unequivocal experience of the inefficiency of the sub…
## $ new_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
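As a quick aside (a toy example, not part of the Federalist pipeline), str_sub() can both extract and overwrite a slice of a string:

x <- "fp07.txt"
str_sub(x, start = 3, end = 4)          # extract characters 3-4: "07"
str_sub(x, start = 3, end = 4) <- "99"  # replace those positions
x                                       # now "fp99.txt"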
We tokenize the text with unnest_tokens() from the tidytext package, use wordStem() from the SnowballC package to strip prefixes and suffixes from each word, and use str_replace_all() from stringr with a regular expression (regex) to remove digits, dropping any tokens that become empty.

tokens_raw <- corpus_tidy %>%
  unnest_tokens(word, text, to_lower = TRUE) %>%
  mutate(stem = wordStem(word)) %>%
  mutate(word = str_replace_all(word, "\\d+", "")) %>%
  filter(word != "")
glimpse(tokens_raw)
## Rows: 187,412
## Columns: 4
## $ id     <chr> "fp01.txt", "fp01.txt", "fp01.txt", "fp01.txt", "fp01.txt", "fp…
## $ new_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ word   <chr> "after", "an", "unequivocal", "experience", "of", "the", "ineff…
## $ stem   <chr> "after", "an", "unequivoc", "experi", "of", "the", "ineffici", …
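To see the stemmer and the regular expression in isolation, here is a small sketch (the expected results in the comments come from the output above and from the regex itself):

wordStem(c("experience", "inefficiency", "unequivocal"))  # "experi" "ineffici" "unequivoc"
str_replace_all(c("1st", "1706", "union"), "\\d+", "")    # "st" "" "union"
# the filter(word != "") step then drops tokens that became empty strings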
To remove stop words we use anti_join() from the tidyverse. anti_join() keeps only the rows of the first data frame that have no match in the second, so the words shared with the stop-word list are dropped. The tidytext package provides a stop-word lexicon.

data("stop_words", package = "tidytext")
glimpse(stop_words)
## Rows: 1,149
## Columns: 2
## $ word    <chr> "a", "a's", "able", "about", "above", "according", "accordingl…
## $ lexicon <chr> "SMART", "SMART", "SMART", "SMART", "SMART", "SMART", "SMART",…
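Before applying it to the real data, a minimal sketch of what anti_join() does, using toy data frames:

tokens_toy <- tibble(word = c("the", "union", "of", "states"))
stops_toy  <- tibble(word = c("the", "of"))
anti_join(tokens_toy, stops_toy, by = "word")  # keeps only "union" and "states"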
tokens <- tokens_raw %>%
  anti_join(stop_words, by = "word")
glimpse(tokens)
## Rows: 64,106
## Columns: 4
## $ id     <chr> "fp01.txt", "fp01.txt", "fp01.txt", "fp01.txt", "fp01.txt", "fp…
## $ new_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ word   <chr> "unequivocal", "experience", "inefficiency", "subsisting", "fed…
## $ stem   <chr> "unequivoc", "experi", "ineffici", "subsist", "feder", "govern"…
head(content(corpus_raw[[10]]), n=10)
## [1] "AMONG the numerous advantages promised by a well-constructed Union, none " ## [2] " deserves to be more accurately developed than its tendency to break and " ## [3] " control the violence of faction. The friend of popular governments never " ## [4] " finds himself so much alarmed for their character and fate, as when he " ## [5] " contemplates their propensity to this dangerous vice. He will not fail, " ## [6] " therefore, to set a due value on any plan which, without violating the " ## [7] " principles to which he is attached, provides a proper cure for it. The " ## [8] " instability, injustice, and confusion introduced into the public councils, " ## [9] " have, in truth, been the mortal diseases under which popular governments " ## [10] " have everywhere perished; as they continue to be the favorite and fruitful "
We count how many times each stem occurs in each essay with count().

tokens_counts <- count(tokens, new_id, stem)
head(tokens_counts)
## # A tibble: 6 × 3
##   new_id stem           n
##    <int> <chr>      <int>
## 1      1 absurd         1
## 2      1 accid          1
## 3      1 acknowledg     1
## 4      1 act            1
## 5      1 actuat         1
## 6      1 add            1
The cast_dtm() function creates a document-term matrix and cast_tdm() creates a term-document matrix; both are from the tidytext package.

dtm <- cast_dtm(tokens_counts, document = new_id, term = stem, value = n)
dtm
## <<DocumentTermMatrix (documents: 85, terms: 4674)>>
## Non-/sparse entries: 37214/360076
## Sparsity           : 91%
## Maximal term length: 17
## Weighting          : term frequency (tf)
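For comparison, the transposed layout (terms in rows, documents in columns) could be built with cast_tdm(); this is only a sketch and is not used in the rest of the analysis:

tdm <- cast_tdm(tokens_counts, term = stem, document = new_id, value = n)
tdm  # should mirror the dtm above, with 4674 terms and 85 documents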
The inspect() function from the tm package lets us look at the document-term matrix.

inspect(dtm[1:5, 1:8])
## <<DocumentTermMatrix (documents: 5, terms: 8)>>
## Non-/sparse entries: 16/24
## Sparsity           : 60%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs absurd accid acknowledg act actuat add addit address
##    1      1     1          1   1      1   1     1       1
##    2      0     0          0   0      0   0     0       0
##    3      0     0          2   1      1   1     1       0
##    4      0     0          0   1      0   0     0       1
##    5      0     0          0   1      0   0     0       0
dtm.mat <- as.matrix(dtm)
dtm.mat[1:5, 1:8]
##     Terms
## Docs absurd accid acknowledg act actuat add addit address
##    1      1     1          1   1      1   1     1       1
##    2      0     0          0   0      0   0     0       0
##    3      0     0          2   1      1   1     1       0
##    4      0     0          0   1      0   0     0       1
##    5      0     0          0   1      0   0     0       0
The wordcloud() function from the wordcloud package draws a word cloud. We use it for essays 12 and 24, starting by loading wordcloud.

library(wordcloud)
## Loading required package: RColorBrewer
doc_12 <- filter(tokens_counts, new_id == 12)
wordcloud(words = doc_12$stem, freq = doc_12$n, max.words = 15)

doc_24 <- filter(tokens_counts, new_id == 24)
wordcloud(words = doc_24$stem, freq = doc_24$n, max.words = 15)
The stemCompletion() function from the tm package recovers the complete word behind a stem.

stemCompletion(c("revenu", "commerc", "peac", "armi"), corpus_raw)
##    revenu   commerc      peac      armi 
## "revenue" "commerce"   "peace"  "armies"
The bind_tf_idf() function from the tidytext package computes the tf-idf and adds it to a data frame.

tokens_counts <- bind_tf_idf(tokens_counts,
                             term = stem,
                             document = new_id,
                             n = n)
head(tokens_counts)
## # A tibble: 6 × 6
##   new_id stem           n      tf   idf   tf_idf
##    <int> <chr>      <int>   <dbl> <dbl>    <dbl>
## 1      1 absurd         1 0.00186 1.73  0.00323 
## 2      1 accid          1 0.00186 3.75  0.00698 
## 3      1 acknowledg     1 0.00186 1.55  0.00289 
## 4      1 act            1 0.00186 0.400 0.000744
## 5      1 actuat         1 0.00186 2.14  0.00399 
## 6      1 add            1 0.00186 1.35  0.00252 
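For reference, the tf_idf column above is the product of the within-document term frequency and the inverse document frequency,

\[ \mbox{tf-idf}(w, d) = \frac{n_{w,d}}{\sum_{w'} n_{w',d}} \times \log\!\left(\frac{N}{\mbox{df}(w)}\right), \]

where \(n_{w,d}\) is the count of stem \(w\) in essay \(d\), \(N = 85\) is the number of essays, and \(\mbox{df}(w)\) is the number of essays in which \(w\) appears. bind_tf_idf() uses the natural logarithm, so a stem that occurs in a single essay has \(\mbox{idf} = \log(85) \approx 4.44\), the largest idf value seen in the outputs below.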
An alternative is weightTfIdf() from the tm package. It takes a document-term matrix as input and has an argument called normalize whose default is TRUE; if it is set to FALSE, the term frequency \(\mbox{tf}(w,d)\) is not divided by the number of terms in document \(d\), i.e., we do not weight by document length.

dtm_tf_idf <- weightTfIdf(dtm)
dtm_tf_idf
## <<DocumentTermMatrix (documents: 85, terms: 4674)>>
## Non-/sparse entries: 37044/360246
## Sparsity           : 91%
## Maximal term length: 17
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
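If we preferred raw counts rather than proportions in the tf part, the call would set normalize to FALSE (a sketch; the result is not used below):

dtm_tf_idf_raw <- weightTfIdf(dtm, normalize = FALSE)
dtm_tf_idf_raw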
We use slice_max() from the tidyverse to extract the ten largest tf-idf values for essays 12 and 24. The data are in the tokens_counts data frame.

tokens_counts %>%
  filter(new_id == 12) %>%
  slice_max(tf_idf, n = 10)
## # A tibble: 10 × 6
##    new_id stem           n      tf   idf tf_idf
##     <int> <chr>      <int>   <dbl> <dbl>  <dbl>
##  1     12 revenu        11 0.0138   1.22 0.0169
##  2     12 contraband     3 0.00376  4.44 0.0167
##  3     12 patrol         3 0.00376  4.44 0.0167
##  4     12 excis          5 0.00627  2.65 0.0166
##  5     12 coast          3 0.00376  3.75 0.0141
##  6     12 tax            8 0.0100   1.31 0.0131
##  7     12 trade          6 0.00752  1.73 0.0130
##  8     12 cent           2 0.00251  4.44 0.0111
##  9     12 gallon         2 0.00251  4.44 0.0111
## 10     12 commerc        8 0.0100   1.11 0.0111
tokens_counts %>%
  filter(new_id == 24) %>%
  slice_max(tf_idf, n = 10)
## # A tibble: 10 × 6
##    new_id stem           n      tf   idf tf_idf
##     <int> <chr>      <int>   <dbl> <dbl>  <dbl>
##  1     24 garrison       6 0.00926  2.83 0.0262
##  2     24 dock           3 0.00463  4.44 0.0206
##  3     24 yard           3 0.00463  4.44 0.0206
##  4     24 settlement     3 0.00463  3.75 0.0174
##  5     24 spain          4 0.00617  2.36 0.0146
##  6     24 armi           7 0.0108   1.26 0.0137
##  7     24 frontier       3 0.00463  2.83 0.0131
##  8     24 arsen          2 0.00309  3.75 0.0116
##  9     24 western        3 0.00463  2.50 0.0116
## 10     24 nearer         2 0.00309  3.34 0.0103
We now cluster the essays attributed to Hamilton. We use set.seed() so the results are reproducible, cast_dtm() to create the document-term matrix, and finally the kmeans() function.

hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)
hamilton_docs <- tokens_counts %>%
filter(new_id %in% hamilton)
hamilton_dtm <- cast_dtm(hamilton_docs,
document = new_id, term = stem,
value = n) %>%
weightTfIdf()
hamilton_dtm
## <<DocumentTermMatrix (documents: 51, terms: 3918)>>
## Non-/sparse entries: 22478/177340
## Sparsity           : 89%
## Maximal term length: 17
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
inspect(hamilton_dtm[1:5, 1:4])
## <<DocumentTermMatrix (documents: 5, terms: 4)>>
## Non-/sparse entries: 6/14
## Sparsity           : 70%
## Maximal term length: 10
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##     Terms
## Docs      absurd       accid acknowledg          act
##    1 0.003672226 0.008700978 0.00437709 0.0012521887
##    6 0.000000000 0.000000000 0.00000000 0.0000000000
##    7 0.000000000 0.000000000 0.00000000 0.0008533317
##    8 0.000000000 0.000000000 0.00000000 0.0009001678
##    9 0.000000000 0.000000000 0.00000000 0.0000000000
set.seed(1234)
km.out <- kmeans(hamilton_dtm, centers = 5)
km.out$iter
## [1] 3
We use table() to see how many documents fell into each cluster.

table(km.out$cluster)
## 
##  1  2  3  4  5 
##  8  6  1  1 35
We use colnames() to label each cluster center with the corresponding term, and enframe() to turn a named vector into a tibble so that we can use the tidyverse verbs.

colnames(km.out$centers) <- colnames(hamilton_dtm)
for (i in 1:5) {
print(str_c("CLUSTER ", i))
print("Top 10 words: ")
cluster_centers <- enframe(km.out$centers[i, ]) %>%
slice_max(value, n=10)
print(cluster_centers)
print("Federalist papers classified:")
cluster_docs <- enframe(km.out$cluster, "document", "cluster") %>%
filter(cluster == i)
print(as.vector(cluster_docs$document))
cat("\n")
}
## [1] "CLUSTER 1" ## [1] "Top 10 words: " ## # A tibble: 10 × 2 ## name value ## <chr> <dbl> ## 1 senat 0.0222 ## 2 presid 0.0194 ## 3 governor 0.0118 ## 4 pardon 0.0114 ## 5 treati 0.0113 ## 6 offic 0.0104 ## 7 appoint 0.0103 ## 8 impeach 0.00974 ## 9 nomin 0.00950 ## 10 vote 0.00727 ## [1] "Federalist papers classified:" ## [1] "66" "68" "69" "74" "75" "76" "77" "79"
## [1] "CLUSTER 2" ## [1] "Top 10 words: " ## # A tibble: 10 × 2 ## name value ## <chr> <dbl> ## 1 armi 0.0229 ## 2 militia 0.0225 ## 3 militari 0.0141 ## 4 disciplin 0.00963 ## 5 garrison 0.00805 ## 6 peac 0.00793 ## 7 troop 0.00758 ## 8 liberti 0.00619 ## 9 corp 0.00566 ## 10 neighbor 0.00553 ## [1] "Federalist papers classified:" ## [1] "8" "24" "25" "26" "28" "29"
## [1] "CLUSTER 3" ## [1] "Top 10 words: " ## # A tibble: 10 × 2 ## name value ## <chr> <dbl> ## 1 northern 0.0607 ## 2 southern 0.0455 ## 3 confederaci 0.0448 ## 4 list 0.0326 ## 5 frontier 0.0265 ## 6 comprehens 0.0238 ## 7 civil 0.0219 ## 8 jersei 0.0218 ## 9 pennsylvania 0.0216 ## 10 navig 0.0200 ## [1] "Federalist papers classified:" ## [1] "13"
## [1] "CLUSTER 4" ## [1] "Top 10 words: " ## # A tibble: 10 × 2 ## name value ## <chr> <dbl> ## 1 vacanc 0.0947 ## 2 recess 0.0552 ## 3 session 0.0483 ## 4 senat 0.0482 ## 5 claus 0.0465 ## 6 fill 0.0453 ## 7 appoint 0.0312 ## 8 expir 0.0237 ## 9 presid 0.0216 ## 10 unfound 0.0192 ## [1] "Federalist papers classified:" ## [1] "67"
## [1] "CLUSTER 5" ## [1] "Top 10 words: " ## # A tibble: 10 × 2 ## name value ## <chr> <dbl> ## 1 court 0.00783 ## 2 juri 0.00490 ## 3 tax 0.00457 ## 4 jurisdict 0.00395 ## 5 taxat 0.00373 ## 6 elect 0.00338 ## 7 trial 0.00335 ## 8 land 0.00334 ## 9 revenu 0.00327 ## 10 claus 0.00316 ## [1] "Federalist papers classified:" ## [1] "1" "6" "7" "9" "11" "12" "15" "16" "17" "21" "22" "23" "27" "30" "31" ## [16] "32" "33" "34" "35" "36" "59" "60" "61" "65" "70" "71" "72" "73" "78" "80" ## [31] "81" "82" "83" "84" "85"
madison <- c(10, 14, 37:48, 58)
jay <- c(2:5, 64)
joint <- c(18:20)
STYLE_WORDS <- c("although", "always", "commonly", "consequently",
"considerable", "enough",
"there", "upon", "while", "whilst")
tokens_raw <- tokens_raw %>%
mutate(author = case_when(new_id %in% hamilton ~ "Hamilton",
new_id %in% madison ~ "Madison",
new_id %in% jay ~ "Jay",
new_id %in% joint ~ "Joint",
TRUE ~ "Disputed"))
head(tokens_raw)
## # A tibble: 6 × 5
##   id       new_id word        stem      author  
##   <chr>     <int> <chr>       <chr>     <chr>   
## 1 fp01.txt      1 after       after     Hamilton
## 2 fp01.txt      1 an          an        Hamilton
## 3 fp01.txt      1 unequivocal unequivoc Hamilton
## 4 fp01.txt      1 experience  experi    Hamilton
## 5 fp01.txt      1 of          of        Hamilton
## 6 fp01.txt      1 the         the       Hamilton
tfm <- tokens_raw %>%
  group_by(author, word) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  group_by(author) %>%
  mutate(tf_thou = n/sum(n) * 1000) %>%
  filter(word %in% STYLE_WORDS) %>%
  select(-n) %>%
  pivot_wider(names_from = word, values_from = tf_thou) %>%
  mutate_at(vars(always:consequently), replace_na, 0)
## `summarise()` has grouped output by 'author'. You can override using the
## `.groups` argument.
head(tfm)
## # A tibble: 5 × 11
## # Groups:   author [5]
##   author although always commonly consequently considerable there  upon whilst
##   <chr>     <dbl>  <dbl>    <dbl>        <dbl>        <dbl> <dbl> <dbl>  <dbl>
## 1 Dispu…  0.137    0.228   0.0456       0.365         0.228 1.32  0.137 0.411  
## 2 Hamil…  0.00904  0.551   0.190        0.0361        0.416 3.41  3.34  0.00904
## 3 Jay     0.597    0.955   0.119        0.477         0.119 1.19  0.119 NA     
## 4 Joint  NA        0.355   0.178        0             0.355 0.711 0.355 0.355  
## 5 Madis…  0.196    0.171   0            0.318         0.122 0.857 0.171 0.294  
## # ℹ 2 more variables: enough <dbl>, `while` <dbl>
reg_data <- tokens_raw %>%
group_by(author, new_id, word) %>%
summarize(n = n()) %>%
mutate(tf_thou = n/sum(n) * 1000) %>%
filter(word %in% STYLE_WORDS) %>%
mutate(author_outcome = case_when(author == "Hamilton" ~ 1,
author == "Madison" ~ -1,
TRUE ~ NA_real_)) %>%
select(-n) %>%
pivot_wider(names_from = word, values_from = tf_thou) %>%
mutate_at(vars(always:`while`), replace_na, 0) %>%
ungroup()
## `summarise()` has grouped output by 'author', 'new_id'. You can override using
## the `.groups` argument.
head(reg_data)
## # A tibble: 6 × 13
##   author   new_id author_outcome always consequently there whilst although  upon
##   <chr>     <int>          <dbl>  <dbl>        <dbl> <dbl>  <dbl>    <dbl> <dbl>
## 1 Disputed     49             NA  0.608        0.608 1.22   0.608    0     0    
## 2 Disputed     50             NA  0            0     0      0        0.908 0.908
## 3 Disputed     51             NA  0            1.04  2.09   1.04     0     0    
## 4 Disputed     52             NA  0.542        0     0      0        0     0    
## 5 Disputed     53             NA  0            0     0.924  0.462    0.462 0    
## 6 Disputed     54             NA  0            1.00  0.501  0        0.501 1.00 
## # ℹ 4 more variables: considerable <dbl>, commonly <dbl>, enough <dbl>,
## #   `while` <dbl>
hm.fit <- lm(author_outcome ~ upon + there + consequently + whilst,
             data = reg_data)
hm.fit
## 
## Call:
## lm(formula = author_outcome ~ upon + there + consequently + whilst, 
##     data = reg_data)
## 
## Coefficients:
##  (Intercept)          upon         there  consequently        whilst  
##      -0.1955        0.2128        0.1180       -0.5964       -0.9090  
broom::augment(hm.fit)
## # A tibble: 65 × 12
##    .rownames author_outcome  upon there consequently whilst .fitted  .resid
##    <chr>              <dbl> <dbl> <dbl>        <dbl>  <dbl>   <dbl>   <dbl>
##  1 12                     1  3.78  1.26            0      0   0.757 0.243  
##  2 13                     1  2.06  4.12            0      0   0.729 0.271  
##  3 14                     1  4.88  3.99            0      0   1.31  -0.314 
##  4 15                     1  1.51  1.01            0      0   0.244 0.756  
##  5 16                     1  2.02  1.52            0      0   0.415 0.585  
##  6 17                     1  2.40  3.21            0      0   0.695 0.305  
##  7 18                     1  3.26  4.19            0      0   0.993 0.00738
##  8 19                     1  2.08  9.38            0      0   1.35  -0.354 
##  9 20                     1  3.25  5.85            0      0   1.19  -0.186 
## 10 21                     1  2.95  1.96            0      0   0.663 0.337  
## # ℹ 55 more rows
## # ℹ 4 more variables: .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
library(modelr)

author_data <- reg_data %>%
  add_predictions(hm.fit) %>%
  mutate(pred_author = if_else(pred >= 0, "Hamilton", "Madison"))
author_data %>%
  filter(!is.na(author_outcome)) %>%
  group_by(author) %>%
  summarize(`Proportion Correct` = mean(author == pred_author))
## # A tibble: 2 × 2
##   author   `Proportion Correct`
##   <chr>                   <dbl>
## 1 Hamilton                    1
## 2 Madison                     1
ham_mad <- reg_data %>% filter(!is.na(author_outcome))
n <- nrow(ham_mad)
hm.classify <- as.vector(rep(NA, n), mode = "list")
for (i in 1:n){
sub.fit <- lm(author_outcome ~ upon + there + consequently + whilst,
data = ham_mad[-i, ])
hm.classify[[i]] <- slice(ham_mad, i) %>% add_predictions(sub.fit)
}
bind_rows(hm.classify) %>%
  mutate(pred_author = if_else(pred >= 0, "Hamilton", "Madison")) %>%
  group_by(author) %>%
  summarize(`Proportion Correct` = mean(author == pred_author))
## # A tibble: 2 × 2
##   author   `Proportion Correct`
##   <chr>                   <dbl>
## 1 Hamilton                1    
## 2 Madison                 0.786
plot_data <- author_data %>%
filter(!author %in% c("Jay", "Joint"))
plot_data %>%
ggplot(aes(new_id, pred, color = author, shape=author)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_point(size=3) +
  scale_x_continuous(breaks = seq(10, 80, by = 10), minor_breaks = seq(5, 80, by = 5)) +
scale_color_manual(values = c("Madison" = "blue",
"Hamilton" = "red",
"Disputed" = "black")) +
scale_shape_manual(values = c("Madison" = 16, "Hamilton" = 15, "Disputed" = 17)) +
labs(color = "Author", shape = "Author", x="Federalist Papers", y="Predicted values") +
theme_classic()
plot_data %>% filter(author == "Disputed", pred > 0) %>% select(author, new_id, pred, pred_author)
## # A tibble: 1 × 4
##   author   new_id   pred pred_author
##   <chr>     <int>  <dbl> <chr>      
## 1 Disputed     55 0.0932 Hamilton   
florentine <- read.csv("florentine.csv")
florentine[1:6, 1:6]
##      FAMILY ACCIAIUOL ALBIZZI BARBADORI BISCHERI CASTELLAN
## 1 ACCIAIUOL         0       0         0        0         0
## 2   ALBIZZI         0       0         0        0         0
## 3 BARBADORI         0       0         0        0         1
## 4  BISCHERI         0       0         0        0         0
## 5 CASTELLAN         0       0         1        0         0
## 6    GINORI         0       1         0        0         0
To count each family's marriage ties we use rowwise() from the tidyverse, which tells R to sum across rows rather than down columns, the default. Then, to indicate which variables will be summed, we use c_across().

florentine %>%
  group_by(FAMILY) %>%
  rowwise() %>%
  summarize(connections = sum(c_across(ACCIAIUOL:TORNABUON))) %>%
  arrange(desc(connections))
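An equivalent vectorized alternative (my own sketch, not part of the original code) uses rowSums() with across() instead of rowwise():

florentine %>%
  mutate(connections = rowSums(across(ACCIAIUOL:TORNABUON))) %>%
  select(FAMILY, connections) %>%
  arrange(desc(connections))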
We use column_to_rownames() to turn the tibble into a data frame whose row names are the family names, and then apply as.matrix() to it. The graph.adjacency() function from the igraph package builds the network; the argument mode is set to undirected and diag to FALSE to encode the assumption that there is no marriage within the same family. Finally, plot() draws the graph.

library(igraph)

florence <- florentine %>%
  column_to_rownames(var = "FAMILY") %>%
  as.matrix()
florence <- graph.adjacency(florence, mode = "undirected", diag = FALSE)
plot(florence)
…
The degree() function returns the degree of each node.

degree(florence)
## ACCIAIUOL   ALBIZZI BARBADORI  BISCHERI CASTELLAN    GINORI  GUADAGNI LAMBERTES 
##         1         3         2         3         3         1         4         1 
##    MEDICI     PAZZI   PERUZZI     PUCCI   RIDOLFI  SALVIATI   STROZZI TORNABUON 
##         6         1         3         0         3         2         4         3
…
The closeness() function from the igraph package computes the closeness of each vertex.

closeness(florence)
##  ACCIAIUOL    ALBIZZI  BARBADORI   BISCHERI  CASTELLAN     GINORI   GUADAGNI 
## 0.02631579 0.03448276 0.03125000 0.02857143 0.02777778 0.02380952 0.03333333 
##  LAMBERTES     MEDICI      PAZZI    PERUZZI      PUCCI    RIDOLFI   SALVIATI 
## 0.02325581 0.04000000 0.02040816 0.02631579        NaN 0.03571429 0.02777778 
##    STROZZI  TORNABUON 
## 0.03125000 0.03448276
In igraph, closeness() is the reciprocal of the sum of the distances to the other vertices, so 1/(closeness × 15) gives the average shortest-path distance from each family to the other 15 families.

1/(closeness(florence) * 15)
## ACCIAIUOL   ALBIZZI BARBADORI  BISCHERI CASTELLAN    GINORI  GUADAGNI LAMBERTES 
##  2.533333  1.933333  2.133333  2.333333  2.400000  2.800000  2.000000  2.866667 
##    MEDICI     PAZZI   PERUZZI     PUCCI   RIDOLFI  SALVIATI   STROZZI TORNABUON 
##  1.666667  3.266667  2.533333       NaN  1.866667  2.400000  2.133333  1.933333
…
The betweenness() function computes the betweenness measure.

betweenness(florence)
## ACCIAIUOL   ALBIZZI BARBADORI  BISCHERI CASTELLAN    GINORI  GUADAGNI LAMBERTES 
##  0.000000 19.333333  8.500000  9.500000  5.000000  0.000000 23.166667  0.000000 
##    MEDICI     PAZZI   PERUZZI     PUCCI   RIDOLFI  SALVIATI   STROZZI TORNABUON 
## 47.500000  0.000000  2.000000  0.000000 10.333333 13.000000  9.333333  8.333333
To visualize the centrality measures we again use plot() on an igraph object, sizing the vertices by closeness and by betweenness. Because the PUCCI family has no ties, its closeness is NaN, so we remove it with delete.vertices() before the closeness plot.

florence2 <- delete.vertices(florence, "PUCCI")
plot(florence2,
vertex.size = closeness(florence2)*1000, main = "Closeness")
plot(florence,
vertex.size = betweenness(florence), main = "Betweenness")
twitter.following <- read.csv("twitter-following.csv")
twitter.senator <- read.csv("twitter-senator.csv")
follow <- twitter.following
senator <- twitter.senator
head(follow)
##      following        followed
## 1 SenAlexander        RoyBlunt
## 2 SenAlexander     SenatorBurr
## 3 SenAlexander     JohnBoozman
## 4 SenAlexander SenJohnBarrasso
## 5 SenAlexander     SenBennetCO
## 6 SenAlexander     SenDanCoats
head(senator)
##       screen_name            name party state
## 1    SenAlexander Lamar Alexander     R    TN
## 2        RoyBlunt       Roy Blunt     R    MO
## 3    SenatorBoxer   Barbara Boxer     D    CA
## 4 SenSherrodBrown   Sherrod Brown     D    OH
## 5     SenatorBurr    Richard Burr     R    NC
## 6  SenatorBaldwin   Tammy Baldwin     D    WI
We convert the edge list to a matrix with as.matrix() and then apply graph_from_edgelist() with the argument directed set to TRUE.

twitter_adj <- graph_from_edgelist(as.matrix(follow),
                                   directed = TRUE)
…
The degree() function can also be used on directed networks. We set the argument mode to "in" for the in-degree, "out" for the out-degree, or "total" for the sum of the two, and add both measures to senator with mutate().

senator <- senator %>%
mutate(indegree = degree(twitter_adj, mode = "in"),
outdegree = degree(twitter_adj, mode = "out"))
senator %>%
  slice_max(order_by = indegree, n = 3) %>%
  arrange(desc(indegree)) %>%
  select(name, party, state, indegree, outdegree)
##                name party state indegree outdegree
## 1        Tom Cotton     R    AR       64        15
## 2 Richard J. Durbin     D    IL       60        87
## 3     John Barrasso     R    WY       58        79
## 4      Joe Donnelly     D    IN       58         9
## 5    Orrin G. Hatch     R    UT       58        50
…
The closeness() function computes the closeness measure; its mode argument must be set to "in", "out", or "total", and in the last case the calculation ignores edge direction. We first define color and shape scales for the parties.

scale_color_parties <- scale_color_manual("Party",
values = c(R = "red",
D = "blue",
I = "green"),
labels = c(R = "Republican",
D = "Democrat",
I = "Independent"))
scale_shape_parties <- scale_shape_manual("Party",
values = c(R = 16,
D = 17,
I = 4),
labels = c(R = "Republican",
D = "Democrat",
I = "Independent"))
senator %>%
mutate(closeness_in = closeness(twitter_adj, mode = "in"),
closeness_out = closeness(twitter_adj, mode = "out")) %>%
ggplot(aes(closeness_in, closeness_out, color = party, shape = party)) +
geom_point(size=3) +
scale_color_parties +
scale_shape_parties +
labs(title = "Closeness", x = "Incoming path", y = "Outgoing path") +
theme_classic(base_size = 22) +
theme(legend.position = "none")
data.frame(c_in = closeness(twitter_adj, mode = "in"),
c_out = closeness(twitter_adj, mode = "out")) %>%
na.exclude() %>%
cor()
##            c_in      c_out
## c_in  1.0000000 -0.1371397
## c_out -0.1371397  1.0000000
…
The betweenness() function has a directed argument that must be set to TRUE or FALSE.

senator %>%
mutate(betweenness_dir = betweenness(twitter_adj, directed = TRUE),
betweenness_undir = betweenness(twitter_adj, directed = FALSE)) %>%
ggplot(aes(x=betweenness_dir, y=betweenness_undir, color=party, shape=party)) +
geom_point(size=3) +
scale_color_parties +
scale_shape_parties +
labs(title = "Betweenness", x="Directed", y="Undirecetd") +
theme_classic(base_size=22) +
theme(legend.position = "none")
cor(betweenness(twitter_adj, directed = TRUE),
betweenness(twitter_adj, directed = FALSE))
## [1] 0.8203257
The page_rank() function from the igraph package computes the PageRank. It can also be applied to undirected networks by setting the argument directed to FALSE. page_rank() returns a list, and the PageRank values are in the list element called vector. We also use graph_from_data_frame() to create the network directly from the data frame, specifying the edge list, d, and the vertices, vertices.

senator <- senator %>%
mutate(page_rank = page_rank(twitter_adj)[["vector"]])
net <- graph_from_data_frame(d = follow,
vertices = senator, directed = TRUE)
net
## IGRAPH 0e49632 DN-- 91 3859 -- 
## + attr: name (v/c), party (v/c), state (v/c), indegree (v/n), outdegree
## | (v/n), page_rank (v/n)
## + edges from 0e49632 (vertex names):
##  [1] Lamar Alexander->Roy Blunt         Lamar Alexander->Richard Burr     
##  [3] Lamar Alexander->John Boozman      Lamar Alexander->John Barrasso    
##  [5] Lamar Alexander->Michael F. Bennet Lamar Alexander->Daniel Coats     
##  [7] Lamar Alexander->Susan M. Collins  Lamar Alexander->John Cornyn      
##  [9] Lamar Alexander->Bob Corker        Lamar Alexander->Michael B. Enzi  
## [11] Lamar Alexander->Joni Ernst        Lamar Alexander->Chuck Grassley   
## [13] Lamar Alexander->Cory Gardner      Lamar Alexander->Orrin G. Hatch   
## + ... omitted several edges
The E() function returns the edges of the network.

head(E(net))
## + 6/3859 edges from 0e49632 (vertex names):
## [1] Lamar Alexander->Roy Blunt         Lamar Alexander->Richard Burr     
## [3] Lamar Alexander->John Boozman      Lamar Alexander->John Barrasso    
## [5] Lamar Alexander->Michael F. Bennet Lamar Alexander->Daniel Coats     
The V() function returns the vertices.

head(V(net))
## + 6/91 vertices, named, from 0e49632:
## [1] Lamar Alexander Roy Blunt       Barbara Boxer   Sherrod Brown  
## [5] Richard Burr    Tammy Baldwin  
head(V(net)$party)
## [1] "R" "R" "D" "D" "R" "D"
E(net)$weight <- vector with the weights
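As a concrete but purely hypothetical illustration, every following relationship could be given the same weight of 1:

E(net)$weight <- rep(1, ecount(net))  # hypothetical constant weights
head(E(net)$weight)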
col <- senator %>%
mutate(col = case_when(party == "R" ~ "red",
party == "D" ~ "blue",
TRUE ~ "black")) %>%
select(col) %>% pull()
plot(net, vertex.size = V(net)$page_rank * 1000,
vertex.label = NA, vertex.color = col,
edge.arrow.size = 0.1, edge.width = 0.5)
senator %>%
  group_by(party) %>%
  summarize(PageRank = mean(page_rank))
## # A tibble: 3 × 2
##   party PageRank
##   <chr>    <dbl>
## 1 D       0.0109
## 2 I       0.0119
## 3 R       0.0110
PageRank <- function(n, A, d, pr) {
  # n: number of nodes; A: adjacency matrix (or igraph graph);
  # d: damping factor; pr: current vector of PageRank values
  deg <- degree(A, mode = "out")  # out-degree of each node
  for (j in 1:n) {
    pr[j] <- (1 - d)/n + d * sum(A[, j] * pr/deg)
  }
  return(pr)
}
The general form of a while loop is:

while (condition) {
  commands
}
nodes <- 4
adj <- matrix(c(0,1,0,1, 1,0,1,0, 0,1,0,0, 0,1,0,0),
              ncol = nodes, nrow = nodes, byrow = TRUE)
adj
##      [,1] [,2] [,3] [,4]
## [1,]    0    1    0    1
## [2,]    1    0    1    0
## [3,]    0    1    0    0
## [4,]    0    1    0    0
adj <- graph.adjacency(adj)
d <- 0.85
pr <- rep(1/nodes, nodes)
diff <- 100 # a large value to start with
while (diff > 0.001) {
pr.pre <- pr
pr <- PageRank(n=nodes, A=adj, d=d, pr=pr)
diff <- max(abs(pr - pr.pre))
}
pr
## [1] 0.2213090 0.4316623 0.2209565 0.1315563
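As a sanity check (not in the original code), igraph's page_rank() with the same damping factor should return approximately the same values:

page_rank(adj, damping = 0.85)$vector  # adj is the igraph object created above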
data("us.cities", package = "maps")
head(us.cities)
##         name country.etc    pop   lat    long capital
## 1 Abilene TX          TX 113888 32.45  -99.74       0
## 2   Akron OH          OH 206634 41.08  -81.52       0
## 3 Alameda CA          CA  70069 37.77 -122.26       0
## 4  Albany GA          GA  75510 31.58  -84.18       0
## 5  Albany NY          NY  93576 42.67  -73.80       2
## 6  Albany OR          OR  45535 44.62 -123.09       0
We use map_data() from the ggplot2 package to convert the data from the maps package into a structure suitable for ggplot2. The geom_map() function draws the map from the data generated by map_data(), and borders() adds the borders. The coord_quickmap() function applies the map projection.

capitals <- filter(us.cities, capital == 2, !country.etc %in% c("HI", "AK"))
usa_map <- map_data("usa")
ggplot() +
geom_map(map = usa_map) +
borders(database = "usa") +
geom_point(aes(x=long, y=lat, size=pop), data = capitals) +
scale_size_area(guide = "none") +
coord_quickmap() +
theme_void(base_size = 12) +
labs(x=NULL, y=NULL)
cal_cities <- filter(us.cities, country.etc == "CA") %>%
  slice_max(pop, n = 7)

ggplot() +
  borders(database = "state", regions = "California") +
  geom_point(aes(long, lat), data = cal_cities, size = 3) +
  geom_text(aes(long, lat, label = name), data = cal_cities, hjust = -0.05) +
  coord_quickmap() +
  theme_void() +
  labs(x = NULL, y = NULL)
head(usa_map)
##        long      lat group order region subregion
## 1 -101.4078 29.74224     1     1   main      <NA>
## 2 -101.3906 29.74224     1     2   main      <NA>
## 3 -101.3620 29.65056     1     3   main      <NA>
## 4 -101.3505 29.63911     1     4   main      <NA>
## 5 -101.3219 29.63338     1     5   main      <NA>
## 6 -101.3047 29.64484     1     6   main      <NA>
dim(usa_map)
## [1] 7243 6
The colors() function shows the names of the colors that R knows by name.

allcolors <- colors()
head(allcolors)
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1" ## [5] "antiquewhite2" "antiquewhite3"
length(allcolors)
## [1] 657
The rgb() function helps create the hexadecimal color codes from numeric values. It has three arguments, red, green and blue, each taking values from zero to one.

red <- rgb(red = 1, green = 0, blue = 0)
green <- rgb(red = 0, green = 1, blue = 0)
blue <- rgb(red = 0, green = 0, blue = 1)
c(red, green, blue)
## [1] "#FF0000" "#00FF00" "#0000FF"
black <- rgb(red = 0, green = 0, blue = 0)
white <- rgb(red = 1, green = 1, blue = 1)
c(black, white)
## [1] "#000000" "#FFFFFF"
rgb(red = c(0.5,1), green=c(0,1), blue=c(0.5,0))
## [1] "#800080" "#FFFF00"
We can also make colors transparent with rgb(), by giving the alpha argument a value between zero and one.

blue.trans <- rgb(red = 0, green = 0, blue = 1, alpha = 0.5)
black.trans <- rgb(red = 0, green = 0, blue = 0, alpha = 0.5)
c(blue.trans, black.trans)
## [1] "#0000FF80" "#00000080"
In ggplot2, scale_color_identity() and scale_alpha_identity() create color and transparency scales that use the values in the data as they are.

sample_data <- tibble(x = rep(1:4, each = 2), y = x + rep(c(0, 0.2), times = 2),
color = rep(c("#000000", "#0000FF"), each=4),
alpha = c(1,1,0.5,0.5,1,1,0.5,0.5))
sample_data
## # A tibble: 8 × 4
##       x     y color   alpha
##   <int> <dbl> <chr>   <dbl>
## 1     1   1   #000000   1  
## 2     1   1.2 #000000   1  
## 3     2   2   #000000   0.5
## 4     2   2.2 #000000   0.5
## 5     3   3   #0000FF   1  
## 6     3   3.2 #0000FF   1  
## 7     4   4   #0000FF   0.5
## 8     4   4.2 #0000FF   0.5
sample_data %>%
  ggplot(aes(x, y, color = color, alpha = alpha)) +
  geom_point(size = 15) +
  scale_color_identity() +
  scale_alpha_identity() +
  labs(x = NULL, y = NULL) +
  theme_classic()
pres08 <- read.csv("pres08.csv")
pres08 <- pres08 %>%
mutate(Dem = Obama/(Obama+McCain),
Rep = McCain/(Obama+McCain))
cal_color <- pres08 %>%
filter(state == "CA") %>%
mutate(purple_shade = rgb(red=Rep, green=0, blue=Dem)) %>%
select(purple_shade) %>% pull()
ggplot() + borders(database="state", regions = "California", fill="blue") + coord_quickmap() + theme_void()
ggplot() + borders(database="state", regions = "California", fill=cal_color) + coord_quickmap() + theme_void()
pres08 <- pres08 %>% mutate(state = str_to_lower(state.name)) %>%
filter(!state %in% c("hawaii", "d.c.", "alaska"))
states <- map_data("state") %>%
filter(!region %in% c("hawaii", "district of columbia", "alaska")) %>%
full_join(pres08, by = c("region" = "state")) %>%
mutate(party = if_else(Dem > Rep, "Dem", "Rep"),
purple_shade = rgb(red=Rep, green=0, blue=Dem))
glimpse(states)
## Rows: 15,527
## Columns: 14
## $ long         <dbl> -87.46201, -87.48493, -87.52503, -87.53076, -87.57087, -8…
## $ lat          <dbl> 30.38968, 30.37249, 30.37249, 30.33239, 30.32665, 30.3266…
## $ group        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ order        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ region       <chr> "alabama", "alabama", "alabama", "alabama", "alabama", "a…
## $ subregion    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ state.name   <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "A…
## $ Obama        <int> 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 3…
## $ McCain       <int> 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 6…
## $ EV           <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, …
## $ Dem          <dbl> 0.3939394, 0.3939394, 0.3939394, 0.3939394, 0.3939394, 0.…
## $ Rep          <dbl> 0.6060606, 0.6060606, 0.6060606, 0.6060606, 0.6060606, 0.…
## $ party        <chr> "Rep", "Rep", "Rep", "Rep", "Rep", "Rep", "Rep", "Rep", "…
## $ purple_shade <chr> "#9B0064", "#9B0064", "#9B0064", "#9B0064", "#9B0064", "#…
states %>%
ggplot() +
geom_polygon(aes(group=group, x=long, y=lat, fill=party)) +
borders("state") +
coord_quickmap() +
scale_fill_manual(values = c("Rep" = "red", Dem="blue"), guide="none") +
theme_void() +
labs(x=NULL, y=NULL)
states %>%
ggplot() +
geom_polygon(aes(group=group, x=long, y=lat, fill=purple_shade)) +
borders("state") +
coord_quickmap() +
scale_fill_identity() +
theme_void() +
labs(x=NULL, y=NULL)
walmart <- read.csv("walmart.csv")
str(walmart)
## 'data.frame':    3251 obs. of  7 variables:
##  $ opendate  : chr "1962-03-01" "1962-07-01" "1964-08-01" "1965-08-01" ...
##  $ st.address: chr "5801 SW Regional Airport Blvd" "2110 WEST WALNUT" "1417 HWY 62/65 N" "2901 HWY 412 EAST" ...
##  $ city      : chr "Bentonville" "Rogers" "Harrison" "Siloam Springs" ...
##  $ state     : chr "AR" "AR" "AR" "AR" ...
##  $ long      : num -94.2 -94.1 -93.1 -94.5 -92.3 ...
##  $ lat       : num 36.4 36.3 36.2 36.2 34.8 ...
##  $ type      : chr "DistributionCenter" "SuperCenter" "SuperCenter" "SuperCenter" ...
walmart <- walmart %>%
  mutate(opendate = as.Date(opendate))
str(walmart)
## 'data.frame':    3251 obs. of  7 variables:
##  $ opendate  : Date, format: "1962-03-01" "1962-07-01" ...
##  $ st.address: chr "5801 SW Regional Airport Blvd" "2110 WEST WALNUT" "1417 HWY 62/65 N" "2901 HWY 412 EAST" ...
##  $ city      : chr "Bentonville" "Rogers" "Harrison" "Siloam Springs" ...
##  $ state     : chr "AR" "AR" "AR" "AR" ...
##  $ long      : num -94.2 -94.1 -93.1 -94.5 -92.3 ...
##  $ lat       : num 36.4 36.3 36.2 36.2 34.8 ...
##  $ type      : chr "DistributionCenter" "SuperCenter" "SuperCenter" "SuperCenter" ...
head(walmart)
##     opendate                    st.address              city state      long
## 1 1962-03-01 5801 SW Regional Airport Blvd       Bentonville    AR -94.23982
## 2 1962-07-01              2110 WEST WALNUT            Rogers    AR -94.07141
## 3 1964-08-01              1417 HWY 62/65 N          Harrison    AR -93.09345
## 4 1965-08-01             2901 HWY 412 EAST    Siloam Springs    AR -94.50208
## 5 1967-10-01        3801 CAMP ROBINSON RD. North Little Rock    AR -92.30229
## 6 1967-10-01         1621 NORTH BUSINESS 9         Morrilton    AR -92.75858
##        lat               type
## 1 36.35088 DistributionCenter
## 2 36.34224        SuperCenter
## 3 36.23698        SuperCenter
## 4 36.17990        SuperCenter
## 5 34.81327      Wal-MartStore
## 6 35.15649        SuperCenter
walmart <- walmart %>%
mutate(size = if_else(type == "DistributionCenter", 2, 1),
type = recode(type,
"DistributionCenter" = "Distribution \ncenter",
"SuperCenter" = "Supercenter",
"Wal-MartStore" = "Walmart"))
ggplot() +
borders(database = "state") +
geom_point(aes(long,lat, color=type, size=size),
data = walmart, alpha = 1/3) +
coord_quickmap() +
scale_size_identity() +
theme_void(base_size=12) +
labs(color="Type")
walmart.map <- function(data, date){
temp <- filter(data, opendate <= date) %>%
mutate(size = if_else(type == "DistributionCenter", 2, 1))
ggplot() +
borders(database = "state") +
geom_point(aes(long,lat, color=type, size=size),
data = temp, alpha = 1/3) +
coord_quickmap() +
scale_size_identity() +
theme_void(base_size=12) +
labs(color="Type") +
ggtitle(date)
}
library(gridExtra)
grid.arrange(
walmart.map(walmart, as.Date("1974-12-31")),
walmart.map(walmart, as.Date("1984-12-31")),
walmart.map(walmart, as.Date("1994-12-31")),
walmart.map(walmart, as.Date("2004-12-31")),
ncol = 2, nrow=2)
We can animate the map with transition_states() from the gganimate package: the idea is to create one plot per year and show the plots in sequence. The floor_date() function from the lubridate package rounds a date down to the nearest unit, in this case the year. In transition_states() the states argument is set to the year created with floor_date(), shadow_mark() keeps the points from previous years visible, and anim_save() saves the animation as a GIF (or another format).

library(gganimate)
library(lubridate)

walmart <- walmart %>%
  mutate(year = floor_date(opendate, unit = "year"))
walmart_animated <- ggplot() +
borders(database = "state") +
geom_point(aes(long,lat, color=type),
data = walmart) +
coord_quickmap() +
theme_void() +
transition_states(states = year,
transition_length = 0,
state_length = 1) +
shadow_mark()
anim_save("walmart.gif")
walmart_animated