Google Page Rank

library(igraph)
sites_edges <- 
  c("A","B",
  "B","A",
  "B","C",
  "C","A",
  "C","B",
  "C","E",
  "D","A",
  "E","D",
  "E","C",
  "E","B")
net_sites <- graph(sites_edges)
plot(net_sites, layout=layout.circle)

A B C D E
A 0 1 0 0 0
B 1 0 1 0 0
C 1 1 0 0 1
D 1 0 0 0 0
E 0 1 1 1 0
m<-
c(0,1,0,0,0,
  1,0,1,0,0,
  1,1,0,0,1,
  1,0,0,0,0,
  0,1,1,1,0)
m <- matrix(m,ncol = 5,byrow = TRUE)
colnames(m)<-c("A","B","C","D","E")
rownames(m)<-c("A","B","C","D","E")
print(m)
  A B C D E
A 0 1 0 0 0
B 1 0 1 0 0
C 1 1 0 0 1
D 1 0 0 0 0
E 0 1 1 1 0
m<-
m/rowSums(m)
print(m)
          A         B         C         D         E
A 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000
B 0.5000000 0.0000000 0.5000000 0.0000000 0.0000000
C 0.3333333 0.3333333 0.0000000 0.0000000 0.3333333
D 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000
E 0.0000000 0.3333333 0.3333333 0.3333333 0.0000000
p<-m
for(i in 1:1000){
  p<-m%*%p
}
print(p)
          A         B         C          D          E
A 0.2926829 0.3902439 0.2195122 0.02439024 0.07317073
B 0.2926829 0.3902439 0.2195122 0.02439024 0.07317073
C 0.2926829 0.3902439 0.2195122 0.02439024 0.07317073
D 0.2926829 0.3902439 0.2195122 0.02439024 0.07317073
E 0.2926829 0.3902439 0.2195122 0.02439024 0.07317073

B, A, C, E, D

EL Quijote

library(tm)
library(dplyr)
library(tidyr)
library(igraph)

Cargar el libro

libro <- file("borges1.txt")
#libro2 <- file("biblia.txt")
libro_lines <- readLines(libro)
#libro_lines2 <- readLines(libro2)
libro_words <- paste(libro_lines,collapse = " ")
libro_words <- strsplit(libro_words, split = " ") %>% unlist()
#libro_words_2 <- paste(libro_lines2,collapse = " ")
#libro_words_2 <- strsplit(libro_words_2, split = " ") %>% unlist()
#libro_words <- c(libro_words,libro_words_2)
str(libro_words)
 chr [1:3581] "CENTENARIO" "DE" "BORGES" "" "" "\"Mi" "entraÒable" "seÒor" ...

palabras unicas

length(unique(libro_words))
[1] 1133
table(libro_words) %>% as.data.frame()

Filtros

libro_words <- sapply(libro_words,"removePunctuation",USE.NAMES = FALSE)
libro_words <- sapply(libro_words,"tolower",USE.NAMES = FALSE)
libro_words <- sapply(libro_words,"stripWhitespace",USE.NAMES = FALSE)
libro_words <- sapply(libro_words,"removeNumbers",USE.NAMES = FALSE)
# libro_words <- sapply(libro_words,"removeWords",words=stopwords('spanish'), USE.NAMES = FALSE)
libro_words <- libro_words[libro_words!=""]
libro_words <- libro_words[libro_words!=" "]

bigramas

bigrams<-
lapply(ngrams(libro_words,2), paste, collapse=" ") %>% unlist()
bigrams<-
table(bigrams) %>% as.data.frame()
bigrams <- bigrams %>% separate(bigrams,into=c("word1","word2"),sep=" ") 
muestra_words <- bigrams %>% filter(Freq>10)
libro_vertices<-
paste(muestra_words$word1,muestra_words$word2) %>% strsplit(" ") %>% unlist()
g1<-
graph(libro_vertices)
plot(g1,edge.arrow.size = 0.3,
     arrow.width = 0.1,
     vertex.size = 5,
     vertex.label.cex = 0.8)

next_word <- function(palabra){
  top_words<-
  bigrams %>% 
    filter(word1 == palabra) %>% 
    arrange(desc(Freq)) %>% 
    select(word2,Freq) 
  total <- sum(top_words$Freq)
  top_words$Freq<- top_words$Freq/total
  sample(top_words$word2,size = 1,prob = top_words$Freq) %>% return()
}
next_word("dulcinea")
[1] "del"
sentence <- function(seed="don",n=10){
  oracion <- seed
  for(i in 1:n){
    oracion <- c( oracion,next_word(oracion[i]) )
  }
  paste(oracion,collapse = " ") %>% return()
}
sentence("cervantes",n = 25)
[1] "cervantes lo amo a travès de don quijote creo que los repitiera profusamente entonces tambièn debe haber pensado en el prìncipe hamlet la noche scherezade empieza"

Tarea

  1. Armar la cadena de markov para el Quijote.
LS0tCnRpdGxlOiAiQXBsaWNhY2lvbmVzIGRlIENhZGVuYXMgZGUgTWFya292IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgoKIyMgR29vZ2xlIFBhZ2UgUmFuawoKYGBge3J9CmxpYnJhcnkoaWdyYXBoKQpzaXRlc19lZGdlcyA8LSAKICBjKCJBIiwiQiIsCiAgIkIiLCJBIiwKICAiQiIsIkMiLAogICJDIiwiQSIsCiAgIkMiLCJCIiwKICAiQyIsIkUiLAogICJEIiwiQSIsCiAgIkUiLCJEIiwKICAiRSIsIkMiLAogICJFIiwiQiIpCgpuZXRfc2l0ZXMgPC0gZ3JhcGgoc2l0ZXNfZWRnZXMpCnBsb3QobmV0X3NpdGVzLCBsYXlvdXQ9bGF5b3V0LmNpcmNsZSkKYGBgCgoKfHxBfEJ8Q3xEfEV8CnwtLXwtLXwtLXwtLXwtLXwtLXwKfEF8MHwxfDB8MHwwfAp8QnwxfDB8MXwwfDB8CnxDfDF8MXwwfDB8MXwKfER8MXwwfDB8MHwwfAp8RXwwfDF8MXwxfDB8CgoKYGBge3J9Cm08LQpjKDAsMSwwLDAsMCwKICAxLDAsMSwwLDAsCiAgMSwxLDAsMCwxLAogIDEsMCwwLDAsMCwKICAwLDEsMSwxLDApCm0gPC0gbWF0cml4KG0sbmNvbCA9IDUsYnlyb3cgPSBUUlVFKQpjb2xuYW1lcyhtKTwtYygiQSIsIkIiLCJDIiwiRCIsIkUiKQpyb3duYW1lcyhtKTwtYygiQSIsIkIiLCJDIiwiRCIsIkUiKQpwcmludChtKQpgYGAKCgoKYGBge3J9Cm08LQptL3Jvd1N1bXMobSkKCnByaW50KG0pCmBgYAoKCmBgYHtyfQpwPC1tCmZvcihpIGluIDE6MTAwMCl7CiAgcDwtbSUqJXAKfQpwcmludChwKQpgYGAKCkIsIEEsIEMsIEUsIEQgCgoKIyMgRUwgUXVpam90ZQoKYGBge3J9CmxpYnJhcnkodG0pCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodGlkeXIpCmxpYnJhcnkoaWdyYXBoKQpgYGAKCgojIyMgQ2FyZ2FyIGVsIGxpYnJvCgpgYGB7cn0KbGlicm8gPC0gZmlsZSgiYm9yZ2VzMS50eHQiKQojbGlicm8yIDwtIGZpbGUoImJpYmxpYS50eHQiKQpsaWJyb19saW5lcyA8LSByZWFkTGluZXMobGlicm8pCiNsaWJyb19saW5lczIgPC0gcmVhZExpbmVzKGxpYnJvMikKbGlicm9fd29yZHMgPC0gcGFzdGUobGlicm9fbGluZXMsY29sbGFwc2UgPSAiICIpCmxpYnJvX3dvcmRzIDwtIHN0cnNwbGl0KGxpYnJvX3dvcmRzLCBzcGxpdCA9ICIgIikgJT4lIHVubGlzdCgpCiNsaWJyb193b3Jkc18yIDwtIHBhc3RlKGxpYnJvX2xpbmVzMixjb2xsYXBzZSA9ICIgIikKI2xpYnJvX3dvcmRzXzIgPC0gc3Ryc3BsaXQobGlicm9fd29yZHNfMiwgc3BsaXQgPSAiICIpICU+JSB1bmxpc3QoKQojbGlicm9fd29yZHMgPC0gYyhsaWJyb193b3JkcyxsaWJyb193b3Jkc18yKQpzdHIobGlicm9fd29yZHMpCmBgYAoKcGFsYWJyYXMgdW5pY2FzCmBgYHtyfQpsZW5ndGgodW5pcXVlKGxpYnJvX3dvcmRzKSkKYGBgCgpgYGB7cn0KdGFibGUobGlicm9fd29yZHMpICU+JSBhcy5kYXRhLmZyYW1lKCkKYGBgCgogIAogIAogRmlsdHJvcyAKYGBge3J9CmxpYnJvX3dvcmRzIDwtIHNhcHBseShsaWJyb193b3JkcywicmVtb3ZlUHVuY3R1YXRpb24iLFVTRS5OQU1FUyA9IEZBTFNFKQpsaWJyb193b3JkcyA8LSBzYXBwbHkobGlicm9fd29yZHMsInRvbG93ZXIiLFVTRS5OQU1FUyA9IEZBTFNFKQpsaWJyb193b3JkcyA8LSBzYXBwbHkobGlicm9fd29yZHMsInN0cmlwV2hpdGVzcGFjZSIsVVNFLk5BTUVTID0gRkFMU0UpCmxpYnJvX3dvcmRzIDwtIHNhcHBseShsaWJyb193b3JkcywicmVtb3ZlTnVtYmVycyIsVVNFLk5BTUVTID0gRkFMU0UpCiMgbGlicm9fd29yZHMgPC0gc2FwcGx5KGxpYnJvX3dvcmRzLCJyZW1vdmVXb3JkcyIsd29yZHM9c3RvcHdvcmRzKCdzcGFuaXNoJyksIFVTRS5OQU1FUyA9IEZBTFNFKQpsaWJyb193b3JkcyA8LSBsaWJyb193b3Jkc1tsaWJyb193b3JkcyE9IiJdCmxpYnJvX3dvcmRzIDwtIGxpYnJvX3dvcmRzW2xpYnJvX3dvcmRzIT0iICJdCgpgYGAKCmJpZ3JhbWFzCmBgYHtyfQpiaWdyYW1zPC0KbGFwcGx5KG5ncmFtcyhsaWJyb193b3JkcywyKSwgcGFzdGUsIGNvbGxhcHNlPSIgIikgJT4lIHVubGlzdCgpCgpiaWdyYW1zPC0KdGFibGUoYmlncmFtcykgJT4lIGFzLmRhdGEuZnJhbWUoKQoKYmlncmFtcyA8LSBiaWdyYW1zICU+JSBzZXBhcmF0ZShiaWdyYW1zLGludG89Yygid29yZDEiLCJ3b3JkMiIpLHNlcD0iICIpIAoKYGBgCgpgYGB7cn0KCm11ZXN0cmFfd29yZHMgPC0gYmlncmFtcyAlPiUgZmlsdGVyKEZyZXE+MTApCgpsaWJyb192ZXJ0aWNlczwtCnBhc3RlKG11ZXN0cmFfd29yZHMkd29yZDEsbXVlc3RyYV93b3JkcyR3b3JkMikgJT4lIHN0cnNwbGl0KCIgIikgJT4lIHVubGlzdCgpCgpnMTwtCmdyYXBoKGxpYnJvX3ZlcnRpY2VzKQoKcGxvdChnMSxlZGdlLmFycm93LnNpemUgPSAwLjMsCiAgICAgYXJyb3cud2lkdGggPSAwLjEsCiAgICAgdmVydGV4LnNpemUgPSA1LAogICAgIHZlcnRleC5sYWJlbC5jZXggPSAwLjgpCgpgYGAKCgpgYGB7cn0KbmV4dF93b3JkIDwtIGZ1bmN0aW9uKHBhbGFicmEpewogIHRvcF93b3JkczwtCiAgYmlncmFtcyAlPiUgCiAgICBmaWx0ZXIod29yZDEgPT0gcGFsYWJyYSkgJT4lIAogICAgYXJyYW5nZShkZXNjKEZyZXEpKSAlPiUgCiAgICBzZWxlY3Qod29yZDIsRnJlcSkgCiAgdG90YWwgPC0gc3VtKHRvcF93b3JkcyRGcmVxKQogIHRvcF93b3JkcyRGcmVxPC0gdG9wX3dvcmRzJEZyZXEvdG90YWwKICBzYW1wbGUodG9wX3dvcmRzJHdvcmQyLHNpemUgPSAxLHByb2IgPSB0b3Bfd29yZHMkRnJlcSkgJT4lIHJldHVybigpCn0KCgpuZXh0X3dvcmQoImR1bGNpbmVhIikKYGBgCgoKCmBgYHtyfQpzZW50ZW5jZSA8LSBmdW5jdGlvbihzZWVkPSJkb24iLG49MTApewogIG9yYWNpb24gPC0gc2VlZAogIGZvcihpIGluIDE6bil7CiAgICBvcmFjaW9uIDwtIGMoIG9yYWNpb24sbmV4dF93b3JkKG9yYWNpb25baV0pICkKICB9CiAgcGFzdGUob3JhY2lvbixjb2xsYXBzZSA9ICIgIikgJT4lIHJldHVybigpCn0KCgpzZW50ZW5jZSgiY2VydmFudGVzIixuID0gMjUpCmBgYAoKCiNUYXJlYQoxLiBBcm1hciBsYSBjYWRlbmEgZGUgbWFya292IHBhcmEgZWwgUXVpam90ZS4KCgoK