library(tm)
library(dplyr)
library(tidyr)
# Read the text of "quijote.txt" into a character vector, one element per
# line. The connection is opened with encoding = "UTF-8" so the input is
# decoded as UTF-8 while reading.
quijote <- file("quijote.txt", "r", encoding = "UTF-8")
# Close the connection as soon as reading finishes (even if readLines()
# fails); an open connection left behind is only reclaimed by the garbage
# collector, which emits a "closing unused connection" warning.
quijote_lines <- tryCatch(
  readLines(quijote),
  finally = close(quijote)
)
#############Parte 1################
#quijote_words<-paste(quijote_lines,collapse = " ")
#quijote_words<-strsplit(quijote_words,split = " ")
#length(quijote_words[[1]]) #num pal
#length(unique(quijote_words[[1]])) #num pal unique
#quijote_words<-sapply(quijote_words[[1]],"removePunctuation",USE.NAMES = FALSE)
#quijote_words<-sapply(quijote_words,"tolower",USE.NAMES = FALSE)
#quijote_words<-sapply(quijote_words,"stripWhitespace",USE.NAMES = FALSE)
#quijote_words<-sapply(quijote_words,"removeNumbers",USE.NAMES = FALSE)
#quijote_words<-quijote_words[quijote_words!=""]
#table(quijote_words) %>% as.data.frame() %>% View()
#############Parte 2################
# Wrap the vector of lines as a tm source and build the corpus from it.
quijote_src <- VectorSource(x = quijote_lines)
quijote_corpus <- VCorpus(x = quijote_src)
# Normalise a tm corpus before tokenisation.
#
# corpus: a tm corpus (for example the result of VCorpus()).
#
# Returns the corpus after removing punctuation and digits, lower-casing the
# text and collapsing runs of whitespace.
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  # Collapse whitespace last: removing punctuation and digits deletes
  # characters without closing the gap, so stripping whitespace first would
  # leave doubled blanks in the cleaned text.
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}
# Replace the raw corpus with its cleaned version.
quijote_corpus <- clean_corpus(corpus = quijote_corpus)
#quijote_dtm<-DocumentTermMatrix(quijote_corpus)
#quijote_dtm_m<-as.matrix(quijote_dtm)
#quijote_tdm<-TermDocumentMatrix(quijote_corpus)
#quijote_tdm_m<-as.matrix(quijote_tdm)
# Tokenizer that yields bigrams instead of single words.
#
# x: a document whose words can be extracted with words().
#
# Returns an unnamed character vector with each pair of consecutive words
# joined by a single space.
BigramTokenizer <- function(x) {
  tokens <- words(x)
  pairs <- ngrams(tokens, 2)
  joined <- lapply(pairs, function(pair) paste(pair, collapse = " "))
  unlist(joined, use.names = FALSE)
}
# Build the term-document matrix using bigrams as terms.
tdm <- TermDocumentMatrix(
  quijote_corpus,
  control = list(tokenize = BigramTokenizer)
)
# NOTE: the console warning "closing unused connection 4 (quijote.txt)" had
# been pasted here as bare text, which made the script unparseable; it is
# preserved only as this comment.
#m<-as.matrix(tdm)
#rowSums(as.matrix(tdm)) %>% as.data.frame() %>% View
#bigram<-as.data.frame(bigram)
#names(bigram)<-c("frec","bigram")
#bigram<-as.data.frame((bigram))
#bigram$bigram<-as.character((bigram$bigram))
#bigram<-separate(bigram,bigram,into = c("word1", "word2"))
#sentence<- function(palabra = "venta", n = 10){
# oracion <- palabra
#for(i in 1:n){
# NextWord<-bigram %>%
#filter(word1 == palabra) %>%
#arrange(desc(freq)) %>% head(1) %>% select(word2) %>% as.character()
#paste(c(oracion,NextWord),collapse = " ")
#palabra<-NextWord
#}
#}
# Generate a sentence by walking a bigram frequency table.
#
# Starting from `palabra`, each step looks up the bigrams whose first word is
# the current word, keeps the 5 most frequent, and draws the next word at
# random with probability proportional to its frequency.
#
# palabra: starting word (character scalar).
# n:       maximum number of words to append.
# bigrams: data frame with columns `word1`, `word2` and `freq`; defaults to
#          the global `bigram`, which is what this function used to read
#          directly.
#
# Returns the generated sentence as a single string. Generation stops early
# when the current word has no known successor.
sentence_maxpro <- function(palabra = "venta", n = 10, bigrams = bigram) {
  oracion <- palabra
  # seq_len() makes n = 0 a no-op; 1:0 would run the body twice.
  for (i in seq_len(n)) {
    # which() drops rows where word1 is NA instead of selecting them.
    candidatos <- bigrams[which(bigrams$word1 == palabra), , drop = FALSE]
    if (nrow(candidatos) == 0L) {
      # Dead end: nothing follows this word, so there is nothing to sample.
      break
    }
    orden <- order(-candidatos$freq)
    candidatos <- head(candidatos[orden, , drop = FALSE], 5L)
    elegido <- sample.int(
      nrow(candidatos),
      size = 1L,
      prob = candidatos$freq / sum(candidatos$freq)
    )
    # as.character() guards against word2 being stored as a factor.
    palabra <- as.character(candidatos$word2[elegido])
    # Keep the accumulated sentence; the paste() result used to be discarded.
    oracion <- paste(oracion, palabra)
  }
  oracion
}
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkodG0pDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeSh0aWR5cikNCnF1aWpvdGU8LWZpbGUoInF1aWpvdGUudHh0IiwiciIsZW5jb2RpbmcgPSAiVVRGLTgiKQ0KcXVpam90ZV9saW5lczwtcmVhZExpbmVzKHF1aWpvdGUpDQoNCiMjIyMjIyMjIyMjIyNQYXJ0ZSAxIyMjIyMjIyMjIyMjIyMjIw0KDQojcXVpam90ZV93b3JkczwtcGFzdGUocXVpam90ZV9saW5lcyxjb2xsYXBzZSA9ICIgIikNCiNxdWlqb3RlX3dvcmRzPC1zdHJzcGxpdChxdWlqb3RlX3dvcmRzLHNwbGl0ID0gIiAiKQ0KI2xlbmd0aChxdWlqb3RlX3dvcmRzW1sxXV0pICNudW0gcGFsDQojbGVuZ3RoKHVuaXF1ZShxdWlqb3RlX3dvcmRzW1sxXV0pKSAjbnVtIHBhbCB1bmlxdWUNCg0KI3F1aWpvdGVfd29yZHM8LXNhcHBseShxdWlqb3RlX3dvcmRzW1sxXV0sInJlbW92ZVB1bmN0dWF0aW9uIixVU0UuTkFNRVMgPSBGQUxTRSkgDQojcXVpam90ZV93b3Jkczwtc2FwcGx5KHF1aWpvdGVfd29yZHMsInRvbG93ZXIiLFVTRS5OQU1FUyA9IEZBTFNFKQ0KI3F1aWpvdGVfd29yZHM8LXNhcHBseShxdWlqb3RlX3dvcmRzLCJzdHJpcFdoaXRlc3BhY2UiLFVTRS5OQU1FUyA9IEZBTFNFKQ0KI3F1aWpvdGVfd29yZHM8LXNhcHBseShxdWlqb3RlX3dvcmRzLCJyZW1vdmVOdW1iZXJzIixVU0UuTkFNRVMgPSBGQUxTRSkNCiNxdWlqb3RlX3dvcmRzPC1xdWlqb3RlX3dvcmRzW3F1aWpvdGVfd29yZHMhPSIiXQ0KICANCiN0YWJsZShxdWlqb3RlX3dvcmRzKSAlPiUgYXMuZGF0YS5mcmFtZSgpICU+JSBWaWV3KCkNCiAgICAgICAgDQojIyMjIyMjIyMjIyMjUGFydGUgMiMjIyMjIyMjIyMjIyMjIyMNCg0KcXVpam90ZV9zcmM8LVZlY3RvclNvdXJjZShxdWlqb3RlX2xpbmVzKQ0KcXVpam90ZV9jb3JwdXM8LVZDb3JwdXMocXVpam90ZV9zcmMpDQoNCmNsZWFuX2NvcnB1czwtZnVuY3Rpb24oY29ycHVzKXsNCiAgY29ycHVzPC10bV9tYXAoY29ycHVzLHN0cmlwV2hpdGVzcGFjZSkNCiAgY29ycHVzPC10bV9tYXAoY29ycHVzLHJlbW92ZVB1bmN0dWF0aW9uKQ0KICBjb3JwdXM8LXRtX21hcChjb3JwdXMsY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkNCiAgY29ycHVzPC10bV9tYXAoY29ycHVzLHJlbW92ZU51bWJlcnMpDQogIHJldHVybihjb3JwdXMpDQp9DQoNCnF1aWpvdGVfY29ycHVzPC1jbGVhbl9jb3JwdXMocXVpam90ZV9jb3JwdXMpDQoNCiNxdWlqb3RlX2R0bTwtRG9jdW1lbnRUZXJtTWF0cml4KHF1aWpvdGVfY29ycHVzKQ0KI3F1aWpvdGVfZHRtX208LWFzLm1hdHJpeChxdWlqb3RlX2R0bSkNCg0KI3F1aWpvdGVfdGRtPC1UZXJtRG9jdW1lbnRNYXRyaXgocXVpam90ZV9jb3JwdXMpDQojcXVpam90ZV90ZG1fbTwtYXMubWF0cml4KHF1aWpvdGVfdGRtKQ0KDQpCaWdyYW1Ub2tlbml6ZXI8LWZ1bmN0aW9uICh4KXsNCiAgdW5saXN0KGxhcHBseShuZ3JhbXMo
d29yZHMoeCksMikscGFzdGUsY29sbGFwc2UgPSAiICIpLHVzZS5uYW1lcyA9IEZBTFNFKQ0KfQ0KDQp0ZG08LVRlcm1Eb2N1bWVudE1hdHJpeChxdWlqb3RlX2NvcnB1cyxjb250cm9sID0gbGlzdCh0b2tlbml6ZSA9IEJpZ3JhbVRva2VuaXplcikpDQojbTwtYXMubWF0cml4KHRkbSkNCiNyb3dTdW1zKGFzLm1hdHJpeCh0ZG0pKSAlPiUgYXMuZGF0YS5mcmFtZSgpICU+JSBWaWV3DQoNCiNiaWdyYW08LWFzLmRhdGEuZnJhbWUoYmlncmFtKQ0KDQojbmFtZXMoYmlncmFtKTwtYygiZnJlYyIsImJpZ3JhbSIpDQojYmlncmFtPC1hcy5kYXRhLmZyYW1lKChiaWdyYW0pKQ0KDQojYmlncmFtJGJpZ3JhbTwtYXMuY2hhcmFjdGVyKChiaWdyYW0kYmlncmFtKSkNCg0KI2JpZ3JhbTwtc2VwYXJhdGUoYmlncmFtLGJpZ3JhbSxpbnRvID0gYygid29yZDEiLCAid29yZDIiKSkNCg0KDQojc2VudGVuY2U8LSBmdW5jdGlvbihwYWxhYnJhID0gInZlbnRhIiwgbiA9IDEwKXsNCiAjIG9yYWNpb24gPC0gcGFsYWJyYQ0KICAjZm9yKGkgaW4gMTpuKXsNCiAgICMgTmV4dFdvcmQ8LWJpZ3JhbSAlPiUgDQogICAgI2ZpbHRlcih3b3JkMSA9PSBwYWxhYnJhKSAlPiUgDQogICAgI2FycmFuZ2UoZGVzYyhmcmVxKSkgJT4lIGhlYWQoMSkgJT4lIHNlbGVjdCh3b3JkMikgJT4lIGFzLmNoYXJhY3RlcigpDQogICNwYXN0ZShjKG9yYWNpb24sTmV4dFdvcmQpLGNvbGxhcHNlID0gIiAiKQ0KICAjcGFsYWJyYTwtTmV4dFdvcmQNCiAgI30NCiN9DQoNCnNlbnRlbmNlX21heHBybzwtIGZ1bmN0aW9uKHBhbGFicmEgPSAidmVudGEiLCBuID0gMTApew0KICBvcmFjaW9uIDwtIHBhbGFicmENCiAgZm9yKGkgaW4gMTpuKXsNCiAgICBOZXh0V29yZDwtYmlncmFtICU+JSANCiAgICBmaWx0ZXIod29yZDEgPT0gcGFsYWJyYSkgJT4lIA0KICAgIGFycmFuZ2UoZGVzYyhmcmVxKSkgJT4lIGhlYWQoNSkgDQogICAgTmV4dFdvcmQ8LXNhbXBsZShOZXh0V29yZCR3b3JkMixzaXplID0gMSwgcHJvYiA9IE5leHRXb3JkJGZyZXEvc3VtKE5leHRXb3JkJGZyZXEpKQ0KICAgIA0KICAgIA0KICAgIA0KICBwYXN0ZShjKG9yYWNpb24sTmV4dFdvcmQpLGNvbGxhcHNlID0gIiAiKQ0KICBwYWxhYnJhPC1OZXh0V29yZA0KICB9DQp9DQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQpgYGANCg0K