Bruno González
Nov 22nd, 2018
The data is loaded with the following code:
# Load the three en_US corpora (blogs, news, twitter) into data_enb, data_enn
# and data_ent.
# NOTE(review): `fileurl`, `temp` and `n_upload` are not defined in this
# section; they are assumed to be set earlier in the document.
if (!dir.exists("./data")) {
  # No local ./data directory: download the archive and read at most
  # `n_upload` lines of each member straight out of the zip file.
  download.file(fileurl, destfile = temp, method = "curl")

  # Each unz() connection is kept in its own variable and closed explicitly.
  # Overwriting the only reference to the connection with the lines read from
  # it leaves the connection to be reclaimed by the garbage collector, which
  # emits "closing unused connection" warnings.
  con_enb <- unz(description = temp, filename = "final/en_US/en_US.blogs.txt")
  data_enb <- readLines(con_enb, n_upload)
  close(con_enb)

  con_enn <- unz(description = temp, filename = "final/en_US/en_US.news.txt")
  data_enn <- readLines(con_enn, n_upload)
  close(con_enn)

  con_ent <- unz(description = temp, filename = "final/en_US/en_US.twitter.txt")
  data_ent <- readLines(con_ent, n_upload)
  close(con_ent)
} else {
  # A local copy exists: read every line of each file as UTF-8.
  con_enb <- file("./data/final/en_US/en_US.blogs.txt", encoding = "UTF-8")
  con_enn <- file("./data/final/en_US/en_US.news.txt", encoding = "UTF-8")
  con_ent <- file("./data/final/en_US/en_US.twitter.txt", encoding = "UTF-8")

  data_enb <- readLines(con_enb, encoding = "UTF-8")
  data_enn <- readLines(con_enn, encoding = "UTF-8")
  data_ent <- readLines(con_ent, encoding = "UTF-8")

  close(con_enb)
  close(con_enn)
  close(con_ent)
}
#' Build a next-word lookup table from bigram frequencies.
#'
#' The text is split into lower-cased word tokens. Tokens containing a digit,
#' or not starting with an ASCII letter, are dropped. Consecutive remaining
#' tokens are paired into bigrams; because all lines are flattened into one
#' token vector first, bigrams may span line boundaries. For every first word,
#' the follower(s) with the highest relative frequency
#' (bigram count / count of the first word) are kept; ties are all kept.
#'
#' @param dat Character vector of text lines.
#' @return A table with columns `wrd` (the first word) and `wpred` (the
#'   predicted next word). It has zero rows when fewer than two tokens
#'   survive the filtering.
ftoken2 <- function(dat) {
  tok <- str_split(dat, boundary("word")) %>%
    unlist() %>%
    tolower()
  tok <- tok[!str_detect(tok, "[:digit:]")]
  tok <- tok[str_detect(tok, "^[a-zA-Z]")]

  # A bigram needs two tokens. Without this guard `1:(length(tok) - 1)` counts
  # downwards and produces either a bogus NA bigram or a subscript error.
  n_tok <- length(tok)
  if (n_tok < 2L) {
    return(tibble(wrd = character(0), wpred = character(0)))
  }

  # Pair every token with its successor in one vectorised call instead of
  # growing the bigram vector element by element.
  first_words <- tok[-n_tok]
  bigrams <- str_c(first_words, tok[-1L], sep = " ")

  data.frame(tok = first_words, tok2 = bigrams) %>%
    group_by(tok) %>%
    mutate(freqt = n()) %>% # occurrences of the first word
    ungroup() %>%
    group_by(tok2) %>%
    mutate(freq = n() / freqt) %>% # relative frequency of the bigram
    summarize(tok = nth(tok, 1), freq = max(freq)) %>%
    ungroup() %>%
    group_by(tok) %>%
    mutate(rank = rank(-freq, ties.method = "min")) %>%
    ungroup() %>%
    filter(rank == 1) %>% # keep the most frequent follower(s) per word
    select(tok2) %>%
    separate(tok2, sep = " ", into = c("wrd", "wpred"))
}
#' Predict the next word for the last word of `text`.
#'
#' Looks the last whitespace-separated word of `text` (lower-cased) up in the
#' `wrd` column of `mat` and returns the matching `wpred` values.
#' NOTE(review): `mat` is not defined inside this function; it is read from
#' the enclosing environment and is presumably the table produced by
#' `ftoken2()` -- confirm.
#'
#' @param text A single character string typed by the user.
#' @return The rows of `mat` whose `wrd` equals the last word, restricted to
#'   the `wpred` column (zero rows when there is no match).
shiny <- function(text) {
  # Treat missing input (NULL / character(0)) as an empty string rather than
  # failing with a subscript error.
  if (length(text) == 0L) {
    text <- ""
  }

  # Trim first and split on runs of whitespace so that a trailing space
  # ("i am ") still yields "am" instead of an empty last word.
  words <- str_split(str_trim(tolower(text[[1]])), "\\s+")[[1]]
  last_word <- words[length(words)]

  # Return the result visibly; ending on an assignment hides it at the console.
  mat %>%
    filter(wrd == last_word) %>%
    select(wpred)
}