17 de mayo de 2019
The data was provided with the course material and consists of 3 files from 3 different sources (news, blogs and tweets). This data was cleaned with the following functions, which clean the text and create a simplified file that is later used to build the n-grams:
# textcleaning: clean a piece of text before it is written to the tidy file.
#
# x: the text to clean.
#    NOTE(review): presumably a character vector (one or more lines); the body
#    is elided ("[...]") in this listing, so confirm against the full source.
# Returns `textclean`, the result of the (elided) cleaning steps below.
textcleaning <- function(x){
# Step 1: convert to lower case, remove numbers, ... (list truncated by the author)
[...]
# Step 2: replace words that contain "@", "#", "http://", "https://"
[...]
# Step 3: replace all non-alphanumeric characters at the beginning/end of a
# word with a space.
[...]
return(textclean)
}
# SuperTidyFile: read the .txt files in a directory, clean them and write the
# cleaned text to a single output file while processing.
#
# filesdir: directory that is searched for input text files.
# output:   path of the cleaned file; it is (re)created empty by file.create().
# Returns a status string (in Spanish) that contains the output path.
#
# NOTE(review): the loop body is elided in this listing, which is why the
# braces below look unbalanced; confirm against the full source.
# NOTE(review): list.files() treats `pattern` as a regular expression, not a
# glob, so "*.txt" is not anchored to the file extension; "\\.txt$" is the
# usual spelling. Confirm before changing.
SuperTidyFile <- function(filesdir = "./data/en_US", output = "./data/SuperTidyFile.txt") {
files <- list.files(filesdir, pattern = "*.txt", full.names = TRUE)
file.create(output)
for (f in files){# apply textcleaning to each line and write it to the output file (body elided)
}
}
return(paste0("Has creado un SuperTidyFile en ", output))
}
Using the SuperTidyFile produced above, several tests were performed before arriving at the following function, which creates a simple, readable file (.rds) containing a data frame of n-grams (3-grams) and the number of times each combination appears in the SuperTidyFile. Here is the code that creates the n-gram file, in .rds format, which is read when the Shiny app runs:
# SuperTidyNgram: build an n-gram table from the cleaned text file and save it
# as an .rds file.
#
# stf:       path of the cleaned text file to read.
# output:    path of the .rds file that receives the resulting table.
# Ngram:     n-gram size passed to unnest_tokens() (default 3).
# samplediv: divisor applied to the number of lines to get the sample size;
#            1 keeps every line, 10 keeps about one tenth of them.
# Returns a status string (in Spanish) that contains the output path.
#
# NOTE(review): tibble(), sample_n(), %>% and unnest_tokens() are not loaded in
# this listing; presumably dplyr/tibble and tidytext are attached elsewhere.
SuperTidyNgram <- function(stf = "./data/SuperTidyFile.txt", output = "./data/SuperTidyNgram.rds",
Ngram = 3, samplediv = 1){
# NOTE(review): the connection is closed only if readLines() succeeds, and
# 1:length(LINES) yields c(1, 0) for an empty file (seq_along() would not).
con <- file(stf, open="r"); LINES <- readLines(con); close(con) # read all lines (author's note: more than 4,000,000 lines)
LINEStib <- tibble(line = 1:length(LINES), words = LINES)
nsamples <- round(dim(LINEStib)[1]/samplediv) # number of rows to sample: row count divided by samplediv
# Random sample of rows, so the result differs between runs unless a seed is set.
tbnS <- sample_n(LINEStib, nsamples)
# The token column is always named `trigram`, whatever value Ngram has.
tbn3 <- tbnS %>%
unnest_tokens(trigram, words, token = "ngrams", n = Ngram) %>% [...]# further pipeline steps elided in this listing
saveRDS(tbn3, file = output) # save the n-gram table (English 3-grams by default)
return(paste0("Has creado un SuperTidyNgram en ", output))
}
#This is the function used to create the data table shown:
#' Look up candidate continuations for an unfinished phrase
#'
#' Keeps the rows of the n-gram table whose first two words are equal to the
#' last two words of the phrase.
#'
#' @param x Data frame of n-grams with (at least) the columns `word1` and
#'   `word2`.
#' @param y Unfinished phrase, given as a single string.
#' @return The matching rows of `x`, in their original order. A zero-row data
#'   frame with the same columns is returned when the phrase has fewer than
#'   two words or nothing matches.
nextword <- function(x, y) {
  phrase <- if (length(y) > 0L) as.character(y)[[1L]] else NA_character_
  tokens <- if (is.na(phrase)) {
    character(0)
  } else {
    # Split on runs of whitespace so leading, trailing or doubled spaces do
    # not produce empty "words".
    strsplit(trimws(phrase), "[[:space:]]+")[[1L]]
  }
  tokens <- tokens[nzchar(tokens)]
  n_tokens <- length(tokens)

  # Without two words there is nothing to match; indexing with an NA
  # condition would return rows filled with NA instead of an empty table.
  if (n_tokens < 2L) {
    return(x[0L, , drop = FALSE])
  }

  # which() discards NA comparisons (e.g. NA entries in word1/word2), which
  # would otherwise show up as all-NA rows in the result.
  hits <- which(x$word1 == tokens[n_tokens - 1L] & x$word2 == tokens[n_tokens])
  x[hits, , drop = FALSE]
}
## Load libraries
library(shiny)
# Read the serialized n-gram object from the working directory; the server
# function passes it to nextword() as the lookup table.
# NOTE(review): SuperTidyNgram() writes to "./data/SuperTidyNgram.rds" by
# default, while this reads "./SuperTidyNgram.Rds" (different folder and
# extension case) - confirm the file is copied/renamed accordingly, since the
# case matters on case-sensitive file systems.
stn <- readRDS("./SuperTidyNgram.Rds")
#' Look up candidate continuations for an unfinished phrase
#'
#' Keeps the rows of the n-gram table whose first two words are equal to the
#' last two words of the phrase.
#'
#' @param x Data frame of n-grams with (at least) the columns `word1` and
#'   `word2`.
#' @param y Unfinished phrase, given as a single string.
#' @return The matching rows of `x`, in their original order. A zero-row data
#'   frame with the same columns is returned when the phrase has fewer than
#'   two words or nothing matches.
nextword <- function(x, y) {
  phrase <- if (length(y) > 0L) as.character(y)[[1L]] else NA_character_
  tokens <- if (is.na(phrase)) {
    character(0)
  } else {
    # Split on runs of whitespace so leading, trailing or doubled spaces do
    # not produce empty "words".
    strsplit(trimws(phrase), "[[:space:]]+")[[1L]]
  }
  tokens <- tokens[nzchar(tokens)]
  n_tokens <- length(tokens)

  # Without two words there is nothing to match; indexing with an NA
  # condition would return rows filled with NA instead of an empty table.
  if (n_tokens < 2L) {
    return(x[0L, , drop = FALSE])
  }

  # which() discards NA comparisons (e.g. NA entries in word1/word2), which
  # would otherwise show up as all-NA rows in the result.
  hits <- which(x$word1 == tokens[n_tokens - 1L] & x$word2 == tokens[n_tokens])
  x[hits, , drop = FALSE]
}
# Evaluate global.R before the UI and server are defined.
# NOTE(review): `stn` and `nextword()` used by the server are presumably
# defined there - confirm.
source("global.R")
# Page layout: a title, a sidebar holding the free-text input ("phrase") plus
# a short description, and a main panel showing the result table ("ending").
ui <- fluidPage(
  h1("Next word proposal"),
  sidebarLayout(
    sidebarPanel(
      textInput(
        inputId = "phrase",
        label = "Unfinished phrase...",
        value = "Hi, my name is"
      ),
      p("This tool reads a dataframe of ngrams created from several text files (english) given within the course")
    ),
    mainPanel(
      tableOutput(outputId = "ending")
    )
  )
)
# Server logic: whenever the "phrase" input changes, look up the matching
# n-grams in `stn` and render the first rows in the "ending" table.
server <- function(input, output) {
  output$ending <- renderTable({
    candidates <- nextword(stn, input$phrase)
    # head() keeps at most the first six rows.
    head(candidates)
  })
}
# Create the Shiny app object from the `ui` and `server` defined in this script.
shinyApp(ui, server)