17 de mayo de 2019
The data was provided with the course material and consists of three files from three different sources (news, blogs and tweets). It was cleaned using the following functions, which clean the text and create a simplified file for later training with n-grams:
# text: clean file textcleaning <- function(x){ #to lower, remove number... [...] # replace words that contain "@", "#", "http://", "https://" [...] # replace all non-alphanumeric characters with a space at the beginning/end of a word. [...] return(textclean) } # tidytext: reads txt file cleans it and save it during process SuperTidyFile <- function(filesdir = "./data/en_US", output = "./data/SuperTidyFile.txt") { files <- list.files(filesdir, pattern = "*.txt", full.names = TRUE) file.create(output) for (f in files){#textcleaning line and write it in the output file } } return(paste0("Has creado un SuperTidyFile en ", output)) }
Using the previous SuperTidyFile, several trials were performed before arriving at the following function, which creates a simple readable file (.rds) containing a data frame of n-grams (3-grams) and the number of times each combination appeared in the SuperTidyFile. Here is the code that creates the n-gram file, in .rds format, which is read when the Shiny app runs:
SuperTidyNgram <- function(stf = "./data/SuperTidyFile.txt", output = "./data/SuperTidyNgram.rds", Ngram = 3, samplediv = 1){ con <- file(stf, open="r"); LINES <- readLines(con); close(con) # read lines #>4000000 LINEAS LINEStib <- tibble(line = 1:length(LINES), words = LINES) nsamples <- round(dim(LINEStib)[1]/samplediv) #set sample div tbnS <- sample_n(LINEStib, nsamples) tbn3 <- tbnS %>% unnest_tokens(trigram, words, token = "ngrams", n = Ngram) %>% [...]#more thing saveRDS(tbn3, file = output) #save object (tbn english 3gram) return(paste0("Has creado un SuperTidyNgram en ", output)) } #This is the function used to create the data table shown: #return most common words for ending phrase nextword <- function(x, y){ TBNf <- x[x$word1 == word(y,-2) & x$word2 == word(y,-1),] return(TBNf) }
## Load libraries
library(shiny)

# N-gram table consulted by nextword(); read once when this script is loaded.
stn <- readRDS("./SuperTidyNgram.Rds")

#' Return the n-gram rows that continue an unfinished phrase
#'
#' Selects the rows of `x` whose `word1` and `word2` columns equal the last
#' two words of `y`.
#'
#' @param x Data frame of n-grams with character columns `word1` and `word2`.
#' @param y Character string holding the unfinished phrase. Only the first
#'   element is used.
#' @return The matching rows of `x` (all columns, original row order). A
#'   zero-row data frame is returned when `y` has fewer than two words or
#'   when no n-gram matches.
nextword <- function(x, y) {
  # Tokenise with base R. The previous version called word() from stringr,
  # which this script never attaches, so the call could not be resolved.
  # Trimming also stops a trailing space from producing an empty last word.
  phrase <- ""
  if (length(y) > 0L && !is.na(y[[1L]])) {
    # The n-gram table is built from lower-cased text, so fold the input too.
    phrase <- tolower(trimws(y[[1L]]))
  }
  tokens <- strsplit(phrase, "[[:space:]]+")[[1L]]
  n_tokens <- length(tokens)

  # Two context words are required; without them the comparison below would
  # be against NA and logical indexing would return rows filled with NA.
  if (n_tokens < 2L) {
    return(x[0L, , drop = FALSE])
  }

  # which() discards NA comparisons (e.g. missing values in the table), so
  # only genuine matches are selected.
  hits <- which(
    x$word1 == tokens[[n_tokens - 1L]] & x$word2 == tokens[[n_tokens]]
  )
  x[hits, , drop = FALSE]
}
# Load the shared objects used below (`stn` and `nextword()` are expected to
# come from global.R).
source("global.R")

# Page layout: a text box for the unfinished phrase in the sidebar and a
# table of candidate continuations in the main panel.
ui <- fluidPage(
  h1("Next word proposal"),
  sidebarLayout(
    sidebarPanel(
      textInput(
        "phrase",
        label = "Unfinished phrase...",
        value = "Hi, my name is"
      ),
      p("This tool reads a dataframe of ngrams created from several text files (english) given within the course")
    ),
    mainPanel(
      tableOutput("ending")
    )
  )
)

# Server logic: look up the n-grams matching the current phrase and show the
# first rows of the result.
server <- function(input, output) {
  output$ending <- renderTable({
    candidates <- nextword(stn, input$phrase)
    head(candidates)
  })
}

shinyApp(ui = ui, server = server)