Next word proposal (coursera capstone last exercise)

17 de mayo de 2019

Shiny app

Process (summarized)

The data was provided with the course material and consists of 3 files from 3 different sources (news, blogs and tweets). It was cleaned using the following functions, which clean the text and create a simplified file that is later used to build the n-grams:

# textcleaning: clean raw text before it is used to build n-grams.
# NOTE: the implementation is elided in this summary; every `[...]` below
# stands for omitted code, so this listing is not runnable as shown.
#   x: the text to clean (presumably a character vector of lines - confirm
#      against the full source).
# Returns `textclean`, the cleaned text produced by the elided steps.
textcleaning <- function(x){ 
    # Step 1: convert to lower case, remove numbers, ...
    [...]
    # Step 2: replace words that contain "@", "#", "http://", "https://"
    [...]
    # Step 3: replace all non-alphanumeric characters at the beginning/end of
    # a word with a space.
    [...]
    return(textclean)
}
# SuperTidyFile: read every text file in `filesdir`, clean it, and write the
# cleaned text to a single output file while processing.
# NOTE: the loop body is elided in this summary (the braces below are
# unbalanced as shown), so this listing is not runnable as-is.
#   filesdir: directory that holds the raw .txt files.
#   output:   path of the combined, cleaned text file to create.
# Returns a status message (in Spanish: "You have created a SuperTidyFile
# at <output>").
SuperTidyFile <- function(filesdir = "./data/en_US", output = "./data/SuperTidyFile.txt") {
    # NOTE(review): `pattern` is a regular expression, not a glob; "\\.txt$"
    # would be the precise way to select files ending in .txt.
    files <- list.files(filesdir, pattern = "*.txt", full.names = TRUE)
    # Create the output file (an existing file at this path is truncated).
    file.create(output)

    for (f in files){# (elided) clean each line with textcleaning() and write it to the output file
        }
    }
    return(paste0("Has creado un SuperTidyFile en ", output))
}

Ngrams and function to find the next word

Using the previous SuperTidyFile, several tests were performed before arriving at the following function, which creates a simple readable file (.rds) containing a data frame of n-grams (3-grams) and the number of times each combination appeared in the SuperTidyFile. Here is the code that creates the n-gram file, in .rds format, which is read when the Shiny app runs:

# SuperTidyNgram: build an n-gram table from the cleaned text file and save it
# as an .rds file.
# NOTE: part of the pipeline is elided in this summary (`[...]`), so this
# listing is not runnable as shown. tibble(), sample_n(), unnest_tokens() and
# %>% come from packages whose library() calls are not shown here
# (presumably tibble/dplyr/tidytext - confirm).
#   stf:       path of the cleaned text file to read.
#   output:    path of the .rds file to write.
#   Ngram:     number of words per n-gram (passed to unnest_tokens as `n`).
#   samplediv: divisor applied to the number of lines to get the sample size
#              (1 keeps as many rows as there are lines).
# Returns a status message (in Spanish: "You have created a SuperTidyNgram
# at <output>").
SuperTidyNgram <- function(stf = "./data/SuperTidyFile.txt", output = "./data/SuperTidyNgram.rds",
                           Ngram = 3, samplediv = 1){
    con <- file(stf, open="r");    LINES <- readLines(con);    close(con) # read all lines into memory (more than 4,000,000 lines)
    # One row per input line: its index and its text.
    LINEStib <- tibble(line = 1:length(LINES), words = LINES)
    nsamples <- round(dim(LINEStib)[1]/samplediv) # sample size = number of lines / samplediv
    # Random subset of the lines, to keep the n-gram table manageable.
    tbnS <- sample_n(LINEStib, nsamples)
    # Split each sampled line into n-grams of `Ngram` words (column `trigram`).
    tbn3 <- tbnS %>%
      unnest_tokens(trigram, words, token = "ngrams", n = Ngram) %>% [...]# further steps elided in this summary
    
    saveRDS(tbn3, file = output) # save the n-gram table (English 3-grams by default)
    return(paste0("Has creado un SuperTidyNgram en ", output))
}
 
# This is the function used to create the data table shown.
# nextword: return the n-gram rows whose first two words match the last two
# words of a phrase, i.e. the candidate next words.
#   x: data frame / tibble of 3-grams with (at least) the character columns
#      `word1` and `word2`.
#   y: the unfinished phrase, a single character string.
# Returns the matching rows of `x` in their original order and with all
# columns; a zero-row result when the phrase has fewer than two words or
# nothing matches.
nextword <- function(x, y) {
    no_match <- x[0L, , drop = FALSE]
    if (length(y) != 1L || is.na(y)) {
        return(no_match)
    }
    # Split on any run of whitespace so that trailing or repeated spaces typed
    # by the user do not produce empty "words".
    tokens <- strsplit(trimws(y), "[[:space:]]+")[[1L]]
    tokens <- tokens[nzchar(tokens)]
    n_tokens <- length(tokens)
    if (n_tokens < 2L) {
        return(no_match)
    }
    # which() drops NA comparisons; indexing with a logical mask that contains
    # NA would add all-NA rows to the result.
    hits <- which(x$word1 == tokens[n_tokens - 1L] & x$word2 == tokens[n_tokens])
    x[hits, , drop = FALSE]
}

global code (global.R) and shiny code (app.R)

## Load libraries
library(shiny)

# Load the precomputed n-gram table once; the server passes `stn` to
# nextword() for every lookup. The path is relative to the working directory.
# NOTE(review): SuperTidyNgram() writes "./data/SuperTidyNgram.rds" by default
# (different directory, lower-case extension); confirm the deployed file name
# matches exactly, since file names are case-sensitive on some systems.
stn <- readRDS("./SuperTidyNgram.Rds")

# nextword: return the n-gram rows whose first two words match the last two
# words of a phrase, i.e. the candidate next words.
#   x: data frame / tibble of 3-grams with (at least) the character columns
#      `word1` and `word2`.
#   y: the unfinished phrase, a single character string.
# Returns the matching rows of `x` in their original order and with all
# columns; a zero-row result when the phrase has fewer than two words or
# nothing matches.
nextword <- function(x, y) {
    no_match <- x[0L, , drop = FALSE]
    if (length(y) != 1L || is.na(y)) {
        return(no_match)
    }
    # Split on any run of whitespace so that trailing or repeated spaces typed
    # by the user do not produce empty "words".
    tokens <- strsplit(trimws(y), "[[:space:]]+")[[1L]]
    tokens <- tokens[nzchar(tokens)]
    n_tokens <- length(tokens)
    if (n_tokens < 2L) {
        return(no_match)
    }
    # which() drops NA comparisons; indexing with a logical mask that contains
    # NA would add all-NA rows to the result.
    hits <- which(x$word1 == tokens[n_tokens - 1L] & x$word2 == tokens[n_tokens])
    x[hits, , drop = FALSE]
}

# Bring in the shared objects defined in global.R (the n-gram table `stn`
# and nextword()).
source("global.R")

# Page layout: a title, a sidebar with the phrase input and a short
# description, and a main panel with the table of proposed next words.
ui <- fluidPage(
  h1("Next word proposal"),
  sidebarLayout(
    sidebarPanel(
      textInput(
        inputId = "phrase",
        label = "Unfinished phrase...",
        value = "Hi, my name is"
      ),
      p("This tool reads a dataframe of ngrams created from several text files (english) given within the course")
    ),
    mainPanel(
      tableOutput(outputId = "ending")
    )
  )
)

# Server logic: look up the n-grams that match the phrase typed by the user
# and show the first rows of the result in the "ending" table.
server <- function(input, output) {
  output$ending <- renderTable({
    candidates <- nextword(stn, input$phrase)
    head(candidates)
  })
}

# Build the app from the UI definition and the server function.
shinyApp(ui = ui, server = server)