Clean Data

The first step is to load the data and clean it of unwanted symbols, numbers, and punctuation. We use the following code for this.

https://ereb2002.shinyapps.io/words_prediction/

# Load the three raw English text files (Twitter, blogs, news) from the
# project directory. skipNul = TRUE skips embedded NUL characters while
# reading.
setwd("/home/ron/Documentos/DataScience/capstone/final/")
twitterLines <- readLines(
  "en_US.twitter.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
blogslines <- readLines(
  "en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
newslines <- readLines(
  "en_US.news.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

# Draw a 5% random sample of the lines of each source and concatenate the
# samples in the order blogs, news, twitter.
# NOTE(review): no seed is set in this section, so the sample differs from
# run to run unless one is set elsewhere.
sample_data <- unlist(
  lapply(
    list(blogslines, newslines, twitterLines),
    function(lineas) sample(lineas, length(lineas) * .05)
  )
)

# Build the corpus from the sampled lines and normalise its text.
#
# The order of the steps matters:
#   1. Lower-case first, so stop-word matching does not depend on case.
#   2. Remove stop words BEFORE stripping punctuation: the English stop-word
#      list contains contractions such as "can't", which no longer match
#      once the apostrophe has been removed (they would survive as "cant").
#   3. Strip punctuation and numbers.
#   4. Collapse whitespace last, to clean up the gaps left by the removals.
#
# tolower is wrapped in content_transformer() so that every document remains
# a text document after the transformation; this makes the former
# PlainTextDocument re-wrapping step unnecessary.
corpus <- VCorpus(VectorSource(sample_data))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

Create Grammar

The next step is to create the n-gram ("grammar") tokenizer functions used to find the most common phrases. For this exercise we use n-grams of 2, 3, 4, 5, 6 and 7 words.

# Factory for fixed-length n-gram tokenizers: returns a function of `x` that
# calls NGramTokenizer with min and max both set to `n`, i.e. it produces
# n-grams of exactly `n` words.
crear_tokenizador <- function(n) {
  force(n)
  function(x) {
    NGramTokenizer(x, Weka_control(min = n, max = n))
  }
}

# One tokenizer per n-gram size, from 2 to 7 words.
two_w <- crear_tokenizador(2)
three_w <- crear_tokenizador(3)
four_w <- crear_tokenizador(4)
five_w <- crear_tokenizador(5)
six_w <- crear_tokenizador(6)
seven_w <- crear_tokenizador(7)

Generate frequency table

We apply each n-gram tokenizer to the cleaned data set to build a term-document matrix. We then keep only the terms that reach a minimum frequency, and sum each term's counts across all documents to produce a frequency table of phrases. This is repeated for n-grams of 2 to 7 words.

https://ereb2002.shinyapps.io/words_prediction/

# Bigram frequency table.
# Tokenise the cleaned corpus into two-word n-grams and build the
# term-document matrix.
wt2 <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = two_w)
)

# Keep only the bigrams returned by findFreqTerms with lowfreq = 200.
two_corpus <- findFreqTerms(wt2, lowfreq = 200)

# Sum each retained bigram's counts across all documents. The matrix is
# subset to the retained terms before being converted with as.matrix().
freq2 <- rowSums(as.matrix(wt2[two_corpus, ]))

# One row per bigram: the phrase itself and its total count.
freq2_table <- data.frame(
  Word = names(freq2),
  frequency = freq2
)
head(freq2_table)

##                Word frequency
## can get     can get       564
## can help   can help       224
## can make   can make       302
## can see     can see       363
## cant wait cant wait       908
## come back come back       312

Generate App

For each frequency table we generate a file, to which we apply the following functions.

https://ereb2002.shinyapps.io/words_prediction/

# Look up the n-gram rows that start with the words typed so far, backing off
# to shorter contexts when nothing matches.
#
# Args:
#   vector: character vector with the typed words, one word per element.
#     Only the last six words are used, because the largest table (`t7`)
#     holds seven-word phrases.
#
# Returns:
#   The rows of the frequency table for phrases of length(vector) + 1 words
#   (`t2` ... `t7`, which must exist in an enclosing environment and have a
#   `Word` column) whose `Word` starts with the typed words, matched
#   case-insensitively. When a context of two or more words has no match, the
#   result of retrying with the vector returned by `dar_vector()` is
#   returned instead. A one-word context returns its (possibly empty) match
#   as is. Returns NULL invisibly for an empty `vector`.
#
# Notes:
#   - The words are pasted into a regular expression unescaped, so regex
#     metacharacters in the input are interpreted as such.
#   - `dar_vector` is defined outside this block; presumably it splits the
#     pattern back into words and drops the leading one -- TODO confirm.
prediccion_palabra <- function(vector) {
  max_palabras <- 6L

  # Longer inputs used to fall through every branch and return NULL; use the
  # most recent words instead so the largest table can still be consulted.
  if (length(vector) > max_palabras) {
    vector <- utils::tail(vector, max_palabras)
  }

  n <- length(vector)
  if (n == 0L) {
    return(invisible(NULL))
  }

  # Anchor at the start of the phrase: "^word1 word2 ...".
  oracion <- paste0("^", paste(vector, collapse = " "))

  # A context of n words is looked up in the table of (n + 1)-word phrases.
  # switch() evaluates only the selected table, so the others need not exist.
  tabla <- switch(n, t2, t3, t4, t5, t6, t7)

  coincide <- grepl(oracion, tabla$Word, ignore.case = TRUE)
  datos <- tabla[coincide, , drop = FALSE]

  # The one-word context is the end of the back-off chain: return whatever
  # was found, even if it is empty.
  if (n == 1L || nrow(datos) > 0L) {
    return(datos)
  }

  prediccion_palabra(dar_vector(oracion, TRUE))
}

Words Prediction

Ronaldo Echeverria Bardales

Words prediction

Clean Data

Create Grammar

Generate frequency table

Generate App