The goal of this exercise is to build a product that showcases the prediction algorithm and exposes it through an interface that others can use. To do this we rely on text from blogs, Twitter, and news feeds, processed with the tm and RWeka libraries.
The finished application is available at https://ereb2002.shinyapps.io/words_prediction/
The first step is to obtain the data and clean it of unwanted symbols, numbers, and characters; for this we use the following code.
library(tm)     # corpus handling and cleaning transformations
library(RWeka)  # NGramTokenizer, used to build the n-grams below

setwd("/home/ron/Documentos/DataScience/capstone/final/")
twitterLines <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
blogslines <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newslines <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)

# Keep a 5% random sample of each source to make the corpus manageable
sample_data <- c(sample(blogslines, length(blogslines) * .05),
                 sample(newslines, length(newslines) * .05),
                 sample(twitterLines, length(twitterLines) * .05))

corpus <- VCorpus(VectorSource(sample_data))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
# tolower is not a tm transformation, so it must be wrapped in
# content_transformer(); this also keeps the documents as PlainTextDocument,
# making a separate PlainTextDocument conversion step unnecessary
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
The next step is to define the n-gram tokenizers used to find the most common phrases; for this exercise we extract n-grams of 2, 3, 4, 5, 6, and 7 words.
two_w   <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
three_w <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
four_w  <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
five_w  <- function(x) NGramTokenizer(x, Weka_control(min = 5, max = 5))
six_w   <- function(x) NGramTokenizer(x, Weka_control(min = 6, max = 6))
seven_w <- function(x) NGramTokenizer(x, Weka_control(min = 7, max = 7))
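As a quick sanity check (not part of the original script), these tokenizers can be applied directly to a character string; the bigram tokenizer, for example, returns the overlapping two-word phrases:

two_w("the quick brown fox")
## [1] "the quick"   "quick brown" "brown fox"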
Each tokenizer is then applied to the cleaned corpus to build a term-document matrix. From this matrix we keep only the terms that reach a minimum frequency, and we sum their counts across documents to produce a frequency table. This is repeated for each n-gram size from 2 to 7 words.
wt2 <- TermDocumentMatrix(corpus, control = list(tokenize = two_w))
two_corpus <- findFreqTerms(wt2, lowfreq = 200)  # keep bigrams seen at least 200 times
freq2 <- rowSums(as.matrix(wt2[two_corpus, ]))
freq2_table <- data.frame(Word = names(freq2), frequency = freq2)
head(freq2_table)
##                Word frequency
## can get     can get       564
## can help   can help       224
## can make   can make       302
## can see     can see       363
## cant wait cant wait       908
## come back come back       312
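The code above only shows the bigram table; the prediction function below expects six tables, t2 through t7. The original does not show how they are built, but a minimal sketch along the same lines could look like this (the lowfreq cutoffs for the higher-order n-grams are assumptions; only the bigram cutoff of 200 appears above):

# Sketch (assumed): build all six frequency tables in one loop.
# t2 corresponds to freq2_table above; the higher-order cutoffs are illustrative.
tokenizers <- list(two_w, three_w, four_w, five_w, six_w, seven_w)
cutoffs <- c(200, 10, 10, 10, 10, 10)  # assumed threshold per n-gram size
tablas <- lapply(seq_along(tokenizers), function(i) {
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tokenizers[[i]]))
  terms <- findFreqTerms(tdm, lowfreq = cutoffs[i])
  freq <- rowSums(as.matrix(tdm[terms, ]))
  data.frame(Word = names(freq), frequency = freq)
})
t2 <- tablas[[1]]; t3 <- tablas[[2]]; t4 <- tablas[[3]]
t5 <- tablas[[4]]; t6 <- tablas[[5]]; t7 <- tablas[[6]]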
For each frequency table we generate a file, and to these files we apply the following functions.
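The file-generation step itself is not shown in the original; one plausible version, assuming the tables are saved as .rds files for the Shiny app to load (the file names t2.rds through t7.rds are hypothetical), is:

# Assumed persistence step: save each table so the app can load it at startup
for (i in 2:7) {
  saveRDS(tablas[[i - 1]], file = paste0("t", i, ".rds"))
}
# In the app, the tables are then restored with, e.g., t2 <- readRDS("t2.rds")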
prediccion_palabra <- function(vector){
  # Build a regular expression anchored at the start: "^w1 w2 ... wn"
  oracion <- paste0("^", paste(vector, collapse = " "))
  # The function handles phrases of 1 to 6 words; longer input falls through
  if(length(vector) > 6) return(NULL)
  # Look the phrase up in the n-gram table one order above its length
  # (1 word -> t2, 2 words -> t3, ..., 6 words -> t7)
  tablas <- list(t2, t3, t4, t5, t6, t7)
  datos <- subset(tablas[[length(vector)]], grepl(oracion, Word, ignore.case = TRUE))
  if(length(datos$Word) > 0 || length(vector) == 1){
    return(datos)
  }
  # Back off: dar_vector() drops the first word, then we retry on shorter n-grams
  vector <- dar_vector(oracion, TRUE)
  return(prediccion_palabra(vector))
}
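A usage sketch (the input phrase is hypothetical, and the t2 through t7 tables must already be loaded):

# Hypothetical call: find candidate continuations of a three-word phrase.
# With a 3-word input the lookup goes to the 4-gram table t4, backing off
# to shorter n-grams if nothing matches.
candidatos <- prediccion_palabra(c("happy", "new", "year"))
head(candidatos[order(-candidatos$frequency), ], 3)  # three most frequent matches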