This is the milestone report for my development of an app that predicts the next word given a word or phrase.
In this report I load the data from the blog, news, and Twitter sources, clean the data, and conduct an exploratory analysis.
Blogs
con <- file("./data/en_US.blogs.txt", "r")
blogs<- readLines(con, encoding = "UTF-8")
print(paste("File size",file.info("./data/en_US.blogs.txt")$size, sep = ": "))
## [1] "File size: 210160014"
print(paste("Number of lines", length(blogs), sep = ": "))
## [1] "Number of lines: 899288"
close(con)
News
con <- file("./data/en_US.news.txt", "rb")
news<- readLines(con, encoding = "UTF-8")
print(paste("File size",file.info("./data/en_US.news.txt")$size, sep = ": "))
## [1] "File size: 205811889"
print(paste("Number of lines", length(news), sep = ": "))
## [1] "Number of lines: 1010242"
close(con)
con <- file("./data/en_US.twitter.txt", "r")
twitter<- readLines(con, encoding = "UTF-8", skipNul = TRUE)
print(paste("File size", file.info("./data/en_US.twitter.txt")$size, sep = ": "))
## [1] "File size: 167105338"
print(paste("Number of lines", length(twitter), sep = ": "))
## [1] "Number of lines: 2360148"
close(con)
There is a trade-off between data set size and memory resources, so I have taken a sample of 1% of each data set.
set.seed(1234) #for reproducibility
sizeSample <- 0.01 #only taking 1% of the data.
ts <- sample(length(twitter),length(twitter)*sizeSample)
twitSample <- twitter[ts]
ns <- sample(length(news),length(news)*sizeSample)
newsSample <- news[ns]
bs <- sample(length(blogs),length(blogs)*sizeSample)
blSample <- blogs[bs]
sampledata<-c(twitSample,blSample,newsSample)
I have passed the sampled data through some cleaning steps to:
-Remove profanities
-Remove numbers
-Remove punctuation
-Remove white space
-Make everything lower case
library(tm)

# Step 1: strip non-ASCII characters (emoji, curly quotes, etc.)
clean1_data <- function(sd) {
  sd <- iconv(sd, to = "ASCII", sub = "")
  return(sd)
}

# Profanity list used to filter offensive words
profanity <- read.csv("data/bad-words.csv", stringsAsFactors = FALSE, header = FALSE)

# Step 2: build a tm corpus and apply the remaining transformations
clean2_data <- function(sd) {
  text_corpus <- Corpus(VectorSource(sd), readerControl = list(language = "en"))
  text_corpus <- tm_map(text_corpus, removeNumbers)
  text_corpus <- tm_map(text_corpus, removePunctuation)
  text_corpus <- tm_map(text_corpus, stripWhitespace)
  text_corpus <- tm_map(text_corpus, content_transformer(tolower))
  text_corpus <- tm_map(text_corpus, removeWords, profanity$V1)
  return(text_corpus)
}
sampledata <- clean1_data(sampledata)
sampledatacorpus <- clean2_data(sampledata)
cleanSample<- as.vector(sampledatacorpus$content)
To explore the dataset we need to consider n-grams. I have considered n-grams up to length 4.
library(RWeka)

df <- cleanSample
one_ngram <- NGramTokenizer(df, Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!"))
two_ngram <- NGramTokenizer(df, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
three_ngram <- NGramTokenizer(df, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
four_ngram <- NGramTokenizer(df, Weka_control(min = 4, max = 4, delimiters = " \\r\\n\\t.,;:\"()?!"))
# Creating a frequency table for each n-gram length
df_one <- as.data.frame(table(one_ngram))
df_two <- as.data.frame(table(two_ngram))
df_three <- as.data.frame(table(three_ngram))
df_four <- as.data.frame(table(four_ngram))
# Splitting each n-gram into its separate words and adding the individual words as columns to the data frame
temp1 <- data.frame(do.call('rbind', strsplit(as.character(df_two$two_ngram),' ',fixed=TRUE)))
df_two["fw"] <- temp1$X1
df_two["sw"] <- temp1$X2
temp1 <- data.frame(do.call('rbind', strsplit(as.character(df_three$three_ngram),' ',fixed=TRUE)))
df_three["fw"] <- temp1$X1
df_three["sw"] <- temp1$X2
df_three["tw"] <- temp1$X3
temp1 <- data.frame(do.call('rbind', strsplit(as.character(df_four$four_ngram),' ',fixed=TRUE)))
df_four["fw"] <- temp1$X1
df_four["sw"] <- temp1$X2
df_four["tw"] <- temp1$X3
df_four["ftw"] <- temp1$X4
Find the 20 most frequent terms for each n-gram length.
df_one_top<- head(df_one[order(-df_one$Freq),],20)
df_two_top<- head(df_two[order(-df_two$Freq),],20)
df_three_top<- head(df_three[order(-df_three$Freq),],20)
df_four_top<- head(df_four[order(-df_four$Freq),],20)
Frequency of one-grams
library(ggplot2)
p <- ggplot(df_one_top, aes(reorder(one_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
Frequency of two-grams
library(ggplot2)
p <- ggplot(df_two_top, aes(reorder(two_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
Frequency of three-grams
library(ggplot2)
p <- ggplot(df_three_top, aes(reorder(three_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
Frequency of four-grams
library(ggplot2)
p <- ggplot(df_four_top, aes(reorder(four_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
I plan to use the stupid backoff algorithm, which I will explain via an example. Suppose we have the phrase “how are you”. The algorithm first looks at the 4-grams to see whether any start with “how are you”. Let's suppose some do, and that in particular “how are you doing” appears 5 times. We then look at how many times the 3-gram “how are you” appears; suppose it appears 10 times. The prediction “doing” is then given a score of 5/10 = 0.5.
However, if no 4-grams start with “how are you”, we instead look at 3-grams that start with “are you” and repeat the process above. Because this is unlikely to be as good an estimate of the word following “how are you” as one based on 4-grams, the resulting scores are multiplied by a back-off penalty of 0.4.
If necessary, this continues down to 2-grams, multiplying the scores by 0.4 again at each back-off step.
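The sketch below shows how this might look in R against the frequency tables built above (df_one, df_two, df_three and df_four, with their Freq counts and word columns fw, sw, tw, ftw). The function name predict_backoff and the penalty lambda = 0.4 are my own illustrative choices, not code from the report itself.
# A rough sketch of stupid backoff using the n-gram frequency tables built above.
predict_backoff <- function(w1, w2, w3, lambda = 0.4) {
  # 1. Look for 4-grams whose first three words match the input phrase
  m4 <- df_four[df_four$fw == w1 & df_four$sw == w2 & df_four$tw == w3, ]
  if (nrow(m4) > 0) {
    # score = count of the 4-gram / count of its 3-gram prefix
    prefix <- sum(df_three$Freq[df_three$fw == w1 & df_three$sw == w2 &
                                df_three$tw == w3])
    res <- data.frame(word = m4$ftw, score = m4$Freq / prefix)
    return(res[order(-res$score), ])
  }
  # 2. Back off to 3-grams starting with the last two words, penalised by lambda
  m3 <- df_three[df_three$fw == w2 & df_three$sw == w3, ]
  if (nrow(m3) > 0) {
    prefix <- sum(df_two$Freq[df_two$fw == w2 & df_two$sw == w3])
    res <- data.frame(word = m3$tw, score = lambda * m3$Freq / prefix)
    return(res[order(-res$score), ])
  }
  # 3. Back off again to 2-grams starting with the last word, penalised twice
  m2 <- df_two[df_two$fw == w3, ]
  prefix <- sum(df_one$Freq[df_one$one_ngram == w3])
  res <- data.frame(word = m2$sw, score = lambda^2 * m2$Freq / prefix)
  res[order(-res$score), ]
}

# Example: candidate next words for the phrase "how are you"
head(predict_backoff("how", "are", "you"))
Calling predict_backoff("how", "are", "you") would return the candidate next words ranked by their backoff scores, which is the behaviour the app will need.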