This is the milestone report for my development of an app that predicts the next word given a word or phrase.
In this report I load the data from the blog, news, and Twitter sources, clean the data, and conduct an exploratory analysis.
Blogs
con <- file("./data/en_US.blogs.txt", "r")
blogs<- readLines(con, encoding = "UTF-8")
print(paste("File size",file.info("./data/en_US.blogs.txt")$size, sep = ": "))
## [1] "File size: 210160014"
print(paste("Number of lines", length(blogs), sep = ": "))
## [1] "Number of lines: 899288"
close(con)
News
con <- file("./data/en_US.news.txt", "rb")
news<- readLines(con, encoding = "UTF-8")
print(paste("File size",file.info("./data/en_US.news.txt")$size, sep = ": "))
## [1] "File size: 205811889"
print(paste("Number of lines", length(news), sep = ": "))
## [1] "Number of lines: 1010242"
close(con)
con <- file("./data/en_US.twitter.txt", "r")
twitter<- readLines(con, encoding = "UTF-8", skipNul = TRUE)
print(paste("File size", file.info("./data/en_US.twitter.txt")$size, sep = ": "))
## [1] "File size: 167105338"
print(paste("Number of lines", length(twitter), sep = ": "))
## [1] "Number of lines: 2360148"
close(con)
There is a trade-off between data set size and memory resources, so I have taken a sample of 1% of each data set.
set.seed(1234) #for reproducibility
sizeSample <- 0.01 #only taking 1% of the data.
ts <- sample(length(twitter),length(twitter)*sizeSample)
twitSample <- twitter[ts]
ns <- sample(length(news),length(news)*sizeSample)
newsSample <- news[ns]
bs <- sample(length(blogs),length(blogs)*sizeSample)
blSample <- blogs[bs]
sampledata<-c(twitSample,blSample,newsSample)
I have passed the sampled data through some cleaning steps to:
-Remove profanities
-Remove numbers
-Remove punctuation
-Remove white space
-Make everything lower case
library(tm)

# Step 1: strip non-ASCII characters (emoji, curly quotes, etc.)
clean1_data <- function(sd) {
  sd <- iconv(sd, to = "ASCII", sub = "")
  return(sd)
}

# Profanity list used to filter offensive words
profanity <- read.csv("data/bad-words.csv", stringsAsFactors = FALSE, header = FALSE)

# Step 2: build a tm corpus and apply the remaining transformations
clean2_data <- function(sd) {
  text_corpus <- Corpus(VectorSource(sd), readerControl = list(language = "en"))
  text_corpus <- tm_map(text_corpus, removeNumbers)
  text_corpus <- tm_map(text_corpus, removePunctuation)
  text_corpus <- tm_map(text_corpus, stripWhitespace)
  text_corpus <- tm_map(text_corpus, content_transformer(tolower))
  text_corpus <- tm_map(text_corpus, removeWords, profanity$V1)
  return(text_corpus)
}
sampledata <- clean1_data(sampledata)
sampledatacorpus <- clean2_data(sampledata)
cleanSample<- as.vector(sampledatacorpus$content)
To explore the dataset we need to consider n-grams. I have considered n-grams up to length 4.
library(RWeka)

df <- cleanSample
one_ngram <- NGramTokenizer(df, Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!"))
two_ngram <- NGramTokenizer(df, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
three_ngram <- NGramTokenizer(df, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
four_ngram <- NGramTokenizer(df, Weka_control(min = 4, max = 4, delimiters = " \\r\\n\\t.,;:\"()?!"))
# Creating a frequency table for each n-gram length
df_one <- as.data.frame(table(one_ngram))
df_two <- as.data.frame(table(two_ngram))
df_three <- as.data.frame(table(three_ngram))
df_four <- as.data.frame(table(four_ngram))
# Splitting each n-gram into its separate words and adding the individual words as columns to the data frame
temp1 <- data.frame(do.call('rbind', strsplit(as.character(df_two$two_ngram),' ',fixed=TRUE)))
df_two["fw"] <- temp1$X1
df_two["sw"] <- temp1$X2
temp1 <- data.frame(do.call('rbind', strsplit(as.character(df_three$three_ngram),' ',fixed=TRUE)))
df_three["fw"] <- temp1$X1
df_three["sw"] <- temp1$X2
df_three["tw"] <- temp1$X3
temp1 <- data.frame(do.call('rbind', strsplit(as.character(df_four$four_ngram),' ',fixed=TRUE)))
df_four["fw"] <- temp1$X1
df_four["sw"] <- temp1$X2
df_four["tw"] <- temp1$X3
df_four["ftw"] <- temp1$X4
Find the 20 most frequent terms for each n-gram length.
df_one_top<- head(df_one[order(-df_one$Freq),],20)
df_two_top<- head(df_two[order(-df_two$Freq),],20)
df_three_top<- head(df_three[order(-df_three$Freq),],20)
df_four_top<- head(df_four[order(-df_four$Freq),],20)
Frequency of one-grams
library(ggplot2)
p <- ggplot(df_one_top, aes(reorder(one_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
Frequency of two-grams
library(ggplot2)
p <- ggplot(df_two_top, aes(reorder(two_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
Frequency of three-grams
library(ggplot2)
p <- ggplot(df_three_top, aes(reorder(three_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
Frequency of four-grams
library(ggplot2)
p <- ggplot(df_four_top, aes(reorder(four_ngram, Freq), Freq))
p <- p + geom_bar(stat="identity", fill="lightgreen", colour="darkgreen")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(title = "Top 20 Words with Stopwords")
p <- p + ylab("Frequency") + xlab("Word")
p
I plan to use the stupid backoff algorithm, which I will explain via an example. Suppose we have the phrase “how are you”. The algorithm first looks at the 4-grams to see whether any start with “how are you”. Let's suppose some do, and that in particular “how are you doing” appears 5 times. We then look at how many times the 3-gram “how are you” appears; suppose it appears 10 times. The prediction “doing” is then given a score of 5/10 = 0.5.
However, if no 4-grams start with “how are you”, we instead look at 3-grams that start with “are you” and repeat the process above. Because this is unlikely to be as good an estimate of the word following “how are you” as one based on 4-grams, the resulting scores are multiplied by a back-off penalty of 0.4.
If necessary, this continues down to 2-grams, multiplying the scores by 0.4 again at each back-off step.
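The sketch below shows how this might look in R against the frequency tables built above (df_one, df_two, df_three and df_four, with their Freq counts and word columns fw, sw, tw, ftw). The function name predict_backoff and the penalty lambda = 0.4 are my own illustrative choices, not code from the report itself.
# A rough sketch of stupid backoff using the n-gram frequency tables built above.
predict_backoff <- function(w1, w2, w3, lambda = 0.4) {
  # 1. Look for 4-grams whose first three words match the input phrase
  m4 <- df_four[df_four$fw == w1 & df_four$sw == w2 & df_four$tw == w3, ]
  if (nrow(m4) > 0) {
    # score = count of the 4-gram / count of its 3-gram prefix
    prefix <- sum(df_three$Freq[df_three$fw == w1 & df_three$sw == w2 &
                                df_three$tw == w3])
    res <- data.frame(word = m4$ftw, score = m4$Freq / prefix)
    return(res[order(-res$score), ])
  }
  # 2. Back off to 3-grams starting with the last two words, penalised by lambda
  m3 <- df_three[df_three$fw == w2 & df_three$sw == w3, ]
  if (nrow(m3) > 0) {
    prefix <- sum(df_two$Freq[df_two$fw == w2 & df_two$sw == w3])
    res <- data.frame(word = m3$tw, score = lambda * m3$Freq / prefix)
    return(res[order(-res$score), ])
  }
  # 3. Back off again to 2-grams starting with the last word, penalised twice
  m2 <- df_two[df_two$fw == w3, ]
  prefix <- sum(df_one$Freq[df_one$one_ngram == w3])
  res <- data.frame(word = m2$sw, score = lambda^2 * m2$Freq / prefix)
  res[order(-res$score), ]
}

# Example: candidate next words for the phrase "how are you"
head(predict_backoff("how", "are", "you"))
Calling predict_backoff("how", "are", "you") would return the candidate next words ranked by their backoff scores, which is the behaviour the app will need.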