In this project we use corpora collected from Twitter, blogs, and news sites to build a predictive text model.
The first step is to obtain and clean the data, using tokenization. The second step is to explore the data and understand its features. After these steps we can start modeling.
This milestone report summarizes the first two steps and our plans for building the model using natural language processing.
The first step is to download and unzip the data.
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Download the zip file only if it is not already present
if(!file.exists("./Coursera-SwiftKey.zip")){
    download.file(Url, destfile="./Coursera-SwiftKey.zip", mode = "wb")
}
# Unzip only if the extracted folder does not already exist
if(!file.exists("./final")){
    unzip(zipfile="./Coursera-SwiftKey.zip")
}
Next we open a connection to each text file and read its lines. Opening the files in binary mode (open="rb") with skipNul=TRUE avoids problems with embedded null characters.
twitterfiledir <- "final/en_US/en_US.twitter.txt"
blogsfiledir <- "final/en_US/en_US.blogs.txt"
newsfiledir <- "final/en_US/en_US.news.txt"
contwitter <- file(twitterfiledir, open="rb")
twitterdata <- readLines(contwitter, skipNul=TRUE)
close(contwitter)
conblogs <- file(blogsfiledir, open="rb")
blogsdata <- readLines(conblogs, skipNul=TRUE)
close(conblogs)
connews <- file(newsfiledir, open="rb")
newsdata <- readLines(connews, skipNul=TRUE)
close(connews)
We compute the size of each file in megabytes.
twittersize <- (file.info(twitterfiledir)$size) / (1024^2)
blogssize <- (file.info(blogsfiledir)$size) /(1024^2)
newssize <- (file.info(newsfiledir)$size) /(1024^2)
sizeMB <- c(twittersize, blogssize, newssize)
We count the number of lines in each file.
twitterlines <- length(twitterdata)
blogslines <- length(blogsdata)
newslines <- length(newsdata)
lines <- c(twitterlines, blogslines, newslines)
To count the words in each file, we use the stringr package and count runs of non-whitespace characters.
require(stringr)
twitterwords <- sum(str_count(twitterdata, "\\S+"))
blogswords <- sum(str_count(blogsdata, "\\S+"))
newswords <- sum(str_count(newsdata, "\\S+"))
words <- c(twitterwords, blogswords, newswords)
The following table shows the size, line count, and word count of each file:
require(grid)
require(gridExtra)
filename <- c("twitter", "blogs","news")
filesinfo <- data.frame(filename,sizeMB,lines,words)
grid.table(filesinfo)
Given the large size of the datasets, we sample 5% of the lines from each file for the exploratory analysis.
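Because sample() draws lines at random, each run of the report would otherwise work with a different subset. Setting a seed first keeps the sample reproducible; the seed value below is arbitrary.
# Fix the random seed so the 5% sample is identical on every run (seed value is arbitrary)
set.seed(1234)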
sampletwitter <- sample(twitterdata, as.integer(twitterlines/20), replace = FALSE)
sampleblogs <- sample(blogsdata, as.integer(blogslines/20), replace = FALSE)
samplenews <- sample(newsdata, as.integer(newslines/20), replace = FALSE)
We use the tokenizers package to tokenize the words in each line of each file. We then merge the tokens into a single vector covering all three files, sort the words by frequency, and select the top 10.
require(tokenizers)
tokenizetwitter <- tokenize_words(sampletwitter)
tokenizeblogs <- tokenize_words(sampleblogs)
tokenizenews <- tokenize_words(samplenews)
tokenizeall <- c(unlist(tokenizetwitter),unlist(tokenizeblogs),unlist(tokenizenews))
# Count word frequencies and keep the 10 most common words
topwords <- as.data.frame(head(sort(table(tokenizeall), decreasing = TRUE), 10))
Here is a barplot of the 10 most frequent words:
require(ggplot2)
g <- ggplot(data=topwords, aes(x=tokenizeall, y=Freq))
g <- g + geom_bar(stat = "identity", fill= "blue", width = 0.7)
g <- g + coord_flip()
g <- g + labs(x="Words", y="Frequency", title="Top Words Frequencies")
g
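Depending on how the factor levels come out of as.data.frame(), the bars may not appear in frequency order. A minimal variation that forces a frequency-sorted plot is to wrap the x aesthetic in reorder(); everything else stays the same.
# Same plot, but with bars explicitly ordered by frequency
g <- ggplot(data=topwords, aes(x=reorder(tokenizeall, Freq), y=Freq))
g <- g + geom_bar(stat = "identity", fill= "blue", width = 0.7)
g <- g + coord_flip()
g <- g + labs(x="Words", y="Frequency", title="Top Words Frequencies")
g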
We extract all two-grams and three-grams from each line of each file and combine them into a single list of two-grams and a single list of three-grams. We then sort each list by frequency and select the top 10.
twogramstwitter <- tokenize_ngrams(sampletwitter, n=2)
twogramsblogs <- tokenize_ngrams(sampleblogs, n=2)
twogramsnews <- tokenize_ngrams(samplenews, n=2)
twogramsall <- c(unlist(twogramstwitter), unlist(twogramsblogs), unlist(twogramsnews))
threegramstwitter <- tokenize_ngrams(sampletwitter, n=3)
threegramsblogs <- tokenize_ngrams(sampleblogs, n=3)
threegramsnews <- tokenize_ngrams(samplenews, n=3)
threegramsall <- c(unlist(threegramstwitter), unlist(threegramsblogs), unlist(threegramsnews))
toptwograms <- as.data.frame(head(sort(table(twogramsall),decreasing = TRUE),10))
topthreegrams <- as.data.frame(head(sort(table(threegramsall),decreasing = TRUE),10))
Here is a barplot of the 10 most frequent two-grams:
g <- ggplot(data=toptwograms, aes(x=twogramsall, y=Freq))
g <- g + geom_bar(stat = "identity", fill= "blue", width = 0.7)
g <- g + coord_flip()
g <- g + labs(x="Words", y="Frequency", title="Top Two-grams Frequencies")
g
Here is a barplot of the 10 most frequent three-grams:
g <- ggplot(data=topthreegrams, aes(x=threegramsall, y=Freq))
g <- g + geom_bar(stat = "identity", fill= "blue", width = 0.7)
g <- g + coord_flip()
g <- g + labs(x="Words", y="Frequency", title="Top Three-grams Frequencies")
g
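As a preview of how the n-gram counts will eventually be used, here is a minimal sketch (not the final model): given the last two words typed, look up the most frequent three-gram that begins with them and suggest its final word. The helper name predictnext and the example input "one of" are ours, chosen only for illustration; the sketch works directly on the threegramsall vector built above.
# Minimal next-word lookup from the three-gram counts (illustration only, not the final model)
predictnext <- function(firsttwo, ngrams = threegramsall) {
    # keep the three-grams whose first two words match the input
    matches <- ngrams[startsWith(ngrams, paste0(firsttwo, " "))]
    if (length(matches) == 0) return(NA_character_)
    # take the most frequent matching three-gram
    best <- names(sort(table(matches), decreasing = TRUE))[1]
    # its last word is the suggested next word
    tail(strsplit(best, " ")[[1]], 1)
}
predictnext("one of")
When no matching three-gram exists, the sketch simply returns NA; the actual model will need a fallback in that case, which is part of the next steps below.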
The next steps are: