The goal of this project is to demonstrate that we have become comfortable working with the data and that we are on track to create our prediction algorithm. This report, published on RPubs (http://rpubs.com/), explains our exploratory analysis and our goals for the eventual app and algorithm.
This document aims to be concise, explaining only the major features of the data we have identified and briefly summarizing our plans for creating the prediction algorithm and Shiny app in a way that is understandable to a non-data-scientist manager.
We have used tables and plots to illustrate important summaries of the data set. The motivation for this project is to show that the data has been loaded successfully, to report summary statistics and interesting findings about it, and to gather feedback on our plans for the prediction algorithm and Shiny app.
set.seed(1)
library(tau)
library(RTextTools)
library(stringr)
library(stringi)
library(RWeka)
# ################################
# Get the model to predict unique words in the corpus
# ################################
getModel <- function(tmpWL)
{
Lines <- tmpWL$Lines ; Words <- tmpWL$Words
lm( log(Words) ~ log(Lines) ) # Fitting a power law
}
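# Note: the log-log fit above is equivalent to the power law
#   Words = exp(a) * Lines^b   (a = intercept, b = slope),
# which is why predictions from these models are back-transformed with
# exp() in totalWords() and linesNeeded() below.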
# ################################
# Get the model to predict lines required for a specified number of words in the corpus
# ################################
getModelInverted <- function(tmpWL)
{
Lines <- tmpWL$Lines ; Words <- tmpWL$Words
lm( log(Lines) ~ log(Words) ) # Fitting a power law
}
# ################################
# Plot the model and sample data
# ################################
plotModel <- function(tmpWL, tmpModel, tmpYlim, tmpHeadline)
{
plot(tmpWL$Lines, tmpWL$Words, ylim=c(0,tmpYlim), xlab="Lines", ylab="Words", main = tmpHeadline, pch=16)
A <- seq(from=0, to=max(tmpWL$Lines), by=1000)
B <- exp( predict( tmpModel ,list(Lines=A) ) )
lines(A, B, col="red",lwd=2.5)
}
# ############################
# Estimate the number of unique words (n-grams) in a full file from the fitted model
# ############################
totalWords <- function(tmpModel, tmpLinesInFile)
{
exp(predict(tmpModel, list(Lines = tmpLinesInFile)))
}
# ############################
# Determine number of lines required for a proportion of total words
# ############################
linesNeeded <- function(tmpModel, tmpWordsInFile, tmpProportion)
{
exp(predict(tmpModel,list( Words=(tmpWordsInFile * tmpProportion))))
}
# ################################
# Build cumulative unique n-gram counts from 200 random samples of 250 lines each
# ################################
cumUniqueStats <- function(tmpFileName, tmpGrams)
{
ngramSets <- lapply(randomLineSets(tmpFileName), FindNGram, grams=tmpGrams)
cumUniqueWords <- data.frame(250, length(ngramSets[[1]]$WordOrString))
names(cumUniqueWords) <- c("Lines", "Words")
uniqueWords <- ngramSets[[1]]$WordOrString
lineCount <- 250
for(i in 2:200)
{
lineCount <- lineCount + 250
uniqueWords <- unique(c(uniqueWords, ngramSets[[i]]$WordOrString))
uniqueWordLen <- length(uniqueWords)
cumUniqueWords <- rbind(cumUniqueWords, c(lineCount, uniqueWordLen))
}
cumUniqueWords
}
# ################################
# Create random sets of lines from document
# ################################
randomLineSets <- function(tmpFileName)
{
cumSamples <- list()
lineList <- list()
# Read the file once; base readLines() has no 'skip' argument, so we
# sample line numbers and index into the vector of lines directly.
allLines <- readLines(tmpFileName, warn = FALSE)
lineNums <- seq_along(allLines)
cumLines <- sort(sample(lineNums, 250, replace = FALSE))
lineList[[1]] <- cumLines
for(i in 2:200)
{
tmpNums <- sort(sample(lineNums[-cumLines], 250, replace = FALSE))
cumLines <- c(cumLines, tmpNums)
lineList[[i]] <- tmpNums
}
for(i in 1:200)
{
cumSamples[[i]] <- allLines[lineList[[i]]]
}
cumSamples
}
# ################################
# Find n-grams function
# ################################
FindNGram <- function(corpusDf, grams) {
corpusDf <- tolower(corpusDf)
tempGram <- NGramTokenizer(corpusDf, Weka_control(min = grams, max = grams, delimiters = " \\r\\n\\t.,;:\"()?!"))
if(length(tempGram))
{
tempGram1 <- data.frame(table(tempGram))
tempGram2 <- tempGram1[order(tempGram1$Freq, decreasing = TRUE),]
colnames(tempGram2) <- c("WordOrString","Count")
tempGram2 <- tempGram2[grepl("^[A-Za-z ]+$", as.character(tempGram2$WordOrString)),] # keep only purely alphabetic n-grams
tempGram2
}
}
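# Example (illustrative only):
#   FindNGram(c("the quick brown fox", "the quick dog"), 2)
# returns a data frame of bigrams sorted by frequency, e.g.
#   WordOrString  Count
#   the quick     2
#   brown fox     1
#   quick brown   1
#   quick dog     1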
# ################################
# Count the lines in a file
# ################################
countFileLines <- function(tmpFileName)
{
con <- file(tmpFileName, "rt")
linesInFile <- length(readLines(con))
close(con)
linesInFile
}
# Get file lengths
linesInFile <- as.data.frame( rbind( countFileLines("en_US.blogs.txt"), countFileLines("en_US.news.txt"), countFileLines("en_US.twitter.txt") ))
rownames(linesInFile) <- c("Blogs", "News", "Twitter")
# Generate models for 1, 2 and 3 grams
ngramLines90 <- list()
fileModels <- list()
cumUniques <- list()
fileWords <- list()
fileModelsInverted <- list()
corpus <- c("Blogs", "News", "Twitter")
for(j in 1:4)
{
# j is the n-gram order (1 = unigrams through 4 = 4-grams) used to generate the models
# Cumulative unique n-gram counts vs. lines read (based on random sampling)
cumUniques[["Blogs"]][[j]] <- cumUniqueStats("en_US.blogs.txt", j)
cumUniques[["News"]][[j]] <- cumUniqueStats("en_US.news.txt", j)
cumUniques[["Twitter"]][[j]] <- cumUniqueStats("en_US.twitter.txt", j)
# Create a linear model based on random samples from source data
fileModels[["Blogs"]][[j]] <- getModel(cumUniques[["Blogs"]][[j]])
fileModels[["News"]][[j]] <- getModel(cumUniques[["News"]][[j]])
fileModels[["Twitter"]][[j]] <- getModel(cumUniques[["Twitter"]][[j]])
# Create inverted models (lines as a function of unique n-grams) for estimating required sample sizes
fileModelsInverted[["Blogs"]][[j]] <- getModelInverted(cumUniques[["Blogs"]][[j]])
fileModelsInverted[["News"]][[j]] <- getModelInverted(cumUniques[["News"]][[j]])
fileModelsInverted[["Twitter"]][[j]] <- getModelInverted(cumUniques[["Twitter"]][[j]])
# Estimate the total number of unique n-grams in each full file
fileWords[["Blogs"]][[j]] <- totalWords(fileModels[["Blogs"]][[j]], linesInFile["Blogs",])
fileWords[["News"]][[j]] <- totalWords(fileModels[["News"]][[j]], linesInFile["News",])
fileWords[["Twitter"]][[j]] <- totalWords(fileModels[["Twitter"]][[j]], linesInFile["Twitter",])
}
Having loaded and sampled the data successfully, we can now summarize it. The table below reports, for each source and n-gram order: the R-squared of the fitted model (Rsquare), the number of lines in the file (Lines), the estimated number of unique n-grams in the full file (Words), and the estimated number of lines that would need to be read to capture 98% of those unique n-grams (Lines98).
rsquareValues <- list()
for(i in 1:4)
{
rsquareValues[[i]] <- as.data.frame(
rbind(
c("Blogs", i, round(summary(fileModels[["Blogs"]][[i]])$r.squared, 3), linesInFile["Blogs",], round(fileWords[["Blogs"]][[i]], 0), round(linesNeeded( fileModelsInverted[["Blogs"]][[i]], fileWords[["Blogs"]][[i]],0.98), 0) ),
c("News", i, round(summary(fileModels[["News"]][[i]])$r.squared, 3), linesInFile["News",], round(fileWords[["News"]][[i]], 0), round(linesNeeded( fileModelsInverted[["News"]][[i]], fileWords[["News"]][[i]],0.98), 0) ),
c("Twitter", i, round(summary(fileModels[["Twitter"]][[i]])$r.squared, 3), linesInFile["Twitter",], round(fileWords[["Twitter"]][[i]], 0), round(linesNeeded(fileModelsInverted[["Twitter"]][[i]], fileWords[["Twitter"]][[i]],0.98), 0) ) ))
colnames(rsquareValues[[i]]) <- c("Source", "Grams", "Rsquare", "Lines", "Words", "Lines98")
}
rbind(rsquareValues[[1]], rsquareValues[[2]], rsquareValues[[3]], rsquareValues[[4]])
## Source Grams Rsquare Lines Words Lines98
## 1 Blogs 1 0.838 899288 3679 153484
## 2 News 1 0.696 77259 3512 22015
## 3 Twitter 1 0.861 2360148 1556 594778
## 4 Blogs 2 0.815 899288 10051 168590
## 5 News 2 0.641 77259 8111 24005
## 6 Twitter 2 0.767 2360148 3131 376485
## 7 Blogs 3 0.81 899288 11901 196038
## 8 News 3 0.682 77259 9107 29480
## 9 Twitter 3 0.744 2360148 3269 405666
## 10 Blogs 4 0.814 899288 11983 221547
## 11 News 4 0.695 77259 9110 32815
## 12 Twitter 4 0.687 2360148 3069 337220
Below, we have plotted the cumulative unique n-gram counts against the number of lines sampled for each corpus, with the fitted prediction model overlaid as a red line.
i = 1
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))
i = 2
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))
i = 3
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))
i = 4
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))
We still need to compile full n-gram frequency lists before we can produce frequency bar charts; a sketch of the plotting code we plan to use is included (commented out) below.
#library(ggplot2)
# df_uni <- data.frame("unigram"=names(freq_uni[ord_uni]), "freq"=freq_uni[ord_uni])
# df_bi <- data.frame("unigram"=names(freq_bi[ord_bi]), "freq"=freq_bi[ord_bi])
# df_tri <- data.frame("unigram"=names(freq_tri[ord_tri]), "freq"=freq_tri[ord_tri])
# ggplot(df_uni[1:40,], aes(factor(unigram, levels = unique(unigram)), freq)) +
# geom_bar(stat = 'identity') +
# theme(axis.text.x=element_text(angle=90)) +
# xlab('Unigram') +
# ylab('Frequency')
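As an illustration of how those frequency lists could be built from the machinery above (a sketch only; the object names blogSamples, blogText and freqUni are introduced here for illustration and are not part of the pipeline):
blogSamples <- randomLineSets("en_US.blogs.txt")   # 200 sets of 250 lines each
blogText    <- unlist(blogSamples[1:20])           # use the first 5,000 sampled lines
freqUni     <- FindNGram(blogText, 1)              # data frame: WordOrString, Count
head(freqUni, 10)                                  # ten most frequent unigrams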
According to our analysis, the number of unique words in each corpus grows quickly over the first few thousand lines and then increases only very slowly as additional lines are read. We can leverage this to make the raw data analysis far more efficient: rather than reading an entire corpus, we can capture roughly 98% of its unique words by reading only a small random sample of it.
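To make the 98% figure concrete, the models fitted above tell us how large such a sample would need to be for the Blogs file (a short sketch; blogsWords and blogsSample are names introduced here for illustration):
blogsWords  <- fileWords[["Blogs"]][[1]]   # estimated unique unigrams in the full file
blogsSample <- linesNeeded(fileModelsInverted[["Blogs"]][[1]], blogsWords, 0.98)
round(blogsSample)   # about 153,484 of 899,288 lines (the Lines98 value in the table above)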