Peer-graded Assignment: Exploratory Data Analysis

The goal of this project is to demonstrate that we have become familiar with the data and that we are on track to create our prediction algorithm. This report on RPubs (http://rpubs.com/) explains our exploratory analysis and our goals for the eventual app and algorithm.

This document aims to be concise: it explains only the major features of the data we have identified and briefly summarizes our plans for creating the prediction algorithm and Shiny app in a way that a non-data scientist manager can understand.

We've made use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to show that we can load and summarize the data, and to lay the groundwork for the prediction algorithm and Shiny app.

set.seed(1)
library(tau)
library(RTextTools)
library(stringr)
library(stringi)
library(RWeka)
cumLines <- NA
cumSamples <- list()
lineList <- list()
outputLines <- c()


# ################################
# Fit a model predicting unique words from the number of lines read in the corpus
# ################################
getModel <- function(tmpWL)
{
  Lines <- tmpWL$Lines ; Words <- tmpWL$Words
  lm( log(Words) ~ log(Lines) ) # Fitting a power law
}


# ################################
# Fit a model predicting the lines needed for a specified number of unique words in the corpus
# ################################
getModelInverted <- function(tmpWL)
{
  Lines <- tmpWL$Lines ; Words <- tmpWL$Words
  lm( log(Lines) ~ log(Words) ) # Fitting a power law
}


# ################################
# Plot the model and sample data
# ################################
plotModel <- function(tmpWL, tmpModel, tmpYlim, tmpHeadline)
{
  plot(tmpWL$Lines, tmpWL$Words, ylim=c(0,tmpYlim), xlab="Lines", ylab="Words", main = tmpHeadline, pch=16)
  A <- seq(from=0, to=max(tmpWL$Lines), by=1000)
  B <- exp(predict(tmpModel, list(Lines=A)))  # model predictions back on the original scale
  lines(A, B, col="red",lwd=2.5)

}


# ############################
# Estimate the total number of unique words in a file from the fitted model
# ############################
totalWords <- function(tmpModel, tmpLinesInFile)
{
  exp(predict(tmpModel, list(Lines = tmpLinesInFile)))
}


# ############################
# Determine the number of lines required to capture a given proportion of the total unique words
# ############################
linesNeeded <- function(tmpModel, tmpWordsInFile, tmpProportion)
{
  exp(predict(tmpModel,list( Words=(tmpWordsInFile * tmpProportion))))
}
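
To show how these helpers fit together, here is a minimal sketch on made-up numbers (the toyWL data frame and the 50,000-line file size are hypothetical, purely for illustration; the real analysis further down uses counts sampled from the corpora):

# Hypothetical cumulative counts: unique words seen after reading N sampled lines
toyWL <- data.frame(Lines = c(250, 500, 1000, 2000, 4000),
                    Words = c(900, 1500, 2400, 3800, 6000))

toyModel    <- getModel(toyWL)          # log(Words) ~ log(Lines)
toyInverted <- getModelInverted(toyWL)  # log(Lines) ~ log(Words)

# Extrapolate total unique words for a hypothetical 50,000-line file ...
estWords <- totalWords(toyModel, 50000)

# ... and estimate how many lines would need to be read to capture 98% of them
linesNeeded(toyInverted, estWords, 0.98)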


# ################################
# Create a dataset of cumulative unique words based on lines read
# ################################
cumUniqueStats <- function(tmpFileName, tmpGrams)
{

  # Tokenize each of the 200 random 250-line samples into n-grams
  ngramSets <- lapply(randomLineSets(tmpFileName), FindNGram, grams=tmpGrams)
  cumUniqueWords <- data.frame(250, length(ngramSets[[1]]$WordOrString))
  names(cumUniqueWords) <- c("Lines", "Words")
  uniqueWords <- ngramSets[[1]]$WordOrString
  lineCount <- 250
  
  for(i in 2:200)
  {
    lineCount <- lineCount + 250
    uniqueWords <- unique(c(uniqueWords, ngramSets[[i]]$WordOrString))
    uniqueWordLen <- length(uniqueWords)
    cumUniqueWords <- rbind(cumUniqueWords, c(lineCount, uniqueWordLen))
  }
  cumUniqueWords
}


# ################################
# Create random sets of lines from document
# ################################
randomLineSets <- function(tmpFileName)
{
  lineList <- list()
  cumSamples <- list()

  # Read the file once; the sampled sets are extracted by line number below
  con <- file(tmpFileName, "rt")
  allLines <- readLines(con, warn = FALSE)
  close(con)
  linesInFile <- length(allLines)

  # Draw 200 non-overlapping random sets of 250 line numbers each
  lineNums <- 1:linesInFile
  cumLines <- sort(sample(lineNums, 250, replace = FALSE))
  lineList[[1]] <- cumLines
  for(i in 2:200)
  {
    tmpNums <- sort(sample(lineNums[-cumLines], 250, replace = FALSE))
    cumLines <- sort(c(cumLines, tmpNums))
    lineList[[i]] <- tmpNums
  }

  for(i in 1:200)
  {
    cumSamples[[i]] <- allLines[lineList[[i]]]
  }
  cumSamples
}



# ################################
# Find n-grams function
# ################################
FindNGram <- function(corpusDf, grams) {
  corpusDf <- tolower(corpusDf)
  tempGram <- NGramTokenizer(corpusDf, Weka_control(min = grams, max = grams, delimiters = " \\r\\n\\t.,;:\"()?!"))
  if(length(tempGram))
  {
    tempGram1 <- data.frame(table(tempGram))
    tempGram2 <- tempGram1[order(tempGram1$Freq, decreasing = TRUE),]
    colnames(tempGram2) <- c("WordOrString","Count")
    tempGram2 <- tempGram2[grepl("^[A-Za-z ]+$", as.character(tempGram2$WordOrString)),]  # keep purely alphabetic n-grams
    tempGram2
  }
}
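
As a quick, hypothetical illustration of what FindNGram returns (two toy strings, bigrams; requires RWeka with a working Java installation):

# Returns a data frame of bigrams sorted by Count; "the quick" should come out on top
FindNGram(c("the quick brown fox", "the quick red fox"), 2)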


# ################################
# Count the lines in a file
# ################################
countFileLines <- function(tmpFileName)
{
  con <- file(tmpFileName, "rt")
  linesInFile <- length(readLines(con))
  close(con)
  linesInFile
}

Identify key data

# Get file lengths
linesInFile <- as.data.frame( rbind( countFileLines("en_US.blogs.txt"), countFileLines("en_US.news.txt"), countFileLines("en_US.twitter.txt") ))
rownames(linesInFile) <- c("Blogs", "News", "Twitter")

# Generate models for 1- to 4-grams
ngramLines90 <- list()
fileModels <- list()
cumUniques <- list()
fileWords <- list()
fileModelsInverted <- list()
corpus <- c("Blogs", "News", "Twitter")

for(j in 1:4)
{
  # j is the n-gram order used to generate the model (1 = unigrams ... 4 = 4-grams)
    
  # Cumulative unique word statistics, correlated with line counts (based on random sampling)
  cumUniques[["Blogs"]][[j]] <- cumUniqueStats("en_US.blogs.txt", j)
  cumUniques[["News"]][[j]] <- cumUniqueStats("en_US.news.txt", j)
  cumUniques[["Twitter"]][[j]] <- cumUniqueStats("en_US.twitter.txt", j)

  # Create a linear model based on random samples from source data    
  fileModels[["Blogs"]][[j]] <- getModel(cumUniques[["Blogs"]][[j]])
  fileModels[["News"]][[j]] <- getModel(cumUniques[["News"]][[j]])
  fileModels[["Twitter"]][[j]] <- getModel(cumUniques[["Twitter"]][[j]])

  # Create the inverted model (lines as a function of unique words) from the same samples
  fileModelsInverted[["Blogs"]][[j]] <- getModelInverted(cumUniques[["Blogs"]][[j]])
  fileModelsInverted[["News"]][[j]] <- getModelInverted(cumUniques[["News"]][[j]])
  fileModelsInverted[["Twitter"]][[j]] <- getModelInverted(cumUniques[["Twitter"]][[j]])  
   
  # Estimate the total number of unique n-grams in each full file
  fileWords[["Blogs"]][[j]] <- totalWords(fileModels[["Blogs"]][[j]], linesInFile["Blogs",])
  fileWords[["News"]][[j]] <- totalWords(fileModels[["News"]][[j]], linesInFile["News",])  
  fileWords[["Twitter"]][[j]] <- totalWords(fileModels[["Twitter"]][[j]], linesInFile["Twitter",])  
    
}

Key question

Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?

In the table below, you'll see a number of key statistics (a short worked reading of one row follows the table output):

  • Source (Blogs, News or Twitter)
  • Grams (the number of words in the n-gram used for analysis)
  • Rsquare (the R-squared of the fitted log-log model)
  • Lines (the total number of lines contained in the corpus)
  • Words (the estimated number of unique n-grams in the full corpus, extrapolated from the model)
  • Lines98 (how many lines must be read in order to capture 98% of all unique n-grams contained within the corpus)
rsquareValues <- list()

for(i in 1:4)
{
  
  rsquareValues[[i]] <- as.data.frame(
    rbind(
      c("Blogs", i, round(summary(fileModels[["Blogs"]][[i]])$r.squared, 3), linesInFile["Blogs",], round(fileWords[["Blogs"]][[i]], 0), round(linesNeeded( fileModelsInverted[["Blogs"]][[i]], fileWords[["Blogs"]][[i]],0.98), 0) ), 
      c("News", i, round(summary(fileModels[["News"]][[i]])$r.squared, 3), linesInFile["News",], round(fileWords[["News"]][[i]], 0), round(linesNeeded( fileModelsInverted[["News"]][[i]], fileWords[["News"]][[i]],0.98), 0) ), 
      c("Twitter", i, round(summary(fileModels[["Twitter"]][[i]])$r.squared, 3), linesInFile["Twitter",], round(fileWords[["Twitter"]][[i]], 0), round(linesNeeded(fileModelsInverted[["Twitter"]][[i]], fileWords[["Twitter"]][[i]],0.98), 0) )  ))
  
  colnames(rsquareValues[[i]]) <- c("Source", "Grams", "Rsquare", "Lines", "Words", "Lines98")

}

rbind(rsquareValues[[1]], rsquareValues[[2]], rsquareValues[[3]], rsquareValues[[4]])
##     Source Grams Rsquare   Lines Words Lines98
## 1    Blogs     1   0.838  899288  3679  153484
## 2     News     1   0.696   77259  3512   22015
## 3  Twitter     1   0.861 2360148  1556  594778
## 4    Blogs     2   0.815  899288 10051  168590
## 5     News     2   0.641   77259  8111   24005
## 6  Twitter     2   0.767 2360148  3131  376485
## 7    Blogs     3    0.81  899288 11901  196038
## 8     News     3   0.682   77259  9107   29480
## 9  Twitter     3   0.744 2360148  3269  405666
## 10   Blogs     4   0.814  899288 11983  221547
## 11    News     4   0.695   77259  9110   32815
## 12 Twitter     4   0.687 2360148  3069  337220
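
To make one row concrete (this is plain arithmetic on the table above, not new model output): for unigrams in the blog corpus, roughly 17% of the lines would need to be read to capture about 98% of the unique words.

# Share of blog lines needed for ~98% unigram coverage, per the table above
round(153484 / 899288 * 100, 1)   # approximately 17.1%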

Has the data scientist made basic plots, such as histograms to illustrate features of the data?

Below, we've plotted the cumulative unique n-gram counts against the number of lines read for each of the corpora. We've also overlaid the fitted prediction model as a red line.

Plot of 1-Grams and Predictive Models for Each File

i = 1
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))

Plot of 2-Grams and Predictive Models for Each File

i = 2
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))

Plot of 3-Grams and Predictive Models for Each File

i = 3
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))

Plot of 4-Grams and Predictive Models for Each File

i = 4
par(mfrow=c(2,2))
plotModel(cumUniques[["Blogs"]][[i]], fileModels[["Blogs"]][[i]], 16000, paste("Blog ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["News"]][[i]], fileModels[["News"]][[i]], 12000, paste("News ", i, "-Gram Distribution", sep=""))
plotModel(cumUniques[["Twitter"]][[i]], fileModels[["Twitter"]][[i]], 8000, paste("Twitter ", i, "-Gram Distribution", sep=""))

Frequency charts for most common NGrams

Frequency lists still need to be compiled before these charts can be created, so the plotting code below is left commented out as a placeholder; a sketch of how the lists could be compiled follows it.

#library(ggplot2)

# df_uni  <- data.frame("unigram"=names(freq_uni[ord_uni]), "freq"=freq_uni[ord_uni])
                      
# df_bi  <- data.frame("unigram"=names(freq_bi[ord_bi]), "freq"=freq_bi[ord_bi])

# df_tri  <- data.frame("unigram"=names(freq_tri[ord_tri]), "freq"=freq_tri[ord_tri])

# ggplot(df_uni[1:40,], aes(factor(unigram, levels = unique(unigram)), freq)) +
  # geom_bar(stat = 'identity') +
  # theme(axis.text.x=element_text(angle=90)) +
  # xlab('Unigram') +
  # ylab('Frequency')
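
One possible way to compile such frequency lists from the helpers defined earlier is sketched below. This is only an assumption about the eventual approach, not final code: it pools ten of the random 250-line blog samples and reuses FindNGram, which already returns counts sorted by frequency.

library(ggplot2)

# Pool ten of the random 250-line blog samples and count unigrams
sampleLines <- unlist(randomLineSets("en_US.blogs.txt")[1:10])
unigrams    <- FindNGram(sampleLines, 1)      # columns: WordOrString, Count

df_uni <- head(unigrams, 40)                  # 40 most frequent unigrams

ggplot(df_uni, aes(factor(WordOrString, levels = unique(WordOrString)), Count)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Unigram") +
  ylab("Frequency")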

Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

According to our analysis, the number of unique words contained within each corpus grows rapidly over the first few thousand lines and then levels off, increasing only slowly as more lines are read. This can be leveraged to make the raw data processing more efficient: instead of reading an entire corpus, we can capture roughly 98% of its unique words by reading only a fraction of its lines (the Lines98 column in the table above), drawn as a random sample.
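
For example, one way this could be put to use when building a training set is sketched below (this assumes the models fitted above are still in memory; the object names come from this report, and the approach itself is only a suggestion):

# Lines needed for ~98% unigram coverage of the blog corpus, per the inverted model
n98 <- round(linesNeeded(fileModelsInverted[["Blogs"]][[1]], fileWords[["Blogs"]][[1]], 0.98))

# Draw a random sample of that many blog lines to use as training data
blogLines   <- readLines("en_US.blogs.txt", warn = FALSE)
trainSample <- sample(blogLines, n98)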