## Loading required package: NLP
## Package version: 1.4.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## Loading required package: RColorBrewer
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Overview

This milestone report is one of the required deliverables of the Data Science Capstone Project, which in turn is part of the Data Science Specialization by Johns Hopkins University on Coursera. The following tasks are accomplished in this milestone report:

For this project, I am using the “quanteda” package as it provides a rich set of text analysis features coupled with excellent performance relative to Java-based R packages for text analysis. More information here [https://github.com/lgreski/datasciencectacontent/blob/master/markdown/capstone-choosingATextPackage.md].

A summary of the dataset

#' Summarise a text file: size, line count, longest line and word count.
#'
#' @param fileName Path of the text file to summarise.
#' @return A character vector of length 5: the file name, the file size in MB
#'   (formatted with two decimals), the number of lines, the length in
#'   characters of the longest line, and the number of whitespace-separated
#'   words.
fileStat <- function(fileName) {
  # Passing the path lets readLines() open and close the connection itself,
  # so nothing is left open if reading fails. skipNul = TRUE drops embedded
  # NUL bytes instead of truncating the affected lines.
  lines <- readLines(fileName, warn = FALSE, skipNul = TRUE)

  size_mb <- file.size(fileName) / 1024^2

  # Length of the longest line. max() is used rather than which.max(), which
  # would return the *position* of the longest line. An empty file reports 0.
  line_lengths <- nchar(lines)
  max_chars <- if (length(line_lengths) > 0) max(line_lengths) else 0L

  # Words are the fields obtained by splitting each line on runs of whitespace
  word_count <- sum(lengths(strsplit(lines, "\\s+")))

  c(fileName,
    format(round(size_mb, 2), nsmall = 2),
    length(lines),
    max_chars,
    word_count)
}
# The three raw English-language source files
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")

# Run fileStat() on every file and flatten the results into one vector
test_summary <- unlist(lapply(files, fileStat))
## Warning in readLines(con): incomplete final line found on 'en_US.news.txt'
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
# Reshape the flat summary vector into one row per file (five fields each),
# label the columns and print the table. ncol = 5 matches the number of
# fields returned per file, so the row count follows the number of files.
df <- data.frame(matrix(unlist(test_summary), ncol = 5, byrow = TRUE),
                 stringsAsFactors = FALSE)
colnames(df) <- c("Text File", "Size(MB)", "Line Count", "Max Line Length", "Words Count")
print(df)
##           Text File Size(MB) Line Count Max Line Length Words Count
## 1   en_US.blogs.txt   200.42     899288          483415    37334441
## 2    en_US.news.txt   196.28      77259           14556     2643972
## 3 en_US.twitter.txt   159.36    2360148         1484357    30373792

Random sampling of the dataset

#' Write a reproducible random sample of lines from one of the en_US files.
#'
#' Reads "en_US.<filename>.txt" from the working directory, keeps each line
#' independently with probability `p`, and writes the kept lines to
#' "sample_en_US.<filename>.txt" in the working directory.
#'
#' @param filename Middle part of the file name, e.g. "blogs".
#' @param p Probability of keeping any given line.
#' @return Invisibly `NULL`; called for its side effect of writing the sample.
samplingFun <- function(filename, p) {
  # skipNul = TRUE drops embedded NUL bytes instead of truncating those lines
  tempData <- readLines(paste0("en_US.", filename, ".txt"),
                        warn = FALSE, skipNul = TRUE)

  # Fixed seed so that the sample is reproducible (this resets the global RNG)
  set.seed(100)

  # One Bernoulli draw per line. A shorter mask would be recycled over long
  # files (a periodic pattern) and would index past the end of short files,
  # producing NA lines.
  keep <- rbinom(n = length(tempData), size = 1, prob = p) == 1

  writeLines(tempData[keep], paste0("sample_en_US.", filename, ".txt"))
  invisible(NULL)
}
# Short names of the three source files and the sampling probability for each
files <- c("blogs", "news", "twitter")
prob <- c(0.01, 0.01, 0.01)

# Sample every file with its matching probability
for (i in seq_along(files)) {
  samplingFun(files[i], prob[i])
}
## Warning in readLines(con): incomplete final line found on 'en_US.news.txt'
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul

Creation of corpus

Here I am combining the data from all three sample files created above into a single monolithic corpus. This will subsequently be used for further analysis.

# Paths of the three sampled files produced by the sampling step
samplefiles <- c("sample_en_US.blogs.txt", "sample_en_US.news.txt", "sample_en_US.twitter.txt")

# Read the sampled files and wrap them in a single quanteda corpus
myCorpus <- corpus(x = readtext(file = samplefiles))

Cleaning of the dataset and tokenizing the corpus

In this section I will clean the sample data set (the corpus created above) followed by profanity filtering from the corpus, i.e., removal of the bad words. This section performs the following functions: 1. Removal of numbers 2. Removal of whitespaces 3. Transforming content to lower case 4. Removal of punctuation 5. Removal of stopwords 6. Profanity filtering

# Split the corpus into word tokens: punctuation, numbers and symbols are
# dropped, while remove_hyphens = FALSE leaves hyphens in place
myTokens <- tokens(
  myCorpus,
  what = "word",
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_hyphens = FALSE
)
                   

# Download the list of profane words and flatten it into a character vector
profanity <- as.character(unlist(
  read.table(url("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"))
))

# n-gram sizes analysed in this report: unigrams, bigrams and trigrams
nGram <- c(1, 2, 3)

# Builds a list with one data frame per n-gram size in nGram (unigram, bigram,
# trigram). Each data frame has one row per term and one column per document.

myNgramDF <- lapply(nGram, function(x){
  
                      # Build the document-feature matrix for the x-grams of myTokens.
                      # NOTE(review): the `remove` patterns are applied to the n-gram
                      # features, not to the individual tokens. For x > 1 the results
                      # still contain features such as "at_the", so stopwords inside
                      # multi-word n-grams are not dropped. Confirm whether filtering
                      # should happen before the n-grams are formed.
                      mydfm <- dfm(tokens_ngrams(myTokens, n=x), 
                                    tolower=TRUE,                                  #Convert to lowercase
                                    remove = c(stopwords("english"), profanity))   #Remove stopwords and profanity

                      # Keep only terms with min_termfreq = 10, i.e. a total count of at
                      # least 10 across the documents
                      mydfm <- dfm_trim(mydfm, min_termfreq = 10)
                      
                      # Convert the DFM to a data frame
                      mydf <- convert(mydfm, to = "data.frame")
                      
                      # Transpose so that terms become rows and documents become columns.
                      # NOTE(review): presumably the first column holds the document
                      # names, so the transposed result is text rather than numeric;
                      # plotFreqWords converts the counts back to numbers.
                      mydf <- t(mydf)
                      
                      # Use the first row as the column names, then drop that row and
                      # turn the result back into a data frame
                      colnames(mydf) <- mydf[1,]
                      mydf <- mydf[-1,]
                      mydf <- as.data.frame(mydf)
})

# Print the first rows of every n-gram table. Iterating over the list itself
# keeps the loop in step with the number of n-gram sizes that were built.

for (i in seq_along(myNgramDF)) {
  cat("\n\n\t\tThis is how the Document Term Matrix of",i,"-gram looks like.\n\n")
  print(head(myNgramDF[[i]]))
}
## 
## 
##      This is how the Document Term Matrix of 1 -gram looks like.
## 
##        sample_en_US.blogs.txt sample_en_US.news.txt
## back                      828                    32
## shop                       85                     3
## little                    747                    25
## cafe                       17                     0
## post                      329                     6
## menu                       31                     1
##        sample_en_US.twitter.txt
## back                       1001
## shop                         47
## little                      328
## cafe                         25
## post                        146
## menu                         21
## 
## 
##      This is how the Document Term Matrix of 2 -gram looks like.
## 
##          sample_en_US.blogs.txt sample_en_US.news.txt
## at_the                      827                    83
## the_back                     63                     2
## back_of                      24                     4
## of_the                     3227                   235
## the_shop                     19                     0
## there_is                    322                    13
##          sample_en_US.twitter.txt
## at_the                        652
## the_back                       24
## back_of                        12
## of_the                        986
## the_shop                        5
## there_is                      125
## 
## 
##      This is how the Document Term Matrix of 3 -gram looks like.
## 
##              sample_en_US.blogs.txt sample_en_US.news.txt
## the_back_of                      21                     2
## back_of_the                      10                     2
## there_is_a                       91                     0
## is_a_little                      14                     0
## which_is_the                     15                     0
## of_this_post                     10                     0
##              sample_en_US.twitter.txt
## the_back_of                         8
## back_of_the                         6
## there_is_a                         36
## is_a_little                         3
## which_is_the                        6
## of_this_post                        0

Exploratory data analysis of the corpus

#' Plot the most frequent n-grams for each of the three documents.
#'
#' Takes the n-gram table myNgramDF[[x]] (read from the enclosing environment)
#' and, for every document column, draws a horizontal bar chart and a word
#' cloud of the `top_n` most frequent terms.
#'
#' @param x Index into myNgramDF; also shown as the n-gram size in the titles.
#' @param top_n Number of most frequent terms to show. Defaults to 15, the
#'   number stated in the plot titles.
#' @return Invisibly `NULL`; called for its plotting side effects.
plotFreqWords <- function(x, top_n = 15) {
  counts <- myNgramDF[[x]]

  # The n-gram tables hold the counts as factors (or as character strings on
  # R >= 4.0, where text is no longer converted to factors by default).
  # Convert both to numeric so the counts can be sorted and plotted.
  counts[] <- lapply(counts, function(col) {
    if (is.factor(col) || is.character(col)) as.numeric(as.character(col)) else col
  })

  # Move the terms from the row names into a regular column
  counts <- data.frame(DocTerms = row.names(counts), counts,
                       row.names = NULL, stringsAsFactors = FALSE)

  # Display names of the documents, in column order; used in the titles
  doc <- c("US Blogs", "US News", "US Twitter")

  # One bar chart and one word cloud per document
  for (i in seq_along(doc)) {
    main <- paste0(top_n, " Most Frequent ", x, "-grams ", "in ", doc[i], " article")

    # Term column plus the count column of document i, most frequent first.
    # head() returns fewer rows instead of NA padding when the table is short.
    b <- counts[, c(1, i + 1)]
    colnames(b) <- c("word", "Frequency")
    b <- head(b[order(-b$Frequency), ], top_n)

    p <- ggplot(data = b,
                aes(x = reorder(word, Frequency),
                    y = Frequency,
                    fill = Frequency)) +
      geom_bar(stat = "identity") +
      xlab("Word") +
      labs(title = main) +
      coord_flip()

    print(p)

    cat(paste0("\n\n\tWord-cloud of ", top_n, " Most Frequent "), x, "-grams ", "in ", doc[i], " article")

    # The same top_n rows feed both words and frequencies so the two vectors
    # always have equal length. wordcloud() draws as a side effect and its
    # return value is NULL, so the result is not printed.
    wordcloud(b$word, b$Frequency,
              colors = brewer.pal(8, "Dark2"))
  }

  invisible(NULL)
}
# Draw the frequency plots and word clouds for every n-gram table
for (i in seq_along(myNgramDF)) {
  plotFreqWords(i)
}

## 
## 
##  Word-cloud of 15 Most Frequent  1 -grams  in  US Blogs  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  1 -grams  in  US News  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  1 -grams  in  US Twitter  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  2 -grams  in  US Blogs  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  2 -grams  in  US News  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  2 -grams  in  US Twitter  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  3 -grams  in  US Blogs  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  3 -grams  in  US News  article

## NULL

## 
## 
##  Word-cloud of 15 Most Frequent  3 -grams  in  US Twitter  article

## NULL

Interesting findings

I had initially used the TM package for creating the corpus and for tokenization of the corpus. However, I observed that the TM package is too slow and cannot be used for even a moderately large data set. When I switched to quanteda, I found that it is faster than the TM package by orders of magnitude. The TM package was unable to clean a 3.5 MB corpus, but quanteda was able to process a corpus as large as 12 MB.

Tentative steps for further analysis of the dataset