## Loading required package: NLP
## Package version: 1.4.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## Loading required package: RColorBrewer
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
This milestone report is one of the required deliverables of the Data Science Capstone Project, which in turn is part of the Data Science Specialization offered by Johns Hopkins University on Coursera. The following tasks are accomplished in this milestone report:
For this project, I am using the “quanteda” package as it provides a rich set of text analysis features coupled with excellent performance relative to Java-based R packages for text analysis. More information here [https://github.com/lgreski/datasciencectacontent/blob/master/markdown/capstone-choosingATextPackage.md].
#' Summarise a text file: size, line count, longest line and word count.
#'
#' @param fileName Path of the text file to summarise.
#' @return A character vector of length 5: the file name, the file size in MB
#'   (formatted with two decimals), the number of lines, the length in
#'   characters of the longest line, and the whitespace-delimited word count.
fileStat <- function(fileName) {
  # Passing the path (not a connection) lets readLines() open and close the
  # file itself, so nothing is left open if reading fails. skipNul = TRUE
  # drops embedded nul bytes instead of cutting the line short at them.
  lines <- readLines(fileName, skipNul = TRUE)
  # File size in megabytes
  size <- file.info(fileName)$size / 1024^2
  # Length of the longest line; max() gives the length itself, whereas
  # which.max() would give the position of that line. An empty file has no
  # lines, so report 0 rather than calling max() on an empty vector.
  maxchars <- if (length(lines) > 0) max(nchar(lines)) else 0L
  # Words are runs of text separated by whitespace
  word_count <- sum(lengths(strsplit(lines, "\\s+")))
  c(fileName,
    format(round(size, 2), nsmall = 2),
    length(lines),
    maxchars,
    word_count)
}
# The three raw corpus files to profile
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
# Run fileStat() on each file in turn and flatten the results into one vector
test_summary <- unlist(lapply(files, fileStat))
## Warning in readLines(con): incomplete final line found on 'en_US.news.txt'
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
# Converting the flat summary vector into a data frame and printing it.
# fileStat() returns five fields per file, so fixing the column count at 5
# yields one row per file however many files were summarised.
df <- data.frame(matrix(unlist(test_summary), ncol = 5, byrow = TRUE))
colnames(df) <- c("Text File", "Size(MB)", "Line Count", "Max Line Length", "Words Count")
print(df)
## Text File Size(MB) Line Count Max Line Length Words Count
## 1 en_US.blogs.txt 200.42 899288 483415 37334441
## 2 en_US.news.txt 196.28 77259 14556 2643972
## 3 en_US.twitter.txt 159.36 2360148 1484357 30373792
#' Write a random sample of the lines of one en_US source file.
#'
#' Reads "en_US.<filename>.txt" from the working directory, keeps each line
#' independently with probability `p`, and writes the kept lines to
#' "sample_en_US.<filename>.txt" in the same directory.
#'
#' @param filename Middle part of the file name, e.g. "blogs".
#' @param p Probability of keeping each individual line.
#' @return NULL, invisibly; called for the side effect of writing the sample.
samplingFun <- function(filename, p) {
  # Passing paths lets readLines()/writeLines() manage their own connections,
  # so none is left open if an error occurs part-way through
  tempData <- readLines(paste0("en_US.", filename, ".txt"), skipNul = TRUE)
  # Fixed seed so the same sample is drawn on every run
  set.seed(100)
  # One Bernoulli draw per line. Drawing a fixed 1000 values would recycle the
  # same keep/drop pattern every 1000 lines on long files and would index past
  # the end (producing NA lines) on files shorter than 1000 lines.
  keep <- rbinom(n = length(tempData), size = 1, prob = p) == 1
  writeLines(tempData[keep], paste0("sample_en_US.", filename, ".txt"))
  invisible(NULL)
}
# Base names of the source files and the sampling probability for each one
files <- c("blogs", "news", "twitter")
prob <- c(0.01, 0.01, 0.01)
# Every file needs its own probability
stopifnot(length(files) == length(prob))
# Sample each file; seq_along() keeps the loop in step with `files`
for (i in seq_along(files)) {
  samplingFun(files[i], prob[i])
}
## Warning in readLines(con): incomplete final line found on 'en_US.news.txt'
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
Here I combine the data from all three sample files created above into a single monolithic corpus. This corpus is then used for all further analysis.
# The three sample files to load
samplefiles <- c(
  "sample_en_US.blogs.txt",
  "sample_en_US.news.txt",
  "sample_en_US.twitter.txt"
)
# Read the sample files and wrap them in one quanteda corpus
myCorpus <- corpus(readtext(file = samplefiles))
In this section I clean the sample data set (the corpus created above) and then filter profanity, i.e., remove bad words, from the corpus. This section performs the following steps: 1. Removal of numbers 2. Removal of whitespace 3. Transforming content to lower case 4. Removal of punctuation 5. Removal of stopwords 6. Profanity filtering
# Split the corpus into word tokens, dropping numbers, punctuation and
# symbols while leaving hyphenated words in one piece
myTokens <- tokens(
  x = myCorpus,
  what = "word",
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_hyphens = FALSE
)
# Download the list of profane words and flatten it into a character vector
profanity <- as.character(
  unlist(
    read.table(url("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"))
  )
)
# n-gram sizes analysed in this report: unigrams, bigrams and trigrams
nGram <- c(1, 2, 3)
# Build one term-document data frame per n-gram size: rows are terms, columns
# are the source documents, and cells are numeric counts
myNgramDF <- lapply(nGram, function(n) {
  # Document-feature matrix of the n-grams, lower-cased, with every feature
  # that exactly matches a stopword or a profane word dropped.
  # NOTE(review): for n > 1 the features are joined n-grams such as "of_the",
  # so this filter does not drop n-grams that merely contain a stopword or a
  # profane word -- confirm whether that is intended.
  ngram_dfm <- dfm(tokens_ngrams(myTokens, n = n),
                   tolower = TRUE,
                   remove = c(stopwords("english"), profanity))
  # Keep only the terms that occur at least 10 times across all documents
  ngram_dfm <- dfm_trim(ngram_dfm, min_termfreq = 10)
  # Transpose the dense count matrix so terms become rows and documents become
  # columns. Going straight from the numeric matrix keeps the counts numeric;
  # transposing a data frame that carries a document-name column would turn
  # every count into text.
  as.data.frame(t(as.matrix(ngram_dfm)))
})
# Preview the first rows of the table built for each n-gram size
for (i in seq_len(3)) {
  cat("\n\n\t\tThis is how the Document Term Matrix of", i, "-gram looks like.\n\n")
  print(head(myNgramDF[[i]], n = 6L))
}
##
##
## This is how the Document Term Matrix of 1 -gram looks like.
##
## sample_en_US.blogs.txt sample_en_US.news.txt
## back 828 32
## shop 85 3
## little 747 25
## cafe 17 0
## post 329 6
## menu 31 1
## sample_en_US.twitter.txt
## back 1001
## shop 47
## little 328
## cafe 25
## post 146
## menu 21
##
##
## This is how the Document Term Matrix of 2 -gram looks like.
##
## sample_en_US.blogs.txt sample_en_US.news.txt
## at_the 827 83
## the_back 63 2
## back_of 24 4
## of_the 3227 235
## the_shop 19 0
## there_is 322 13
## sample_en_US.twitter.txt
## at_the 652
## the_back 24
## back_of 12
## of_the 986
## the_shop 5
## there_is 125
##
##
## This is how the Document Term Matrix of 3 -gram looks like.
##
## sample_en_US.blogs.txt sample_en_US.news.txt
## the_back_of 21 2
## back_of_the 10 2
## there_is_a 91 0
## is_a_little 14 0
## which_is_the 15 0
## of_this_post 10 0
## sample_en_US.twitter.txt
## the_back_of 8
## back_of_the 6
## there_is_a 36
## is_a_little 3
## which_is_the 6
## of_this_post 0
#' Plot the most frequent n-grams of each document as a bar chart and a
#' word cloud.
#'
#' Reads the n-gram table from the global list `myNgramDF` and draws one bar
#' chart and one word cloud per document column.
#'
#' @param x Index into `myNgramDF`; also shown as the n-gram size in titles.
#' @param top_n Number of most frequent terms shown in each bar chart and
#'   word cloud.
#' @return NULL, invisibly; called for its plots and console output.
plotFreqWords <- function(x, top_n = 15L) {
  # Fetch the table for this n-gram size
  counts <- myNgramDF[[x]]
  # The counts may arrive as factor or character columns depending on how the
  # table was built; make them numeric so they can be sorted
  counts[] <- lapply(counts, function(col) {
    if (is.numeric(col)) col else as.numeric(as.character(col))
  })
  # Move the terms out of the row names into an ordinary column
  counts <- data.frame(DocTerms = row.names(counts), counts,
                       row.names = NULL, stringsAsFactors = FALSE)
  # Display names of the documents, in column order; used in the plot titles
  doc <- c("US Blogs", "US News", "US Twitter")
  # One bar chart and one word cloud per document
  for (i in seq_along(doc)) {
    main <- paste0(top_n, " Most Frequent ", x, "-grams ", "in ", doc[i], " article")
    b <- counts[, c(1, i + 1)]
    colnames(b) <- c("word", "Frequency")
    b <- b[order(-b$Frequency), ]
    # head() returns fewer rows when the table is short, instead of padding
    # with NA rows; the same subset feeds both plots so they always agree
    top <- head(b, top_n)
    p <- ggplot(data = top,
                aes(x = reorder(word, Frequency),
                    y = Frequency,
                    fill = Frequency)) +
      geom_bar(stat = "identity") +
      xlab("Word") +
      labs(title = main) +
      coord_flip()
    print(p)
    cat(paste0("\n\n\tWord-cloud of ", top_n, " Most Frequent "), x, "-grams ", "in ", doc[i], " article")
    # wordcloud() draws as a side effect; its return value is not printed
    # because doing so only emitted "NULL" in the report
    wordcloud(top$word, top$Frequency,
              colors = brewer.pal(8, "Dark2"))
  }
  invisible(NULL)
}
# Draw the plots for unigrams, bigrams and trigrams in turn
for (i in seq_len(3)) {
  plotFreqWords(i)
}
##
##
## Word-cloud of 15 Most Frequent 1 -grams in US Blogs article
## NULL
##
##
## Word-cloud of 15 Most Frequent 1 -grams in US News article
## NULL
##
##
## Word-cloud of 15 Most Frequent 1 -grams in US Twitter article
## NULL
##
##
## Word-cloud of 15 Most Frequent 2 -grams in US Blogs article
## NULL
##
##
## Word-cloud of 15 Most Frequent 2 -grams in US News article
## NULL
##
##
## Word-cloud of 15 Most Frequent 2 -grams in US Twitter article
## NULL
##
##
## Word-cloud of 15 Most Frequent 3 -grams in US Blogs article
## NULL
##
##
## Word-cloud of 15 Most Frequent 3 -grams in US News article
## NULL
##
##
## Word-cloud of 15 Most Frequent 3 -grams in US Twitter article
## NULL
I initially used the tm package for creating and tokenizing the corpus. However, I observed that tm is too slow to be used even on a moderately large data set. When I switched to quanteda, I found that it is faster than tm by orders of magnitude: tm was unable to clean a 3.5 MB corpus, whereas quanteda was able to process a corpus as large as 12 MB.