This exploratory analysis is the first step in the Johns Hopkins Data Science Capstone Project, which will use machine learning to build a text prediction application. This initial analysis reads in the training dataset, preprocesses the data, and characterizes its contents.
Three files are included in this analysis - blogs, news, and twitter. The first graphs quantify the number of lines and the number of words in each file.
library(readr)
# Read in the txt files from the directory to generate line counts
fileList <- list.files(path =
    '/Users/shawnfoley/Desktop/Coursera/ML Capstone/final/test/',
    pattern = '\\.txt$', full.names = TRUE)
txtList <- lapply(as.list(fileList), read_file)
# Name each file by its type (blogs, news, or twitter), parsed from the filename
names(txtList) <- sapply(fileList, function(x) strsplit(x, '.', fixed = TRUE)[[1]][2])
# Count lines by splitting each file on newline characters
lineCount <- unlist(lapply(txtList, function(x) length(strsplit(x, '\n')[[1]])))
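As a quick sanity check, assuming the files use standard newline endings, base R's readLines() should report the same counts as the strsplit() approach above (a file without a trailing newline can differ by one).
# Sanity check: compare against readLines(); warn = FALSE suppresses the
# incomplete-final-line warning
lineCount.check <- sapply(fileList, function(f) length(readLines(f, warn = FALSE)))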
# Read in the data with the ngram package, which provides preprocessing helpers
library(ngram)
docs <- multiread('/Users/shawnfoley/Desktop/Coursera/ML Capstone/final/test/',
                  extension = "txt")
names(docs) <- sapply(fileList, function(x) strsplit(x, '.', fixed = TRUE)[[1]][2])
# Convert to lower case and strip punctuation before counting words
docs <- lapply(docs, function(x) preprocess(x, case = "lower", remove.punct = TRUE))
# Total and unique word counts, splitting each document on single spaces
wordCount <- unlist(lapply(docs, function(x) length(strsplit(x, ' ')[[1]])))
uniqueWords <- unlist(lapply(docs, function(x) length(unique(strsplit(x, ' ')[[1]]))))
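The ngram package also ships a wordcount() helper that splits on the same single-space separator; as a minimal cross-check, it should agree with the strsplit() totals above.
# Cross-check of the total word counts using ngram::wordcount()
wordCount.check <- sapply(docs, wordcount)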
# Barplot of line counts
barplot(lineCount/1000, main = 'Lines per file', xlab = 'File',
        ylab = 'Lines (1000)', ylim = c(0, 25))
# Barplot of total and unique words; merging by row name (by = 0) aligns the
# two named vectors file by file
barplot(t(as.matrix(merge(uniqueWords, wordCount, by = 0)[, -1]))/1000, beside = TRUE,
        names.arg = names(docs), ylab = "Words (1000)", xlab = "File",
        main = "Words per file", col = c("black", "white"), cex.axis = 0.8)
legend('topright', fill = c('black', 'white'),
       c('Unique words', 'All words'), cex = 0.8, bty = 'n')
An n-gram is a sequence of N consecutive words in a text. This analysis generates 1- through 6-grams for each of the three files, then examines how many distinct n-grams are needed to account for 50% or 90% of all n-gram occurrences in each file.
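To make this concrete, here is a minimal sketch on a toy sentence (hypothetical, not from the dataset) showing the 2-grams of a six-word string and their frequency table.
# Toy illustration: the 2-grams of a six-word sentence
toy <- ngram("to be or not to be", n = 2)
get.phrasetable(toy)
# 'to be' appears twice out of five 2-grams (prop 0.4); 'be or', 'or not',
# and 'not to' appear once each (prop 0.2)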
# Generate 1- through 6-grams for each document
ng <- list()
for (i in 1:6) {
    ng[[i]] <- lapply(docs, ngram, n = i)
}
# Get the frequency and proportion of each gram; get.phrasetable() returns
# grams sorted by decreasing frequency
ng.freq <- lapply(ng, function(gram) lapply(gram, get.phrasetable))
# Count the number of grams needed to cover 0.5 or 0.9 of all gram occurrences
gram50 <- matrix(unlist(lapply(ng.freq, function(gram)
    lapply(gram, function(text) which(cumsum(text$prop) >= 0.5)[1]))), nrow = 3)
gram90 <- matrix(unlist(lapply(ng.freq, function(gram)
    lapply(gram, function(text) which(cumsum(text$prop) >= 0.9)[1]))), nrow = 3)
colnames(gram50) <- colnames(gram90) <- paste0(1:6, '-gram')
# Rows correspond to the three files (the ng.freq list itself is unnamed)
rownames(gram50) <- rownames(gram90) <- names(docs)
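As a minimal sketch of the coverage computation with made-up proportions: cumsum() accumulates the sorted proportions, and which(...)[1] picks the first index at or past each cutoff.
# Hypothetical, already-sorted proportions (sum to 1)
props <- c(0.25, 0.20, 0.15, 0.15, 0.15, 0.10)
which(cumsum(props) >= 0.5)[1]  # 3: the top three entries cover 60%
which(cumsum(props) >= 0.9)[1]  # 5: the top five entries cover 90%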
# Barplot of words/grams accounting for 50% of each file
barplot(gram50/1000, beside = TRUE, cex.axis = 0.9, xlab = 'N-gram', ylim = c(0, 200),
        main = 'Number of words/grams accounting for 50% of dataset',
        ylab = 'Count (1000)', col = c('red', 'blue', 'darkgreen'))
legend('topleft', fill = c('red', 'blue', 'darkgreen'), names(docs), bty = 'n', cex = 0.8)
# Barplot of words/grams accounting for 90% of each file
barplot(gram90/1000, beside = TRUE, cex.axis = 0.9, xlab = 'N-gram', ylim = c(0, 350),
        main = 'Number of words/grams accounting for 90% of dataset',
        ylab = 'Count (1000)', col = c('red', 'blue', 'darkgreen'))
legend('topleft', fill = c('red', 'blue', 'darkgreen'), names(docs), bty = 'n', cex = 0.8)