Read in data

The first step is to read in the data. Each of the three text files is read in as a character vector using the readLines() function. We read in the full files and also take a sample of each, on which we perform the more memory-intensive exploratory analysis.
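
Reading the files requires only base R, but the later steps rely on several packages. The original setup chunk is not shown here; the following is a minimal sketch, with the packages inferred from the functions called throughout this report.

library(stringi)  # stri_stats_general(), stri_stats_latex()
library(tm)       # Corpus(), tm_map() and the cleaning transformations
library(NLP)      # annotate(), as.String()
library(openNLP)  # Maxent_*_Annotator() functions
library(RWeka)    # NGramTokenizer(), Weka_control()
library(dplyr)    # %>%, group_by(), summarise()
library(tidyr)    # separate()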

bcon <- file("final/en_US/en_US.blogs.txt")
blog <- readLines(bcon, encoding = "UTF-8", skipNul = TRUE)
close(bcon)

ncon <- file("final/en_US/en_US.news.txt")
news <- readLines(ncon, encoding = "UTF-8", skipNul = TRUE)
close(ncon)

tcon <- file("final/en_US/en_US.twitter.txt")
twit <- readLines(tcon, encoding = "UTF-8", skipNul = TRUE)
close(tcon)

#Take 1000-line samples for the memory-intensive exploratory steps
bsamp <- sample(blog, 1000)
nsamp <- sample(news, 1000)
tsamp <- sample(twit, 1000)

Get summary stats

The next step is to compute summary statistics for each of the character vectors using the stringi package.

blog_stats <- stri_stats_general(blog)
blog_words <- stri_stats_latex(blog)[4]
blog_sum_df <- data.frame(File = 'Blog',
                          Lines = blog_stats[[1]],
                          LinesNEmpty = blog_stats[[2]],
                          Characters = blog_stats[[3]],
                          CharactersNWhite = blog_stats[[4]],
                          TotalWords = blog_words[[1]])

news_stats <- stri_stats_general(news)
news_words <- stri_stats_latex(news)[4]
news_sum_df <- data.frame(File = 'News',
                          Lines = news_stats[[1]],
                          LinesNEmpty = news_stats[[2]],
                          Characters = news_stats[[3]],
                          CharactersNWhite = news_stats[[4]],
                          TotalWords = news_words[[1]])

twit_stats <- stri_stats_general(twit)
twit_words <- stri_stats_latex(twit)[4]
twit_sum_df <- data.frame(File = 'Twitter',
                          Lines = twit_stats[[1]],
                          LinesNEmpty = twit_stats[[2]],
                          Characters = twit_stats[[3]],
                          CharactersNWhite = twit_stats[[4]],
                          TotalWords = twit_words[[1]])

summary_df <- rbind(blog_sum_df, news_sum_df, twit_sum_df)

print(summary_df)
##      File   Lines LinesNEmpty Characters CharactersNWhite TotalWords
## 1    Blog  899288      899288  206824382        170389539   37570839
## 2    News 1010242     1010242  203223154        169860866   34494539
## 3 Twitter 2360148     2360148  162096031        134082634   30451128
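
As a quick check of why sampling is needed for the later steps, the in-memory size of each full vector can be inspected; this is a small illustrative sketch using base R's object.size().

format(object.size(blog), units = "Mb")
format(object.size(news), units = "Mb")
format(object.size(twit), units = "Mb")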

Define needed functions

Here we define functions that apply the various text mining procedures to the samples drawn from each of the three files.

#Function to create a corpus from a character vector and run general text cleaning
# returns: Corpus
mkCorpus <- function(chr){
    corp <- Corpus(VectorSource(chr))
    #tolower is a base function, so wrap it in content_transformer() for tm_map
    corp <- tm_map(corp, stripWhitespace) %>%
        tm_map(content_transformer(tolower)) %>%
        tm_map(removePunctuation) %>%
        tm_map(removeNumbers)
    return(corp)
}

#Function to create an annotation object from a corpus
# returns: Annotation object
annotateText <- function(corpus){
    #Create annotators
    words_ann <- Maxent_Word_Token_Annotator()
    sents_ann <- Maxent_Sent_Token_Annotator()
    pos_tag <- Maxent_POS_Tag_Annotator()
    #Annotate the corpus text as a single NLP String so the character offsets
    #match the extraction in getWordTags(); annotate() is qualified with NLP::
    #to avoid masking by other packages
    annot <- NLP::annotate(as.String(corpus$content), list(sents_ann, words_ann, pos_tag))
    return(annot)
}

#Function that retrieves POS tags and words, returning a Word/Tag dataframe
# returns: dataframe
getWordTags <- function(annotation, corpus){
    #extract the word annotations and their POS tags
    sWords <- subset(annotation, type == "word")
    sTags <- sapply(sWords$features, '[[', 'POS')
    #pair each word with its tag, then split into Word/POS columns
    sWordTags <- sprintf("%s[]%s", as.String(corpus$content)[sWords], sTags)
    sWordTags <- as.data.frame(sWordTags)
    cleanWordTags <- separate(data = sWordTags, col = sWordTags,
                              sep = '\\[]', into = c('Word', 'POS'))
    return(cleanWordTags)
}

# Perform ngram tokenization and return dataframe with counts of phrases
# returns: dataframe
ngramTokenize <- function(corpus,min,max){
   ngram <- NGramTokenizer(corpus$content, Weka_control(min=min,max=max))
   ngdf <- data.frame(phrase = ngram) %>%
       group_by(phrase) %>%
       summarise(count = n())
   return(ngdf)
}

Run functions and get summary data

Here we run the functions defined above to generate our exploratory data. Because the full text files are too large to process comfortably in memory, we use the 1000-line samples taken in the Read in data section above.

bscorp <- mkCorpus(bsamp)
nscorp <- mkCorpus(nsamp)
tscorp <- mkCorpus(tsamp)
#Annotate the sample corpora
blog.annotation <- annotateText(bscorp) 
news.annotation <- annotateText(nscorp)
twit.annotation <- annotateText(tscorp)
#NGram tokenization
blog.grams <- ngramTokenize(bscorp,3,4)
blog.grams <- blog.grams[order(-blog.grams$count),]
news.grams <- ngramTokenize(nscorp,3,4)
news.grams <- news.grams[order(-news.grams$count),]
twit.grams <- ngramTokenize(tscorp,3,4)
twit.grams <- twit.grams[order(-twit.grams$count),]
#Get word tags from annotation
blog.word.tags <- getWordTags(blog.annotation, bscorp)
news.word.tags <- getWordTags(news.annotation, nscorp)
twit.word.tags <- getWordTags(twit.annotation, tscorp)
#Summarize words and POS tag counts
blog.word.counts <- blog.word.tags %>%
    group_by(Word) %>%
    summarise(Count = n()) 

blog.word.counts <- blog.word.counts[order(-blog.word.counts$Count),]

blog.pos.counts <- blog.word.tags %>%
    group_by(POS) %>%
    summarise(count = n())

blog.pos.counts <- blog.pos.counts[order(-blog.pos.counts$count),]

news.word.counts <- news.word.tags %>%
    group_by(Word) %>%
    summarise(Count = n())

news.word.counts <- news.word.counts[order(-news.word.counts$Count),]

news.pos.counts <- news.word.tags %>%
    group_by(POS) %>%
    summarise(count = n())

news.pos.counts <- news.pos.counts[order(-news.pos.counts$count),]

twit.word.counts <- twit.word.tags %>%
    group_by(Word) %>%
    summarise(Count = n())

twit.word.counts <- twit.word.counts[order(-twit.word.counts$Count),]

twit.pos.counts <- twit.word.tags %>%
    group_by(POS) %>%
    summarise(count = n())

twit.pos.counts <- twit.pos.counts[order(-twit.pos.counts$count),]

Exploratory charts

Below we chart the results of our text mining functions. The charts are colored according to the file the data come from: red for the blog data, green for the news data, and blue for the Twitter data.

Top Words Frequency

The following three charts show the top 10 most frequently appearing words in each of the sample datasets.
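
The plotting code is not reproduced in this section; as a rough sketch, the blog chart could be built from the blog.word.counts data frame created above using ggplot2 (assumed to be installed) along the following lines.

library(ggplot2)

#Ten most frequent words in the blog sample, colored red per the scheme above
top_blog_words <- head(blog.word.counts, 10)
ggplot(top_blog_words, aes(x = reorder(Word, Count), y = Count)) +
    geom_col(fill = "red") +
    coord_flip() +
    labs(title = "Top 10 Words - Blog Sample", x = "Word", y = "Count")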

Top Parts-Of-Speech Frequency

The following chart group shows the top 10 most frequent parts of speech in the sample datasets. Part-of-speech tags were obtained by first running a Maxent_POS_Tag_Annotator() on the corpora and then extracting the word/tag combinations from the result. A dictionary of the part-of-speech tags is available at: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html.
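
Again the plotting code is omitted here; one hedged sketch of the chart group, stacking the three *.pos.counts data frames built above and faceting by file (assuming ggplot2 and dplyr are loaded), is shown below.

library(ggplot2)

#Keep the ten most frequent POS tags from each sample and label the source file
pos_top <- bind_rows(
    mutate(head(blog.pos.counts, 10), File = "Blog"),
    mutate(head(news.pos.counts, 10), File = "News"),
    mutate(head(twit.pos.counts, 10), File = "Twitter"))

ggplot(pos_top, aes(x = reorder(POS, count), y = count, fill = File)) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    facet_wrap(~ File, scales = "free") +
    scale_fill_manual(values = c(Blog = "red", News = "green", Twitter = "blue")) +
    labs(x = "Part-of-speech tag", y = "Count")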

Top n-Gram Frequency for 3- and 4-Grams

Below are the results of the n-gram tokenization for 3- and 4-word groups (‘phrases’). The top 10 phrases in the sample data from each file are displayed. N-gram tokenization will also form the basis of our prediction model.
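
The phrase charts follow the same pattern as the word-frequency charts above; the underlying counts can also be inspected directly from the ordered data frames.

#Ten most frequent 3- and 4-word phrases in each sample
head(blog.grams, 10)
head(news.grams, 10)
head(twit.grams, 10)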

Next Steps

The next steps for the project are: