This report covers the first steps of a text-mining project: importing the data, some basic exploration, tokenization, and getting a sense of what the data looks like and how it can be used in the future.

Load Packages and Data Import

The first tasks are to load data packages and import data.

library(tidyverse)
library(tm)
library(RWeka)
library(stringi)
library(SnowballC)


# Read a text file into a character vector, one element per line.
#
# The file is opened through an explicit binary-mode connection ("rb") rather
# than by passing the path to readLines(), so no text-mode translation is
# applied to the raw bytes while reading.
# NOTE(review): the news sample reported further down has only 772 lines
# (1% of ~77k lines), far fewer than blogs/tweets, which suggests the news
# file was cut short by a text-mode read (e.g. an embedded control character
# treated as end-of-file on Windows) - confirm against the raw line count.
#
# path: path to the text file.
# Returns a character vector marked as UTF-8; embedded NUL bytes are skipped
# instead of truncating the affected lines.
read_lines_raw <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE) # always release the connection
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_lines_raw("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
news <- read_lines_raw("Coursera-SwiftKey/final/en_US/en_US.news.txt")
tweet <- read_lines_raw("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")

Pull a sample of the Data

Since we have such a large volume of data, we will just use a sample to explore and build our model.

# Draw a reproducible 1% simple random sample of lines from each source and
# save each sample to its own file under Sample/.
#
# The indices come from sample.int(), which picks distinct line numbers
# uniformly at random. The previous rbinom(n, length(x), .05) call returned
# binomial counts, which cluster tightly around 5% of the file length, so it
# selected (often repeated) lines from one narrow region of each file instead
# of a random sample.
dir.create("Sample", showWarnings = FALSE) # do not warn when re-running
set.seed(627)
sample_fraction <- 0.01

blogs_samp <- blogs[sample.int(length(blogs), floor(length(blogs) * sample_fraction))]
write.csv(blogs_samp, file = "Sample/blogs_samp.csv", row.names = FALSE)

news_samp <- news[sample.int(length(news), floor(length(news) * sample_fraction))]
write.csv(news_samp, file = "Sample/news_samp.csv", row.names = FALSE)

tweet_samp <- tweet[sample.int(length(tweet), floor(length(tweet) * sample_fraction))]
write.csv(tweet_samp, file = "Sample/tweet_samp.csv", row.names = FALSE)

# NOTE(review): write.csv() adds a header row and quotes every line; these
# files are later read back as plain text, so that header ends up in the
# corpus. Consider writeLines() if the CSV format is not needed.

Explore Sample Data

Here we take a moment to explore the size of the sample data

# Size on disk of each sample file, in bytes
file.size("Sample/blogs_samp.csv")

## [1] 2181738

file.size("Sample/news_samp.csv")

## [1] 160678

file.size("Sample/tweet_samp.csv")

## [1] 1703159

# Number of lines in each sample (NROW() of a plain vector is its length)
NROW(blogs_samp)

## [1] 8992

NROW(news_samp)

## [1] 772

NROW(tweet_samp)

## [1] 23601

# Total number of words in each sample: count words per line, then add up
blogs_samp %>% stri_count_words() %>% sum()

## [1] 394800

news_samp %>% stri_count_words() %>% sum()

## [1] 26782

tweet_samp %>% stri_count_words() %>% sum()

## [1] 298179

Tokenize Data

Now that we have a decent sample of raw data, we will create a corpus (a collection of documents) and clean it up by removing extra whitespace, punctuation, numbers, and profanity. We will also convert everything to lowercase.

# Create a corpus of the sample for text mining: every file found in the
# Sample directory becomes one plain-text document.
CapCorp <- VCorpus(
  DirSource("Sample"),
  readerControl = list(reader = readPlain, language = "en_US")
)

# Clean sample data
cleanCC <- tm_map(CapCorp, content_transformer(tolower)) # convert to lowercase
cleanCC <- tm_map(cleanCC, removePunctuation)            # remove punctuation
cleanCC <- tm_map(cleanCC, removeNumbers)                # remove numbers
cleanCC <- tm_map(cleanCC, PlainTextDocument)

# Remove profanity
# The word list must be a character vector with one term per element.
# Calling tolower() on the data frame itself (as before) deparses each column
# into a single string such as 'c("a", "b")', so no individual term was ever
# matched by removeWords(). unlist() flattens the table into its cells first.
# NOTE(review): read.csv() treats the first line of profanity.csv as a header;
# if the file has no header row, pass header = FALSE so the first term is kept.
prof <- read.csv(file = "profanity.csv", stringsAsFactors = FALSE)
profane <- unlist(prof, use.names = FALSE)
profane <- profane[!is.na(profane)]
profane <- trimws(gsub(",", "", tolower(profane)))
profane <- profane[nzchar(profane)] # drop empty cells
if (length(profane) > 0) {
  cleanCC <- tm_map(cleanCC, removeWords, profane)
}

# Collapse whitespace last, so gaps left behind by the removals above are
# squeezed as well.
cleanCC <- tm_map(cleanCC, stripWhitespace)

Build N-grams

The next step is to build N-grams. Here we will look at single words as well as two- and three-word sequences.

# Returns a tokenizer function that splits text into n-grams of exactly
# `size` words using RWeka's NGramTokenizer.
make_ngram_tokenizer <- function(size) {
  force(size)
  function(x) {
    NGramTokenizer(x, Weka_control(min = size, max = size))
  }
}

# Tokenizers for one-, two- and three-word sequences
unigram <- make_ngram_tokenizer(1)
bigram <- make_ngram_tokenizer(2)
trigram <- make_ngram_tokenizer(3)

# One term-document matrix per n-gram size, all built from the cleaned corpus
tdm_uni <- TermDocumentMatrix(cleanCC, control = list(tokenize = unigram))
tdm_bi <- TermDocumentMatrix(cleanCC, control = list(tokenize = bigram))
tdm_tri <- TermDocumentMatrix(cleanCC, control = list(tokenize = trigram))

Explore N-grams

Here is a simple exploration into the N-grams we just created

# Bar chart of n-gram frequencies, bars ordered by frequency.
#
# top:     data frame with columns `word` and `freq`.
# x_label: label for the axis that lists the n-grams.
# title:   plot title.
# Returns the ggplot object (not printed).
plot_top_ngrams <- function(top, x_label, title) {
  ggplot(top, aes(x = reorder(word, freq), y = freq)) +
    geom_bar(stat = "identity") +
    labs(y = "Frequency", x = x_label, title = title)
}

# For each n-gram size: total the counts across documents, sort descending,
# and plot the 15 most frequent terms. head() is used instead of [1:15, ] so
# that a matrix with fewer than 15 terms does not produce NA-filled rows.

#Unigram Plot
uni <- as.matrix(tdm_uni)
uni_sort <- sort(rowSums(uni), decreasing = TRUE)
uni_word <- data.frame(word = names(uni_sort), freq = uni_sort)
uni_top <- head(uni_word, 15)
uni_plot <- plot_top_ngrams(uni_top, "Word", "Most Frequent Unigrams in Corpus")
print(uni_plot)

#Bigram Plot
bi <- as.matrix(tdm_bi)
bi_sort <- sort(rowSums(bi), decreasing = TRUE)
bi_word <- data.frame(word = names(bi_sort), freq = bi_sort)
bi_top <- head(bi_word, 15)
bi_plot <- plot_top_ngrams(bi_top, "Words", "Most Frequent Bigrams in Corpus")
print(bi_plot)

#Trigram Plot
tri <- as.matrix(tdm_tri)
tri_sort <- sort(rowSums(tri), decreasing = TRUE)
tri_word <- data.frame(word = names(tri_sort), freq = tri_sort)
tri_top <- head(tri_word, 15)
tri_plot <- plot_top_ngrams(tri_top, "Words", "Most Frequent Trigrams in Corpus")
print(tri_plot)

# High-frequency terms: list every n-gram whose total count across the corpus
# is at least the given lower bound (no upper bound).
findFreqTerms(tdm_uni, lowfreq = 1000, highfreq = Inf)

##  [1] "about"  "all"    "and"    "are"    "been"   "but"    "can"   
##  [8] "day"    "dont"   "for"    "from"   "get"    "good"   "had"   
## [15] "has"    "have"   "her"    "him"    "his"    "how"    "into"  
## [22] "its"    "just"   "know"   "like"   "love"   "make"   "more"  
## [29] "new"    "not"    "now"    "one"    "our"    "out"    "people"
## [36] "see"    "she"    "some"   "than"   "thanks" "that"   "the"   
## [43] "their"  "them"   "there"  "they"   "this"   "time"   "today" 
## [50] "was"    "well"   "were"   "what"   "when"   "who"    "will"  
## [57] "with"   "would"  "you"    "your"

findFreqTerms(tdm_bi, lowfreq = 500, highfreq = Inf)

##  [1] "and i"    "and the"  "as a"     "at the"   "for a"    "for the" 
##  [7] "have a"   "have to"  "i am"     "i have"   "i love"   "i was"   
## [13] "if you"   "in a"     "in my"    "in the"   "is a"     "is the"  
## [19] "it is"    "it was"   "need to"  "of a"     "of the"   "on the"  
## [25] "one of"   "that i"   "the best" "this is"  "to be"    "to do"   
## [31] "to see"   "to the"   "with a"   "with the" "you have"

findFreqTerms(tdm_tri, lowfreq = 100, highfreq = Inf)

##  [1] "a lot of"       "be able to"     "cant wait to"   "check it out"  
##  [5] "for the follow" "have to be"     "i cant wait"    "i had to"      
##  [9] "i love you"     "i want to"      "if you have"    "in the world"  
## [13] "it is the"      "it was a"       "let me know"    "one of the"    
## [17] "some of the"    "thank you for"  "thanks for the" "the best of"   
## [21] "the end of"     "the rest of"    "this is a"      "you for the"   
## [25] "you have to"

Next Steps

Now that we have taken a brief look at the data, our next steps will be building a model and exploring options for improving its efficiency and accuracy.

Data Science Capstone; Milestone Report

Janet M

September 22, 2018