This report covers the first steps of a text-mining project: importing the data successfully, doing some basic exploration and tokenization, and getting a grasp on what the data looks like and how it can be used in the future.
The first tasks are to load the required packages and import the data.
library(tidyverse)
library(tm)
library(RWeka)
library(stringi)
library(SnowballC)
# Read one text file into a character vector, one element per line.
#
# The file is read through a binary-mode connection so that control
# characters embedded in the text (such as 0x1A) are not interpreted as
# end-of-file on Windows, which would silently truncate the import.
# `skipNul = TRUE` skips embedded NUL bytes instead of cutting the line short
# with a warning. `encoding = "UTF-8"` only marks the strings; it does not
# re-encode them.
# NOTE(review): assumes the source files are UTF-8 encoded -- confirm.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
news <- read_text_lines("Coursera-SwiftKey/final/en_US/en_US.news.txt")
tweet <- read_text_lines("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
Since we have such a large volume of data, we will just use a sample to explore it and build our model.
# Draw a simple random sample (without replacement) of a fraction of `lines`.
#
# The previous version indexed with rbinom(n, size, prob). rbinom() returns
# binomial counts, which cluster tightly around size * prob, so the "sample"
# was a narrow band of lines near the 5% position of each file, with many
# repeats, rather than a random sample of the whole file.
sample_lines <- function(lines, fraction = 0.01) {
  lines[sample.int(length(lines), size = floor(length(lines) * fraction))]
}

# showWarnings = FALSE keeps reruns quiet when the directory already exists.
dir.create("Sample", showWarnings = FALSE)

# Fixed seed so the same lines are drawn on every run.
set.seed(627)

# The samples stay in CSV form under the same file names because the later
# steps read them back from the Sample directory.
blogs_samp <- sample_lines(blogs)
write.csv(blogs_samp, file = "Sample/blogs_samp.csv", row.names = FALSE)

news_samp <- sample_lines(news)
write.csv(news_samp, file = "Sample/news_samp.csv", row.names = FALSE)

tweet_samp <- sample_lines(tweet)
write.csv(tweet_samp, file = "Sample/tweet_samp.csv", row.names = FALSE)
Here we take a moment to explore the size of the sample data
# Size on disk of each sample file, in bytes
file.size("Sample/blogs_samp.csv")
## [1] 2181738
file.size("Sample/news_samp.csv")
## [1] 160678
file.size("Sample/tweet_samp.csv")
## [1] 1703159
# Number of lines in each sample
length(blogs_samp)
## [1] 8992
length(news_samp)
## [1] 772
length(tweet_samp)
## [1] 23601
# Total number of words in each sample
sum(stringi::stri_count_words(blogs_samp))
## [1] 394800
sum(stringi::stri_count_words(news_samp))
## [1] 26782
sum(stringi::stri_count_words(tweet_samp))
## [1] 298179
Now that we have a decent sample of raw data, we will create a corpus (a collection of files) and clean it up by removing white space, punctuation, numbers, and profanity. We will also change everything to lowercase.
# Create a corpus from the sample files: one document per file in Sample/.
CapCorp <- VCorpus(
  DirSource("Sample"),
  readerControl = list(reader = readPlain, language = "en_US")
)

# Load the profanity list and flatten it into a plain character vector of
# lowercase words.
# The previous version called tolower() on the data frame itself. That
# coerces each column to a single deparsed string (e.g. 'c("a", "b")'), so
# removeWords() never received the individual words and removed nothing.
# NOTE(review): read.csv() treats the first line as a header by default --
# confirm profanity.csv has a header row, otherwise pass header = FALSE.
prof <- read.csv(file = "profanity.csv", stringsAsFactors = FALSE)
profane <- as.character(unlist(prof, use.names = FALSE))
profane <- profane[!is.na(profane)]
profane <- trimws(gsub(",", "", tolower(profane), fixed = TRUE))
profane <- unique(profane[nzchar(profane)])

# Clean the sample data.
cleanCC <- tm_map(CapCorp, content_transformer(tolower)) # convert to lowercase
cleanCC <- tm_map(cleanCC, removePunctuation) # remove punctuation
cleanCC <- tm_map(cleanCC, removeNumbers) # remove numbers
# Kept from the original pipeline.
# NOTE(review): probably redundant now that tolower is wrapped in
# content_transformer() -- confirm before removing.
cleanCC <- tm_map(cleanCC, PlainTextDocument)
# Skip the step when the list is empty so no empty pattern is built.
if (length(profane) > 0) {
  cleanCC <- tm_map(cleanCC, removeWords, profane) # remove profanity
}
# Collapse whitespace last: the removals above leave gaps where the deleted
# punctuation, numbers and words used to be.
cleanCC <- tm_map(cleanCC, stripWhitespace)
The next step is to build N-grams. Here we will look at single words as well as 2- and 3-word sequences.
# Tokenizers that split a document into word sequences of a fixed length:
# one, two and three words respectively.
unigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
bigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
trigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# One term-document matrix per n-gram size, built from the cleaned corpus.
tdm_uni <- TermDocumentMatrix(cleanCC, control = list(tokenize = unigram))
tdm_bi <- TermDocumentMatrix(cleanCC, control = list(tokenize = bigram))
tdm_tri <- TermDocumentMatrix(cleanCC, control = list(tokenize = trigram))
Here is a simple exploration into the N-grams we just created
# Unigram plot: total each term's count across documents, rank the terms,
# and chart the 15 most frequent.
uni <- as.matrix(tdm_uni)
uni_sort <- uni %>%
  rowSums() %>%
  sort(decreasing = TRUE)
uni_word <- data.frame(word = names(uni_sort), freq = uni_sort)
uni_top <- uni_word[seq_len(15), ]
uni_plot <- ggplot(uni_top, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  labs(
    title = "Most Frequent Unigrams in Corpus",
    x = "Word",
    y = "Frequency"
  )
print(uni_plot)
# Bigram plot: total each two-word term's count across documents, rank the
# terms, and chart the 15 most frequent.
bi <- as.matrix(tdm_bi)
bi_sort <- bi %>%
  rowSums() %>%
  sort(decreasing = TRUE)
bi_word <- data.frame(word = names(bi_sort), freq = bi_sort)
bi_top <- bi_word[seq_len(15), ]
bi_plot <- ggplot(bi_top, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  labs(
    title = "Most Frequent Bigrams in Corpus",
    x = "Words",
    y = "Frequency"
  )
print(bi_plot)
# Trigram plot: total each three-word term's count across documents, rank
# the terms, and chart the 15 most frequent.
tri <- as.matrix(tdm_tri)
tri_sort <- tri %>%
  rowSums() %>%
  sort(decreasing = TRUE)
tri_word <- data.frame(word = names(tri_sort), freq = tri_sort)
tri_top <- tri_word[seq_len(15), ]
tri_plot <- ggplot(tri_top, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  labs(
    title = "Most Frequent Trigrams in Corpus",
    x = "Words",
    y = "Frequency"
  )
print(tri_plot)
# High-frequency terms: every term whose total count is at least `lowfreq`,
# with no upper bound, for each n-gram size.
findFreqTerms(tdm_uni, lowfreq = 1000, highfreq = Inf)
## [1] "about" "all" "and" "are" "been" "but" "can"
## [8] "day" "dont" "for" "from" "get" "good" "had"
## [15] "has" "have" "her" "him" "his" "how" "into"
## [22] "its" "just" "know" "like" "love" "make" "more"
## [29] "new" "not" "now" "one" "our" "out" "people"
## [36] "see" "she" "some" "than" "thanks" "that" "the"
## [43] "their" "them" "there" "they" "this" "time" "today"
## [50] "was" "well" "were" "what" "when" "who" "will"
## [57] "with" "would" "you" "your"
findFreqTerms(tdm_bi, lowfreq = 500, highfreq = Inf)
## [1] "and i" "and the" "as a" "at the" "for a" "for the"
## [7] "have a" "have to" "i am" "i have" "i love" "i was"
## [13] "if you" "in a" "in my" "in the" "is a" "is the"
## [19] "it is" "it was" "need to" "of a" "of the" "on the"
## [25] "one of" "that i" "the best" "this is" "to be" "to do"
## [31] "to see" "to the" "with a" "with the" "you have"
findFreqTerms(tdm_tri, lowfreq = 100, highfreq = Inf)
## [1] "a lot of" "be able to" "cant wait to" "check it out"
## [5] "for the follow" "have to be" "i cant wait" "i had to"
## [9] "i love you" "i want to" "if you have" "in the world"
## [13] "it is the" "it was a" "let me know" "one of the"
## [17] "some of the" "thank you for" "thanks for the" "the best of"
## [21] "the end of" "the rest of" "this is a" "you for the"
## [25] "you have to"
Now that we have taken a brief look at the data, building a model and exploring options as far as efficiency and accuracy will be our next steps.