This is the Milestone Report for the Data Science Capstone course. The goal of this project is to perform exploratory data analysis of a large corpus of text documents to discover the structure in the data and how words are put together. The training data supplied for getting started on the capstone project can be downloaded here [1.41 GB].
The data package contains text files from several different languages. For this project, only the English text files will be evaluated. There are three English text files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt. Since the text files are large, only random subsets of the files will be used for the analysis.
library(stringi)   # string processing (word counts)
library(tm)        # text mining: corpus creation and cleaning
library(slam)      # sparse matrix utilities (rollup)
library(ggplot2)   # plotting
library(RWeka)     # n-gram tokenizers
The code below is used to calculate and summarize the size, the number of lines, and the number of words in each text file.
# Check the file sizes in MB
size_blog <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
size_news <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
size_twit <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
# Read the full text of each file
blogs <- readLines("final/en_US/en_US.blogs.txt")
news <- readLines("final/en_US/en_US.news.txt")
twitter <- readLines("final/en_US/en_US.twitter.txt")
# Check the number of lines in each file
len_blog <- length(blogs)
len_news <- length(news)
len_twit <- length(twitter)
# Total words in each file
words_blog <- sum(stri_count_words(blogs))
words_news <- sum(stri_count_words(news))
words_twit <- sum(stri_count_words(twitter))
df <- data.frame(
  file_name = c("Blog", "News", "Twitter"),
  size_MB = c(size_blog, size_news, size_twit),
  number_lines = c(len_blog, len_news, len_twit),
  number_words = c(words_blog, words_news, words_twit)
)
print(df)
## file_name size_MB number_lines number_words
## 1 Blog 200.4242 899288 37546246
## 2 News 196.2775 1010242 34762395
## 3 Twitter 159.3641 2360148 30093369
Given the large size of the text files, a random sample of 10,000 lines from each of the three files is used.
# Set seed for reproducible sampling
set.seed(320)
# Take 10,000 random lines from each file and combine them
sample_blogs <- blogs[sample(1:length(blogs), 10000)]
sample_news <- news[sample(1:length(news), 10000)]
sample_twitter <- twitter[sample(1:length(twitter), 10000)]
sample_data <- c(sample_blogs, sample_news, sample_twitter)
The tm package is used to create and clean a corpus. The following transformations are applied: special characters are converted to spaces, the text is lowercased, common English stopwords, punctuation, and numbers are removed, and extra whitespace is stripped.
# Create a corpus from the sampled text
docs <- Corpus(VectorSource(sample_data))
# Change special characters to spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")
# Convert to lowercase
docs <- tm_map(docs, content_transformer(tolower))
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Eliminate white space
docs <- tm_map(docs, stripWhitespace)
N-gram models are used to evaluate word frequencies. Using the RWeka package, three term-document matrices are created for unigrams, bigrams, and trigrams.
# Restrict processing to a single core (avoids issues with the RWeka tokenizers)
options(mc.cores=1)
# Unigram term-document matrix, collapsed to total counts with slam::rollup
unitdm <- TermDocumentMatrix(docs)
unitdm <- rollup(unitdm, 2, na.rm = TRUE, FUN = sum)
# Bigram term-document matrix
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bitdm <- TermDocumentMatrix(docs, control = list(tokenize = BigramTokenizer))
bitdm <- rollup(bitdm, 2, na.rm = TRUE, FUN = sum)
# Trigram term-document matrix
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tritdm <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer))
tritdm <- rollup(tritdm, 2, na.rm = TRUE, FUN = sum)
The top 25 most frequent unigrams
uni_freq <- sort(rowSums(as.matrix(unitdm)),decreasing=TRUE)
uni_freq_df <- data.frame(word = names(uni_freq),freq=uni_freq)
print(head(uni_freq_df, 25))
## word freq
## said said 2929
## will will 2716
## one one 2534
## just just 2245
## like like 2133
## can can 2016
## time time 1820
## get get 1672
## new new 1596
## now now 1396
## people people 1344
## also also 1316
## good good 1270
## day day 1196
## know know 1166
## first first 1139
## make make 1135
## year year 1124
## back back 1102
## last last 1083
## two two 1081
## even even 1035
## love love 1032
## see see 1028
## think think 999
The top 25 most frequent bigrams
bi_freq <- sort(rowSums(as.matrix(bitdm)),decreasing=TRUE)
bi_freq_df <- data.frame(word = names(bi_freq),freq=bi_freq)
print(head(bi_freq_df, 25))
## word freq
## last year last year 194
## new york new york 183
## right now right now 166
## years ago years ago 130
## high school high school 117
## last week last week 116
## feel like feel like 104
## even though even though 101
## first time first time 100
## new jersey new jersey 100
## make sure make sure 87
## last night last night 86
## can get can get 81
## two years two years 79
## st louis st louis 77
## united states united states 76
## los angeles los angeles 75
## little bit little bit 72
## looking forward looking forward 70
## can see can see 68
## many people many people 67
## just like just like 65
## looks like looks like 64
## one day one day 63
## every day every day 59
The top 25 most frequent trigrams
tri_freq <- sort(rowSums(as.matrix(tritdm)),decreasing=TRUE)
tri_freq_df <- data.frame(word = names(tri_freq),freq=tri_freq)
print(head(tri_freq_df, 25))
## word freq
## new york city new york city 21
## world war ii world war ii 15
## president barack obama president barack obama 14
## happy mothers day happy mothers day 13
## two years ago two years ago 13
## amazon services llc amazon services llc 12
## llc amazon eu llc amazon eu 12
## new york times new york times 12
## services llc amazon services llc amazon 12
## george w bush george w bush 11
## gov chris christie gov chris christie 11
## let us know let us know 11
## st louis county st louis county 11
## happy new year happy new year 10
## new years eve new years eve 10
## two weeks ago two weeks ago 10
## first time since first time since 9
## g basal diet g basal diet 9
## attorney generals office attorney generals office 8
## three years ago three years ago 8
## cinco de mayo cinco de mayo 7
## county prosecutors office county prosecutors office 7
## executive vice president executive vice president 7
## farm holiday italy farm holiday italy 7
## four years ago four years ago 7
Graph the 25 most frequently occurring unigrams
g <- ggplot(head(uni_freq_df,25), aes(x=reorder(word, freq), y=freq, fill=freq)) +
geom_bar(stat = "identity") +
theme_bw() +
coord_flip() +
xlab("Word") + ylab("Frequency") +
labs(title = "Top 25 Unigrams by Frequency")
print(g)
Graph the 25 most frequently occurring bigrams
g <- ggplot(head(bi_freq_df,25), aes(x=reorder(word, freq), y=freq, fill=freq)) +
geom_bar(stat = "identity") +
theme_bw() +
coord_flip() +
xlab("Word") + ylab("Frequency") +
labs(title = "Top 25 Bigrams by Frequency")
print(g)
Graph the 25 most frequently occurring trigrams
g <- ggplot(head(tri_freq_df,25), aes(x=reorder(word, freq), y=freq, fill=freq)) +
geom_bar(stat = "identity") +
theme_bw() +
coord_flip() +
xlab("Word") + ylab("Frequency") +
labs(title = "Top 25 Trigrams by Frequency")
print(g)
The methods above will be applied to a larger sample of the text data, and a predictive text model will be built from the resulting n-gram frequencies to predict the next word in a phrase.
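As a rough preview of how such a model could work, the sketch below performs a simple backoff lookup against the frequency tables built above (tri_freq_df, bi_freq_df, uni_freq_df): it first tries to match the last two words of the phrase against the trigrams, backs off to the last word against the bigrams, and finally falls back to the most frequent unigram. The function name predict_next_word is a hypothetical placeholder used only for illustration; it is not the final model.
# Minimal backoff lookup sketch (illustrative only); relies on the sorted
# frequency tables tri_freq_df, bi_freq_df, and uni_freq_df built above.
predict_next_word <- function(phrase) {
  tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
  tokens <- tokens[tokens != ""]
  # Try the trigram table first: match on the last two words
  if (length(tokens) >= 2) {
    prefix <- paste(tail(tokens, 2), collapse = " ")
    hits <- tri_freq_df[startsWith(as.character(tri_freq_df$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      return(tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1))
    }
  }
  # Back off to the bigram table: match on the last word
  if (length(tokens) >= 1) {
    prefix <- tail(tokens, 1)
    hits <- bi_freq_df[startsWith(as.character(bi_freq_df$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      return(tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1))
    }
  }
  # Fall back to the single most frequent unigram
  as.character(uni_freq_df$word[1])
}
predict_next_word("happy mothers")  # expected to return "day" given the counts above
Because common stopwords were removed during cleaning, a lookup like this would miss phrases that contain them; that is a limitation the final model would need to address, for example by training on a corpus that retains stopwords.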
Finally, a predictive data product will be created in Shiny. The app will consist of a user interface that allows a user to type a phrase into an input box, press Submit, and see the suggested next word displayed as output.
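A minimal sketch of that interface, assuming a prediction function such as the hypothetical predict_next_word() above, might look like this (the final app layout may differ):
library(shiny)
# Illustrative sketch of the planned interface; not the final app
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  actionButton("submit", "Submit"),
  verbatimTextOutput("next_word")
)
server <- function(input, output) {
  output$next_word <- renderText({
    input$submit                       # re-evaluate when Submit is pressed
    isolate({
      if (nchar(input$phrase) == 0) return("")
      predict_next_word(input$phrase)  # hypothetical prediction helper
    })
  })
}
shinyApp(ui = ui, server = server)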