library(knitr)
library(RColorBrewer)
library(stringi)
library(wordcloud2)
library(ggplot2)
library(ngram)
library(NLP)
library(tm)
library(slam)
library(xtable)
library(wordcloud)
library(dplyr)
data_dir <- '~/Data science course in coursera/Capstone/Capstone/en_US/'
fileInfo <- function(filePath, TextSource){
  # File size in MB
  fileSize <- file.info(filePath)$size / 1048576
  connection <- file(filePath, 'r')
  text <- readLines(connection)
  # Number of lines, length of the longest line, and total word count
  nlines <- length(text)
  maxline <- max(nchar(text))
  nwords <- sum(stri_count_words(text))
  df <- data.frame(
    TextSource,
    fileSize,
    nlines,
    maxline,
    nwords
  )
  close(connection)
  return(df)
}
BlogsConnection   <- file(paste0(data_dir, "en_US.blogs.txt"), "r")
NewsConnection    <- file(paste0(data_dir, "en_US.news.txt"), "r")
TwitterConnection <- file(paste0(data_dir, "en_US.twitter.txt"), "r")
The data come from three text sources: blogs, news, and Twitter. The table below summarizes the structure of each file (file size in MB).
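The summary was presumably produced by applying fileInfo() to each file and stacking the results; a minimal sketch, assuming the fileInfo() helper and data_dir defined above:
# Summarize each source file and stack the per-file data frames.
summary_table <- rbind(
  fileInfo(paste0(data_dir, "en_US.blogs.txt"),   "Blogs"),
  fileInfo(paste0(data_dir, "en_US.news.txt"),    "News"),
  fileInfo(paste0(data_dir, "en_US.twitter.txt"), "Twitter")
)
summary_table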
## TextSource fileSize nlines maxline nwords
## 1 Blogs 200.4242 899288 40835 38154238
## 2 News 196.2775 77259 5760 2693898
## 3 Twitter 159.3641 2360148 213 30218125
An initial analysis will be performed on a sample of 1,000 lines from each source.
blogs <- readLines(BlogsConnection , 1000)
news <- readLines(NewsConnection , 1000)
twitter <- readLines(TwitterConnection, 1000)
corpus <- VCorpus(VectorSource(c(blogs, news, twitter)),
                  readerControl = list(reader = readPlain,
                                       language = "en"))
close(BlogsConnection)
close(NewsConnection)
close(TwitterConnection)
The next step is to use text-mining techniques to clean and organize the data set.
The cleaning converts the documents to lowercase and removes punctuation marks, numbers, English stopwords (e.g. “and”, “or”, “not”, “is”), undesired terms, and extra whitespace.
corpus_lowercase <- tm_map(corpus, content_transformer(tolower))
corpus_low_punct <- tm_map(corpus_lowercase, removePunctuation)
corpus_low_punct_no <- tm_map(corpus_low_punct, removeNumbers)
corpus_low_punct_no_stop <- tm_map(corpus_low_punct_no, removeWords, stopwords("english"))
corpus_final <- tm_map(corpus_low_punct_no_stop, stripWhitespace)
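The pipeline above does not yet remove the undesired terms mentioned earlier; a minimal sketch of how that step could be added, assuming a hypothetical undesired_terms vector (not part of the original code):
# Hypothetical list of undesired terms; substitute a real profanity/term list.
undesired_terms <- c("badword1", "badword2")
corpus_final <- tm_map(corpus_final, removeWords, undesired_terms)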
To examine the data, we will produce a word cloud showing frequently used terms in the sample. The word cloud displays the most frequent words, with size varying by frequency.
wordcloud(corpus_final,
          max.words = 75,
          random.order = TRUE,
          rot.per = .15,
          scale = c(3, .3))
The following unigram plot shows the 30 words with the highest frequency within the analyzed sample.
# Build a term-document matrix and compute overall term frequencies
corpus_tdm <- TermDocumentMatrix(corpus_final)
corpus_tdm_m <- as.matrix(corpus_tdm)
corpus_tdm_m_freq <- rowSums(corpus_tdm_m)
corpus_tdm_m_freq <- sort(corpus_tdm_m_freq, decreasing = TRUE)
str(corpus_tdm_m_freq[1:30])
## Named num [1:30] 304 259 254 249 248 191 191 186 171 144 ...
## - attr(*, "names")= chr [1:30] "said" "will" "one" "just" ...
# Base-graphics alternative to the ggplot bar chart below:
# barplot(corpus_tdm_m_freq[1:30],
#         col = "blue",
#         las = 2,
#         main = "Word Frequency of the data")
df <- data.frame(
Word = names(corpus_tdm_m_freq[1:30]),
Frequency = corpus_tdm_m_freq[1:30]
)
df
## Word Frequency
## said said 304
## will will 259
## one one 254
## just just 249
## like like 248
## can can 191
## time time 191
## new new 186
## get get 171
## know know 144
## day day 141
## now now 137
## good good 131
## first first 128
## people people 124
## much much 122
## year year 120
## make make 112
## also also 110
## two two 106
## dont dont 104
## love love 102
## last last 99
## really really 99
## see see 99
## right right 97
## think think 97
## well well 95
## going going 93
## got got 93
ggplot(df %>% arrange(Frequency),
       aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Unigrams", x = "Words") +
  theme(axis.text.x = element_text(angle = 90))